• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2000-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv2022.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2000feb03
12 *   created by: Markus W. Scherer
13 *
14 *   Change history:
15 *
16 *   06/29/2000  helena  Major rewrite of the callback APIs.
17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
18 *                       Changed implementation of toUnicode
19 *                       function
20 *   08/21/2000  Ram     Added support for ISO-2022-KR
21 *   08/29/2000  Ram     Separated implementation of EBCDIC to
22 *                       ucnvebdc.c
23 *   09/20/2000  Ram     Added support for ISO-2022-CN
24 *                       Added implementations for getNextUChar()
25 *                       for specific 2022 country variants.
26 *   10/31/2000  Ram     Implemented offsets logic functions
27 */
28 
29 #include "unicode/utypes.h"
30 
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32 
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43 
/*
 * Number of elements in a true array (not a pointer), as int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression wherever it is used (e.g. as an operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45 
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48  * I am disabling the generic ISO-2022 converter after proposing to do so on
49  * the icu mailing list two days ago.
50  *
51  * Reasons:
52  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53  *    its designation sequences, single shifts with return to the previous state,
54  *    switch-with-no-return to UTF-16BE or similar, etc.
55  *    This is unlike the language-specific variants like ISO-2022-JP which
56  *    require a much smaller repertoire of ISO-2022 features.
57  *    These variants continue to be supported.
58  * 2. I believe that no one is really using the generic ISO-2022 converter
59  *    but rather always one of the language-specific variants.
60  *    Note that ICU's generic ISO-2022 converter has always output one escape
61  *    sequence followed by UTF-8 for the whole stream.
62  * 3. Switching between subcharsets is extremely slow, because each time
63  *    the previous converter is closed and a new one opened,
64  *    without any kind of caching, least-recently-used list, etc.
65  * 4. The code is currently buggy, and given the above it does not seem
66  *    reasonable to spend the time on maintenance.
67  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68  *    This means, for example, that when ISO-8859-7 is designated, the following
69  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70  *    The ICU ISO-2022 converter does not handle this - and has no information
71  *    about which subconverter would have to be shifted vs. which is designed
72  *    for 7-bit ISO-2022.
73  *
74  * Markus Scherer 2003-dec-03
75  */
76 #endif
77 
78 static const char SHIFT_IN_STR[]  = "\x0F";
79 static const char SHIFT_OUT_STR[] = "\x0E";
80 
81 #define CR      0x0D
82 #define LF      0x0A
83 #define H_TAB   0x09
84 #define V_TAB   0x0B
85 #define SPACE   0x20
86 
87 enum {
88     HWKANA_START=0xff61,
89     HWKANA_END=0xff9f
90 };
91 
92 /*
93  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94  * as bytes 21..7E. (Subtract 0x80.)
95  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96  * as bytes 20..7F. (Subtract 0x80.)
97  * Do not encode C1 control codes with native bytes 80..9F
98  * as bytes 00..1F (C0 control codes).
99  */
100 enum {
101     GR94_START=0xa1,
102     GR94_END=0xfe,
103     GR96_START=0xa0,
104     GR96_END=0xff
105 };
106 
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is done on the unsigned value so that a negative signed
 * argument is rejected before it can be used as a shift count.
 * Note: the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
114 
115 /* for ISO-2022-JP and -CN implementations */
116 typedef enum  {
117         /* shared values */
118         INVALID_STATE=-1,
119         ASCII = 0,
120 
121         SS2_STATE=0x10,
122         SS3_STATE,
123 
124         /* JP */
125         ISO8859_1 = 1 ,
126         ISO8859_7 = 2 ,
127         JISX201  = 3,
128         JISX208 = 4,
129         JISX212 = 5,
130         GB2312  =6,
131         KSC5601 =7,
132         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
133 
134         /* CN */
135         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
136         GB2312_1=1,
137         ISO_IR_165=2,
138         CNS_11643=3,
139 
140         /*
141          * these are used in StateEnum and ISO2022State variables,
142          * but CNS_11643 must be used to index into myConverterArray[]
143          */
144         CNS_11643_0=0x20,
145         CNS_11643_1,
146         CNS_11643_2,
147         CNS_11643_3,
148         CNS_11643_4,
149         CNS_11643_5,
150         CNS_11643_6,
151         CNS_11643_7
152 } StateEnum;
153 
154 /* is the StateEnum charset value for a DBCS charset? */
155 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
156 
157 #define CSM(cs) ((uint16_t)1<<(cs))
158 
159 /*
160  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162  *
163  * Note: The converter uses some leniency:
164  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165  *   all versions, not just JIS7 and JIS8.
166  * - ICU does not distinguish between different versions of JIS X 0208.
167  */
168 static const uint16_t jpCharsetMasks[5]={
169     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
170     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
171     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
172     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
174 };
175 
176 typedef enum {
177         ASCII1=0,
178         LATIN1,
179         SBCS,
180         DBCS,
181         MBCS,
182         HWKANA
183 }Cnv2022Type;
184 
185 typedef struct ISO2022State {
186     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
187     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
188     int8_t prevG;       /* g before single shift (SS2 or SS3) */
189 } ISO2022State;
190 
191 #define UCNV_OPTIONS_VERSION_MASK 0xf
192 #define UCNV_2022_MAX_CONVERTERS 10
193 
194 typedef struct{
195     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
196     UConverter *currentConverter;
197     Cnv2022Type currentType;
198     ISO2022State toU2022State, fromU2022State;
199     uint32_t key;
200     uint32_t version;
201 #ifdef U_ENABLE_GENERIC_ISO_2022
202     UBool isFirstBuffer;
203 #endif
204     char name[30];
205     char locale[3];
206 }UConverterDataISO2022;
207 
208 /* Protos */
209 /* ISO-2022 ----------------------------------------------------------------- */
210 
211 /*Forward declaration */
212 U_CFUNC void
213 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
214                       UErrorCode * err);
215 U_CFUNC void
216 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
217                                     UErrorCode * err);
218 
219 #define ESC_2022 0x1B /*ESC*/
220 
221 typedef enum
222 {
223         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
224         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
225         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
226         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
227 } UCNV_TableStates_2022;
228 
229 /*
230 * The way these state transition arrays work is:
231 * ex : ESC$B is the sequence for JISX208
232 *      a) First Iteration: char is ESC
233 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
234 *             int x = normalize_esq_chars_2022[27] which is equal to 1
235 *         ii) Search for this value in escSeqStateTable_Key_2022[]
236 *             value of x is stored at escSeqStateTable_Key_2022[0]
237 *        iii) Save this index as offset
238 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
239 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
240 *     b) Switch on this state and continue to next char
241 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
242 *             which is normalize_esq_chars_2022[36] == 4
243 *         ii) x is currently 1(from above)
244 *               x<<=5 -- x is now 32
245 *               x+=normalize_esq_chars_2022[36]
246 *               now x is 36
247 *        iii) Search for this value in escSeqStateTable_Key_2022[]
248 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
249 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
250 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
251 *     c) Switch on this state and continue to next char
252 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
253 *        ii) x is currently 36 (from above)
254 *            x<<=5 -- x is now 1152
255 *            x+=normalize_esq_chars_2022[66]
256 *            now x is 1161
257 *       iii) Search for this value in escSeqStateTable_Key_2022[]
258 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
259 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
260 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
261 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
262 */
263 
264 
265 /*Below are the 3 arrays depicting a state transition table*/
266 static const int8_t normalize_esq_chars_2022[256] = {
267 /*       0      1       2       3       4      5       6        7       8       9           */
268 
269          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
270         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
271         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
272         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
273         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
274         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
275         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
276         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
277         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
278         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
279         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
280         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
281         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
282         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
283         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
284         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
285         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
289         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
292         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
293         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
294         ,0     ,0      ,0      ,0      ,0      ,0
295 };
296 
297 #ifdef U_ENABLE_GENERIC_ISO_2022
298 /*
299  * When the generic ISO-2022 converter is completely removed, not just disabled
300  * per #ifdef, then the following state table and the associated tables that are
301  * dimensioned with MAX_STATES_2022 should be trimmed.
302  *
303  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
304  * the associated escape sequences starting with ESC ( B should be removed.
305  * This includes the ones with key values 1097 and all of the ones above 1000000.
306  *
307  * For the latter, the tables can simply be truncated.
308  * For the former, since the tables must be kept parallel, it is probably best
309  * to simply duplicate an adjacent table cell, parallel in all tables.
310  *
311  * It may make sense to restructure the tables, especially by using small search
312  * tables for the variants instead of indexing them parallel to the table here.
313  */
314 #endif
315 
316 #define MAX_STATES_2022 74
317 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
318 /*   0           1           2           3           4           5           6           7           8           9           */
319 
320      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
321     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
322     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
323     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
324     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
325     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
326     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
327     ,35947631   ,35947635   ,35947636   ,35947638
328 };
329 
330 #ifdef U_ENABLE_GENERIC_ISO_2022
331 
332 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
333  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
334 
335      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
336     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
337     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
338     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
339     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
340     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
341     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
342     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
343 };
344 
345 #endif
346 
347 static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
348 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
349      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
350     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
351     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
352     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
353     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
354     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
355     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
356     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
357 };
358 
359 
360 /* Type def for refactoring changeState_2022 code*/
361 typedef enum{
362 #ifdef U_ENABLE_GENERIC_ISO_2022
363     ISO_2022=0,
364 #endif
365     ISO_2022_JP=1,
366     ISO_2022_KR=2,
367     ISO_2022_CN=3
368 } Variant2022;
369 
370 /*********** ISO 2022 Converter Protos ***********/
371 static void
372 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
373 
374 static void
375  _ISO2022Close(UConverter *converter);
376 
377 static void
378 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
379 
380 static const char*
381 _ISO2022getName(const UConverter* cnv);
382 
383 static void
384 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
385 
386 static UConverter *
387 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
388 
389 #ifdef U_ENABLE_GENERIC_ISO_2022
390 static void
391 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
392 #endif
393 
394 /*const UConverterSharedData _ISO2022Data;*/
395 static const UConverterSharedData _ISO2022JPData;
396 static const UConverterSharedData _ISO2022KRData;
397 static const UConverterSharedData _ISO2022CNData;
398 
399 /*************** Converter implementations ******************/
400 
401 /* The purpose of this function is to get around gcc compiler warnings. */
402 static U_INLINE void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)403 fromUWriteUInt8(UConverter *cnv,
404                  const char *bytes, int32_t length,
405                  uint8_t **target, const char *targetLimit,
406                  int32_t **offsets,
407                  int32_t sourceIndex,
408                  UErrorCode *pErrorCode)
409 {
410     char *targetChars = (char *)*target;
411     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
412                          offsets, sourceIndex, pErrorCode);
413     *target = (uint8_t*)targetChars;
414 
415 }
416 
417 static U_INLINE void
setInitialStateToUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)418 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
419     if(myConverterData->version == 1) {
420         UConverter *cnv = myConverterData->currentConverter;
421 
422         cnv->toUnicodeStatus=0;     /* offset */
423         cnv->mode=0;                /* state */
424         cnv->toULength=0;           /* byteIndex */
425     }
426 }
427 
428 static U_INLINE void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)429 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
430    /* in ISO-2022-KR the designator sequence appears only once
431     * in a file so we append it only once
432     */
433     if( converter->charErrorBufferLength==0){
434 
435         converter->charErrorBufferLength = 4;
436         converter->charErrorBuffer[0] = 0x1b;
437         converter->charErrorBuffer[1] = 0x24;
438         converter->charErrorBuffer[2] = 0x29;
439         converter->charErrorBuffer[3] = 0x43;
440     }
441     if(myConverterData->version == 1) {
442         UConverter *cnv = myConverterData->currentConverter;
443 
444         cnv->fromUChar32=0;
445         cnv->fromUnicodeStatus=1;   /* prevLength */
446     }
447 }
448 
449 static void
_ISO2022Open(UConverter * cnv,const char * name,const char * locale,uint32_t options,UErrorCode * errorCode)450 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
451 
452     char myLocale[6]={' ',' ',' ',' ',' ',' '};
453 
454     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
455     if(cnv->extraInfo != NULL) {
456         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
457         uint32_t version;
458 
459         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
460         myConverterData->currentType = ASCII1;
461         cnv->fromUnicodeStatus =FALSE;
462         if(locale){
463             uprv_strncpy(myLocale, locale, sizeof(myLocale));
464         }
465         version = options & UCNV_OPTIONS_VERSION_MASK;
466         myConverterData->version = version;
467 
468         // BEGIN android-changed
469         /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
470         /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
471         if((myLocale[0]=='j' &&
472             (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
473              myLocale[1]=='s') &&
474             (myLocale[2]=='_' || myLocale[2]=='\0')))
475         {
476             size_t len=0;
477             /* open the required converters and cache them */
478             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
479                 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
480             }
481             if (myLocale[1]=='k') {  /* Use KDDI's version. */
482                 myConverterData->myConverterArray[JISX208]  = ucnv_loadSharedData("kddi-jisx-208-2007", NULL, errorCode);
483             } else if (myLocale[1]=='s') {  /* Use SoftBank's version. */
484                 myConverterData->myConverterArray[JISX208]  = ucnv_loadSharedData("softbank-jisx-208-2007", NULL, errorCode);
485             } else {
486                 myConverterData->myConverterArray[JISX208]  = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
487             }
488             // END android-changed
489 
490             if(jpCharsetMasks[version]&CSM(JISX212)) {
491                 myConverterData->myConverterArray[JISX212]  = ucnv_loadSharedData("jisx-212", NULL, errorCode);
492             }
493             if(jpCharsetMasks[version]&CSM(GB2312)) {
494                 myConverterData->myConverterArray[GB2312]   = ucnv_loadSharedData("ibm-5478", NULL, errorCode);   /* gb_2312_80-1 */
495             }
496             if(jpCharsetMasks[version]&CSM(KSC5601)) {
497                 myConverterData->myConverterArray[KSC5601]  = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
498             }
499 
500             /* set the function pointers to appropriate funtions */
501             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
502             uprv_strcpy(myConverterData->locale,"ja");
503 
504             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
505             len = uprv_strlen(myConverterData->name);
506             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
507             myConverterData->name[len+1]='\0';
508         }
509         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
510             (myLocale[2]=='_' || myLocale[2]=='\0'))
511         {
512             if (version==1){
513                 myConverterData->currentConverter=
514                     ucnv_open("icu-internal-25546",errorCode);
515 
516                 if (U_FAILURE(*errorCode)) {
517                     _ISO2022Close(cnv);
518                     return;
519                 }
520 
521                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
522                 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
523                 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
524             }else{
525                 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
526 
527                 if (U_FAILURE(*errorCode)) {
528                     _ISO2022Close(cnv);
529                     return;
530                 }
531 
532                 myConverterData->version = 0;
533                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
534             }
535 
536             /* initialize the state variables */
537             setInitialStateToUnicodeKR(cnv, myConverterData);
538             setInitialStateFromUnicodeKR(cnv, myConverterData);
539 
540             /* set the function pointers to appropriate funtions */
541             cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
542             uprv_strcpy(myConverterData->locale,"ko");
543         }
544         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
545             (myLocale[2]=='_' || myLocale[2]=='\0'))
546         {
547 
548             /* open the required converters and cache them */
549             myConverterData->myConverterArray[GB2312_1]         = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
550             if(version==1) {
551                 myConverterData->myConverterArray[ISO_IR_165]   = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
552             }
553             myConverterData->myConverterArray[CNS_11643]        = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
554 
555 
556             /* set the function pointers to appropriate funtions */
557             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
558             uprv_strcpy(myConverterData->locale,"cn");
559 
560             if (version==1){
561                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
562             }else{
563                 myConverterData->version = 0;
564                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
565             }
566         }
567         else{
568 #ifdef U_ENABLE_GENERIC_ISO_2022
569             myConverterData->isFirstBuffer = TRUE;
570 
571             /* append the UTF-8 escape sequence */
572             cnv->charErrorBufferLength = 3;
573             cnv->charErrorBuffer[0] = 0x1b;
574             cnv->charErrorBuffer[1] = 0x25;
575             cnv->charErrorBuffer[2] = 0x42;
576 
577             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
578             /* initialize the state variables */
579             uprv_strcpy(myConverterData->name,"ISO_2022");
580 #else
581             *errorCode = U_UNSUPPORTED_ERROR;
582             return;
583 #endif
584         }
585 
586         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
587 
588         if(U_FAILURE(*errorCode)) {
589             _ISO2022Close(cnv);
590         }
591     } else {
592         *errorCode = U_MEMORY_ALLOCATION_ERROR;
593     }
594 }
595 
596 
597 static void
_ISO2022Close(UConverter * converter)598 _ISO2022Close(UConverter *converter) {
599     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
600     UConverterSharedData **array = myData->myConverterArray;
601     int32_t i;
602 
603     if (converter->extraInfo != NULL) {
604         /*close the array of converter pointers and free the memory*/
605         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
606             if(array[i]!=NULL) {
607                 ucnv_unloadSharedDataIfReady(array[i]);
608             }
609         }
610 
611         ucnv_close(myData->currentConverter);
612 
613         if(!converter->isExtraLocal){
614             uprv_free (converter->extraInfo);
615             converter->extraInfo = NULL;
616         }
617     }
618 }
619 
620 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)621 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
622     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
623     if(choice<=UCNV_RESET_TO_UNICODE) {
624         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
625         myConverterData->key = 0;
626     }
627     if(choice!=UCNV_RESET_TO_UNICODE) {
628         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
629     }
630 #ifdef U_ENABLE_GENERIC_ISO_2022
631     if(myConverterData->locale[0] == 0){
632         if(choice<=UCNV_RESET_TO_UNICODE) {
633             myConverterData->isFirstBuffer = TRUE;
634             myConverterData->key = 0;
635             if (converter->mode == UCNV_SO){
636                 ucnv_close (myConverterData->currentConverter);
637                 myConverterData->currentConverter=NULL;
638             }
639             converter->mode = UCNV_SI;
640         }
641         if(choice!=UCNV_RESET_TO_UNICODE) {
642             /* re-append UTF-8 escape sequence */
643             converter->charErrorBufferLength = 3;
644             converter->charErrorBuffer[0] = 0x1b;
645             converter->charErrorBuffer[1] = 0x28;
646             converter->charErrorBuffer[2] = 0x42;
647         }
648     }
649     else
650 #endif
651     {
652         /* reset the state variables */
653         if(myConverterData->locale[0] == 'k'){
654             if(choice<=UCNV_RESET_TO_UNICODE) {
655                 setInitialStateToUnicodeKR(converter, myConverterData);
656             }
657             if(choice!=UCNV_RESET_TO_UNICODE) {
658                 setInitialStateFromUnicodeKR(converter, myConverterData);
659             }
660         }
661     }
662 }
663 
664 static const char*
_ISO2022getName(const UConverter * cnv)665 _ISO2022getName(const UConverter* cnv){
666     if(cnv->extraInfo){
667         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
668         return myData->name;
669     }
670     return NULL;
671 }
672 
673 
674 /*************** to unicode *******************/
675 /****************************************************************************
676  * Recognized escape sequences are
677  * <ESC>(B  ASCII
678  * <ESC>.A  ISO-8859-1
679  * <ESC>.F  ISO-8859-7
680  * <ESC>(J  JISX-201
681  * <ESC>(I  JISX-201 Katakana (HWKANA_7BIT)
682  * <ESC>$B  JISX-208
683  * <ESC>$@  JISX-208
684  * <ESC>$(D JISX-212
685  * <ESC>$A  GB2312
686  * <ESC>$(C KSC5601
687  */
688 static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
689 /*      0                1               2               3               4               5               6               7               8               9    */
690     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
691     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
692     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
693     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
694     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
695     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
696     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
697     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
698 };
699 
700 /*************** to unicode *******************/
701 static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
702 /*      0                1               2               3               4               5               6               7               8               9    */
703      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
704     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
705     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
706     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
707     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
708     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
709     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
710     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
711 };
712 
713 
714 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)715 getKey_2022(char c,int32_t* key,int32_t* offset){
716     int32_t togo;
717     int32_t low = 0;
718     int32_t hi = MAX_STATES_2022;
719     int32_t oldmid=0;
720 
721     togo = normalize_esq_chars_2022[(uint8_t)c];
722     if(togo == 0) {
723         /* not a valid character anywhere in an escape sequence */
724         *key = 0;
725         *offset = 0;
726         return INVALID_2022;
727     }
728     togo = (*key << 5) + togo;
729 
730     while (hi != low)  /*binary search*/{
731 
732         register int32_t mid = (hi+low) >> 1; /*Finds median*/
733 
734         if (mid == oldmid)
735             break;
736 
737         if (escSeqStateTable_Key_2022[mid] > togo){
738             hi = mid;
739         }
740         else if (escSeqStateTable_Key_2022[mid] < togo){
741             low = mid;
742         }
743         else /*we found it*/{
744             *key = togo;
745             *offset = mid;
746             return escSeqStateTable_Value_2022[mid];
747         }
748         oldmid = mid;
749 
750     }
751 
752     *key = 0;
753     *offset = 0;
754     return INVALID_2022;
755 }
756 
757 /*runs through a state machine to determine the escape sequence - codepage correspondance
758  */
759 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)760 changeState_2022(UConverter* _this,
761                 const char** source,
762                 const char* sourceLimit,
763                 Variant2022 var,
764                 UErrorCode* err){
765     UCNV_TableStates_2022 value;
766     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
767     uint32_t key = myData2022->key;
768     int32_t offset = 0;
769     char c;
770 
771     value = VALID_NON_TERMINAL_2022;
772     while (*source < sourceLimit) {
773         c = *(*source)++;
774         _this->toUBytes[_this->toULength++]=(uint8_t)c;
775         value = getKey_2022(c,(int32_t *) &key, &offset);
776 
777         switch (value){
778 
779         case VALID_NON_TERMINAL_2022 :
780             /* continue with the loop */
781             break;
782 
783         case VALID_TERMINAL_2022:
784             key = 0;
785             goto DONE;
786 
787         case INVALID_2022:
788             goto DONE;
789 
790         case VALID_MAYBE_TERMINAL_2022:
791 #ifdef U_ENABLE_GENERIC_ISO_2022
792             /* ESC ( B is ambiguous only for ISO_2022 itself */
793             if(var == ISO_2022) {
794                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
795                 _this->toULength = 0;
796 
797                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
798 
799                 /* continue with the loop */
800                 value = VALID_NON_TERMINAL_2022;
801                 break;
802             } else
803 #endif
804             {
805                 /* not ISO_2022 itself, finish here */
806                 value = VALID_TERMINAL_2022;
807                 key = 0;
808                 goto DONE;
809             }
810         }
811     }
812 
813 DONE:
814     myData2022->key = key;
815 
816     if (value == VALID_NON_TERMINAL_2022) {
817         /* indicate that the escape sequence is incomplete: key!=0 */
818         return;
819     } else if (value == INVALID_2022 ) {
820         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
821         return;
822     } else /* value == VALID_TERMINAL_2022 */ {
823         switch(var){
824 #ifdef U_ENABLE_GENERIC_ISO_2022
825         case ISO_2022:
826         {
827             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
828             if(chosenConverterName == NULL) {
829                 /* SS2 or SS3 */
830                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
831                 return;
832             }
833 
834             _this->mode = UCNV_SI;
835             ucnv_close(myData2022->currentConverter);
836             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
837             if(U_SUCCESS(*err)) {
838                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
839                 _this->mode = UCNV_SO;
840             }
841             break;
842         }
843 #endif
844         case ISO_2022_JP:
845             {
846                 StateEnum tempState=nextStateToUnicodeJP[offset];
847                 switch(tempState) {
848                 case INVALID_STATE:
849                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
850                     break;
851                 case SS2_STATE:
852                     if(myData2022->toU2022State.cs[2]!=0) {
853                         if(myData2022->toU2022State.g<2) {
854                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
855                         }
856                         myData2022->toU2022State.g=2;
857                     } else {
858                         /* illegal to have SS2 before a matching designator */
859                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
860                     }
861                     break;
862                 /* case SS3_STATE: not used in ISO-2022-JP-x */
863                 case ISO8859_1:
864                 case ISO8859_7:
865                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
866                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
867                     } else {
868                         /* G2 charset for SS2 */
869                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
870                     }
871                     break;
872                 default:
873                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
874                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
875                     } else {
876                         /* G0 charset */
877                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
878                     }
879                     break;
880                 }
881             }
882             break;
883         case ISO_2022_CN:
884             {
885                 StateEnum tempState=nextStateToUnicodeCN[offset];
886                 switch(tempState) {
887                 case INVALID_STATE:
888                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
889                     break;
890                 case SS2_STATE:
891                     if(myData2022->toU2022State.cs[2]!=0) {
892                         if(myData2022->toU2022State.g<2) {
893                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
894                         }
895                         myData2022->toU2022State.g=2;
896                     } else {
897                         /* illegal to have SS2 before a matching designator */
898                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
899                     }
900                     break;
901                 case SS3_STATE:
902                     if(myData2022->toU2022State.cs[3]!=0) {
903                         if(myData2022->toU2022State.g<2) {
904                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
905                         }
906                         myData2022->toU2022State.g=3;
907                     } else {
908                         /* illegal to have SS3 before a matching designator */
909                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
910                     }
911                     break;
912                 case ISO_IR_165:
913                     if(myData2022->version==0) {
914                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
915                         break;
916                     }
917                     /*fall through*/
918                 case GB2312_1:
919                     /*fall through*/
920                 case CNS_11643_1:
921                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
922                     break;
923                 case CNS_11643_2:
924                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
925                     break;
926                 default:
927                     /* other CNS 11643 planes */
928                     if(myData2022->version==0) {
929                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
930                     } else {
931                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
932                     }
933                     break;
934                 }
935             }
936             break;
937         case ISO_2022_KR:
938             if(offset==0x30){
939                 /* nothing to be done, just accept this one escape sequence */
940             } else {
941                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942             }
943             break;
944 
945         default:
946             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
947             break;
948         }
949     }
950     if(U_SUCCESS(*err)) {
951         _this->toULength = 0;
952     }
953 }
954 
955 /*Checks the characters of the buffer against valid 2022 escape sequences
956 *if the match we return a pointer to the initial start of the sequence otherwise
957 *we return sourceLimit
958 */
959 /*for 2022 looks ahead in the stream
960  *to determine the longest possible convertible
961  *data stream
962  */
963 static U_INLINE const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool flush)964 getEndOfBuffer_2022(const char** source,
965                    const char* sourceLimit,
966                    UBool flush){
967 
968     const char* mySource = *source;
969 
970 #ifdef U_ENABLE_GENERIC_ISO_2022
971     if (*source >= sourceLimit)
972         return sourceLimit;
973 
974     do{
975 
976         if (*mySource == ESC_2022){
977             int8_t i;
978             int32_t key = 0;
979             int32_t offset;
980             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
981 
982             /* Kludge: I could not
983             * figure out the reason for validating an escape sequence
984             * twice - once here and once in changeState_2022().
985             * is it possible to have an ESC character in a ISO2022
986             * byte stream which is valid in a code page? Is it legal?
987             */
988             for (i=0;
989             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
990             i++) {
991                 value =  getKey_2022(*(mySource+i), &key, &offset);
992             }
993             if (value > 0 || *mySource==ESC_2022)
994                 return mySource;
995 
996             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
997                 return sourceLimit;
998         }
999     }while (++mySource < sourceLimit);
1000 
1001     return sourceLimit;
1002 #else
1003     while(mySource < sourceLimit && *mySource != ESC_2022) {
1004         ++mySource;
1005     }
1006     return mySource;
1007 #endif
1008 }
1009 
1010 
1011 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1012  * any future change in _MBCSFromUChar32() function should be reflected here.
1013  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1014  */
1015 static U_INLINE int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1016 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1017                                          UChar32 c,
1018                                          uint32_t* value,
1019                                          UBool useFallback,
1020                                          int outputType)
1021 {
1022     const int32_t *cx;
1023     const uint16_t *table;
1024     uint32_t stage2Entry;
1025     uint32_t myValue;
1026     int32_t length;
1027     const uint8_t *p;
1028     /*
1029      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1030      * Use internal version of ucnv_open() that verifies that the new structures are available,
1031      * else U_INTERNAL_PROGRAM_ERROR.
1032      */
1033     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1034     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1035         table=sharedData->mbcs.fromUnicodeTable;
1036         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1037         /* get the bytes and the length for the output */
1038         if(outputType==MBCS_OUTPUT_2){
1039             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1040             if(myValue<=0xff) {
1041                 length=1;
1042             } else {
1043                 length=2;
1044             }
1045         } else /* outputType==MBCS_OUTPUT_3 */ {
1046             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1047             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1048             if(myValue<=0xff) {
1049                 length=1;
1050             } else if(myValue<=0xffff) {
1051                 length=2;
1052             } else {
1053                 length=3;
1054             }
1055         }
1056         /* is this code point assigned, or do we use fallbacks? */
1057         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1058             /* assigned */
1059             *value=myValue;
1060             return length;
1061         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1062             /*
1063              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1064              * There is no way with this data structure for fallback output
1065              * to be a zero byte.
1066              */
1067             *value=myValue;
1068             return -length;
1069         }
1070     }
1071 
1072     cx=sharedData->mbcs.extIndexes;
1073     if(cx!=NULL) {
1074         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1075     }
1076 
1077     /* unassigned */
1078     return 0;
1079 }
1080 
1081 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1082  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1083  * @param retval pointer to output byte
1084  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1085  */
1086 static U_INLINE int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1087 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1088                                        UChar32 c,
1089                                        uint32_t* retval,
1090                                        UBool useFallback)
1091 {
1092     const uint16_t *table;
1093     int32_t value;
1094     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1095     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1096         return 0;
1097     }
1098     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1099     table=sharedData->mbcs.fromUnicodeTable;
1100     /* get the byte for the output */
1101     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1102     /* is this code point assigned, or do we use fallbacks? */
1103     *retval=(uint32_t)(value&0xff);
1104     if(value>=0xf00) {
1105         return 1;  /* roundtrip */
1106     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1107         return -1;  /* fallback taken */
1108     } else {
1109         return 0;  /* no mapping */
1110     }
1111 }
1112 
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range; this includes any value wider than two bytes.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /*
     * The narrowing casts below only look at the low 16/8 bits, so values
     * above 0xffff must be rejected explicitly first.
     */
    if( value <= 0xffff &&
        (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1129 
1130 #ifdef U_ENABLE_GENERIC_ISO_2022
1131 
1132 /**********************************************************************************
1133 *  ISO-2022 Converter
1134 *
1135 *
1136 */
1137 
1138 static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)1139 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1140                                                            UErrorCode* err){
1141     const char* mySourceLimit, *realSourceLimit;
1142     const char* sourceStart;
1143     const UChar* myTargetStart;
1144     UConverter* saveThis;
1145     UConverterDataISO2022* myData;
1146     int8_t length;
1147 
1148     saveThis = args->converter;
1149     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1150 
1151     realSourceLimit = args->sourceLimit;
1152     while (args->source < realSourceLimit) {
1153         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1154             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1155             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1156 
1157             if(args->source < mySourceLimit) {
1158                 if(myData->currentConverter==NULL) {
1159                     myData->currentConverter = ucnv_open("ASCII",err);
1160                     if(U_FAILURE(*err)){
1161                         return;
1162                     }
1163 
1164                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1165                     saveThis->mode = UCNV_SO;
1166                 }
1167 
1168                 /* convert to before the ESC or until the end of the buffer */
1169                 myData->isFirstBuffer=FALSE;
1170                 sourceStart = args->source;
1171                 myTargetStart = args->target;
1172                 args->converter = myData->currentConverter;
1173                 ucnv_toUnicode(args->converter,
1174                     &args->target,
1175                     args->targetLimit,
1176                     &args->source,
1177                     mySourceLimit,
1178                     args->offsets,
1179                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1180                     err);
1181                 args->converter = saveThis;
1182 
1183                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1184                     /* move the overflow buffer */
1185                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1186                     myData->currentConverter->UCharErrorBufferLength = 0;
1187                     if(length > 0) {
1188                         uprv_memcpy(saveThis->UCharErrorBuffer,
1189                                     myData->currentConverter->UCharErrorBuffer,
1190                                     length*U_SIZEOF_UCHAR);
1191                     }
1192                     return;
1193                 }
1194 
1195                 /*
1196                  * At least one of:
1197                  * -Error while converting
1198                  * -Done with entire buffer
1199                  * -Need to write offsets or update the current offset
1200                  *  (leave that up to the code in ucnv.c)
1201                  *
1202                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1203                  */
1204                 if (U_FAILURE(*err) ||
1205                     (args->source == realSourceLimit) ||
1206                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1207                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1208                 ) {
1209                     /* copy partial or error input for truncated detection and error handling */
1210                     if(U_FAILURE(*err)) {
1211                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1212                         if(length > 0) {
1213                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1214                         }
1215                     } else {
1216                         length = saveThis->toULength = myData->currentConverter->toULength;
1217                         if(length > 0) {
1218                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1219                             if(args->source < mySourceLimit) {
1220                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1221                             }
1222                         }
1223                     }
1224                     return;
1225                 }
1226             }
1227         }
1228 
1229         sourceStart = args->source;
1230         changeState_2022(args->converter,
1231                &(args->source),
1232                realSourceLimit,
1233                ISO_2022,
1234                err);
1235         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1236             /* let the ucnv.c code update its current offset */
1237             return;
1238         }
1239     }
1240 }
1241 
1242 #endif
1243 
1244 /*
1245  * To Unicode Callback helper function
1246  */
1247 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1248 toUnicodeCallback(UConverter *cnv,
1249                   const uint32_t sourceChar, const uint32_t targetUniChar,
1250                   UErrorCode* err){
1251     if(sourceChar>0xff){
1252         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1253         cnv->toUBytes[1] = (uint8_t)sourceChar;
1254         cnv->toULength = 2;
1255     }
1256     else{
1257         cnv->toUBytes[0] =(char) sourceChar;
1258         cnv->toULength = 1;
1259     }
1260 
1261     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1262         *err = U_INVALID_CHAR_FOUND;
1263     }
1264     else{
1265         *err = U_ILLEGAL_CHAR_FOUND;
1266     }
1267 }
1268 
1269 /**************************************ISO-2022-JP*************************************************/
1270 
1271 /************************************** IMPORTANT **************************************************
1272 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1273 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1274 * The converter iterates over each Unicode codepoint
1275 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1276 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1277 * would do as far as possible.
1278 *
1279 * If the implementation of these macros or structure of sharedData struct change in the future, make
1280 * sure that ISO-2022 is also changed.
1281 ***************************************************************************************************
1282 */
1283 
1284 /***************************************************************************************************
1285 * Rules for ISO-2022-jp encoding
1286 * (i)   Escape sequences must be fully contained within a line they should not
1287 *       span new lines or CRs
1288 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1289 *       JIS-Roman character escape sequence should follow before the line terminates
1290 * (iii) If the first character on the line is represented by two bytes then a two
1291 *       byte character escape sequence should precede it
1292 * (iv)  If no escape sequence is encountered then the characters are ASCII
1293 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1294 *       and invoked with SS2 (ESC N).
1295 * (vi)  If there is any G0 designation in text, there must be a switch to
1296 *       ASCII or to JIS X 0201-Roman before a space character (but not
1297 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1298 *       characters such as tab or CRLF.
1299 * (vii) Supported encodings:
1300 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1301 *
1302 *  source : RFC-1554
1303 *
1304 *          JISX201, JISX208,JISX212 : new .cnv data files created
1305 *          KSC5601 : alias to ibm-949 mapping table
1306 *          GB2312 : alias to ibm-1386 mapping table
1307 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1308 *          ISO-8859-7 : alias to ibm-9409 mapping table
1309 */
1310 
1311 /* preference order of JP charsets */
1312 static const StateEnum jpCharsetPref[]={
1313     ASCII,
1314     JISX201,
1315     ISO8859_1,
1316     ISO8859_7,
1317     JISX208,
1318     JISX212,
1319     GB2312,
1320     KSC5601,
1321     HWKANA_7BIT
1322 };
1323 
/*
 * Escape sequence bytes for each charset.
 * The rows must follow the order of the enum constants (e.g. JISX201 = 3),
 * not the order of jpCharsetPref[]!  Unused trailing bytes are zero.
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },           /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },           /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },           /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },           /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },           /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 },     /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },           /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 },     /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }            /* <ESC>(I  HWKANA_7BIT */
};
/* Byte length of each escape sequence, in the same row order as escSeqChars[] */
static const int32_t escSeqCharsLen[] ={
    3,  /* <ESC>(B  ASCII       */
    3,  /* <ESC>.A  ISO-8859-1  */
    3,  /* <ESC>.F  ISO-8859-7  */
    3,  /* <ESC>(J  JISX-201    */
    3,  /* <ESC>$B  JISX-208    */
    4,  /* <ESC>$(D JISX-212    */
    3,  /* <ESC>$A  GB2312      */
    4,  /* <ESC>$(C KSC5601     */
    3   /* <ESC>(I  HWKANA_7BIT */
};
1351 
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the
*      current state is equal to the initIterState
*      Yes ->  The character cannot be represented in any of the supported encodings:
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1368 
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Identical to ASCII except for two positions:
 *   5C -> U+00A5 (yen sign)
 *   7E -> U+203E (overline)
 * The caller is expected to pass a value in 00..7F; other values are
 * returned unchanged.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
1382 
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are deliberately unmappable because bytes 5C and 7E
 * mean U+00A5 and U+203E in JIS X 0201 (inverse of jisx201ToU()).
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
        /* U+005C and U+007E fall through to the unmappable result */
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
1397 
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * value holds the pair as (lead<<8)|trail, and so does the result.
 * The input is not validated as Shift-JIS; only the upper bound is checked.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* each Shift-JIS lead byte covers two JIS rows: compute the even row first */
    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    value <<= 1;

    if(trail <= 0x9e) {
        /* first half of the trail range: the odd row just before */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;  /* skip the unused trail byte 7F */
        }
    } else /* trail <= 0xfc */ {
        value |= trail - 0x7e;
    }
    return value;
}
1433 
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1, c2: the JIS X 0208 lead and trail bytes.
 * bytes:  receives the Shift-JIS lead byte in bytes[0] and the trail byte in
 *         bytes[1]; a byte detected as invalid is written as 0.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;  /* skip the unused trail byte 7F */
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two JIS rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1470 
1471 /*
1472  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1473  * Katakana.
1474  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1475  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1476  * These were the only fallbacks in ICU's jisx-208.ucm file.
1477  */
1478 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1479     0x2123,  /* U+FF61 */
1480     0x2156,
1481     0x2157,
1482     0x2122,
1483     0x2126,
1484     0x2572,
1485     0x2521,
1486     0x2523,
1487     0x2525,
1488     0x2527,
1489     0x2529,
1490     0x2563,
1491     0x2565,
1492     0x2567,
1493     0x2543,
1494     0x213C,  /* U+FF70 */
1495     0x2522,
1496     0x2524,
1497     0x2526,
1498     0x2528,
1499     0x252A,
1500     0x252B,
1501     0x252D,
1502     0x252F,
1503     0x2531,
1504     0x2533,
1505     0x2535,
1506     0x2537,
1507     0x2539,
1508     0x253B,
1509     0x253D,
1510     0x253F,  /* U+FF80 */
1511     0x2541,
1512     0x2544,
1513     0x2546,
1514     0x2548,
1515     0x254A,
1516     0x254B,
1517     0x254C,
1518     0x254D,
1519     0x254E,
1520     0x254F,
1521     0x2552,
1522     0x2555,
1523     0x2558,
1524     0x255B,
1525     0x255E,
1526     0x255F,  /* U+FF90 */
1527     0x2560,
1528     0x2561,
1529     0x2562,
1530     0x2564,
1531     0x2566,
1532     0x2568,
1533     0x2569,
1534     0x256A,
1535     0x256B,
1536     0x256C,
1537     0x256D,
1538     0x256F,
1539     0x2573,
1540     0x212B,
1541     0x212C   /* U+FF9F */
1542 };
1543 
1544 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1545 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1546     UConverter *cnv = args->converter;
1547     UConverterDataISO2022 *converterData;
1548     ISO2022State *pFromU2022State;
1549     uint8_t *target = (uint8_t *) args->target;
1550     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1551     const UChar* source = args->source;
1552     const UChar* sourceLimit = args->sourceLimit;
1553     int32_t* offsets = args->offsets;
1554     UChar32 sourceChar;
1555     char buffer[8];
1556     int32_t len, outLen;
1557     int8_t choices[10];
1558     int32_t choiceCount;
1559     uint32_t targetValue = 0;
1560     UBool useFallback;
1561 
1562     int32_t i;
1563     int8_t cs, g;
1564 
1565     /* set up the state */
1566     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1567     pFromU2022State   = &converterData->fromU2022State;
1568 
1569     choiceCount = 0;
1570 
1571     /* check if the last codepoint of previous buffer was a lead surrogate*/
1572     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1573         goto getTrail;
1574     }
1575 
1576     while(source < sourceLimit) {
1577         if(target < targetLimit) {
1578 
1579             sourceChar  = *(source++);
1580             /*check if the char is a First surrogate*/
1581             if(UTF_IS_SURROGATE(sourceChar)) {
1582                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1583 getTrail:
1584                     /*look ahead to find the trail surrogate*/
1585                     if(source < sourceLimit) {
1586                         /* test the following code unit */
1587                         UChar trail=(UChar) *source;
1588                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1589                             source++;
1590                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1591                             cnv->fromUChar32=0x00;
1592                             /* convert this supplementary code point */
1593                             /* exit this condition tree */
1594                         } else {
1595                             /* this is an unmatched lead code unit (1st surrogate) */
1596                             /* callback(illegal) */
1597                             *err=U_ILLEGAL_CHAR_FOUND;
1598                             cnv->fromUChar32=sourceChar;
1599                             break;
1600                         }
1601                     } else {
1602                         /* no more input */
1603                         cnv->fromUChar32=sourceChar;
1604                         break;
1605                     }
1606                 } else {
1607                     /* this is an unmatched trail code unit (2nd surrogate) */
1608                     /* callback(illegal) */
1609                     *err=U_ILLEGAL_CHAR_FOUND;
1610                     cnv->fromUChar32=sourceChar;
1611                     break;
1612                 }
1613             }
1614 
1615             /* do not convert SO/SI/ESC */
1616             if(IS_2022_CONTROL(sourceChar)) {
1617                 /* callback(illegal) */
1618                 *err=U_ILLEGAL_CHAR_FOUND;
1619                 cnv->fromUChar32=sourceChar;
1620                 break;
1621             }
1622 
1623             /* do the conversion */
1624 
1625             if(choiceCount == 0) {
1626                 uint16_t csm;
1627 
1628                 /*
1629                  * The csm variable keeps track of which charsets are allowed
1630                  * and not used yet while building the choices[].
1631                  */
1632                 csm = jpCharsetMasks[converterData->version];
1633                 choiceCount = 0;
1634 
1635                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1636                 if(converterData->version == 3 || converterData->version == 4) {
1637                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1638                 }
1639                 /* Do not try single-byte half-width Katakana for other versions. */
1640                 csm &= ~CSM(HWKANA_7BIT);
1641 
1642                 /* try the current G0 charset */
1643                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1644                 csm &= ~CSM(cs);
1645 
1646                 /* try the current G2 charset */
1647                 if((cs = pFromU2022State->cs[2]) != 0) {
1648                     choices[choiceCount++] = cs;
1649                     csm &= ~CSM(cs);
1650                 }
1651 
1652                 /* try all the other possible charsets */
1653                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1654                     cs = (int8_t)jpCharsetPref[i];
1655                     if(CSM(cs) & csm) {
1656                         choices[choiceCount++] = cs;
1657                         csm &= ~CSM(cs);
1658                     }
1659                 }
1660             }
1661 
1662             cs = g = 0;
1663             /*
1664              * len==0: no mapping found yet
1665              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1666              * len>0: found a roundtrip result, done
1667              */
1668             len = 0;
1669             /*
1670              * We will turn off useFallback after finding a fallback,
1671              * but we still get fallbacks from PUA code points as usual.
1672              * Therefore, we will also need to check that we don't overwrite
1673              * an early fallback with a later one.
1674              */
1675             useFallback = cnv->useFallback;
1676 
1677             for(i = 0; i < choiceCount && len <= 0; ++i) {
1678                 uint32_t value;
1679                 int32_t len2;
1680                 int8_t cs0 = choices[i];
1681                 switch(cs0) {
1682                 case ASCII:
1683                     if(sourceChar <= 0x7f) {
1684                         targetValue = (uint32_t)sourceChar;
1685                         len = 1;
1686                         cs = cs0;
1687                         g = 0;
1688                     }
1689                     break;
1690                 case ISO8859_1:
1691                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1692                         targetValue = (uint32_t)sourceChar - 0x80;
1693                         len = 1;
1694                         cs = cs0;
1695                         g = 2;
1696                     }
1697                     break;
1698                 case HWKANA_7BIT:
1699                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1700                         if(converterData->version==3) {
1701                             /* JIS7: use G1 (SO) */
1702                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1703                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1704                             len = 1;
1705                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1706                             g = 1;
1707                         } else if(converterData->version==4) {
1708                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1709                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1710                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1711                             len = 1;
1712 
1713                             cs = pFromU2022State->cs[0];
1714                             if(IS_JP_DBCS(cs)) {
1715                                 /* switch from a DBCS charset to JISX201 */
1716                                 cs = (int8_t)JISX201;
1717                             }
1718                             /* else stay in the current G0 charset */
1719                             g = 0;
1720                         }
1721                         /* else do not use HWKANA_7BIT with other versions */
1722                     }
1723                     break;
1724                 case JISX201:
1725                     /* G0 SBCS */
1726                     value = jisx201FromU(sourceChar);
1727                     if(value <= 0x7f) {
1728                         targetValue = value;
1729                         len = 1;
1730                         cs = cs0;
1731                         g = 0;
1732                         useFallback = FALSE;
1733                     }
1734                     break;
1735                 case JISX208:
1736                     /* G0 DBCS from Shift-JIS table */
1737                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1738                                 converterData->myConverterArray[cs0],
1739                                 sourceChar, &value,
1740                                 useFallback, MBCS_OUTPUT_2);
1741                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1742                         value = _2022FromSJIS(value);
1743                         if(value != 0) {
1744                             targetValue = value;
1745                             len = len2;
1746                             cs = cs0;
1747                             g = 0;
1748                             useFallback = FALSE;
1749                         }
1750                     } else if(len == 0 && useFallback &&
1751                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1752                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1753                         len = -2;
1754                         cs = cs0;
1755                         g = 0;
1756                         useFallback = FALSE;
1757                     }
1758                     break;
1759                 case ISO8859_7:
1760                     /* G0 SBCS forced to 7-bit output */
1761                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1762                                 converterData->myConverterArray[cs0],
1763                                 sourceChar, &value,
1764                                 useFallback);
1765                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1766                         targetValue = value - 0x80;
1767                         len = len2;
1768                         cs = cs0;
1769                         g = 2;
1770                         useFallback = FALSE;
1771                     }
1772                     break;
1773                 default:
1774                     /* G0 DBCS */
1775                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1776                                 converterData->myConverterArray[cs0],
1777                                 sourceChar, &value,
1778                                 useFallback, MBCS_OUTPUT_2);
1779                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1780                         if(cs0 == KSC5601) {
1781                             /*
1782                              * Check for valid bytes for the encoding scheme.
1783                              * This is necessary because the sub-converter (windows-949)
1784                              * has a broader encoding scheme than is valid for 2022.
1785                              */
1786                             value = _2022FromGR94DBCS(value);
1787                             if(value == 0) {
1788                                 break;
1789                             }
1790                         }
1791                         targetValue = value;
1792                         len = len2;
1793                         cs = cs0;
1794                         g = 0;
1795                         useFallback = FALSE;
1796                     }
1797                     break;
1798                 }
1799             }
1800 
1801             if(len != 0) {
1802                 if(len < 0) {
1803                     len = -len;  /* fallback */
1804                 }
1805                 outLen = 0; /* count output bytes */
1806 
1807                 /* write SI if necessary (only for JIS7) */
1808                 if(pFromU2022State->g == 1 && g == 0) {
1809                     buffer[outLen++] = UCNV_SI;
1810                     pFromU2022State->g = 0;
1811                 }
1812 
1813                 /* write the designation sequence if necessary */
1814                 if(cs != pFromU2022State->cs[g]) {
1815                     int32_t escLen = escSeqCharsLen[cs];
1816                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1817                     outLen += escLen;
1818                     pFromU2022State->cs[g] = cs;
1819 
1820                     /* invalidate the choices[] */
1821                     choiceCount = 0;
1822                 }
1823 
1824                 /* write the shift sequence if necessary */
1825                 if(g != pFromU2022State->g) {
1826                     switch(g) {
1827                     /* case 0 handled before writing escapes */
1828                     case 1:
1829                         buffer[outLen++] = UCNV_SO;
1830                         pFromU2022State->g = 1;
1831                         break;
1832                     default: /* case 2 */
1833                         buffer[outLen++] = 0x1b;
1834                         buffer[outLen++] = 0x4e;
1835                         break;
1836                     /* no case 3: no SS3 in ISO-2022-JP-x */
1837                     }
1838                 }
1839 
1840                 /* write the output bytes */
1841                 if(len == 1) {
1842                     buffer[outLen++] = (char)targetValue;
1843                 } else /* len == 2 */ {
1844                     buffer[outLen++] = (char)(targetValue >> 8);
1845                     buffer[outLen++] = (char)targetValue;
1846                 }
1847             } else {
1848                 /*
1849                  * if we cannot find the character after checking all codepages
1850                  * then this is an error
1851                  */
1852                 *err = U_INVALID_CHAR_FOUND;
1853                 cnv->fromUChar32=sourceChar;
1854                 break;
1855             }
1856 
1857             if(sourceChar == CR || sourceChar == LF) {
1858                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1859                 pFromU2022State->cs[2] = 0;
1860                 choiceCount = 0;
1861             }
1862 
1863             /* output outLen>0 bytes in buffer[] */
1864             if(outLen == 1) {
1865                 *target++ = buffer[0];
1866                 if(offsets) {
1867                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1868                 }
1869             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1870                 *target++ = buffer[0];
1871                 *target++ = buffer[1];
1872                 if(offsets) {
1873                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1874                     *offsets++ = sourceIndex;
1875                     *offsets++ = sourceIndex;
1876                 }
1877             } else {
1878                 fromUWriteUInt8(
1879                     cnv,
1880                     buffer, outLen,
1881                     &target, (const char *)targetLimit,
1882                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1883                     err);
1884                 if(U_FAILURE(*err)) {
1885                     break;
1886                 }
1887             }
1888         } /* end if(myTargetIndex<myTargetLength) */
1889         else{
1890             *err =U_BUFFER_OVERFLOW_ERROR;
1891             break;
1892         }
1893 
1894     }/* end while(mySourceIndex<mySourceLength) */
1895 
1896     /*
1897      * the end of the input stream and detection of truncated input
1898      * are handled by the framework, but for ISO-2022-JP conversion
1899      * we need to be in ASCII mode at the very end
1900      *
1901      * conditions:
1902      *   successful
1903      *   in SO mode or not in ASCII mode
1904      *   end of input and no truncated input
1905      */
1906     if( U_SUCCESS(*err) &&
1907         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1908         args->flush && source>=sourceLimit && cnv->fromUChar32==0
1909     ) {
1910         int32_t sourceIndex;
1911 
1912         outLen = 0;
1913 
1914         if(pFromU2022State->g != 0) {
1915             buffer[outLen++] = UCNV_SI;
1916             pFromU2022State->g = 0;
1917         }
1918 
1919         if(pFromU2022State->cs[0] != ASCII) {
1920             int32_t escLen = escSeqCharsLen[ASCII];
1921             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1922             outLen += escLen;
1923             pFromU2022State->cs[0] = (int8_t)ASCII;
1924         }
1925 
1926         /* get the source index of the last input character */
1927         /*
1928          * TODO this would be simpler and more reliable if we used a pair
1929          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1930          * so that we could simply use the prevSourceIndex here;
1931          * this code gives an incorrect result for the rare case of an unmatched
1932          * trail surrogate that is alone in the last buffer of the text stream
1933          */
1934         sourceIndex=(int32_t)(source-args->source);
1935         if(sourceIndex>0) {
1936             --sourceIndex;
1937             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1938                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1939             ) {
1940                 --sourceIndex;
1941             }
1942         } else {
1943             sourceIndex=-1;
1944         }
1945 
1946         fromUWriteUInt8(
1947             cnv,
1948             buffer, outLen,
1949             &target, (const char *)targetLimit,
1950             &offsets, sourceIndex,
1951             err);
1952     }
1953 
1954     /*save the state and return */
1955     args->source = source;
1956     args->target = (char*)target;
1957 }
1958 
1959 /*************** to unicode *******************/
1960 
1961 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)1962 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1963                                                UErrorCode* err){
1964     char tempBuf[2];
1965     const char *mySource = (char *) args->source;
1966     UChar *myTarget = args->target;
1967     const char *mySourceLimit = args->sourceLimit;
1968     uint32_t targetUniChar = 0x0000;
1969     uint32_t mySourceChar = 0x0000;
1970     UConverterDataISO2022* myData;
1971     ISO2022State *pToU2022State;
1972     StateEnum cs;
1973 
1974     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1975     pToU2022State = &myData->toU2022State;
1976 
1977     if(myData->key != 0) {
1978         /* continue with a partial escape sequence */
1979         goto escape;
1980     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1981         /* continue with a partial double-byte character */
1982         mySourceChar = args->converter->toUBytes[0];
1983         args->converter->toULength = 0;
1984         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1985         goto getTrailByte;
1986     }
1987 
1988     while(mySource < mySourceLimit){
1989 
1990         targetUniChar =missingCharMarker;
1991 
1992         if(myTarget < args->targetLimit){
1993 
1994             mySourceChar= (unsigned char) *mySource++;
1995 
1996             switch(mySourceChar) {
1997             case UCNV_SI:
1998                 if(myData->version==3) {
1999                     pToU2022State->g=0;
2000                     continue;
2001                 } else {
2002                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2003                     break;
2004                 }
2005 
2006             case UCNV_SO:
2007                 if(myData->version==3) {
2008                     /* JIS7: switch to G1 half-width Katakana */
2009                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2010                     pToU2022State->g=1;
2011                     continue;
2012                 } else {
2013                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2014                     break;
2015                 }
2016 
2017             case ESC_2022:
2018                 mySource--;
2019 escape:
2020                 changeState_2022(args->converter,&(mySource),
2021                     mySourceLimit, ISO_2022_JP,err);
2022 
2023                 /* invalid or illegal escape sequence */
2024                 if(U_FAILURE(*err)){
2025                     args->target = myTarget;
2026                     args->source = mySource;
2027                     return;
2028                 }
2029                 continue;
2030 
2031             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2032 
2033             case CR:
2034                 /*falls through*/
2035             case LF:
2036                 /* automatically reset to single-byte mode */
2037                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2038                     pToU2022State->cs[0] = (int8_t)ASCII;
2039                 }
2040                 pToU2022State->cs[2] = 0;
2041                 pToU2022State->g = 0;
2042                 /* falls through */
2043             default:
2044                 /* convert one or two bytes */
2045                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2046                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2047                     !IS_JP_DBCS(cs)
2048                 ) {
2049                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2050                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2051 
2052                     /* return from a single-shift state to the previous one */
2053                     if(pToU2022State->g >= 2) {
2054                         pToU2022State->g=pToU2022State->prevG;
2055                     }
2056                 } else switch(cs) {
2057                 case ASCII:
2058                     if(mySourceChar <= 0x7f) {
2059                         targetUniChar = mySourceChar;
2060                     }
2061                     break;
2062                 case ISO8859_1:
2063                     if(mySourceChar <= 0x7f) {
2064                         targetUniChar = mySourceChar + 0x80;
2065                     }
2066                     /* return from a single-shift state to the previous one */
2067                     pToU2022State->g=pToU2022State->prevG;
2068                     break;
2069                 case ISO8859_7:
2070                     if(mySourceChar <= 0x7f) {
2071                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2072                         targetUniChar =
2073                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2074                                 myData->myConverterArray[cs],
2075                                 mySourceChar + 0x80);
2076                     }
2077                     /* return from a single-shift state to the previous one */
2078                     pToU2022State->g=pToU2022State->prevG;
2079                     break;
2080                 case JISX201:
2081                     if(mySourceChar <= 0x7f) {
2082                         targetUniChar = jisx201ToU(mySourceChar);
2083                     }
2084                     break;
2085                 case HWKANA_7BIT:
2086                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2087                         /* 7-bit halfwidth Katakana */
2088                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2089                     }
2090                     break;
2091                 default:
2092                     /* G0 DBCS */
2093                     if(mySource < mySourceLimit) {
2094                         char trailByte;
2095 getTrailByte:
2096                         trailByte = *mySource++;
2097                         if(cs == JISX208) {
2098                             _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
2099                         } else {
2100                             tempBuf[0] = (char)mySourceChar;
2101                             tempBuf[1] = trailByte;
2102                         }
2103                         mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2104                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2105                     } else {
2106                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2107                         args->converter->toULength = 1;
2108                         goto endloop;
2109                     }
2110                 }  /* End of inner switch */
2111                 break;
2112             }  /* End of outer switch */
2113             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2114                 if(args->offsets){
2115                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2116                 }
2117                 *(myTarget++)=(UChar)targetUniChar;
2118             }
2119             else if(targetUniChar > missingCharMarker){
2120                 /* disassemble the surrogate pair and write to output*/
2121                 targetUniChar-=0x0010000;
2122                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2123                 if(args->offsets){
2124                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2125                 }
2126                 ++myTarget;
2127                 if(myTarget< args->targetLimit){
2128                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2129                     if(args->offsets){
2130                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2131                     }
2132                     ++myTarget;
2133                 }else{
2134                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2135                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2136                 }
2137 
2138             }
2139             else{
2140                 /* Call the callback function*/
2141                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2142                 break;
2143             }
2144         }
2145         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2146             *err =U_BUFFER_OVERFLOW_ERROR;
2147             break;
2148         }
2149     }
2150 endloop:
2151     args->target = myTarget;
2152     args->source = mySource;
2153 }
2154 
2155 
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2164 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2165 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2166 
2167     UConverter* saveConv = args->converter;
2168     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2169     args->converter=myConverterData->currentConverter;
2170 
2171     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2172     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2173     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2174 
2175     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2176         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2177             uprv_memcpy(
2178                 saveConv->charErrorBuffer,
2179                 myConverterData->currentConverter->charErrorBuffer,
2180                 myConverterData->currentConverter->charErrorBufferLength);
2181         }
2182         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2183         myConverterData->currentConverter->charErrorBufferLength = 0;
2184     }
2185     args->converter=saveConv;
2186 }
2187 
2188 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2189 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2190 
2191     const UChar *source = args->source;
2192     const UChar *sourceLimit = args->sourceLimit;
2193     unsigned char *target = (unsigned char *) args->target;
2194     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2195     int32_t* offsets = args->offsets;
2196     uint32_t targetByteUnit = 0x0000;
2197     UChar32 sourceChar = 0x0000;
2198     UBool isTargetByteDBCS;
2199     UBool oldIsTargetByteDBCS;
2200     UConverterDataISO2022 *converterData;
2201     UConverterSharedData* sharedData;
2202     UBool useFallback;
2203     int32_t length =0;
2204 
2205     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2206     /* if the version is 1 then the user is requesting
2207      * conversion with ibm-25546 pass the arguments to
2208      * MBCS converter and return
2209      */
2210     if(converterData->version==1){
2211         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2212         return;
2213     }
2214 
2215     /* initialize data */
2216     sharedData = converterData->currentConverter->sharedData;
2217     useFallback = args->converter->useFallback;
2218     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2219     oldIsTargetByteDBCS = isTargetByteDBCS;
2220 
2221     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2222     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2223         goto getTrail;
2224     }
2225     while(source < sourceLimit){
2226 
2227         targetByteUnit = missingCharMarker;
2228 
2229         if(target < (unsigned char*) args->targetLimit){
2230             sourceChar = *source++;
2231 
2232             /* do not convert SO/SI/ESC */
2233             if(IS_2022_CONTROL(sourceChar)) {
2234                 /* callback(illegal) */
2235                 *err=U_ILLEGAL_CHAR_FOUND;
2236                 args->converter->fromUChar32=sourceChar;
2237                 break;
2238             }
2239 
2240             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2241             if(length < 0) {
2242                 length = -length;  /* fallback */
2243             }
2244             /* only DBCS or SBCS characters are expected*/
2245             /* DB characters with high bit set to 1 are expected */
2246             if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
2247                 targetByteUnit=missingCharMarker;
2248             }
2249             if (targetByteUnit != missingCharMarker){
2250 
2251                 oldIsTargetByteDBCS = isTargetByteDBCS;
2252                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2253                   /* append the shift sequence */
2254                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2255 
2256                     if (isTargetByteDBCS)
2257                         *target++ = UCNV_SO;
2258                     else
2259                         *target++ = UCNV_SI;
2260                     if(offsets)
2261                         *(offsets++) = (int32_t)(source - args->source-1);
2262                 }
2263                 /* write the targetUniChar  to target */
2264                 if(targetByteUnit <= 0x00FF){
2265                     if( target < targetLimit){
2266                         *(target++) = (unsigned char) targetByteUnit;
2267                         if(offsets){
2268                             *(offsets++) = (int32_t)(source - args->source-1);
2269                         }
2270 
2271                     }else{
2272                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2273                         *err = U_BUFFER_OVERFLOW_ERROR;
2274                     }
2275                 }else{
2276                     if(target < targetLimit){
2277                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2278                         if(offsets){
2279                             *(offsets++) = (int32_t)(source - args->source-1);
2280                         }
2281                         if(target < targetLimit){
2282                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2283                             if(offsets){
2284                                 *(offsets++) = (int32_t)(source - args->source-1);
2285                             }
2286                         }else{
2287                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2288                             *err = U_BUFFER_OVERFLOW_ERROR;
2289                         }
2290                     }else{
2291                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2292                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2293                         *err = U_BUFFER_OVERFLOW_ERROR;
2294                     }
2295                 }
2296 
2297             }
2298             else{
2299                 /* oops.. the code point is unassingned
2300                  * set the error and reason
2301                  */
2302 
2303                 /*check if the char is a First surrogate*/
2304                 if(UTF_IS_SURROGATE(sourceChar)) {
2305                     if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2306 getTrail:
2307                         /*look ahead to find the trail surrogate*/
2308                         if(source <  sourceLimit) {
2309                             /* test the following code unit */
2310                             UChar trail=(UChar) *source;
2311                             if(UTF_IS_SECOND_SURROGATE(trail)) {
2312                                 source++;
2313                                 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2314                                 *err = U_INVALID_CHAR_FOUND;
2315                                 /* convert this surrogate code point */
2316                                 /* exit this condition tree */
2317                             } else {
2318                                 /* this is an unmatched lead code unit (1st surrogate) */
2319                                 /* callback(illegal) */
2320                                 *err=U_ILLEGAL_CHAR_FOUND;
2321                             }
2322                         } else {
2323                             /* no more input */
2324                             *err = U_ZERO_ERROR;
2325                         }
2326                     } else {
2327                         /* this is an unmatched trail code unit (2nd surrogate) */
2328                         /* callback(illegal) */
2329                         *err=U_ILLEGAL_CHAR_FOUND;
2330                     }
2331                 } else {
2332                     /* callback(unassigned) for a BMP code point */
2333                     *err = U_INVALID_CHAR_FOUND;
2334                 }
2335 
2336                 args->converter->fromUChar32=sourceChar;
2337                 break;
2338             }
2339         } /* end if(myTargetIndex<myTargetLength) */
2340         else{
2341             *err =U_BUFFER_OVERFLOW_ERROR;
2342             break;
2343         }
2344 
2345     }/* end while(mySourceIndex<mySourceLength) */
2346 
2347     /*
2348      * the end of the input stream and detection of truncated input
2349      * are handled by the framework, but for ISO-2022-KR conversion
2350      * we need to be in ASCII mode at the very end
2351      *
2352      * conditions:
2353      *   successful
2354      *   not in ASCII mode
2355      *   end of input and no truncated input
2356      */
2357     if( U_SUCCESS(*err) &&
2358         isTargetByteDBCS &&
2359         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2360     ) {
2361         int32_t sourceIndex;
2362 
2363         /* we are switching to ASCII */
2364         isTargetByteDBCS=FALSE;
2365 
2366         /* get the source index of the last input character */
2367         /*
2368          * TODO this would be simpler and more reliable if we used a pair
2369          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2370          * so that we could simply use the prevSourceIndex here;
2371          * this code gives an incorrect result for the rare case of an unmatched
2372          * trail surrogate that is alone in the last buffer of the text stream
2373          */
2374         sourceIndex=(int32_t)(source-args->source);
2375         if(sourceIndex>0) {
2376             --sourceIndex;
2377             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2378                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2379             ) {
2380                 --sourceIndex;
2381             }
2382         } else {
2383             sourceIndex=-1;
2384         }
2385 
2386         fromUWriteUInt8(
2387             args->converter,
2388             SHIFT_IN_STR, 1,
2389             &target, (const char *)targetLimit,
2390             &offsets, sourceIndex,
2391             err);
2392     }
2393 
2394     /*save the state and return */
2395     args->source = source;
2396     args->target = (char*)target;
2397     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2398 }
2399 
2400 /************************ To Unicode ***************************************/
2401 
2402 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2403 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2404                                                             UErrorCode* err){
2405     char const* sourceStart;
2406     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2407 
2408     UConverterToUnicodeArgs subArgs;
2409     int32_t minArgsSize;
2410 
2411     /* set up the subconverter arguments */
2412     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2413         minArgsSize = args->size;
2414     } else {
2415         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2416     }
2417 
2418     uprv_memcpy(&subArgs, args, minArgsSize);
2419     subArgs.size = (uint16_t)minArgsSize;
2420     subArgs.converter = myData->currentConverter;
2421 
2422     /* remember the original start of the input for offsets */
2423     sourceStart = args->source;
2424 
2425     if(myData->key != 0) {
2426         /* continue with a partial escape sequence */
2427         goto escape;
2428     }
2429 
2430     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2431         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2432         subArgs.source = args->source;
2433         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2434         if(subArgs.source != subArgs.sourceLimit) {
2435             /*
2436              * get the current partial byte sequence
2437              *
2438              * it needs to be moved between the public and the subconverter
2439              * so that the conversion framework, which only sees the public
2440              * converter, can handle truncated and illegal input etc.
2441              */
2442             if(args->converter->toULength > 0) {
2443                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2444             }
2445             subArgs.converter->toULength = args->converter->toULength;
2446 
2447             /*
2448              * Convert up to the end of the input, or to before the next escape character.
2449              * Does not handle conversion extensions because the preToU[] state etc.
2450              * is not copied.
2451              */
2452             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2453 
2454             if(args->offsets != NULL && sourceStart != args->source) {
2455                 /* update offsets to base them on the actual start of the input */
2456                 int32_t *offsets = args->offsets;
2457                 UChar *target = args->target;
2458                 int32_t delta = (int32_t)(args->source - sourceStart);
2459                 while(target < subArgs.target) {
2460                     if(*offsets >= 0) {
2461                         *offsets += delta;
2462                     }
2463                     ++offsets;
2464                     ++target;
2465                 }
2466             }
2467             args->source = subArgs.source;
2468             args->target = subArgs.target;
2469             args->offsets = subArgs.offsets;
2470 
2471             /* copy input/error/overflow buffers */
2472             if(subArgs.converter->toULength > 0) {
2473                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2474             }
2475             args->converter->toULength = subArgs.converter->toULength;
2476 
2477             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2478                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2479                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2480                                 subArgs.converter->UCharErrorBufferLength);
2481                 }
2482                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2483                 subArgs.converter->UCharErrorBufferLength = 0;
2484             }
2485         }
2486 
2487         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2488             return;
2489         }
2490 
2491 escape:
2492         changeState_2022(args->converter,
2493                &(args->source),
2494                args->sourceLimit,
2495                ISO_2022_KR,
2496                err);
2497     }
2498 }
2499 
2500 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2501 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2502                                                             UErrorCode* err){
2503     char tempBuf[2];
2504     const char *mySource = ( char *) args->source;
2505     UChar *myTarget = args->target;
2506     const char *mySourceLimit = args->sourceLimit;
2507     UChar32 targetUniChar = 0x0000;
2508     UChar mySourceChar = 0x0000;
2509     UConverterDataISO2022* myData;
2510     UConverterSharedData* sharedData ;
2511     UBool useFallback;
2512 
2513     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2514     if(myData->version==1){
2515         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2516         return;
2517     }
2518 
2519     /* initialize state */
2520     sharedData = myData->currentConverter->sharedData;
2521     useFallback = args->converter->useFallback;
2522 
2523     if(myData->key != 0) {
2524         /* continue with a partial escape sequence */
2525         goto escape;
2526     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2527         /* continue with a partial double-byte character */
2528         mySourceChar = args->converter->toUBytes[0];
2529         args->converter->toULength = 0;
2530         goto getTrailByte;
2531     }
2532 
2533     while(mySource< mySourceLimit){
2534 
2535         if(myTarget < args->targetLimit){
2536 
2537             mySourceChar= (unsigned char) *mySource++;
2538 
2539             if(mySourceChar==UCNV_SI){
2540                 myData->toU2022State.g = 0;
2541                 /*consume the source */
2542                 continue;
2543             }else if(mySourceChar==UCNV_SO){
2544                 myData->toU2022State.g = 1;
2545                 /*consume the source */
2546                 continue;
2547             }else if(mySourceChar==ESC_2022){
2548                 mySource--;
2549 escape:
2550                 changeState_2022(args->converter,&(mySource),
2551                                 mySourceLimit, ISO_2022_KR, err);
2552                 if(U_FAILURE(*err)){
2553                     args->target = myTarget;
2554                     args->source = mySource;
2555                     return;
2556                 }
2557                 continue;
2558             }
2559 
2560             if(myData->toU2022State.g == 1) {
2561                 if(mySource < mySourceLimit) {
2562                     char trailByte;
2563 getTrailByte:
2564                     trailByte = *mySource++;
2565                     tempBuf[0] = (char)(mySourceChar + 0x80);
2566                     tempBuf[1] = (char)(trailByte + 0x80);
2567                     mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2568                     if((mySourceChar & 0x8080) == 0) {
2569                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2570                     } else {
2571                         /* illegal bytes > 0x7f */
2572                         targetUniChar = missingCharMarker;
2573                     }
2574                 } else {
2575                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2576                     args->converter->toULength = 1;
2577                     break;
2578                 }
2579             }
2580             else{
2581                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2582             }
2583             if(targetUniChar < 0xfffe){
2584                 if(args->offsets) {
2585                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2586                 }
2587                 *(myTarget++)=(UChar)targetUniChar;
2588             }
2589             else {
2590                 /* Call the callback function*/
2591                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2592                 break;
2593             }
2594         }
2595         else{
2596             *err =U_BUFFER_OVERFLOW_ERROR;
2597             break;
2598         }
2599     }
2600     args->target = myTarget;
2601     args->source = mySource;
2602 }
2603 
2604 /*************************** END ISO2022-KR *********************************/
2605 
/*************************** ISO-2022-CN *********************************
*
* Rules for ISO-2022-CN Encoding:
* i)   The designator sequence must appear once on a line before any instance
*      of the character set it designates.
* ii)  If two lines contain characters from the same character set, both lines
*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the shifting.
* iv)  All lines start in ASCII and end in ASCII.
* v)   Four shifting sequences are employed for this purpose:
*
*      Sequence    ASCII Eq    Charsets
*      ----------  -------    ---------
*      SI           <SI>        US-ASCII
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
*
* vi)
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
*      SS2designator : ESC "$" "*" finalchar_for_SS2
*      SS3designator : ESC "$" "+" finalchar_for_SS3
*
*      ESC $ ) A       Indicates the bytes following SO are Chinese
*       characters as defined in GB 2312-80, until
*       another SOdesignation appears
*
*
*      ESC $ ) E       Indicates the bytes following SO are as defined
*       in ISO-IR-165 (for details, see section 2.1),
*       until another SOdesignation appears
*
*      ESC $ ) G       Indicates the bytes following SO are as defined
*       in CNS 11643-plane-1, until another
*       SOdesignation appears
*
*      ESC $ * H       Indicates the two bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must precede every 2 byte
*        sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must precede every 2 byte
*        sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must precede every 2 byte
*        sequence.)
*
*      ESC $ + K       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-5, until another SS3designation
*       appears
*
*      ESC $ + L       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-6, until another SS3designation
*       appears
*
*      ESC $ + M       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-7, until another SS3designation
*       appears
*
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
*       has its own designation information before any Chinese characters
*       appear
*
*/
2684 
/*
 * ISO-2022-CN designator escape sequences.
 * They are defined as static const arrays to make the strings truly read-only.
 * Each one is ESC (0x1B), '$' (0x24), an intermediate byte and a final byte;
 * the readable form is shown next to each definition.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2695 
2696 /********************** ISO2022-CN Data **************************/
2697 static const char* const escSeqCharsCN[10] ={
2698         SHIFT_IN_STR,           /* ASCII */
2699         GB_2312_80_STR,
2700         ISO_IR_165_STR,
2701         CNS_11643_1992_Plane_1_STR,
2702         CNS_11643_1992_Plane_2_STR,
2703         CNS_11643_1992_Plane_3_STR,
2704         CNS_11643_1992_Plane_4_STR,
2705         CNS_11643_1992_Plane_5_STR,
2706         CNS_11643_1992_Plane_6_STR,
2707         CNS_11643_1992_Plane_7_STR
2708 };
2709 
2710 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2711 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2712     UConverter *cnv = args->converter;
2713     UConverterDataISO2022 *converterData;
2714     ISO2022State *pFromU2022State;
2715     uint8_t *target = (uint8_t *) args->target;
2716     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2717     const UChar* source = args->source;
2718     const UChar* sourceLimit = args->sourceLimit;
2719     int32_t* offsets = args->offsets;
2720     UChar32 sourceChar;
2721     char buffer[8];
2722     int32_t len;
2723     int8_t choices[3];
2724     int32_t choiceCount;
2725     uint32_t targetValue = 0;
2726     UBool useFallback;
2727 
2728     /* set up the state */
2729     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2730     pFromU2022State   = &converterData->fromU2022State;
2731 
2732     choiceCount = 0;
2733 
2734     /* check if the last codepoint of previous buffer was a lead surrogate*/
2735     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2736         goto getTrail;
2737     }
2738 
2739     while( source < sourceLimit){
2740         if(target < targetLimit){
2741 
2742             sourceChar  = *(source++);
2743             /*check if the char is a First surrogate*/
2744              if(UTF_IS_SURROGATE(sourceChar)) {
2745                 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2746 getTrail:
2747                     /*look ahead to find the trail surrogate*/
2748                     if(source < sourceLimit) {
2749                         /* test the following code unit */
2750                         UChar trail=(UChar) *source;
2751                         if(UTF_IS_SECOND_SURROGATE(trail)) {
2752                             source++;
2753                             sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2754                             cnv->fromUChar32=0x00;
2755                             /* convert this supplementary code point */
2756                             /* exit this condition tree */
2757                         } else {
2758                             /* this is an unmatched lead code unit (1st surrogate) */
2759                             /* callback(illegal) */
2760                             *err=U_ILLEGAL_CHAR_FOUND;
2761                             cnv->fromUChar32=sourceChar;
2762                             break;
2763                         }
2764                     } else {
2765                         /* no more input */
2766                         cnv->fromUChar32=sourceChar;
2767                         break;
2768                     }
2769                 } else {
2770                     /* this is an unmatched trail code unit (2nd surrogate) */
2771                     /* callback(illegal) */
2772                     *err=U_ILLEGAL_CHAR_FOUND;
2773                     cnv->fromUChar32=sourceChar;
2774                     break;
2775                 }
2776             }
2777 
2778             /* do the conversion */
2779             if(sourceChar <= 0x007f ){
2780                 /* do not convert SO/SI/ESC */
2781                 if(IS_2022_CONTROL(sourceChar)) {
2782                     /* callback(illegal) */
2783                     *err=U_ILLEGAL_CHAR_FOUND;
2784                     cnv->fromUChar32=sourceChar;
2785                     break;
2786                 }
2787 
2788                 /* US-ASCII */
2789                 if(pFromU2022State->g == 0) {
2790                     buffer[0] = (char)sourceChar;
2791                     len = 1;
2792                 } else {
2793                     buffer[0] = UCNV_SI;
2794                     buffer[1] = (char)sourceChar;
2795                     len = 2;
2796                     pFromU2022State->g = 0;
2797                     choiceCount = 0;
2798                 }
2799                 if(sourceChar == CR || sourceChar == LF) {
2800                     /* reset the state at the end of a line */
2801                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2802                     choiceCount = 0;
2803                 }
2804             }
2805             else{
2806                 /* convert U+0080..U+10ffff */
2807                 int32_t i;
2808                 int8_t cs, g;
2809 
2810                 if(choiceCount == 0) {
2811                     /* try the current SO/G1 converter first */
2812                     choices[0] = pFromU2022State->cs[1];
2813 
2814                     /* default to GB2312_1 if none is designated yet */
2815                     if(choices[0] == 0) {
2816                         choices[0] = GB2312_1;
2817                     }
2818 
2819                     if(converterData->version == 0) {
2820                         /* ISO-2022-CN */
2821 
2822                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2823                         if(choices[0] == GB2312_1) {
2824                             choices[1] = (int8_t)CNS_11643_1;
2825                         } else {
2826                             choices[1] = (int8_t)GB2312_1;
2827                         }
2828 
2829                         choiceCount = 2;
2830                     } else {
2831                         /* ISO-2022-CN-EXT */
2832 
2833                         /* try one of the other converters */
2834                         switch(choices[0]) {
2835                         case GB2312_1:
2836                             choices[1] = (int8_t)CNS_11643_1;
2837                             choices[2] = (int8_t)ISO_IR_165;
2838                             break;
2839                         case ISO_IR_165:
2840                             choices[1] = (int8_t)GB2312_1;
2841                             choices[2] = (int8_t)CNS_11643_1;
2842                             break;
2843                         default: /* CNS_11643_x */
2844                             choices[1] = (int8_t)GB2312_1;
2845                             choices[2] = (int8_t)ISO_IR_165;
2846                             break;
2847                         }
2848 
2849                         choiceCount = 3;
2850                     }
2851                 }
2852 
2853                 cs = g = 0;
2854                 /*
2855                  * len==0: no mapping found yet
2856                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2857                  * len>0: found a roundtrip result, done
2858                  */
2859                 len = 0;
2860                 /*
2861                  * We will turn off useFallback after finding a fallback,
2862                  * but we still get fallbacks from PUA code points as usual.
2863                  * Therefore, we will also need to check that we don't overwrite
2864                  * an early fallback with a later one.
2865                  */
2866                 useFallback = cnv->useFallback;
2867 
2868                 for(i = 0; i < choiceCount && len <= 0; ++i) {
2869                     int8_t cs0 = choices[i];
2870                     if(cs0 > 0) {
2871                         uint32_t value;
2872                         int32_t len2;
2873                         if(cs0 > CNS_11643_0) {
2874                             len2 = MBCS_FROM_UCHAR32_ISO2022(
2875                                         converterData->myConverterArray[CNS_11643],
2876                                         sourceChar,
2877                                         &value,
2878                                         useFallback,
2879                                         MBCS_OUTPUT_3);
2880                             if(len2 == 3 || (len2 == -3 && len == 0)) {
2881                                 targetValue = value;
2882                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
2883                                 if(len2 >= 0) {
2884                                     len = 2;
2885                                 } else {
2886                                     len = -2;
2887                                     useFallback = FALSE;
2888                                 }
2889                                 if(cs == CNS_11643_1) {
2890                                     g = 1;
2891                                 } else if(cs == CNS_11643_2) {
2892                                     g = 2;
2893                                 } else /* plane 3..7 */ if(converterData->version == 1) {
2894                                     g = 3;
2895                                 } else {
2896                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2897                                     len = 0;
2898                                 }
2899                             }
2900                         } else {
2901                             /* GB2312_1 or ISO-IR-165 */
2902                             len2 = MBCS_FROM_UCHAR32_ISO2022(
2903                                         converterData->myConverterArray[cs0],
2904                                         sourceChar,
2905                                         &value,
2906                                         useFallback,
2907                                         MBCS_OUTPUT_2);
2908                             if(len2 == 2 || (len2 == -2 && len == 0)) {
2909                                 targetValue = value;
2910                                 len = len2;
2911                                 cs = cs0;
2912                                 g = 1;
2913                                 useFallback = FALSE;
2914                             }
2915                         }
2916                     }
2917                 }
2918 
2919                 if(len != 0) {
2920                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
2921 
2922                     /* write the designation sequence if necessary */
2923                     if(cs != pFromU2022State->cs[g]) {
2924                         if(cs < CNS_11643) {
2925                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2926                         } else {
2927                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2928                         }
2929                         len = 4;
2930                         pFromU2022State->cs[g] = cs;
2931                         if(g == 1) {
2932                             /* changing the SO/G1 charset invalidates the choices[] */
2933                             choiceCount = 0;
2934                         }
2935                     }
2936 
2937                     /* write the shift sequence if necessary */
2938                     if(g != pFromU2022State->g) {
2939                         switch(g) {
2940                         case 1:
2941                             buffer[len++] = UCNV_SO;
2942 
2943                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2944                             pFromU2022State->g = 1;
2945                             break;
2946                         case 2:
2947                             buffer[len++] = 0x1b;
2948                             buffer[len++] = 0x4e;
2949                             break;
2950                         default: /* case 3 */
2951                             buffer[len++] = 0x1b;
2952                             buffer[len++] = 0x4f;
2953                             break;
2954                         }
2955                     }
2956 
2957                     /* write the two output bytes */
2958                     buffer[len++] = (char)(targetValue >> 8);
2959                     buffer[len++] = (char)targetValue;
2960                 } else {
2961                     /* if we cannot find the character after checking all codepages
2962                      * then this is an error
2963                      */
2964                     *err = U_INVALID_CHAR_FOUND;
2965                     cnv->fromUChar32=sourceChar;
2966                     break;
2967                 }
2968             }
2969 
2970             /* output len>0 bytes in buffer[] */
2971             if(len == 1) {
2972                 *target++ = buffer[0];
2973                 if(offsets) {
2974                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2975                 }
2976             } else if(len == 2 && (target + 2) <= targetLimit) {
2977                 *target++ = buffer[0];
2978                 *target++ = buffer[1];
2979                 if(offsets) {
2980                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2981                     *offsets++ = sourceIndex;
2982                     *offsets++ = sourceIndex;
2983                 }
2984             } else {
2985                 fromUWriteUInt8(
2986                     cnv,
2987                     buffer, len,
2988                     &target, (const char *)targetLimit,
2989                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2990                     err);
2991                 if(U_FAILURE(*err)) {
2992                     break;
2993                 }
2994             }
2995         } /* end if(myTargetIndex<myTargetLength) */
2996         else{
2997             *err =U_BUFFER_OVERFLOW_ERROR;
2998             break;
2999         }
3000 
3001     }/* end while(mySourceIndex<mySourceLength) */
3002 
3003     /*
3004      * the end of the input stream and detection of truncated input
3005      * are handled by the framework, but for ISO-2022-CN conversion
3006      * we need to be in ASCII mode at the very end
3007      *
3008      * conditions:
3009      *   successful
3010      *   not in ASCII mode
3011      *   end of input and no truncated input
3012      */
3013     if( U_SUCCESS(*err) &&
3014         pFromU2022State->g!=0 &&
3015         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3016     ) {
3017         int32_t sourceIndex;
3018 
3019         /* we are switching to ASCII */
3020         pFromU2022State->g=0;
3021 
3022         /* get the source index of the last input character */
3023         /*
3024          * TODO this would be simpler and more reliable if we used a pair
3025          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3026          * so that we could simply use the prevSourceIndex here;
3027          * this code gives an incorrect result for the rare case of an unmatched
3028          * trail surrogate that is alone in the last buffer of the text stream
3029          */
3030         sourceIndex=(int32_t)(source-args->source);
3031         if(sourceIndex>0) {
3032             --sourceIndex;
3033             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3034                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3035             ) {
3036                 --sourceIndex;
3037             }
3038         } else {
3039             sourceIndex=-1;
3040         }
3041 
3042         fromUWriteUInt8(
3043             cnv,
3044             SHIFT_IN_STR, 1,
3045             &target, (const char *)targetLimit,
3046             &offsets, sourceIndex,
3047             err);
3048     }
3049 
3050     /*save the state and return */
3051     args->source = source;
3052     args->target = (char*)target;
3053 }
3054 
3055 
/*
 * Convert ISO-2022-CN input bytes to UTF-16, optionally recording offsets.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit. If args->offsets is non-NULL, each
 * output unit gets the source index of the first byte of its character.
 * On return, args->source and args->target are advanced past what was consumed
 * and produced.
 *
 * State carried between calls:
 *   - myData->key != 0: a partial escape sequence is pending, so the function
 *     resumes inside the escape handling.
 *   - converter->toULength == 1: the lead byte of a double-byte character is
 *     saved in converter->toUBytes[0], so the function resumes at the trail byte.
 *
 * *err is set to U_BUFFER_OVERFLOW_ERROR when the target is full while input
 * remains, and to whatever changeState_2022() or toUnicodeCallback() report.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /* scratch bytes handed to the sub-converter: 2 bytes, or 3 with a CNS plane prefix */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* stays at missingCharMarker unless a branch below finds a mapping */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                /* shift in: subsequent bytes are single-byte (g==0) */
                pToU2022State->g=0;
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    /* shift out: subsequent bytes use the designated cs[1] charset */
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    /* targetUniChar is still missingCharMarker, so this reaches the callback below */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                changeState_2022(args->converter,&(mySource),
                    mySourceLimit, ISO_2022_CN,err);

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* a line end clears the whole toUnicode state (designations and shift) */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        char trailByte;
getTrailByte:
                        /* also entered from the top of the function when resuming after a saved lead byte */
                        trailByte = *mySource++;
                        tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                        if(tempState > CNS_11643_0) {
                            /* CNS 11643 plane: prefix the two bytes with 0x80+plane for the shared CNS table */
                            cnv = myData->myConverterArray[CNS_11643];
                            tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                            tempBuf[1] = (char) (mySourceChar);
                            tempBuf[2] = trailByte;
                            tempBufLen = 3;

                        }else{
                            /* other charsets: look up the two bytes directly in that charset's table */
                            cnv = myData->myConverterArray[tempState];
                            tempBuf[0] = (char) (mySourceChar);
                            tempBuf[1] = trailByte;
                            tempBufLen = 2;
                        }
                        /* from here on mySourceChar holds lead<<8|trail (used for offsets and the callback) */
                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                    } else {
                        /* input ends after the lead byte: save it and resume on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* single-byte mode: only 0x00..0x7f map (to themselves); others go to the callback */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar; the offset is the index of the character's first byte */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            /* input remains but the target buffer is full */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the progress and return */
    args->target = myTarget;
    args->source = mySource;
}
3208 
3209 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3210 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3211     UConverter *cnv = args->converter;
3212     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3213     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3214     char *p, *subchar;
3215     char buffer[8];
3216     int32_t length;
3217 
3218     subchar=(char *)cnv->subChars;
3219     length=cnv->subCharLen; /* assume length==1 for most variants */
3220 
3221     p = buffer;
3222     switch(myConverterData->locale[0]){
3223     case 'j':
3224         {
3225             int8_t cs;
3226 
3227             if(pFromU2022State->g == 1) {
3228                 /* JIS7: switch from G1 to G0 */
3229                 pFromU2022State->g = 0;
3230                 *p++ = UCNV_SI;
3231             }
3232 
3233             cs = pFromU2022State->cs[0];
3234             if(cs != ASCII && cs != JISX201) {
3235                 /* not in ASCII or JIS X 0201: switch to ASCII */
3236                 pFromU2022State->cs[0] = (int8_t)ASCII;
3237                 *p++ = '\x1b';
3238                 *p++ = '\x28';
3239                 *p++ = '\x42';
3240             }
3241 
3242             *p++ = subchar[0];
3243             break;
3244         }
3245     case 'c':
3246         if(pFromU2022State->g != 0) {
3247             /* not in ASCII mode: switch to ASCII */
3248             pFromU2022State->g = 0;
3249             *p++ = UCNV_SI;
3250         }
3251         *p++ = subchar[0];
3252         break;
3253     case 'k':
3254         if(myConverterData->version == 0) {
3255             if(length == 1) {
3256                 if((UBool)args->converter->fromUnicodeStatus) {
3257                     /* in DBCS mode: switch to SBCS */
3258                     args->converter->fromUnicodeStatus = 0;
3259                     *p++ = UCNV_SI;
3260                 }
3261                 *p++ = subchar[0];
3262             } else /* length == 2*/ {
3263                 if(!(UBool)args->converter->fromUnicodeStatus) {
3264                     /* in SBCS mode: switch to DBCS */
3265                     args->converter->fromUnicodeStatus = 1;
3266                     *p++ = UCNV_SO;
3267                 }
3268                 *p++ = subchar[0];
3269                 *p++ = subchar[1];
3270             }
3271             break;
3272         } else {
3273             /* save the subconverter's substitution string */
3274             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3275             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3276 
3277             /* set our substitution string into the subconverter */
3278             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3279             myConverterData->currentConverter->subCharLen = (int8_t)length;
3280 
3281             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3282             args->converter = myConverterData->currentConverter;
3283             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3284             ucnv_cbFromUWriteSub(args, 0, err);
3285             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3286             args->converter = cnv;
3287 
3288             /* restore the subconverter's substitution string */
3289             myConverterData->currentConverter->subChars = currentSubChars;
3290             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3291 
3292             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3293                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3294                     uprv_memcpy(
3295                         cnv->charErrorBuffer,
3296                         myConverterData->currentConverter->charErrorBuffer,
3297                         myConverterData->currentConverter->charErrorBufferLength);
3298                 }
3299                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3300                 myConverterData->currentConverter->charErrorBufferLength = 0;
3301             }
3302             return;
3303         }
3304     default:
3305         /* not expected */
3306         break;
3307     }
3308     ucnv_cbFromUWriteBytes(args,
3309                            buffer, (int32_t)(p - buffer),
3310                            offsetIndex, err);
3311 }
3312 
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * The member order is significant: _ISO_2022_SafeClone() hands the address of
 * currentConverter to ucnv_safeClone() with a capacity of
 * sizeof(UConverter)+sizeof(UAlignedMemory), so deadSpace must directly
 * follow currentConverter.
 */
struct cloneStruct
{
    UConverter cnv;              /* the cloned ISO 2022 converter; its address is what the clone function returns */
    UConverter currentConverter; /* storage for the clone of the sub-converter, if there is one */
    UAlignedMemory deadSpace;    /* slack so the sub-converter clone can be re-aligned within its storage */
    UConverterDataISO2022 mydata; /* copy of the converter's extraInfo; cnv.extraInfo points here */
};
3331 
3332 
3333 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3334 _ISO_2022_SafeClone(
3335             const UConverter *cnv,
3336             void *stackBuffer,
3337             int32_t *pBufferSize,
3338             UErrorCode *status)
3339 {
3340     struct cloneStruct * localClone;
3341     UConverterDataISO2022 *cnvData;
3342     int32_t i, size;
3343 
3344     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3345         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3346         return NULL;
3347     }
3348 
3349     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3350     localClone = (struct cloneStruct *)stackBuffer;
3351 
3352     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3353 
3354     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3355     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3356     localClone->cnv.isExtraLocal = TRUE;
3357 
3358     /* share the subconverters */
3359 
3360     if(cnvData->currentConverter != NULL) {
3361         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3362         localClone->mydata.currentConverter =
3363             ucnv_safeClone(cnvData->currentConverter,
3364                             &localClone->currentConverter,
3365                             &size, status);
3366         if(U_FAILURE(*status)) {
3367             return NULL;
3368         }
3369     }
3370 
3371     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3372         if(cnvData->myConverterArray[i] != NULL) {
3373             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3374         }
3375     }
3376 
3377     return &localClone->cnv;
3378 }
3379 
3380 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3381 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3382                     const USetAdder *sa,
3383                     UConverterUnicodeSet which,
3384                     UErrorCode *pErrorCode)
3385 {
3386     int32_t i;
3387     UConverterDataISO2022* cnvData;
3388 
3389     if (U_FAILURE(*pErrorCode)) {
3390         return;
3391     }
3392 #ifdef U_ENABLE_GENERIC_ISO_2022
3393     if (cnv->sharedData == &_ISO2022Data) {
3394         /* We use UTF-8 in this case */
3395         sa->addRange(sa->set, 0, 0xd7FF);
3396         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3397         return;
3398     }
3399 #endif
3400 
3401     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3402 
3403     /* open a set and initialize it with code points that are algorithmically round-tripped */
3404     switch(cnvData->locale[0]){
3405     case 'j':
3406         /* include JIS X 0201 which is hardcoded */
3407         sa->add(sa->set, 0xa5);
3408         sa->add(sa->set, 0x203e);
3409         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3410             /* include Latin-1 for some variants of JP */
3411             sa->addRange(sa->set, 0, 0xff);
3412         } else {
3413             /* include ASCII for JP */
3414             sa->addRange(sa->set, 0, 0x7f);
3415         }
3416         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3417             /*
3418              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3419              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3420              * use half-width Katakana.
3421              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3422              * half-width Katakana via the ESC ( I sequence.
3423              * However, we only emit (fromUnicode) half-width Katakana according to the
3424              * definition of each variant.
3425              *
3426              * When including fallbacks,
3427              * we need to include half-width Katakana Unicode code points for all JP variants because
3428              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3429              */
3430             /* include half-width Katakana for JP */
3431             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3432         }
3433         break;
3434     case 'c':
3435     case 'z':
3436         /* include ASCII for CN */
3437         sa->addRange(sa->set, 0, 0x7f);
3438         break;
3439     case 'k':
3440         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3441         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3442                 cnvData->currentConverter, sa, which, pErrorCode);
3443         /* the loop over myConverterArray[] will simply not find another converter */
3444         break;
3445     default:
3446         break;
3447     }
3448 
3449 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3450             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3451                 cnvData->version==0 && i==CNS_11643
3452             ) {
3453                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3454                 ucnv_MBCSGetUnicodeSetForBytes(
3455                         cnvData->myConverterArray[i],
3456                         sa, UCNV_ROUNDTRIP_SET,
3457                         0, 0x81, 0x82,
3458                         pErrorCode);
3459             }
3460 #endif
3461 
3462     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3463         UConverterSetFilter filter;
3464         if(cnvData->myConverterArray[i]!=NULL) {
3465             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3466                 cnvData->version==0 && i==CNS_11643
3467             ) {
3468                 /*
3469                  * Version-specific for CN:
3470                  * CN version 0 does not map CNS planes 3..7 although
3471                  * they are all available in the CNS conversion table;
3472                  * CN version 1 (-EXT) does map them all.
3473                  * The two versions create different Unicode sets.
3474                  */
3475                 filter=UCNV_SET_FILTER_2022_CN;
3476             } else if(cnvData->locale[0]=='j' && i==JISX208) {
3477                 /*
3478                  * Only add code points that map to Shift-JIS codes
3479                  * corresponding to JIS X 0208.
3480                  */
3481                 filter=UCNV_SET_FILTER_SJIS;
3482             } else if(i==KSC5601) {
3483                 /*
3484                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3485                  * are broader than GR94.
3486                  */
3487                 filter=UCNV_SET_FILTER_GR94DBCS;
3488             } else {
3489                 filter=UCNV_SET_FILTER_NONE;
3490             }
3491             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3492         }
3493     }
3494 
3495     /*
3496      * ISO 2022 converters must not convert SO/SI/ESC despite what
3497      * sub-converters do by themselves.
3498      * Remove these characters from the set.
3499      */
3500     sa->remove(sa->set, 0x0e);
3501     sa->remove(sa->set, 0x0f);
3502     sa->remove(sa->set, 0x1b);
3503 
3504     /* ISO 2022 converters do not convert C1 controls either */
3505     sa->removeRange(sa->set, 0x80, 0x9f);
3506 }
3507 
/*
 * Function table for the generic ISO-2022 converter.
 *
 * The initializer is positional, so the entry order must match the
 * UConverterImpl declaration, which is not part of this file section.
 * The slot labels below are inferred from the names of the functions placed
 * in them -- NOTE(review): confirm against the UConverterImpl definition.
 *
 * Without U_ENABLE_GENERIC_ISO_2022 the four conversion slots are NULL.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022, /* converter type */

    NULL, /* unused slot -- presumably load/unload hooks, TODO confirm */
    NULL,

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

#ifdef U_ENABLE_GENERIC_ISO_2022
    /* toUnicode, then fromUnicode; each presumably without/with offsets -- TODO confirm */
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL, /* unused slot -- TODO confirm which callback this is */

    NULL, /* unused slot -- TODO confirm which callback this is */
    _ISO2022getName,        /* get name */
    _ISO_2022_WriteSub,     /* write substitution character */
    _ISO_2022_SafeClone,    /* safe clone */
    _ISO_2022_GetUnicodeSet /* get Unicode set */
};
/*
 * Static description of the generic ISO-2022 converter.
 *
 * Positional initializer: the entry order must match the UConverterStaticData
 * declaration, which is not part of this file section. Labels marked
 * "presumably" are inferred from the values -- NOTE(review): confirm against
 * the struct definition.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022", /* converter name */
    2022, /* presumably the code page number -- TODO confirm */
    UCNV_IBM, /* presumably the platform -- TODO confirm */
    UCNV_ISO_2022, /* converter type */
    1, /* presumably min bytes per UChar -- TODO confirm */
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes -- TODO confirm */
    1, /* presumably the substitution character length -- TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data record for the generic ISO-2022 converter; it ties together the
 * static description (_ISO2022StaticData) and the function table
 * (_ISO2022Impl). Unlike the JP and KR records, this one is not static.
 *
 * Positional initializer: the entry order must match the UConverterSharedData
 * declaration, which is not part of this file section.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData), /* size of this structure */
    ~((uint32_t) 0), /* all bits set; presumably a reference counter that marks the record as never freed -- TODO confirm */
    NULL,
    NULL,
    &_ISO2022StaticData, /* static description */
    FALSE,
    &_ISO2022Impl, /* function table */
    0
};
3563 
3564 /*************JP****************/
/*
 * Function table for the ISO-2022-JP converters.
 *
 * The initializer is positional, so the entry order must match the
 * UConverterImpl declaration, which is not part of this file section.
 * The slot labels below are inferred from the names of the functions placed
 * in them -- NOTE(review): confirm against the UConverterImpl definition.
 *
 * The same *_OFFSETS_LOGIC function fills both slots of each conversion
 * direction.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022, /* converter type */

    NULL, /* unused slot -- presumably load/unload hooks, TODO confirm */
    NULL,

    _ISO2022Open,  /* open */
    _ISO2022Close, /* close */
    _ISO2022Reset, /* reset */

    /* toUnicode, then fromUnicode; each presumably without/with offsets -- TODO confirm */
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL, /* unused slot -- TODO confirm which callback this is */

    NULL, /* unused slot -- TODO confirm which callback this is */
    _ISO2022getName,        /* get name */
    _ISO_2022_WriteSub,     /* write substitution character */
    _ISO_2022_SafeClone,    /* safe clone */
    _ISO_2022_GetUnicodeSet /* get Unicode set */
};
/*
 * Static description of the ISO-2022-JP converters.
 *
 * Positional initializer: the entry order must match the UConverterStaticData
 * declaration, which is not part of this file section. Labels marked
 * "presumably" are inferred from the values -- NOTE(review): confirm against
 * the struct definition.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData), /* size of this structure */
    "ISO_2022_JP", /* converter name */
    0, /* presumably the code page number (none assigned) -- TODO confirm */
    UCNV_IBM, /* presumably the platform -- TODO confirm */
    UCNV_ISO_2022, /* converter type */
    1, /* presumably min bytes per UChar -- TODO confirm */
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution character bytes -- TODO confirm */
    1, /* presumably the substitution character length -- TODO confirm */
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * File-local shared-data record for ISO-2022-JP: it bundles a pointer to
 * _ISO2022JPStaticData with a pointer to _ISO2022JPImpl.
 * NOTE(review): ~((uint32_t) 0) (all bits set) is presumably a reference
 * counter sentinel marking statically allocated data -- confirm against the
 * UConverterSharedData declaration.
 */
3603 static const UConverterSharedData _ISO2022JPData={
3604     sizeof(UConverterSharedData), /* size of the structure itself */
3605     ~((uint32_t) 0), /* all bits set; see note above */
3606     NULL,
3607     NULL,
3608     &_ISO2022JPStaticData, /* static descriptor */
3609     FALSE,
3610     &_ISO2022JPImpl, /* implementation table */
3611     0
3612 };
3613 
3614 /************* KR ***************/
/*
 * Implementation table for the ISO-2022-KR variant; _ISO2022KRData points at
 * it. The open/close/reset, getName, write-sub, safe-clone and Unicode-set
 * entries are the same functions used by the other ISO-2022 tables in this
 * file; only the four conversion entries are KR-specific.
 * Each direction lists its *_OFFSETS_LOGIC function twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots
 * of UConverterImpl with one function -- confirm against ucnv_cnv.h.
 * The initializer is positional, so entry order must not change.
 */
3615 static const UConverterImpl _ISO2022KRImpl={
3616     UCNV_ISO_2022,
3617 
3618     NULL,
3619     NULL,
3620 
3621     _ISO2022Open,
3622     _ISO2022Close,
3623     _ISO2022Reset,
3624 
3625     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3626     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3627     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3628     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3629     NULL,
3630 
3631     NULL,
3632     _ISO2022getName,
3633     _ISO_2022_WriteSub,
3634     _ISO_2022_SafeClone,
3635     _ISO_2022_GetUnicodeSet
3636 };
/*
 * Static descriptor of the "ISO_2022_KR" converter; _ISO2022KRData stores a
 * pointer to it. The initializer is positional, so its entries must stay in
 * the member order of UConverterStaticData (declared outside this section).
 * NOTE(review): the roles given for entries marked "presumably" are
 * assumptions about that declaration -- confirm against ucnv_bld.h.
 */
3637 static const UConverterStaticData _ISO2022KRStaticData={
3638     sizeof(UConverterStaticData), /* size of the structure itself */
3639     "ISO_2022_KR", /* name string */
3640     0, /* presumably the codepage number (none assigned here) -- confirm */
3641     UCNV_IBM, /* presumably the platform -- confirm */
3642     UCNV_ISO_2022, /* same type constant that heads _ISO2022KRImpl */
3643     1, /* presumably the minimum bytes per character -- confirm */
3644     3, /* max 3 bytes per UChar: SO+DBCS */
3645     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) -- confirm */
3646     1, /* presumably the count of substitution bytes in use -- confirm */
3647     FALSE,
3648     FALSE,
3649     0,
3650     0,
3651     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3652 };
/*
 * File-local shared-data record for ISO-2022-KR: it bundles a pointer to
 * _ISO2022KRStaticData with a pointer to _ISO2022KRImpl.
 * NOTE(review): ~((uint32_t) 0) (all bits set) is presumably a reference
 * counter sentinel marking statically allocated data -- confirm against the
 * UConverterSharedData declaration.
 */
3653 static const UConverterSharedData _ISO2022KRData={
3654     sizeof(UConverterSharedData), /* size of the structure itself */
3655     ~((uint32_t) 0), /* all bits set; see note above */
3656     NULL,
3657     NULL,
3658     &_ISO2022KRStaticData, /* static descriptor */
3659     FALSE,
3660     &_ISO2022KRImpl, /* implementation table */
3661     0
3662 };
3663 
3664 /*************** CN ***************/
/*
 * Implementation table for the ISO-2022-CN variant; _ISO2022CNData points at
 * it. The open/close/reset, getName, write-sub, safe-clone and Unicode-set
 * entries are the same functions used by the other ISO-2022 tables in this
 * file; only the four conversion entries are CN-specific.
 * Each direction lists its *_OFFSETS_LOGIC function twice.
 * NOTE(review): presumably these fill the plain and the with-offsets slots
 * of UConverterImpl with one function -- confirm against ucnv_cnv.h.
 * The initializer is positional, so entry order must not change.
 */
3665 static const UConverterImpl _ISO2022CNImpl={
3666 
3667     UCNV_ISO_2022,
3668 
3669     NULL,
3670     NULL,
3671 
3672     _ISO2022Open,
3673     _ISO2022Close,
3674     _ISO2022Reset,
3675 
3676     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3677     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3678     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3679     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3680     NULL,
3681 
3682     NULL,
3683     _ISO2022getName,
3684     _ISO_2022_WriteSub,
3685     _ISO_2022_SafeClone,
3686     _ISO_2022_GetUnicodeSet
3687 };
/*
 * Static descriptor of the "ISO_2022_CN" converter; _ISO2022CNData stores a
 * pointer to it. The initializer is positional, so its entries must stay in
 * the member order of UConverterStaticData (declared outside this section).
 * NOTE(review): the roles given for entries marked "presumably" are
 * assumptions about that declaration -- confirm against ucnv_bld.h.
 */
3688 static const UConverterStaticData _ISO2022CNStaticData={
3689     sizeof(UConverterStaticData), /* size of the structure itself */
3690     "ISO_2022_CN", /* name string */
3691     0, /* presumably the codepage number (none assigned here) -- confirm */
3692     UCNV_IBM, /* presumably the platform -- confirm */
3693     UCNV_ISO_2022, /* same type constant that heads _ISO2022CNImpl */
3694     1, /* presumably the minimum bytes per character -- confirm */
3695     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3696     { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a is ASCII SUB) -- confirm */
3697     1, /* presumably the count of substitution bytes in use -- confirm */
3698     FALSE,
3699     FALSE,
3700     0,
3701     0,
3702     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3703 };
/*
 * File-local shared-data record for ISO-2022-CN: it bundles a pointer to
 * _ISO2022CNStaticData with a pointer to _ISO2022CNImpl.
 * NOTE(review): ~((uint32_t) 0) (all bits set) is presumably a reference
 * counter sentinel marking statically allocated data -- confirm against the
 * UConverterSharedData declaration.
 */
3704 static const UConverterSharedData _ISO2022CNData={
3705     sizeof(UConverterSharedData), /* size of the structure itself */
3706     ~((uint32_t) 0), /* all bits set; see note above */
3707     NULL,
3708     NULL,
3709     &_ISO2022CNStaticData, /* static descriptor */
3710     FALSE,
3711     &_ISO2022CNImpl, /* implementation table */
3712     0
3713 };
3714 
3715 
3716 
3717 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3718