1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot combine with
 * operators at the point of use (for example, sizeof LENGTHOF(a)).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* The Shift In (0x0F) and Shift Out (0x0E) control bytes as NUL-terminated strings. */
static const char SHIFT_IN_STR[]  = { 0x0F, 0x00 };
static const char SHIFT_OUT_STR[] = { 0x0E, 0x00 };
80
/* Byte values of the control and whitespace characters named below. */
#define CR 0x0D /* carriage return */
#define LF 0x0A /* line feed */
#define H_TAB 0x09 /* horizontal tab */
#define V_TAB 0x0B /* vertical tab */
#define SPACE 0x20 /* space */
86
/* Inclusive code point bounds named for the halfwidth Katakana range. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
91
92 /*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
/* Inclusive native (8-bit) byte bounds of 94-character and 96-character sets. */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
106
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range check is done on the unsigned value of the argument so that a
 * negative argument yields 0 instead of reaching the shift (shifting by a
 * negative count is undefined behavior).
 * Note that the argument is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
114
/*
 * Charset identifiers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the same small integers.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* JP */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* these three must keep their values because they correspond to myConverterArray[] */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * The per-plane values are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
153
/* Non-zero if the StateEnum charset value cs lies in the range JISX208..KSC5601 (the JP DBCS charsets). */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && KSC5601>=(cs))

/* Charset mask: a value with only bit number cs set. */
#define CSM(cs) (((uint16_t)1) << (cs))
158
159 /*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168 static const uint16_t jpCharsetMasks[5]={
169 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
174 };
175
/* Converter type categories; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
184
/*
 * Designation and shift state for one conversion direction.
 * UConverterDataISO2022 holds one instance for to-Unicode and one for
 * from-Unicode; _ISO2022Reset() clears them with uprv_memset().
 */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
190
/* Mask applied to the open options to extract the converter version number. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10
193
/*
 * Per-converter data, allocated by _ISO2022Open() and stored in
 * UConverter.extraInfo.
 */
typedef struct{
    /* shared data of the sub-charsets, loaded in _ISO2022Open() and indexed by StateEnum values */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() (the KR variant opens one); closed in _ISO2022Close() */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    /* separate designation/shift state for each conversion direction */
    ISO2022State toU2022State, fromU2022State;
    /* accumulated escape sequence key; non-zero while an escape sequence is incomplete */
    uint32_t key;
    /* options & UCNV_OPTIONS_VERSION_MASK, possibly normalized to 0 in _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* converter name returned by _ISO2022getName() */
    char name[30];
    /* two-letter variant code set in _ISO2022Open() ("ja", "ko", "cn"); empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;
207
208 /* Protos */
209 /* ISO-2022 ----------------------------------------------------------------- */
210
/*
 * Forward declarations of the UTF-8 from-Unicode functions, which are
 * defined outside this file.
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
218
/* Byte value of ESC, which starts every ISO 2022 escape sequence. */
#define ESC_2022 0x1B /*ESC*/
220
/* Result of matching the bytes read so far against the escape sequence tables. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far a prefix of a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022,
    /* a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022,
    /* matches one escape sequence so far, but more characters might match another one */
    VALID_MAYBE_TERMINAL_2022
} UCNV_TableStates_2022;
228
229 /*
230 * The way these state transition arrays work is:
231 * ex : ESC$B is the sequence for JISX208
232 * a) First Iteration: char is ESC
233 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
234 * int x = normalize_esq_chars_2022[27] which is equal to 1
235 * ii) Search for this value in escSeqStateTable_Key_2022[]
236 * value of x is stored at escSeqStateTable_Key_2022[0]
237 * iii) Save this index as offset
238 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
239 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
240 * b) Switch on this state and continue to next char
241 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
242 * which is normalize_esq_chars_2022[36] == 4
243 * ii) x is currently 1(from above)
244 * x<<=5 -- x is now 32
245 * x+=normalize_esq_chars_2022[36]
246 * now x is 36
247 * iii) Search for this value in escSeqStateTable_Key_2022[]
248 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
249 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
250 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
251 * c) Switch on this state and continue to next char
252 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
253 * ii) x is currently 36 (from above)
254 * x<<=5 -- x is now 1152
255 * x+=normalize_esq_chars_2022[66]
256 * now x is 1161
257 * iii) Search for this value in escSeqStateTable_Key_2022[]
 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
262 */
263
264
/*
 * First of the arrays that together form the escape sequence state table.
 * Maps a byte value to its small code (1..29) used to build escape sequence
 * keys; 0 means that the byte cannot occur in any recognized escape sequence.
 * Laid out 16 entries per row; the row comment gives the first byte value.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
296
297 #ifdef U_ENABLE_GENERIC_ISO_2022
298 /*
299 * When the generic ISO-2022 converter is completely removed, not just disabled
300 * per #ifdef, then the following state table and the associated tables that are
301 * dimensioned with MAX_STATES_2022 should be trimmed.
302 *
303 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
304 * the associated escape sequences starting with ESC ( B should be removed.
305 * This includes the ones with key values 1097 and all of the ones above 1000000.
306 *
307 * For the latter, the tables can simply be truncated.
308 * For the former, since the tables must be kept parallel, it is probably best
309 * to simply duplicate an adjacent table cell, parallel in all tables.
310 *
311 * It may make sense to restructure the tables, especially by using small search
312 * tables for the variants instead of indexing them parallel to the table here.
313 */
314 #endif
315
/* Number of entries in each of the parallel escape sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequences and of their prefixes, in strictly
 * ascending order so that getKey_2022() can binary-search them.
 * A key is built per byte as key = (key << 5) + normalize_esq_chars_2022[byte].
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0.. 9 */ 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
    /* 10..19 */ 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
    /* 20..29 */ 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
    /* 30..39 */ 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
    /* 40..49 */ 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
    /* 50..59 */ 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
    /* 60..69 */ 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70..73 */ 35947631, 35947635, 35947636, 35947638
};
329
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Used only by the generic ISO-2022 converter: the converter name to open for
 * each entry of escSeqStateTable_Key_2022[] (same index).
 * changeState_2022() reports U_UNSUPPORTED_ESCAPE_SEQUENCE when the entry for
 * a terminal sequence is NULL.
 * Ten entries per row; the first row holds indexes 0..9.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /* 0 1 2 3 4 5 6 7 8 9 */

    NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
    ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
    ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
    ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
    ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
    ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
    ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
};

#endif
346
/*
 * Match state for each entry of escSeqStateTable_Key_2022[] (same index);
 * this is the value returned by getKey_2022() when the key is found.
 * Ten entries per row; the first row holds indexes 0..9.
 * Index 10 (key 1097) is the only VALID_MAYBE_TERMINAL_2022 entry.
 */
static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
    /* 0 1 2 3 4 5 6 7 8 9 */
    VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
};
358
359
/* Selects which ISO-2022 variant changeState_2022() interprets an escape sequence for. */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR,
    ISO_2022_CN
} Variant2022;
369
/*********** ISO 2022 Converter Protos ***********/
/* Allocates and initializes the per-converter data for the variant selected by locale and options. */
static void
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);

/* Releases everything that _ISO2022Open() acquired. */
static void
_ISO2022Close(UConverter *converter);

/* Resets the to-Unicode and/or from-Unicode state, depending on choice. */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

/* Returns the name stored in the per-converter data, or NULL if there is none. */
static const char*
_ISO2022getName(const UConverter* cnv);

/* Defined later in this file. */
static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

/* Defined later in this file. */
static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Forward declarations of the shared data of the language-specific variants. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;
398
399 /*************** Converter implementations ******************/
400
401 /* The purpose of this function is to get around gcc compiler warnings. */
402 static U_INLINE void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)403 fromUWriteUInt8(UConverter *cnv,
404 const char *bytes, int32_t length,
405 uint8_t **target, const char *targetLimit,
406 int32_t **offsets,
407 int32_t sourceIndex,
408 UErrorCode *pErrorCode)
409 {
410 char *targetChars = (char *)*target;
411 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
412 offsets, sourceIndex, pErrorCode);
413 *target = (uint8_t*)targetChars;
414
415 }
416
417 static U_INLINE void
setInitialStateToUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)418 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
419 if(myConverterData->version == 1) {
420 UConverter *cnv = myConverterData->currentConverter;
421
422 cnv->toUnicodeStatus=0; /* offset */
423 cnv->mode=0; /* state */
424 cnv->toULength=0; /* byteIndex */
425 }
426 }
427
428 static U_INLINE void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)429 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
430 /* in ISO-2022-KR the designator sequence appears only once
431 * in a file so we append it only once
432 */
433 if( converter->charErrorBufferLength==0){
434
435 converter->charErrorBufferLength = 4;
436 converter->charErrorBuffer[0] = 0x1b;
437 converter->charErrorBuffer[1] = 0x24;
438 converter->charErrorBuffer[2] = 0x29;
439 converter->charErrorBuffer[3] = 0x43;
440 }
441 if(myConverterData->version == 1) {
442 UConverter *cnv = myConverterData->currentConverter;
443
444 cnv->fromUChar32=0;
445 cnv->fromUnicodeStatus=1; /* prevLength */
446 }
447 }
448
449 static void
_ISO2022Open(UConverter * cnv,const char * name,const char * locale,uint32_t options,UErrorCode * errorCode)450 _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
451
452 char myLocale[6]={' ',' ',' ',' ',' ',' '};
453
454 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
455 if(cnv->extraInfo != NULL) {
456 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
457 uint32_t version;
458
459 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
460 myConverterData->currentType = ASCII1;
461 cnv->fromUnicodeStatus =FALSE;
462 if(locale){
463 uprv_strncpy(myLocale, locale, sizeof(myLocale));
464 }
465 version = options & UCNV_OPTIONS_VERSION_MASK;
466 myConverterData->version = version;
467
468 // BEGIN android-changed
469 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
470 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
471 if((myLocale[0]=='j' &&
472 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
473 myLocale[1]=='s') &&
474 (myLocale[2]=='_' || myLocale[2]=='\0')))
475 {
476 size_t len=0;
477 /* open the required converters and cache them */
478 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
479 myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
480 }
481 if (myLocale[1]=='k') { /* Use KDDI's version. */
482 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("kddi-jisx-208-2007", NULL, errorCode);
483 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */
484 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("softbank-jisx-208-2007", NULL, errorCode);
485 } else {
486 myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
487 }
488 // END android-changed
489
490 if(jpCharsetMasks[version]&CSM(JISX212)) {
491 myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
492 }
493 if(jpCharsetMasks[version]&CSM(GB2312)) {
494 myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
495 }
496 if(jpCharsetMasks[version]&CSM(KSC5601)) {
497 myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
498 }
499
500 /* set the function pointers to appropriate funtions */
501 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
502 uprv_strcpy(myConverterData->locale,"ja");
503
504 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
505 len = uprv_strlen(myConverterData->name);
506 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
507 myConverterData->name[len+1]='\0';
508 }
509 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
510 (myLocale[2]=='_' || myLocale[2]=='\0'))
511 {
512 if (version==1){
513 myConverterData->currentConverter=
514 ucnv_open("icu-internal-25546",errorCode);
515
516 if (U_FAILURE(*errorCode)) {
517 _ISO2022Close(cnv);
518 return;
519 }
520
521 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
522 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
523 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
524 }else{
525 myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
526
527 if (U_FAILURE(*errorCode)) {
528 _ISO2022Close(cnv);
529 return;
530 }
531
532 myConverterData->version = 0;
533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
534 }
535
536 /* initialize the state variables */
537 setInitialStateToUnicodeKR(cnv, myConverterData);
538 setInitialStateFromUnicodeKR(cnv, myConverterData);
539
540 /* set the function pointers to appropriate funtions */
541 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
542 uprv_strcpy(myConverterData->locale,"ko");
543 }
544 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
545 (myLocale[2]=='_' || myLocale[2]=='\0'))
546 {
547
548 /* open the required converters and cache them */
549 myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
550 if(version==1) {
551 myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
552 }
553 myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
554
555
556 /* set the function pointers to appropriate funtions */
557 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
558 uprv_strcpy(myConverterData->locale,"cn");
559
560 if (version==1){
561 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
562 }else{
563 myConverterData->version = 0;
564 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
565 }
566 }
567 else{
568 #ifdef U_ENABLE_GENERIC_ISO_2022
569 myConverterData->isFirstBuffer = TRUE;
570
571 /* append the UTF-8 escape sequence */
572 cnv->charErrorBufferLength = 3;
573 cnv->charErrorBuffer[0] = 0x1b;
574 cnv->charErrorBuffer[1] = 0x25;
575 cnv->charErrorBuffer[2] = 0x42;
576
577 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
578 /* initialize the state variables */
579 uprv_strcpy(myConverterData->name,"ISO_2022");
580 #else
581 *errorCode = U_UNSUPPORTED_ERROR;
582 return;
583 #endif
584 }
585
586 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
587
588 if(U_FAILURE(*errorCode)) {
589 _ISO2022Close(cnv);
590 }
591 } else {
592 *errorCode = U_MEMORY_ALLOCATION_ERROR;
593 }
594 }
595
596
597 static void
_ISO2022Close(UConverter * converter)598 _ISO2022Close(UConverter *converter) {
599 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
600 UConverterSharedData **array = myData->myConverterArray;
601 int32_t i;
602
603 if (converter->extraInfo != NULL) {
604 /*close the array of converter pointers and free the memory*/
605 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
606 if(array[i]!=NULL) {
607 ucnv_unloadSharedDataIfReady(array[i]);
608 }
609 }
610
611 ucnv_close(myData->currentConverter);
612
613 if(!converter->isExtraLocal){
614 uprv_free (converter->extraInfo);
615 converter->extraInfo = NULL;
616 }
617 }
618 }
619
620 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)621 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
622 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
623 if(choice<=UCNV_RESET_TO_UNICODE) {
624 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
625 myConverterData->key = 0;
626 }
627 if(choice!=UCNV_RESET_TO_UNICODE) {
628 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
629 }
630 #ifdef U_ENABLE_GENERIC_ISO_2022
631 if(myConverterData->locale[0] == 0){
632 if(choice<=UCNV_RESET_TO_UNICODE) {
633 myConverterData->isFirstBuffer = TRUE;
634 myConverterData->key = 0;
635 if (converter->mode == UCNV_SO){
636 ucnv_close (myConverterData->currentConverter);
637 myConverterData->currentConverter=NULL;
638 }
639 converter->mode = UCNV_SI;
640 }
641 if(choice!=UCNV_RESET_TO_UNICODE) {
642 /* re-append UTF-8 escape sequence */
643 converter->charErrorBufferLength = 3;
644 converter->charErrorBuffer[0] = 0x1b;
645 converter->charErrorBuffer[1] = 0x28;
646 converter->charErrorBuffer[2] = 0x42;
647 }
648 }
649 else
650 #endif
651 {
652 /* reset the state variables */
653 if(myConverterData->locale[0] == 'k'){
654 if(choice<=UCNV_RESET_TO_UNICODE) {
655 setInitialStateToUnicodeKR(converter, myConverterData);
656 }
657 if(choice!=UCNV_RESET_TO_UNICODE) {
658 setInitialStateFromUnicodeKR(converter, myConverterData);
659 }
660 }
661 }
662 }
663
664 static const char*
_ISO2022getName(const UConverter * cnv)665 _ISO2022getName(const UConverter* cnv){
666 if(cnv->extraInfo){
667 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
668 return myData->name;
669 }
670 return NULL;
671 }
672
673
/*************** to unicode *******************/
/****************************************************************************
 * For ISO-2022-JP: the StateEnum value selected by each entry of
 * escSeqStateTable_Key_2022[] (same index, ten entries per row).
 * changeState_2022() reports U_UNSUPPORTED_ESCAPE_SEQUENCE for entries that
 * are INVALID_STATE.
 *
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JISX-201
 * <ESC>(I  JISX-201 (mapped to HWKANA_7BIT in this table)
 * <ESC>$B  JISX-208
 * <ESC>$@  JISX-208
 * <ESC>$(D JISX-212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 * <ESC>N   single shift 2 (SS2_STATE)
 */
static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
    /* 0 1 2 3 4 5 6 7 8 9 */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
699
/*************** to unicode *******************/
/*
 * For ISO-2022-CN: the StateEnum value selected by each entry of
 * escSeqStateTable_Key_2022[] (same index, ten entries per row).
 *
 * Entries other than INVALID_STATE, derived from the key table:
 * <ESC>N   SS2_STATE
 * <ESC>O   SS3_STATE
 * <ESC>$)A GB2312_1
 * <ESC>$)E ISO_IR_165
 * <ESC>$)G CNS_11643_1
 * <ESC>$*H CNS_11643_2
 * <ESC>$+I through <ESC>$+M  CNS_11643_3 through CNS_11643_7
 */
static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
    /* 0 1 2 3 4 5 6 7 8 9 */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
    ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
712
713
714 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)715 getKey_2022(char c,int32_t* key,int32_t* offset){
716 int32_t togo;
717 int32_t low = 0;
718 int32_t hi = MAX_STATES_2022;
719 int32_t oldmid=0;
720
721 togo = normalize_esq_chars_2022[(uint8_t)c];
722 if(togo == 0) {
723 /* not a valid character anywhere in an escape sequence */
724 *key = 0;
725 *offset = 0;
726 return INVALID_2022;
727 }
728 togo = (*key << 5) + togo;
729
730 while (hi != low) /*binary search*/{
731
732 register int32_t mid = (hi+low) >> 1; /*Finds median*/
733
734 if (mid == oldmid)
735 break;
736
737 if (escSeqStateTable_Key_2022[mid] > togo){
738 hi = mid;
739 }
740 else if (escSeqStateTable_Key_2022[mid] < togo){
741 low = mid;
742 }
743 else /*we found it*/{
744 *key = togo;
745 *offset = mid;
746 return escSeqStateTable_Value_2022[mid];
747 }
748 oldmid = mid;
749
750 }
751
752 *key = 0;
753 *offset = 0;
754 return INVALID_2022;
755 }
756
/* Runs through a state machine to determine the escape sequence - codepage correspondence.
 */
759 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)760 changeState_2022(UConverter* _this,
761 const char** source,
762 const char* sourceLimit,
763 Variant2022 var,
764 UErrorCode* err){
765 UCNV_TableStates_2022 value;
766 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
767 uint32_t key = myData2022->key;
768 int32_t offset = 0;
769 char c;
770
771 value = VALID_NON_TERMINAL_2022;
772 while (*source < sourceLimit) {
773 c = *(*source)++;
774 _this->toUBytes[_this->toULength++]=(uint8_t)c;
775 value = getKey_2022(c,(int32_t *) &key, &offset);
776
777 switch (value){
778
779 case VALID_NON_TERMINAL_2022 :
780 /* continue with the loop */
781 break;
782
783 case VALID_TERMINAL_2022:
784 key = 0;
785 goto DONE;
786
787 case INVALID_2022:
788 goto DONE;
789
790 case VALID_MAYBE_TERMINAL_2022:
791 #ifdef U_ENABLE_GENERIC_ISO_2022
792 /* ESC ( B is ambiguous only for ISO_2022 itself */
793 if(var == ISO_2022) {
794 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
795 _this->toULength = 0;
796
797 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
798
799 /* continue with the loop */
800 value = VALID_NON_TERMINAL_2022;
801 break;
802 } else
803 #endif
804 {
805 /* not ISO_2022 itself, finish here */
806 value = VALID_TERMINAL_2022;
807 key = 0;
808 goto DONE;
809 }
810 }
811 }
812
813 DONE:
814 myData2022->key = key;
815
816 if (value == VALID_NON_TERMINAL_2022) {
817 /* indicate that the escape sequence is incomplete: key!=0 */
818 return;
819 } else if (value == INVALID_2022 ) {
820 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
821 return;
822 } else /* value == VALID_TERMINAL_2022 */ {
823 switch(var){
824 #ifdef U_ENABLE_GENERIC_ISO_2022
825 case ISO_2022:
826 {
827 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
828 if(chosenConverterName == NULL) {
829 /* SS2 or SS3 */
830 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
831 return;
832 }
833
834 _this->mode = UCNV_SI;
835 ucnv_close(myData2022->currentConverter);
836 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
837 if(U_SUCCESS(*err)) {
838 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
839 _this->mode = UCNV_SO;
840 }
841 break;
842 }
843 #endif
844 case ISO_2022_JP:
845 {
846 StateEnum tempState=nextStateToUnicodeJP[offset];
847 switch(tempState) {
848 case INVALID_STATE:
849 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
850 break;
851 case SS2_STATE:
852 if(myData2022->toU2022State.cs[2]!=0) {
853 if(myData2022->toU2022State.g<2) {
854 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
855 }
856 myData2022->toU2022State.g=2;
857 } else {
858 /* illegal to have SS2 before a matching designator */
859 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
860 }
861 break;
862 /* case SS3_STATE: not used in ISO-2022-JP-x */
863 case ISO8859_1:
864 case ISO8859_7:
865 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
866 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
867 } else {
868 /* G2 charset for SS2 */
869 myData2022->toU2022State.cs[2]=(int8_t)tempState;
870 }
871 break;
872 default:
873 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
874 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
875 } else {
876 /* G0 charset */
877 myData2022->toU2022State.cs[0]=(int8_t)tempState;
878 }
879 break;
880 }
881 }
882 break;
883 case ISO_2022_CN:
884 {
885 StateEnum tempState=nextStateToUnicodeCN[offset];
886 switch(tempState) {
887 case INVALID_STATE:
888 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
889 break;
890 case SS2_STATE:
891 if(myData2022->toU2022State.cs[2]!=0) {
892 if(myData2022->toU2022State.g<2) {
893 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
894 }
895 myData2022->toU2022State.g=2;
896 } else {
897 /* illegal to have SS2 before a matching designator */
898 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
899 }
900 break;
901 case SS3_STATE:
902 if(myData2022->toU2022State.cs[3]!=0) {
903 if(myData2022->toU2022State.g<2) {
904 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
905 }
906 myData2022->toU2022State.g=3;
907 } else {
908 /* illegal to have SS3 before a matching designator */
909 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
910 }
911 break;
912 case ISO_IR_165:
913 if(myData2022->version==0) {
914 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
915 break;
916 }
917 /*fall through*/
918 case GB2312_1:
919 /*fall through*/
920 case CNS_11643_1:
921 myData2022->toU2022State.cs[1]=(int8_t)tempState;
922 break;
923 case CNS_11643_2:
924 myData2022->toU2022State.cs[2]=(int8_t)tempState;
925 break;
926 default:
927 /* other CNS 11643 planes */
928 if(myData2022->version==0) {
929 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
930 } else {
931 myData2022->toU2022State.cs[3]=(int8_t)tempState;
932 }
933 break;
934 }
935 }
936 break;
937 case ISO_2022_KR:
938 if(offset==0x30){
939 /* nothing to be done, just accept this one escape sequence */
940 } else {
941 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942 }
943 break;
944
945 default:
946 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
947 break;
948 }
949 }
950 if(U_SUCCESS(*err)) {
951 _this->toULength = 0;
952 }
953 }
954
955 /*Checks the characters of the buffer against valid 2022 escape sequences
956 *if the match we return a pointer to the initial start of the sequence otherwise
957 *we return sourceLimit
958 */
959 /*for 2022 looks ahead in the stream
960 *to determine the longest possible convertible
961 *data stream
962 */
963 static U_INLINE const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool flush)964 getEndOfBuffer_2022(const char** source,
965 const char* sourceLimit,
966 UBool flush){
967
968 const char* mySource = *source;
969
970 #ifdef U_ENABLE_GENERIC_ISO_2022
971 if (*source >= sourceLimit)
972 return sourceLimit;
973
974 do{
975
976 if (*mySource == ESC_2022){
977 int8_t i;
978 int32_t key = 0;
979 int32_t offset;
980 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
981
982 /* Kludge: I could not
983 * figure out the reason for validating an escape sequence
984 * twice - once here and once in changeState_2022().
985 * is it possible to have an ESC character in a ISO2022
986 * byte stream which is valid in a code page? Is it legal?
987 */
988 for (i=0;
989 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
990 i++) {
991 value = getKey_2022(*(mySource+i), &key, &offset);
992 }
993 if (value > 0 || *mySource==ESC_2022)
994 return mySource;
995
996 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
997 return sourceLimit;
998 }
999 }while (++mySource < sourceLimit);
1000
1001 return sourceLimit;
1002 #else
1003 while(mySource < sourceLimit && *mySource != ESC_2022) {
1004 ++mySource;
1005 }
1006 return mySource;
1007 #endif
1008 }
1009
1010
1011 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1012 * any future change in _MBCSFromUChar32() function should be reflected here.
1013 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1014 */
1015 static U_INLINE int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1016 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1017 UChar32 c,
1018 uint32_t* value,
1019 UBool useFallback,
1020 int outputType)
1021 {
1022 const int32_t *cx;
1023 const uint16_t *table;
1024 uint32_t stage2Entry;
1025 uint32_t myValue;
1026 int32_t length;
1027 const uint8_t *p;
1028 /*
1029 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1030 * Use internal version of ucnv_open() that verifies that the new structures are available,
1031 * else U_INTERNAL_PROGRAM_ERROR.
1032 */
1033 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1034 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1035 table=sharedData->mbcs.fromUnicodeTable;
1036 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1037 /* get the bytes and the length for the output */
1038 if(outputType==MBCS_OUTPUT_2){
1039 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1040 if(myValue<=0xff) {
1041 length=1;
1042 } else {
1043 length=2;
1044 }
1045 } else /* outputType==MBCS_OUTPUT_3 */ {
1046 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1047 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1048 if(myValue<=0xff) {
1049 length=1;
1050 } else if(myValue<=0xffff) {
1051 length=2;
1052 } else {
1053 length=3;
1054 }
1055 }
1056 /* is this code point assigned, or do we use fallbacks? */
1057 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1058 /* assigned */
1059 *value=myValue;
1060 return length;
1061 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1062 /*
1063 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1064 * There is no way with this data structure for fallback output
1065 * to be a zero byte.
1066 */
1067 *value=myValue;
1068 return -length;
1069 }
1070 }
1071
1072 cx=sharedData->mbcs.extIndexes;
1073 if(cx!=NULL) {
1074 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1075 }
1076
1077 /* unassigned */
1078 return 0;
1079 }
1080
1081 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1082 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1083 * @param retval pointer to output byte
1084 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1085 */
1086 static U_INLINE int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1087 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1088 UChar32 c,
1089 uint32_t* retval,
1090 UBool useFallback)
1091 {
1092 const uint16_t *table;
1093 int32_t value;
1094 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1095 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1096 return 0;
1097 }
1098 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1099 table=sharedData->mbcs.fromUnicodeTable;
1100 /* get the byte for the output */
1101 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1102 /* is this code point assigned, or do we use fallbacks? */
1103 *retval=(uint32_t)(value&0xff);
1104 if(value>=0xf00) {
1105 return 1; /* roundtrip */
1106 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1107 return -1; /* fallback taken */
1108 } else {
1109 return 0; /* no mapping */
1110 }
1111 }
1112
1113 /*
1114 * Check that the result is a 2-byte value with each byte in the range A1..FE
1115 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1116 * to move it to the ISO 2022 range 21..7E.
1117 * Return 0 if out of range.
1118 */
1119 static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value)1120 _2022FromGR94DBCS(uint32_t value) {
1121 if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1122 (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
1123 ) {
1124 return value - 0x8080; /* shift down to 21..7e byte range */
1125 } else {
1126 return 0; /* not valid for ISO 2022 */
1127 }
1128 }
1129
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode function of the generic ISO-2022 converter.
 *
 * The input is processed in alternating steps:
 * - while no escape sequence is pending (key==0), the bytes up to the next
 *   ESC (or the end of the input) are handed to the current sub-converter
 *   via ucnv_toUnicode(); an "ASCII" sub-converter is opened on demand
 * - the escape sequence itself is handled by changeState_2022()
 *
 * The function returns early on any error, on target overflow (after moving
 * the sub-converter's overflow UChars to this converter), when all input has
 * been consumed, and whenever offsets are requested and progress was made.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    UConverter* const outerCnv = args->converter;
    UConverterDataISO2022* const data = (UConverterDataISO2022*)(outerCnv->extraInfo);
    const char* const inputLimit = args->sourceLimit;
    const char* chunkLimit;
    const char* chunkStart;
    const UChar* targetStart;
    UConverter* subCnv;
    int8_t pending;

    while (args->source < inputLimit) {
        if(data->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            chunkLimit = getEndOfBuffer_2022(&(args->source), inputLimit, args->flush);

            if(args->source < chunkLimit) {
                if(data->currentConverter==NULL) {
                    data->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    data->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerCnv->mode = UCNV_SO;
                }
                subCnv = data->currentConverter;

                /* convert to before the ESC or until the end of the buffer */
                data->isFirstBuffer=FALSE;
                chunkStart = args->source;
                targetStart = args->target;

                /* temporarily let args refer to the sub-converter */
                args->converter = subCnv;
                ucnv_toUnicode(subCnv,
                               &args->target,
                               args->targetLimit,
                               &args->source,
                               chunkLimit,
                               args->offsets,
                               (UBool)(args->flush && chunkLimit == inputLimit),
                               err);
                args->converter = outerCnv;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    outerCnv->UCharErrorBufferLength = subCnv->UCharErrorBufferLength;
                    pending = outerCnv->UCharErrorBufferLength;
                    subCnv->UCharErrorBufferLength = 0;
                    if(pending > 0) {
                        uprv_memcpy(outerCnv->UCharErrorBuffer,
                                    subCnv->UCharErrorBuffer,
                                    pending*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == inputLimit) ||
                    ((args->offsets != NULL) &&
                        (args->target != targetStart || args->source != chunkStart)) ||
                    ((chunkLimit < inputLimit) && (subCnv->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        outerCnv->invalidCharLength = subCnv->invalidCharLength;
                        pending = outerCnv->invalidCharLength;
                        if(pending > 0) {
                            uprv_memcpy(outerCnv->invalidCharBuffer, subCnv->invalidCharBuffer, pending);
                        }
                    } else {
                        outerCnv->toULength = subCnv->toULength;
                        pending = outerCnv->toULength;
                        if(pending > 0) {
                            uprv_memcpy(outerCnv->toUBytes, subCnv->toUBytes, pending);
                            if(args->source < chunkLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a partial escape sequence */
        chunkStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               inputLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != chunkStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1243
1244 /*
1245 * To Unicode Callback helper function
1246 */
1247 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1248 toUnicodeCallback(UConverter *cnv,
1249 const uint32_t sourceChar, const uint32_t targetUniChar,
1250 UErrorCode* err){
1251 if(sourceChar>0xff){
1252 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1253 cnv->toUBytes[1] = (uint8_t)sourceChar;
1254 cnv->toULength = 2;
1255 }
1256 else{
1257 cnv->toUBytes[0] =(char) sourceChar;
1258 cnv->toULength = 1;
1259 }
1260
1261 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1262 *err = U_INVALID_CHAR_FOUND;
1263 }
1264 else{
1265 *err = U_ILLEGAL_CHAR_FOUND;
1266 }
1267 }
1268
1269 /**************************************ISO-2022-JP*************************************************/
1270
1271 /************************************** IMPORTANT **************************************************
1272 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1273 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1274 * The converter iterates over each Unicode codepoint
1275 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1276 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1277 * would do as far as possible.
1278 *
1279 * If the implementation of these macros or structure of sharedData struct change in the future, make
1280 * sure that ISO-2022 is also changed.
1281 ***************************************************************************************************
1282 */
1283
1284 /***************************************************************************************************
1285 * Rules for ISO-2022-jp encoding
1286 * (i) Escape sequences must be fully contained within a line they should not
1287 * span new lines or CRs
1288 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1289 * JIS-Roman character escape sequence should follow before the line terminates
1290 * (iii) If the first character on the line is represented by two bytes then a two
1291 * byte character escape sequence should precede it
1292 * (iv) If no escape sequence is encountered then the characters are ASCII
1293 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1294 * and invoked with SS2 (ESC N).
1295 * (vi) If there is any G0 designation in text, there must be a switch to
1296 * ASCII or to JIS X 0201-Roman before a space character (but not
1297 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1298 * characters such as tab or CRLF.
1299 * (vii) Supported encodings:
1300 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1301 *
1302 * source : RFC-1554
1303 *
1304 * JISX201, JISX208,JISX212 : new .cnv data files created
1305 * KSC5601 : alias to ibm-949 mapping table
1306 * GB2312 : alias to ibm-1386 mapping table
1307 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1308 * ISO-8859-7 : alias to ibm-9409 mapping table
1309 */
1310
1311 /* preference order of JP charsets */
1312 static const StateEnum jpCharsetPref[]={
1313 ASCII,
1314 JISX201,
1315 ISO8859_1,
1316 ISO8859_7,
1317 JISX208,
1318 JISX212,
1319 GB2312,
1320 KSC5601,
1321 HWKANA_7BIT
1322 };
1323
/*
 * Designation escape sequences, indexed by charset enum constant.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 * The bytes are spelled as hex escapes and each entry is NUL-padded to 6 chars.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};
/*
 * Byte lengths of the designation escape sequences, one per charset and in
 * the same order as the sequences themselves:
 * ASCII, ISO-8859-1, ISO-8859-7, JISX-201, JISX-208, JISX-212, GB2312,
 * KSC5601, HWKANA_7BIT.
 */
static const int32_t escSeqCharsLen[] ={
    3,  /* <ESC>(B  ASCII       */
    3,  /* <ESC>.A  ISO-8859-1  */
    3,  /* <ESC>.F  ISO-8859-7  */
    3,  /* <ESC>(J  JISX-201    */
    3,  /* <ESC>$B  JISX-208    */
    4,  /* <ESC>$(D JISX-212    */
    3,  /* <ESC>$A  GB2312      */
    4,  /* <ESC>$(C KSC5601     */
    3   /* <ESC>(I  HWKANA_7BIT */
};
1351
1352 /*
1353 * The iteration over various code pages works this way:
1354 * i) Get the currentState from myConverterData->currentState
1355 * ii) Check if the character is mapped to a valid character in the currentState
1356 * Yes -> a) set the initIterState to currentState
1357 * b) remain in this state until an invalid character is found
1358 * No -> a) go to the next code page and find the character
1359 * iii) Before changing the state, increment the current state and check whether it
1360 * is equal to the initial iteration state (initIterState)
1361 * Yes -> A character that cannot be represented in any of the supported encodings
1362 * break and return a U_INVALID_CHARACTER error
1363 * No -> Continue and find the character in next code page
1364 *
1365 *
1366 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1367 */
1368
1369 /* Map 00..7F to Unicode according to JIS X 0201. */
1370 static U_INLINE uint32_t
jisx201ToU(uint32_t value)1371 jisx201ToU(uint32_t value) {
1372 if(value < 0x5c) {
1373 return value;
1374 } else if(value == 0x5c) {
1375 return 0xa5;
1376 } else if(value == 0x7e) {
1377 return 0x203e;
1378 } else /* value <= 0x7f */ {
1379 return value;
1380 }
1381 }
1382
1383 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1384 static U_INLINE uint32_t
jisx201FromU(uint32_t value)1385 jisx201FromU(uint32_t value) {
1386 if(value<=0x7f) {
1387 if(value!=0x5c && value!=0x7e) {
1388 return value;
1389 }
1390 } else if(value==0xa5) {
1391 return 0x5c;
1392 } else if(value==0x203e) {
1393 return 0x7e;
1394 }
1395 return 0xfffe;
1396 }
1397
1398 /*
1399 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1400 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1401 * Return 0 if the byte pair is out of range.
1402 */
1403 static U_INLINE uint32_t
_2022FromSJIS(uint32_t value)1404 _2022FromSJIS(uint32_t value) {
1405 uint8_t trail;
1406
1407 if(value > 0xEFFC) {
1408 return 0; /* beyond JIS X 0208 */
1409 }
1410
1411 trail = (uint8_t)value;
1412
1413 value &= 0xff00; /* lead byte */
1414 if(value <= 0x9f00) {
1415 value -= 0x7000;
1416 } else /* 0xe000 <= value <= 0xef00 */ {
1417 value -= 0xb000;
1418 }
1419 value <<= 1;
1420
1421 if(trail <= 0x9e) {
1422 value -= 0x100;
1423 if(trail <= 0x7e) {
1424 value |= trail - 0x1f;
1425 } else {
1426 value |= trail - 0x20;
1427 }
1428 } else /* trail <= 0xfc */ {
1429 value |= trail - 0x7e;
1430 }
1431 return value;
1432 }
1433
1434 /*
1435 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1436 * If either byte is outside 21..7E make sure that the result is not valid
1437 * for Shift-JIS so that the converter catches it.
1438 * Some invalid byte values already turn into equally invalid Shift-JIS
1439 * byte values and need not be tested explicitly.
1440 */
1441 static U_INLINE void
_2022ToSJIS(uint8_t c1,uint8_t c2,char bytes[2])1442 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
1443 if(c1&1) {
1444 ++c1;
1445 if(c2 <= 0x5f) {
1446 c2 += 0x1f;
1447 } else if(c2 <= 0x7e) {
1448 c2 += 0x20;
1449 } else {
1450 c2 = 0; /* invalid */
1451 }
1452 } else {
1453 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
1454 c2 += 0x7e;
1455 } else {
1456 c2 = 0; /* invalid */
1457 }
1458 }
1459 c1 >>= 1;
1460 if(c1 <= 0x2f) {
1461 c1 += 0x70;
1462 } else if(c1 <= 0x3f) {
1463 c1 += 0xb0;
1464 } else {
1465 c1 = 0; /* invalid */
1466 }
1467 bytes[0] = (char)c1;
1468 bytes[1] = (char)c2;
1469 }
1470
1471 /*
1472 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1473 * Katakana.
1474 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1475 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1476 * These were the only fallbacks in ICU's jisx-208.ucm file.
1477 */
1478 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1479 0x2123, /* U+FF61 */
1480 0x2156,
1481 0x2157,
1482 0x2122,
1483 0x2126,
1484 0x2572,
1485 0x2521,
1486 0x2523,
1487 0x2525,
1488 0x2527,
1489 0x2529,
1490 0x2563,
1491 0x2565,
1492 0x2567,
1493 0x2543,
1494 0x213C, /* U+FF70 */
1495 0x2522,
1496 0x2524,
1497 0x2526,
1498 0x2528,
1499 0x252A,
1500 0x252B,
1501 0x252D,
1502 0x252F,
1503 0x2531,
1504 0x2533,
1505 0x2535,
1506 0x2537,
1507 0x2539,
1508 0x253B,
1509 0x253D,
1510 0x253F, /* U+FF80 */
1511 0x2541,
1512 0x2544,
1513 0x2546,
1514 0x2548,
1515 0x254A,
1516 0x254B,
1517 0x254C,
1518 0x254D,
1519 0x254E,
1520 0x254F,
1521 0x2552,
1522 0x2555,
1523 0x2558,
1524 0x255B,
1525 0x255E,
1526 0x255F, /* U+FF90 */
1527 0x2560,
1528 0x2561,
1529 0x2562,
1530 0x2564,
1531 0x2566,
1532 0x2568,
1533 0x2569,
1534 0x256A,
1535 0x256B,
1536 0x256C,
1537 0x256D,
1538 0x256F,
1539 0x2573,
1540 0x212B,
1541 0x212C /* U+FF9F */
1542 };
1543
1544 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1545 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1546 UConverter *cnv = args->converter;
1547 UConverterDataISO2022 *converterData;
1548 ISO2022State *pFromU2022State;
1549 uint8_t *target = (uint8_t *) args->target;
1550 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1551 const UChar* source = args->source;
1552 const UChar* sourceLimit = args->sourceLimit;
1553 int32_t* offsets = args->offsets;
1554 UChar32 sourceChar;
1555 char buffer[8];
1556 int32_t len, outLen;
1557 int8_t choices[10];
1558 int32_t choiceCount;
1559 uint32_t targetValue = 0;
1560 UBool useFallback;
1561
1562 int32_t i;
1563 int8_t cs, g;
1564
1565 /* set up the state */
1566 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1567 pFromU2022State = &converterData->fromU2022State;
1568
1569 choiceCount = 0;
1570
1571 /* check if the last codepoint of previous buffer was a lead surrogate*/
1572 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1573 goto getTrail;
1574 }
1575
1576 while(source < sourceLimit) {
1577 if(target < targetLimit) {
1578
1579 sourceChar = *(source++);
1580 /*check if the char is a First surrogate*/
1581 if(UTF_IS_SURROGATE(sourceChar)) {
1582 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1583 getTrail:
1584 /*look ahead to find the trail surrogate*/
1585 if(source < sourceLimit) {
1586 /* test the following code unit */
1587 UChar trail=(UChar) *source;
1588 if(UTF_IS_SECOND_SURROGATE(trail)) {
1589 source++;
1590 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1591 cnv->fromUChar32=0x00;
1592 /* convert this supplementary code point */
1593 /* exit this condition tree */
1594 } else {
1595 /* this is an unmatched lead code unit (1st surrogate) */
1596 /* callback(illegal) */
1597 *err=U_ILLEGAL_CHAR_FOUND;
1598 cnv->fromUChar32=sourceChar;
1599 break;
1600 }
1601 } else {
1602 /* no more input */
1603 cnv->fromUChar32=sourceChar;
1604 break;
1605 }
1606 } else {
1607 /* this is an unmatched trail code unit (2nd surrogate) */
1608 /* callback(illegal) */
1609 *err=U_ILLEGAL_CHAR_FOUND;
1610 cnv->fromUChar32=sourceChar;
1611 break;
1612 }
1613 }
1614
1615 /* do not convert SO/SI/ESC */
1616 if(IS_2022_CONTROL(sourceChar)) {
1617 /* callback(illegal) */
1618 *err=U_ILLEGAL_CHAR_FOUND;
1619 cnv->fromUChar32=sourceChar;
1620 break;
1621 }
1622
1623 /* do the conversion */
1624
1625 if(choiceCount == 0) {
1626 uint16_t csm;
1627
1628 /*
1629 * The csm variable keeps track of which charsets are allowed
1630 * and not used yet while building the choices[].
1631 */
1632 csm = jpCharsetMasks[converterData->version];
1633 choiceCount = 0;
1634
1635 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1636 if(converterData->version == 3 || converterData->version == 4) {
1637 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1638 }
1639 /* Do not try single-byte half-width Katakana for other versions. */
1640 csm &= ~CSM(HWKANA_7BIT);
1641
1642 /* try the current G0 charset */
1643 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1644 csm &= ~CSM(cs);
1645
1646 /* try the current G2 charset */
1647 if((cs = pFromU2022State->cs[2]) != 0) {
1648 choices[choiceCount++] = cs;
1649 csm &= ~CSM(cs);
1650 }
1651
1652 /* try all the other possible charsets */
1653 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1654 cs = (int8_t)jpCharsetPref[i];
1655 if(CSM(cs) & csm) {
1656 choices[choiceCount++] = cs;
1657 csm &= ~CSM(cs);
1658 }
1659 }
1660 }
1661
1662 cs = g = 0;
1663 /*
1664 * len==0: no mapping found yet
1665 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1666 * len>0: found a roundtrip result, done
1667 */
1668 len = 0;
1669 /*
1670 * We will turn off useFallback after finding a fallback,
1671 * but we still get fallbacks from PUA code points as usual.
1672 * Therefore, we will also need to check that we don't overwrite
1673 * an early fallback with a later one.
1674 */
1675 useFallback = cnv->useFallback;
1676
1677 for(i = 0; i < choiceCount && len <= 0; ++i) {
1678 uint32_t value;
1679 int32_t len2;
1680 int8_t cs0 = choices[i];
1681 switch(cs0) {
1682 case ASCII:
1683 if(sourceChar <= 0x7f) {
1684 targetValue = (uint32_t)sourceChar;
1685 len = 1;
1686 cs = cs0;
1687 g = 0;
1688 }
1689 break;
1690 case ISO8859_1:
1691 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1692 targetValue = (uint32_t)sourceChar - 0x80;
1693 len = 1;
1694 cs = cs0;
1695 g = 2;
1696 }
1697 break;
1698 case HWKANA_7BIT:
1699 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1700 if(converterData->version==3) {
1701 /* JIS7: use G1 (SO) */
1702 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1703 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1704 len = 1;
1705 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1706 g = 1;
1707 } else if(converterData->version==4) {
1708 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1709 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1710 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1711 len = 1;
1712
1713 cs = pFromU2022State->cs[0];
1714 if(IS_JP_DBCS(cs)) {
1715 /* switch from a DBCS charset to JISX201 */
1716 cs = (int8_t)JISX201;
1717 }
1718 /* else stay in the current G0 charset */
1719 g = 0;
1720 }
1721 /* else do not use HWKANA_7BIT with other versions */
1722 }
1723 break;
1724 case JISX201:
1725 /* G0 SBCS */
1726 value = jisx201FromU(sourceChar);
1727 if(value <= 0x7f) {
1728 targetValue = value;
1729 len = 1;
1730 cs = cs0;
1731 g = 0;
1732 useFallback = FALSE;
1733 }
1734 break;
1735 case JISX208:
1736 /* G0 DBCS from Shift-JIS table */
1737 len2 = MBCS_FROM_UCHAR32_ISO2022(
1738 converterData->myConverterArray[cs0],
1739 sourceChar, &value,
1740 useFallback, MBCS_OUTPUT_2);
1741 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1742 value = _2022FromSJIS(value);
1743 if(value != 0) {
1744 targetValue = value;
1745 len = len2;
1746 cs = cs0;
1747 g = 0;
1748 useFallback = FALSE;
1749 }
1750 } else if(len == 0 && useFallback &&
1751 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1752 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1753 len = -2;
1754 cs = cs0;
1755 g = 0;
1756 useFallback = FALSE;
1757 }
1758 break;
1759 case ISO8859_7:
1760 /* G0 SBCS forced to 7-bit output */
1761 len2 = MBCS_SINGLE_FROM_UCHAR32(
1762 converterData->myConverterArray[cs0],
1763 sourceChar, &value,
1764 useFallback);
1765 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1766 targetValue = value - 0x80;
1767 len = len2;
1768 cs = cs0;
1769 g = 2;
1770 useFallback = FALSE;
1771 }
1772 break;
1773 default:
1774 /* G0 DBCS */
1775 len2 = MBCS_FROM_UCHAR32_ISO2022(
1776 converterData->myConverterArray[cs0],
1777 sourceChar, &value,
1778 useFallback, MBCS_OUTPUT_2);
1779 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1780 if(cs0 == KSC5601) {
1781 /*
1782 * Check for valid bytes for the encoding scheme.
1783 * This is necessary because the sub-converter (windows-949)
1784 * has a broader encoding scheme than is valid for 2022.
1785 */
1786 value = _2022FromGR94DBCS(value);
1787 if(value == 0) {
1788 break;
1789 }
1790 }
1791 targetValue = value;
1792 len = len2;
1793 cs = cs0;
1794 g = 0;
1795 useFallback = FALSE;
1796 }
1797 break;
1798 }
1799 }
1800
1801 if(len != 0) {
1802 if(len < 0) {
1803 len = -len; /* fallback */
1804 }
1805 outLen = 0; /* count output bytes */
1806
1807 /* write SI if necessary (only for JIS7) */
1808 if(pFromU2022State->g == 1 && g == 0) {
1809 buffer[outLen++] = UCNV_SI;
1810 pFromU2022State->g = 0;
1811 }
1812
1813 /* write the designation sequence if necessary */
1814 if(cs != pFromU2022State->cs[g]) {
1815 int32_t escLen = escSeqCharsLen[cs];
1816 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1817 outLen += escLen;
1818 pFromU2022State->cs[g] = cs;
1819
1820 /* invalidate the choices[] */
1821 choiceCount = 0;
1822 }
1823
1824 /* write the shift sequence if necessary */
1825 if(g != pFromU2022State->g) {
1826 switch(g) {
1827 /* case 0 handled before writing escapes */
1828 case 1:
1829 buffer[outLen++] = UCNV_SO;
1830 pFromU2022State->g = 1;
1831 break;
1832 default: /* case 2 */
1833 buffer[outLen++] = 0x1b;
1834 buffer[outLen++] = 0x4e;
1835 break;
1836 /* no case 3: no SS3 in ISO-2022-JP-x */
1837 }
1838 }
1839
1840 /* write the output bytes */
1841 if(len == 1) {
1842 buffer[outLen++] = (char)targetValue;
1843 } else /* len == 2 */ {
1844 buffer[outLen++] = (char)(targetValue >> 8);
1845 buffer[outLen++] = (char)targetValue;
1846 }
1847 } else {
1848 /*
1849 * if we cannot find the character after checking all codepages
1850 * then this is an error
1851 */
1852 *err = U_INVALID_CHAR_FOUND;
1853 cnv->fromUChar32=sourceChar;
1854 break;
1855 }
1856
1857 if(sourceChar == CR || sourceChar == LF) {
1858 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1859 pFromU2022State->cs[2] = 0;
1860 choiceCount = 0;
1861 }
1862
1863 /* output outLen>0 bytes in buffer[] */
1864 if(outLen == 1) {
1865 *target++ = buffer[0];
1866 if(offsets) {
1867 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1868 }
1869 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1870 *target++ = buffer[0];
1871 *target++ = buffer[1];
1872 if(offsets) {
1873 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1874 *offsets++ = sourceIndex;
1875 *offsets++ = sourceIndex;
1876 }
1877 } else {
1878 fromUWriteUInt8(
1879 cnv,
1880 buffer, outLen,
1881 &target, (const char *)targetLimit,
1882 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1883 err);
1884 if(U_FAILURE(*err)) {
1885 break;
1886 }
1887 }
1888 } /* end if(myTargetIndex<myTargetLength) */
1889 else{
1890 *err =U_BUFFER_OVERFLOW_ERROR;
1891 break;
1892 }
1893
1894 }/* end while(mySourceIndex<mySourceLength) */
1895
1896 /*
1897 * the end of the input stream and detection of truncated input
1898 * are handled by the framework, but for ISO-2022-JP conversion
1899 * we need to be in ASCII mode at the very end
1900 *
1901 * conditions:
1902 * successful
1903 * in SO mode or not in ASCII mode
1904 * end of input and no truncated input
1905 */
1906 if( U_SUCCESS(*err) &&
1907 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1908 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1909 ) {
1910 int32_t sourceIndex;
1911
1912 outLen = 0;
1913
1914 if(pFromU2022State->g != 0) {
1915 buffer[outLen++] = UCNV_SI;
1916 pFromU2022State->g = 0;
1917 }
1918
1919 if(pFromU2022State->cs[0] != ASCII) {
1920 int32_t escLen = escSeqCharsLen[ASCII];
1921 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1922 outLen += escLen;
1923 pFromU2022State->cs[0] = (int8_t)ASCII;
1924 }
1925
1926 /* get the source index of the last input character */
1927 /*
1928 * TODO this would be simpler and more reliable if we used a pair
1929 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1930 * so that we could simply use the prevSourceIndex here;
1931 * this code gives an incorrect result for the rare case of an unmatched
1932 * trail surrogate that is alone in the last buffer of the text stream
1933 */
1934 sourceIndex=(int32_t)(source-args->source);
1935 if(sourceIndex>0) {
1936 --sourceIndex;
1937 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1938 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1939 ) {
1940 --sourceIndex;
1941 }
1942 } else {
1943 sourceIndex=-1;
1944 }
1945
1946 fromUWriteUInt8(
1947 cnv,
1948 buffer, outLen,
1949 &target, (const char *)targetLimit,
1950 &offsets, sourceIndex,
1951 err);
1952 }
1953
1954 /*save the state and return */
1955 args->source = source;
1956 args->target = (char*)target;
1957 }
1958
1959 /*************** to unicode *******************/
1960
/*
 * Converts ISO-2022-JP family bytes to UTF-16, optionally recording offsets.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit. On return, args->source and
 * args->target are advanced past what was consumed/produced.
 *
 * State kept across calls (in the converter's extraInfo and in the converter):
 *  - myData->key != 0: an escape sequence was only partially read
 *  - toULength == 1:   the lead byte of a double-byte character is pending
 *  - toU2022State:     designated charsets cs[] and the active set g
 *
 * @param args conversion arguments (source, target, offsets, converter)
 * @param err  set to U_BUFFER_OVERFLOW_ERROR when the target is full while
 *             input remains; may also be set by changeState_2022() and
 *             toUnicodeCallback() (both defined outside this block)
 */
static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[2];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;
    StateEnum cs;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    /* resume whatever was left unfinished at the end of the previous buffer */
    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* assume "no mapping" until one of the branches below finds one */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar) {
            case UCNV_SI:
                if(myData->version==3) {
                    /* JIS7: shift back in to G0 */
                    pToU2022State->g=0;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    /* leaves targetUniChar at missingCharMarker, so the byte is reported via the callback below */
                    break;
                }

            case UCNV_SO:
                if(myData->version==3) {
                    /* JIS7: switch to G1 half-width Katakana */
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
                    pToU2022State->g=1;
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
                    /* leaves targetUniChar at missingCharMarker, so the byte is reported via the callback below */
                    break;
                }

            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
escape:
                changeState_2022(args->converter,&(mySource),
                    mySourceLimit, ISO_2022_JP,err);

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* automatically reset to single-byte mode */
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
                    pToU2022State->cs[0] = (int8_t)ASCII;
                }
                /* also drop the G2 designation and return to G0 */
                pToU2022State->cs[2] = 0;
                pToU2022State->g = 0;
                /* falls through */
            default:
                /* convert one or two bytes */
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
                ) {
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);

                    /* return from a single-shift state to the previous one */
                    if(pToU2022State->g >= 2) {
                        pToU2022State->g=pToU2022State->prevG;
                    }
                } else switch(cs) {
                case ASCII:
                    /* only 7-bit bytes are valid; others stay at missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar;
                    }
                    break;
                case ISO8859_1:
                    /* the 7-bit byte maps to the code point 0x80 higher */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = mySourceChar + 0x80;
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case ISO8859_7:
                    if(mySourceChar <= 0x7f) {
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
                        targetUniChar =
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
                                myData->myConverterArray[cs],
                                mySourceChar + 0x80);
                    }
                    /* return from a single-shift state to the previous one */
                    pToU2022State->g=pToU2022State->prevG;
                    break;
                case JISX201:
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = jisx201ToU(mySourceChar);
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
                        /* 7-bit halfwidth Katakana */
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
                    }
                    break;
                default:
                    /* G0 DBCS */
                    if(mySource < mySourceLimit) {
                        char trailByte;
getTrailByte:
                        /* also entered by goto from the top when a lead byte was pending */
                        trailByte = *mySource++;
                        if(cs == JISX208) {
                            /* JIS X 0208 is looked up after transforming the byte pair with _2022ToSJIS() */
                            _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
                        } else {
                            tempBuf[0] = (char)mySourceChar;
                            tempBuf[1] = trailByte;
                        }
                        /* keep both bytes in mySourceChar for offsets and error reporting */
                        mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
                    } else {
                        /* input ends after the lead byte: remember it and resume on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }  /* End of inner switch */
                break;
            }  /* End of outer switch */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar */
                if(args->offsets){
                    /*
                     * mySource is already past the character; back up by its byte
                     * length (1 for a single byte, 2 when mySourceChar holds two bytes)
                     */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's overflow buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                /* reached when targetUniChar is 0xfffe or missingCharMarker, i.e. no usable mapping */
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* save the positions reached for the caller */
    args->target = myTarget;
    args->source = mySource;
}
2154
2155
2156 /***************************************************************
2157 * Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file
2161 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2162 * and SI to shift into single byte mode
2163 */
2164 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2165 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2166
2167 UConverter* saveConv = args->converter;
2168 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2169 args->converter=myConverterData->currentConverter;
2170
2171 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2172 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2173 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2174
2175 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2176 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2177 uprv_memcpy(
2178 saveConv->charErrorBuffer,
2179 myConverterData->currentConverter->charErrorBuffer,
2180 myConverterData->currentConverter->charErrorBufferLength);
2181 }
2182 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2183 myConverterData->currentConverter->charErrorBufferLength = 0;
2184 }
2185 args->converter=saveConv;
2186 }
2187
2188 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2189 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2190
2191 const UChar *source = args->source;
2192 const UChar *sourceLimit = args->sourceLimit;
2193 unsigned char *target = (unsigned char *) args->target;
2194 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2195 int32_t* offsets = args->offsets;
2196 uint32_t targetByteUnit = 0x0000;
2197 UChar32 sourceChar = 0x0000;
2198 UBool isTargetByteDBCS;
2199 UBool oldIsTargetByteDBCS;
2200 UConverterDataISO2022 *converterData;
2201 UConverterSharedData* sharedData;
2202 UBool useFallback;
2203 int32_t length =0;
2204
2205 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2206 /* if the version is 1 then the user is requesting
2207 * conversion with ibm-25546 pass the arguments to
2208 * MBCS converter and return
2209 */
2210 if(converterData->version==1){
2211 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2212 return;
2213 }
2214
2215 /* initialize data */
2216 sharedData = converterData->currentConverter->sharedData;
2217 useFallback = args->converter->useFallback;
2218 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2219 oldIsTargetByteDBCS = isTargetByteDBCS;
2220
2221 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2222 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2223 goto getTrail;
2224 }
2225 while(source < sourceLimit){
2226
2227 targetByteUnit = missingCharMarker;
2228
2229 if(target < (unsigned char*) args->targetLimit){
2230 sourceChar = *source++;
2231
2232 /* do not convert SO/SI/ESC */
2233 if(IS_2022_CONTROL(sourceChar)) {
2234 /* callback(illegal) */
2235 *err=U_ILLEGAL_CHAR_FOUND;
2236 args->converter->fromUChar32=sourceChar;
2237 break;
2238 }
2239
2240 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2241 if(length < 0) {
2242 length = -length; /* fallback */
2243 }
2244 /* only DBCS or SBCS characters are expected*/
2245 /* DB characters with high bit set to 1 are expected */
2246 if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
2247 targetByteUnit=missingCharMarker;
2248 }
2249 if (targetByteUnit != missingCharMarker){
2250
2251 oldIsTargetByteDBCS = isTargetByteDBCS;
2252 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2253 /* append the shift sequence */
2254 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2255
2256 if (isTargetByteDBCS)
2257 *target++ = UCNV_SO;
2258 else
2259 *target++ = UCNV_SI;
2260 if(offsets)
2261 *(offsets++) = (int32_t)(source - args->source-1);
2262 }
2263 /* write the targetUniChar to target */
2264 if(targetByteUnit <= 0x00FF){
2265 if( target < targetLimit){
2266 *(target++) = (unsigned char) targetByteUnit;
2267 if(offsets){
2268 *(offsets++) = (int32_t)(source - args->source-1);
2269 }
2270
2271 }else{
2272 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2273 *err = U_BUFFER_OVERFLOW_ERROR;
2274 }
2275 }else{
2276 if(target < targetLimit){
2277 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2278 if(offsets){
2279 *(offsets++) = (int32_t)(source - args->source-1);
2280 }
2281 if(target < targetLimit){
2282 *(target++) =(unsigned char) (targetByteUnit -0x80);
2283 if(offsets){
2284 *(offsets++) = (int32_t)(source - args->source-1);
2285 }
2286 }else{
2287 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2288 *err = U_BUFFER_OVERFLOW_ERROR;
2289 }
2290 }else{
2291 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2292 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2293 *err = U_BUFFER_OVERFLOW_ERROR;
2294 }
2295 }
2296
2297 }
2298 else{
2299 /* oops.. the code point is unassingned
2300 * set the error and reason
2301 */
2302
2303 /*check if the char is a First surrogate*/
2304 if(UTF_IS_SURROGATE(sourceChar)) {
2305 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2306 getTrail:
2307 /*look ahead to find the trail surrogate*/
2308 if(source < sourceLimit) {
2309 /* test the following code unit */
2310 UChar trail=(UChar) *source;
2311 if(UTF_IS_SECOND_SURROGATE(trail)) {
2312 source++;
2313 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2314 *err = U_INVALID_CHAR_FOUND;
2315 /* convert this surrogate code point */
2316 /* exit this condition tree */
2317 } else {
2318 /* this is an unmatched lead code unit (1st surrogate) */
2319 /* callback(illegal) */
2320 *err=U_ILLEGAL_CHAR_FOUND;
2321 }
2322 } else {
2323 /* no more input */
2324 *err = U_ZERO_ERROR;
2325 }
2326 } else {
2327 /* this is an unmatched trail code unit (2nd surrogate) */
2328 /* callback(illegal) */
2329 *err=U_ILLEGAL_CHAR_FOUND;
2330 }
2331 } else {
2332 /* callback(unassigned) for a BMP code point */
2333 *err = U_INVALID_CHAR_FOUND;
2334 }
2335
2336 args->converter->fromUChar32=sourceChar;
2337 break;
2338 }
2339 } /* end if(myTargetIndex<myTargetLength) */
2340 else{
2341 *err =U_BUFFER_OVERFLOW_ERROR;
2342 break;
2343 }
2344
2345 }/* end while(mySourceIndex<mySourceLength) */
2346
2347 /*
2348 * the end of the input stream and detection of truncated input
2349 * are handled by the framework, but for ISO-2022-KR conversion
2350 * we need to be in ASCII mode at the very end
2351 *
2352 * conditions:
2353 * successful
2354 * not in ASCII mode
2355 * end of input and no truncated input
2356 */
2357 if( U_SUCCESS(*err) &&
2358 isTargetByteDBCS &&
2359 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2360 ) {
2361 int32_t sourceIndex;
2362
2363 /* we are switching to ASCII */
2364 isTargetByteDBCS=FALSE;
2365
2366 /* get the source index of the last input character */
2367 /*
2368 * TODO this would be simpler and more reliable if we used a pair
2369 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2370 * so that we could simply use the prevSourceIndex here;
2371 * this code gives an incorrect result for the rare case of an unmatched
2372 * trail surrogate that is alone in the last buffer of the text stream
2373 */
2374 sourceIndex=(int32_t)(source-args->source);
2375 if(sourceIndex>0) {
2376 --sourceIndex;
2377 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2378 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2379 ) {
2380 --sourceIndex;
2381 }
2382 } else {
2383 sourceIndex=-1;
2384 }
2385
2386 fromUWriteUInt8(
2387 args->converter,
2388 SHIFT_IN_STR, 1,
2389 &target, (const char *)targetLimit,
2390 &offsets, sourceIndex,
2391 err);
2392 }
2393
2394 /*save the state and return */
2395 args->source = source;
2396 args->target = (char*)target;
2397 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2398 }
2399
2400 /************************ To Unicode ***************************************/
2401
2402 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2403 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2404 UErrorCode* err){
2405 char const* sourceStart;
2406 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2407
2408 UConverterToUnicodeArgs subArgs;
2409 int32_t minArgsSize;
2410
2411 /* set up the subconverter arguments */
2412 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2413 minArgsSize = args->size;
2414 } else {
2415 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2416 }
2417
2418 uprv_memcpy(&subArgs, args, minArgsSize);
2419 subArgs.size = (uint16_t)minArgsSize;
2420 subArgs.converter = myData->currentConverter;
2421
2422 /* remember the original start of the input for offsets */
2423 sourceStart = args->source;
2424
2425 if(myData->key != 0) {
2426 /* continue with a partial escape sequence */
2427 goto escape;
2428 }
2429
2430 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2431 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2432 subArgs.source = args->source;
2433 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2434 if(subArgs.source != subArgs.sourceLimit) {
2435 /*
2436 * get the current partial byte sequence
2437 *
2438 * it needs to be moved between the public and the subconverter
2439 * so that the conversion framework, which only sees the public
2440 * converter, can handle truncated and illegal input etc.
2441 */
2442 if(args->converter->toULength > 0) {
2443 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2444 }
2445 subArgs.converter->toULength = args->converter->toULength;
2446
2447 /*
2448 * Convert up to the end of the input, or to before the next escape character.
2449 * Does not handle conversion extensions because the preToU[] state etc.
2450 * is not copied.
2451 */
2452 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2453
2454 if(args->offsets != NULL && sourceStart != args->source) {
2455 /* update offsets to base them on the actual start of the input */
2456 int32_t *offsets = args->offsets;
2457 UChar *target = args->target;
2458 int32_t delta = (int32_t)(args->source - sourceStart);
2459 while(target < subArgs.target) {
2460 if(*offsets >= 0) {
2461 *offsets += delta;
2462 }
2463 ++offsets;
2464 ++target;
2465 }
2466 }
2467 args->source = subArgs.source;
2468 args->target = subArgs.target;
2469 args->offsets = subArgs.offsets;
2470
2471 /* copy input/error/overflow buffers */
2472 if(subArgs.converter->toULength > 0) {
2473 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2474 }
2475 args->converter->toULength = subArgs.converter->toULength;
2476
2477 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2478 if(subArgs.converter->UCharErrorBufferLength > 0) {
2479 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2480 subArgs.converter->UCharErrorBufferLength);
2481 }
2482 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2483 subArgs.converter->UCharErrorBufferLength = 0;
2484 }
2485 }
2486
2487 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2488 return;
2489 }
2490
2491 escape:
2492 changeState_2022(args->converter,
2493 &(args->source),
2494 args->sourceLimit,
2495 ISO_2022_KR,
2496 err);
2497 }
2498 }
2499
2500 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2501 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2502 UErrorCode* err){
2503 char tempBuf[2];
2504 const char *mySource = ( char *) args->source;
2505 UChar *myTarget = args->target;
2506 const char *mySourceLimit = args->sourceLimit;
2507 UChar32 targetUniChar = 0x0000;
2508 UChar mySourceChar = 0x0000;
2509 UConverterDataISO2022* myData;
2510 UConverterSharedData* sharedData ;
2511 UBool useFallback;
2512
2513 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2514 if(myData->version==1){
2515 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2516 return;
2517 }
2518
2519 /* initialize state */
2520 sharedData = myData->currentConverter->sharedData;
2521 useFallback = args->converter->useFallback;
2522
2523 if(myData->key != 0) {
2524 /* continue with a partial escape sequence */
2525 goto escape;
2526 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2527 /* continue with a partial double-byte character */
2528 mySourceChar = args->converter->toUBytes[0];
2529 args->converter->toULength = 0;
2530 goto getTrailByte;
2531 }
2532
2533 while(mySource< mySourceLimit){
2534
2535 if(myTarget < args->targetLimit){
2536
2537 mySourceChar= (unsigned char) *mySource++;
2538
2539 if(mySourceChar==UCNV_SI){
2540 myData->toU2022State.g = 0;
2541 /*consume the source */
2542 continue;
2543 }else if(mySourceChar==UCNV_SO){
2544 myData->toU2022State.g = 1;
2545 /*consume the source */
2546 continue;
2547 }else if(mySourceChar==ESC_2022){
2548 mySource--;
2549 escape:
2550 changeState_2022(args->converter,&(mySource),
2551 mySourceLimit, ISO_2022_KR, err);
2552 if(U_FAILURE(*err)){
2553 args->target = myTarget;
2554 args->source = mySource;
2555 return;
2556 }
2557 continue;
2558 }
2559
2560 if(myData->toU2022State.g == 1) {
2561 if(mySource < mySourceLimit) {
2562 char trailByte;
2563 getTrailByte:
2564 trailByte = *mySource++;
2565 tempBuf[0] = (char)(mySourceChar + 0x80);
2566 tempBuf[1] = (char)(trailByte + 0x80);
2567 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2568 if((mySourceChar & 0x8080) == 0) {
2569 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2570 } else {
2571 /* illegal bytes > 0x7f */
2572 targetUniChar = missingCharMarker;
2573 }
2574 } else {
2575 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2576 args->converter->toULength = 1;
2577 break;
2578 }
2579 }
2580 else{
2581 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2582 }
2583 if(targetUniChar < 0xfffe){
2584 if(args->offsets) {
2585 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2586 }
2587 *(myTarget++)=(UChar)targetUniChar;
2588 }
2589 else {
2590 /* Call the callback function*/
2591 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2592 break;
2593 }
2594 }
2595 else{
2596 *err =U_BUFFER_OVERFLOW_ERROR;
2597 break;
2598 }
2599 }
2600 args->target = myTarget;
2601 args->source = mySource;
2602 }
2603
2604 /*************************** END ISO2022-KR *********************************/
2605
2606 /*************************** ISO-2022-CN *********************************
2607 *
2608 * Rules for ISO-2022-CN Encoding:
2609 * i) The designator sequence must appear once on a line before any instance
2610 * of character set it designates.
2611 * ii) If two lines contain characters from the same character set, both lines
2612 * must include the designator sequence.
2613 * iii) Once the designator sequence is known, a shifting sequence has to be found
2614 * to invoke the shifting
2615 * iv) All lines start in ASCII and end in ASCII.
*   v) Four shifting sequences are employed for this purpose:
*
*      Sequence     ASCII Eq    Charsets
*      ----------   --------    ---------
*      SI           <SI>        US-ASCII
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2624 *
2625 * vi)
2626 * SOdesignator : ESC "$" ")" finalchar_for_SO
2627 * SS2designator : ESC "$" "*" finalchar_for_SS2
2628 * SS3designator : ESC "$" "+" finalchar_for_SS3
2629 *
2630 * ESC $ ) A Indicates the bytes following SO are Chinese
2631 * characters as defined in GB 2312-80, until
2632 * another SOdesignation appears
2633 *
2634 *
2635 * ESC $ ) E Indicates the bytes following SO are as defined
2636 * in ISO-IR-165 (for details, see section 2.1),
2637 * until another SOdesignation appears
2638 *
2639 * ESC $ ) G Indicates the bytes following SO are as defined
2640 * in CNS 11643-plane-1, until another
2641 * SOdesignation appears
2642 *
2643 * ESC $ * H Indicates the two bytes immediately following
2644 * SS2 is a Chinese character as defined in CNS
2645 * 11643-plane-2, until another SS2designation
2646 * appears
*                      (Meaning <ESC>N must precede every 2 byte
*                      sequence.)
2649 *
2650 * ESC $ + I Indicates the immediate two bytes following SS3
2651 * is a Chinese character as defined in CNS
2652 * 11643-plane-3, until another SS3designation
2653 * appears
*                      (Meaning <ESC>O must precede every 2 byte
*                      sequence.)
2656 *
2657 * ESC $ + J Indicates the immediate two bytes following SS3
2658 * is a Chinese character as defined in CNS
2659 * 11643-plane-4, until another SS3designation
2660 * appears
*                      (In English: <ESC>O must precede every 2 byte
*                      sequence.)
2663 *
2664 * ESC $ + K Indicates the immediate two bytes following SS3
2665 * is a Chinese character as defined in CNS
2666 * 11643-plane-5, until another SS3designation
2667 * appears
2668 *
2669 * ESC $ + L Indicates the immediate two bytes following SS3
2670 * is a Chinese character as defined in CNS
2671 * 11643-plane-6, until another SS3designation
2672 * appears
2673 *
2674 * ESC $ + M Indicates the immediate two bytes following SS3
2675 * is a Chinese character as defined in CNS
2676 * 11643-plane-7, until another SS3designation
2677 * appears
2678 *
2679 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2680 * has its own designation information before any Chinese characters
2681 * appear
2682 *
2683 */
2684
/* The following are defined this way to make the strings truly readonly */
/*
 * Designation escape sequences for ISO-2022-CN, one per character set.
 * Each is 4 bytes: ESC (0x1B), '$' (0x24), an intermediate byte selecting
 * the SO/SS2/SS3 designator (')' 0x29, '*' 0x2A, '+' 0x2B) and a final byte.
 * The bytes are written as hex escapes rather than character literals;
 * presumably so that they do not depend on the compiler's execution
 * character set - confirm before changing them.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2695
/********************** ISO2022-CN Data **************************/
/*
 * Table of the byte sequences that select each ISO-2022-CN character set.
 * Entry 0 is the shift-in string for ASCII; entries 1..9 are the designation
 * escape sequences defined above.
 * NOTE(review): presumably indexed by the ISO-2022-CN charset state value
 * (GB2312_1, ISO_IR_165, CNS_11643_1, ...) - confirm the order against that
 * enum before reordering or adding entries.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,             /* GB 2312-80 */
        ISO_IR_165_STR,             /* ISO-IR-165 */
        CNS_11643_1992_Plane_1_STR, /* CNS 11643-1992 plane 1 */
        CNS_11643_1992_Plane_2_STR, /* CNS 11643-1992 plane 2 */
        CNS_11643_1992_Plane_3_STR, /* CNS 11643-1992 plane 3 */
        CNS_11643_1992_Plane_4_STR, /* CNS 11643-1992 plane 4 */
        CNS_11643_1992_Plane_5_STR, /* CNS 11643-1992 plane 5 */
        CNS_11643_1992_Plane_6_STR, /* CNS 11643-1992 plane 6 */
        CNS_11643_1992_Plane_7_STR  /* CNS 11643-1992 plane 7 */
};
2709
2710 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2711 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2712 UConverter *cnv = args->converter;
2713 UConverterDataISO2022 *converterData;
2714 ISO2022State *pFromU2022State;
2715 uint8_t *target = (uint8_t *) args->target;
2716 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2717 const UChar* source = args->source;
2718 const UChar* sourceLimit = args->sourceLimit;
2719 int32_t* offsets = args->offsets;
2720 UChar32 sourceChar;
2721 char buffer[8];
2722 int32_t len;
2723 int8_t choices[3];
2724 int32_t choiceCount;
2725 uint32_t targetValue = 0;
2726 UBool useFallback;
2727
2728 /* set up the state */
2729 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2730 pFromU2022State = &converterData->fromU2022State;
2731
2732 choiceCount = 0;
2733
2734 /* check if the last codepoint of previous buffer was a lead surrogate*/
2735 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2736 goto getTrail;
2737 }
2738
2739 while( source < sourceLimit){
2740 if(target < targetLimit){
2741
2742 sourceChar = *(source++);
2743 /*check if the char is a First surrogate*/
2744 if(UTF_IS_SURROGATE(sourceChar)) {
2745 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2746 getTrail:
2747 /*look ahead to find the trail surrogate*/
2748 if(source < sourceLimit) {
2749 /* test the following code unit */
2750 UChar trail=(UChar) *source;
2751 if(UTF_IS_SECOND_SURROGATE(trail)) {
2752 source++;
2753 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2754 cnv->fromUChar32=0x00;
2755 /* convert this supplementary code point */
2756 /* exit this condition tree */
2757 } else {
2758 /* this is an unmatched lead code unit (1st surrogate) */
2759 /* callback(illegal) */
2760 *err=U_ILLEGAL_CHAR_FOUND;
2761 cnv->fromUChar32=sourceChar;
2762 break;
2763 }
2764 } else {
2765 /* no more input */
2766 cnv->fromUChar32=sourceChar;
2767 break;
2768 }
2769 } else {
2770 /* this is an unmatched trail code unit (2nd surrogate) */
2771 /* callback(illegal) */
2772 *err=U_ILLEGAL_CHAR_FOUND;
2773 cnv->fromUChar32=sourceChar;
2774 break;
2775 }
2776 }
2777
2778 /* do the conversion */
2779 if(sourceChar <= 0x007f ){
2780 /* do not convert SO/SI/ESC */
2781 if(IS_2022_CONTROL(sourceChar)) {
2782 /* callback(illegal) */
2783 *err=U_ILLEGAL_CHAR_FOUND;
2784 cnv->fromUChar32=sourceChar;
2785 break;
2786 }
2787
2788 /* US-ASCII */
2789 if(pFromU2022State->g == 0) {
2790 buffer[0] = (char)sourceChar;
2791 len = 1;
2792 } else {
2793 buffer[0] = UCNV_SI;
2794 buffer[1] = (char)sourceChar;
2795 len = 2;
2796 pFromU2022State->g = 0;
2797 choiceCount = 0;
2798 }
2799 if(sourceChar == CR || sourceChar == LF) {
2800 /* reset the state at the end of a line */
2801 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2802 choiceCount = 0;
2803 }
2804 }
2805 else{
2806 /* convert U+0080..U+10ffff */
2807 int32_t i;
2808 int8_t cs, g;
2809
2810 if(choiceCount == 0) {
2811 /* try the current SO/G1 converter first */
2812 choices[0] = pFromU2022State->cs[1];
2813
2814 /* default to GB2312_1 if none is designated yet */
2815 if(choices[0] == 0) {
2816 choices[0] = GB2312_1;
2817 }
2818
2819 if(converterData->version == 0) {
2820 /* ISO-2022-CN */
2821
2822 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2823 if(choices[0] == GB2312_1) {
2824 choices[1] = (int8_t)CNS_11643_1;
2825 } else {
2826 choices[1] = (int8_t)GB2312_1;
2827 }
2828
2829 choiceCount = 2;
2830 } else {
2831 /* ISO-2022-CN-EXT */
2832
2833 /* try one of the other converters */
2834 switch(choices[0]) {
2835 case GB2312_1:
2836 choices[1] = (int8_t)CNS_11643_1;
2837 choices[2] = (int8_t)ISO_IR_165;
2838 break;
2839 case ISO_IR_165:
2840 choices[1] = (int8_t)GB2312_1;
2841 choices[2] = (int8_t)CNS_11643_1;
2842 break;
2843 default: /* CNS_11643_x */
2844 choices[1] = (int8_t)GB2312_1;
2845 choices[2] = (int8_t)ISO_IR_165;
2846 break;
2847 }
2848
2849 choiceCount = 3;
2850 }
2851 }
2852
2853 cs = g = 0;
2854 /*
2855 * len==0: no mapping found yet
2856 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2857 * len>0: found a roundtrip result, done
2858 */
2859 len = 0;
2860 /*
2861 * We will turn off useFallback after finding a fallback,
2862 * but we still get fallbacks from PUA code points as usual.
2863 * Therefore, we will also need to check that we don't overwrite
2864 * an early fallback with a later one.
2865 */
2866 useFallback = cnv->useFallback;
2867
2868 for(i = 0; i < choiceCount && len <= 0; ++i) {
2869 int8_t cs0 = choices[i];
2870 if(cs0 > 0) {
2871 uint32_t value;
2872 int32_t len2;
2873 if(cs0 > CNS_11643_0) {
2874 len2 = MBCS_FROM_UCHAR32_ISO2022(
2875 converterData->myConverterArray[CNS_11643],
2876 sourceChar,
2877 &value,
2878 useFallback,
2879 MBCS_OUTPUT_3);
2880 if(len2 == 3 || (len2 == -3 && len == 0)) {
2881 targetValue = value;
2882 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
2883 if(len2 >= 0) {
2884 len = 2;
2885 } else {
2886 len = -2;
2887 useFallback = FALSE;
2888 }
2889 if(cs == CNS_11643_1) {
2890 g = 1;
2891 } else if(cs == CNS_11643_2) {
2892 g = 2;
2893 } else /* plane 3..7 */ if(converterData->version == 1) {
2894 g = 3;
2895 } else {
2896 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2897 len = 0;
2898 }
2899 }
2900 } else {
2901 /* GB2312_1 or ISO-IR-165 */
2902 len2 = MBCS_FROM_UCHAR32_ISO2022(
2903 converterData->myConverterArray[cs0],
2904 sourceChar,
2905 &value,
2906 useFallback,
2907 MBCS_OUTPUT_2);
2908 if(len2 == 2 || (len2 == -2 && len == 0)) {
2909 targetValue = value;
2910 len = len2;
2911 cs = cs0;
2912 g = 1;
2913 useFallback = FALSE;
2914 }
2915 }
2916 }
2917 }
2918
2919 if(len != 0) {
2920 len = 0; /* count output bytes; it must have been abs(len) == 2 */
2921
2922 /* write the designation sequence if necessary */
2923 if(cs != pFromU2022State->cs[g]) {
2924 if(cs < CNS_11643) {
2925 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2926 } else {
2927 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2928 }
2929 len = 4;
2930 pFromU2022State->cs[g] = cs;
2931 if(g == 1) {
2932 /* changing the SO/G1 charset invalidates the choices[] */
2933 choiceCount = 0;
2934 }
2935 }
2936
2937 /* write the shift sequence if necessary */
2938 if(g != pFromU2022State->g) {
2939 switch(g) {
2940 case 1:
2941 buffer[len++] = UCNV_SO;
2942
2943 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2944 pFromU2022State->g = 1;
2945 break;
2946 case 2:
2947 buffer[len++] = 0x1b;
2948 buffer[len++] = 0x4e;
2949 break;
2950 default: /* case 3 */
2951 buffer[len++] = 0x1b;
2952 buffer[len++] = 0x4f;
2953 break;
2954 }
2955 }
2956
2957 /* write the two output bytes */
2958 buffer[len++] = (char)(targetValue >> 8);
2959 buffer[len++] = (char)targetValue;
2960 } else {
2961 /* if we cannot find the character after checking all codepages
2962 * then this is an error
2963 */
2964 *err = U_INVALID_CHAR_FOUND;
2965 cnv->fromUChar32=sourceChar;
2966 break;
2967 }
2968 }
2969
2970 /* output len>0 bytes in buffer[] */
2971 if(len == 1) {
2972 *target++ = buffer[0];
2973 if(offsets) {
2974 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2975 }
2976 } else if(len == 2 && (target + 2) <= targetLimit) {
2977 *target++ = buffer[0];
2978 *target++ = buffer[1];
2979 if(offsets) {
2980 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2981 *offsets++ = sourceIndex;
2982 *offsets++ = sourceIndex;
2983 }
2984 } else {
2985 fromUWriteUInt8(
2986 cnv,
2987 buffer, len,
2988 &target, (const char *)targetLimit,
2989 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2990 err);
2991 if(U_FAILURE(*err)) {
2992 break;
2993 }
2994 }
2995 } /* end if(myTargetIndex<myTargetLength) */
2996 else{
2997 *err =U_BUFFER_OVERFLOW_ERROR;
2998 break;
2999 }
3000
3001 }/* end while(mySourceIndex<mySourceLength) */
3002
3003 /*
3004 * the end of the input stream and detection of truncated input
3005 * are handled by the framework, but for ISO-2022-CN conversion
3006 * we need to be in ASCII mode at the very end
3007 *
3008 * conditions:
3009 * successful
3010 * not in ASCII mode
3011 * end of input and no truncated input
3012 */
3013 if( U_SUCCESS(*err) &&
3014 pFromU2022State->g!=0 &&
3015 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3016 ) {
3017 int32_t sourceIndex;
3018
3019 /* we are switching to ASCII */
3020 pFromU2022State->g=0;
3021
3022 /* get the source index of the last input character */
3023 /*
3024 * TODO this would be simpler and more reliable if we used a pair
3025 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3026 * so that we could simply use the prevSourceIndex here;
3027 * this code gives an incorrect result for the rare case of an unmatched
3028 * trail surrogate that is alone in the last buffer of the text stream
3029 */
3030 sourceIndex=(int32_t)(source-args->source);
3031 if(sourceIndex>0) {
3032 --sourceIndex;
3033 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3034 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3035 ) {
3036 --sourceIndex;
3037 }
3038 } else {
3039 sourceIndex=-1;
3040 }
3041
3042 fromUWriteUInt8(
3043 cnv,
3044 SHIFT_IN_STR, 1,
3045 &target, (const char *)targetLimit,
3046 &offsets, sourceIndex,
3047 err);
3048 }
3049
3050 /*save the state and return */
3051 args->source = source;
3052 args->target = (char*)target;
3053 }
3054
3055
3056 static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)3057 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3058 UErrorCode* err){
3059 char tempBuf[3];
3060 const char *mySource = (char *) args->source;
3061 UChar *myTarget = args->target;
3062 const char *mySourceLimit = args->sourceLimit;
3063 uint32_t targetUniChar = 0x0000;
3064 uint32_t mySourceChar = 0x0000;
3065 UConverterDataISO2022* myData;
3066 ISO2022State *pToU2022State;
3067
3068 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3069 pToU2022State = &myData->toU2022State;
3070
3071 if(myData->key != 0) {
3072 /* continue with a partial escape sequence */
3073 goto escape;
3074 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3075 /* continue with a partial double-byte character */
3076 mySourceChar = args->converter->toUBytes[0];
3077 args->converter->toULength = 0;
3078 goto getTrailByte;
3079 }
3080
3081 while(mySource < mySourceLimit){
3082
3083 targetUniChar =missingCharMarker;
3084
3085 if(myTarget < args->targetLimit){
3086
3087 mySourceChar= (unsigned char) *mySource++;
3088
3089 switch(mySourceChar){
3090 case UCNV_SI:
3091 pToU2022State->g=0;
3092 continue;
3093
3094 case UCNV_SO:
3095 if(pToU2022State->cs[1] != 0) {
3096 pToU2022State->g=1;
3097 continue;
3098 } else {
3099 /* illegal to have SO before a matching designator */
3100 break;
3101 }
3102
3103 case ESC_2022:
3104 mySource--;
3105 escape:
3106 changeState_2022(args->converter,&(mySource),
3107 mySourceLimit, ISO_2022_CN,err);
3108
3109 /* invalid or illegal escape sequence */
3110 if(U_FAILURE(*err)){
3111 args->target = myTarget;
3112 args->source = mySource;
3113 return;
3114 }
3115 continue;
3116
3117 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3118
3119 case CR:
3120 /*falls through*/
3121 case LF:
3122 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3123 /* falls through */
3124 default:
3125 /* convert one or two bytes */
3126 if(pToU2022State->g != 0) {
3127 if(mySource < mySourceLimit) {
3128 UConverterSharedData *cnv;
3129 StateEnum tempState;
3130 int32_t tempBufLen;
3131 char trailByte;
3132 getTrailByte:
3133 trailByte = *mySource++;
3134 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3135 if(tempState > CNS_11643_0) {
3136 cnv = myData->myConverterArray[CNS_11643];
3137 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3138 tempBuf[1] = (char) (mySourceChar);
3139 tempBuf[2] = trailByte;
3140 tempBufLen = 3;
3141
3142 }else{
3143 cnv = myData->myConverterArray[tempState];
3144 tempBuf[0] = (char) (mySourceChar);
3145 tempBuf[1] = trailByte;
3146 tempBufLen = 2;
3147 }
3148 mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
3149 if(pToU2022State->g>=2) {
3150 /* return from a single-shift state to the previous one */
3151 pToU2022State->g=pToU2022State->prevG;
3152 }
3153 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3154 } else {
3155 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3156 args->converter->toULength = 1;
3157 goto endloop;
3158 }
3159 }
3160 else{
3161 if(mySourceChar <= 0x7f) {
3162 targetUniChar = (UChar) mySourceChar;
3163 }
3164 }
3165 break;
3166 }
3167 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3168 if(args->offsets){
3169 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3170 }
3171 *(myTarget++)=(UChar)targetUniChar;
3172 }
3173 else if(targetUniChar > missingCharMarker){
3174 /* disassemble the surrogate pair and write to output*/
3175 targetUniChar-=0x0010000;
3176 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3177 if(args->offsets){
3178 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3179 }
3180 ++myTarget;
3181 if(myTarget< args->targetLimit){
3182 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3183 if(args->offsets){
3184 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3185 }
3186 ++myTarget;
3187 }else{
3188 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3189 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3190 }
3191
3192 }
3193 else{
3194 /* Call the callback function*/
3195 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3196 break;
3197 }
3198 }
3199 else{
3200 *err =U_BUFFER_OVERFLOW_ERROR;
3201 break;
3202 }
3203 }
3204 endloop:
3205 args->target = myTarget;
3206 args->source = mySource;
3207 }
3208
3209 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3210 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3211 UConverter *cnv = args->converter;
3212 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3213 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3214 char *p, *subchar;
3215 char buffer[8];
3216 int32_t length;
3217
3218 subchar=(char *)cnv->subChars;
3219 length=cnv->subCharLen; /* assume length==1 for most variants */
3220
3221 p = buffer;
3222 switch(myConverterData->locale[0]){
3223 case 'j':
3224 {
3225 int8_t cs;
3226
3227 if(pFromU2022State->g == 1) {
3228 /* JIS7: switch from G1 to G0 */
3229 pFromU2022State->g = 0;
3230 *p++ = UCNV_SI;
3231 }
3232
3233 cs = pFromU2022State->cs[0];
3234 if(cs != ASCII && cs != JISX201) {
3235 /* not in ASCII or JIS X 0201: switch to ASCII */
3236 pFromU2022State->cs[0] = (int8_t)ASCII;
3237 *p++ = '\x1b';
3238 *p++ = '\x28';
3239 *p++ = '\x42';
3240 }
3241
3242 *p++ = subchar[0];
3243 break;
3244 }
3245 case 'c':
3246 if(pFromU2022State->g != 0) {
3247 /* not in ASCII mode: switch to ASCII */
3248 pFromU2022State->g = 0;
3249 *p++ = UCNV_SI;
3250 }
3251 *p++ = subchar[0];
3252 break;
3253 case 'k':
3254 if(myConverterData->version == 0) {
3255 if(length == 1) {
3256 if((UBool)args->converter->fromUnicodeStatus) {
3257 /* in DBCS mode: switch to SBCS */
3258 args->converter->fromUnicodeStatus = 0;
3259 *p++ = UCNV_SI;
3260 }
3261 *p++ = subchar[0];
3262 } else /* length == 2*/ {
3263 if(!(UBool)args->converter->fromUnicodeStatus) {
3264 /* in SBCS mode: switch to DBCS */
3265 args->converter->fromUnicodeStatus = 1;
3266 *p++ = UCNV_SO;
3267 }
3268 *p++ = subchar[0];
3269 *p++ = subchar[1];
3270 }
3271 break;
3272 } else {
3273 /* save the subconverter's substitution string */
3274 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3275 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3276
3277 /* set our substitution string into the subconverter */
3278 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3279 myConverterData->currentConverter->subCharLen = (int8_t)length;
3280
3281 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3282 args->converter = myConverterData->currentConverter;
3283 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3284 ucnv_cbFromUWriteSub(args, 0, err);
3285 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3286 args->converter = cnv;
3287
3288 /* restore the subconverter's substitution string */
3289 myConverterData->currentConverter->subChars = currentSubChars;
3290 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3291
3292 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3293 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3294 uprv_memcpy(
3295 cnv->charErrorBuffer,
3296 myConverterData->currentConverter->charErrorBuffer,
3297 myConverterData->currentConverter->charErrorBufferLength);
3298 }
3299 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3300 myConverterData->currentConverter->charErrorBufferLength = 0;
3301 }
3302 return;
3303 }
3304 default:
3305 /* not expected */
3306 break;
3307 }
3308 ucnv_cbFromUWriteBytes(args,
3309 buffer, (int32_t)(p - buffer),
3310 offsetIndex, err);
3311 }
3312
3313 /*
3314 * Structure for cloning an ISO 2022 converter into a single memory block.
3315 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3316 * and then ucnv_safeClone() of the sub-converter may additionally align
3317 * currentConverter inside the cloneStruct, for which we need the deadSpace
3318 * after currentConverter.
3319 * This is because UAlignedMemory may be larger than the actually
3320 * necessary alignment size for the platform.
3321 * The other cloneStruct fields will not be moved around,
3322 * and are aligned properly with cloneStruct's alignment.
3323 */
3324 struct cloneStruct
3325 {
3326 UConverter cnv;
3327 UConverter currentConverter;
3328 UAlignedMemory deadSpace;
3329 UConverterDataISO2022 mydata;
3330 };
3331
3332
3333 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3334 _ISO_2022_SafeClone(
3335 const UConverter *cnv,
3336 void *stackBuffer,
3337 int32_t *pBufferSize,
3338 UErrorCode *status)
3339 {
3340 struct cloneStruct * localClone;
3341 UConverterDataISO2022 *cnvData;
3342 int32_t i, size;
3343
3344 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3345 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3346 return NULL;
3347 }
3348
3349 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3350 localClone = (struct cloneStruct *)stackBuffer;
3351
3352 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3353
3354 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3355 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3356 localClone->cnv.isExtraLocal = TRUE;
3357
3358 /* share the subconverters */
3359
3360 if(cnvData->currentConverter != NULL) {
3361 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3362 localClone->mydata.currentConverter =
3363 ucnv_safeClone(cnvData->currentConverter,
3364 &localClone->currentConverter,
3365 &size, status);
3366 if(U_FAILURE(*status)) {
3367 return NULL;
3368 }
3369 }
3370
3371 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3372 if(cnvData->myConverterArray[i] != NULL) {
3373 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3374 }
3375 }
3376
3377 return &localClone->cnv;
3378 }
3379
3380 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3381 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3382 const USetAdder *sa,
3383 UConverterUnicodeSet which,
3384 UErrorCode *pErrorCode)
3385 {
3386 int32_t i;
3387 UConverterDataISO2022* cnvData;
3388
3389 if (U_FAILURE(*pErrorCode)) {
3390 return;
3391 }
3392 #ifdef U_ENABLE_GENERIC_ISO_2022
3393 if (cnv->sharedData == &_ISO2022Data) {
3394 /* We use UTF-8 in this case */
3395 sa->addRange(sa->set, 0, 0xd7FF);
3396 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3397 return;
3398 }
3399 #endif
3400
3401 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3402
3403 /* open a set and initialize it with code points that are algorithmically round-tripped */
3404 switch(cnvData->locale[0]){
3405 case 'j':
3406 /* include JIS X 0201 which is hardcoded */
3407 sa->add(sa->set, 0xa5);
3408 sa->add(sa->set, 0x203e);
3409 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3410 /* include Latin-1 for some variants of JP */
3411 sa->addRange(sa->set, 0, 0xff);
3412 } else {
3413 /* include ASCII for JP */
3414 sa->addRange(sa->set, 0, 0x7f);
3415 }
3416 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3417 /*
3418 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3419 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3420 * use half-width Katakana.
3421 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3422 * half-width Katakana via the ESC ( I sequence.
3423 * However, we only emit (fromUnicode) half-width Katakana according to the
3424 * definition of each variant.
3425 *
3426 * When including fallbacks,
3427 * we need to include half-width Katakana Unicode code points for all JP variants because
3428 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3429 */
3430 /* include half-width Katakana for JP */
3431 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3432 }
3433 break;
3434 case 'c':
3435 case 'z':
3436 /* include ASCII for CN */
3437 sa->addRange(sa->set, 0, 0x7f);
3438 break;
3439 case 'k':
3440 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3441 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3442 cnvData->currentConverter, sa, which, pErrorCode);
3443 /* the loop over myConverterArray[] will simply not find another converter */
3444 break;
3445 default:
3446 break;
3447 }
3448
3449 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3450 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3451 cnvData->version==0 && i==CNS_11643
3452 ) {
3453 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3454 ucnv_MBCSGetUnicodeSetForBytes(
3455 cnvData->myConverterArray[i],
3456 sa, UCNV_ROUNDTRIP_SET,
3457 0, 0x81, 0x82,
3458 pErrorCode);
3459 }
3460 #endif
3461
3462 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3463 UConverterSetFilter filter;
3464 if(cnvData->myConverterArray[i]!=NULL) {
3465 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3466 cnvData->version==0 && i==CNS_11643
3467 ) {
3468 /*
3469 * Version-specific for CN:
3470 * CN version 0 does not map CNS planes 3..7 although
3471 * they are all available in the CNS conversion table;
3472 * CN version 1 (-EXT) does map them all.
3473 * The two versions create different Unicode sets.
3474 */
3475 filter=UCNV_SET_FILTER_2022_CN;
3476 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3477 /*
3478 * Only add code points that map to Shift-JIS codes
3479 * corresponding to JIS X 0208.
3480 */
3481 filter=UCNV_SET_FILTER_SJIS;
3482 } else if(i==KSC5601) {
3483 /*
3484 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3485 * are broader than GR94.
3486 */
3487 filter=UCNV_SET_FILTER_GR94DBCS;
3488 } else {
3489 filter=UCNV_SET_FILTER_NONE;
3490 }
3491 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3492 }
3493 }
3494
3495 /*
3496 * ISO 2022 converters must not convert SO/SI/ESC despite what
3497 * sub-converters do by themselves.
3498 * Remove these characters from the set.
3499 */
3500 sa->remove(sa->set, 0x0e);
3501 sa->remove(sa->set, 0x0f);
3502 sa->remove(sa->set, 0x1b);
3503
3504 /* ISO 2022 converters do not convert C1 controls either */
3505 sa->removeRange(sa->set, 0x80, 0x9f);
3506 }
3507
3508 static const UConverterImpl _ISO2022Impl={
3509 UCNV_ISO_2022,
3510
3511 NULL,
3512 NULL,
3513
3514 _ISO2022Open,
3515 _ISO2022Close,
3516 _ISO2022Reset,
3517
3518 #ifdef U_ENABLE_GENERIC_ISO_2022
3519 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3520 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3521 ucnv_fromUnicode_UTF8,
3522 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3523 #else
3524 NULL,
3525 NULL,
3526 NULL,
3527 NULL,
3528 #endif
3529 NULL,
3530
3531 NULL,
3532 _ISO2022getName,
3533 _ISO_2022_WriteSub,
3534 _ISO_2022_SafeClone,
3535 _ISO_2022_GetUnicodeSet
3536 };
3537 static const UConverterStaticData _ISO2022StaticData={
3538 sizeof(UConverterStaticData),
3539 "ISO_2022",
3540 2022,
3541 UCNV_IBM,
3542 UCNV_ISO_2022,
3543 1,
3544 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3545 { 0x1a, 0, 0, 0 },
3546 1,
3547 FALSE,
3548 FALSE,
3549 0,
3550 0,
3551 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3552 };
3553 const UConverterSharedData _ISO2022Data={
3554 sizeof(UConverterSharedData),
3555 ~((uint32_t) 0),
3556 NULL,
3557 NULL,
3558 &_ISO2022StaticData,
3559 FALSE,
3560 &_ISO2022Impl,
3561 0
3562 };
3563
3564 /*************JP****************/
3565 static const UConverterImpl _ISO2022JPImpl={
3566 UCNV_ISO_2022,
3567
3568 NULL,
3569 NULL,
3570
3571 _ISO2022Open,
3572 _ISO2022Close,
3573 _ISO2022Reset,
3574
3575 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3576 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3577 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3578 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3579 NULL,
3580
3581 NULL,
3582 _ISO2022getName,
3583 _ISO_2022_WriteSub,
3584 _ISO_2022_SafeClone,
3585 _ISO_2022_GetUnicodeSet
3586 };
3587 static const UConverterStaticData _ISO2022JPStaticData={
3588 sizeof(UConverterStaticData),
3589 "ISO_2022_JP",
3590 0,
3591 UCNV_IBM,
3592 UCNV_ISO_2022,
3593 1,
3594 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3595 { 0x1a, 0, 0, 0 },
3596 1,
3597 FALSE,
3598 FALSE,
3599 0,
3600 0,
3601 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3602 };
3603 static const UConverterSharedData _ISO2022JPData={
3604 sizeof(UConverterSharedData),
3605 ~((uint32_t) 0),
3606 NULL,
3607 NULL,
3608 &_ISO2022JPStaticData,
3609 FALSE,
3610 &_ISO2022JPImpl,
3611 0
3612 };
3613
3614 /************* KR ***************/
3615 static const UConverterImpl _ISO2022KRImpl={
3616 UCNV_ISO_2022,
3617
3618 NULL,
3619 NULL,
3620
3621 _ISO2022Open,
3622 _ISO2022Close,
3623 _ISO2022Reset,
3624
3625 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3626 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3627 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3628 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3629 NULL,
3630
3631 NULL,
3632 _ISO2022getName,
3633 _ISO_2022_WriteSub,
3634 _ISO_2022_SafeClone,
3635 _ISO_2022_GetUnicodeSet
3636 };
3637 static const UConverterStaticData _ISO2022KRStaticData={
3638 sizeof(UConverterStaticData),
3639 "ISO_2022_KR",
3640 0,
3641 UCNV_IBM,
3642 UCNV_ISO_2022,
3643 1,
3644 3, /* max 3 bytes per UChar: SO+DBCS */
3645 { 0x1a, 0, 0, 0 },
3646 1,
3647 FALSE,
3648 FALSE,
3649 0,
3650 0,
3651 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3652 };
3653 static const UConverterSharedData _ISO2022KRData={
3654 sizeof(UConverterSharedData),
3655 ~((uint32_t) 0),
3656 NULL,
3657 NULL,
3658 &_ISO2022KRStaticData,
3659 FALSE,
3660 &_ISO2022KRImpl,
3661 0
3662 };
3663
3664 /*************** CN ***************/
3665 static const UConverterImpl _ISO2022CNImpl={
3666
3667 UCNV_ISO_2022,
3668
3669 NULL,
3670 NULL,
3671
3672 _ISO2022Open,
3673 _ISO2022Close,
3674 _ISO2022Reset,
3675
3676 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3677 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3678 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3679 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3680 NULL,
3681
3682 NULL,
3683 _ISO2022getName,
3684 _ISO_2022_WriteSub,
3685 _ISO_2022_SafeClone,
3686 _ISO_2022_GetUnicodeSet
3687 };
3688 static const UConverterStaticData _ISO2022CNStaticData={
3689 sizeof(UConverterStaticData),
3690 "ISO_2022_CN",
3691 0,
3692 UCNV_IBM,
3693 UCNV_ISO_2022,
3694 1,
3695 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3696 { 0x1a, 0, 0, 0 },
3697 1,
3698 FALSE,
3699 FALSE,
3700 0,
3701 0,
3702 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3703 };
3704 static const UConverterSharedData _ISO2022CNData={
3705 sizeof(UConverterSharedData),
3706 ~((uint32_t) 0),
3707 NULL,
3708 NULL,
3709 &_ISO2022CNStaticData,
3710 FALSE,
3711 &_ISO2022CNImpl,
3712 0
3713 };
3714
3715
3716
3717 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3718