1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it remains a single operand
 * wherever it is used, e.g. after sizeof or next to other operators.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* One-byte, NUL-terminated strings holding the SI (0x0F) and SO (0x0E) bytes. */
static const char SHIFT_IN_STR[] = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the whitespace/control characters named below. */
#define CR 0x0D
#define LF 0x0A
#define H_TAB 0x09
#define V_TAB 0x0B
#define SPACE 0x20

/* First and last code point of the range referred to as HWKANA (half-width Katakana). */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
91
92 /*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
enum {
    GR94_START=0xa1,    /* first native byte value of a 94-character set */
    GR94_END=0xfe,      /* last native byte value of a 94-character set */
    GR96_START=0xa0,    /* first native byte value of a 96-character set */
    GR96_END=0xff       /* last native byte value of a 96-character set */
};
106
107 /*
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
112 */
/*
 * Evaluates to non-zero only for SO (0x0e), SI (0x0f) and ESC (0x1b).
 * The expansion is fully parenthesized so that it can be negated or combined
 * with other operators, and the range test is done on the unsigned value so
 * that a negative argument never reaches the shift.
 * Note: the argument is evaluated twice; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) ((uint32_t)(c)<0x20 && (((uint32_t)1<<(c))&0x0800c000)!=0)
114
115 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the small values 1..3: each variant
 * only ever uses its own group.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ISO-2022-JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,  /* Halfwidth Katakana 7 bit */

    /*
     * ISO-2022-CN charsets.
     * These three must keep their values because they are used as
     * indexes into myConverterArray[].
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * CNS 11643 plane states: these are used in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
153
154 /* is the StateEnum charset value for a DBCS charset? */
/* True for StateEnum values in the range JISX208..KSC5601. Evaluates cs twice. */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/*
 * Charset mask: a single bit at position cs.
 * Only meaningful for small non-negative cs; the masks built from it are
 * stored in uint16_t variables.
 */
#define CSM(cs) ((uint16_t)1<<(cs))
158
159 /*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
168 enum { MAX_JA_VERSION=4 };
169 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
175 };
176
/* Kind of sub-charset; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
185
/*
 * Designation and shift state for one conversion direction.
 * A zero-filled struct is the initial state (see _ISO2022Reset()).
 */
typedef struct ISO2022State {
    int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG; /* g before single shift (SS2 or SS3) */
} ISO2022State;
191
/* The converter version is taken from the low 4 bits of the open options. */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/* Per-converter data, allocated by _ISO2022Open() and stored in UConverter.extraInfo. */
typedef struct{
    /* shared data of the sub-charsets, indexed by StateEnum values; loaded in _ISO2022Open(), unloaded in _ISO2022Close() */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() (used for locale "ko"); closed in _ISO2022Close() */
    UConverter *currentConverter;
    /* set to ASCII1 when the converter is opened */
    Cnv2022Type currentType;
    /* designation/shift state per direction; zeroed by _ISO2022Reset() */
    ISO2022State toU2022State, fromU2022State;
    /* escape-sequence matching state kept by changeState_2022(); non-zero while a sequence is incomplete */
    uint32_t key;
    /* variant version, derived from the open options */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    /* generic converter only; set to TRUE on open and on reset */
    UBool isFirstBuffer;
#endif
    /* cleared by _ISO2022Reset() for the to-Unicode direction; NOTE(review): its use is outside this part of the file */
    UBool isEmptySegment;
    /* converter name as returned by _ISO2022getName() */
    char name[30];
    /* "ja", "ko" or "cn"; stays empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;
209
210 /* Protos */
211 /* ISO-2022 ----------------------------------------------------------------- */
212
213 /*Forward declaration */
/*
 * UTF-8 from-Unicode functions with external linkage; only declared here.
 * NOTE(review): presumably defined in the UTF-8 converter implementation -- confirm.
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
220
#define ESC_2022 0x1B /*ESC*/

/*
 * Classification of the bytes matched so far against the escape sequence
 * tables; these are the values stored in escSeqStateTable_Value_2022[] and
 * returned by getKey_2022().
 */
typedef enum
{
    INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
230
231 /*
232 * The way these state transition arrays work is:
233 * ex : ESC$B is the sequence for JISX208
234 * a) First Iteration: char is ESC
235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236 * int x = normalize_esq_chars_2022[27] which is equal to 1
237 * ii) Search for this value in escSeqStateTable_Key_2022[]
238 * value of x is stored at escSeqStateTable_Key_2022[0]
239 * iii) Save this index as offset
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242 * b) Switch on this state and continue to next char
243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244 * which is normalize_esq_chars_2022[36] == 4
245 * ii) x is currently 1(from above)
246 * x<<=5 -- x is now 32
247 * x+=normalize_esq_chars_2022[36]
248 * now x is 36
249 * iii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253 * c) Switch on this state and continue to next char
254 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255 * ii) x is currently 36 (from above)
256 * x<<=5 -- x is now 1152
257 * x+=normalize_esq_chars_2022[66]
258 * now x is 1161
259 * iii) Search for this value in escSeqStateTable_Key_2022[]
 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
264 */
265
266
267 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to its small "normalized" code (1..29) if the byte can
 * occur in one of the recognized escape sequences, else to 0.
 * Rows are labelled with the decimal index of their first element.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          0   1   2   3   4   5   6   7   8   9 */
/*   0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  10 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  20 */   0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
/*  30 */   0,  0,  0,  0,  0,  0,  4,  7, 29,  0,
/*  40 */   2, 24, 26, 27,  0,  3, 23,  6,  0,  0,
/*  50 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  60 */   0,  0,  0,  0,  5,  8,  9, 10, 11, 12,
/*  70 */  13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
/*  80 */   0,  0, 21,  0,  0,  0,  0,  0,  0,  0,
/*  90 */  22,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 100 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 110 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 120 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 130 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 140 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 150 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 160 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 170 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 180 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 190 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 200 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 210 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 220 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 230 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 240 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 250 */   0,  0,  0,  0,  0,  0
};
298
299 #ifdef U_ENABLE_GENERIC_ISO_2022
300 /*
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
304 *
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 *
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
312 *
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
315 */
316 #endif
317
/* Number of entries in each of the parallel escape sequence tables. */
#define MAX_STATES_2022 74

/*
 * Keys of all recognized (partial and complete) escape sequences.
 * The values are in strictly ascending order because getKey_2022() does a
 * binary search on this table. The index of a key is the index into the
 * parallel tables. Rows are labelled with the index of their first element.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*  0 */ 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
/* 10 */ 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
/* 20 */ 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
/* 30 */ 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
/* 40 */ 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
/* 50 */ 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
/* 60 */ 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
/* 70 */ 35947631, 35947635, 35947636, 35947638
};
331
332 #ifdef U_ENABLE_GENERIC_ISO_2022
333
334 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
335 /* 0 1 2 3 4 5 6 7 8 9 */
336
337 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
338 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
339 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
340 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
341 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
342 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
343 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
344 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
345 };
346
347 #endif
348
349 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
350 /* 0 1 2 3 4 5 6 7 8 9 */
351 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
352 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
353 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
359 };
360
361
362 /* Type def for refactoring changeState_2022 code*/
/*
 * Selects the ISO-2022 variant for which changeState_2022() interprets a
 * completed escape sequence. ISO_2022 (the generic converter) only exists
 * when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;
371
372 /*********** ISO 2022 Converter Protos ***********/
/* Forward declarations of the file-local converter callbacks. */

/* allocate and set up the per-converter data for the requested locale/version */
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

/* release everything that _ISO2022Open() acquired */
static void
_ISO2022Close(UConverter *converter);

/* reset the to-Unicode and/or from-Unicode state */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

/* return the name stored in the per-converter data */
static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Shared data objects that _ISO2022Open() installs as cnv->sharedData for each variant. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;
400
401 /*************** Converter implementations ******************/
402
403 /* The purpose of this function is to get around gcc compiler warnings. */
404 static U_INLINE void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)405 fromUWriteUInt8(UConverter *cnv,
406 const char *bytes, int32_t length,
407 uint8_t **target, const char *targetLimit,
408 int32_t **offsets,
409 int32_t sourceIndex,
410 UErrorCode *pErrorCode)
411 {
412 char *targetChars = (char *)*target;
413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
414 offsets, sourceIndex, pErrorCode);
415 *target = (uint8_t*)targetChars;
416
417 }
418
419 static U_INLINE void
setInitialStateToUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
421 if(myConverterData->version == 1) {
422 UConverter *cnv = myConverterData->currentConverter;
423
424 cnv->toUnicodeStatus=0; /* offset */
425 cnv->mode=0; /* state */
426 cnv->toULength=0; /* byteIndex */
427 }
428 }
429
430 static U_INLINE void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
432 /* in ISO-2022-KR the designator sequence appears only once
433 * in a file so we append it only once
434 */
435 if( converter->charErrorBufferLength==0){
436
437 converter->charErrorBufferLength = 4;
438 converter->charErrorBuffer[0] = 0x1b;
439 converter->charErrorBuffer[1] = 0x24;
440 converter->charErrorBuffer[2] = 0x29;
441 converter->charErrorBuffer[3] = 0x43;
442 }
443 if(myConverterData->version == 1) {
444 UConverter *cnv = myConverterData->currentConverter;
445
446 cnv->fromUChar32=0;
447 cnv->fromUnicodeStatus=1; /* prevLength */
448 }
449 }
450
451 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
453
454 char myLocale[6]={' ',' ',' ',' ',' ',' '};
455
456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
457 if(cnv->extraInfo != NULL) {
458 UConverterNamePieces stackPieces;
459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
461 uint32_t version;
462
463 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
464
465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
466 myConverterData->currentType = ASCII1;
467 cnv->fromUnicodeStatus =FALSE;
468 if(pArgs->locale){
469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
470 }
471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
472 myConverterData->version = version;
473 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
474 (myLocale[2]=='_' || myLocale[2]=='\0'))
475 {
476 size_t len=0;
477 /* open the required converters and cache them */
478 if(version>MAX_JA_VERSION) {
479 /* prevent indexing beyond jpCharsetMasks[] */
480 myConverterData->version = version = 0;
481 }
482 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
483 myConverterData->myConverterArray[ISO8859_7] =
484 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
485 }
486 myConverterData->myConverterArray[JISX208] =
487 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
488 if(jpCharsetMasks[version]&CSM(JISX212)) {
489 myConverterData->myConverterArray[JISX212] =
490 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
491 }
492 if(jpCharsetMasks[version]&CSM(GB2312)) {
493 myConverterData->myConverterArray[GB2312] =
494 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
495 }
496 if(jpCharsetMasks[version]&CSM(KSC5601)) {
497 myConverterData->myConverterArray[KSC5601] =
498 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
499 }
500
501 /* set the function pointers to appropriate funtions */
502 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
503 uprv_strcpy(myConverterData->locale,"ja");
504
505 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
506 len = uprv_strlen(myConverterData->name);
507 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
508 myConverterData->name[len+1]='\0';
509 }
510 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
511 (myLocale[2]=='_' || myLocale[2]=='\0'))
512 {
513 const char *cnvName;
514 if(version==1) {
515 cnvName="icu-internal-25546";
516 } else {
517 cnvName="ksc_5601";
518 myConverterData->version=version=0;
519 }
520 if(pArgs->onlyTestIsLoadable) {
521 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
522 uprv_free(cnv->extraInfo);
523 cnv->extraInfo=NULL;
524 return;
525 } else {
526 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
527 if (U_FAILURE(*errorCode)) {
528 _ISO2022Close(cnv);
529 return;
530 }
531
532 if(version==1) {
533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
534 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
535 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
536 }else{
537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
538 }
539
540 /* initialize the state variables */
541 setInitialStateToUnicodeKR(cnv, myConverterData);
542 setInitialStateFromUnicodeKR(cnv, myConverterData);
543
544 /* set the function pointers to appropriate funtions */
545 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
546 uprv_strcpy(myConverterData->locale,"ko");
547 }
548 }
549 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
550 (myLocale[2]=='_' || myLocale[2]=='\0'))
551 {
552
553 /* open the required converters and cache them */
554 myConverterData->myConverterArray[GB2312_1] =
555 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
556 if(version==1) {
557 myConverterData->myConverterArray[ISO_IR_165] =
558 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
559 }
560 myConverterData->myConverterArray[CNS_11643] =
561 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
562
563
564 /* set the function pointers to appropriate funtions */
565 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
566 uprv_strcpy(myConverterData->locale,"cn");
567
568 if (version==0){
569 myConverterData->version = 0;
570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
571 }else if (version==1){
572 myConverterData->version = 1;
573 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
574 }else {
575 myConverterData->version = 2;
576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
577 }
578 }
579 else{
580 #ifdef U_ENABLE_GENERIC_ISO_2022
581 myConverterData->isFirstBuffer = TRUE;
582
583 /* append the UTF-8 escape sequence */
584 cnv->charErrorBufferLength = 3;
585 cnv->charErrorBuffer[0] = 0x1b;
586 cnv->charErrorBuffer[1] = 0x25;
587 cnv->charErrorBuffer[2] = 0x42;
588
589 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
590 /* initialize the state variables */
591 uprv_strcpy(myConverterData->name,"ISO_2022");
592 #else
593 *errorCode = U_UNSUPPORTED_ERROR;
594 return;
595 #endif
596 }
597
598 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
599
600 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
601 _ISO2022Close(cnv);
602 }
603 } else {
604 *errorCode = U_MEMORY_ALLOCATION_ERROR;
605 }
606 }
607
608
609 static void
_ISO2022Close(UConverter * converter)610 _ISO2022Close(UConverter *converter) {
611 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
612 UConverterSharedData **array = myData->myConverterArray;
613 int32_t i;
614
615 if (converter->extraInfo != NULL) {
616 /*close the array of converter pointers and free the memory*/
617 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
618 if(array[i]!=NULL) {
619 ucnv_unloadSharedDataIfReady(array[i]);
620 }
621 }
622
623 ucnv_close(myData->currentConverter);
624
625 if(!converter->isExtraLocal){
626 uprv_free (converter->extraInfo);
627 converter->extraInfo = NULL;
628 }
629 }
630 }
631
632 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)633 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
634 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
635 if(choice<=UCNV_RESET_TO_UNICODE) {
636 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
637 myConverterData->key = 0;
638 myConverterData->isEmptySegment = FALSE;
639 }
640 if(choice!=UCNV_RESET_TO_UNICODE) {
641 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
642 }
643 #ifdef U_ENABLE_GENERIC_ISO_2022
644 if(myConverterData->locale[0] == 0){
645 if(choice<=UCNV_RESET_TO_UNICODE) {
646 myConverterData->isFirstBuffer = TRUE;
647 myConverterData->key = 0;
648 if (converter->mode == UCNV_SO){
649 ucnv_close (myConverterData->currentConverter);
650 myConverterData->currentConverter=NULL;
651 }
652 converter->mode = UCNV_SI;
653 }
654 if(choice!=UCNV_RESET_TO_UNICODE) {
655 /* re-append UTF-8 escape sequence */
656 converter->charErrorBufferLength = 3;
657 converter->charErrorBuffer[0] = 0x1b;
658 converter->charErrorBuffer[1] = 0x28;
659 converter->charErrorBuffer[2] = 0x42;
660 }
661 }
662 else
663 #endif
664 {
665 /* reset the state variables */
666 if(myConverterData->locale[0] == 'k'){
667 if(choice<=UCNV_RESET_TO_UNICODE) {
668 setInitialStateToUnicodeKR(converter, myConverterData);
669 }
670 if(choice!=UCNV_RESET_TO_UNICODE) {
671 setInitialStateFromUnicodeKR(converter, myConverterData);
672 }
673 }
674 }
675 }
676
677 static const char*
_ISO2022getName(const UConverter * cnv)678 _ISO2022getName(const UConverter* cnv){
679 if(cnv->extraInfo){
680 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
681 return myData->name;
682 }
683 return NULL;
684 }
685
686
687 /*************** to unicode *******************/
688 /****************************************************************************
689 * Recognized escape sequences are
690 * <ESC>(B ASCII
691 * <ESC>.A ISO-8859-1
692 * <ESC>.F ISO-8859-7
693 * <ESC>(J JISX-201
 * <ESC>(I JISX-201 Katakana (half-width, 7-bit)
695 * <ESC>$B JISX-208
696 * <ESC>$@ JISX-208
697 * <ESC>$(D JISX-212
698 * <ESC>$A GB2312
699 * <ESC>$(C KSC5601
700 */
701 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
702 /* 0 1 2 3 4 5 6 7 8 9 */
703 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
704 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
705 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
706 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
707 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
708 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
709 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
710 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
711 };
712
713 /*************** to unicode *******************/
714 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
715 /* 0 1 2 3 4 5 6 7 8 9 */
716 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
717 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
718 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
719 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
720 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
721 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
722 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
724 };
725
726
727 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)728 getKey_2022(char c,int32_t* key,int32_t* offset){
729 int32_t togo;
730 int32_t low = 0;
731 int32_t hi = MAX_STATES_2022;
732 int32_t oldmid=0;
733
734 togo = normalize_esq_chars_2022[(uint8_t)c];
735 if(togo == 0) {
736 /* not a valid character anywhere in an escape sequence */
737 *key = 0;
738 *offset = 0;
739 return INVALID_2022;
740 }
741 togo = (*key << 5) + togo;
742
743 while (hi != low) /*binary search*/{
744
745 register int32_t mid = (hi+low) >> 1; /*Finds median*/
746
747 if (mid == oldmid)
748 break;
749
750 if (escSeqStateTable_Key_2022[mid] > togo){
751 hi = mid;
752 }
753 else if (escSeqStateTable_Key_2022[mid] < togo){
754 low = mid;
755 }
756 else /*we found it*/{
757 *key = togo;
758 *offset = mid;
759 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
760 }
761 oldmid = mid;
762
763 }
764
765 *key = 0;
766 *offset = 0;
767 return INVALID_2022;
768 }
769
770 /*runs through a state machine to determine the escape sequence - codepage correspondance
771 */
772 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)773 changeState_2022(UConverter* _this,
774 const char** source,
775 const char* sourceLimit,
776 Variant2022 var,
777 UErrorCode* err){
778 UCNV_TableStates_2022 value;
779 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
780 uint32_t key = myData2022->key;
781 int32_t offset = 0;
782 int8_t initialToULength = _this->toULength;
783 char c;
784
785 value = VALID_NON_TERMINAL_2022;
786 while (*source < sourceLimit) {
787 c = *(*source)++;
788 _this->toUBytes[_this->toULength++]=(uint8_t)c;
789 value = getKey_2022(c,(int32_t *) &key, &offset);
790
791 switch (value){
792
793 case VALID_NON_TERMINAL_2022 :
794 /* continue with the loop */
795 break;
796
797 case VALID_TERMINAL_2022:
798 key = 0;
799 goto DONE;
800
801 case INVALID_2022:
802 goto DONE;
803
804 case VALID_MAYBE_TERMINAL_2022:
805 #ifdef U_ENABLE_GENERIC_ISO_2022
806 /* ESC ( B is ambiguous only for ISO_2022 itself */
807 if(var == ISO_2022) {
808 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
809 _this->toULength = 0;
810
811 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
812
813 /* continue with the loop */
814 value = VALID_NON_TERMINAL_2022;
815 break;
816 } else
817 #endif
818 {
819 /* not ISO_2022 itself, finish here */
820 value = VALID_TERMINAL_2022;
821 key = 0;
822 goto DONE;
823 }
824 }
825 }
826
827 DONE:
828 myData2022->key = key;
829
830 if (value == VALID_NON_TERMINAL_2022) {
831 /* indicate that the escape sequence is incomplete: key!=0 */
832 return;
833 } else if (value == INVALID_2022 ) {
834 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
835 } else /* value == VALID_TERMINAL_2022 */ {
836 switch(var){
837 #ifdef U_ENABLE_GENERIC_ISO_2022
838 case ISO_2022:
839 {
840 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
841 if(chosenConverterName == NULL) {
842 /* SS2 or SS3 */
843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844 _this->toUCallbackReason = UCNV_UNASSIGNED;
845 return;
846 }
847
848 _this->mode = UCNV_SI;
849 ucnv_close(myData2022->currentConverter);
850 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
851 if(U_SUCCESS(*err)) {
852 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
853 _this->mode = UCNV_SO;
854 }
855 break;
856 }
857 #endif
858 case ISO_2022_JP:
859 {
860 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
861 switch(tempState) {
862 case INVALID_STATE:
863 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
864 break;
865 case SS2_STATE:
866 if(myData2022->toU2022State.cs[2]!=0) {
867 if(myData2022->toU2022State.g<2) {
868 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
869 }
870 myData2022->toU2022State.g=2;
871 } else {
872 /* illegal to have SS2 before a matching designator */
873 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
874 }
875 break;
876 /* case SS3_STATE: not used in ISO-2022-JP-x */
877 case ISO8859_1:
878 case ISO8859_7:
879 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
880 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
881 } else {
882 /* G2 charset for SS2 */
883 myData2022->toU2022State.cs[2]=(int8_t)tempState;
884 }
885 break;
886 default:
887 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
888 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
889 } else {
890 /* G0 charset */
891 myData2022->toU2022State.cs[0]=(int8_t)tempState;
892 }
893 break;
894 }
895 }
896 break;
897 case ISO_2022_CN:
898 {
899 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
900 switch(tempState) {
901 case INVALID_STATE:
902 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
903 break;
904 case SS2_STATE:
905 if(myData2022->toU2022State.cs[2]!=0) {
906 if(myData2022->toU2022State.g<2) {
907 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
908 }
909 myData2022->toU2022State.g=2;
910 } else {
911 /* illegal to have SS2 before a matching designator */
912 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
913 }
914 break;
915 case SS3_STATE:
916 if(myData2022->toU2022State.cs[3]!=0) {
917 if(myData2022->toU2022State.g<2) {
918 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
919 }
920 myData2022->toU2022State.g=3;
921 } else {
922 /* illegal to have SS3 before a matching designator */
923 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
924 }
925 break;
926 case ISO_IR_165:
927 if(myData2022->version==0) {
928 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
929 break;
930 }
931 /*fall through*/
932 case GB2312_1:
933 /*fall through*/
934 case CNS_11643_1:
935 myData2022->toU2022State.cs[1]=(int8_t)tempState;
936 break;
937 case CNS_11643_2:
938 myData2022->toU2022State.cs[2]=(int8_t)tempState;
939 break;
940 default:
941 /* other CNS 11643 planes */
942 if(myData2022->version==0) {
943 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
944 } else {
945 myData2022->toU2022State.cs[3]=(int8_t)tempState;
946 }
947 break;
948 }
949 }
950 break;
951 case ISO_2022_KR:
952 if(offset==0x30){
953 /* nothing to be done, just accept this one escape sequence */
954 } else {
955 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
956 }
957 break;
958
959 default:
960 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
961 break;
962 }
963 }
964 if(U_SUCCESS(*err)) {
965 _this->toULength = 0;
966 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
967 if(_this->toULength>1) {
968 /*
969 * Ticket 5691: consistent illegal sequences:
970 * - We include at least the first byte (ESC) in the illegal sequence.
971 * - If any of the non-initial bytes could be the start of a character,
972 * we stop the illegal sequence before the first one of those.
973 * In escape sequences, all following bytes are "printable", that is,
974 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
975 * they are valid single/lead bytes.
976 * For simplicity, we always only report the initial ESC byte as the
977 * illegal sequence and back out all other bytes we looked at.
978 */
979 /* Back out some bytes. */
980 int8_t backOutDistance=_this->toULength-1;
981 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
982 if(backOutDistance<=bytesFromThisBuffer) {
983 /* same as initialToULength<=1 */
984 *source-=backOutDistance;
985 } else {
986 /* Back out bytes from the previous buffer: Need to replay them. */
987 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
988 /* same as -(initialToULength-1) */
989 /* preToULength is negative! */
990 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
991 *source-=bytesFromThisBuffer;
992 }
993 _this->toULength=1;
994 }
995 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
996 _this->toUCallbackReason = UCNV_UNASSIGNED;
997 }
998 }
999
1000 /*Checks the characters of the buffer against valid 2022 escape sequences
1001 *if the match we return a pointer to the initial start of the sequence otherwise
1002 *we return sourceLimit
1003 */
1004 /*for 2022 looks ahead in the stream
1005 *to determine the longest possible convertible
1006 *data stream
1007 */
1008 static U_INLINE const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool flush)1009 getEndOfBuffer_2022(const char** source,
1010 const char* sourceLimit,
1011 UBool flush){
1012
1013 const char* mySource = *source;
1014
1015 #ifdef U_ENABLE_GENERIC_ISO_2022
1016 if (*source >= sourceLimit)
1017 return sourceLimit;
1018
1019 do{
1020
1021 if (*mySource == ESC_2022){
1022 int8_t i;
1023 int32_t key = 0;
1024 int32_t offset;
1025 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1026
1027 /* Kludge: I could not
1028 * figure out the reason for validating an escape sequence
1029 * twice - once here and once in changeState_2022().
1030 * is it possible to have an ESC character in a ISO2022
1031 * byte stream which is valid in a code page? Is it legal?
1032 */
1033 for (i=0;
1034 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1035 i++) {
1036 value = getKey_2022(*(mySource+i), &key, &offset);
1037 }
1038 if (value > 0 || *mySource==ESC_2022)
1039 return mySource;
1040
1041 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1042 return sourceLimit;
1043 }
1044 }while (++mySource < sourceLimit);
1045
1046 return sourceLimit;
1047 #else
1048 while(mySource < sourceLimit && *mySource != ESC_2022) {
1049 ++mySource;
1050 }
1051 return mySource;
1052 #endif
1053 }
1054
1055
1056 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1057 * any future change in _MBCSFromUChar32() function should be reflected here.
1058 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1059 */
1060 static U_INLINE int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1061 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1062 UChar32 c,
1063 uint32_t* value,
1064 UBool useFallback,
1065 int outputType)
1066 {
1067 const int32_t *cx;
1068 const uint16_t *table;
1069 uint32_t stage2Entry;
1070 uint32_t myValue;
1071 int32_t length;
1072 const uint8_t *p;
1073 /*
1074 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1075 * Use internal version of ucnv_open() that verifies that the new structures are available,
1076 * else U_INTERNAL_PROGRAM_ERROR.
1077 */
1078 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1079 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1080 table=sharedData->mbcs.fromUnicodeTable;
1081 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1082 /* get the bytes and the length for the output */
1083 if(outputType==MBCS_OUTPUT_2){
1084 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1085 if(myValue<=0xff) {
1086 length=1;
1087 } else {
1088 length=2;
1089 }
1090 } else /* outputType==MBCS_OUTPUT_3 */ {
1091 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1092 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1093 if(myValue<=0xff) {
1094 length=1;
1095 } else if(myValue<=0xffff) {
1096 length=2;
1097 } else {
1098 length=3;
1099 }
1100 }
1101 /* is this code point assigned, or do we use fallbacks? */
1102 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1103 /* assigned */
1104 *value=myValue;
1105 return length;
1106 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1107 /*
1108 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1109 * There is no way with this data structure for fallback output
1110 * to be a zero byte.
1111 */
1112 *value=myValue;
1113 return -length;
1114 }
1115 }
1116
1117 cx=sharedData->mbcs.extIndexes;
1118 if(cx!=NULL) {
1119 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1120 }
1121
1122 /* unassigned */
1123 return 0;
1124 }
1125
1126 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1127 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1128 * @param retval pointer to output byte
1129 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1130 */
1131 static U_INLINE int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1132 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1133 UChar32 c,
1134 uint32_t* retval,
1135 UBool useFallback)
1136 {
1137 const uint16_t *table;
1138 int32_t value;
1139 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1140 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1141 return 0;
1142 }
1143 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1144 table=sharedData->mbcs.fromUnicodeTable;
1145 /* get the byte for the output */
1146 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1147 /* is this code point assigned, or do we use fallbacks? */
1148 *retval=(uint32_t)(value&0xff);
1149 if(value>=0xf00) {
1150 return 1; /* roundtrip */
1151 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1152 return -1; /* fallback taken */
1153 } else {
1154 return 0; /* no mapping */
1155 }
1156 }
1157
/*
 * Accept only a strict EUC-style DBCS value: both the lead byte (bits 15..8)
 * and the trail byte (bits 7..0) must lie in A1..FE.
 * If so, subtract 0x80 from each byte to move the pair into the
 * ISO 2022 range 21..7E; otherwise return 0.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint8_t lead = (uint8_t)(value >> 8);
    const uint8_t trail = (uint8_t)value;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1174
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled (compiled out by the surrounding #if 0).
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to both bytes, then validate the shifted pair */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        /* out of range: hand back the input untouched */
        return value;
    }
}
#endif
1192
1193 #ifdef U_ENABLE_GENERIC_ISO_2022
1194
1195 /**********************************************************************************
1196 * ISO-2022 Converter
1197 *
1198 *
1199 */
1200
/*
 * toUnicode implementation for the generic ISO-2022 converter (only compiled
 * with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is exhausted or an early
 * return is taken:
 *  1. If no escape sequence is in progress (myData->key == 0), convert the
 *     bytes up to the next ESC (or the end of the buffer) by delegating to
 *     myData->currentConverter via ucnv_toUnicode().
 *  2. Call changeState_2022() to consume an escape sequence.
 *
 * @param args conversion arguments; args->converter is temporarily swapped
 *             to the sub-converter around the delegated call
 * @param err  in/out ICU error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter selected yet: open an ASCII one */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                /* restore the outer converter in args */
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /*
                 * NOTE(review): the third and fourth operands mix && and ||
                 * without explicit grouping; by C precedence this evaluates as
                 * (offsets!=NULL && (target or source moved)) ||
                 * (stopped before the real limit && sub-converter has pending bytes).
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a partial escape sequence */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1304
1305 #endif
1306
1307 /*
1308 * To Unicode Callback helper function
1309 */
1310 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1311 toUnicodeCallback(UConverter *cnv,
1312 const uint32_t sourceChar, const uint32_t targetUniChar,
1313 UErrorCode* err){
1314 if(sourceChar>0xff){
1315 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1316 cnv->toUBytes[1] = (uint8_t)sourceChar;
1317 cnv->toULength = 2;
1318 }
1319 else{
1320 cnv->toUBytes[0] =(char) sourceChar;
1321 cnv->toULength = 1;
1322 }
1323
1324 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1325 *err = U_INVALID_CHAR_FOUND;
1326 }
1327 else{
1328 *err = U_ILLEGAL_CHAR_FOUND;
1329 }
1330 }
1331
1332 /**************************************ISO-2022-JP*************************************************/
1333
1334 /************************************** IMPORTANT **************************************************
1335 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1336 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1337 * The converter iterates over each Unicode codepoint
1338 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1339 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1340 * would do as far as possible.
1341 *
1342 * If the implementation of these macros or structure of sharedData struct change in the future, make
1343 * sure that ISO-2022 is also changed.
1344 ***************************************************************************************************
1345 */
1346
1347 /***************************************************************************************************
1348 * Rules for ISO-2022-jp encoding
1349 * (i) Escape sequences must be fully contained within a line they should not
1350 * span new lines or CRs
1351 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1352 * JIS-Roman character escape sequence should follow before the line terminates
1353 * (iii) If the first character on the line is represented by two bytes then a two
1354 * byte character escape sequence should precede it
1355 * (iv) If no escape sequence is encountered then the characters are ASCII
1356 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1357 * and invoked with SS2 (ESC N).
1358 * (vi) If there is any G0 designation in text, there must be a switch to
1359 * ASCII or to JIS X 0201-Roman before a space character (but not
1360 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1361 * characters such as tab or CRLF.
 *      (vii) Supported encodings:
1363 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1364 *
1365 * source : RFC-1554
1366 *
1367 * JISX201, JISX208,JISX212 : new .cnv data files created
1368 * KSC5601 : alias to ibm-949 mapping table
1369 * GB2312 : alias to ibm-1386 mapping table
1370 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
 *  ISO-8859-7           : alias to ibm-9409 mapping table
1372 */
1373
/*
 * Preference order of JP charsets.
 * The fromUnicode code walks this list when adding the "other possible
 * charsets" to its choices[] array, so earlier entries are tried first.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1386
/*
 * Designation escape sequences, indexed by charset.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 * The number of bytes of each entry is kept separately in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/* Byte length of each escape sequence in escSeqChars[]; same index order. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1414
1415 /*
1416 * The iteration over various code pages works this way:
1417 * i) Get the currentState from myConverterData->currentState
1418 * ii) Check if the character is mapped to a valid character in the currentState
1419 * Yes -> a) set the initIterState to currentState
1420 * b) remain in this state until an invalid character is found
1421 * No -> a) go to the next code page and find the character
1422 * iii) Before changing the state increment the current state check if the current state
 *      is equal to the initIterState
1424 * Yes -> A character that cannot be represented in any of the supported encodings
1425 * break and return a U_INVALID_CHARACTER error
1426 * No -> Continue and find the character in next code page
1427 *
1428 *
1429 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1430 */
1431
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1445
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * U+00A5 and U+203E take the places of 5C and 7E, so U+005C and U+007E
 * themselves have no mapping. Return U+FFFE if unmappable.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1460
/*
 * Convert a valid Shift-JIS byte pair to the corresponding pair of
 * JIS X 0208 bytes in 21..7E.
 * Values above 0xEFFC are beyond JIS X 0208 and yield 0.
 *
 * Each Shift-JIS lead byte covers two JIS rows: trail bytes up to 9E belong
 * to the first (odd) row, trail bytes from 9F on to the second (even) row.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    const uint8_t lo = (uint8_t)value;  /* Shift-JIS trail byte */
    uint32_t hi;                        /* becomes the JIS lead byte in bits 15..8 */

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    /* collapse the two lead byte ranges 81..9F and E0..EF, then double */
    hi = value & 0xff00;
    hi -= (hi <= 0x9f00) ? 0x7000 : 0xb000;
    hi <<= 1;

    if(lo > 0x9e) {
        /* second row of the pair; trail <= 0xfc */
        return hi | (uint32_t)(lo - 0x7e);
    }

    /* first row of the pair; Shift-JIS skips 7F in the trail byte */
    hi -= 0x100;
    if(lo <= 0x7e) {
        return hi | (uint32_t)(lo - 0x1f);
    }
    return hi | (uint32_t)(lo - 0x20);
}
1496
/*
 * Convert a pair of JIS X 0208 bytes (c1 lead, c2 trail, each expected in
 * 21..7E) to Shift-JIS and store the two result bytes in bytes[0..1].
 *
 * A byte that cannot be converted is replaced by 0, which is not a valid
 * Shift-JIS byte in that position, so that the converter catches it.
 * Some invalid inputs already turn into equally invalid Shift-JIS bytes
 * and are therefore not tested explicitly.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail = c2;

    if((lead & 1) == 0) {
        /* even row: second half of the Shift-JIS trail byte range */
        trail = ((uint8_t)(trail - 0x21) <= (0x7e - 0x21)) ? (uint8_t)(trail + 0x7e) : 0;
    } else {
        /* odd row: first half, with a gap at 7F */
        ++lead;
        if(trail <= 0x5f) {
            trail += 0x1f;
        } else if(trail <= 0x7e) {
            trail += 0x20;
        } else {
            trail = 0;  /* invalid */
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead += 0x70;
    } else if(lead <= 0x3f) {
        lead += 0xb0;
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1533
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START); each entry is the
 * two-byte JIS X 0208 value (bytes in 21..7E) used as the fallback.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1606
1607 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1608 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1609 UConverter *cnv = args->converter;
1610 UConverterDataISO2022 *converterData;
1611 ISO2022State *pFromU2022State;
1612 uint8_t *target = (uint8_t *) args->target;
1613 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1614 const UChar* source = args->source;
1615 const UChar* sourceLimit = args->sourceLimit;
1616 int32_t* offsets = args->offsets;
1617 UChar32 sourceChar;
1618 char buffer[8];
1619 int32_t len, outLen;
1620 int8_t choices[10];
1621 int32_t choiceCount;
1622 uint32_t targetValue = 0;
1623 UBool useFallback;
1624
1625 int32_t i;
1626 int8_t cs, g;
1627
1628 /* set up the state */
1629 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1630 pFromU2022State = &converterData->fromU2022State;
1631
1632 choiceCount = 0;
1633
1634 /* check if the last codepoint of previous buffer was a lead surrogate*/
1635 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1636 goto getTrail;
1637 }
1638
1639 while(source < sourceLimit) {
1640 if(target < targetLimit) {
1641
1642 sourceChar = *(source++);
1643 /*check if the char is a First surrogate*/
1644 if(UTF_IS_SURROGATE(sourceChar)) {
1645 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1646 getTrail:
1647 /*look ahead to find the trail surrogate*/
1648 if(source < sourceLimit) {
1649 /* test the following code unit */
1650 UChar trail=(UChar) *source;
1651 if(UTF_IS_SECOND_SURROGATE(trail)) {
1652 source++;
1653 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1654 cnv->fromUChar32=0x00;
1655 /* convert this supplementary code point */
1656 /* exit this condition tree */
1657 } else {
1658 /* this is an unmatched lead code unit (1st surrogate) */
1659 /* callback(illegal) */
1660 *err=U_ILLEGAL_CHAR_FOUND;
1661 cnv->fromUChar32=sourceChar;
1662 break;
1663 }
1664 } else {
1665 /* no more input */
1666 cnv->fromUChar32=sourceChar;
1667 break;
1668 }
1669 } else {
1670 /* this is an unmatched trail code unit (2nd surrogate) */
1671 /* callback(illegal) */
1672 *err=U_ILLEGAL_CHAR_FOUND;
1673 cnv->fromUChar32=sourceChar;
1674 break;
1675 }
1676 }
1677
1678 /* do not convert SO/SI/ESC */
1679 if(IS_2022_CONTROL(sourceChar)) {
1680 /* callback(illegal) */
1681 *err=U_ILLEGAL_CHAR_FOUND;
1682 cnv->fromUChar32=sourceChar;
1683 break;
1684 }
1685
1686 /* do the conversion */
1687
1688 if(choiceCount == 0) {
1689 uint16_t csm;
1690
1691 /*
1692 * The csm variable keeps track of which charsets are allowed
1693 * and not used yet while building the choices[].
1694 */
1695 csm = jpCharsetMasks[converterData->version];
1696 choiceCount = 0;
1697
1698 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1699 if(converterData->version == 3 || converterData->version == 4) {
1700 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1701 }
1702 /* Do not try single-byte half-width Katakana for other versions. */
1703 csm &= ~CSM(HWKANA_7BIT);
1704
1705 /* try the current G0 charset */
1706 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1707 csm &= ~CSM(cs);
1708
1709 /* try the current G2 charset */
1710 if((cs = pFromU2022State->cs[2]) != 0) {
1711 choices[choiceCount++] = cs;
1712 csm &= ~CSM(cs);
1713 }
1714
1715 /* try all the other possible charsets */
1716 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1717 cs = (int8_t)jpCharsetPref[i];
1718 if(CSM(cs) & csm) {
1719 choices[choiceCount++] = cs;
1720 csm &= ~CSM(cs);
1721 }
1722 }
1723 }
1724
1725 cs = g = 0;
1726 /*
1727 * len==0: no mapping found yet
1728 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1729 * len>0: found a roundtrip result, done
1730 */
1731 len = 0;
1732 /*
1733 * We will turn off useFallback after finding a fallback,
1734 * but we still get fallbacks from PUA code points as usual.
1735 * Therefore, we will also need to check that we don't overwrite
1736 * an early fallback with a later one.
1737 */
1738 useFallback = cnv->useFallback;
1739
1740 for(i = 0; i < choiceCount && len <= 0; ++i) {
1741 uint32_t value;
1742 int32_t len2;
1743 int8_t cs0 = choices[i];
1744 switch(cs0) {
1745 case ASCII:
1746 if(sourceChar <= 0x7f) {
1747 targetValue = (uint32_t)sourceChar;
1748 len = 1;
1749 cs = cs0;
1750 g = 0;
1751 }
1752 break;
1753 case ISO8859_1:
1754 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1755 targetValue = (uint32_t)sourceChar - 0x80;
1756 len = 1;
1757 cs = cs0;
1758 g = 2;
1759 }
1760 break;
1761 case HWKANA_7BIT:
1762 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1763 if(converterData->version==3) {
1764 /* JIS7: use G1 (SO) */
1765 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1766 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1767 len = 1;
1768 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1769 g = 1;
1770 } else if(converterData->version==4) {
1771 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1772 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1773 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1774 len = 1;
1775
1776 cs = pFromU2022State->cs[0];
1777 if(IS_JP_DBCS(cs)) {
1778 /* switch from a DBCS charset to JISX201 */
1779 cs = (int8_t)JISX201;
1780 }
1781 /* else stay in the current G0 charset */
1782 g = 0;
1783 }
1784 /* else do not use HWKANA_7BIT with other versions */
1785 }
1786 break;
1787 case JISX201:
1788 /* G0 SBCS */
1789 value = jisx201FromU(sourceChar);
1790 if(value <= 0x7f) {
1791 targetValue = value;
1792 len = 1;
1793 cs = cs0;
1794 g = 0;
1795 useFallback = FALSE;
1796 }
1797 break;
1798 case JISX208:
1799 /* G0 DBCS from Shift-JIS table */
1800 len2 = MBCS_FROM_UCHAR32_ISO2022(
1801 converterData->myConverterArray[cs0],
1802 sourceChar, &value,
1803 useFallback, MBCS_OUTPUT_2);
1804 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1805 value = _2022FromSJIS(value);
1806 if(value != 0) {
1807 targetValue = value;
1808 len = len2;
1809 cs = cs0;
1810 g = 0;
1811 useFallback = FALSE;
1812 }
1813 } else if(len == 0 && useFallback &&
1814 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1815 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1816 len = -2;
1817 cs = cs0;
1818 g = 0;
1819 useFallback = FALSE;
1820 }
1821 break;
1822 case ISO8859_7:
1823 /* G0 SBCS forced to 7-bit output */
1824 len2 = MBCS_SINGLE_FROM_UCHAR32(
1825 converterData->myConverterArray[cs0],
1826 sourceChar, &value,
1827 useFallback);
1828 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1829 targetValue = value - 0x80;
1830 len = len2;
1831 cs = cs0;
1832 g = 2;
1833 useFallback = FALSE;
1834 }
1835 break;
1836 default:
1837 /* G0 DBCS */
1838 len2 = MBCS_FROM_UCHAR32_ISO2022(
1839 converterData->myConverterArray[cs0],
1840 sourceChar, &value,
1841 useFallback, MBCS_OUTPUT_2);
1842 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1843 if(cs0 == KSC5601) {
1844 /*
1845 * Check for valid bytes for the encoding scheme.
1846 * This is necessary because the sub-converter (windows-949)
1847 * has a broader encoding scheme than is valid for 2022.
1848 */
1849 value = _2022FromGR94DBCS(value);
1850 if(value == 0) {
1851 break;
1852 }
1853 }
1854 targetValue = value;
1855 len = len2;
1856 cs = cs0;
1857 g = 0;
1858 useFallback = FALSE;
1859 }
1860 break;
1861 }
1862 }
1863
1864 if(len != 0) {
1865 if(len < 0) {
1866 len = -len; /* fallback */
1867 }
1868 outLen = 0; /* count output bytes */
1869
1870 /* write SI if necessary (only for JIS7) */
1871 if(pFromU2022State->g == 1 && g == 0) {
1872 buffer[outLen++] = UCNV_SI;
1873 pFromU2022State->g = 0;
1874 }
1875
1876 /* write the designation sequence if necessary */
1877 if(cs != pFromU2022State->cs[g]) {
1878 int32_t escLen = escSeqCharsLen[cs];
1879 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1880 outLen += escLen;
1881 pFromU2022State->cs[g] = cs;
1882
1883 /* invalidate the choices[] */
1884 choiceCount = 0;
1885 }
1886
1887 /* write the shift sequence if necessary */
1888 if(g != pFromU2022State->g) {
1889 switch(g) {
1890 /* case 0 handled before writing escapes */
1891 case 1:
1892 buffer[outLen++] = UCNV_SO;
1893 pFromU2022State->g = 1;
1894 break;
1895 default: /* case 2 */
1896 buffer[outLen++] = 0x1b;
1897 buffer[outLen++] = 0x4e;
1898 break;
1899 /* no case 3: no SS3 in ISO-2022-JP-x */
1900 }
1901 }
1902
1903 /* write the output bytes */
1904 if(len == 1) {
1905 buffer[outLen++] = (char)targetValue;
1906 } else /* len == 2 */ {
1907 buffer[outLen++] = (char)(targetValue >> 8);
1908 buffer[outLen++] = (char)targetValue;
1909 }
1910 } else {
1911 /*
1912 * if we cannot find the character after checking all codepages
1913 * then this is an error
1914 */
1915 *err = U_INVALID_CHAR_FOUND;
1916 cnv->fromUChar32=sourceChar;
1917 break;
1918 }
1919
1920 if(sourceChar == CR || sourceChar == LF) {
1921 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1922 pFromU2022State->cs[2] = 0;
1923 choiceCount = 0;
1924 }
1925
1926 /* output outLen>0 bytes in buffer[] */
1927 if(outLen == 1) {
1928 *target++ = buffer[0];
1929 if(offsets) {
1930 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1931 }
1932 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1933 *target++ = buffer[0];
1934 *target++ = buffer[1];
1935 if(offsets) {
1936 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1937 *offsets++ = sourceIndex;
1938 *offsets++ = sourceIndex;
1939 }
1940 } else {
1941 fromUWriteUInt8(
1942 cnv,
1943 buffer, outLen,
1944 &target, (const char *)targetLimit,
1945 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1946 err);
1947 if(U_FAILURE(*err)) {
1948 break;
1949 }
1950 }
1951 } /* end if(myTargetIndex<myTargetLength) */
1952 else{
1953 *err =U_BUFFER_OVERFLOW_ERROR;
1954 break;
1955 }
1956
1957 }/* end while(mySourceIndex<mySourceLength) */
1958
1959 /*
1960 * the end of the input stream and detection of truncated input
1961 * are handled by the framework, but for ISO-2022-JP conversion
1962 * we need to be in ASCII mode at the very end
1963 *
1964 * conditions:
1965 * successful
1966 * in SO mode or not in ASCII mode
1967 * end of input and no truncated input
1968 */
1969 if( U_SUCCESS(*err) &&
1970 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1971 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1972 ) {
1973 int32_t sourceIndex;
1974
1975 outLen = 0;
1976
1977 if(pFromU2022State->g != 0) {
1978 buffer[outLen++] = UCNV_SI;
1979 pFromU2022State->g = 0;
1980 }
1981
1982 if(pFromU2022State->cs[0] != ASCII) {
1983 int32_t escLen = escSeqCharsLen[ASCII];
1984 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1985 outLen += escLen;
1986 pFromU2022State->cs[0] = (int8_t)ASCII;
1987 }
1988
1989 /* get the source index of the last input character */
1990 /*
1991 * TODO this would be simpler and more reliable if we used a pair
1992 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1993 * so that we could simply use the prevSourceIndex here;
1994 * this code gives an incorrect result for the rare case of an unmatched
1995 * trail surrogate that is alone in the last buffer of the text stream
1996 */
1997 sourceIndex=(int32_t)(source-args->source);
1998 if(sourceIndex>0) {
1999 --sourceIndex;
2000 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2001 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2002 ) {
2003 --sourceIndex;
2004 }
2005 } else {
2006 sourceIndex=-1;
2007 }
2008
2009 fromUWriteUInt8(
2010 cnv,
2011 buffer, outLen,
2012 &target, (const char *)targetLimit,
2013 &offsets, sourceIndex,
2014 err);
2015 }
2016
2017 /*save the state and return */
2018 args->source = source;
2019 args->target = (char*)target;
2020 }
2021
2022 /*************** to unicode *******************/
2023
2024 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2025 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2026 UErrorCode* err){
2027 char tempBuf[2];
2028 const char *mySource = (char *) args->source;
2029 UChar *myTarget = args->target;
2030 const char *mySourceLimit = args->sourceLimit;
2031 uint32_t targetUniChar = 0x0000;
2032 uint32_t mySourceChar = 0x0000;
2033 uint32_t tmpSourceChar = 0x0000;
2034 UConverterDataISO2022* myData;
2035 ISO2022State *pToU2022State;
2036 StateEnum cs;
2037
2038 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2039 pToU2022State = &myData->toU2022State;
2040
2041 if(myData->key != 0) {
2042 /* continue with a partial escape sequence */
2043 goto escape;
2044 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2045 /* continue with a partial double-byte character */
2046 mySourceChar = args->converter->toUBytes[0];
2047 args->converter->toULength = 0;
2048 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2049 targetUniChar = missingCharMarker;
2050 goto getTrailByte;
2051 }
2052
2053 while(mySource < mySourceLimit){
2054
2055 targetUniChar =missingCharMarker;
2056
2057 if(myTarget < args->targetLimit){
2058
2059 mySourceChar= (unsigned char) *mySource++;
2060
2061 switch(mySourceChar) {
2062 case UCNV_SI:
2063 if(myData->version==3) {
2064 pToU2022State->g=0;
2065 continue;
2066 } else {
2067 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2068 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2069 break;
2070 }
2071
2072 case UCNV_SO:
2073 if(myData->version==3) {
2074 /* JIS7: switch to G1 half-width Katakana */
2075 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2076 pToU2022State->g=1;
2077 continue;
2078 } else {
2079 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2080 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2081 break;
2082 }
2083
2084 case ESC_2022:
2085 mySource--;
2086 escape:
2087 {
2088 const char * mySourceBefore = mySource;
2089 int8_t toULengthBefore = args->converter->toULength;
2090
2091 changeState_2022(args->converter,&(mySource),
2092 mySourceLimit, ISO_2022_JP,err);
2093
2094 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2095 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2096 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2097 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2098 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2099 }
2100 }
2101
2102 /* invalid or illegal escape sequence */
2103 if(U_FAILURE(*err)){
2104 args->target = myTarget;
2105 args->source = mySource;
2106 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2107 return;
2108 }
2109 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2110 if(myData->key==0) {
2111 myData->isEmptySegment = TRUE;
2112 }
2113 continue;
2114
2115 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2116
2117 case CR:
2118 /*falls through*/
2119 case LF:
2120 /* automatically reset to single-byte mode */
2121 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2122 pToU2022State->cs[0] = (int8_t)ASCII;
2123 }
2124 pToU2022State->cs[2] = 0;
2125 pToU2022State->g = 0;
2126 /* falls through */
2127 default:
2128 /* convert one or two bytes */
2129 myData->isEmptySegment = FALSE;
2130 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2131 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2132 !IS_JP_DBCS(cs)
2133 ) {
2134 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2135 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2136
2137 /* return from a single-shift state to the previous one */
2138 if(pToU2022State->g >= 2) {
2139 pToU2022State->g=pToU2022State->prevG;
2140 }
2141 } else switch(cs) {
2142 case ASCII:
2143 if(mySourceChar <= 0x7f) {
2144 targetUniChar = mySourceChar;
2145 }
2146 break;
2147 case ISO8859_1:
2148 if(mySourceChar <= 0x7f) {
2149 targetUniChar = mySourceChar + 0x80;
2150 }
2151 /* return from a single-shift state to the previous one */
2152 pToU2022State->g=pToU2022State->prevG;
2153 break;
2154 case ISO8859_7:
2155 if(mySourceChar <= 0x7f) {
2156 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2157 targetUniChar =
2158 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2159 myData->myConverterArray[cs],
2160 mySourceChar + 0x80);
2161 }
2162 /* return from a single-shift state to the previous one */
2163 pToU2022State->g=pToU2022State->prevG;
2164 break;
2165 case JISX201:
2166 if(mySourceChar <= 0x7f) {
2167 targetUniChar = jisx201ToU(mySourceChar);
2168 }
2169 break;
2170 case HWKANA_7BIT:
2171 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2172 /* 7-bit halfwidth Katakana */
2173 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2174 }
2175 break;
2176 default:
2177 /* G0 DBCS */
2178 if(mySource < mySourceLimit) {
2179 int leadIsOk, trailIsOk;
2180 uint8_t trailByte;
2181 getTrailByte:
2182 trailByte = (uint8_t)*mySource;
2183 /*
2184 * Ticket 5691: consistent illegal sequences:
2185 * - We include at least the first byte in the illegal sequence.
2186 * - If any of the non-initial bytes could be the start of a character,
2187 * we stop the illegal sequence before the first one of those.
2188 *
2189 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2190 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2191 * Otherwise we convert or report the pair of bytes.
2192 */
2193 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2194 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2195 if (leadIsOk && trailIsOk) {
2196 ++mySource;
2197 tmpSourceChar = (mySourceChar << 8) | trailByte;
2198 if(cs == JISX208) {
2199 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2200 mySourceChar = tmpSourceChar;
2201 } else {
2202 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2203 mySourceChar = tmpSourceChar;
2204 if (cs == KSC5601) {
2205 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2206 }
2207 tempBuf[0] = (char)(tmpSourceChar >> 8);
2208 tempBuf[1] = (char)(tmpSourceChar);
2209 }
2210 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2211 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2212 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2213 ++mySource;
2214 /* add another bit so that the code below writes 2 bytes in case of error */
2215 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2216 }
2217 } else {
2218 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2219 args->converter->toULength = 1;
2220 goto endloop;
2221 }
2222 } /* End of inner switch */
2223 break;
2224 } /* End of outer switch */
2225 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2226 if(args->offsets){
2227 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2228 }
2229 *(myTarget++)=(UChar)targetUniChar;
2230 }
2231 else if(targetUniChar > missingCharMarker){
2232 /* disassemble the surrogate pair and write to output*/
2233 targetUniChar-=0x0010000;
2234 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2235 if(args->offsets){
2236 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2237 }
2238 ++myTarget;
2239 if(myTarget< args->targetLimit){
2240 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2241 if(args->offsets){
2242 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2243 }
2244 ++myTarget;
2245 }else{
2246 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2247 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2248 }
2249
2250 }
2251 else{
2252 /* Call the callback function*/
2253 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2254 break;
2255 }
2256 }
2257 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2258 *err =U_BUFFER_OVERFLOW_ERROR;
2259 break;
2260 }
2261 }
2262 endloop:
2263 args->target = myTarget;
2264 args->source = mySource;
2265 }
2266
2267
2268 /***************************************************************
2269 * Rules for ISO-2022-KR encoding
2270 * i) The KSC5601 designator sequence should appear only once in a file,
2271 * at the beginning of a line before any KSC5601 characters. This usually
2272 * means that it appears by itself on the first line of the file
2273 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2274 * and SI to shift into single byte mode
2275 */
2276 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2277 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2278
2279 UConverter* saveConv = args->converter;
2280 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2281 args->converter=myConverterData->currentConverter;
2282
2283 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2284 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2285 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2286
2287 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2288 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2289 uprv_memcpy(
2290 saveConv->charErrorBuffer,
2291 myConverterData->currentConverter->charErrorBuffer,
2292 myConverterData->currentConverter->charErrorBufferLength);
2293 }
2294 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2295 myConverterData->currentConverter->charErrorBufferLength = 0;
2296 }
2297 args->converter=saveConv;
2298 }
2299
2300 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2301 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2302
2303 const UChar *source = args->source;
2304 const UChar *sourceLimit = args->sourceLimit;
2305 unsigned char *target = (unsigned char *) args->target;
2306 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2307 int32_t* offsets = args->offsets;
2308 uint32_t targetByteUnit = 0x0000;
2309 UChar32 sourceChar = 0x0000;
2310 UBool isTargetByteDBCS;
2311 UBool oldIsTargetByteDBCS;
2312 UConverterDataISO2022 *converterData;
2313 UConverterSharedData* sharedData;
2314 UBool useFallback;
2315 int32_t length =0;
2316
2317 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2318 /* if the version is 1 then the user is requesting
2319 * conversion with ibm-25546 pass the arguments to
2320 * MBCS converter and return
2321 */
2322 if(converterData->version==1){
2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2324 return;
2325 }
2326
2327 /* initialize data */
2328 sharedData = converterData->currentConverter->sharedData;
2329 useFallback = args->converter->useFallback;
2330 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2331 oldIsTargetByteDBCS = isTargetByteDBCS;
2332
2333 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2334 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2335 goto getTrail;
2336 }
2337 while(source < sourceLimit){
2338
2339 targetByteUnit = missingCharMarker;
2340
2341 if(target < (unsigned char*) args->targetLimit){
2342 sourceChar = *source++;
2343
2344 /* do not convert SO/SI/ESC */
2345 if(IS_2022_CONTROL(sourceChar)) {
2346 /* callback(illegal) */
2347 *err=U_ILLEGAL_CHAR_FOUND;
2348 args->converter->fromUChar32=sourceChar;
2349 break;
2350 }
2351
2352 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2353 if(length < 0) {
2354 length = -length; /* fallback */
2355 }
2356 /* only DBCS or SBCS characters are expected*/
2357 /* DB characters with high bit set to 1 are expected */
2358 if( length > 2 || length==0 ||
2359 (length == 1 && targetByteUnit > 0x7f) ||
2360 (length == 2 &&
2361 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2362 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2363 ) {
2364 targetByteUnit=missingCharMarker;
2365 }
2366 if (targetByteUnit != missingCharMarker){
2367
2368 oldIsTargetByteDBCS = isTargetByteDBCS;
2369 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2370 /* append the shift sequence */
2371 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2372
2373 if (isTargetByteDBCS)
2374 *target++ = UCNV_SO;
2375 else
2376 *target++ = UCNV_SI;
2377 if(offsets)
2378 *(offsets++) = (int32_t)(source - args->source-1);
2379 }
2380 /* write the targetUniChar to target */
2381 if(targetByteUnit <= 0x00FF){
2382 if( target < targetLimit){
2383 *(target++) = (unsigned char) targetByteUnit;
2384 if(offsets){
2385 *(offsets++) = (int32_t)(source - args->source-1);
2386 }
2387
2388 }else{
2389 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2390 *err = U_BUFFER_OVERFLOW_ERROR;
2391 }
2392 }else{
2393 if(target < targetLimit){
2394 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2395 if(offsets){
2396 *(offsets++) = (int32_t)(source - args->source-1);
2397 }
2398 if(target < targetLimit){
2399 *(target++) =(unsigned char) (targetByteUnit -0x80);
2400 if(offsets){
2401 *(offsets++) = (int32_t)(source - args->source-1);
2402 }
2403 }else{
2404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2405 *err = U_BUFFER_OVERFLOW_ERROR;
2406 }
2407 }else{
2408 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2409 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2410 *err = U_BUFFER_OVERFLOW_ERROR;
2411 }
2412 }
2413
2414 }
2415 else{
2416 /* oops.. the code point is unassingned
2417 * set the error and reason
2418 */
2419
2420 /*check if the char is a First surrogate*/
2421 if(UTF_IS_SURROGATE(sourceChar)) {
2422 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2423 getTrail:
2424 /*look ahead to find the trail surrogate*/
2425 if(source < sourceLimit) {
2426 /* test the following code unit */
2427 UChar trail=(UChar) *source;
2428 if(UTF_IS_SECOND_SURROGATE(trail)) {
2429 source++;
2430 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2431 *err = U_INVALID_CHAR_FOUND;
2432 /* convert this surrogate code point */
2433 /* exit this condition tree */
2434 } else {
2435 /* this is an unmatched lead code unit (1st surrogate) */
2436 /* callback(illegal) */
2437 *err=U_ILLEGAL_CHAR_FOUND;
2438 }
2439 } else {
2440 /* no more input */
2441 *err = U_ZERO_ERROR;
2442 }
2443 } else {
2444 /* this is an unmatched trail code unit (2nd surrogate) */
2445 /* callback(illegal) */
2446 *err=U_ILLEGAL_CHAR_FOUND;
2447 }
2448 } else {
2449 /* callback(unassigned) for a BMP code point */
2450 *err = U_INVALID_CHAR_FOUND;
2451 }
2452
2453 args->converter->fromUChar32=sourceChar;
2454 break;
2455 }
2456 } /* end if(myTargetIndex<myTargetLength) */
2457 else{
2458 *err =U_BUFFER_OVERFLOW_ERROR;
2459 break;
2460 }
2461
2462 }/* end while(mySourceIndex<mySourceLength) */
2463
2464 /*
2465 * the end of the input stream and detection of truncated input
2466 * are handled by the framework, but for ISO-2022-KR conversion
2467 * we need to be in ASCII mode at the very end
2468 *
2469 * conditions:
2470 * successful
2471 * not in ASCII mode
2472 * end of input and no truncated input
2473 */
2474 if( U_SUCCESS(*err) &&
2475 isTargetByteDBCS &&
2476 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2477 ) {
2478 int32_t sourceIndex;
2479
2480 /* we are switching to ASCII */
2481 isTargetByteDBCS=FALSE;
2482
2483 /* get the source index of the last input character */
2484 /*
2485 * TODO this would be simpler and more reliable if we used a pair
2486 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2487 * so that we could simply use the prevSourceIndex here;
2488 * this code gives an incorrect result for the rare case of an unmatched
2489 * trail surrogate that is alone in the last buffer of the text stream
2490 */
2491 sourceIndex=(int32_t)(source-args->source);
2492 if(sourceIndex>0) {
2493 --sourceIndex;
2494 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2495 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2496 ) {
2497 --sourceIndex;
2498 }
2499 } else {
2500 sourceIndex=-1;
2501 }
2502
2503 fromUWriteUInt8(
2504 args->converter,
2505 SHIFT_IN_STR, 1,
2506 &target, (const char *)targetLimit,
2507 &offsets, sourceIndex,
2508 err);
2509 }
2510
2511 /*save the state and return */
2512 args->source = source;
2513 args->target = (char*)target;
2514 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2515 }
2516
2517 /************************ To Unicode ***************************************/
2518
2519 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2520 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2521 UErrorCode* err){
2522 char const* sourceStart;
2523 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2524
2525 UConverterToUnicodeArgs subArgs;
2526 int32_t minArgsSize;
2527
2528 /* set up the subconverter arguments */
2529 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2530 minArgsSize = args->size;
2531 } else {
2532 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2533 }
2534
2535 uprv_memcpy(&subArgs, args, minArgsSize);
2536 subArgs.size = (uint16_t)minArgsSize;
2537 subArgs.converter = myData->currentConverter;
2538
2539 /* remember the original start of the input for offsets */
2540 sourceStart = args->source;
2541
2542 if(myData->key != 0) {
2543 /* continue with a partial escape sequence */
2544 goto escape;
2545 }
2546
2547 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2548 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2549 subArgs.source = args->source;
2550 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2551 if(subArgs.source != subArgs.sourceLimit) {
2552 /*
2553 * get the current partial byte sequence
2554 *
2555 * it needs to be moved between the public and the subconverter
2556 * so that the conversion framework, which only sees the public
2557 * converter, can handle truncated and illegal input etc.
2558 */
2559 if(args->converter->toULength > 0) {
2560 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2561 }
2562 subArgs.converter->toULength = args->converter->toULength;
2563
2564 /*
2565 * Convert up to the end of the input, or to before the next escape character.
2566 * Does not handle conversion extensions because the preToU[] state etc.
2567 * is not copied.
2568 */
2569 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2570
2571 if(args->offsets != NULL && sourceStart != args->source) {
2572 /* update offsets to base them on the actual start of the input */
2573 int32_t *offsets = args->offsets;
2574 UChar *target = args->target;
2575 int32_t delta = (int32_t)(args->source - sourceStart);
2576 while(target < subArgs.target) {
2577 if(*offsets >= 0) {
2578 *offsets += delta;
2579 }
2580 ++offsets;
2581 ++target;
2582 }
2583 }
2584 args->source = subArgs.source;
2585 args->target = subArgs.target;
2586 args->offsets = subArgs.offsets;
2587
2588 /* copy input/error/overflow buffers */
2589 if(subArgs.converter->toULength > 0) {
2590 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2591 }
2592 args->converter->toULength = subArgs.converter->toULength;
2593
2594 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2595 if(subArgs.converter->UCharErrorBufferLength > 0) {
2596 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2597 subArgs.converter->UCharErrorBufferLength);
2598 }
2599 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2600 subArgs.converter->UCharErrorBufferLength = 0;
2601 }
2602 }
2603
2604 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2605 return;
2606 }
2607
2608 escape:
2609 changeState_2022(args->converter,
2610 &(args->source),
2611 args->sourceLimit,
2612 ISO_2022_KR,
2613 err);
2614 }
2615 }
2616
2617 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2618 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2619 UErrorCode* err){
2620 char tempBuf[2];
2621 const char *mySource = ( char *) args->source;
2622 UChar *myTarget = args->target;
2623 const char *mySourceLimit = args->sourceLimit;
2624 UChar32 targetUniChar = 0x0000;
2625 UChar mySourceChar = 0x0000;
2626 UConverterDataISO2022* myData;
2627 UConverterSharedData* sharedData ;
2628 UBool useFallback;
2629
2630 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2631 if(myData->version==1){
2632 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2633 return;
2634 }
2635
2636 /* initialize state */
2637 sharedData = myData->currentConverter->sharedData;
2638 useFallback = args->converter->useFallback;
2639
2640 if(myData->key != 0) {
2641 /* continue with a partial escape sequence */
2642 goto escape;
2643 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2644 /* continue with a partial double-byte character */
2645 mySourceChar = args->converter->toUBytes[0];
2646 args->converter->toULength = 0;
2647 goto getTrailByte;
2648 }
2649
2650 while(mySource< mySourceLimit){
2651
2652 if(myTarget < args->targetLimit){
2653
2654 mySourceChar= (unsigned char) *mySource++;
2655
2656 if(mySourceChar==UCNV_SI){
2657 myData->toU2022State.g = 0;
2658 if (myData->isEmptySegment) {
2659 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2660 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2661 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2662 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2663 args->converter->toULength = 1;
2664 args->target = myTarget;
2665 args->source = mySource;
2666 return;
2667 }
2668 /*consume the source */
2669 continue;
2670 }else if(mySourceChar==UCNV_SO){
2671 myData->toU2022State.g = 1;
2672 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2673 /*consume the source */
2674 continue;
2675 }else if(mySourceChar==ESC_2022){
2676 mySource--;
2677 escape:
2678 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2679 changeState_2022(args->converter,&(mySource),
2680 mySourceLimit, ISO_2022_KR, err);
2681 if(U_FAILURE(*err)){
2682 args->target = myTarget;
2683 args->source = mySource;
2684 return;
2685 }
2686 continue;
2687 }
2688
2689 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2690 if(myData->toU2022State.g == 1) {
2691 if(mySource < mySourceLimit) {
2692 int leadIsOk, trailIsOk;
2693 uint8_t trailByte;
2694 getTrailByte:
2695 targetUniChar = missingCharMarker;
2696 trailByte = (uint8_t)*mySource;
2697 /*
2698 * Ticket 5691: consistent illegal sequences:
2699 * - We include at least the first byte in the illegal sequence.
2700 * - If any of the non-initial bytes could be the start of a character,
2701 * we stop the illegal sequence before the first one of those.
2702 *
2703 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2704 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2705 * Otherwise we convert or report the pair of bytes.
2706 */
2707 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2708 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2709 if (leadIsOk && trailIsOk) {
2710 ++mySource;
2711 tempBuf[0] = (char)(mySourceChar + 0x80);
2712 tempBuf[1] = (char)(trailByte + 0x80);
2713 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2714 mySourceChar = (mySourceChar << 8) | trailByte;
2715 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2716 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2717 ++mySource;
2718 /* add another bit so that the code below writes 2 bytes in case of error */
2719 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2720 }
2721 } else {
2722 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2723 args->converter->toULength = 1;
2724 break;
2725 }
2726 }
2727 else if(mySourceChar <= 0x7f) {
2728 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2729 } else {
2730 targetUniChar = 0xffff;
2731 }
2732 if(targetUniChar < 0xfffe){
2733 if(args->offsets) {
2734 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2735 }
2736 *(myTarget++)=(UChar)targetUniChar;
2737 }
2738 else {
2739 /* Call the callback function*/
2740 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2741 break;
2742 }
2743 }
2744 else{
2745 *err =U_BUFFER_OVERFLOW_ERROR;
2746 break;
2747 }
2748 }
2749 args->target = myTarget;
2750 args->source = mySource;
2751 }
2752
2753 /*************************** END ISO2022-KR *********************************/
2754
2755 /*************************** ISO-2022-CN *********************************
2756 *
2757 * Rules for ISO-2022-CN Encoding:
2758 * i) The designator sequence must appear once on a line before any instance
2759 * of character set it designates.
2760 * ii) If two lines contain characters from the same character set, both lines
2761 * must include the designator sequence.
2762 * iii) Once the designator sequence is known, a shifting sequence has to be found
2763 * to invoke the shifting
2764 * iv) All lines start in ASCII and end in ASCII.
2765 * v) Four shifting sequences are employed for this purpose:
2766 *
2767 * Sequence ASCII Eq Charsets
2768 * ---------- ------- ---------
2769 * SI <SI> US-ASCII
2770 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2771 * SS2 <ESC>N CNS-11643-1992 Plane 2
2772 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2773 *
2774 * vi)
2775 * SOdesignator : ESC "$" ")" finalchar_for_SO
2776 * SS2designator : ESC "$" "*" finalchar_for_SS2
2777 * SS3designator : ESC "$" "+" finalchar_for_SS3
2778 *
2779 * ESC $ ) A Indicates the bytes following SO are Chinese
2780 * characters as defined in GB 2312-80, until
2781 * another SOdesignation appears
2782 *
2783 *
2784 * ESC $ ) E Indicates the bytes following SO are as defined
2785 * in ISO-IR-165 (for details, see section 2.1),
2786 * until another SOdesignation appears
2787 *
2788 * ESC $ ) G Indicates the bytes following SO are as defined
2789 * in CNS 11643-plane-1, until another
2790 * SOdesignation appears
2791 *
2792 * ESC $ * H Indicates the two bytes immediately following
2793 * SS2 is a Chinese character as defined in CNS
2794 * 11643-plane-2, until another SS2designation
2795 * appears
*                      (Meaning <ESC>N must precede every 2 byte
*                      sequence.)
2798 *
2799 * ESC $ + I Indicates the immediate two bytes following SS3
2800 * is a Chinese character as defined in CNS
2801 * 11643-plane-3, until another SS3designation
2802 * appears
*                      (Meaning <ESC>O must precede every 2 byte
*                      sequence.)
2805 *
2806 * ESC $ + J Indicates the immediate two bytes following SS3
2807 * is a Chinese character as defined in CNS
2808 * 11643-plane-4, until another SS3designation
2809 * appears
*                      (In English: <ESC>O must precede every 2 byte
*                      sequence.)
2812 *
2813 * ESC $ + K Indicates the immediate two bytes following SS3
2814 * is a Chinese character as defined in CNS
2815 * 11643-plane-5, until another SS3designation
2816 * appears
2817 *
2818 * ESC $ + L Indicates the immediate two bytes following SS3
2819 * is a Chinese character as defined in CNS
2820 * 11643-plane-6, until another SS3designation
2821 * appears
2822 *
2823 * ESC $ + M Indicates the immediate two bytes following SS3
2824 * is a Chinese character as defined in CNS
2825 * 11643-plane-7, until another SS3designation
2826 * appears
2827 *
2828 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2829 * has its own designation information before any Chinese characters
2830 * appear
2831 *
2832 */
2833
/*
 * ISO-2022-CN designator escape sequences, one per supported charset.
 * Each is four bytes: ESC '$' followed by ')' (SO/G1), '*' (SS2/G2) or
 * '+' (SS3/G3) and a final byte that selects the charset.
 *
 * The following are defined this way (static const arrays) to make the
 * strings truly read-only.
 * NOTE(review): the bytes are spelled as hex escapes rather than character
 * literals, presumably so their values do not depend on the compiler's
 * execution charset - keep them that way.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2844
/********************** ISO2022-CN Data **************************/
/*
 * Designator sequences used by the ISO-2022-CN fromUnicode code.
 * The fromUnicode function in this file indexes this table directly with
 * the charset id for ids below CNS_11643, and with
 * CNS_11643 + (cs - CNS_11643_1) for the CNS 11643 planes, so the order of
 * the entries must stay in step with those charset constants.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,
        ISO_IR_165_STR,
        CNS_11643_1992_Plane_1_STR,
        CNS_11643_1992_Plane_2_STR,
        CNS_11643_1992_Plane_3_STR,
        CNS_11643_1992_Plane_4_STR,
        CNS_11643_1992_Plane_5_STR,
        CNS_11643_1992_Plane_6_STR,
        CNS_11643_1992_Plane_7_STR
};
2858
2859 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2860 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2861 UConverter *cnv = args->converter;
2862 UConverterDataISO2022 *converterData;
2863 ISO2022State *pFromU2022State;
2864 uint8_t *target = (uint8_t *) args->target;
2865 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2866 const UChar* source = args->source;
2867 const UChar* sourceLimit = args->sourceLimit;
2868 int32_t* offsets = args->offsets;
2869 UChar32 sourceChar;
2870 char buffer[8];
2871 int32_t len;
2872 int8_t choices[3];
2873 int32_t choiceCount;
2874 uint32_t targetValue = 0;
2875 UBool useFallback;
2876
2877 /* set up the state */
2878 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2879 pFromU2022State = &converterData->fromU2022State;
2880
2881 choiceCount = 0;
2882
2883 /* check if the last codepoint of previous buffer was a lead surrogate*/
2884 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2885 goto getTrail;
2886 }
2887
2888 while( source < sourceLimit){
2889 if(target < targetLimit){
2890
2891 sourceChar = *(source++);
2892 /*check if the char is a First surrogate*/
2893 if(UTF_IS_SURROGATE(sourceChar)) {
2894 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2895 getTrail:
2896 /*look ahead to find the trail surrogate*/
2897 if(source < sourceLimit) {
2898 /* test the following code unit */
2899 UChar trail=(UChar) *source;
2900 if(UTF_IS_SECOND_SURROGATE(trail)) {
2901 source++;
2902 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2903 cnv->fromUChar32=0x00;
2904 /* convert this supplementary code point */
2905 /* exit this condition tree */
2906 } else {
2907 /* this is an unmatched lead code unit (1st surrogate) */
2908 /* callback(illegal) */
2909 *err=U_ILLEGAL_CHAR_FOUND;
2910 cnv->fromUChar32=sourceChar;
2911 break;
2912 }
2913 } else {
2914 /* no more input */
2915 cnv->fromUChar32=sourceChar;
2916 break;
2917 }
2918 } else {
2919 /* this is an unmatched trail code unit (2nd surrogate) */
2920 /* callback(illegal) */
2921 *err=U_ILLEGAL_CHAR_FOUND;
2922 cnv->fromUChar32=sourceChar;
2923 break;
2924 }
2925 }
2926
2927 /* do the conversion */
2928 if(sourceChar <= 0x007f ){
2929 /* do not convert SO/SI/ESC */
2930 if(IS_2022_CONTROL(sourceChar)) {
2931 /* callback(illegal) */
2932 *err=U_ILLEGAL_CHAR_FOUND;
2933 cnv->fromUChar32=sourceChar;
2934 break;
2935 }
2936
2937 /* US-ASCII */
2938 if(pFromU2022State->g == 0) {
2939 buffer[0] = (char)sourceChar;
2940 len = 1;
2941 } else {
2942 buffer[0] = UCNV_SI;
2943 buffer[1] = (char)sourceChar;
2944 len = 2;
2945 pFromU2022State->g = 0;
2946 choiceCount = 0;
2947 }
2948 if(sourceChar == CR || sourceChar == LF) {
2949 /* reset the state at the end of a line */
2950 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2951 choiceCount = 0;
2952 }
2953 }
2954 else{
2955 /* convert U+0080..U+10ffff */
2956 int32_t i;
2957 int8_t cs, g;
2958
2959 if(choiceCount == 0) {
2960 /* try the current SO/G1 converter first */
2961 choices[0] = pFromU2022State->cs[1];
2962
2963 /* default to GB2312_1 if none is designated yet */
2964 if(choices[0] == 0) {
2965 choices[0] = GB2312_1;
2966 }
2967
2968 if(converterData->version == 0) {
2969 /* ISO-2022-CN */
2970
2971 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2972 if(choices[0] == GB2312_1) {
2973 choices[1] = (int8_t)CNS_11643_1;
2974 } else {
2975 choices[1] = (int8_t)GB2312_1;
2976 }
2977
2978 choiceCount = 2;
2979 } else if (converterData->version == 1) {
2980 /* ISO-2022-CN-EXT */
2981
2982 /* try one of the other converters */
2983 switch(choices[0]) {
2984 case GB2312_1:
2985 choices[1] = (int8_t)CNS_11643_1;
2986 choices[2] = (int8_t)ISO_IR_165;
2987 break;
2988 case ISO_IR_165:
2989 choices[1] = (int8_t)GB2312_1;
2990 choices[2] = (int8_t)CNS_11643_1;
2991 break;
2992 default: /* CNS_11643_x */
2993 choices[1] = (int8_t)GB2312_1;
2994 choices[2] = (int8_t)ISO_IR_165;
2995 break;
2996 }
2997
2998 choiceCount = 3;
2999 } else {
3000 choices[0] = (int8_t)CNS_11643_1;
3001 choices[1] = (int8_t)GB2312_1;
3002 }
3003 }
3004
3005 cs = g = 0;
3006 /*
3007 * len==0: no mapping found yet
3008 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3009 * len>0: found a roundtrip result, done
3010 */
3011 len = 0;
3012 /*
3013 * We will turn off useFallback after finding a fallback,
3014 * but we still get fallbacks from PUA code points as usual.
3015 * Therefore, we will also need to check that we don't overwrite
3016 * an early fallback with a later one.
3017 */
3018 useFallback = cnv->useFallback;
3019
3020 for(i = 0; i < choiceCount && len <= 0; ++i) {
3021 int8_t cs0 = choices[i];
3022 if(cs0 > 0) {
3023 uint32_t value;
3024 int32_t len2;
3025 if(cs0 >= CNS_11643_0) {
3026 len2 = MBCS_FROM_UCHAR32_ISO2022(
3027 converterData->myConverterArray[CNS_11643],
3028 sourceChar,
3029 &value,
3030 useFallback,
3031 MBCS_OUTPUT_3);
3032 if(len2 == 3 || (len2 == -3 && len == 0)) {
3033 targetValue = value;
3034 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3035 if(len2 >= 0) {
3036 len = 2;
3037 } else {
3038 len = -2;
3039 useFallback = FALSE;
3040 }
3041 if(cs == CNS_11643_1) {
3042 g = 1;
3043 } else if(cs == CNS_11643_2) {
3044 g = 2;
3045 } else /* plane 3..7 */ if(converterData->version == 1) {
3046 g = 3;
3047 } else {
3048 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3049 len = 0;
3050 }
3051 }
3052 } else {
3053 /* GB2312_1 or ISO-IR-165 */
3054 len2 = MBCS_FROM_UCHAR32_ISO2022(
3055 converterData->myConverterArray[cs0],
3056 sourceChar,
3057 &value,
3058 useFallback,
3059 MBCS_OUTPUT_2);
3060 if(len2 == 2 || (len2 == -2 && len == 0)) {
3061 targetValue = value;
3062 len = len2;
3063 cs = cs0;
3064 g = 1;
3065 useFallback = FALSE;
3066 }
3067 }
3068 }
3069 }
3070
3071 if(len != 0) {
3072 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3073
3074 /* write the designation sequence if necessary */
3075 if(cs != pFromU2022State->cs[g]) {
3076 if(cs < CNS_11643) {
3077 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3078 } else {
3079 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3080 }
3081 len = 4;
3082 pFromU2022State->cs[g] = cs;
3083 if(g == 1) {
3084 /* changing the SO/G1 charset invalidates the choices[] */
3085 choiceCount = 0;
3086 }
3087 }
3088
3089 /* write the shift sequence if necessary */
3090 if(g != pFromU2022State->g) {
3091 switch(g) {
3092 case 1:
3093 buffer[len++] = UCNV_SO;
3094
3095 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3096 pFromU2022State->g = 1;
3097 break;
3098 case 2:
3099 buffer[len++] = 0x1b;
3100 buffer[len++] = 0x4e;
3101 break;
3102 default: /* case 3 */
3103 buffer[len++] = 0x1b;
3104 buffer[len++] = 0x4f;
3105 break;
3106 }
3107 }
3108
3109 /* write the two output bytes */
3110 buffer[len++] = (char)(targetValue >> 8);
3111 buffer[len++] = (char)targetValue;
3112 } else {
3113 /* if we cannot find the character after checking all codepages
3114 * then this is an error
3115 */
3116 *err = U_INVALID_CHAR_FOUND;
3117 cnv->fromUChar32=sourceChar;
3118 break;
3119 }
3120 }
3121
3122 /* output len>0 bytes in buffer[] */
3123 if(len == 1) {
3124 *target++ = buffer[0];
3125 if(offsets) {
3126 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3127 }
3128 } else if(len == 2 && (target + 2) <= targetLimit) {
3129 *target++ = buffer[0];
3130 *target++ = buffer[1];
3131 if(offsets) {
3132 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3133 *offsets++ = sourceIndex;
3134 *offsets++ = sourceIndex;
3135 }
3136 } else {
3137 fromUWriteUInt8(
3138 cnv,
3139 buffer, len,
3140 &target, (const char *)targetLimit,
3141 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3142 err);
3143 if(U_FAILURE(*err)) {
3144 break;
3145 }
3146 }
3147 } /* end if(myTargetIndex<myTargetLength) */
3148 else{
3149 *err =U_BUFFER_OVERFLOW_ERROR;
3150 break;
3151 }
3152
3153 }/* end while(mySourceIndex<mySourceLength) */
3154
3155 /*
3156 * the end of the input stream and detection of truncated input
3157 * are handled by the framework, but for ISO-2022-CN conversion
3158 * we need to be in ASCII mode at the very end
3159 *
3160 * conditions:
3161 * successful
3162 * not in ASCII mode
3163 * end of input and no truncated input
3164 */
3165 if( U_SUCCESS(*err) &&
3166 pFromU2022State->g!=0 &&
3167 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3168 ) {
3169 int32_t sourceIndex;
3170
3171 /* we are switching to ASCII */
3172 pFromU2022State->g=0;
3173
3174 /* get the source index of the last input character */
3175 /*
3176 * TODO this would be simpler and more reliable if we used a pair
3177 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3178 * so that we could simply use the prevSourceIndex here;
3179 * this code gives an incorrect result for the rare case of an unmatched
3180 * trail surrogate that is alone in the last buffer of the text stream
3181 */
3182 sourceIndex=(int32_t)(source-args->source);
3183 if(sourceIndex>0) {
3184 --sourceIndex;
3185 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3186 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3187 ) {
3188 --sourceIndex;
3189 }
3190 } else {
3191 sourceIndex=-1;
3192 }
3193
3194 fromUWriteUInt8(
3195 cnv,
3196 SHIFT_IN_STR, 1,
3197 &target, (const char *)targetLimit,
3198 &offsets, sourceIndex,
3199 err);
3200 }
3201
3202 /*save the state and return */
3203 args->source = source;
3204 args->target = (char*)target;
3205 }
3206
3207
/*
 * Converts ISO-2022-CN family bytes to UTF-16.
 *
 * args: source/sourceLimit delimit the byte input, target/targetLimit the
 *       UChar output; if args->offsets is non-NULL the source index of each
 *       output UChar is recorded. args->source and args->target are advanced
 *       on return.
 * err:  set to U_BUFFER_OVERFLOW_ERROR when the target is full,
 *       U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO...SI/designator segment,
 *       or whatever changeState_2022() / toUnicodeCallback() report.
 *
 * State kept between calls (all visible in this function):
 *   myData->toU2022State  current G-set designations and active shift
 *   myData->key           non-zero while an escape sequence is incomplete
 *   myData->isEmptySegment  TRUE right after SO until a character is seen
 *   converter->toUBytes/toULength  a pending lead byte of a double-byte pair
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    /* resume whatever the previous call left unfinished */
    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                /* shift in: back to the G0 set */
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves targetUniChar at missingCharMarker, so the callback below reports the SO byte */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    /* remember the position so the length of an irregular sequence can be computed */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* end of line: all designations and shifts are reset */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    /* a double-byte set is active: a second byte is needed */
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS planes share one table; prefix the pair with a plane byte 0x80+plane */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* input ends after the lead byte: save it for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* single-byte mode: only 7-bit values map; others stay missingCharMarker */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result; the offset points at the first byte of the character */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* publish the consumed/produced positions to the caller */
    args->target = myTarget;
    args->source = mySource;
}
3408
3409 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3410 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3411 UConverter *cnv = args->converter;
3412 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3413 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3414 char *p, *subchar;
3415 char buffer[8];
3416 int32_t length;
3417
3418 subchar=(char *)cnv->subChars;
3419 length=cnv->subCharLen; /* assume length==1 for most variants */
3420
3421 p = buffer;
3422 switch(myConverterData->locale[0]){
3423 case 'j':
3424 {
3425 int8_t cs;
3426
3427 if(pFromU2022State->g == 1) {
3428 /* JIS7: switch from G1 to G0 */
3429 pFromU2022State->g = 0;
3430 *p++ = UCNV_SI;
3431 }
3432
3433 cs = pFromU2022State->cs[0];
3434 if(cs != ASCII && cs != JISX201) {
3435 /* not in ASCII or JIS X 0201: switch to ASCII */
3436 pFromU2022State->cs[0] = (int8_t)ASCII;
3437 *p++ = '\x1b';
3438 *p++ = '\x28';
3439 *p++ = '\x42';
3440 }
3441
3442 *p++ = subchar[0];
3443 break;
3444 }
3445 case 'c':
3446 if(pFromU2022State->g != 0) {
3447 /* not in ASCII mode: switch to ASCII */
3448 pFromU2022State->g = 0;
3449 *p++ = UCNV_SI;
3450 }
3451 *p++ = subchar[0];
3452 break;
3453 case 'k':
3454 if(myConverterData->version == 0) {
3455 if(length == 1) {
3456 if((UBool)args->converter->fromUnicodeStatus) {
3457 /* in DBCS mode: switch to SBCS */
3458 args->converter->fromUnicodeStatus = 0;
3459 *p++ = UCNV_SI;
3460 }
3461 *p++ = subchar[0];
3462 } else /* length == 2*/ {
3463 if(!(UBool)args->converter->fromUnicodeStatus) {
3464 /* in SBCS mode: switch to DBCS */
3465 args->converter->fromUnicodeStatus = 1;
3466 *p++ = UCNV_SO;
3467 }
3468 *p++ = subchar[0];
3469 *p++ = subchar[1];
3470 }
3471 break;
3472 } else {
3473 /* save the subconverter's substitution string */
3474 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3475 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3476
3477 /* set our substitution string into the subconverter */
3478 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3479 myConverterData->currentConverter->subCharLen = (int8_t)length;
3480
3481 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3482 args->converter = myConverterData->currentConverter;
3483 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3484 ucnv_cbFromUWriteSub(args, 0, err);
3485 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3486 args->converter = cnv;
3487
3488 /* restore the subconverter's substitution string */
3489 myConverterData->currentConverter->subChars = currentSubChars;
3490 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3491
3492 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3493 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3494 uprv_memcpy(
3495 cnv->charErrorBuffer,
3496 myConverterData->currentConverter->charErrorBuffer,
3497 myConverterData->currentConverter->charErrorBufferLength);
3498 }
3499 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3500 myConverterData->currentConverter->charErrorBufferLength = 0;
3501 }
3502 return;
3503 }
3504 default:
3505 /* not expected */
3506 break;
3507 }
3508 ucnv_cbFromUWriteBytes(args,
3509 buffer, (int32_t)(p - buffer),
3510 offsetIndex, err);
3511 }
3512
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned ISO 2022 converter itself; returned to the caller */
    UConverter currentConverter;    /* storage handed to ucnv_safeClone() for the sub-converter clone */
    UAlignedMemory deadSpace;       /* padding so the sub-converter clone may be re-aligned */
    UConverterDataISO2022 mydata;   /* copy of the source converter's ISO 2022 extra data */
};
3531
3532
3533 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3534 _ISO_2022_SafeClone(
3535 const UConverter *cnv,
3536 void *stackBuffer,
3537 int32_t *pBufferSize,
3538 UErrorCode *status)
3539 {
3540 struct cloneStruct * localClone;
3541 UConverterDataISO2022 *cnvData;
3542 int32_t i, size;
3543
3544 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3545 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3546 return NULL;
3547 }
3548
3549 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3550 localClone = (struct cloneStruct *)stackBuffer;
3551
3552 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3553
3554 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3555 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3556 localClone->cnv.isExtraLocal = TRUE;
3557
3558 /* share the subconverters */
3559
3560 if(cnvData->currentConverter != NULL) {
3561 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3562 localClone->mydata.currentConverter =
3563 ucnv_safeClone(cnvData->currentConverter,
3564 &localClone->currentConverter,
3565 &size, status);
3566 if(U_FAILURE(*status)) {
3567 return NULL;
3568 }
3569 }
3570
3571 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3572 if(cnvData->myConverterArray[i] != NULL) {
3573 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3574 }
3575 }
3576
3577 return &localClone->cnv;
3578 }
3579
3580 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3581 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3582 const USetAdder *sa,
3583 UConverterUnicodeSet which,
3584 UErrorCode *pErrorCode)
3585 {
3586 int32_t i;
3587 UConverterDataISO2022* cnvData;
3588
3589 if (U_FAILURE(*pErrorCode)) {
3590 return;
3591 }
3592 #ifdef U_ENABLE_GENERIC_ISO_2022
3593 if (cnv->sharedData == &_ISO2022Data) {
3594 /* We use UTF-8 in this case */
3595 sa->addRange(sa->set, 0, 0xd7FF);
3596 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3597 return;
3598 }
3599 #endif
3600
3601 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3602
3603 /* open a set and initialize it with code points that are algorithmically round-tripped */
3604 switch(cnvData->locale[0]){
3605 case 'j':
3606 /* include JIS X 0201 which is hardcoded */
3607 sa->add(sa->set, 0xa5);
3608 sa->add(sa->set, 0x203e);
3609 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3610 /* include Latin-1 for some variants of JP */
3611 sa->addRange(sa->set, 0, 0xff);
3612 } else {
3613 /* include ASCII for JP */
3614 sa->addRange(sa->set, 0, 0x7f);
3615 }
3616 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3617 /*
3618 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3619 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3620 * use half-width Katakana.
3621 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3622 * half-width Katakana via the ESC ( I sequence.
3623 * However, we only emit (fromUnicode) half-width Katakana according to the
3624 * definition of each variant.
3625 *
3626 * When including fallbacks,
3627 * we need to include half-width Katakana Unicode code points for all JP variants because
3628 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3629 */
3630 /* include half-width Katakana for JP */
3631 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3632 }
3633 break;
3634 case 'c':
3635 case 'z':
3636 /* include ASCII for CN */
3637 sa->addRange(sa->set, 0, 0x7f);
3638 break;
3639 case 'k':
3640 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3641 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3642 cnvData->currentConverter, sa, which, pErrorCode);
3643 /* the loop over myConverterArray[] will simply not find another converter */
3644 break;
3645 default:
3646 break;
3647 }
3648
3649 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3650 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3651 cnvData->version==0 && i==CNS_11643
3652 ) {
3653 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3654 ucnv_MBCSGetUnicodeSetForBytes(
3655 cnvData->myConverterArray[i],
3656 sa, UCNV_ROUNDTRIP_SET,
3657 0, 0x81, 0x82,
3658 pErrorCode);
3659 }
3660 #endif
3661
3662 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3663 UConverterSetFilter filter;
3664 if(cnvData->myConverterArray[i]!=NULL) {
3665 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3666 cnvData->version==0 && i==CNS_11643
3667 ) {
3668 /*
3669 * Version-specific for CN:
3670 * CN version 0 does not map CNS planes 3..7 although
3671 * they are all available in the CNS conversion table;
3672 * CN version 1 (-EXT) does map them all.
3673 * The two versions create different Unicode sets.
3674 */
3675 filter=UCNV_SET_FILTER_2022_CN;
3676 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3677 /*
3678 * Only add code points that map to Shift-JIS codes
3679 * corresponding to JIS X 0208.
3680 */
3681 filter=UCNV_SET_FILTER_SJIS;
3682 } else if(i==KSC5601) {
3683 /*
3684 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3685 * are broader than GR94.
3686 */
3687 filter=UCNV_SET_FILTER_GR94DBCS;
3688 } else {
3689 filter=UCNV_SET_FILTER_NONE;
3690 }
3691 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3692 }
3693 }
3694
3695 /*
3696 * ISO 2022 converters must not convert SO/SI/ESC despite what
3697 * sub-converters do by themselves.
3698 * Remove these characters from the set.
3699 */
3700 sa->remove(sa->set, 0x0e);
3701 sa->remove(sa->set, 0x0f);
3702 sa->remove(sa->set, 0x1b);
3703
3704 /* ISO 2022 converters do not convert C1 controls either */
3705 sa->removeRange(sa->set, 0x80, 0x9f);
3706 }
3707
/*
 * Function table for the generic ISO_2022 converter.
 * The conversion function slots are only populated when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the initializer is positional; the meaning of each slot is
 * given by the UConverterImpl declaration, which is not in this file section.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/*
 * Static (constant) description of the generic ISO_2022 converter.
 * NOTE(review): positional initializer; field meanings come from the
 * UConverterStaticData declaration, which is not visible here.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022",
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 },
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data object for the generic ISO_2022 converter; ties together
 * _ISO2022StaticData and _ISO2022Impl. Unlike the JP and KR shared data
 * objects below, this one is not static.
 * NOTE(review): positional initializer; see the UConverterSharedData
 * declaration (not visible here) for the meaning of each field.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022StaticData,
    FALSE,
    &_ISO2022Impl,
    0
};
3763
/*************JP****************/
/*
 * Function table for the ISO-2022-JP converters. The same function is
 * installed in both toUnicode slots and in both fromUnicode slots; the
 * open/close/reset, getName, writeSub, safeClone and getUnicodeSet entries
 * are shared with the other ISO 2022 variants.
 * NOTE(review): positional initializer; slot meanings follow the
 * UConverterImpl declaration, which is not visible here.
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
3787 static const UConverterStaticData _ISO2022JPStaticData={ /* positional static description of the ISO-2022-JP variant; referenced by _ISO2022JPData below */
3788 sizeof(UConverterStaticData), /* size of this structure */
3789 "ISO_2022_JP", /* converter name string */
3790 0, /* zero here, whereas the generic table uses 2022 -- presumably a codepage number; confirm */
3791 UCNV_IBM, /* presumably the platform identifier; confirm against UConverterStaticData */
3792 UCNV_ISO_2022, /* same converter type tag as the first entry of _ISO2022JPImpl */
3793 1, /* presumably the minimum bytes per character, given the maximum follows; confirm */
3794 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3795 { 0x1a, 0, 0, 0 }, /* 0x1a is the ASCII SUB control code -- presumably the substitution byte sequence; confirm */
3796 1, /* presumably the length of the substitution sequence above; confirm */
3797 FALSE, /* the two boolean members and the two integers after them are all off/zero; meanings not visible here */
3798 FALSE,
3799 0,
3800 0,
3801 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3802 };
3803 static const UConverterSharedData _ISO2022JPData={ /* file-local shared-data record tying the ISO-2022-JP static data to its callback table */
3804 sizeof(UConverterSharedData), /* size of this structure */
3805 ~((uint32_t) 0), /* all 32 bits set -- presumably a reference-count sentinel for statically allocated data; confirm */
3806 NULL, /* two unused pointer members; meanings not visible here */
3807 NULL,
3808 &_ISO2022JPStaticData, /* static description defined just above */
3809 FALSE,
3810 &_ISO2022JPImpl, /* callback table defined above */
3811 0
3812 };
3813
3814 /************* KR ***************/
3815 static const UConverterImpl _ISO2022KRImpl={ /* positional callback table for the ISO-2022-KR variant; entry order must match the UConverterImpl declaration, which is not visible in this part of the file */
3816 UCNV_ISO_2022, /* converter type tag, same as the generic, JP and CN tables */
3817
3818 NULL, /* two unused slots -- presumably load and unload; confirm against UConverterImpl */
3819 NULL,
3820
3821 _ISO2022Open, /* Open/Close/Reset handlers shared with the other ISO-2022 tables in this file */
3822 _ISO2022Close,
3823 _ISO2022Reset,
3824
3825 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* one KR-specific function fills two consecutive slots -- presumably toUnicode without and with offsets; confirm */
3826 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3827 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* likewise one KR-specific function for both fromUnicode slots */
3828 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3829 NULL, /* unused slot -- presumably getNextUChar; confirm */
3830
3831 NULL, /* unused slot -- presumably getStarters; confirm */
3832 _ISO2022getName, /* remaining four handlers are shared with the other ISO-2022 tables */
3833 _ISO_2022_WriteSub,
3834 _ISO_2022_SafeClone,
3835 _ISO_2022_GetUnicodeSet
3836 };
3837 static const UConverterStaticData _ISO2022KRStaticData={ /* positional static description of the ISO-2022-KR variant; referenced by _ISO2022KRData below */
3838 sizeof(UConverterStaticData), /* size of this structure */
3839 "ISO_2022_KR", /* converter name string */
3840 0, /* zero here, whereas the generic table uses 2022 -- presumably a codepage number; confirm */
3841 UCNV_IBM, /* presumably the platform identifier; confirm against UConverterStaticData */
3842 UCNV_ISO_2022, /* same converter type tag as the first entry of _ISO2022KRImpl */
3843 1, /* presumably the minimum bytes per character, given the maximum follows; confirm */
3844 3, /* max 3 bytes per UChar: SO+DBCS */
3845 { 0x1a, 0, 0, 0 }, /* 0x1a is the ASCII SUB control code -- presumably the substitution byte sequence; confirm */
3846 1, /* presumably the length of the substitution sequence above; confirm */
3847 FALSE, /* the two boolean members and the two integers after them are all off/zero; meanings not visible here */
3848 FALSE,
3849 0,
3850 0,
3851 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3852 };
3853 static const UConverterSharedData _ISO2022KRData={ /* file-local shared-data record tying the ISO-2022-KR static data to its callback table */
3854 sizeof(UConverterSharedData), /* size of this structure */
3855 ~((uint32_t) 0), /* all 32 bits set -- presumably a reference-count sentinel for statically allocated data; confirm */
3856 NULL, /* two unused pointer members; meanings not visible here */
3857 NULL,
3858 &_ISO2022KRStaticData, /* static description defined just above */
3859 FALSE,
3860 &_ISO2022KRImpl, /* callback table defined above */
3861 0
3862 };
3863
3864 /*************** CN ***************/
3865 static const UConverterImpl _ISO2022CNImpl={ /* positional callback table for the ISO-2022-CN variant; entry order must match the UConverterImpl declaration, which is not visible in this part of the file */
3866
3867 UCNV_ISO_2022, /* converter type tag, same as the generic, JP and KR tables */
3868
3869 NULL, /* two unused slots -- presumably load and unload; confirm against UConverterImpl */
3870 NULL,
3871
3872 _ISO2022Open, /* Open/Close/Reset handlers shared with the other ISO-2022 tables in this file */
3873 _ISO2022Close,
3874 _ISO2022Reset,
3875
3876 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* one CN-specific function fills two consecutive slots -- presumably toUnicode without and with offsets; confirm */
3877 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3878 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* likewise one CN-specific function for both fromUnicode slots */
3879 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3880 NULL, /* unused slot -- presumably getNextUChar; confirm */
3881
3882 NULL, /* unused slot -- presumably getStarters; confirm */
3883 _ISO2022getName, /* remaining four handlers are shared with the other ISO-2022 tables */
3884 _ISO_2022_WriteSub,
3885 _ISO_2022_SafeClone,
3886 _ISO_2022_GetUnicodeSet
3887 };
3888 static const UConverterStaticData _ISO2022CNStaticData={ /* positional static description of the ISO-2022-CN variant; referenced by _ISO2022CNData below */
3889 sizeof(UConverterStaticData), /* size of this structure */
3890 "ISO_2022_CN", /* converter name string */
3891 0, /* zero here, whereas the generic table uses 2022 -- presumably a codepage number; confirm */
3892 UCNV_IBM, /* presumably the platform identifier; confirm against UConverterStaticData */
3893 UCNV_ISO_2022, /* same converter type tag as the first entry of _ISO2022CNImpl */
3894 1, /* presumably the minimum bytes per character, given the maximum follows; confirm */
3895 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3896 { 0x1a, 0, 0, 0 }, /* 0x1a is the ASCII SUB control code -- presumably the substitution byte sequence; confirm */
3897 1, /* presumably the length of the substitution sequence above; confirm */
3898 FALSE, /* the two boolean members and the two integers after them are all off/zero; meanings not visible here */
3899 FALSE,
3900 0,
3901 0,
3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3903 };
3904 static const UConverterSharedData _ISO2022CNData={ /* file-local shared-data record tying the ISO-2022-CN static data to its callback table */
3905 sizeof(UConverterSharedData), /* size of this structure */
3906 ~((uint32_t) 0), /* all 32 bits set -- presumably a reference-count sentinel for statically allocated data; confirm */
3907 NULL, /* two unused pointer members; meanings not visible here */
3908 NULL,
3909 &_ISO2022CNStaticData, /* static description defined just above */
3910 FALSE,
3911 &_ISO2022CNImpl, /* callback table defined above */
3912 0
3913 };
3914
3915
3916
3917 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */
3918