1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot combine with
 * operators at the use site (for example "sizeof LENGTHOF(a)").
 * The argument is not evaluated (it only appears inside sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* One-byte, NUL-terminated strings holding the shift controls: SI is 0x0F, SO is 0x0E. */
static const char SHIFT_IN_STR[] = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the line-break and whitespace characters. */
#define CR 0x0D
#define LF 0x0A
#define H_TAB 0x09
#define V_TAB 0x0B
#define SPACE 0x20
86
/*
 * Inclusive bounds of the half-width Katakana range (HWKANA);
 * presumably Unicode code points U+FF61..U+FF9F - confirm against the users.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
91
92 /*
93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
94 * as bytes 21..7E. (Subtract 0x80.)
95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
96 * as bytes 20..7F. (Subtract 0x80.)
97 * Do not encode C1 control codes with native bytes 80..9F
98 * as bytes 00..1F (C0 control codes).
99 */
enum {
    GR94_START=0xa1,    /* first native byte value of a 94-character set */
    GR94_END=0xfe,      /* last native byte value of a 94-character set */
    GR96_START=0xa0,    /* first native byte value of a 96-character set */
    GR96_END=0xff       /* last native byte value of a 96-character set */
};
106
107 /*
108 * ISO 2022 control codes must not be converted from Unicode
109 * because they would mess up the byte stream.
110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
111 * corresponding to SO, SI, and ESC.
112 */
/*
 * Evaluates to 1 if c is SO (0x0e), SI (0x0f) or ESC (0x1b), otherwise 0.
 * The mask 0x0800c000 has exactly bits 0xe, 0xf and 0x1b set; bit c of the
 * mask is selected by shifting the mask right by c.
 * The shift is only reached for c<0x20. Note that c is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)0x0800c000>>(c))&1)!=0)
114
115 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers shared by the JP and CN code paths.
 * The JP and CN groups deliberately reuse the numeric values 1..3
 * (ISO8859_1/GB2312_1, ISO8859_7/ISO_IR_165, JISX201/CNS_11643).
 * All values fit into int8_t, which is the element type of ISO2022State.cs[]
 * and of the nextStateToUnicodeJP/CN tables.
 */
typedef enum {
    /* shared values */
    INVALID_STATE=-1,
    ASCII = 0,

    /* single-shift markers; kept apart from the charset numbers */
    SS2_STATE=0x10,
    SS3_STATE,

    /* JP */
    /* these values are also bit positions for CSM() in the uint16_t jpCharsetMasks[] */
    ISO8859_1 = 1 ,
    ISO8859_7 = 2 ,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 =6,
    KSC5601 =7,
    HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1=1,
    ISO_IR_165=2,
    CNS_11643=3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0=0x20,
    CNS_11643_1,
    CNS_11643_2,
    CNS_11643_3,
    CNS_11643_4,
    CNS_11643_5,
    CNS_11643_6,
    CNS_11643_7
} StateEnum;
153
/* is the StateEnum charset value for a DBCS charset? */
/* True for the JP values JISX208..KSC5601 (4..7). Note: cs is evaluated twice. */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* Charset mask: a single bit at position cs, for building and testing jpCharsetMasks[] entries. */
#define CSM(cs) ((uint16_t)1<<(cs))
158
159 /*
160 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
161 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
162 *
163 * Note: The converter uses some leniency:
164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
165 * all versions, not just JIS7 and JIS8.
166 * - ICU does not distinguish between different versions of JIS X 0208.
167 */
/* Highest supported version number for locale=ja; jpCharsetMasks[] has one entry per version 0..MAX_JA_VERSION. */
enum { MAX_JA_VERSION=4 };
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    /* version 0 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    /* version 1: adds JISX212 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    /* versions 2, 3 and 4 share the same mask: adds GB2312, KSC5601, ISO8859_1, ISO8859_7 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};
176
/*
 * Values for UConverterDataISO2022.currentType.
 * _ISO2022Open() initializes that field to ASCII1.
 */
typedef enum {
    ASCII1=0,
    LATIN1,
    SBCS,
    DBCS,
    MBCS,
    HWKANA
}Cnv2022Type;
185
/*
 * Designation and shift state for one conversion direction.
 * UConverterDataISO2022 holds one instance for toUnicode and one for
 * fromUnicode; _ISO2022Reset() resets an instance by zero-filling it.
 */
typedef struct ISO2022State {
    int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG; /* g before single shift (SS2 or SS3) */
} ISO2022State;
191
/* The low 4 bits of the converter options carry the variant version (see _ISO2022Open()). */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10
194
/*
 * Per-converter data, allocated by _ISO2022Open() into UConverter.extraInfo
 * and released by _ISO2022Close().
 */
typedef struct{
    /* shared data of the loaded sub-charsets, indexed by StateEnum values; unused slots are NULL */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() (used by the ko variant); closed in _ISO2022Close() */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    /* separate designation/shift state for each direction */
    ISO2022State toU2022State, fromU2022State;
    /* key of a partially read escape sequence (see getKey_2022()); 0 when none is pending */
    uint32_t key;
    /* variant version, taken from options & UCNV_OPTIONS_VERSION_MASK */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* cleared by _ISO2022Reset(); NOTE(review): exact meaning is defined by the toUnicode code, not visible here */
    UBool isEmptySegment;
    /* converter name as returned by _ISO2022getName() */
    char name[30];
    /* "ja", "ko" or "cn"; locale[0]==0 for the generic converter */
    char locale[3];
}UConverterDataISO2022;
209
210 /* Protos */
211 /* ISO-2022 ----------------------------------------------------------------- */
212
213 /*Forward declaration */
/*
 * UTF-8 fromUnicode functions, declared here directly rather than through a header.
 * They are defined outside this file; presumably they are only needed by the
 * generic ISO-2022 converter - confirm before removing.
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
220
#define ESC_2022 0x1B /*ESC*/

/*
 * Classification of an escape-sequence prefix, as stored in
 * escSeqStateTable_Value_2022[] and returned by getKey_2022().
 */
typedef enum
{
    INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
230
231 /*
232 * The way these state transition arrays work is:
233 * ex : ESC$B is the sequence for JISX208
234 * a) First Iteration: char is ESC
235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236 * int x = normalize_esq_chars_2022[27] which is equal to 1
237 * ii) Search for this value in escSeqStateTable_Key_2022[]
238 * value of x is stored at escSeqStateTable_Key_2022[0]
239 * iii) Save this index as offset
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242 * b) Switch on this state and continue to next char
243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244 * which is normalize_esq_chars_2022[36] == 4
245 * ii) x is currently 1(from above)
246 * x<<=5 -- x is now 32
247 * x+=normalize_esq_chars_2022[36]
248 * now x is 36
249 * iii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253 * c) Switch on this state and continue to next char
254 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255 * ii) x is currently 36 (from above)
256 * x<<=5 -- x is now 1152
257 * x+=normalize_esq_chars_2022[66]
258 * now x is 1161
259 * iii) Search for this value in escSeqStateTable_Key_2022[]
 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
264 */
265
266
267 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value (the array index) to a small code 1..29 if the byte can
 * occur in a recognized escape sequence, or to 0 if it cannot.
 * getKey_2022() appends the code to the running key with key=(key<<5)+code,
 * so every code must stay below 32.
 * Each row below covers 10 consecutive byte values; the first row starts at 0.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0 1 2 3 4 5 6 7 8 9 */

    0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
    ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
    ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
    ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
    ,0 ,0 ,0 ,0 ,0 ,0
};
298
299 #ifdef U_ENABLE_GENERIC_ISO_2022
300 /*
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
304 *
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 *
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
312 *
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
315 */
316 #endif
317
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape-sequence prefixes, as computed by getKey_2022().
 * Must stay sorted in ascending order because getKey_2022() uses a binary search.
 * The index of a key is the "offset" into the parallel Value/Result/nextState tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /* 0 1 2 3 4 5 6 7 8 9 */

    1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
    ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
    ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
    ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
    ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
    ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
    ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
    ,35947631 ,35947635 ,35947636 ,35947638
};
331
332 #ifdef U_ENABLE_GENERIC_ISO_2022
333
/*
 * Generic ISO-2022 only: converter name to open for each entry of
 * escSeqStateTable_Key_2022[] (same index).
 * NULL marks entries without a converter: non-terminal prefixes and the
 * single shifts, which changeState_2022() reports as unsupported.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /* 0 1 2 3 4 5 6 7 8 9 */

    NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
    ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
    ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
    ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
    ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
    ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
    ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
};
346
347 #endif
348
/*
 * UCNV_TableStates_2022 classification for each entry of
 * escSeqStateTable_Key_2022[] (same index); returned by getKey_2022().
 * Only entry 10 (key 1097) is VALID_MAYBE_TERMINAL_2022.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
    /* 0 1 2 3 4 5 6 7 8 9 */
    VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
};
360
361
362 /* Type def for refactoring changeState_2022 code*/
/*
 * Selects which ISO-2022 variant changeState_2022() interprets a completed
 * escape sequence for. The generic value 0 only exists when
 * U_ENABLE_GENERIC_ISO_2022 is defined; the other values are fixed explicitly.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;
371
372 /*********** ISO 2022 Converter Protos ***********/
/* Allocates and initializes the per-converter data for the variant selected by pArgs->locale. */
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

/* Releases everything that _ISO2022Open() acquired. */
static void
_ISO2022Close(UConverter *converter);

/* Resets the toUnicode and/or fromUnicode state, depending on choice. */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

/* Returns the variant-specific converter name, or NULL if there is no per-converter data. */
static const char*
_ISO2022getName(const UConverter* cnv);

static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Forward (tentative) declarations of the per-variant shared data that _ISO2022Open() installs. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;
400
401 /*************** Converter implementations ******************/
402
403 /* The purpose of this function is to get around gcc compiler warnings. */
404 static U_INLINE void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)405 fromUWriteUInt8(UConverter *cnv,
406 const char *bytes, int32_t length,
407 uint8_t **target, const char *targetLimit,
408 int32_t **offsets,
409 int32_t sourceIndex,
410 UErrorCode *pErrorCode)
411 {
412 char *targetChars = (char *)*target;
413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
414 offsets, sourceIndex, pErrorCode);
415 *target = (uint8_t*)targetChars;
416
417 }
418
419 static U_INLINE void
setInitialStateToUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
421 if(myConverterData->version == 1) {
422 UConverter *cnv = myConverterData->currentConverter;
423
424 cnv->toUnicodeStatus=0; /* offset */
425 cnv->mode=0; /* state */
426 cnv->toULength=0; /* byteIndex */
427 }
428 }
429
430 static U_INLINE void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
432 /* in ISO-2022-KR the designator sequence appears only once
433 * in a file so we append it only once
434 */
435 if( converter->charErrorBufferLength==0){
436
437 converter->charErrorBufferLength = 4;
438 converter->charErrorBuffer[0] = 0x1b;
439 converter->charErrorBuffer[1] = 0x24;
440 converter->charErrorBuffer[2] = 0x29;
441 converter->charErrorBuffer[3] = 0x43;
442 }
443 if(myConverterData->version == 1) {
444 UConverter *cnv = myConverterData->currentConverter;
445
446 cnv->fromUChar32=0;
447 cnv->fromUnicodeStatus=1; /* prevLength */
448 }
449 }
450
451 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
453
454 char myLocale[6]={' ',' ',' ',' ',' ',' '};
455
456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
457 if(cnv->extraInfo != NULL) {
458 UConverterNamePieces stackPieces;
459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
461 uint32_t version;
462
463 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
464
465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
466 myConverterData->currentType = ASCII1;
467 cnv->fromUnicodeStatus =FALSE;
468 if(pArgs->locale){
469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
470 }
471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
472 myConverterData->version = version;
473 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
474 (myLocale[2]=='_' || myLocale[2]=='\0'))
475 {
476 size_t len=0;
477 /* open the required converters and cache them */
478 if(version>MAX_JA_VERSION) {
479 /* prevent indexing beyond jpCharsetMasks[] */
480 myConverterData->version = version = 0;
481 }
482 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
483 myConverterData->myConverterArray[ISO8859_7] =
484 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
485 }
486 myConverterData->myConverterArray[JISX208] =
487 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
488 if(jpCharsetMasks[version]&CSM(JISX212)) {
489 myConverterData->myConverterArray[JISX212] =
490 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
491 }
492 if(jpCharsetMasks[version]&CSM(GB2312)) {
493 myConverterData->myConverterArray[GB2312] =
494 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
495 }
496 if(jpCharsetMasks[version]&CSM(KSC5601)) {
497 myConverterData->myConverterArray[KSC5601] =
498 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
499 }
500
501 /* set the function pointers to appropriate funtions */
502 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
503 uprv_strcpy(myConverterData->locale,"ja");
504
505 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
506 len = uprv_strlen(myConverterData->name);
507 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
508 myConverterData->name[len+1]='\0';
509 }
510 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
511 (myLocale[2]=='_' || myLocale[2]=='\0'))
512 {
513 const char *cnvName;
514 if(version==1) {
515 cnvName="icu-internal-25546";
516 } else {
517 cnvName="ksc_5601";
518 myConverterData->version=version=0;
519 }
520 if(pArgs->onlyTestIsLoadable) {
521 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
522 uprv_free(cnv->extraInfo);
523 cnv->extraInfo=NULL;
524 return;
525 } else {
526 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
527 if (U_FAILURE(*errorCode)) {
528 _ISO2022Close(cnv);
529 return;
530 }
531
532 if(version==1) {
533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
534 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
535 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
536 }else{
537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
538 }
539
540 /* initialize the state variables */
541 setInitialStateToUnicodeKR(cnv, myConverterData);
542 setInitialStateFromUnicodeKR(cnv, myConverterData);
543
544 /* set the function pointers to appropriate funtions */
545 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
546 uprv_strcpy(myConverterData->locale,"ko");
547 }
548 }
549 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
550 (myLocale[2]=='_' || myLocale[2]=='\0'))
551 {
552
553 /* open the required converters and cache them */
554 myConverterData->myConverterArray[GB2312_1] =
555 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
556 if(version==1) {
557 myConverterData->myConverterArray[ISO_IR_165] =
558 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
559 }
560 myConverterData->myConverterArray[CNS_11643] =
561 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
562
563
564 /* set the function pointers to appropriate funtions */
565 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
566 uprv_strcpy(myConverterData->locale,"cn");
567
568 if (version==1){
569 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
570 }else{
571 myConverterData->version = 0;
572 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
573 }
574 }
575 else{
576 #ifdef U_ENABLE_GENERIC_ISO_2022
577 myConverterData->isFirstBuffer = TRUE;
578
579 /* append the UTF-8 escape sequence */
580 cnv->charErrorBufferLength = 3;
581 cnv->charErrorBuffer[0] = 0x1b;
582 cnv->charErrorBuffer[1] = 0x25;
583 cnv->charErrorBuffer[2] = 0x42;
584
585 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
586 /* initialize the state variables */
587 uprv_strcpy(myConverterData->name,"ISO_2022");
588 #else
589 *errorCode = U_UNSUPPORTED_ERROR;
590 return;
591 #endif
592 }
593
594 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
595
596 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
597 _ISO2022Close(cnv);
598 }
599 } else {
600 *errorCode = U_MEMORY_ALLOCATION_ERROR;
601 }
602 }
603
604
605 static void
_ISO2022Close(UConverter * converter)606 _ISO2022Close(UConverter *converter) {
607 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
608 UConverterSharedData **array = myData->myConverterArray;
609 int32_t i;
610
611 if (converter->extraInfo != NULL) {
612 /*close the array of converter pointers and free the memory*/
613 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
614 if(array[i]!=NULL) {
615 ucnv_unloadSharedDataIfReady(array[i]);
616 }
617 }
618
619 ucnv_close(myData->currentConverter);
620
621 if(!converter->isExtraLocal){
622 uprv_free (converter->extraInfo);
623 converter->extraInfo = NULL;
624 }
625 }
626 }
627
628 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)629 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
630 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
631 if(choice<=UCNV_RESET_TO_UNICODE) {
632 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
633 myConverterData->key = 0;
634 myConverterData->isEmptySegment = FALSE;
635 }
636 if(choice!=UCNV_RESET_TO_UNICODE) {
637 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
638 }
639 #ifdef U_ENABLE_GENERIC_ISO_2022
640 if(myConverterData->locale[0] == 0){
641 if(choice<=UCNV_RESET_TO_UNICODE) {
642 myConverterData->isFirstBuffer = TRUE;
643 myConverterData->key = 0;
644 if (converter->mode == UCNV_SO){
645 ucnv_close (myConverterData->currentConverter);
646 myConverterData->currentConverter=NULL;
647 }
648 converter->mode = UCNV_SI;
649 }
650 if(choice!=UCNV_RESET_TO_UNICODE) {
651 /* re-append UTF-8 escape sequence */
652 converter->charErrorBufferLength = 3;
653 converter->charErrorBuffer[0] = 0x1b;
654 converter->charErrorBuffer[1] = 0x28;
655 converter->charErrorBuffer[2] = 0x42;
656 }
657 }
658 else
659 #endif
660 {
661 /* reset the state variables */
662 if(myConverterData->locale[0] == 'k'){
663 if(choice<=UCNV_RESET_TO_UNICODE) {
664 setInitialStateToUnicodeKR(converter, myConverterData);
665 }
666 if(choice!=UCNV_RESET_TO_UNICODE) {
667 setInitialStateFromUnicodeKR(converter, myConverterData);
668 }
669 }
670 }
671 }
672
673 static const char*
_ISO2022getName(const UConverter * cnv)674 _ISO2022getName(const UConverter* cnv){
675 if(cnv->extraInfo){
676 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
677 return myData->name;
678 }
679 return NULL;
680 }
681
682
683 /*************** to unicode *******************/
684 /****************************************************************************
685 * Recognized escape sequences are
686 * <ESC>(B ASCII
687 * <ESC>.A ISO-8859-1
688 * <ESC>.F ISO-8859-7
689 * <ESC>(J JISX-201
 * <ESC>(I JISX-201 Katakana (7-bit half-width, HWKANA_7BIT)
691 * <ESC>$B JISX-208
692 * <ESC>$@ JISX-208
693 * <ESC>$(D JISX-212
694 * <ESC>$A GB2312
695 * <ESC>$(C KSC5601
696 */
/*
 * ISO-2022-JP: StateEnum value selected by each escape sequence, indexed by the
 * same offset as escSeqStateTable_Key_2022[].
 * INVALID_STATE marks sequences that this variant does not support;
 * SS2_STATE (index 5) is the single shift, not a charset designation.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
    /* 0 1 2 3 4 5 6 7 8 9 */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
708
709 /*************** to unicode *******************/
/*
 * ISO-2022-CN: StateEnum value selected by each escape sequence, indexed by the
 * same offset as escSeqStateTable_Key_2022[].
 * INVALID_STATE marks sequences that this variant does not support;
 * SS2_STATE and SS3_STATE (indexes 5 and 6) are the single shifts.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
    /* 0 1 2 3 4 5 6 7 8 9 */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
    ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
721
722
723 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)724 getKey_2022(char c,int32_t* key,int32_t* offset){
725 int32_t togo;
726 int32_t low = 0;
727 int32_t hi = MAX_STATES_2022;
728 int32_t oldmid=0;
729
730 togo = normalize_esq_chars_2022[(uint8_t)c];
731 if(togo == 0) {
732 /* not a valid character anywhere in an escape sequence */
733 *key = 0;
734 *offset = 0;
735 return INVALID_2022;
736 }
737 togo = (*key << 5) + togo;
738
739 while (hi != low) /*binary search*/{
740
741 register int32_t mid = (hi+low) >> 1; /*Finds median*/
742
743 if (mid == oldmid)
744 break;
745
746 if (escSeqStateTable_Key_2022[mid] > togo){
747 hi = mid;
748 }
749 else if (escSeqStateTable_Key_2022[mid] < togo){
750 low = mid;
751 }
752 else /*we found it*/{
753 *key = togo;
754 *offset = mid;
755 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
756 }
757 oldmid = mid;
758
759 }
760
761 *key = 0;
762 *offset = 0;
763 return INVALID_2022;
764 }
765
/* Runs through a state machine to determine the escape sequence - codepage correspondence
 */
768 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)769 changeState_2022(UConverter* _this,
770 const char** source,
771 const char* sourceLimit,
772 Variant2022 var,
773 UErrorCode* err){
774 UCNV_TableStates_2022 value;
775 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
776 uint32_t key = myData2022->key;
777 int32_t offset = 0;
778 int8_t initialToULength = _this->toULength;
779 char c;
780
781 value = VALID_NON_TERMINAL_2022;
782 while (*source < sourceLimit) {
783 c = *(*source)++;
784 _this->toUBytes[_this->toULength++]=(uint8_t)c;
785 value = getKey_2022(c,(int32_t *) &key, &offset);
786
787 switch (value){
788
789 case VALID_NON_TERMINAL_2022 :
790 /* continue with the loop */
791 break;
792
793 case VALID_TERMINAL_2022:
794 key = 0;
795 goto DONE;
796
797 case INVALID_2022:
798 goto DONE;
799
800 case VALID_MAYBE_TERMINAL_2022:
801 #ifdef U_ENABLE_GENERIC_ISO_2022
802 /* ESC ( B is ambiguous only for ISO_2022 itself */
803 if(var == ISO_2022) {
804 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
805 _this->toULength = 0;
806
807 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
808
809 /* continue with the loop */
810 value = VALID_NON_TERMINAL_2022;
811 break;
812 } else
813 #endif
814 {
815 /* not ISO_2022 itself, finish here */
816 value = VALID_TERMINAL_2022;
817 key = 0;
818 goto DONE;
819 }
820 }
821 }
822
823 DONE:
824 myData2022->key = key;
825
826 if (value == VALID_NON_TERMINAL_2022) {
827 /* indicate that the escape sequence is incomplete: key!=0 */
828 return;
829 } else if (value == INVALID_2022 ) {
830 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
831 } else /* value == VALID_TERMINAL_2022 */ {
832 switch(var){
833 #ifdef U_ENABLE_GENERIC_ISO_2022
834 case ISO_2022:
835 {
836 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
837 if(chosenConverterName == NULL) {
838 /* SS2 or SS3 */
839 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
840 _this->toUCallbackReason = UCNV_UNASSIGNED;
841 return;
842 }
843
844 _this->mode = UCNV_SI;
845 ucnv_close(myData2022->currentConverter);
846 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
847 if(U_SUCCESS(*err)) {
848 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
849 _this->mode = UCNV_SO;
850 }
851 break;
852 }
853 #endif
854 case ISO_2022_JP:
855 {
856 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
857 switch(tempState) {
858 case INVALID_STATE:
859 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
860 break;
861 case SS2_STATE:
862 if(myData2022->toU2022State.cs[2]!=0) {
863 if(myData2022->toU2022State.g<2) {
864 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
865 }
866 myData2022->toU2022State.g=2;
867 } else {
868 /* illegal to have SS2 before a matching designator */
869 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
870 }
871 break;
872 /* case SS3_STATE: not used in ISO-2022-JP-x */
873 case ISO8859_1:
874 case ISO8859_7:
875 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
876 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
877 } else {
878 /* G2 charset for SS2 */
879 myData2022->toU2022State.cs[2]=(int8_t)tempState;
880 }
881 break;
882 default:
883 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
884 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
885 } else {
886 /* G0 charset */
887 myData2022->toU2022State.cs[0]=(int8_t)tempState;
888 }
889 break;
890 }
891 }
892 break;
893 case ISO_2022_CN:
894 {
895 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
896 switch(tempState) {
897 case INVALID_STATE:
898 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
899 break;
900 case SS2_STATE:
901 if(myData2022->toU2022State.cs[2]!=0) {
902 if(myData2022->toU2022State.g<2) {
903 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
904 }
905 myData2022->toU2022State.g=2;
906 } else {
907 /* illegal to have SS2 before a matching designator */
908 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
909 }
910 break;
911 case SS3_STATE:
912 if(myData2022->toU2022State.cs[3]!=0) {
913 if(myData2022->toU2022State.g<2) {
914 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
915 }
916 myData2022->toU2022State.g=3;
917 } else {
918 /* illegal to have SS3 before a matching designator */
919 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
920 }
921 break;
922 case ISO_IR_165:
923 if(myData2022->version==0) {
924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
925 break;
926 }
927 /*fall through*/
928 case GB2312_1:
929 /*fall through*/
930 case CNS_11643_1:
931 myData2022->toU2022State.cs[1]=(int8_t)tempState;
932 break;
933 case CNS_11643_2:
934 myData2022->toU2022State.cs[2]=(int8_t)tempState;
935 break;
936 default:
937 /* other CNS 11643 planes */
938 if(myData2022->version==0) {
939 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
940 } else {
941 myData2022->toU2022State.cs[3]=(int8_t)tempState;
942 }
943 break;
944 }
945 }
946 break;
947 case ISO_2022_KR:
948 if(offset==0x30){
949 /* nothing to be done, just accept this one escape sequence */
950 } else {
951 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
952 }
953 break;
954
955 default:
956 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
957 break;
958 }
959 }
960 if(U_SUCCESS(*err)) {
961 _this->toULength = 0;
962 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
963 if(_this->toULength>1) {
964 /*
965 * Ticket 5691: consistent illegal sequences:
966 * - We include at least the first byte (ESC) in the illegal sequence.
967 * - If any of the non-initial bytes could be the start of a character,
968 * we stop the illegal sequence before the first one of those.
969 * In escape sequences, all following bytes are "printable", that is,
970 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
971 * they are valid single/lead bytes.
972 * For simplicity, we always only report the initial ESC byte as the
973 * illegal sequence and back out all other bytes we looked at.
974 */
975 /* Back out some bytes. */
976 int8_t backOutDistance=_this->toULength-1;
977 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
978 if(backOutDistance<=bytesFromThisBuffer) {
979 /* same as initialToULength<=1 */
980 *source-=backOutDistance;
981 } else {
982 /* Back out bytes from the previous buffer: Need to replay them. */
983 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
984 /* same as -(initialToULength-1) */
985 /* preToULength is negative! */
986 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
987 *source-=bytesFromThisBuffer;
988 }
989 _this->toULength=1;
990 }
991 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
992 _this->toUCallbackReason = UCNV_UNASSIGNED;
993 }
994 }
995
996 /*Checks the characters of the buffer against valid 2022 escape sequences
997 *if the match we return a pointer to the initial start of the sequence otherwise
998 *we return sourceLimit
999 */
1000 /*for 2022 looks ahead in the stream
1001 *to determine the longest possible convertible
1002 *data stream
1003 */
1004 static U_INLINE const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool flush)1005 getEndOfBuffer_2022(const char** source,
1006 const char* sourceLimit,
1007 UBool flush){
1008
1009 const char* mySource = *source;
1010
1011 #ifdef U_ENABLE_GENERIC_ISO_2022
1012 if (*source >= sourceLimit)
1013 return sourceLimit;
1014
1015 do{
1016
1017 if (*mySource == ESC_2022){
1018 int8_t i;
1019 int32_t key = 0;
1020 int32_t offset;
1021 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1022
1023 /* Kludge: I could not
1024 * figure out the reason for validating an escape sequence
1025 * twice - once here and once in changeState_2022().
1026 * is it possible to have an ESC character in a ISO2022
1027 * byte stream which is valid in a code page? Is it legal?
1028 */
1029 for (i=0;
1030 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1031 i++) {
1032 value = getKey_2022(*(mySource+i), &key, &offset);
1033 }
1034 if (value > 0 || *mySource==ESC_2022)
1035 return mySource;
1036
1037 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1038 return sourceLimit;
1039 }
1040 }while (++mySource < sourceLimit);
1041
1042 return sourceLimit;
1043 #else
1044 while(mySource < sourceLimit && *mySource != ESC_2022) {
1045 ++mySource;
1046 }
1047 return mySource;
1048 #endif
1049 }
1050
1051
1052 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1053 * any future change in _MBCSFromUChar32() function should be reflected here.
1054 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1055 */
1056 static U_INLINE int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1057 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1058 UChar32 c,
1059 uint32_t* value,
1060 UBool useFallback,
1061 int outputType)
1062 {
1063 const int32_t *cx;
1064 const uint16_t *table;
1065 uint32_t stage2Entry;
1066 uint32_t myValue;
1067 int32_t length;
1068 const uint8_t *p;
1069 /*
1070 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1071 * Use internal version of ucnv_open() that verifies that the new structures are available,
1072 * else U_INTERNAL_PROGRAM_ERROR.
1073 */
1074 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1075 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1076 table=sharedData->mbcs.fromUnicodeTable;
1077 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1078 /* get the bytes and the length for the output */
1079 if(outputType==MBCS_OUTPUT_2){
1080 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1081 if(myValue<=0xff) {
1082 length=1;
1083 } else {
1084 length=2;
1085 }
1086 } else /* outputType==MBCS_OUTPUT_3 */ {
1087 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1088 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1089 if(myValue<=0xff) {
1090 length=1;
1091 } else if(myValue<=0xffff) {
1092 length=2;
1093 } else {
1094 length=3;
1095 }
1096 }
1097 /* is this code point assigned, or do we use fallbacks? */
1098 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1099 /* assigned */
1100 *value=myValue;
1101 return length;
1102 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1103 /*
1104 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1105 * There is no way with this data structure for fallback output
1106 * to be a zero byte.
1107 */
1108 *value=myValue;
1109 return -length;
1110 }
1111 }
1112
1113 cx=sharedData->mbcs.extIndexes;
1114 if(cx!=NULL) {
1115 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1116 }
1117
1118 /* unassigned */
1119 return 0;
1120 }
1121
1122 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1123 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1124 * @param retval pointer to output byte
1125 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1126 */
1127 static U_INLINE int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1128 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1129 UChar32 c,
1130 uint32_t* retval,
1131 UBool useFallback)
1132 {
1133 const uint16_t *table;
1134 int32_t value;
1135 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1136 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1137 return 0;
1138 }
1139 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1140 table=sharedData->mbcs.fromUnicodeTable;
1141 /* get the byte for the output */
1142 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1143 /* is this code point assigned, or do we use fallbacks? */
1144 *retval=(uint32_t)(value&0xff);
1145 if(value>=0xf00) {
1146 return 1; /* roundtrip */
1147 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1148 return -1; /* fallback taken */
1149 } else {
1150 return 0; /* no mapping */
1151 }
1152 }
1153
/*
 * Convert a strict EUC DBCS value (two bytes, each in A1..FE) to the
 * ISO 2022 byte range 21..7E by subtracting 0x80 from each byte.
 * Return 0 if either byte is outside A1..FE.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* unsigned wrap-around turns each range check into a single comparison */
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;           /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1170
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled: this function is compiled out by the surrounding #if 0.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each byte, then verify that both bytes landed in A1..FE */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1188
1189 #ifdef U_ENABLE_GENERIC_ISO_2022
1190
1191 /**********************************************************************************
1192 * ISO-2022 Converter
1193 *
1194 *
1195 */
1196
/*
 * To-Unicode conversion for the generic ISO-2022 converter
 * (only compiled with U_ENABLE_GENERIC_ISO_2022).
 *
 * Alternates between two steps until the input is used up or an early return is taken:
 * 1. If no escape sequence is pending (myData->key == 0), convert the bytes up to
 *    the next ESC (or the end of the input) with myData->currentConverter;
 *    an "ASCII" converter is opened first if there is no current converter yet.
 * 2. Call changeState_2022() to process the escape sequence at the current position.
 *
 * Overflow output, invalid input bytes and partial input bytes of the
 * sub-converter are copied into the outer converter before returning.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no converter chosen by an escape sequence yet: start out with ASCII */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter in args; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing a partial escape sequence (key != 0) */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
1300
1301 #endif
1302
1303 /*
1304 * To Unicode Callback helper function
1305 */
1306 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1307 toUnicodeCallback(UConverter *cnv,
1308 const uint32_t sourceChar, const uint32_t targetUniChar,
1309 UErrorCode* err){
1310 if(sourceChar>0xff){
1311 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1312 cnv->toUBytes[1] = (uint8_t)sourceChar;
1313 cnv->toULength = 2;
1314 }
1315 else{
1316 cnv->toUBytes[0] =(char) sourceChar;
1317 cnv->toULength = 1;
1318 }
1319
1320 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1321 *err = U_INVALID_CHAR_FOUND;
1322 }
1323 else{
1324 *err = U_ILLEGAL_CHAR_FOUND;
1325 }
1326 }
1327
1328 /**************************************ISO-2022-JP*************************************************/
1329
1330 /************************************** IMPORTANT **************************************************
1331 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1332 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1333 * The converter iterates over each Unicode codepoint
1334 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1335 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1336 * would do as far as possible.
1337 *
1338 * If the implementation of these macros or structure of sharedData struct change in the future, make
1339 * sure that ISO-2022 is also changed.
1340 ***************************************************************************************************
1341 */
1342
1343 /***************************************************************************************************
1344 * Rules for ISO-2022-jp encoding
1345 * (i) Escape sequences must be fully contained within a line they should not
1346 * span new lines or CRs
1347 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1348 * JIS-Roman character escape sequence should follow before the line terminates
1349 * (iii) If the first character on the line is represented by two bytes then a two
1350 * byte character escape sequence should precede it
1351 * (iv) If no escape sequence is encountered then the characters are ASCII
1352 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1353 * and invoked with SS2 (ESC N).
1354 * (vi) If there is any G0 designation in text, there must be a switch to
1355 * ASCII or to JIS X 0201-Roman before a space character (but not
1356 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1357 * characters such as tab or CRLF.
 *   (vii) Supported encodings:
1359 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1360 *
1361 * source : RFC-1554
1362 *
1363 * JISX201, JISX208,JISX212 : new .cnv data files created
1364 * KSC5601 : alias to ibm-949 mapping table
1365 * GB2312 : alias to ibm-1386 mapping table
1366 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
 *  ISO-8859-7  : alias to ibm-9409 mapping table
1368 */
1369
/*
 * preference order of JP charsets
 *
 * When converting from Unicode, the charsets are tried in this order after the
 * currently designated ones; a charset is only tried if the charset mask of
 * the converter version allows it.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    ISO8859_7,
    JISX208,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1382
/*
 * Designation escape sequences, indexed by charset (StateEnum value).
 *
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Number of bytes in each escSeqChars[] entry, in the same order;
 * used as the copy length when an escape sequence is written to the output.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1410
1411 /*
1412 * The iteration over various code pages works this way:
1413 * i) Get the currentState from myConverterData->currentState
1414 * ii) Check if the character is mapped to a valid character in the currentState
1415 * Yes -> a) set the initIterState to currentState
1416 * b) remain in this state until an invalid character is found
1417 * No -> a) go to the next code page and find the character
 *  iii) Before changing the state, increment the current state and check whether
 *       it is equal to the initIterState
1420 * Yes -> A character that cannot be represented in any of the supported encodings
1421 * break and return a U_INVALID_CHARACTER error
1422 * No -> Continue and find the character in next code page
1423 *
1424 *
1425 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1426 */
1427
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from the identity mapping: 5C maps to U+00A5
 * and 7E maps to U+203E. Every other value is returned unchanged.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1441
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E map to 5C and 7E; U+005C and U+007E themselves are unmappable.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    /* the two code points that replace backslash and tilde */
    if(value==0xa5) {
        return 0x5c;
    }
    if(value==0x203e) {
        return 0x7e;
    }
    /* everything else is identity-mapped, except the two replaced bytes */
    if(value>0x7f || value==0x5c || value==0x7e) {
        return 0xfffe;
    }
    return value;
}
1456
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t low;
    uint32_t row;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;   /* trail byte */
    row = value & 0xff00;   /* lead byte, still in the upper byte position */

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    row = (row - ((row <= 0x9f00) ? 0x7000 : 0xb000)) << 1;

    if(low > 0x9e) {
        /* trail <= 0xfc */
        return row | (low - 0x7e);
    }

    /* trail bytes up to 9E select the preceding row */
    row -= 0x100;
    return row | (low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1492
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1 and c2 are the input bytes; the Shift-JIS lead and trail bytes are
 * written to bytes[0] and bytes[1].
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) == 0) {
        /* even first byte: the trail byte must be in 21..7E */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0; /* invalid */
        }
    } else {
        /* odd first byte: round up before halving below */
        lead = (uint8_t)(c1 + 1);
        if(c2 > 0x7e) {
            trail = 0; /* invalid */
        } else {
            trail = (uint8_t)(c2 + ((c2 <= 0x5f) ? 0x1f : 0x20));
        }
    }

    lead >>= 1;
    if(lead <= 0x2f) {
        lead += 0x70;
    } else if(lead <= 0x3f) {
        lead += 0xb0;
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1529
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table is indexed by (code point - HWKANA_START); each entry is the
 * two-byte fallback result.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1602
1603 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1604 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1605 UConverter *cnv = args->converter;
1606 UConverterDataISO2022 *converterData;
1607 ISO2022State *pFromU2022State;
1608 uint8_t *target = (uint8_t *) args->target;
1609 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1610 const UChar* source = args->source;
1611 const UChar* sourceLimit = args->sourceLimit;
1612 int32_t* offsets = args->offsets;
1613 UChar32 sourceChar;
1614 char buffer[8];
1615 int32_t len, outLen;
1616 int8_t choices[10];
1617 int32_t choiceCount;
1618 uint32_t targetValue = 0;
1619 UBool useFallback;
1620
1621 int32_t i;
1622 int8_t cs, g;
1623
1624 /* set up the state */
1625 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1626 pFromU2022State = &converterData->fromU2022State;
1627
1628 choiceCount = 0;
1629
1630 /* check if the last codepoint of previous buffer was a lead surrogate*/
1631 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1632 goto getTrail;
1633 }
1634
1635 while(source < sourceLimit) {
1636 if(target < targetLimit) {
1637
1638 sourceChar = *(source++);
1639 /*check if the char is a First surrogate*/
1640 if(UTF_IS_SURROGATE(sourceChar)) {
1641 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1642 getTrail:
1643 /*look ahead to find the trail surrogate*/
1644 if(source < sourceLimit) {
1645 /* test the following code unit */
1646 UChar trail=(UChar) *source;
1647 if(UTF_IS_SECOND_SURROGATE(trail)) {
1648 source++;
1649 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1650 cnv->fromUChar32=0x00;
1651 /* convert this supplementary code point */
1652 /* exit this condition tree */
1653 } else {
1654 /* this is an unmatched lead code unit (1st surrogate) */
1655 /* callback(illegal) */
1656 *err=U_ILLEGAL_CHAR_FOUND;
1657 cnv->fromUChar32=sourceChar;
1658 break;
1659 }
1660 } else {
1661 /* no more input */
1662 cnv->fromUChar32=sourceChar;
1663 break;
1664 }
1665 } else {
1666 /* this is an unmatched trail code unit (2nd surrogate) */
1667 /* callback(illegal) */
1668 *err=U_ILLEGAL_CHAR_FOUND;
1669 cnv->fromUChar32=sourceChar;
1670 break;
1671 }
1672 }
1673
1674 /* do not convert SO/SI/ESC */
1675 if(IS_2022_CONTROL(sourceChar)) {
1676 /* callback(illegal) */
1677 *err=U_ILLEGAL_CHAR_FOUND;
1678 cnv->fromUChar32=sourceChar;
1679 break;
1680 }
1681
1682 /* do the conversion */
1683
1684 if(choiceCount == 0) {
1685 uint16_t csm;
1686
1687 /*
1688 * The csm variable keeps track of which charsets are allowed
1689 * and not used yet while building the choices[].
1690 */
1691 csm = jpCharsetMasks[converterData->version];
1692 choiceCount = 0;
1693
1694 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1695 if(converterData->version == 3 || converterData->version == 4) {
1696 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1697 }
1698 /* Do not try single-byte half-width Katakana for other versions. */
1699 csm &= ~CSM(HWKANA_7BIT);
1700
1701 /* try the current G0 charset */
1702 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1703 csm &= ~CSM(cs);
1704
1705 /* try the current G2 charset */
1706 if((cs = pFromU2022State->cs[2]) != 0) {
1707 choices[choiceCount++] = cs;
1708 csm &= ~CSM(cs);
1709 }
1710
1711 /* try all the other possible charsets */
1712 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1713 cs = (int8_t)jpCharsetPref[i];
1714 if(CSM(cs) & csm) {
1715 choices[choiceCount++] = cs;
1716 csm &= ~CSM(cs);
1717 }
1718 }
1719 }
1720
1721 cs = g = 0;
1722 /*
1723 * len==0: no mapping found yet
1724 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1725 * len>0: found a roundtrip result, done
1726 */
1727 len = 0;
1728 /*
1729 * We will turn off useFallback after finding a fallback,
1730 * but we still get fallbacks from PUA code points as usual.
1731 * Therefore, we will also need to check that we don't overwrite
1732 * an early fallback with a later one.
1733 */
1734 useFallback = cnv->useFallback;
1735
1736 for(i = 0; i < choiceCount && len <= 0; ++i) {
1737 uint32_t value;
1738 int32_t len2;
1739 int8_t cs0 = choices[i];
1740 switch(cs0) {
1741 case ASCII:
1742 if(sourceChar <= 0x7f) {
1743 targetValue = (uint32_t)sourceChar;
1744 len = 1;
1745 cs = cs0;
1746 g = 0;
1747 }
1748 break;
1749 case ISO8859_1:
1750 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1751 targetValue = (uint32_t)sourceChar - 0x80;
1752 len = 1;
1753 cs = cs0;
1754 g = 2;
1755 }
1756 break;
1757 case HWKANA_7BIT:
1758 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1759 if(converterData->version==3) {
1760 /* JIS7: use G1 (SO) */
1761 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1762 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1763 len = 1;
1764 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1765 g = 1;
1766 } else if(converterData->version==4) {
1767 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1768 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1769 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1770 len = 1;
1771
1772 cs = pFromU2022State->cs[0];
1773 if(IS_JP_DBCS(cs)) {
1774 /* switch from a DBCS charset to JISX201 */
1775 cs = (int8_t)JISX201;
1776 }
1777 /* else stay in the current G0 charset */
1778 g = 0;
1779 }
1780 /* else do not use HWKANA_7BIT with other versions */
1781 }
1782 break;
1783 case JISX201:
1784 /* G0 SBCS */
1785 value = jisx201FromU(sourceChar);
1786 if(value <= 0x7f) {
1787 targetValue = value;
1788 len = 1;
1789 cs = cs0;
1790 g = 0;
1791 useFallback = FALSE;
1792 }
1793 break;
1794 case JISX208:
1795 /* G0 DBCS from Shift-JIS table */
1796 len2 = MBCS_FROM_UCHAR32_ISO2022(
1797 converterData->myConverterArray[cs0],
1798 sourceChar, &value,
1799 useFallback, MBCS_OUTPUT_2);
1800 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1801 value = _2022FromSJIS(value);
1802 if(value != 0) {
1803 targetValue = value;
1804 len = len2;
1805 cs = cs0;
1806 g = 0;
1807 useFallback = FALSE;
1808 }
1809 } else if(len == 0 && useFallback &&
1810 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1811 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1812 len = -2;
1813 cs = cs0;
1814 g = 0;
1815 useFallback = FALSE;
1816 }
1817 break;
1818 case ISO8859_7:
1819 /* G0 SBCS forced to 7-bit output */
1820 len2 = MBCS_SINGLE_FROM_UCHAR32(
1821 converterData->myConverterArray[cs0],
1822 sourceChar, &value,
1823 useFallback);
1824 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1825 targetValue = value - 0x80;
1826 len = len2;
1827 cs = cs0;
1828 g = 2;
1829 useFallback = FALSE;
1830 }
1831 break;
1832 default:
1833 /* G0 DBCS */
1834 len2 = MBCS_FROM_UCHAR32_ISO2022(
1835 converterData->myConverterArray[cs0],
1836 sourceChar, &value,
1837 useFallback, MBCS_OUTPUT_2);
1838 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1839 if(cs0 == KSC5601) {
1840 /*
1841 * Check for valid bytes for the encoding scheme.
1842 * This is necessary because the sub-converter (windows-949)
1843 * has a broader encoding scheme than is valid for 2022.
1844 */
1845 value = _2022FromGR94DBCS(value);
1846 if(value == 0) {
1847 break;
1848 }
1849 }
1850 targetValue = value;
1851 len = len2;
1852 cs = cs0;
1853 g = 0;
1854 useFallback = FALSE;
1855 }
1856 break;
1857 }
1858 }
1859
1860 if(len != 0) {
1861 if(len < 0) {
1862 len = -len; /* fallback */
1863 }
1864 outLen = 0; /* count output bytes */
1865
1866 /* write SI if necessary (only for JIS7) */
1867 if(pFromU2022State->g == 1 && g == 0) {
1868 buffer[outLen++] = UCNV_SI;
1869 pFromU2022State->g = 0;
1870 }
1871
1872 /* write the designation sequence if necessary */
1873 if(cs != pFromU2022State->cs[g]) {
1874 int32_t escLen = escSeqCharsLen[cs];
1875 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1876 outLen += escLen;
1877 pFromU2022State->cs[g] = cs;
1878
1879 /* invalidate the choices[] */
1880 choiceCount = 0;
1881 }
1882
1883 /* write the shift sequence if necessary */
1884 if(g != pFromU2022State->g) {
1885 switch(g) {
1886 /* case 0 handled before writing escapes */
1887 case 1:
1888 buffer[outLen++] = UCNV_SO;
1889 pFromU2022State->g = 1;
1890 break;
1891 default: /* case 2 */
1892 buffer[outLen++] = 0x1b;
1893 buffer[outLen++] = 0x4e;
1894 break;
1895 /* no case 3: no SS3 in ISO-2022-JP-x */
1896 }
1897 }
1898
1899 /* write the output bytes */
1900 if(len == 1) {
1901 buffer[outLen++] = (char)targetValue;
1902 } else /* len == 2 */ {
1903 buffer[outLen++] = (char)(targetValue >> 8);
1904 buffer[outLen++] = (char)targetValue;
1905 }
1906 } else {
1907 /*
1908 * if we cannot find the character after checking all codepages
1909 * then this is an error
1910 */
1911 *err = U_INVALID_CHAR_FOUND;
1912 cnv->fromUChar32=sourceChar;
1913 break;
1914 }
1915
1916 if(sourceChar == CR || sourceChar == LF) {
1917 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1918 pFromU2022State->cs[2] = 0;
1919 choiceCount = 0;
1920 }
1921
1922 /* output outLen>0 bytes in buffer[] */
1923 if(outLen == 1) {
1924 *target++ = buffer[0];
1925 if(offsets) {
1926 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1927 }
1928 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1929 *target++ = buffer[0];
1930 *target++ = buffer[1];
1931 if(offsets) {
1932 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1933 *offsets++ = sourceIndex;
1934 *offsets++ = sourceIndex;
1935 }
1936 } else {
1937 fromUWriteUInt8(
1938 cnv,
1939 buffer, outLen,
1940 &target, (const char *)targetLimit,
1941 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1942 err);
1943 if(U_FAILURE(*err)) {
1944 break;
1945 }
1946 }
1947 } /* end if(myTargetIndex<myTargetLength) */
1948 else{
1949 *err =U_BUFFER_OVERFLOW_ERROR;
1950 break;
1951 }
1952
1953 }/* end while(mySourceIndex<mySourceLength) */
1954
1955 /*
1956 * the end of the input stream and detection of truncated input
1957 * are handled by the framework, but for ISO-2022-JP conversion
1958 * we need to be in ASCII mode at the very end
1959 *
1960 * conditions:
1961 * successful
1962 * in SO mode or not in ASCII mode
1963 * end of input and no truncated input
1964 */
1965 if( U_SUCCESS(*err) &&
1966 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1967 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1968 ) {
1969 int32_t sourceIndex;
1970
1971 outLen = 0;
1972
1973 if(pFromU2022State->g != 0) {
1974 buffer[outLen++] = UCNV_SI;
1975 pFromU2022State->g = 0;
1976 }
1977
1978 if(pFromU2022State->cs[0] != ASCII) {
1979 int32_t escLen = escSeqCharsLen[ASCII];
1980 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1981 outLen += escLen;
1982 pFromU2022State->cs[0] = (int8_t)ASCII;
1983 }
1984
1985 /* get the source index of the last input character */
1986 /*
1987 * TODO this would be simpler and more reliable if we used a pair
1988 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1989 * so that we could simply use the prevSourceIndex here;
1990 * this code gives an incorrect result for the rare case of an unmatched
1991 * trail surrogate that is alone in the last buffer of the text stream
1992 */
1993 sourceIndex=(int32_t)(source-args->source);
1994 if(sourceIndex>0) {
1995 --sourceIndex;
1996 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1997 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1998 ) {
1999 --sourceIndex;
2000 }
2001 } else {
2002 sourceIndex=-1;
2003 }
2004
2005 fromUWriteUInt8(
2006 cnv,
2007 buffer, outLen,
2008 &target, (const char *)targetLimit,
2009 &offsets, sourceIndex,
2010 err);
2011 }
2012
2013 /*save the state and return */
2014 args->source = source;
2015 args->target = (char*)target;
2016 }
2017
2018 /*************** to unicode *******************/
2019
2020 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2021 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2022 UErrorCode* err){
2023 char tempBuf[2];
2024 const char *mySource = (char *) args->source;
2025 UChar *myTarget = args->target;
2026 const char *mySourceLimit = args->sourceLimit;
2027 uint32_t targetUniChar = 0x0000;
2028 uint32_t mySourceChar = 0x0000;
2029 uint32_t tmpSourceChar = 0x0000;
2030 UConverterDataISO2022* myData;
2031 ISO2022State *pToU2022State;
2032 StateEnum cs;
2033
2034 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2035 pToU2022State = &myData->toU2022State;
2036
2037 if(myData->key != 0) {
2038 /* continue with a partial escape sequence */
2039 goto escape;
2040 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2041 /* continue with a partial double-byte character */
2042 mySourceChar = args->converter->toUBytes[0];
2043 args->converter->toULength = 0;
2044 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2045 targetUniChar = missingCharMarker;
2046 goto getTrailByte;
2047 }
2048
2049 while(mySource < mySourceLimit){
2050
2051 targetUniChar =missingCharMarker;
2052
2053 if(myTarget < args->targetLimit){
2054
2055 mySourceChar= (unsigned char) *mySource++;
2056
2057 switch(mySourceChar) {
2058 case UCNV_SI:
2059 if(myData->version==3) {
2060 pToU2022State->g=0;
2061 continue;
2062 } else {
2063 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2064 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2065 break;
2066 }
2067
2068 case UCNV_SO:
2069 if(myData->version==3) {
2070 /* JIS7: switch to G1 half-width Katakana */
2071 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2072 pToU2022State->g=1;
2073 continue;
2074 } else {
2075 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2076 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2077 break;
2078 }
2079
2080 case ESC_2022:
2081 mySource--;
2082 escape:
2083 {
2084 const char * mySourceBefore = mySource;
2085 int8_t toULengthBefore = args->converter->toULength;
2086
2087 changeState_2022(args->converter,&(mySource),
2088 mySourceLimit, ISO_2022_JP,err);
2089
2090 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2091 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2092 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2093 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2094 args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
2095 }
2096 }
2097
2098 /* invalid or illegal escape sequence */
2099 if(U_FAILURE(*err)){
2100 args->target = myTarget;
2101 args->source = mySource;
2102 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2103 return;
2104 }
2105 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2106 if(myData->key==0) {
2107 myData->isEmptySegment = TRUE;
2108 }
2109 continue;
2110
2111 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2112
2113 case CR:
2114 /*falls through*/
2115 case LF:
2116 /* automatically reset to single-byte mode */
2117 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2118 pToU2022State->cs[0] = (int8_t)ASCII;
2119 }
2120 pToU2022State->cs[2] = 0;
2121 pToU2022State->g = 0;
2122 /* falls through */
2123 default:
2124 /* convert one or two bytes */
2125 myData->isEmptySegment = FALSE;
2126 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2127 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2128 !IS_JP_DBCS(cs)
2129 ) {
2130 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2131 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2132
2133 /* return from a single-shift state to the previous one */
2134 if(pToU2022State->g >= 2) {
2135 pToU2022State->g=pToU2022State->prevG;
2136 }
2137 } else switch(cs) {
2138 case ASCII:
2139 if(mySourceChar <= 0x7f) {
2140 targetUniChar = mySourceChar;
2141 }
2142 break;
2143 case ISO8859_1:
2144 if(mySourceChar <= 0x7f) {
2145 targetUniChar = mySourceChar + 0x80;
2146 }
2147 /* return from a single-shift state to the previous one */
2148 pToU2022State->g=pToU2022State->prevG;
2149 break;
2150 case ISO8859_7:
2151 if(mySourceChar <= 0x7f) {
2152 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2153 targetUniChar =
2154 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2155 myData->myConverterArray[cs],
2156 mySourceChar + 0x80);
2157 }
2158 /* return from a single-shift state to the previous one */
2159 pToU2022State->g=pToU2022State->prevG;
2160 break;
2161 case JISX201:
2162 if(mySourceChar <= 0x7f) {
2163 targetUniChar = jisx201ToU(mySourceChar);
2164 }
2165 break;
2166 case HWKANA_7BIT:
2167 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2168 /* 7-bit halfwidth Katakana */
2169 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2170 }
2171 break;
2172 default:
2173 /* G0 DBCS */
2174 if(mySource < mySourceLimit) {
2175 int leadIsOk, trailIsOk;
2176 uint8_t trailByte;
2177 getTrailByte:
2178 trailByte = (uint8_t)*mySource;
2179 /*
2180 * Ticket 5691: consistent illegal sequences:
2181 * - We include at least the first byte in the illegal sequence.
2182 * - If any of the non-initial bytes could be the start of a character,
2183 * we stop the illegal sequence before the first one of those.
2184 *
2185 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2186 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2187 * Otherwise we convert or report the pair of bytes.
2188 */
2189 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2190 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2191 if (leadIsOk && trailIsOk) {
2192 ++mySource;
2193 tmpSourceChar = (mySourceChar << 8) | trailByte;
2194 if(cs == JISX208) {
2195 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2196 mySourceChar = tmpSourceChar;
2197 } else {
2198 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2199 mySourceChar = tmpSourceChar;
2200 if (cs == KSC5601) {
2201 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2202 }
2203 tempBuf[0] = (char)(tmpSourceChar >> 8);
2204 tempBuf[1] = (char)(tmpSourceChar);
2205 }
2206 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2207 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2208 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2209 ++mySource;
2210 /* add another bit so that the code below writes 2 bytes in case of error */
2211 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2212 }
2213 } else {
2214 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2215 args->converter->toULength = 1;
2216 goto endloop;
2217 }
2218 } /* End of inner switch */
2219 break;
2220 } /* End of outer switch */
2221 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2222 if(args->offsets){
2223 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2224 }
2225 *(myTarget++)=(UChar)targetUniChar;
2226 }
2227 else if(targetUniChar > missingCharMarker){
2228 /* disassemble the surrogate pair and write to output*/
2229 targetUniChar-=0x0010000;
2230 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2231 if(args->offsets){
2232 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2233 }
2234 ++myTarget;
2235 if(myTarget< args->targetLimit){
2236 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2237 if(args->offsets){
2238 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2239 }
2240 ++myTarget;
2241 }else{
2242 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2243 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2244 }
2245
2246 }
2247 else{
2248 /* Call the callback function*/
2249 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2250 break;
2251 }
2252 }
2253 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2254 *err =U_BUFFER_OVERFLOW_ERROR;
2255 break;
2256 }
2257 }
2258 endloop:
2259 args->target = myTarget;
2260 args->source = mySource;
2261 }
2262
2263
2264 /***************************************************************
2265 * Rules for ISO-2022-KR encoding
2266 * i) The KSC5601 designator sequence should appear only once in a file,
2267 * at the beginning of a line before any KSC5601 characters. This usually
2268 * means that it appears by itself on the first line of the file
2269 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2270 * and SI to shift into single byte mode
2271 */
2272 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2273 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2274
2275 UConverter* saveConv = args->converter;
2276 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2277 args->converter=myConverterData->currentConverter;
2278
2279 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2280 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2281 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2282
2283 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2284 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2285 uprv_memcpy(
2286 saveConv->charErrorBuffer,
2287 myConverterData->currentConverter->charErrorBuffer,
2288 myConverterData->currentConverter->charErrorBufferLength);
2289 }
2290 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2291 myConverterData->currentConverter->charErrorBufferLength = 0;
2292 }
2293 args->converter=saveConv;
2294 }
2295
2296 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2297 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2298
2299 const UChar *source = args->source;
2300 const UChar *sourceLimit = args->sourceLimit;
2301 unsigned char *target = (unsigned char *) args->target;
2302 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2303 int32_t* offsets = args->offsets;
2304 uint32_t targetByteUnit = 0x0000;
2305 UChar32 sourceChar = 0x0000;
2306 UBool isTargetByteDBCS;
2307 UBool oldIsTargetByteDBCS;
2308 UConverterDataISO2022 *converterData;
2309 UConverterSharedData* sharedData;
2310 UBool useFallback;
2311 int32_t length =0;
2312
2313 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2314 /* if the version is 1 then the user is requesting
2315 * conversion with ibm-25546 pass the arguments to
2316 * MBCS converter and return
2317 */
2318 if(converterData->version==1){
2319 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2320 return;
2321 }
2322
2323 /* initialize data */
2324 sharedData = converterData->currentConverter->sharedData;
2325 useFallback = args->converter->useFallback;
2326 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2327 oldIsTargetByteDBCS = isTargetByteDBCS;
2328
2329 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2330 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2331 goto getTrail;
2332 }
2333 while(source < sourceLimit){
2334
2335 targetByteUnit = missingCharMarker;
2336
2337 if(target < (unsigned char*) args->targetLimit){
2338 sourceChar = *source++;
2339
2340 /* do not convert SO/SI/ESC */
2341 if(IS_2022_CONTROL(sourceChar)) {
2342 /* callback(illegal) */
2343 *err=U_ILLEGAL_CHAR_FOUND;
2344 args->converter->fromUChar32=sourceChar;
2345 break;
2346 }
2347
2348 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2349 if(length < 0) {
2350 length = -length; /* fallback */
2351 }
2352 /* only DBCS or SBCS characters are expected*/
2353 /* DB characters with high bit set to 1 are expected */
2354 if( length > 2 || length==0 ||
2355 (length == 1 && targetByteUnit > 0x7f) ||
2356 (length == 2 &&
2357 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2358 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2359 ) {
2360 targetByteUnit=missingCharMarker;
2361 }
2362 if (targetByteUnit != missingCharMarker){
2363
2364 oldIsTargetByteDBCS = isTargetByteDBCS;
2365 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2366 /* append the shift sequence */
2367 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2368
2369 if (isTargetByteDBCS)
2370 *target++ = UCNV_SO;
2371 else
2372 *target++ = UCNV_SI;
2373 if(offsets)
2374 *(offsets++) = (int32_t)(source - args->source-1);
2375 }
2376 /* write the targetUniChar to target */
2377 if(targetByteUnit <= 0x00FF){
2378 if( target < targetLimit){
2379 *(target++) = (unsigned char) targetByteUnit;
2380 if(offsets){
2381 *(offsets++) = (int32_t)(source - args->source-1);
2382 }
2383
2384 }else{
2385 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2386 *err = U_BUFFER_OVERFLOW_ERROR;
2387 }
2388 }else{
2389 if(target < targetLimit){
2390 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2391 if(offsets){
2392 *(offsets++) = (int32_t)(source - args->source-1);
2393 }
2394 if(target < targetLimit){
2395 *(target++) =(unsigned char) (targetByteUnit -0x80);
2396 if(offsets){
2397 *(offsets++) = (int32_t)(source - args->source-1);
2398 }
2399 }else{
2400 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2401 *err = U_BUFFER_OVERFLOW_ERROR;
2402 }
2403 }else{
2404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2405 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2406 *err = U_BUFFER_OVERFLOW_ERROR;
2407 }
2408 }
2409
2410 }
2411 else{
2412 /* oops.. the code point is unassingned
2413 * set the error and reason
2414 */
2415
2416 /*check if the char is a First surrogate*/
2417 if(UTF_IS_SURROGATE(sourceChar)) {
2418 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2419 getTrail:
2420 /*look ahead to find the trail surrogate*/
2421 if(source < sourceLimit) {
2422 /* test the following code unit */
2423 UChar trail=(UChar) *source;
2424 if(UTF_IS_SECOND_SURROGATE(trail)) {
2425 source++;
2426 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2427 *err = U_INVALID_CHAR_FOUND;
2428 /* convert this surrogate code point */
2429 /* exit this condition tree */
2430 } else {
2431 /* this is an unmatched lead code unit (1st surrogate) */
2432 /* callback(illegal) */
2433 *err=U_ILLEGAL_CHAR_FOUND;
2434 }
2435 } else {
2436 /* no more input */
2437 *err = U_ZERO_ERROR;
2438 }
2439 } else {
2440 /* this is an unmatched trail code unit (2nd surrogate) */
2441 /* callback(illegal) */
2442 *err=U_ILLEGAL_CHAR_FOUND;
2443 }
2444 } else {
2445 /* callback(unassigned) for a BMP code point */
2446 *err = U_INVALID_CHAR_FOUND;
2447 }
2448
2449 args->converter->fromUChar32=sourceChar;
2450 break;
2451 }
2452 } /* end if(myTargetIndex<myTargetLength) */
2453 else{
2454 *err =U_BUFFER_OVERFLOW_ERROR;
2455 break;
2456 }
2457
2458 }/* end while(mySourceIndex<mySourceLength) */
2459
2460 /*
2461 * the end of the input stream and detection of truncated input
2462 * are handled by the framework, but for ISO-2022-KR conversion
2463 * we need to be in ASCII mode at the very end
2464 *
2465 * conditions:
2466 * successful
2467 * not in ASCII mode
2468 * end of input and no truncated input
2469 */
2470 if( U_SUCCESS(*err) &&
2471 isTargetByteDBCS &&
2472 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2473 ) {
2474 int32_t sourceIndex;
2475
2476 /* we are switching to ASCII */
2477 isTargetByteDBCS=FALSE;
2478
2479 /* get the source index of the last input character */
2480 /*
2481 * TODO this would be simpler and more reliable if we used a pair
2482 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2483 * so that we could simply use the prevSourceIndex here;
2484 * this code gives an incorrect result for the rare case of an unmatched
2485 * trail surrogate that is alone in the last buffer of the text stream
2486 */
2487 sourceIndex=(int32_t)(source-args->source);
2488 if(sourceIndex>0) {
2489 --sourceIndex;
2490 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2491 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2492 ) {
2493 --sourceIndex;
2494 }
2495 } else {
2496 sourceIndex=-1;
2497 }
2498
2499 fromUWriteUInt8(
2500 args->converter,
2501 SHIFT_IN_STR, 1,
2502 &target, (const char *)targetLimit,
2503 &offsets, sourceIndex,
2504 err);
2505 }
2506
2507 /*save the state and return */
2508 args->source = source;
2509 args->target = (char*)target;
2510 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2511 }
2512
2513 /************************ To Unicode ***************************************/
2514
2515 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2516 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2517 UErrorCode* err){
2518 char const* sourceStart;
2519 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2520
2521 UConverterToUnicodeArgs subArgs;
2522 int32_t minArgsSize;
2523
2524 /* set up the subconverter arguments */
2525 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2526 minArgsSize = args->size;
2527 } else {
2528 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2529 }
2530
2531 uprv_memcpy(&subArgs, args, minArgsSize);
2532 subArgs.size = (uint16_t)minArgsSize;
2533 subArgs.converter = myData->currentConverter;
2534
2535 /* remember the original start of the input for offsets */
2536 sourceStart = args->source;
2537
2538 if(myData->key != 0) {
2539 /* continue with a partial escape sequence */
2540 goto escape;
2541 }
2542
2543 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2544 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2545 subArgs.source = args->source;
2546 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2547 if(subArgs.source != subArgs.sourceLimit) {
2548 /*
2549 * get the current partial byte sequence
2550 *
2551 * it needs to be moved between the public and the subconverter
2552 * so that the conversion framework, which only sees the public
2553 * converter, can handle truncated and illegal input etc.
2554 */
2555 if(args->converter->toULength > 0) {
2556 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2557 }
2558 subArgs.converter->toULength = args->converter->toULength;
2559
2560 /*
2561 * Convert up to the end of the input, or to before the next escape character.
2562 * Does not handle conversion extensions because the preToU[] state etc.
2563 * is not copied.
2564 */
2565 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2566
2567 if(args->offsets != NULL && sourceStart != args->source) {
2568 /* update offsets to base them on the actual start of the input */
2569 int32_t *offsets = args->offsets;
2570 UChar *target = args->target;
2571 int32_t delta = (int32_t)(args->source - sourceStart);
2572 while(target < subArgs.target) {
2573 if(*offsets >= 0) {
2574 *offsets += delta;
2575 }
2576 ++offsets;
2577 ++target;
2578 }
2579 }
2580 args->source = subArgs.source;
2581 args->target = subArgs.target;
2582 args->offsets = subArgs.offsets;
2583
2584 /* copy input/error/overflow buffers */
2585 if(subArgs.converter->toULength > 0) {
2586 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2587 }
2588 args->converter->toULength = subArgs.converter->toULength;
2589
2590 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2591 if(subArgs.converter->UCharErrorBufferLength > 0) {
2592 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2593 subArgs.converter->UCharErrorBufferLength);
2594 }
2595 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2596 subArgs.converter->UCharErrorBufferLength = 0;
2597 }
2598 }
2599
2600 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2601 return;
2602 }
2603
2604 escape:
2605 changeState_2022(args->converter,
2606 &(args->source),
2607 args->sourceLimit,
2608 ISO_2022_KR,
2609 err);
2610 }
2611 }
2612
2613 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2614 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2615 UErrorCode* err){
2616 char tempBuf[2];
2617 const char *mySource = ( char *) args->source;
2618 UChar *myTarget = args->target;
2619 const char *mySourceLimit = args->sourceLimit;
2620 UChar32 targetUniChar = 0x0000;
2621 UChar mySourceChar = 0x0000;
2622 UConverterDataISO2022* myData;
2623 UConverterSharedData* sharedData ;
2624 UBool useFallback;
2625
2626 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2627 if(myData->version==1){
2628 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2629 return;
2630 }
2631
2632 /* initialize state */
2633 sharedData = myData->currentConverter->sharedData;
2634 useFallback = args->converter->useFallback;
2635
2636 if(myData->key != 0) {
2637 /* continue with a partial escape sequence */
2638 goto escape;
2639 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2640 /* continue with a partial double-byte character */
2641 mySourceChar = args->converter->toUBytes[0];
2642 args->converter->toULength = 0;
2643 goto getTrailByte;
2644 }
2645
2646 while(mySource< mySourceLimit){
2647
2648 if(myTarget < args->targetLimit){
2649
2650 mySourceChar= (unsigned char) *mySource++;
2651
2652 if(mySourceChar==UCNV_SI){
2653 myData->toU2022State.g = 0;
2654 if (myData->isEmptySegment) {
2655 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2656 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2657 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2658 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2659 args->converter->toULength = 1;
2660 args->target = myTarget;
2661 args->source = mySource;
2662 return;
2663 }
2664 /*consume the source */
2665 continue;
2666 }else if(mySourceChar==UCNV_SO){
2667 myData->toU2022State.g = 1;
2668 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2669 /*consume the source */
2670 continue;
2671 }else if(mySourceChar==ESC_2022){
2672 mySource--;
2673 escape:
2674 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2675 changeState_2022(args->converter,&(mySource),
2676 mySourceLimit, ISO_2022_KR, err);
2677 if(U_FAILURE(*err)){
2678 args->target = myTarget;
2679 args->source = mySource;
2680 return;
2681 }
2682 continue;
2683 }
2684
2685 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2686 if(myData->toU2022State.g == 1) {
2687 if(mySource < mySourceLimit) {
2688 int leadIsOk, trailIsOk;
2689 uint8_t trailByte;
2690 getTrailByte:
2691 targetUniChar = missingCharMarker;
2692 trailByte = (uint8_t)*mySource;
2693 /*
2694 * Ticket 5691: consistent illegal sequences:
2695 * - We include at least the first byte in the illegal sequence.
2696 * - If any of the non-initial bytes could be the start of a character,
2697 * we stop the illegal sequence before the first one of those.
2698 *
2699 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2700 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2701 * Otherwise we convert or report the pair of bytes.
2702 */
2703 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2704 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2705 if (leadIsOk && trailIsOk) {
2706 ++mySource;
2707 tempBuf[0] = (char)(mySourceChar + 0x80);
2708 tempBuf[1] = (char)(trailByte + 0x80);
2709 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2710 mySourceChar = (mySourceChar << 8) | trailByte;
2711 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2712 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2713 ++mySource;
2714 /* add another bit so that the code below writes 2 bytes in case of error */
2715 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2716 }
2717 } else {
2718 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2719 args->converter->toULength = 1;
2720 break;
2721 }
2722 }
2723 else if(mySourceChar <= 0x7f) {
2724 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2725 } else {
2726 targetUniChar = 0xffff;
2727 }
2728 if(targetUniChar < 0xfffe){
2729 if(args->offsets) {
2730 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2731 }
2732 *(myTarget++)=(UChar)targetUniChar;
2733 }
2734 else {
2735 /* Call the callback function*/
2736 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2737 break;
2738 }
2739 }
2740 else{
2741 *err =U_BUFFER_OVERFLOW_ERROR;
2742 break;
2743 }
2744 }
2745 args->target = myTarget;
2746 args->source = mySource;
2747 }
2748
2749 /*************************** END ISO2022-KR *********************************/
2750
2751 /*************************** ISO-2022-CN *********************************
2752 *
2753 * Rules for ISO-2022-CN Encoding:
2754 * i) The designator sequence must appear once on a line before any instance
2755 * of character set it designates.
2756 * ii) If two lines contain characters from the same character set, both lines
2757 * must include the designator sequence.
2758 * iii) Once the designator sequence is known, a shifting sequence has to be found
2759 * to invoke the shifting
2760 * iv) All lines start in ASCII and end in ASCII.
2761 * v) Four shifting sequences are employed for this purpose:
2762 *
2763 * Sequence ASCII Eq Charsets
2764 * ---------- ------- ---------
2765 * SI <SI> US-ASCII
2766 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2767 * SS2 <ESC>N CNS-11643-1992 Plane 2
2768 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2769 *
2770 * vi)
2771 * SOdesignator : ESC "$" ")" finalchar_for_SO
2772 * SS2designator : ESC "$" "*" finalchar_for_SS2
2773 * SS3designator : ESC "$" "+" finalchar_for_SS3
2774 *
2775 * ESC $ ) A Indicates the bytes following SO are Chinese
2776 * characters as defined in GB 2312-80, until
2777 * another SOdesignation appears
2778 *
2779 *
2780 * ESC $ ) E Indicates the bytes following SO are as defined
2781 * in ISO-IR-165 (for details, see section 2.1),
2782 * until another SOdesignation appears
2783 *
2784 * ESC $ ) G Indicates the bytes following SO are as defined
2785 * in CNS 11643-plane-1, until another
2786 * SOdesignation appears
2787 *
2788 * ESC $ * H Indicates the two bytes immediately following
2789 * SS2 is a Chinese character as defined in CNS
2790 * 11643-plane-2, until another SS2designation
2791 * appears
 * (Meaning <ESC>N must precede every 2 byte
2793 * sequence.)
2794 *
2795 * ESC $ + I Indicates the immediate two bytes following SS3
2796 * is a Chinese character as defined in CNS
2797 * 11643-plane-3, until another SS3designation
2798 * appears
 * (Meaning <ESC>O must precede every 2 byte
2800 * sequence.)
2801 *
2802 * ESC $ + J Indicates the immediate two bytes following SS3
2803 * is a Chinese character as defined in CNS
2804 * 11643-plane-4, until another SS3designation
2805 * appears
 * (In English: <ESC>O must precede every 2 byte
2807 * sequence.)
2808 *
2809 * ESC $ + K Indicates the immediate two bytes following SS3
2810 * is a Chinese character as defined in CNS
2811 * 11643-plane-5, until another SS3designation
2812 * appears
2813 *
2814 * ESC $ + L Indicates the immediate two bytes following SS3
2815 * is a Chinese character as defined in CNS
2816 * 11643-plane-6, until another SS3designation
2817 * appears
2818 *
2819 * ESC $ + M Indicates the immediate two bytes following SS3
2820 * is a Chinese character as defined in CNS
2821 * 11643-plane-7, until another SS3designation
2822 * appears
2823 *
2824 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2825 * has its own designation information before any Chinese characters
2826 * appear
2827 *
2828 */
2829
/* The following are defined this way to make the strings truly read-only */
/*
 * Four-byte designator escape sequences for ISO-2022-CN(-EXT).
 * Each one is ESC (0x1B) '$' (0x24), then ')' (0x29), '*' (0x2A) or '+' (0x2B)
 * selecting the SO, SS2 or SS3 designation, then the final byte naming the
 * character set ('A', 'E', 'G' ... 'M'), as listed in the rules comment above.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2840
/********************** ISO2022-CN Data **************************/
/*
 * Designator sequences used by the ISO-2022-CN fromUnicode code.
 * That code reads entry [cs] for charsets below CNS_11643 and entry
 * [CNS_11643 + (cs - CNS_11643_1)] for the CNS planes, copying 4 bytes each time.
 * NOTE(review): the index order presumably mirrors the charset enum values
 * declared earlier in this file (outside this chunk) -- confirm before reordering.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,           /* ASCII */
        GB_2312_80_STR,
        ISO_IR_165_STR,
        CNS_11643_1992_Plane_1_STR,
        CNS_11643_1992_Plane_2_STR,
        CNS_11643_1992_Plane_3_STR,
        CNS_11643_1992_Plane_4_STR,
        CNS_11643_1992_Plane_5_STR,
        CNS_11643_1992_Plane_6_STR,
        CNS_11643_1992_Plane_7_STR
};
2854
/*
 * Converts UTF-16 text from args->source into ISO-2022-CN (converterData->version == 0)
 * or ISO-2022-CN-EXT (other versions) bytes in args->target, writing source offsets
 * to args->offsets when that pointer is non-NULL.
 *
 * For each code point the function emits, as needed, a 4-byte designator sequence,
 * a shift (SI, SO, ESC N or ESC O) and then one ASCII byte or two charset bytes.
 *
 * Errors reported through *err:
 *   U_ILLEGAL_CHAR_FOUND     - unpaired surrogate, or an SO/SI/ESC code point in the input
 *   U_INVALID_CHAR_FOUND     - no candidate charset maps the code point
 *   U_BUFFER_OVERFLOW_ERROR  - target is full while source input remains
 * In the error cases the offending code point is left in cnv->fromUChar32.
 * A lead surrogate at the end of the input is also kept in cnv->fromUChar32
 * (without an error) so that the next call can resume with it.
 *
 * On return, args->source and args->target are advanced past what was consumed/written.
 */
static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    /* worst case below: 4-byte designator + 2-byte single shift + 2 charset bytes */
    char buffer[8];
    int32_t len;
    /* candidate SO/G1 charsets to try, in order; rebuilt whenever choiceCount is 0 */
    int8_t choices[3];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    /* set up the state */
    converterData = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State = &converterData->fromU2022State;

    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        /* jumps into the surrogate handling inside the loop body below */
        goto getTrail;
    }

    while( source < sourceLimit){
        if(target < targetLimit){

            sourceChar = *(source++);
            /*check if the char is a First surrogate*/
            if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do the conversion */
            if(sourceChar <= 0x007f ){
                /* do not convert SO/SI/ESC */
                if(IS_2022_CONTROL(sourceChar)) {
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }

                /* US-ASCII */
                if(pFromU2022State->g == 0) {
                    buffer[0] = (char)sourceChar;
                    len = 1;
                } else {
                    /* currently shifted out: emit SI first to return to ASCII */
                    buffer[0] = UCNV_SI;
                    buffer[1] = (char)sourceChar;
                    len = 2;
                    pFromU2022State->g = 0;
                    choiceCount = 0;
                }
                if(sourceChar == CR || sourceChar == LF) {
                    /* reset the state at the end of a line */
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
                    choiceCount = 0;
                }
            }
            else{
                /* convert U+0080..U+10ffff */
                int32_t i;
                int8_t cs, g;

                if(choiceCount == 0) {
                    /* try the current SO/G1 converter first */
                    choices[0] = pFromU2022State->cs[1];

                    /* default to GB2312_1 if none is designated yet */
                    if(choices[0] == 0) {
                        choices[0] = GB2312_1;
                    }

                    if(converterData->version == 0) {
                        /* ISO-2022-CN */

                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
                        if(choices[0] == GB2312_1) {
                            choices[1] = (int8_t)CNS_11643_1;
                        } else {
                            choices[1] = (int8_t)GB2312_1;
                        }

                        choiceCount = 2;
                    } else {
                        /* ISO-2022-CN-EXT */

                        /* try one of the other converters */
                        switch(choices[0]) {
                        case GB2312_1:
                            choices[1] = (int8_t)CNS_11643_1;
                            choices[2] = (int8_t)ISO_IR_165;
                            break;
                        case ISO_IR_165:
                            choices[1] = (int8_t)GB2312_1;
                            choices[2] = (int8_t)CNS_11643_1;
                            break;
                        default: /* CNS_11643_x */
                            choices[1] = (int8_t)GB2312_1;
                            choices[2] = (int8_t)ISO_IR_165;
                            break;
                        }

                        choiceCount = 3;
                    }
                }

                cs = g = 0;
                /*
                 * len==0: no mapping found yet
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
                 * len>0: found a roundtrip result, done
                 */
                len = 0;
                /*
                 * We will turn off useFallback after finding a fallback,
                 * but we still get fallbacks from PUA code points as usual.
                 * Therefore, we will also need to check that we don't overwrite
                 * an early fallback with a later one.
                 */
                useFallback = cnv->useFallback;

                for(i = 0; i < choiceCount && len <= 0; ++i) {
                    int8_t cs0 = choices[i];
                    if(cs0 > 0) {
                        uint32_t value;
                        int32_t len2;
                        if(cs0 >= CNS_11643_0) {
                            /* all CNS planes share one table with 3-byte results */
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
                                        converterData->myConverterArray[CNS_11643],
                                        sourceChar,
                                        &value,
                                        useFallback,
                                        MBCS_OUTPUT_3);
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
                                targetValue = value;
                                /* the top byte of the 3-byte result is 0x80+plane; turn it into the plane's charset value */
                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
                                if(len2 >= 0) {
                                    len = 2;
                                } else {
                                    len = -2;
                                    useFallback = FALSE;
                                }
                                if(cs == CNS_11643_1) {
                                    g = 1;
                                } else if(cs == CNS_11643_2) {
                                    g = 2;
                                } else /* plane 3..7 */ if(converterData->version == 1) {
                                    g = 3;
                                } else {
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
                                    len = 0;
                                }
                            }
                        } else {
                            /* GB2312_1 or ISO-IR-165 */
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
                                        converterData->myConverterArray[cs0],
                                        sourceChar,
                                        &value,
                                        useFallback,
                                        MBCS_OUTPUT_2);
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
                                targetValue = value;
                                len = len2;
                                cs = cs0;
                                g = 1;
                                /* cleared for roundtrips too; in that case the loop ends anyway because len > 0 */
                                useFallback = FALSE;
                            }
                        }
                    }
                }

                if(len != 0) {
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */

                    /* write the designation sequence if necessary */
                    if(cs != pFromU2022State->cs[g]) {
                        if(cs < CNS_11643) {
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
                        } else {
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
                        }
                        len = 4;
                        pFromU2022State->cs[g] = cs;
                        if(g == 1) {
                            /* changing the SO/G1 charset invalidates the choices[] */
                            choiceCount = 0;
                        }
                    }

                    /* write the shift sequence if necessary */
                    if(g != pFromU2022State->g) {
                        switch(g) {
                        case 1:
                            buffer[len++] = UCNV_SO;

                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
                            pFromU2022State->g = 1;
                            break;
                        case 2:
                            /* SS2 as ESC N */
                            buffer[len++] = 0x1b;
                            buffer[len++] = 0x4e;
                            break;
                        default: /* case 3 */
                            /* SS3 as ESC O */
                            buffer[len++] = 0x1b;
                            buffer[len++] = 0x4f;
                            break;
                        }
                    }

                    /* write the two output bytes */
                    buffer[len++] = (char)(targetValue >> 8);
                    buffer[len++] = (char)targetValue;
                } else {
                    /* if we cannot find the character after checking all codepages
                     * then this is an error
                     */
                    *err = U_INVALID_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* output len>0 bytes in buffer[] */
            if(len == 1) {
                /* fast path: room for one byte is guaranteed by the target<targetLimit test above */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(len == 2 && (target + 2) <= targetLimit) {
                /* fast path for two bytes that fit into the target */
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* longer output, or output that may not fit: use the general writer, which reports through err */
                fromUWriteUInt8(
                    cnv,
                    buffer, len,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-CN conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        pFromU2022State->g!=0 &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        /* we are switching to ASCII */
        pFromU2022State->g=0;

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more for a trail surrogate; at index 0 this yields -1 */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        /* emit the final SI */
        fromUWriteUInt8(
            cnv,
            SHIFT_IN_STR, 1,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
3199
3200
/*
 * Converts ISO-2022-CN(-EXT) bytes from args->source into UTF-16 in args->target,
 * writing source offsets to args->offsets when that pointer is non-NULL.
 *
 * Byte handling in the main loop:
 *   SI       - selects G0 (ASCII); an SO segment with no characters is reported as an error
 *   SO       - selects G1 if a G1 charset has been designated, otherwise falls to the callback path
 *   ESC      - handed to changeState_2022() to parse the escape sequence
 *   CR / LF  - reset the whole toUnicode state, then are converted like other bytes
 *   other    - one ASCII byte when g == 0, otherwise a two-byte character of the current charset
 *
 * Resumption: a non-zero myData->key means a partial escape sequence is pending;
 * toULength == 1 means the lead byte of a double-byte character was saved in toUBytes[0].
 *
 * Errors reported through *err: U_ILLEGAL_ESCAPE_SEQUENCE (empty segment), whatever
 * changeState_2022() or toUnicodeCallback() set, and U_BUFFER_OVERFLOW_ERROR when the
 * target is full while source input remains.
 * args->source and args->target are updated before returning.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    /* holds up to a plane byte plus the two input bytes for the CNS lookup */
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
                    /* targetUniChar is still missingCharMarker, so the code after the switch calls the callback */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022() sees the ESC byte itself */
                mySource--;
escape:
                {
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        /* previous length plus the bytes consumed by this call */
                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* a line end clears the toUnicode state (designations and shift) */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS planes share one table: prefix the pair with 0x80+plane */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* the trail byte is not in this buffer: save the lead byte for the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* ASCII mode: bytes above 0x7f keep missingCharMarker and go to the callback */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar; the offset points at the first input byte of the character */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: store it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    args->target = myTarget;
    args->source = mySource;
}
3401
3402 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3404 UConverter *cnv = args->converter;
3405 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3406 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3407 char *p, *subchar;
3408 char buffer[8];
3409 int32_t length;
3410
3411 subchar=(char *)cnv->subChars;
3412 length=cnv->subCharLen; /* assume length==1 for most variants */
3413
3414 p = buffer;
3415 switch(myConverterData->locale[0]){
3416 case 'j':
3417 {
3418 int8_t cs;
3419
3420 if(pFromU2022State->g == 1) {
3421 /* JIS7: switch from G1 to G0 */
3422 pFromU2022State->g = 0;
3423 *p++ = UCNV_SI;
3424 }
3425
3426 cs = pFromU2022State->cs[0];
3427 if(cs != ASCII && cs != JISX201) {
3428 /* not in ASCII or JIS X 0201: switch to ASCII */
3429 pFromU2022State->cs[0] = (int8_t)ASCII;
3430 *p++ = '\x1b';
3431 *p++ = '\x28';
3432 *p++ = '\x42';
3433 }
3434
3435 *p++ = subchar[0];
3436 break;
3437 }
3438 case 'c':
3439 if(pFromU2022State->g != 0) {
3440 /* not in ASCII mode: switch to ASCII */
3441 pFromU2022State->g = 0;
3442 *p++ = UCNV_SI;
3443 }
3444 *p++ = subchar[0];
3445 break;
3446 case 'k':
3447 if(myConverterData->version == 0) {
3448 if(length == 1) {
3449 if((UBool)args->converter->fromUnicodeStatus) {
3450 /* in DBCS mode: switch to SBCS */
3451 args->converter->fromUnicodeStatus = 0;
3452 *p++ = UCNV_SI;
3453 }
3454 *p++ = subchar[0];
3455 } else /* length == 2*/ {
3456 if(!(UBool)args->converter->fromUnicodeStatus) {
3457 /* in SBCS mode: switch to DBCS */
3458 args->converter->fromUnicodeStatus = 1;
3459 *p++ = UCNV_SO;
3460 }
3461 *p++ = subchar[0];
3462 *p++ = subchar[1];
3463 }
3464 break;
3465 } else {
3466 /* save the subconverter's substitution string */
3467 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3468 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3469
3470 /* set our substitution string into the subconverter */
3471 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3472 myConverterData->currentConverter->subCharLen = (int8_t)length;
3473
3474 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3475 args->converter = myConverterData->currentConverter;
3476 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3477 ucnv_cbFromUWriteSub(args, 0, err);
3478 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3479 args->converter = cnv;
3480
3481 /* restore the subconverter's substitution string */
3482 myConverterData->currentConverter->subChars = currentSubChars;
3483 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3484
3485 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3486 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3487 uprv_memcpy(
3488 cnv->charErrorBuffer,
3489 myConverterData->currentConverter->charErrorBuffer,
3490 myConverterData->currentConverter->charErrorBufferLength);
3491 }
3492 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3493 myConverterData->currentConverter->charErrorBufferLength = 0;
3494 }
3495 return;
3496 }
3497 default:
3498 /* not expected */
3499 break;
3500 }
3501 ucnv_cbFromUWriteBytes(args,
3502 buffer, (int32_t)(p - buffer),
3503 offsetIndex, err);
3504 }
3505
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 *
 * The member order matters: _ISO_2022_SafeClone() hands &currentConverter to
 * ucnv_safeClone() with a size of sizeof(UConverter)+sizeof(UAlignedMemory),
 * so deadSpace must directly follow currentConverter.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned main converter; its address is what SafeClone returns */
    UConverter currentConverter;    /* storage for the cloned sub-converter, if there is one */
    UAlignedMemory deadSpace;       /* padding that absorbs re-alignment of currentConverter */
    UConverterDataISO2022 mydata;   /* copy of the ISO 2022 extra data; cnv.extraInfo points here */
};
3524
3525
3526 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3527 _ISO_2022_SafeClone(
3528 const UConverter *cnv,
3529 void *stackBuffer,
3530 int32_t *pBufferSize,
3531 UErrorCode *status)
3532 {
3533 struct cloneStruct * localClone;
3534 UConverterDataISO2022 *cnvData;
3535 int32_t i, size;
3536
3537 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3538 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3539 return NULL;
3540 }
3541
3542 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3543 localClone = (struct cloneStruct *)stackBuffer;
3544
3545 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3546
3547 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3548 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3549 localClone->cnv.isExtraLocal = TRUE;
3550
3551 /* share the subconverters */
3552
3553 if(cnvData->currentConverter != NULL) {
3554 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3555 localClone->mydata.currentConverter =
3556 ucnv_safeClone(cnvData->currentConverter,
3557 &localClone->currentConverter,
3558 &size, status);
3559 if(U_FAILURE(*status)) {
3560 return NULL;
3561 }
3562 }
3563
3564 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3565 if(cnvData->myConverterArray[i] != NULL) {
3566 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3567 }
3568 }
3569
3570 return &localClone->cnv;
3571 }
3572
3573 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3574 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3575 const USetAdder *sa,
3576 UConverterUnicodeSet which,
3577 UErrorCode *pErrorCode)
3578 {
3579 int32_t i;
3580 UConverterDataISO2022* cnvData;
3581
3582 if (U_FAILURE(*pErrorCode)) {
3583 return;
3584 }
3585 #ifdef U_ENABLE_GENERIC_ISO_2022
3586 if (cnv->sharedData == &_ISO2022Data) {
3587 /* We use UTF-8 in this case */
3588 sa->addRange(sa->set, 0, 0xd7FF);
3589 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3590 return;
3591 }
3592 #endif
3593
3594 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3595
3596 /* open a set and initialize it with code points that are algorithmically round-tripped */
3597 switch(cnvData->locale[0]){
3598 case 'j':
3599 /* include JIS X 0201 which is hardcoded */
3600 sa->add(sa->set, 0xa5);
3601 sa->add(sa->set, 0x203e);
3602 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3603 /* include Latin-1 for some variants of JP */
3604 sa->addRange(sa->set, 0, 0xff);
3605 } else {
3606 /* include ASCII for JP */
3607 sa->addRange(sa->set, 0, 0x7f);
3608 }
3609 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3610 /*
3611 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3612 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3613 * use half-width Katakana.
3614 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3615 * half-width Katakana via the ESC ( I sequence.
3616 * However, we only emit (fromUnicode) half-width Katakana according to the
3617 * definition of each variant.
3618 *
3619 * When including fallbacks,
3620 * we need to include half-width Katakana Unicode code points for all JP variants because
3621 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3622 */
3623 /* include half-width Katakana for JP */
3624 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3625 }
3626 break;
3627 case 'c':
3628 case 'z':
3629 /* include ASCII for CN */
3630 sa->addRange(sa->set, 0, 0x7f);
3631 break;
3632 case 'k':
3633 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3634 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3635 cnvData->currentConverter, sa, which, pErrorCode);
3636 /* the loop over myConverterArray[] will simply not find another converter */
3637 break;
3638 default:
3639 break;
3640 }
3641
3642 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3643 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3644 cnvData->version==0 && i==CNS_11643
3645 ) {
3646 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3647 ucnv_MBCSGetUnicodeSetForBytes(
3648 cnvData->myConverterArray[i],
3649 sa, UCNV_ROUNDTRIP_SET,
3650 0, 0x81, 0x82,
3651 pErrorCode);
3652 }
3653 #endif
3654
3655 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3656 UConverterSetFilter filter;
3657 if(cnvData->myConverterArray[i]!=NULL) {
3658 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3659 cnvData->version==0 && i==CNS_11643
3660 ) {
3661 /*
3662 * Version-specific for CN:
3663 * CN version 0 does not map CNS planes 3..7 although
3664 * they are all available in the CNS conversion table;
3665 * CN version 1 (-EXT) does map them all.
3666 * The two versions create different Unicode sets.
3667 */
3668 filter=UCNV_SET_FILTER_2022_CN;
3669 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3670 /*
3671 * Only add code points that map to Shift-JIS codes
3672 * corresponding to JIS X 0208.
3673 */
3674 filter=UCNV_SET_FILTER_SJIS;
3675 } else if(i==KSC5601) {
3676 /*
3677 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3678 * are broader than GR94.
3679 */
3680 filter=UCNV_SET_FILTER_GR94DBCS;
3681 } else {
3682 filter=UCNV_SET_FILTER_NONE;
3683 }
3684 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3685 }
3686 }
3687
3688 /*
3689 * ISO 2022 converters must not convert SO/SI/ESC despite what
3690 * sub-converters do by themselves.
3691 * Remove these characters from the set.
3692 */
3693 sa->remove(sa->set, 0x0e);
3694 sa->remove(sa->set, 0x0f);
3695 sa->remove(sa->set, 0x1b);
3696
3697 /* ISO 2022 converters do not convert C1 controls either */
3698 sa->removeRange(sa->set, 0x80, 0x9f);
3699 }
3700
/*
 * Function table for the generic "ISO_2022" converter.
 * The four conversion entries are only populated when U_ENABLE_GENERIC_ISO_2022
 * is defined (generic toUnicode, UTF-8 fromUnicode); otherwise they are NULL.
 * NOTE(review): the meaning of each position is fixed by the UConverterImpl
 * declaration, which is not visible in this chunk -- confirm there before editing.
 */
static const UConverterImpl _ISO2022Impl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

#ifdef U_ENABLE_GENERIC_ISO_2022
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
#else
    NULL,
    NULL,
    NULL,
    NULL,
#endif
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/*
 * Static description of the generic "ISO_2022" converter.
 * NOTE(review): field meanings follow the UConverterStaticData declaration,
 * which is outside this chunk; only the entries commented below are certain from here.
 */
static const UConverterStaticData _ISO2022StaticData={
    sizeof(UConverterStaticData),
    "ISO_2022", /* converter name */
    2022,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a) -- confirm against the struct declaration */
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data record for the generic "ISO_2022" converter; it ties together
 * _ISO2022StaticData and _ISO2022Impl. Unlike the JP and KR records below it
 * has external linkage (no 'static').
 * NOTE(review): the remaining fields follow the UConverterSharedData declaration,
 * which is outside this chunk.
 */
const UConverterSharedData _ISO2022Data={
    sizeof(UConverterSharedData),
    ~((uint32_t) 0),
    NULL,
    NULL,
    &_ISO2022StaticData,
    FALSE,
    &_ISO2022Impl,
    0
};
3756
/*************JP****************/
/*
 * Function table for ISO-2022-JP. The same JP toUnicode function fills both
 * toUnicode entries and the same JP fromUnicode function fills both fromUnicode
 * entries; open/close/reset, getName, writeSub, safeClone and getUnicodeSet are
 * the functions shared by all ISO 2022 variants in this file.
 * NOTE(review): slot positions are defined by UConverterImpl (declared outside this chunk).
 */
static const UConverterImpl _ISO2022JPImpl={
    UCNV_ISO_2022,

    NULL,
    NULL,

    _ISO2022Open,
    _ISO2022Close,
    _ISO2022Reset,

    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
    NULL,

    NULL,
    _ISO2022getName,
    _ISO_2022_WriteSub,
    _ISO_2022_SafeClone,
    _ISO_2022_GetUnicodeSet
};
/*
 * Static description of the ISO-2022-JP converter.
 * NOTE(review): field meanings follow the UConverterStaticData declaration,
 * which is outside this chunk; only the entries commented below are certain from here.
 */
static const UConverterStaticData _ISO2022JPStaticData={
    sizeof(UConverterStaticData),
    "ISO_2022_JP", /* converter name */
    0,
    UCNV_IBM,
    UCNV_ISO_2022,
    1,
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
    { 0x1a, 0, 0, 0 }, /* presumably the substitution bytes (0x1a) -- confirm against the struct declaration */
    1,
    FALSE,
    FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
3796 static const UConverterSharedData _ISO2022JPData={
3797 sizeof(UConverterSharedData),
3798 ~((uint32_t) 0),
3799 NULL,
3800 NULL,
3801 &_ISO2022JPStaticData,
3802 FALSE,
3803 &_ISO2022JPImpl,
3804 0
3805 };
3806
3807 /************* KR ***************/
3808 static const UConverterImpl _ISO2022KRImpl={
3809 UCNV_ISO_2022,
3810
3811 NULL,
3812 NULL,
3813
3814 _ISO2022Open,
3815 _ISO2022Close,
3816 _ISO2022Reset,
3817
3818 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3819 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3820 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3821 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3822 NULL,
3823
3824 NULL,
3825 _ISO2022getName,
3826 _ISO_2022_WriteSub,
3827 _ISO_2022_SafeClone,
3828 _ISO_2022_GetUnicodeSet
3829 };
3830 static const UConverterStaticData _ISO2022KRStaticData={
3831 sizeof(UConverterStaticData),
3832 "ISO_2022_KR",
3833 0,
3834 UCNV_IBM,
3835 UCNV_ISO_2022,
3836 1,
3837 3, /* max 3 bytes per UChar: SO+DBCS */
3838 { 0x1a, 0, 0, 0 },
3839 1,
3840 FALSE,
3841 FALSE,
3842 0,
3843 0,
3844 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3845 };
3846 static const UConverterSharedData _ISO2022KRData={
3847 sizeof(UConverterSharedData),
3848 ~((uint32_t) 0),
3849 NULL,
3850 NULL,
3851 &_ISO2022KRStaticData,
3852 FALSE,
3853 &_ISO2022KRImpl,
3854 0
3855 };
3856
3857 /*************** CN ***************/
3858 static const UConverterImpl _ISO2022CNImpl={
3859
3860 UCNV_ISO_2022,
3861
3862 NULL,
3863 NULL,
3864
3865 _ISO2022Open,
3866 _ISO2022Close,
3867 _ISO2022Reset,
3868
3869 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3870 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3871 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3872 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3873 NULL,
3874
3875 NULL,
3876 _ISO2022getName,
3877 _ISO_2022_WriteSub,
3878 _ISO_2022_SafeClone,
3879 _ISO_2022_GetUnicodeSet
3880 };
3881 static const UConverterStaticData _ISO2022CNStaticData={
3882 sizeof(UConverterStaticData),
3883 "ISO_2022_CN",
3884 0,
3885 UCNV_IBM,
3886 UCNV_ISO_2022,
3887 1,
3888 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3889 { 0x1a, 0, 0, 0 },
3890 1,
3891 FALSE,
3892 FALSE,
3893 0,
3894 0,
3895 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3896 };
3897 static const UConverterSharedData _ISO2022CNData={
3898 sizeof(UConverterSharedData),
3899 ~((uint32_t) 0),
3900 NULL,
3901 NULL,
3902 &_ISO2022CNStaticData,
3903 FALSE,
3904 &_ISO2022CNImpl,
3905 0
3906 };
3907
3908
3909
#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */
3911