1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
 *   08/29/2000  Ram         Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "ucnv_imp.h"
38 #include "ucnv_bld.h"
39 #include "ucnv_cnv.h"
40 #include "ucnvmbcs.h"
41 #include "cstring.h"
42 #include "cmemory.h"
43
/* Element count of a true array (not a pointer), expressed as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof(*(array))))
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
/* The SI (0x0F) and SO (0x0E) control bytes as NUL-terminated one-byte strings. */
static const char SHIFT_IN_STR[] = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the ASCII whitespace/control characters named below. */
#define CR 0x0D
#define LF 0x0A
#define H_TAB 0x09
#define V_TAB 0x0B
#define SPACE 0x20

/* First and last code point of the range treated as halfwidth Katakana (per the HWKANA name). */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
91
/*
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
 * as bytes 21..7E. (Subtract 0x80.)
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
 * as bytes 20..7F. (Subtract 0x80.)
 * Do not encode C1 control codes with native bytes 80..9F
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START=0xa1, /* first native byte of a 94-character set */
    GR94_END=0xfe,   /* last native byte of a 94-character set */
    GR96_START=0xa0, /* first native byte of a 96-character set */
    GR96_END=0xff    /* last native byte of a 96-character set */
};
106
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range check is performed on the value converted to uint32_t so that a
 * negative argument is rejected before it could be used as a shift count
 * (shifting by a negative amount is undefined behavior).
 * The argument is expected to be a code point that fits in 32 bits;
 * note that it is evaluated twice.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
114
/*
 * for ISO-2022-JP and -CN implementations
 *
 * Charset/state numbers. The JP and CN groups deliberately reuse the same
 * small numeric values (for example ISO8859_1 and GB2312_1 are both 1);
 * which group applies depends on the variant that is in use.
 */
typedef enum {
    /* shared values */
    INVALID_STATE=-1,
    ASCII = 0,

    /* single-shift markers; SS3_STATE implicitly gets the value 0x11 */
    SS2_STATE=0x10,
    SS3_STATE,

    /* JP */
    ISO8859_1 = 1 ,
    ISO8859_7 = 2 ,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 =6,
    KSC5601 =7,
    HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1=1,
    ISO_IR_165=2,
    CNS_11643=3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0=0x20, /* the following constants count up implicitly: 0x21..0x27 */
    CNS_11643_1,
    CNS_11643_2,
    CNS_11643_3,
    CNS_11643_4,
    CNS_11643_5,
    CNS_11643_6,
    CNS_11643_7
} StateEnum;
153
/*
 * is the StateEnum charset value for a DBCS charset?
 * True for the JP values JISX208..KSC5601 (JISX208, JISX212, GB2312, KSC5601).
 * The argument is evaluated twice.
 */
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)

/* Charset mask: the single bit at position cs, used to build the entries of jpCharsetMasks[]. */
#define CSM(cs) ((uint16_t)1<<(cs))
158
/*
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
 *
 * Note: The converter uses some leniency:
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
 *   all versions, not just JIS7 and JIS8.
 * - ICU does not distinguish between different versions of JIS X 0208.
 *
 * As written, the entries for versions 2, 3 and 4 contain the same set of charsets.
 */
enum { MAX_JA_VERSION=4 };
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
    /* version 0 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
    /* version 1: adds JISX212 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
    /* version 2: adds GB2312, KSC5601, ISO8859_1, ISO8859_7 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    /* version 3 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
    /* version 4 */
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};
176
/*
 * Type tag stored in UConverterDataISO2022.currentType;
 * _ISO2022Open() initializes that field to ASCII1.
 */
typedef enum {
    ASCII1=0,
    LATIN1,
    SBCS,
    DBCS,
    MBCS,
    HWKANA
}Cnv2022Type;
185
/*
 * Designation/shift state for one conversion direction.
 * UConverterDataISO2022 holds one instance for toUnicode and one for fromUnicode;
 * _ISO2022Reset() clears them to all zeros.
 */
typedef struct ISO2022State {
    int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG; /* g before single shift (SS2 or SS3) */
} ISO2022State;
191
/* Mask applied to the open options to extract the converter version (see _ISO2022Open()). */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS 10

/*
 * Per-converter data for the ISO-2022 variants.
 * _ISO2022Open() allocates it, zero-fills it and stores it in UConverter.extraInfo;
 * _ISO2022Close() releases it.
 */
typedef struct{
    /* shared data of the sub-charsets loaded by _ISO2022Open(), indexed by StateEnum values; unused slots are NULL */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() for the Korean variant; closed in _ISO2022Close() */
    UConverter *currentConverter;
    Cnv2022Type currentType;
    /* separate charset/shift state for each conversion direction */
    ISO2022State toU2022State, fromU2022State;
    /* escape-sequence matching state kept between calls of changeState_2022(); 0 after a reset */
    uint32_t key;
    /* version from the open options, possibly adjusted by _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* cleared by a toUnicode reset; NOTE(review): the code that sets it is not in this part of the file */
    UBool isEmptySegment;
    /* converter name returned by _ISO2022getName(), e.g. "ISO_2022,locale=ja,version=0" */
    char name[30];
    /* "ja", "ko" or "cn" as set by _ISO2022Open(); stays empty for the generic converter */
    char locale[3];
}UConverterDataISO2022;
209
210 /* Protos */
211 /* ISO-2022 ----------------------------------------------------------------- */
212
/*
 * Forward declaration
 * These UTF-8 fromUnicode functions are declared here with U_CFUNC linkage;
 * their definitions are not part of this section of the file.
 */
U_CFUNC void
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
220
#define ESC_2022 0x1B /*ESC*/

/*
 * Result of matching one more byte of an escape sequence against the state table;
 * these are the values stored in escSeqStateTable_Value_2022[] and returned by getKey_2022().
 */
typedef enum
{
    INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
    VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
    VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
    VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
} UCNV_TableStates_2022;
230
231 /*
232 * The way these state transition arrays work is:
233 * ex : ESC$B is the sequence for JISX208
234 * a) First Iteration: char is ESC
235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
236 * int x = normalize_esq_chars_2022[27] which is equal to 1
237 * ii) Search for this value in escSeqStateTable_Key_2022[]
238 * value of x is stored at escSeqStateTable_Key_2022[0]
239 * iii) Save this index as offset
240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
242 * b) Switch on this state and continue to next char
243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
244 * which is normalize_esq_chars_2022[36] == 4
245 * ii) x is currently 1(from above)
246 * x<<=5 -- x is now 32
247 * x+=normalize_esq_chars_2022[36]
248 * now x is 36
249 * iii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
253 * c) Switch on this state and continue to next char
254 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
255 * ii) x is currently 36 (from above)
256 * x<<=5 -- x is now 1152
257 * x+=normalize_esq_chars_2022[66]
258 * now x is 1161
259 * iii) Search for this value in escSeqStateTable_Key_2022[]
 *          value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 *      iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
 *          escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *      v)  Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
264 */
265
266
/*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to the small non-zero code used to build escape-sequence keys
 * (see getKey_2022(): key = (key << 5) + code). A value of 0 means that the byte
 * cannot occur anywhere in a recognized escape sequence.
 * Each row holds 10 entries; the trailing comment gives the index of the row's first entry.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*       0      1       2       3       4      5       6        7       8       9           */

         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /*   0 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /*  10 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0       /*  20: ESC (27) */
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29     ,0       /*  30: '$' '%' '&' */
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0       /*  40: '(' ')' '*' '+' '-' '.' '/' */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /*  50 */
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12      /*  60: '@' 'A'..'E' */
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28      /*  70: 'F'..'O' */
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0       /*  80: 'R' */
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /*  90: 'Z' */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 100 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 110 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 120 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 130 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 140 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 150 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 160 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 170 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 180 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 190 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 200 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 210 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 220 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 230 */
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0       /* 240 */
        ,0     ,0      ,0      ,0      ,0      ,0                                       /* 250 */
};
298
299 #ifdef U_ENABLE_GENERIC_ISO_2022
300 /*
301 * When the generic ISO-2022 converter is completely removed, not just disabled
302 * per #ifdef, then the following state table and the associated tables that are
303 * dimensioned with MAX_STATES_2022 should be trimmed.
304 *
305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
306 * the associated escape sequences starting with ESC ( B should be removed.
307 * This includes the ones with key values 1097 and all of the ones above 1000000.
308 *
309 * For the latter, the tables can simply be truncated.
310 * For the former, since the tables must be kept parallel, it is probably best
311 * to simply duplicate an adjacent table cell, parallel in all tables.
312 *
313 * It may make sense to restructure the tables, especially by using small search
314 * tables for the variants instead of indexing them parallel to the table here.
315 */
316 #endif
317
/* Number of entries in each of the parallel escape-sequence tables. */
#define MAX_STATES_2022 74
/*
 * Keys of all recognized escape-sequence prefixes and complete sequences.
 * getKey_2022() looks keys up with a binary search, so the values must stay in
 * ascending order; the index found here is also the index into the parallel tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*   0           1           2           3           4           5           6           7           8           9           */

     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
    ,35947631   ,35947635   ,35947636   ,35947638
};
331
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter names for the generic ISO-2022 converter, parallel to escSeqStateTable_Key_2022[].
 * changeState_2022() opens the named converter for a complete sequence;
 * a NULL entry for a complete sequence is reported there as unsupported (SS2 or SS3).
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */

     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"             ,"ISO-IR-165"
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
};

#endif
348
/*
 * Match status (a UCNV_TableStates_2022 value) for each key, parallel to escSeqStateTable_Key_2022[]:
 * whether the sequence matched so far is only a prefix, is complete, or (index 10 only)
 * is complete but may also be continued.
 */
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
};
360
361
/*
 * Type def for refactoring changeState_2022 code
 * Identifies the ISO-2022 variant on whose behalf changeState_2022() is called.
 * The generic ISO_2022 value only exists when U_ENABLE_GENERIC_ISO_2022 is defined.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022=0,
#endif
    ISO_2022_JP=1,
    ISO_2022_KR=2,
    ISO_2022_CN=3
} Variant2022;
371
/*********** ISO 2022 Converter Protos ***********/
/* Allocates and initializes the per-converter data and selects the variant from the locale. */
static void
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

/* Releases the sub-converters and the per-converter data. */
static void
_ISO2022Close(UConverter *converter);

/* Resets the toUnicode and/or fromUnicode state, depending on choice. */
static void
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

/* Returns the name stored in the per-converter data, or NULL if there is none. */
static const char*
_ISO2022getName(const UConverter* cnv);

/* NOTE(review): the definitions of the next two functions are not in this part of the file. */
static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

static UConverter *
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);

#ifdef U_ENABLE_GENERIC_ISO_2022
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif

/* Shared data objects of the three variants; _ISO2022Open() points cnv->sharedData at one of them. */
/*const UConverterSharedData _ISO2022Data;*/
static const UConverterSharedData _ISO2022JPData;
static const UConverterSharedData _ISO2022KRData;
static const UConverterSharedData _ISO2022CNData;
400
401 /*************** Converter implementations ******************/
402
403 /* The purpose of this function is to get around gcc compiler warnings. */
404 static U_INLINE void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)405 fromUWriteUInt8(UConverter *cnv,
406 const char *bytes, int32_t length,
407 uint8_t **target, const char *targetLimit,
408 int32_t **offsets,
409 int32_t sourceIndex,
410 UErrorCode *pErrorCode)
411 {
412 char *targetChars = (char *)*target;
413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
414 offsets, sourceIndex, pErrorCode);
415 *target = (uint8_t*)targetChars;
416
417 }
418
419 static U_INLINE void
setInitialStateToUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
421 if(myConverterData->version == 1) {
422 UConverter *cnv = myConverterData->currentConverter;
423
424 cnv->toUnicodeStatus=0; /* offset */
425 cnv->mode=0; /* state */
426 cnv->toULength=0; /* byteIndex */
427 }
428 }
429
430 static U_INLINE void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
432 /* in ISO-2022-KR the designator sequence appears only once
433 * in a file so we append it only once
434 */
435 if( converter->charErrorBufferLength==0){
436
437 converter->charErrorBufferLength = 4;
438 converter->charErrorBuffer[0] = 0x1b;
439 converter->charErrorBuffer[1] = 0x24;
440 converter->charErrorBuffer[2] = 0x29;
441 converter->charErrorBuffer[3] = 0x43;
442 }
443 if(myConverterData->version == 1) {
444 UConverter *cnv = myConverterData->currentConverter;
445
446 cnv->fromUChar32=0;
447 cnv->fromUnicodeStatus=1; /* prevLength */
448 }
449 }
450
451 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
453
454 char myLocale[6]={' ',' ',' ',' ',' ',' '};
455
456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
457 if(cnv->extraInfo != NULL) {
458 UConverterNamePieces stackPieces;
459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) };
460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
461 uint32_t version;
462
463 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
464
465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
466 myConverterData->currentType = ASCII1;
467 cnv->fromUnicodeStatus =FALSE;
468 if(pArgs->locale){
469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
470 }
471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
472 myConverterData->version = version;
473
474 /* BEGIN android-changed */
475 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
476 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
477 if((myLocale[0]=='j' &&
478 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
479 myLocale[1]=='s') &&
480 (myLocale[2]=='_' || myLocale[2]=='\0')))
481 {
482 size_t len=0;
483 /* open the required converters and cache them */
484 if(version>MAX_JA_VERSION) {
485 /* prevent indexing beyond jpCharsetMasks[] */
486 myConverterData->version = version = 0;
487 }
488 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
489 myConverterData->myConverterArray[ISO8859_7] =
490 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
491 }
492 if (myLocale[1]=='k') { /* Use KDDI's version. */
493 myConverterData->myConverterArray[JISX208] =
494 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
495 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */
496 myConverterData->myConverterArray[JISX208] =
497 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
498 } else {
499 myConverterData->myConverterArray[JISX208] =
500 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
501 }
502 /* END android-changed */
503
504 if(jpCharsetMasks[version]&CSM(JISX212)) {
505 myConverterData->myConverterArray[JISX212] =
506 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
507 }
508 if(jpCharsetMasks[version]&CSM(GB2312)) {
509 myConverterData->myConverterArray[GB2312] =
510 /* BEGIN android-changed */
511 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
512 /* END android-changed */
513 }
514 if(jpCharsetMasks[version]&CSM(KSC5601)) {
515 myConverterData->myConverterArray[KSC5601] =
516 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
517 }
518
519 /* set the function pointers to appropriate funtions */
520 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
521 uprv_strcpy(myConverterData->locale,"ja");
522
523 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
524 len = uprv_strlen(myConverterData->name);
525 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
526 myConverterData->name[len+1]='\0';
527 }
528 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
529 (myLocale[2]=='_' || myLocale[2]=='\0'))
530 {
531 const char *cnvName;
532 if(version==1) {
533 cnvName="icu-internal-25546";
534 } else {
535 /* BEGIN android-changed */
536 cnvName="ksc_5601";
537 /* END android-changed */
538 myConverterData->version=version=0;
539 }
540 if(pArgs->onlyTestIsLoadable) {
541 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
542 uprv_free(cnv->extraInfo);
543 cnv->extraInfo=NULL;
544 return;
545 } else {
546 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
547 if (U_FAILURE(*errorCode)) {
548 _ISO2022Close(cnv);
549 return;
550 }
551
552 if(version==1) {
553 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
554 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
555 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
556 }else{
557 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
558 }
559
560 /* initialize the state variables */
561 setInitialStateToUnicodeKR(cnv, myConverterData);
562 setInitialStateFromUnicodeKR(cnv, myConverterData);
563
564 /* set the function pointers to appropriate funtions */
565 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
566 uprv_strcpy(myConverterData->locale,"ko");
567 }
568 }
569 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
570 (myLocale[2]=='_' || myLocale[2]=='\0'))
571 {
572
573 /* open the required converters and cache them */
574 /* BEGIN android-changed */
575 myConverterData->myConverterArray[GB2312_1] =
576 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
577 if(version==1) {
578 myConverterData->myConverterArray[ISO_IR_165] =
579 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
580 }
581 myConverterData->myConverterArray[CNS_11643] =
582 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
583 /* END android-changed */
584
585
586 /* set the function pointers to appropriate funtions */
587 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
588 uprv_strcpy(myConverterData->locale,"cn");
589
590 if (version==0){
591 myConverterData->version = 0;
592 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
593 }else if (version==1){
594 myConverterData->version = 1;
595 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
596 }else {
597 myConverterData->version = 2;
598 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
599 }
600 }
601 else{
602 #ifdef U_ENABLE_GENERIC_ISO_2022
603 myConverterData->isFirstBuffer = TRUE;
604
605 /* append the UTF-8 escape sequence */
606 cnv->charErrorBufferLength = 3;
607 cnv->charErrorBuffer[0] = 0x1b;
608 cnv->charErrorBuffer[1] = 0x25;
609 cnv->charErrorBuffer[2] = 0x42;
610
611 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
612 /* initialize the state variables */
613 uprv_strcpy(myConverterData->name,"ISO_2022");
614 #else
615 *errorCode = U_UNSUPPORTED_ERROR;
616 return;
617 #endif
618 }
619
620 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
621
622 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
623 _ISO2022Close(cnv);
624 }
625 } else {
626 *errorCode = U_MEMORY_ALLOCATION_ERROR;
627 }
628 }
629
630
631 static void
_ISO2022Close(UConverter * converter)632 _ISO2022Close(UConverter *converter) {
633 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
634 UConverterSharedData **array = myData->myConverterArray;
635 int32_t i;
636
637 if (converter->extraInfo != NULL) {
638 /*close the array of converter pointers and free the memory*/
639 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
640 if(array[i]!=NULL) {
641 ucnv_unloadSharedDataIfReady(array[i]);
642 }
643 }
644
645 ucnv_close(myData->currentConverter);
646
647 if(!converter->isExtraLocal){
648 uprv_free (converter->extraInfo);
649 converter->extraInfo = NULL;
650 }
651 }
652 }
653
654 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)655 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
656 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
657 if(choice<=UCNV_RESET_TO_UNICODE) {
658 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
659 myConverterData->key = 0;
660 myConverterData->isEmptySegment = FALSE;
661 }
662 if(choice!=UCNV_RESET_TO_UNICODE) {
663 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
664 }
665 #ifdef U_ENABLE_GENERIC_ISO_2022
666 if(myConverterData->locale[0] == 0){
667 if(choice<=UCNV_RESET_TO_UNICODE) {
668 myConverterData->isFirstBuffer = TRUE;
669 myConverterData->key = 0;
670 if (converter->mode == UCNV_SO){
671 ucnv_close (myConverterData->currentConverter);
672 myConverterData->currentConverter=NULL;
673 }
674 converter->mode = UCNV_SI;
675 }
676 if(choice!=UCNV_RESET_TO_UNICODE) {
677 /* re-append UTF-8 escape sequence */
678 converter->charErrorBufferLength = 3;
679 converter->charErrorBuffer[0] = 0x1b;
680 converter->charErrorBuffer[1] = 0x28;
681 converter->charErrorBuffer[2] = 0x42;
682 }
683 }
684 else
685 #endif
686 {
687 /* reset the state variables */
688 if(myConverterData->locale[0] == 'k'){
689 if(choice<=UCNV_RESET_TO_UNICODE) {
690 setInitialStateToUnicodeKR(converter, myConverterData);
691 }
692 if(choice!=UCNV_RESET_TO_UNICODE) {
693 setInitialStateFromUnicodeKR(converter, myConverterData);
694 }
695 }
696 }
697 }
698
699 static const char*
_ISO2022getName(const UConverter * cnv)700 _ISO2022getName(const UConverter* cnv){
701 if(cnv->extraInfo){
702 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
703 return myData->name;
704 }
705 return NULL;
706 }
707
708
/*************** to unicode *******************/
/****************************************************************************
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JISX-201
 * <ESC>(H  JISX-201 (the table maps this sequence to the same state as <ESC>(J)
 * <ESC>(I  JISX-201 halfwidth Katakana (state HWKANA_7BIT)
 * <ESC>$B  JISX-208
 * <ESC>$@  JISX-208
 * <ESC>&@  JISX-208
 * <ESC>$(D JISX-212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 * <ESC>N   single shift 2 (SS2_STATE)
 *
 * The table is parallel to escSeqStateTable_Key_2022[]: entry i is the StateEnum
 * value for the escape sequence with key escSeqStateTable_Key_2022[i], or
 * INVALID_STATE if the JP variant does not use that sequence.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
734
/*************** to unicode *******************/
/*
 * CN counterpart of nextStateToUnicodeJP[], parallel to escSeqStateTable_Key_2022[].
 * The non-invalid entries are:
 * <ESC>N   SS2_STATE
 * <ESC>O   SS3_STATE
 * <ESC>$)A GB2312_1
 * <ESC>$)E ISO_IR_165
 * <ESC>$)G CNS_11643_1
 * <ESC>$*H CNS_11643_2
 * <ESC>$+I .. <ESC>$+M  CNS_11643_3 .. CNS_11643_7
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
};
747
748
749 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)750 getKey_2022(char c,int32_t* key,int32_t* offset){
751 int32_t togo;
752 int32_t low = 0;
753 int32_t hi = MAX_STATES_2022;
754 int32_t oldmid=0;
755
756 togo = normalize_esq_chars_2022[(uint8_t)c];
757 if(togo == 0) {
758 /* not a valid character anywhere in an escape sequence */
759 *key = 0;
760 *offset = 0;
761 return INVALID_2022;
762 }
763 togo = (*key << 5) + togo;
764
765 while (hi != low) /*binary search*/{
766
767 register int32_t mid = (hi+low) >> 1; /*Finds median*/
768
769 if (mid == oldmid)
770 break;
771
772 if (escSeqStateTable_Key_2022[mid] > togo){
773 hi = mid;
774 }
775 else if (escSeqStateTable_Key_2022[mid] < togo){
776 low = mid;
777 }
778 else /*we found it*/{
779 *key = togo;
780 *offset = mid;
781 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
782 }
783 oldmid = mid;
784
785 }
786
787 *key = 0;
788 *offset = 0;
789 return INVALID_2022;
790 }
791
/*runs through a state machine to determine the escape sequence - codepage correspondence
 */
794 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)795 changeState_2022(UConverter* _this,
796 const char** source,
797 const char* sourceLimit,
798 Variant2022 var,
799 UErrorCode* err){
800 UCNV_TableStates_2022 value;
801 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
802 uint32_t key = myData2022->key;
803 int32_t offset = 0;
804 int8_t initialToULength = _this->toULength;
805 char c;
806
807 value = VALID_NON_TERMINAL_2022;
808 while (*source < sourceLimit) {
809 c = *(*source)++;
810 _this->toUBytes[_this->toULength++]=(uint8_t)c;
811 value = getKey_2022(c,(int32_t *) &key, &offset);
812
813 switch (value){
814
815 case VALID_NON_TERMINAL_2022 :
816 /* continue with the loop */
817 break;
818
819 case VALID_TERMINAL_2022:
820 key = 0;
821 goto DONE;
822
823 case INVALID_2022:
824 goto DONE;
825
826 case VALID_MAYBE_TERMINAL_2022:
827 #ifdef U_ENABLE_GENERIC_ISO_2022
828 /* ESC ( B is ambiguous only for ISO_2022 itself */
829 if(var == ISO_2022) {
830 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
831 _this->toULength = 0;
832
833 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
834
835 /* continue with the loop */
836 value = VALID_NON_TERMINAL_2022;
837 break;
838 } else
839 #endif
840 {
841 /* not ISO_2022 itself, finish here */
842 value = VALID_TERMINAL_2022;
843 key = 0;
844 goto DONE;
845 }
846 }
847 }
848
849 DONE:
850 myData2022->key = key;
851
852 if (value == VALID_NON_TERMINAL_2022) {
853 /* indicate that the escape sequence is incomplete: key!=0 */
854 return;
855 } else if (value == INVALID_2022 ) {
856 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
857 } else /* value == VALID_TERMINAL_2022 */ {
858 switch(var){
859 #ifdef U_ENABLE_GENERIC_ISO_2022
860 case ISO_2022:
861 {
862 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
863 if(chosenConverterName == NULL) {
864 /* SS2 or SS3 */
865 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
866 _this->toUCallbackReason = UCNV_UNASSIGNED;
867 return;
868 }
869
870 _this->mode = UCNV_SI;
871 ucnv_close(myData2022->currentConverter);
872 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
873 if(U_SUCCESS(*err)) {
874 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
875 _this->mode = UCNV_SO;
876 }
877 break;
878 }
879 #endif
880 case ISO_2022_JP:
881 {
882 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
883 switch(tempState) {
884 case INVALID_STATE:
885 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
886 break;
887 case SS2_STATE:
888 if(myData2022->toU2022State.cs[2]!=0) {
889 if(myData2022->toU2022State.g<2) {
890 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
891 }
892 myData2022->toU2022State.g=2;
893 } else {
894 /* illegal to have SS2 before a matching designator */
895 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
896 }
897 break;
898 /* case SS3_STATE: not used in ISO-2022-JP-x */
899 case ISO8859_1:
900 case ISO8859_7:
901 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
902 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
903 } else {
904 /* G2 charset for SS2 */
905 myData2022->toU2022State.cs[2]=(int8_t)tempState;
906 }
907 break;
908 default:
909 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
910 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
911 } else {
912 /* G0 charset */
913 myData2022->toU2022State.cs[0]=(int8_t)tempState;
914 }
915 break;
916 }
917 }
918 break;
919 case ISO_2022_CN:
920 {
921 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
922 switch(tempState) {
923 case INVALID_STATE:
924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
925 break;
926 case SS2_STATE:
927 if(myData2022->toU2022State.cs[2]!=0) {
928 if(myData2022->toU2022State.g<2) {
929 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
930 }
931 myData2022->toU2022State.g=2;
932 } else {
933 /* illegal to have SS2 before a matching designator */
934 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
935 }
936 break;
937 case SS3_STATE:
938 if(myData2022->toU2022State.cs[3]!=0) {
939 if(myData2022->toU2022State.g<2) {
940 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
941 }
942 myData2022->toU2022State.g=3;
943 } else {
944 /* illegal to have SS3 before a matching designator */
945 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
946 }
947 break;
948 case ISO_IR_165:
949 if(myData2022->version==0) {
950 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
951 break;
952 }
953 /*fall through*/
954 case GB2312_1:
955 /*fall through*/
956 case CNS_11643_1:
957 myData2022->toU2022State.cs[1]=(int8_t)tempState;
958 break;
959 case CNS_11643_2:
960 myData2022->toU2022State.cs[2]=(int8_t)tempState;
961 break;
962 default:
963 /* other CNS 11643 planes */
964 if(myData2022->version==0) {
965 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
966 } else {
967 myData2022->toU2022State.cs[3]=(int8_t)tempState;
968 }
969 break;
970 }
971 }
972 break;
973 case ISO_2022_KR:
974 if(offset==0x30){
975 /* nothing to be done, just accept this one escape sequence */
976 } else {
977 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
978 }
979 break;
980
981 default:
982 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
983 break;
984 }
985 }
986 if(U_SUCCESS(*err)) {
987 _this->toULength = 0;
988 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
989 if(_this->toULength>1) {
990 /*
991 * Ticket 5691: consistent illegal sequences:
992 * - We include at least the first byte (ESC) in the illegal sequence.
993 * - If any of the non-initial bytes could be the start of a character,
994 * we stop the illegal sequence before the first one of those.
995 * In escape sequences, all following bytes are "printable", that is,
996 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
997 * they are valid single/lead bytes.
998 * For simplicity, we always only report the initial ESC byte as the
999 * illegal sequence and back out all other bytes we looked at.
1000 */
1001 /* Back out some bytes. */
1002 int8_t backOutDistance=_this->toULength-1;
1003 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1004 if(backOutDistance<=bytesFromThisBuffer) {
1005 /* same as initialToULength<=1 */
1006 *source-=backOutDistance;
1007 } else {
1008 /* Back out bytes from the previous buffer: Need to replay them. */
1009 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1010 /* same as -(initialToULength-1) */
1011 /* preToULength is negative! */
1012 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1013 *source-=bytesFromThisBuffer;
1014 }
1015 _this->toULength=1;
1016 }
1017 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1018 _this->toUCallbackReason = UCNV_UNASSIGNED;
1019 }
1020 }
1021
1022 /*Checks the characters of the buffer against valid 2022 escape sequences
1023 *if the match we return a pointer to the initial start of the sequence otherwise
1024 *we return sourceLimit
1025 */
1026 /*for 2022 looks ahead in the stream
1027 *to determine the longest possible convertible
1028 *data stream
1029 */
1030 static U_INLINE const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool flush)1031 getEndOfBuffer_2022(const char** source,
1032 const char* sourceLimit,
1033 UBool flush){
1034
1035 const char* mySource = *source;
1036
1037 #ifdef U_ENABLE_GENERIC_ISO_2022
1038 if (*source >= sourceLimit)
1039 return sourceLimit;
1040
1041 do{
1042
1043 if (*mySource == ESC_2022){
1044 int8_t i;
1045 int32_t key = 0;
1046 int32_t offset;
1047 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1048
1049 /* Kludge: I could not
1050 * figure out the reason for validating an escape sequence
1051 * twice - once here and once in changeState_2022().
1052 * is it possible to have an ESC character in a ISO2022
1053 * byte stream which is valid in a code page? Is it legal?
1054 */
1055 for (i=0;
1056 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1057 i++) {
1058 value = getKey_2022(*(mySource+i), &key, &offset);
1059 }
1060 if (value > 0 || *mySource==ESC_2022)
1061 return mySource;
1062
1063 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1064 return sourceLimit;
1065 }
1066 }while (++mySource < sourceLimit);
1067
1068 return sourceLimit;
1069 #else
1070 while(mySource < sourceLimit && *mySource != ESC_2022) {
1071 ++mySource;
1072 }
1073 return mySource;
1074 #endif
1075 }
1076
1077
1078 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1079 * any future change in _MBCSFromUChar32() function should be reflected here.
1080 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1081 */
1082 static U_INLINE int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1083 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1084 UChar32 c,
1085 uint32_t* value,
1086 UBool useFallback,
1087 int outputType)
1088 {
1089 const int32_t *cx;
1090 const uint16_t *table;
1091 uint32_t stage2Entry;
1092 uint32_t myValue;
1093 int32_t length;
1094 const uint8_t *p;
1095 /*
1096 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1097 * Use internal version of ucnv_open() that verifies that the new structures are available,
1098 * else U_INTERNAL_PROGRAM_ERROR.
1099 */
1100 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1101 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1102 table=sharedData->mbcs.fromUnicodeTable;
1103 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1104 /* get the bytes and the length for the output */
1105 if(outputType==MBCS_OUTPUT_2){
1106 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1107 if(myValue<=0xff) {
1108 length=1;
1109 } else {
1110 length=2;
1111 }
1112 } else /* outputType==MBCS_OUTPUT_3 */ {
1113 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1114 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1115 if(myValue<=0xff) {
1116 length=1;
1117 } else if(myValue<=0xffff) {
1118 length=2;
1119 } else {
1120 length=3;
1121 }
1122 }
1123 /* is this code point assigned, or do we use fallbacks? */
1124 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1125 /* assigned */
1126 *value=myValue;
1127 return length;
1128 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1129 /*
1130 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1131 * There is no way with this data structure for fallback output
1132 * to be a zero byte.
1133 */
1134 *value=myValue;
1135 return -length;
1136 }
1137 }
1138
1139 cx=sharedData->mbcs.extIndexes;
1140 if(cx!=NULL) {
1141 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1142 }
1143
1144 /* unassigned */
1145 return 0;
1146 }
1147
1148 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1149 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1150 * @param retval pointer to output byte
1151 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1152 */
1153 static U_INLINE int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1154 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1155 UChar32 c,
1156 uint32_t* retval,
1157 UBool useFallback)
1158 {
1159 const uint16_t *table;
1160 int32_t value;
1161 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1162 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1163 return 0;
1164 }
1165 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1166 table=sharedData->mbcs.fromUnicodeTable;
1167 /* get the byte for the output */
1168 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1169 /* is this code point assigned, or do we use fallbacks? */
1170 *retval=(uint32_t)(value&0xff);
1171 if(value>=0xf00) {
1172 return 1; /* roundtrip */
1173 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1174 return -1; /* fallback taken */
1175 } else {
1176 return 0; /* no mapping */
1177 }
1178 }
1179
/*
 * Accepts only a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) and moves it to the ISO 2022 range 21..7E by
 * subtracting 0x80 from each byte.
 * Returns 0 if a byte is out of range.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* the pair as a whole must lie in A1A1..FEFE */
    if((uint16_t)(value - 0xa1a1) > (0xfefe - 0xa1a1)) {
        return 0; /* not valid for ISO 2022 */
    }
    /* the trail byte must lie in A1..FE as well */
    if((uint8_t)(value - 0xa1) > (0xfe - 0xa1)) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
1196
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code
 * point and returns the sum if both of its bytes are in the range A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t shifted = value + 0x8080;

    if((uint16_t)(shifted - 0xa1a1) > (0xfefe - 0xa1a1)) {
        return value;
    }
    if((uint8_t)(shifted - 0xa1) > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
#endif
1214
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * toUnicode implementation of the generic ISO-2022 converter.
 *
 * Alternates between two steps until the input is used up or something
 * needs to be reported to the caller:
 * 1. outside of an escape sequence (key==0), the bytes up to the limit
 *    returned by getEndOfBuffer_2022() are converted with the current
 *    sub-converter (opened as "ASCII" if there is none yet);
 * 2. changeState_2022() consumes the escape sequence that follows.
 * Overflow output, invalid input and partial input of the sub-converter are
 * copied into the ISO-2022 converter before returning.
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    UConverter* outerCnv = args->converter;
    UConverterDataISO2022* cnvData = ((UConverterDataISO2022*)(outerCnv->extraInfo));
    const char* bufferLimit = args->sourceLimit;
    const char* segmentLimit;
    const char* segmentStart;
    const UChar* targetStart;
    int8_t pending;

    while (args->source < bufferLimit) {
        if(cnvData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            segmentLimit = getEndOfBuffer_2022(&(args->source), bufferLimit, args->flush);

            if(args->source < segmentLimit) {
                UConverter* subCnv;

                if(cnvData->currentConverter==NULL) {
                    cnvData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    cnvData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    outerCnv->mode = UCNV_SO;
                }
                subCnv = cnvData->currentConverter;

                /* convert to before the ESC or until the end of the buffer */
                cnvData->isFirstBuffer=FALSE;
                segmentStart = args->source;
                targetStart = args->target;
                args->converter = subCnv;
                ucnv_toUnicode(args->converter,
                               &args->target,
                               args->targetLimit,
                               &args->source,
                               segmentLimit,
                               args->offsets,
                               (UBool)(args->flush && segmentLimit == bufferLimit),
                               err);
                args->converter = outerCnv;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    outerCnv->UCharErrorBufferLength = subCnv->UCharErrorBufferLength;
                    pending = outerCnv->UCharErrorBufferLength;
                    subCnv->UCharErrorBufferLength = 0;
                    if(pending > 0) {
                        uprv_memcpy(outerCnv->UCharErrorBuffer,
                                    subCnv->UCharErrorBuffer,
                                    pending*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == bufferLimit) ||
                    (args->offsets != NULL && (args->target != targetStart || args->source != segmentStart)) ||
                    (segmentLimit < bufferLimit && subCnv->toULength > 0)
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        outerCnv->invalidCharLength = subCnv->invalidCharLength;
                        pending = outerCnv->invalidCharLength;
                        if(pending > 0) {
                            uprv_memcpy(outerCnv->invalidCharBuffer, subCnv->invalidCharBuffer, pending);
                        }
                    } else {
                        outerCnv->toULength = subCnv->toULength;
                        pending = outerCnv->toULength;
                        if(pending > 0) {
                            uprv_memcpy(outerCnv->toUBytes, subCnv->toUBytes, pending);
                            if(args->source < segmentLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an escape sequence from the previous buffer */
        segmentStart = args->source;
        changeState_2022(args->converter,
                         &(args->source),
                         bufferLimit,
                         ISO_2022,
                         err);
        if (U_FAILURE(*err) || (args->source != segmentStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1328
1329 /*
1330 * To Unicode Callback helper function
1331 */
1332 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1333 toUnicodeCallback(UConverter *cnv,
1334 const uint32_t sourceChar, const uint32_t targetUniChar,
1335 UErrorCode* err){
1336 if(sourceChar>0xff){
1337 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1338 cnv->toUBytes[1] = (uint8_t)sourceChar;
1339 cnv->toULength = 2;
1340 }
1341 else{
1342 cnv->toUBytes[0] =(char) sourceChar;
1343 cnv->toULength = 1;
1344 }
1345
1346 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1347 *err = U_INVALID_CHAR_FOUND;
1348 }
1349 else{
1350 *err = U_ILLEGAL_CHAR_FOUND;
1351 }
1352 }
1353
1354 /**************************************ISO-2022-JP*************************************************/
1355
1356 /************************************** IMPORTANT **************************************************
1357 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1358 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1359 * The converter iterates over each Unicode codepoint
1360 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1361 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1362 * would do as far as possible.
1363 *
1364 * If the implementation of these macros or structure of sharedData struct change in the future, make
1365 * sure that ISO-2022 is also changed.
1366 ***************************************************************************************************
1367 */
1368
1369 /***************************************************************************************************
1370 * Rules for ISO-2022-jp encoding
1371 * (i) Escape sequences must be fully contained within a line they should not
1372 * span new lines or CRs
1373 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1374 * JIS-Roman character escape sequence should follow before the line terminates
1375 * (iii) If the first character on the line is represented by two bytes then a two
1376 * byte character escape sequence should precede it
1377 * (iv) If no escape sequence is encountered then the characters are ASCII
1378 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1379 * and invoked with SS2 (ESC N).
1380 * (vi) If there is any G0 designation in text, there must be a switch to
1381 * ASCII or to JIS X 0201-Roman before a space character (but not
1382 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1383 * characters such as tab or CRLF.
 * (vii) Supported encodings:
1385 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1386 *
1387 * source : RFC-1554
1388 *
1389 * JISX201, JISX208,JISX212 : new .cnv data files created
1390 * KSC5601 : alias to ibm-949 mapping table
1391 * GB2312 : alias to ibm-1386 mapping table
1392 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
 * ISO-8859-7 : alias to ibm-9409 mapping table
1394 */
1395
1396 /* preference order of JP charsets */
1397 static const StateEnum jpCharsetPref[]={
1398 ASCII,
1399 JISX201,
1400 ISO8859_1,
1401 ISO8859_7,
1402 JISX208,
1403 JISX212,
1404 GB2312,
1405 KSC5601,
1406 HWKANA_7BIT
1407 };
1408
/*
 * Escape sequences that designate the JP charsets, as NUL-padded byte
 * strings (byte values are spelled numerically, not as character literals).
 *
 * The entries must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },       /* <ESC>(B  ASCII       */
    { 0x1B, 0x2E, 0x41 },       /* <ESC>.A  ISO-8859-1  */
    { 0x1B, 0x2E, 0x46 },       /* <ESC>.F  ISO-8859-7  */
    { 0x1B, 0x28, 0x4A },       /* <ESC>(J  JISX-201    */
    { 0x1B, 0x24, 0x42 },       /* <ESC>$B  JISX-208    */
    { 0x1B, 0x24, 0x28, 0x44 }, /* <ESC>$(D JISX-212    */
    { 0x1B, 0x24, 0x41 },       /* <ESC>$A  GB2312      */
    { 0x1B, 0x24, 0x28, 0x43 }, /* <ESC>$(C KSC5601     */
    { 0x1B, 0x28, 0x49 }        /* <ESC>(I  HWKANA_7BIT */
};
/* Lengths in bytes of the escape sequences in escSeqChars[], in the same order. */
static const int8_t escSeqCharsLen[] ={
    3, 3, 3,    /* <ESC>(B ASCII, <ESC>.A ISO-8859-1, <ESC>.F ISO-8859-7 */
    3, 3,       /* <ESC>(J JISX-201, <ESC>$B JISX-208 */
    4,          /* <ESC>$(D JISX-212 */
    3,          /* <ESC>$A GB2312 */
    4,          /* <ESC>$(C KSC5601 */
    3           /* <ESC>(I HWKANA_7BIT */
};
1436
1437 /*
1438 * The iteration over various code pages works this way:
1439 * i) Get the currentState from myConverterData->currentState
1440 * ii) Check if the character is mapped to a valid character in the currentState
1441 * Yes -> a) set the initIterState to currentState
1442 * b) remain in this state until an invalid character is found
1443 * No -> a) go to the next code page and find the character
 * iii) Before changing the state, increment the current state and check whether it
 * is equal to the initIterState
1446 * Yes -> A character that cannot be represented in any of the supported encodings
1447 * break and return a U_INVALID_CHARACTER error
1448 * No -> Continue and find the character in next code page
1449 *
1450 *
1451 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1452 */
1453
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * everything maps to itself except 5C and 7E.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;    /* U+00A5 */
    case 0x7e:
        return 0x203e;  /* U+203E */
    default:
        return value;
    }
}
1467
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * U+005C and U+007E have no mapping here; their byte values are taken by
 * U+00A5 and U+203E. Return U+FFFE if unmappable.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1482
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;
    uint32_t lead; /* lead byte, kept in the upper byte position */

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* leads up to 9F and leads E0..EF have different offsets */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    lead <<= 1;

    if(trail > 0x9e) /* trail <= 0xfc */ {
        /* second row of the pair */
        return lead | (trail - 0x7e);
    }

    /* first row of the pair */
    lead -= 0x100;
    if(trail <= 0x7e) {
        lead |= trail - 0x1f;
    } else {
        lead |= trail - 0x20;
    }
    return lead;
}
1518
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if((c1&1) == 0) {
        /* even lead byte: the trail byte goes to the upper half */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0; /* invalid */
        }
    } else {
        /* odd lead byte: the trail byte goes to the lower half, which skips 7F */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0; /* invalid */
        }
    }

    /* two JIS X 0208 lead bytes share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0; /* invalid */
    }

    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1555
1556 /*
1557 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1558 * Katakana.
1559 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1560 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1561 * These were the only fallbacks in ICU's jisx-208.ucm file.
1562 */
1563 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1564 0x2123, /* U+FF61 */
1565 0x2156,
1566 0x2157,
1567 0x2122,
1568 0x2126,
1569 0x2572,
1570 0x2521,
1571 0x2523,
1572 0x2525,
1573 0x2527,
1574 0x2529,
1575 0x2563,
1576 0x2565,
1577 0x2567,
1578 0x2543,
1579 0x213C, /* U+FF70 */
1580 0x2522,
1581 0x2524,
1582 0x2526,
1583 0x2528,
1584 0x252A,
1585 0x252B,
1586 0x252D,
1587 0x252F,
1588 0x2531,
1589 0x2533,
1590 0x2535,
1591 0x2537,
1592 0x2539,
1593 0x253B,
1594 0x253D,
1595 0x253F, /* U+FF80 */
1596 0x2541,
1597 0x2544,
1598 0x2546,
1599 0x2548,
1600 0x254A,
1601 0x254B,
1602 0x254C,
1603 0x254D,
1604 0x254E,
1605 0x254F,
1606 0x2552,
1607 0x2555,
1608 0x2558,
1609 0x255B,
1610 0x255E,
1611 0x255F, /* U+FF90 */
1612 0x2560,
1613 0x2561,
1614 0x2562,
1615 0x2564,
1616 0x2566,
1617 0x2568,
1618 0x2569,
1619 0x256A,
1620 0x256B,
1621 0x256C,
1622 0x256D,
1623 0x256F,
1624 0x2573,
1625 0x212B,
1626 0x212C /* U+FF9F */
1627 };
1628
1629 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1630 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1631 UConverter *cnv = args->converter;
1632 UConverterDataISO2022 *converterData;
1633 ISO2022State *pFromU2022State;
1634 uint8_t *target = (uint8_t *) args->target;
1635 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1636 const UChar* source = args->source;
1637 const UChar* sourceLimit = args->sourceLimit;
1638 int32_t* offsets = args->offsets;
1639 UChar32 sourceChar;
1640 char buffer[8];
1641 int32_t len, outLen;
1642 int8_t choices[10];
1643 int32_t choiceCount;
1644 uint32_t targetValue = 0;
1645 UBool useFallback;
1646
1647 int32_t i;
1648 int8_t cs, g;
1649
1650 /* set up the state */
1651 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1652 pFromU2022State = &converterData->fromU2022State;
1653
1654 choiceCount = 0;
1655
1656 /* check if the last codepoint of previous buffer was a lead surrogate*/
1657 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1658 goto getTrail;
1659 }
1660
1661 while(source < sourceLimit) {
1662 if(target < targetLimit) {
1663
1664 sourceChar = *(source++);
1665 /*check if the char is a First surrogate*/
1666 if(UTF_IS_SURROGATE(sourceChar)) {
1667 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1668 getTrail:
1669 /*look ahead to find the trail surrogate*/
1670 if(source < sourceLimit) {
1671 /* test the following code unit */
1672 UChar trail=(UChar) *source;
1673 if(UTF_IS_SECOND_SURROGATE(trail)) {
1674 source++;
1675 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1676 cnv->fromUChar32=0x00;
1677 /* convert this supplementary code point */
1678 /* exit this condition tree */
1679 } else {
1680 /* this is an unmatched lead code unit (1st surrogate) */
1681 /* callback(illegal) */
1682 *err=U_ILLEGAL_CHAR_FOUND;
1683 cnv->fromUChar32=sourceChar;
1684 break;
1685 }
1686 } else {
1687 /* no more input */
1688 cnv->fromUChar32=sourceChar;
1689 break;
1690 }
1691 } else {
1692 /* this is an unmatched trail code unit (2nd surrogate) */
1693 /* callback(illegal) */
1694 *err=U_ILLEGAL_CHAR_FOUND;
1695 cnv->fromUChar32=sourceChar;
1696 break;
1697 }
1698 }
1699
1700 /* do not convert SO/SI/ESC */
1701 if(IS_2022_CONTROL(sourceChar)) {
1702 /* callback(illegal) */
1703 *err=U_ILLEGAL_CHAR_FOUND;
1704 cnv->fromUChar32=sourceChar;
1705 break;
1706 }
1707
1708 /* do the conversion */
1709
1710 if(choiceCount == 0) {
1711 uint16_t csm;
1712
1713 /*
1714 * The csm variable keeps track of which charsets are allowed
1715 * and not used yet while building the choices[].
1716 */
1717 csm = jpCharsetMasks[converterData->version];
1718 choiceCount = 0;
1719
1720 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1721 if(converterData->version == 3 || converterData->version == 4) {
1722 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1723 }
1724 /* Do not try single-byte half-width Katakana for other versions. */
1725 csm &= ~CSM(HWKANA_7BIT);
1726
1727 /* try the current G0 charset */
1728 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1729 csm &= ~CSM(cs);
1730
1731 /* try the current G2 charset */
1732 if((cs = pFromU2022State->cs[2]) != 0) {
1733 choices[choiceCount++] = cs;
1734 csm &= ~CSM(cs);
1735 }
1736
1737 /* try all the other possible charsets */
1738 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1739 cs = (int8_t)jpCharsetPref[i];
1740 if(CSM(cs) & csm) {
1741 choices[choiceCount++] = cs;
1742 csm &= ~CSM(cs);
1743 }
1744 }
1745 }
1746
1747 cs = g = 0;
1748 /*
1749 * len==0: no mapping found yet
1750 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1751 * len>0: found a roundtrip result, done
1752 */
1753 len = 0;
1754 /*
1755 * We will turn off useFallback after finding a fallback,
1756 * but we still get fallbacks from PUA code points as usual.
1757 * Therefore, we will also need to check that we don't overwrite
1758 * an early fallback with a later one.
1759 */
1760 useFallback = cnv->useFallback;
1761
1762 for(i = 0; i < choiceCount && len <= 0; ++i) {
1763 uint32_t value;
1764 int32_t len2;
1765 int8_t cs0 = choices[i];
1766 switch(cs0) {
1767 case ASCII:
1768 if(sourceChar <= 0x7f) {
1769 targetValue = (uint32_t)sourceChar;
1770 len = 1;
1771 cs = cs0;
1772 g = 0;
1773 }
1774 break;
1775 case ISO8859_1:
1776 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1777 targetValue = (uint32_t)sourceChar - 0x80;
1778 len = 1;
1779 cs = cs0;
1780 g = 2;
1781 }
1782 break;
1783 case HWKANA_7BIT:
1784 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1785 if(converterData->version==3) {
1786 /* JIS7: use G1 (SO) */
1787 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1788 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1789 len = 1;
1790 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1791 g = 1;
1792 } else if(converterData->version==4) {
1793 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1794 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1795 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1796 len = 1;
1797
1798 cs = pFromU2022State->cs[0];
1799 if(IS_JP_DBCS(cs)) {
1800 /* switch from a DBCS charset to JISX201 */
1801 cs = (int8_t)JISX201;
1802 }
1803 /* else stay in the current G0 charset */
1804 g = 0;
1805 }
1806 /* else do not use HWKANA_7BIT with other versions */
1807 }
1808 break;
1809 case JISX201:
1810 /* G0 SBCS */
1811 value = jisx201FromU(sourceChar);
1812 if(value <= 0x7f) {
1813 targetValue = value;
1814 len = 1;
1815 cs = cs0;
1816 g = 0;
1817 useFallback = FALSE;
1818 }
1819 break;
1820 case JISX208:
1821 /* G0 DBCS from Shift-JIS table */
1822 len2 = MBCS_FROM_UCHAR32_ISO2022(
1823 converterData->myConverterArray[cs0],
1824 sourceChar, &value,
1825 useFallback, MBCS_OUTPUT_2);
1826 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1827 value = _2022FromSJIS(value);
1828 if(value != 0) {
1829 targetValue = value;
1830 len = len2;
1831 cs = cs0;
1832 g = 0;
1833 useFallback = FALSE;
1834 }
1835 } else if(len == 0 && useFallback &&
1836 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1837 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1838 len = -2;
1839 cs = cs0;
1840 g = 0;
1841 useFallback = FALSE;
1842 }
1843 break;
1844 case ISO8859_7:
1845 /* G0 SBCS forced to 7-bit output */
1846 len2 = MBCS_SINGLE_FROM_UCHAR32(
1847 converterData->myConverterArray[cs0],
1848 sourceChar, &value,
1849 useFallback);
1850 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1851 targetValue = value - 0x80;
1852 len = len2;
1853 cs = cs0;
1854 g = 2;
1855 useFallback = FALSE;
1856 }
1857 break;
1858 default:
1859 /* G0 DBCS */
1860 len2 = MBCS_FROM_UCHAR32_ISO2022(
1861 converterData->myConverterArray[cs0],
1862 sourceChar, &value,
1863 useFallback, MBCS_OUTPUT_2);
1864 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1865 if(cs0 == KSC5601) {
1866 /*
1867 * Check for valid bytes for the encoding scheme.
1868 * This is necessary because the sub-converter (windows-949)
1869 * has a broader encoding scheme than is valid for 2022.
1870 */
1871 value = _2022FromGR94DBCS(value);
1872 if(value == 0) {
1873 break;
1874 }
1875 }
1876 targetValue = value;
1877 len = len2;
1878 cs = cs0;
1879 g = 0;
1880 useFallback = FALSE;
1881 }
1882 break;
1883 }
1884 }
1885
1886 if(len != 0) {
1887 if(len < 0) {
1888 len = -len; /* fallback */
1889 }
1890 outLen = 0; /* count output bytes */
1891
1892 /* write SI if necessary (only for JIS7) */
1893 if(pFromU2022State->g == 1 && g == 0) {
1894 buffer[outLen++] = UCNV_SI;
1895 pFromU2022State->g = 0;
1896 }
1897
1898 /* write the designation sequence if necessary */
1899 if(cs != pFromU2022State->cs[g]) {
1900 int32_t escLen = escSeqCharsLen[cs];
1901 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1902 outLen += escLen;
1903 pFromU2022State->cs[g] = cs;
1904
1905 /* invalidate the choices[] */
1906 choiceCount = 0;
1907 }
1908
1909 /* write the shift sequence if necessary */
1910 if(g != pFromU2022State->g) {
1911 switch(g) {
1912 /* case 0 handled before writing escapes */
1913 case 1:
1914 buffer[outLen++] = UCNV_SO;
1915 pFromU2022State->g = 1;
1916 break;
1917 default: /* case 2 */
1918 buffer[outLen++] = 0x1b;
1919 buffer[outLen++] = 0x4e;
1920 break;
1921 /* no case 3: no SS3 in ISO-2022-JP-x */
1922 }
1923 }
1924
1925 /* write the output bytes */
1926 if(len == 1) {
1927 buffer[outLen++] = (char)targetValue;
1928 } else /* len == 2 */ {
1929 buffer[outLen++] = (char)(targetValue >> 8);
1930 buffer[outLen++] = (char)targetValue;
1931 }
1932 } else {
1933 /*
1934 * if we cannot find the character after checking all codepages
1935 * then this is an error
1936 */
1937 *err = U_INVALID_CHAR_FOUND;
1938 cnv->fromUChar32=sourceChar;
1939 break;
1940 }
1941
1942 if(sourceChar == CR || sourceChar == LF) {
1943 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1944 pFromU2022State->cs[2] = 0;
1945 choiceCount = 0;
1946 }
1947
1948 /* output outLen>0 bytes in buffer[] */
1949 if(outLen == 1) {
1950 *target++ = buffer[0];
1951 if(offsets) {
1952 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1953 }
1954 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1955 *target++ = buffer[0];
1956 *target++ = buffer[1];
1957 if(offsets) {
1958 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1959 *offsets++ = sourceIndex;
1960 *offsets++ = sourceIndex;
1961 }
1962 } else {
1963 fromUWriteUInt8(
1964 cnv,
1965 buffer, outLen,
1966 &target, (const char *)targetLimit,
1967 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1968 err);
1969 if(U_FAILURE(*err)) {
1970 break;
1971 }
1972 }
1973 } /* end if(myTargetIndex<myTargetLength) */
1974 else{
1975 *err =U_BUFFER_OVERFLOW_ERROR;
1976 break;
1977 }
1978
1979 }/* end while(mySourceIndex<mySourceLength) */
1980
1981 /*
1982 * the end of the input stream and detection of truncated input
1983 * are handled by the framework, but for ISO-2022-JP conversion
1984 * we need to be in ASCII mode at the very end
1985 *
1986 * conditions:
1987 * successful
1988 * in SO mode or not in ASCII mode
1989 * end of input and no truncated input
1990 */
1991 if( U_SUCCESS(*err) &&
1992 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1993 args->flush && source>=sourceLimit && cnv->fromUChar32==0
1994 ) {
1995 int32_t sourceIndex;
1996
1997 outLen = 0;
1998
1999 if(pFromU2022State->g != 0) {
2000 buffer[outLen++] = UCNV_SI;
2001 pFromU2022State->g = 0;
2002 }
2003
2004 if(pFromU2022State->cs[0] != ASCII) {
2005 int32_t escLen = escSeqCharsLen[ASCII];
2006 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2007 outLen += escLen;
2008 pFromU2022State->cs[0] = (int8_t)ASCII;
2009 }
2010
2011 /* get the source index of the last input character */
2012 /*
2013 * TODO this would be simpler and more reliable if we used a pair
2014 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2015 * so that we could simply use the prevSourceIndex here;
2016 * this code gives an incorrect result for the rare case of an unmatched
2017 * trail surrogate that is alone in the last buffer of the text stream
2018 */
2019 sourceIndex=(int32_t)(source-args->source);
2020 if(sourceIndex>0) {
2021 --sourceIndex;
2022 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2023 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2024 ) {
2025 --sourceIndex;
2026 }
2027 } else {
2028 sourceIndex=-1;
2029 }
2030
2031 fromUWriteUInt8(
2032 cnv,
2033 buffer, outLen,
2034 &target, (const char *)targetLimit,
2035 &offsets, sourceIndex,
2036 err);
2037 }
2038
2039 /*save the state and return */
2040 args->source = source;
2041 args->target = (char*)target;
2042 }
2043
2044 /*************** to unicode *******************/
2045
2046 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2047 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2048 UErrorCode* err){
2049 char tempBuf[2];
2050 const char *mySource = (char *) args->source;
2051 UChar *myTarget = args->target;
2052 const char *mySourceLimit = args->sourceLimit;
2053 uint32_t targetUniChar = 0x0000;
2054 uint32_t mySourceChar = 0x0000;
2055 uint32_t tmpSourceChar = 0x0000;
2056 UConverterDataISO2022* myData;
2057 ISO2022State *pToU2022State;
2058 StateEnum cs;
2059
2060 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2061 pToU2022State = &myData->toU2022State;
2062
2063 if(myData->key != 0) {
2064 /* continue with a partial escape sequence */
2065 goto escape;
2066 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2067 /* continue with a partial double-byte character */
2068 mySourceChar = args->converter->toUBytes[0];
2069 args->converter->toULength = 0;
2070 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2071 targetUniChar = missingCharMarker;
2072 goto getTrailByte;
2073 }
2074
2075 while(mySource < mySourceLimit){
2076
2077 targetUniChar =missingCharMarker;
2078
2079 if(myTarget < args->targetLimit){
2080
2081 mySourceChar= (unsigned char) *mySource++;
2082
2083 switch(mySourceChar) {
2084 case UCNV_SI:
2085 if(myData->version==3) {
2086 pToU2022State->g=0;
2087 continue;
2088 } else {
2089 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2090 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2091 break;
2092 }
2093
2094 case UCNV_SO:
2095 if(myData->version==3) {
2096 /* JIS7: switch to G1 half-width Katakana */
2097 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2098 pToU2022State->g=1;
2099 continue;
2100 } else {
2101 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2102 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2103 break;
2104 }
2105
2106 case ESC_2022:
2107 mySource--;
2108 escape:
2109 {
2110 const char * mySourceBefore = mySource;
2111 int8_t toULengthBefore = args->converter->toULength;
2112
2113 changeState_2022(args->converter,&(mySource),
2114 mySourceLimit, ISO_2022_JP,err);
2115
2116 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2117 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2118 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2119 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2120 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2121 }
2122 }
2123
2124 /* invalid or illegal escape sequence */
2125 if(U_FAILURE(*err)){
2126 args->target = myTarget;
2127 args->source = mySource;
2128 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2129 return;
2130 }
2131 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2132 if(myData->key==0) {
2133 myData->isEmptySegment = TRUE;
2134 }
2135 continue;
2136
2137 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2138
2139 case CR:
2140 /*falls through*/
2141 case LF:
2142 /* automatically reset to single-byte mode */
2143 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2144 pToU2022State->cs[0] = (int8_t)ASCII;
2145 }
2146 pToU2022State->cs[2] = 0;
2147 pToU2022State->g = 0;
2148 /* falls through */
2149 default:
2150 /* convert one or two bytes */
2151 myData->isEmptySegment = FALSE;
2152 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2153 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2154 !IS_JP_DBCS(cs)
2155 ) {
2156 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2157 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2158
2159 /* return from a single-shift state to the previous one */
2160 if(pToU2022State->g >= 2) {
2161 pToU2022State->g=pToU2022State->prevG;
2162 }
2163 } else switch(cs) {
2164 case ASCII:
2165 if(mySourceChar <= 0x7f) {
2166 targetUniChar = mySourceChar;
2167 }
2168 break;
2169 case ISO8859_1:
2170 if(mySourceChar <= 0x7f) {
2171 targetUniChar = mySourceChar + 0x80;
2172 }
2173 /* return from a single-shift state to the previous one */
2174 pToU2022State->g=pToU2022State->prevG;
2175 break;
2176 case ISO8859_7:
2177 if(mySourceChar <= 0x7f) {
2178 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2179 targetUniChar =
2180 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2181 myData->myConverterArray[cs],
2182 mySourceChar + 0x80);
2183 }
2184 /* return from a single-shift state to the previous one */
2185 pToU2022State->g=pToU2022State->prevG;
2186 break;
2187 case JISX201:
2188 if(mySourceChar <= 0x7f) {
2189 targetUniChar = jisx201ToU(mySourceChar);
2190 }
2191 break;
2192 case HWKANA_7BIT:
2193 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2194 /* 7-bit halfwidth Katakana */
2195 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2196 }
2197 break;
2198 default:
2199 /* G0 DBCS */
2200 if(mySource < mySourceLimit) {
2201 int leadIsOk, trailIsOk;
2202 uint8_t trailByte;
2203 getTrailByte:
2204 trailByte = (uint8_t)*mySource;
2205 /*
2206 * Ticket 5691: consistent illegal sequences:
2207 * - We include at least the first byte in the illegal sequence.
2208 * - If any of the non-initial bytes could be the start of a character,
2209 * we stop the illegal sequence before the first one of those.
2210 *
2211 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2212 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2213 * Otherwise we convert or report the pair of bytes.
2214 */
2215 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2216 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2217 if (leadIsOk && trailIsOk) {
2218 ++mySource;
2219 tmpSourceChar = (mySourceChar << 8) | trailByte;
2220 if(cs == JISX208) {
2221 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2222 mySourceChar = tmpSourceChar;
2223 } else {
2224 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2225 mySourceChar = tmpSourceChar;
2226 if (cs == KSC5601) {
2227 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2228 }
2229 tempBuf[0] = (char)(tmpSourceChar >> 8);
2230 tempBuf[1] = (char)(tmpSourceChar);
2231 }
2232 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2233 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2234 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2235 ++mySource;
2236 /* add another bit so that the code below writes 2 bytes in case of error */
2237 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2238 }
2239 } else {
2240 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2241 args->converter->toULength = 1;
2242 goto endloop;
2243 }
2244 } /* End of inner switch */
2245 break;
2246 } /* End of outer switch */
2247 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2248 if(args->offsets){
2249 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2250 }
2251 *(myTarget++)=(UChar)targetUniChar;
2252 }
2253 else if(targetUniChar > missingCharMarker){
2254 /* disassemble the surrogate pair and write to output*/
2255 targetUniChar-=0x0010000;
2256 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2257 if(args->offsets){
2258 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2259 }
2260 ++myTarget;
2261 if(myTarget< args->targetLimit){
2262 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2263 if(args->offsets){
2264 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2265 }
2266 ++myTarget;
2267 }else{
2268 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2269 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2270 }
2271
2272 }
2273 else{
2274 /* Call the callback function*/
2275 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2276 break;
2277 }
2278 }
2279 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2280 *err =U_BUFFER_OVERFLOW_ERROR;
2281 break;
2282 }
2283 }
2284 endloop:
2285 args->target = myTarget;
2286 args->source = mySource;
2287 }
2288
2289
/***************************************************************
 *   Rules for ISO-2022-KR encoding
 *   i)  The KSC5601 designator sequence should appear only once in a file,
 *       at the beginning of a line before any KSC5601 characters. This usually
 *       means that it appears by itself on the first line of the file.
 *   ii) There are only 2 shifting sequences: SO to shift into double-byte mode
 *       and SI to shift into single-byte mode.
 */
2298 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2299 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2300
2301 UConverter* saveConv = args->converter;
2302 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2303 args->converter=myConverterData->currentConverter;
2304
2305 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2306 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2307 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2308
2309 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2310 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2311 uprv_memcpy(
2312 saveConv->charErrorBuffer,
2313 myConverterData->currentConverter->charErrorBuffer,
2314 myConverterData->currentConverter->charErrorBufferLength);
2315 }
2316 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2317 myConverterData->currentConverter->charErrorBufferLength = 0;
2318 }
2319 args->converter=saveConv;
2320 }
2321
2322 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2324
2325 const UChar *source = args->source;
2326 const UChar *sourceLimit = args->sourceLimit;
2327 unsigned char *target = (unsigned char *) args->target;
2328 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2329 int32_t* offsets = args->offsets;
2330 uint32_t targetByteUnit = 0x0000;
2331 UChar32 sourceChar = 0x0000;
2332 UBool isTargetByteDBCS;
2333 UBool oldIsTargetByteDBCS;
2334 UConverterDataISO2022 *converterData;
2335 UConverterSharedData* sharedData;
2336 UBool useFallback;
2337 int32_t length =0;
2338
2339 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2340 /* if the version is 1 then the user is requesting
2341 * conversion with ibm-25546 pass the arguments to
2342 * MBCS converter and return
2343 */
2344 if(converterData->version==1){
2345 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2346 return;
2347 }
2348
2349 /* initialize data */
2350 sharedData = converterData->currentConverter->sharedData;
2351 useFallback = args->converter->useFallback;
2352 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2353 oldIsTargetByteDBCS = isTargetByteDBCS;
2354
2355 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2356 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2357 goto getTrail;
2358 }
2359 while(source < sourceLimit){
2360
2361 targetByteUnit = missingCharMarker;
2362
2363 if(target < (unsigned char*) args->targetLimit){
2364 sourceChar = *source++;
2365
2366 /* do not convert SO/SI/ESC */
2367 if(IS_2022_CONTROL(sourceChar)) {
2368 /* callback(illegal) */
2369 *err=U_ILLEGAL_CHAR_FOUND;
2370 args->converter->fromUChar32=sourceChar;
2371 break;
2372 }
2373
2374 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2375 if(length < 0) {
2376 length = -length; /* fallback */
2377 }
2378 /* only DBCS or SBCS characters are expected*/
2379 /* DB characters with high bit set to 1 are expected */
2380 if( length > 2 || length==0 ||
2381 (length == 1 && targetByteUnit > 0x7f) ||
2382 (length == 2 &&
2383 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2384 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2385 ) {
2386 targetByteUnit=missingCharMarker;
2387 }
2388 if (targetByteUnit != missingCharMarker){
2389
2390 oldIsTargetByteDBCS = isTargetByteDBCS;
2391 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2392 /* append the shift sequence */
2393 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2394
2395 if (isTargetByteDBCS)
2396 *target++ = UCNV_SO;
2397 else
2398 *target++ = UCNV_SI;
2399 if(offsets)
2400 *(offsets++) = (int32_t)(source - args->source-1);
2401 }
2402 /* write the targetUniChar to target */
2403 if(targetByteUnit <= 0x00FF){
2404 if( target < targetLimit){
2405 *(target++) = (unsigned char) targetByteUnit;
2406 if(offsets){
2407 *(offsets++) = (int32_t)(source - args->source-1);
2408 }
2409
2410 }else{
2411 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2412 *err = U_BUFFER_OVERFLOW_ERROR;
2413 }
2414 }else{
2415 if(target < targetLimit){
2416 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2417 if(offsets){
2418 *(offsets++) = (int32_t)(source - args->source-1);
2419 }
2420 if(target < targetLimit){
2421 *(target++) =(unsigned char) (targetByteUnit -0x80);
2422 if(offsets){
2423 *(offsets++) = (int32_t)(source - args->source-1);
2424 }
2425 }else{
2426 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2427 *err = U_BUFFER_OVERFLOW_ERROR;
2428 }
2429 }else{
2430 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2431 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2432 *err = U_BUFFER_OVERFLOW_ERROR;
2433 }
2434 }
2435
2436 }
2437 else{
2438 /* oops.. the code point is unassingned
2439 * set the error and reason
2440 */
2441
2442 /*check if the char is a First surrogate*/
2443 if(UTF_IS_SURROGATE(sourceChar)) {
2444 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2445 getTrail:
2446 /*look ahead to find the trail surrogate*/
2447 if(source < sourceLimit) {
2448 /* test the following code unit */
2449 UChar trail=(UChar) *source;
2450 if(UTF_IS_SECOND_SURROGATE(trail)) {
2451 source++;
2452 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2453 *err = U_INVALID_CHAR_FOUND;
2454 /* convert this surrogate code point */
2455 /* exit this condition tree */
2456 } else {
2457 /* this is an unmatched lead code unit (1st surrogate) */
2458 /* callback(illegal) */
2459 *err=U_ILLEGAL_CHAR_FOUND;
2460 }
2461 } else {
2462 /* no more input */
2463 *err = U_ZERO_ERROR;
2464 }
2465 } else {
2466 /* this is an unmatched trail code unit (2nd surrogate) */
2467 /* callback(illegal) */
2468 *err=U_ILLEGAL_CHAR_FOUND;
2469 }
2470 } else {
2471 /* callback(unassigned) for a BMP code point */
2472 *err = U_INVALID_CHAR_FOUND;
2473 }
2474
2475 args->converter->fromUChar32=sourceChar;
2476 break;
2477 }
2478 } /* end if(myTargetIndex<myTargetLength) */
2479 else{
2480 *err =U_BUFFER_OVERFLOW_ERROR;
2481 break;
2482 }
2483
2484 }/* end while(mySourceIndex<mySourceLength) */
2485
2486 /*
2487 * the end of the input stream and detection of truncated input
2488 * are handled by the framework, but for ISO-2022-KR conversion
2489 * we need to be in ASCII mode at the very end
2490 *
2491 * conditions:
2492 * successful
2493 * not in ASCII mode
2494 * end of input and no truncated input
2495 */
2496 if( U_SUCCESS(*err) &&
2497 isTargetByteDBCS &&
2498 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2499 ) {
2500 int32_t sourceIndex;
2501
2502 /* we are switching to ASCII */
2503 isTargetByteDBCS=FALSE;
2504
2505 /* get the source index of the last input character */
2506 /*
2507 * TODO this would be simpler and more reliable if we used a pair
2508 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2509 * so that we could simply use the prevSourceIndex here;
2510 * this code gives an incorrect result for the rare case of an unmatched
2511 * trail surrogate that is alone in the last buffer of the text stream
2512 */
2513 sourceIndex=(int32_t)(source-args->source);
2514 if(sourceIndex>0) {
2515 --sourceIndex;
2516 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2517 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2518 ) {
2519 --sourceIndex;
2520 }
2521 } else {
2522 sourceIndex=-1;
2523 }
2524
2525 fromUWriteUInt8(
2526 args->converter,
2527 SHIFT_IN_STR, 1,
2528 &target, (const char *)targetLimit,
2529 &offsets, sourceIndex,
2530 err);
2531 }
2532
2533 /*save the state and return */
2534 args->source = source;
2535 args->target = (char*)target;
2536 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2537 }
2538
2539 /************************ To Unicode ***************************************/
2540
2541 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2542 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2543 UErrorCode* err){
2544 char const* sourceStart;
2545 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2546
2547 UConverterToUnicodeArgs subArgs;
2548 int32_t minArgsSize;
2549
2550 /* set up the subconverter arguments */
2551 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2552 minArgsSize = args->size;
2553 } else {
2554 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2555 }
2556
2557 uprv_memcpy(&subArgs, args, minArgsSize);
2558 subArgs.size = (uint16_t)minArgsSize;
2559 subArgs.converter = myData->currentConverter;
2560
2561 /* remember the original start of the input for offsets */
2562 sourceStart = args->source;
2563
2564 if(myData->key != 0) {
2565 /* continue with a partial escape sequence */
2566 goto escape;
2567 }
2568
2569 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2570 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2571 subArgs.source = args->source;
2572 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2573 if(subArgs.source != subArgs.sourceLimit) {
2574 /*
2575 * get the current partial byte sequence
2576 *
2577 * it needs to be moved between the public and the subconverter
2578 * so that the conversion framework, which only sees the public
2579 * converter, can handle truncated and illegal input etc.
2580 */
2581 if(args->converter->toULength > 0) {
2582 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2583 }
2584 subArgs.converter->toULength = args->converter->toULength;
2585
2586 /*
2587 * Convert up to the end of the input, or to before the next escape character.
2588 * Does not handle conversion extensions because the preToU[] state etc.
2589 * is not copied.
2590 */
2591 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2592
2593 if(args->offsets != NULL && sourceStart != args->source) {
2594 /* update offsets to base them on the actual start of the input */
2595 int32_t *offsets = args->offsets;
2596 UChar *target = args->target;
2597 int32_t delta = (int32_t)(args->source - sourceStart);
2598 while(target < subArgs.target) {
2599 if(*offsets >= 0) {
2600 *offsets += delta;
2601 }
2602 ++offsets;
2603 ++target;
2604 }
2605 }
2606 args->source = subArgs.source;
2607 args->target = subArgs.target;
2608 args->offsets = subArgs.offsets;
2609
2610 /* copy input/error/overflow buffers */
2611 if(subArgs.converter->toULength > 0) {
2612 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2613 }
2614 args->converter->toULength = subArgs.converter->toULength;
2615
2616 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2617 if(subArgs.converter->UCharErrorBufferLength > 0) {
2618 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2619 subArgs.converter->UCharErrorBufferLength);
2620 }
2621 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2622 subArgs.converter->UCharErrorBufferLength = 0;
2623 }
2624 }
2625
2626 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2627 return;
2628 }
2629
2630 escape:
2631 changeState_2022(args->converter,
2632 &(args->source),
2633 args->sourceLimit,
2634 ISO_2022_KR,
2635 err);
2636 }
2637 }
2638
2639 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2640 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2641 UErrorCode* err){
2642 char tempBuf[2];
2643 const char *mySource = ( char *) args->source;
2644 UChar *myTarget = args->target;
2645 const char *mySourceLimit = args->sourceLimit;
2646 UChar32 targetUniChar = 0x0000;
2647 UChar mySourceChar = 0x0000;
2648 UConverterDataISO2022* myData;
2649 UConverterSharedData* sharedData ;
2650 UBool useFallback;
2651
2652 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2653 if(myData->version==1){
2654 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2655 return;
2656 }
2657
2658 /* initialize state */
2659 sharedData = myData->currentConverter->sharedData;
2660 useFallback = args->converter->useFallback;
2661
2662 if(myData->key != 0) {
2663 /* continue with a partial escape sequence */
2664 goto escape;
2665 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2666 /* continue with a partial double-byte character */
2667 mySourceChar = args->converter->toUBytes[0];
2668 args->converter->toULength = 0;
2669 goto getTrailByte;
2670 }
2671
2672 while(mySource< mySourceLimit){
2673
2674 if(myTarget < args->targetLimit){
2675
2676 mySourceChar= (unsigned char) *mySource++;
2677
2678 if(mySourceChar==UCNV_SI){
2679 myData->toU2022State.g = 0;
2680 if (myData->isEmptySegment) {
2681 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2682 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2683 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2684 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2685 args->converter->toULength = 1;
2686 args->target = myTarget;
2687 args->source = mySource;
2688 return;
2689 }
2690 /*consume the source */
2691 continue;
2692 }else if(mySourceChar==UCNV_SO){
2693 myData->toU2022State.g = 1;
2694 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2695 /*consume the source */
2696 continue;
2697 }else if(mySourceChar==ESC_2022){
2698 mySource--;
2699 escape:
2700 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2701 changeState_2022(args->converter,&(mySource),
2702 mySourceLimit, ISO_2022_KR, err);
2703 if(U_FAILURE(*err)){
2704 args->target = myTarget;
2705 args->source = mySource;
2706 return;
2707 }
2708 continue;
2709 }
2710
2711 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2712 if(myData->toU2022State.g == 1) {
2713 if(mySource < mySourceLimit) {
2714 int leadIsOk, trailIsOk;
2715 uint8_t trailByte;
2716 getTrailByte:
2717 targetUniChar = missingCharMarker;
2718 trailByte = (uint8_t)*mySource;
2719 /*
2720 * Ticket 5691: consistent illegal sequences:
2721 * - We include at least the first byte in the illegal sequence.
2722 * - If any of the non-initial bytes could be the start of a character,
2723 * we stop the illegal sequence before the first one of those.
2724 *
2725 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2726 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2727 * Otherwise we convert or report the pair of bytes.
2728 */
2729 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2730 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2731 if (leadIsOk && trailIsOk) {
2732 ++mySource;
2733 tempBuf[0] = (char)(mySourceChar + 0x80);
2734 tempBuf[1] = (char)(trailByte + 0x80);
2735 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2736 mySourceChar = (mySourceChar << 8) | trailByte;
2737 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2738 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2739 ++mySource;
2740 /* add another bit so that the code below writes 2 bytes in case of error */
2741 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2742 }
2743 } else {
2744 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2745 args->converter->toULength = 1;
2746 break;
2747 }
2748 }
2749 else if(mySourceChar <= 0x7f) {
2750 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2751 } else {
2752 targetUniChar = 0xffff;
2753 }
2754 if(targetUniChar < 0xfffe){
2755 if(args->offsets) {
2756 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2757 }
2758 *(myTarget++)=(UChar)targetUniChar;
2759 }
2760 else {
2761 /* Call the callback function*/
2762 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2763 break;
2764 }
2765 }
2766 else{
2767 *err =U_BUFFER_OVERFLOW_ERROR;
2768 break;
2769 }
2770 }
2771 args->target = myTarget;
2772 args->source = mySource;
2773 }
2774
2775 /*************************** END ISO2022-KR *********************************/
2776
/*************************** ISO-2022-CN *********************************
*
* Rules for ISO-2022-CN Encoding:
* i)   The designator sequence must appear once on a line before any instance
*      of the character set it designates.
* ii)  If two lines contain characters from the same character set, both lines
*      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence has to be found
*      to invoke the shifting.
* iv)  All lines start in ASCII and end in ASCII.
* v)   Four shifting sequences are employed for this purpose:
*
*      Sequence     ASCII Eq    Charsets
*      ----------   -------     ---------
*      SI           <SI>        US-ASCII
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
*
* vi)
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
*      SS2designator : ESC "$" "*" finalchar_for_SS2
*      SS3designator : ESC "$" "+" finalchar_for_SS3
*
*      ESC $ ) A       Indicates the bytes following SO are Chinese
*       characters as defined in GB 2312-80, until
*       another SOdesignation appears
*
*
*      ESC $ ) E       Indicates the bytes following SO are as defined
*       in ISO-IR-165 (for details, see section 2.1),
*       until another SOdesignation appears
*
*      ESC $ ) G       Indicates the bytes following SO are as defined
*       in CNS 11643-plane-1, until another
*       SOdesignation appears
*
*      ESC $ * H       Indicates the two bytes immediately following
*       SS2 is a Chinese character as defined in CNS
*       11643-plane-2, until another SS2designation
*       appears
*       (Meaning <ESC>N must precede every 2 byte
*       sequence.)
*
*      ESC $ + I       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-3, until another SS3designation
*       appears
*       (Meaning <ESC>O must precede every 2 byte
*       sequence.)
*
*      ESC $ + J       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-4, until another SS3designation
*       appears
*       (In English: <ESC>O must precede every 2 byte
*       sequence.)
*
*      ESC $ + K       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-5, until another SS3designation
*       appears
*
*      ESC $ + L       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-6, until another SS3designation
*       appears
*
*      ESC $ + M       Indicates the immediate two bytes following SS3
*       is a Chinese character as defined in CNS
*       11643-plane-7, until another SS3designation
*       appears
*
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
*       has its own designation information before any Chinese characters
*       appear
*
*/
2855
2856 /* The following are defined this way to make the strings truely readonly */
2857 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2858 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2859 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2860 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2861 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2862 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2863 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2864 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2865 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2866
2867 /********************** ISO2022-CN Data **************************/
2868 static const char* const escSeqCharsCN[10] ={
2869 SHIFT_IN_STR, /* ASCII */
2870 GB_2312_80_STR,
2871 ISO_IR_165_STR,
2872 CNS_11643_1992_Plane_1_STR,
2873 CNS_11643_1992_Plane_2_STR,
2874 CNS_11643_1992_Plane_3_STR,
2875 CNS_11643_1992_Plane_4_STR,
2876 CNS_11643_1992_Plane_5_STR,
2877 CNS_11643_1992_Plane_6_STR,
2878 CNS_11643_1992_Plane_7_STR
2879 };
2880
2881 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2882 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2883 UConverter *cnv = args->converter;
2884 UConverterDataISO2022 *converterData;
2885 ISO2022State *pFromU2022State;
2886 uint8_t *target = (uint8_t *) args->target;
2887 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2888 const UChar* source = args->source;
2889 const UChar* sourceLimit = args->sourceLimit;
2890 int32_t* offsets = args->offsets;
2891 UChar32 sourceChar;
2892 char buffer[8];
2893 int32_t len;
2894 int8_t choices[3];
2895 int32_t choiceCount;
2896 uint32_t targetValue = 0;
2897 UBool useFallback;
2898
2899 /* set up the state */
2900 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2901 pFromU2022State = &converterData->fromU2022State;
2902
2903 choiceCount = 0;
2904
2905 /* check if the last codepoint of previous buffer was a lead surrogate*/
2906 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2907 goto getTrail;
2908 }
2909
2910 while( source < sourceLimit){
2911 if(target < targetLimit){
2912
2913 sourceChar = *(source++);
2914 /*check if the char is a First surrogate*/
2915 if(UTF_IS_SURROGATE(sourceChar)) {
2916 if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2917 getTrail:
2918 /*look ahead to find the trail surrogate*/
2919 if(source < sourceLimit) {
2920 /* test the following code unit */
2921 UChar trail=(UChar) *source;
2922 if(UTF_IS_SECOND_SURROGATE(trail)) {
2923 source++;
2924 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2925 cnv->fromUChar32=0x00;
2926 /* convert this supplementary code point */
2927 /* exit this condition tree */
2928 } else {
2929 /* this is an unmatched lead code unit (1st surrogate) */
2930 /* callback(illegal) */
2931 *err=U_ILLEGAL_CHAR_FOUND;
2932 cnv->fromUChar32=sourceChar;
2933 break;
2934 }
2935 } else {
2936 /* no more input */
2937 cnv->fromUChar32=sourceChar;
2938 break;
2939 }
2940 } else {
2941 /* this is an unmatched trail code unit (2nd surrogate) */
2942 /* callback(illegal) */
2943 *err=U_ILLEGAL_CHAR_FOUND;
2944 cnv->fromUChar32=sourceChar;
2945 break;
2946 }
2947 }
2948
2949 /* do the conversion */
2950 if(sourceChar <= 0x007f ){
2951 /* do not convert SO/SI/ESC */
2952 if(IS_2022_CONTROL(sourceChar)) {
2953 /* callback(illegal) */
2954 *err=U_ILLEGAL_CHAR_FOUND;
2955 cnv->fromUChar32=sourceChar;
2956 break;
2957 }
2958
2959 /* US-ASCII */
2960 if(pFromU2022State->g == 0) {
2961 buffer[0] = (char)sourceChar;
2962 len = 1;
2963 } else {
2964 buffer[0] = UCNV_SI;
2965 buffer[1] = (char)sourceChar;
2966 len = 2;
2967 pFromU2022State->g = 0;
2968 choiceCount = 0;
2969 }
2970 if(sourceChar == CR || sourceChar == LF) {
2971 /* reset the state at the end of a line */
2972 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2973 choiceCount = 0;
2974 }
2975 }
2976 else{
2977 /* convert U+0080..U+10ffff */
2978 int32_t i;
2979 int8_t cs, g;
2980
2981 if(choiceCount == 0) {
2982 /* try the current SO/G1 converter first */
2983 choices[0] = pFromU2022State->cs[1];
2984
2985 /* default to GB2312_1 if none is designated yet */
2986 if(choices[0] == 0) {
2987 choices[0] = GB2312_1;
2988 }
2989
2990 if(converterData->version == 0) {
2991 /* ISO-2022-CN */
2992
2993 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2994 if(choices[0] == GB2312_1) {
2995 choices[1] = (int8_t)CNS_11643_1;
2996 } else {
2997 choices[1] = (int8_t)GB2312_1;
2998 }
2999
3000 choiceCount = 2;
3001 } else if (converterData->version == 1) {
3002 /* ISO-2022-CN-EXT */
3003
3004 /* try one of the other converters */
3005 switch(choices[0]) {
3006 case GB2312_1:
3007 choices[1] = (int8_t)CNS_11643_1;
3008 choices[2] = (int8_t)ISO_IR_165;
3009 break;
3010 case ISO_IR_165:
3011 choices[1] = (int8_t)GB2312_1;
3012 choices[2] = (int8_t)CNS_11643_1;
3013 break;
3014 default: /* CNS_11643_x */
3015 choices[1] = (int8_t)GB2312_1;
3016 choices[2] = (int8_t)ISO_IR_165;
3017 break;
3018 }
3019
3020 choiceCount = 3;
3021 } else {
3022 choices[0] = (int8_t)CNS_11643_1;
3023 choices[1] = (int8_t)GB2312_1;
3024 }
3025 }
3026
3027 cs = g = 0;
3028 /*
3029 * len==0: no mapping found yet
3030 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3031 * len>0: found a roundtrip result, done
3032 */
3033 len = 0;
3034 /*
3035 * We will turn off useFallback after finding a fallback,
3036 * but we still get fallbacks from PUA code points as usual.
3037 * Therefore, we will also need to check that we don't overwrite
3038 * an early fallback with a later one.
3039 */
3040 useFallback = cnv->useFallback;
3041
3042 for(i = 0; i < choiceCount && len <= 0; ++i) {
3043 int8_t cs0 = choices[i];
3044 if(cs0 > 0) {
3045 uint32_t value;
3046 int32_t len2;
3047 if(cs0 >= CNS_11643_0) {
3048 len2 = MBCS_FROM_UCHAR32_ISO2022(
3049 converterData->myConverterArray[CNS_11643],
3050 sourceChar,
3051 &value,
3052 useFallback,
3053 MBCS_OUTPUT_3);
3054 if(len2 == 3 || (len2 == -3 && len == 0)) {
3055 targetValue = value;
3056 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3057 if(len2 >= 0) {
3058 len = 2;
3059 } else {
3060 len = -2;
3061 useFallback = FALSE;
3062 }
3063 if(cs == CNS_11643_1) {
3064 g = 1;
3065 } else if(cs == CNS_11643_2) {
3066 g = 2;
3067 } else /* plane 3..7 */ if(converterData->version == 1) {
3068 g = 3;
3069 } else {
3070 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3071 len = 0;
3072 }
3073 }
3074 } else {
3075 /* GB2312_1 or ISO-IR-165 */
3076 len2 = MBCS_FROM_UCHAR32_ISO2022(
3077 converterData->myConverterArray[cs0],
3078 sourceChar,
3079 &value,
3080 useFallback,
3081 MBCS_OUTPUT_2);
3082 if(len2 == 2 || (len2 == -2 && len == 0)) {
3083 targetValue = value;
3084 len = len2;
3085 cs = cs0;
3086 g = 1;
3087 useFallback = FALSE;
3088 }
3089 }
3090 }
3091 }
3092
3093 if(len != 0) {
3094 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3095
3096 /* write the designation sequence if necessary */
3097 if(cs != pFromU2022State->cs[g]) {
3098 if(cs < CNS_11643) {
3099 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3100 } else {
3101 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3102 }
3103 len = 4;
3104 pFromU2022State->cs[g] = cs;
3105 if(g == 1) {
3106 /* changing the SO/G1 charset invalidates the choices[] */
3107 choiceCount = 0;
3108 }
3109 }
3110
3111 /* write the shift sequence if necessary */
3112 if(g != pFromU2022State->g) {
3113 switch(g) {
3114 case 1:
3115 buffer[len++] = UCNV_SO;
3116
3117 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3118 pFromU2022State->g = 1;
3119 break;
3120 case 2:
3121 buffer[len++] = 0x1b;
3122 buffer[len++] = 0x4e;
3123 break;
3124 default: /* case 3 */
3125 buffer[len++] = 0x1b;
3126 buffer[len++] = 0x4f;
3127 break;
3128 }
3129 }
3130
3131 /* write the two output bytes */
3132 buffer[len++] = (char)(targetValue >> 8);
3133 buffer[len++] = (char)targetValue;
3134 } else {
3135 /* if we cannot find the character after checking all codepages
3136 * then this is an error
3137 */
3138 *err = U_INVALID_CHAR_FOUND;
3139 cnv->fromUChar32=sourceChar;
3140 break;
3141 }
3142 }
3143
3144 /* output len>0 bytes in buffer[] */
3145 if(len == 1) {
3146 *target++ = buffer[0];
3147 if(offsets) {
3148 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3149 }
3150 } else if(len == 2 && (target + 2) <= targetLimit) {
3151 *target++ = buffer[0];
3152 *target++ = buffer[1];
3153 if(offsets) {
3154 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3155 *offsets++ = sourceIndex;
3156 *offsets++ = sourceIndex;
3157 }
3158 } else {
3159 fromUWriteUInt8(
3160 cnv,
3161 buffer, len,
3162 &target, (const char *)targetLimit,
3163 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3164 err);
3165 if(U_FAILURE(*err)) {
3166 break;
3167 }
3168 }
3169 } /* end if(myTargetIndex<myTargetLength) */
3170 else{
3171 *err =U_BUFFER_OVERFLOW_ERROR;
3172 break;
3173 }
3174
3175 }/* end while(mySourceIndex<mySourceLength) */
3176
3177 /*
3178 * the end of the input stream and detection of truncated input
3179 * are handled by the framework, but for ISO-2022-CN conversion
3180 * we need to be in ASCII mode at the very end
3181 *
3182 * conditions:
3183 * successful
3184 * not in ASCII mode
3185 * end of input and no truncated input
3186 */
3187 if( U_SUCCESS(*err) &&
3188 pFromU2022State->g!=0 &&
3189 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3190 ) {
3191 int32_t sourceIndex;
3192
3193 /* we are switching to ASCII */
3194 pFromU2022State->g=0;
3195
3196 /* get the source index of the last input character */
3197 /*
3198 * TODO this would be simpler and more reliable if we used a pair
3199 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3200 * so that we could simply use the prevSourceIndex here;
3201 * this code gives an incorrect result for the rare case of an unmatched
3202 * trail surrogate that is alone in the last buffer of the text stream
3203 */
3204 sourceIndex=(int32_t)(source-args->source);
3205 if(sourceIndex>0) {
3206 --sourceIndex;
3207 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3208 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3209 ) {
3210 --sourceIndex;
3211 }
3212 } else {
3213 sourceIndex=-1;
3214 }
3215
3216 fromUWriteUInt8(
3217 cnv,
3218 SHIFT_IN_STR, 1,
3219 &target, (const char *)targetLimit,
3220 &offsets, sourceIndex,
3221 err);
3222 }
3223
3224 /*save the state and return */
3225 args->source = source;
3226 args->target = (char*)target;
3227 }
3228
3229
3230 static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)3231 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3232 UErrorCode* err){
3233 char tempBuf[3];
3234 const char *mySource = (char *) args->source;
3235 UChar *myTarget = args->target;
3236 const char *mySourceLimit = args->sourceLimit;
3237 uint32_t targetUniChar = 0x0000;
3238 uint32_t mySourceChar = 0x0000;
3239 UConverterDataISO2022* myData;
3240 ISO2022State *pToU2022State;
3241
3242 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3243 pToU2022State = &myData->toU2022State;
3244
3245 if(myData->key != 0) {
3246 /* continue with a partial escape sequence */
3247 goto escape;
3248 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3249 /* continue with a partial double-byte character */
3250 mySourceChar = args->converter->toUBytes[0];
3251 args->converter->toULength = 0;
3252 targetUniChar = missingCharMarker;
3253 goto getTrailByte;
3254 }
3255
3256 while(mySource < mySourceLimit){
3257
3258 targetUniChar =missingCharMarker;
3259
3260 if(myTarget < args->targetLimit){
3261
3262 mySourceChar= (unsigned char) *mySource++;
3263
3264 switch(mySourceChar){
3265 case UCNV_SI:
3266 pToU2022State->g=0;
3267 if (myData->isEmptySegment) {
3268 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3269 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3270 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3271 args->converter->toUBytes[0] = mySourceChar;
3272 args->converter->toULength = 1;
3273 args->target = myTarget;
3274 args->source = mySource;
3275 return;
3276 }
3277 continue;
3278
3279 case UCNV_SO:
3280 if(pToU2022State->cs[1] != 0) {
3281 pToU2022State->g=1;
3282 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3283 continue;
3284 } else {
3285 /* illegal to have SO before a matching designator */
3286 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3287 break;
3288 }
3289
3290 case ESC_2022:
3291 mySource--;
3292 escape:
3293 {
3294 const char * mySourceBefore = mySource;
3295 int8_t toULengthBefore = args->converter->toULength;
3296
3297 changeState_2022(args->converter,&(mySource),
3298 mySourceLimit, ISO_2022_CN,err);
3299
3300 /* After SO there must be at least one character before a designator (designator error handled separately) */
3301 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3302 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3303 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3304 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3305 }
3306 }
3307
3308 /* invalid or illegal escape sequence */
3309 if(U_FAILURE(*err)){
3310 args->target = myTarget;
3311 args->source = mySource;
3312 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3313 return;
3314 }
3315 continue;
3316
3317 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3318
3319 case CR:
3320 /*falls through*/
3321 case LF:
3322 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3323 /* falls through */
3324 default:
3325 /* convert one or two bytes */
3326 myData->isEmptySegment = FALSE;
3327 if(pToU2022State->g != 0) {
3328 if(mySource < mySourceLimit) {
3329 UConverterSharedData *cnv;
3330 StateEnum tempState;
3331 int32_t tempBufLen;
3332 int leadIsOk, trailIsOk;
3333 uint8_t trailByte;
3334 getTrailByte:
3335 trailByte = (uint8_t)*mySource;
3336 /*
3337 * Ticket 5691: consistent illegal sequences:
3338 * - We include at least the first byte in the illegal sequence.
3339 * - If any of the non-initial bytes could be the start of a character,
3340 * we stop the illegal sequence before the first one of those.
3341 *
3342 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3343 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3344 * Otherwise we convert or report the pair of bytes.
3345 */
3346 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3347 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3348 if (leadIsOk && trailIsOk) {
3349 ++mySource;
3350 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3351 if(tempState >= CNS_11643_0) {
3352 cnv = myData->myConverterArray[CNS_11643];
3353 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3354 tempBuf[1] = (char) (mySourceChar);
3355 tempBuf[2] = (char) trailByte;
3356 tempBufLen = 3;
3357
3358 }else{
3359 cnv = myData->myConverterArray[tempState];
3360 tempBuf[0] = (char) (mySourceChar);
3361 tempBuf[1] = (char) trailByte;
3362 tempBufLen = 2;
3363 }
3364 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3365 mySourceChar = (mySourceChar << 8) | trailByte;
3366 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3367 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3368 ++mySource;
3369 /* add another bit so that the code below writes 2 bytes in case of error */
3370 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3371 }
3372 if(pToU2022State->g>=2) {
3373 /* return from a single-shift state to the previous one */
3374 pToU2022State->g=pToU2022State->prevG;
3375 }
3376 } else {
3377 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3378 args->converter->toULength = 1;
3379 goto endloop;
3380 }
3381 }
3382 else{
3383 if(mySourceChar <= 0x7f) {
3384 targetUniChar = (UChar) mySourceChar;
3385 }
3386 }
3387 break;
3388 }
3389 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3390 if(args->offsets){
3391 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3392 }
3393 *(myTarget++)=(UChar)targetUniChar;
3394 }
3395 else if(targetUniChar > missingCharMarker){
3396 /* disassemble the surrogate pair and write to output*/
3397 targetUniChar-=0x0010000;
3398 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3399 if(args->offsets){
3400 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3401 }
3402 ++myTarget;
3403 if(myTarget< args->targetLimit){
3404 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3405 if(args->offsets){
3406 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3407 }
3408 ++myTarget;
3409 }else{
3410 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3411 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3412 }
3413
3414 }
3415 else{
3416 /* Call the callback function*/
3417 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3418 break;
3419 }
3420 }
3421 else{
3422 *err =U_BUFFER_OVERFLOW_ERROR;
3423 break;
3424 }
3425 }
3426 endloop:
3427 args->target = myTarget;
3428 args->source = mySource;
3429 }
3430
3431 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3432 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3433 UConverter *cnv = args->converter;
3434 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3435 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3436 char *p, *subchar;
3437 char buffer[8];
3438 int32_t length;
3439
3440 subchar=(char *)cnv->subChars;
3441 length=cnv->subCharLen; /* assume length==1 for most variants */
3442
3443 p = buffer;
3444 switch(myConverterData->locale[0]){
3445 case 'j':
3446 {
3447 int8_t cs;
3448
3449 if(pFromU2022State->g == 1) {
3450 /* JIS7: switch from G1 to G0 */
3451 pFromU2022State->g = 0;
3452 *p++ = UCNV_SI;
3453 }
3454
3455 cs = pFromU2022State->cs[0];
3456 if(cs != ASCII && cs != JISX201) {
3457 /* not in ASCII or JIS X 0201: switch to ASCII */
3458 pFromU2022State->cs[0] = (int8_t)ASCII;
3459 *p++ = '\x1b';
3460 *p++ = '\x28';
3461 *p++ = '\x42';
3462 }
3463
3464 *p++ = subchar[0];
3465 break;
3466 }
3467 case 'c':
3468 if(pFromU2022State->g != 0) {
3469 /* not in ASCII mode: switch to ASCII */
3470 pFromU2022State->g = 0;
3471 *p++ = UCNV_SI;
3472 }
3473 *p++ = subchar[0];
3474 break;
3475 case 'k':
3476 if(myConverterData->version == 0) {
3477 if(length == 1) {
3478 if((UBool)args->converter->fromUnicodeStatus) {
3479 /* in DBCS mode: switch to SBCS */
3480 args->converter->fromUnicodeStatus = 0;
3481 *p++ = UCNV_SI;
3482 }
3483 *p++ = subchar[0];
3484 } else /* length == 2*/ {
3485 if(!(UBool)args->converter->fromUnicodeStatus) {
3486 /* in SBCS mode: switch to DBCS */
3487 args->converter->fromUnicodeStatus = 1;
3488 *p++ = UCNV_SO;
3489 }
3490 *p++ = subchar[0];
3491 *p++ = subchar[1];
3492 }
3493 break;
3494 } else {
3495 /* save the subconverter's substitution string */
3496 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3497 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3498
3499 /* set our substitution string into the subconverter */
3500 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3501 myConverterData->currentConverter->subCharLen = (int8_t)length;
3502
3503 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3504 args->converter = myConverterData->currentConverter;
3505 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3506 ucnv_cbFromUWriteSub(args, 0, err);
3507 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3508 args->converter = cnv;
3509
3510 /* restore the subconverter's substitution string */
3511 myConverterData->currentConverter->subChars = currentSubChars;
3512 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3513
3514 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3515 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3516 uprv_memcpy(
3517 cnv->charErrorBuffer,
3518 myConverterData->currentConverter->charErrorBuffer,
3519 myConverterData->currentConverter->charErrorBufferLength);
3520 }
3521 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3522 myConverterData->currentConverter->charErrorBufferLength = 0;
3523 }
3524 return;
3525 }
3526 default:
3527 /* not expected */
3528 break;
3529 }
3530 ucnv_cbFromUWriteBytes(args,
3531 buffer, (int32_t)(p - buffer),
3532 offsetIndex, err);
3533 }
3534
3535 /*
3536 * Structure for cloning an ISO 2022 converter into a single memory block.
3537 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3538 * and then ucnv_safeClone() of the sub-converter may additionally align
3539 * currentConverter inside the cloneStruct, for which we need the deadSpace
3540 * after currentConverter.
3541 * This is because UAlignedMemory may be larger than the actually
3542 * necessary alignment size for the platform.
3543 * The other cloneStruct fields will not be moved around,
3544 * and are aligned properly with cloneStruct's alignment.
3545 */
3546 struct cloneStruct
3547 {
3548 UConverter cnv;
3549 UConverter currentConverter;
3550 UAlignedMemory deadSpace;
3551 UConverterDataISO2022 mydata;
3552 };
3553
3554
3555 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3556 _ISO_2022_SafeClone(
3557 const UConverter *cnv,
3558 void *stackBuffer,
3559 int32_t *pBufferSize,
3560 UErrorCode *status)
3561 {
3562 struct cloneStruct * localClone;
3563 UConverterDataISO2022 *cnvData;
3564 int32_t i, size;
3565
3566 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3567 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3568 return NULL;
3569 }
3570
3571 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3572 localClone = (struct cloneStruct *)stackBuffer;
3573
3574 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3575
3576 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3577 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3578 localClone->cnv.isExtraLocal = TRUE;
3579
3580 /* share the subconverters */
3581
3582 if(cnvData->currentConverter != NULL) {
3583 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3584 localClone->mydata.currentConverter =
3585 ucnv_safeClone(cnvData->currentConverter,
3586 &localClone->currentConverter,
3587 &size, status);
3588 if(U_FAILURE(*status)) {
3589 return NULL;
3590 }
3591 }
3592
3593 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3594 if(cnvData->myConverterArray[i] != NULL) {
3595 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3596 }
3597 }
3598
3599 return &localClone->cnv;
3600 }
3601
3602 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3603 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3604 const USetAdder *sa,
3605 UConverterUnicodeSet which,
3606 UErrorCode *pErrorCode)
3607 {
3608 int32_t i;
3609 UConverterDataISO2022* cnvData;
3610
3611 if (U_FAILURE(*pErrorCode)) {
3612 return;
3613 }
3614 #ifdef U_ENABLE_GENERIC_ISO_2022
3615 if (cnv->sharedData == &_ISO2022Data) {
3616 /* We use UTF-8 in this case */
3617 sa->addRange(sa->set, 0, 0xd7FF);
3618 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3619 return;
3620 }
3621 #endif
3622
3623 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3624
3625 /* open a set and initialize it with code points that are algorithmically round-tripped */
3626 switch(cnvData->locale[0]){
3627 case 'j':
3628 /* include JIS X 0201 which is hardcoded */
3629 sa->add(sa->set, 0xa5);
3630 sa->add(sa->set, 0x203e);
3631 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3632 /* include Latin-1 for some variants of JP */
3633 sa->addRange(sa->set, 0, 0xff);
3634 } else {
3635 /* include ASCII for JP */
3636 sa->addRange(sa->set, 0, 0x7f);
3637 }
3638 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3639 /*
3640 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3641 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3642 * use half-width Katakana.
3643 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3644 * half-width Katakana via the ESC ( I sequence.
3645 * However, we only emit (fromUnicode) half-width Katakana according to the
3646 * definition of each variant.
3647 *
3648 * When including fallbacks,
3649 * we need to include half-width Katakana Unicode code points for all JP variants because
3650 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3651 */
3652 /* include half-width Katakana for JP */
3653 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3654 }
3655 break;
3656 case 'c':
3657 case 'z':
3658 /* include ASCII for CN */
3659 sa->addRange(sa->set, 0, 0x7f);
3660 break;
3661 case 'k':
3662 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3663 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3664 cnvData->currentConverter, sa, which, pErrorCode);
3665 /* the loop over myConverterArray[] will simply not find another converter */
3666 break;
3667 default:
3668 break;
3669 }
3670
3671 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3672 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3673 cnvData->version==0 && i==CNS_11643
3674 ) {
3675 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3676 ucnv_MBCSGetUnicodeSetForBytes(
3677 cnvData->myConverterArray[i],
3678 sa, UCNV_ROUNDTRIP_SET,
3679 0, 0x81, 0x82,
3680 pErrorCode);
3681 }
3682 #endif
3683
3684 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3685 UConverterSetFilter filter;
3686 if(cnvData->myConverterArray[i]!=NULL) {
3687 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3688 cnvData->version==0 && i==CNS_11643
3689 ) {
3690 /*
3691 * Version-specific for CN:
3692 * CN version 0 does not map CNS planes 3..7 although
3693 * they are all available in the CNS conversion table;
3694 * CN version 1 (-EXT) does map them all.
3695 * The two versions create different Unicode sets.
3696 */
3697 filter=UCNV_SET_FILTER_2022_CN;
3698 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3699 /*
3700 * Only add code points that map to Shift-JIS codes
3701 * corresponding to JIS X 0208.
3702 */
3703 filter=UCNV_SET_FILTER_SJIS;
3704 } else if(i==KSC5601) {
3705 /*
3706 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3707 * are broader than GR94.
3708 */
3709 filter=UCNV_SET_FILTER_GR94DBCS;
3710 } else {
3711 filter=UCNV_SET_FILTER_NONE;
3712 }
3713 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3714 }
3715 }
3716
3717 /*
3718 * ISO 2022 converters must not convert SO/SI/ESC despite what
3719 * sub-converters do by themselves.
3720 * Remove these characters from the set.
3721 */
3722 sa->remove(sa->set, 0x0e);
3723 sa->remove(sa->set, 0x0f);
3724 sa->remove(sa->set, 0x1b);
3725
3726 /* ISO 2022 converters do not convert C1 controls either */
3727 sa->removeRange(sa->set, 0x80, 0x9f);
3728 }
3729
/*
 * Function table for the generic "ISO_2022" converter.
 *
 * The four conversion entry points are installed only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are all NULL.
 * The open/close/reset, getName, writeSub, safeClone and getUnicodeSet
 * handlers are the same ones used by the JP, KR and CN tables in this file.
 *
 * NOTE(review): the per-slot labels in the trailing comments are read off
 * the handler names; the authoritative slot order is the UConverterImpl
 * declaration, which is not visible here - confirm against it.
 */
3730 static const UConverterImpl _ISO2022Impl={
3731 UCNV_ISO_2022, /* converter type */
3732
3733 NULL, /* presumably load - not provided */
3734 NULL, /* presumably unload - not provided */
3735
3736 _ISO2022Open, /* open */
3737 _ISO2022Close, /* close */
3738 _ISO2022Reset, /* reset */
3739
3740 #ifdef U_ENABLE_GENERIC_ISO_2022
3741 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* to Unicode */
3742 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* to Unicode, with offsets (same routine) */
3743 ucnv_fromUnicode_UTF8, /* from Unicode */
3744 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* from Unicode, with offsets */
3745 #else
3746 NULL, /* generic converter disabled: no to-Unicode routine */
3747 NULL, /* generic converter disabled: no to-Unicode-with-offsets routine */
3748 NULL, /* generic converter disabled: no from-Unicode routine */
3749 NULL, /* generic converter disabled: no from-Unicode-with-offsets routine */
3750 #endif
3751 NULL, /* presumably getNextUChar - not provided */
3752
3753 NULL, /* presumably getStarters - not provided */
3754 _ISO2022getName, /* get name */
3755 _ISO_2022_WriteSub, /* write substitution */
3756 _ISO_2022_SafeClone, /* safe clone */
3757 _ISO_2022_GetUnicodeSet /* get Unicode set */
3758 };
/*
 * Static descriptor for the generic "ISO_2022" converter.
 *
 * NOTE(review): the initializer is positional. Field labels in the trailing
 * comments are assumptions based on the values; confirm them against the
 * UConverterStaticData declaration, which is not visible here.
 */
3759 static const UConverterStaticData _ISO2022StaticData={
3760 sizeof(UConverterStaticData), /* size of this structure */
3761 "ISO_2022", /* converter name */
3762 2022, /* presumably the codepage number - confirm */
3763 UCNV_IBM, /* platform */
3764 UCNV_ISO_2022, /* conversion type */
3765 1, /* presumably min bytes per char - confirm */
3766 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3767 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */
3768 1, /* presumably the length of the substitution sequence - confirm */
3769 FALSE, /* presumably "has to-Unicode fallback" - confirm */
3770 FALSE, /* presumably "has from-Unicode fallback" - confirm */
3771 0,
3772 0,
3773 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3774 };
/*
 * Shared-data record for the generic ISO-2022 converter: ties
 * _ISO2022StaticData to the _ISO2022Impl function table.
 *
 * Unlike the JP/KR/CN records in this file, this object is not declared
 * static, so it has external linkage.
 *
 * NOTE(review): the initializer is positional. Labels on the fields other
 * than the two pointers are assumptions; confirm them against the
 * UConverterSharedData declaration, which is not visible here.
 */
3775 const UConverterSharedData _ISO2022Data={
3776 sizeof(UConverterSharedData), /* size of this structure */
3777 ~((uint32_t) 0), /* all bits set; presumably the reference counter of a never-freed object - confirm */
3778 NULL, /* presumably data memory - none */
3779 NULL, /* presumably conversion table - none */
3780 &_ISO2022StaticData, /* static descriptor */
3781 FALSE, /* presumably "shared data is cached" flag - confirm */
3782 &_ISO2022Impl, /* function table */
3783 0
3784 };
3785
3786 /*************JP****************/
/*
 * Function table for ISO-2022-JP conversion.
 *
 * Only the conversion routines are JP-specific. A single to-Unicode routine
 * and a single from-Unicode routine each fill both the plain and the
 * with-offsets slots. The remaining handlers are the ones shared by all
 * ISO-2022 tables in this file.
 *
 * NOTE(review): the per-slot labels in the trailing comments are read off
 * the handler names; the authoritative slot order is the UConverterImpl
 * declaration, which is not visible here - confirm against it.
 */
3787 static const UConverterImpl _ISO2022JPImpl={
3788 UCNV_ISO_2022, /* converter type */
3789
3790 NULL, /* presumably load - not provided */
3791 NULL, /* presumably unload - not provided */
3792
3793 _ISO2022Open, /* open */
3794 _ISO2022Close, /* close */
3795 _ISO2022Reset, /* reset */
3796
3797 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* to Unicode */
3798 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* to Unicode, with offsets (same routine) */
3799 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* from Unicode */
3800 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* from Unicode, with offsets (same routine) */
3801 NULL, /* presumably getNextUChar - not provided */
3802
3803 NULL, /* presumably getStarters - not provided */
3804 _ISO2022getName, /* get name */
3805 _ISO_2022_WriteSub, /* write substitution */
3806 _ISO_2022_SafeClone, /* safe clone */
3807 _ISO_2022_GetUnicodeSet /* get Unicode set */
3808 };
/*
 * Static descriptor for the "ISO_2022_JP" converter.
 *
 * NOTE(review): the initializer is positional. Field labels in the trailing
 * comments are assumptions based on the values; confirm them against the
 * UConverterStaticData declaration, which is not visible here.
 */
3809 static const UConverterStaticData _ISO2022JPStaticData={
3810 sizeof(UConverterStaticData), /* size of this structure */
3811 "ISO_2022_JP", /* converter name */
3812 0, /* presumably the codepage number (none assigned) - confirm */
3813 UCNV_IBM, /* platform */
3814 UCNV_ISO_2022, /* conversion type */
3815 1, /* presumably min bytes per char - confirm */
3816 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3817 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */
3818 1, /* presumably the length of the substitution sequence - confirm */
3819 FALSE, /* presumably "has to-Unicode fallback" - confirm */
3820 FALSE, /* presumably "has from-Unicode fallback" - confirm */
3821 0,
3822 0,
3823 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3824 };
/*
 * Shared-data record for ISO-2022-JP: ties _ISO2022JPStaticData to the
 * _ISO2022JPImpl function table. File-local (static).
 *
 * NOTE(review): the initializer is positional. Labels on the fields other
 * than the two pointers are assumptions; confirm them against the
 * UConverterSharedData declaration, which is not visible here.
 */
3825 static const UConverterSharedData _ISO2022JPData={
3826 sizeof(UConverterSharedData), /* size of this structure */
3827 ~((uint32_t) 0), /* all bits set; presumably the reference counter of a never-freed object - confirm */
3828 NULL, /* presumably data memory - none */
3829 NULL, /* presumably conversion table - none */
3830 &_ISO2022JPStaticData, /* static descriptor */
3831 FALSE, /* presumably "shared data is cached" flag - confirm */
3832 &_ISO2022JPImpl, /* function table */
3833 0
3834 };
3835
3836 /************* KR ***************/
/*
 * Function table for ISO-2022-KR conversion.
 *
 * Only the conversion routines are KR-specific. A single to-Unicode routine
 * and a single from-Unicode routine each fill both the plain and the
 * with-offsets slots. The remaining handlers are the ones shared by all
 * ISO-2022 tables in this file.
 *
 * NOTE(review): the per-slot labels in the trailing comments are read off
 * the handler names; the authoritative slot order is the UConverterImpl
 * declaration, which is not visible here - confirm against it.
 */
3837 static const UConverterImpl _ISO2022KRImpl={
3838 UCNV_ISO_2022, /* converter type */
3839
3840 NULL, /* presumably load - not provided */
3841 NULL, /* presumably unload - not provided */
3842
3843 _ISO2022Open, /* open */
3844 _ISO2022Close, /* close */
3845 _ISO2022Reset, /* reset */
3846
3847 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* to Unicode */
3848 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* to Unicode, with offsets (same routine) */
3849 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* from Unicode */
3850 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* from Unicode, with offsets (same routine) */
3851 NULL, /* presumably getNextUChar - not provided */
3852
3853 NULL, /* presumably getStarters - not provided */
3854 _ISO2022getName, /* get name */
3855 _ISO_2022_WriteSub, /* write substitution */
3856 _ISO_2022_SafeClone, /* safe clone */
3857 _ISO_2022_GetUnicodeSet /* get Unicode set */
3858 };
/*
 * Static descriptor for the "ISO_2022_KR" converter.
 *
 * NOTE(review): the initializer is positional. Field labels in the trailing
 * comments are assumptions based on the values; confirm them against the
 * UConverterStaticData declaration, which is not visible here.
 */
3859 static const UConverterStaticData _ISO2022KRStaticData={
3860 sizeof(UConverterStaticData), /* size of this structure */
3861 "ISO_2022_KR", /* converter name */
3862 0, /* presumably the codepage number (none assigned) - confirm */
3863 UCNV_IBM, /* platform */
3864 UCNV_ISO_2022, /* conversion type */
3865 1, /* presumably min bytes per char - confirm */
3866 3, /* max 3 bytes per UChar: SO+DBCS */
3867 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */
3868 1, /* presumably the length of the substitution sequence - confirm */
3869 FALSE, /* presumably "has to-Unicode fallback" - confirm */
3870 FALSE, /* presumably "has from-Unicode fallback" - confirm */
3871 0,
3872 0,
3873 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3874 };
/*
 * Shared-data record for ISO-2022-KR: ties _ISO2022KRStaticData to the
 * _ISO2022KRImpl function table. File-local (static).
 *
 * NOTE(review): the initializer is positional. Labels on the fields other
 * than the two pointers are assumptions; confirm them against the
 * UConverterSharedData declaration, which is not visible here.
 */
3875 static const UConverterSharedData _ISO2022KRData={
3876 sizeof(UConverterSharedData), /* size of this structure */
3877 ~((uint32_t) 0), /* all bits set; presumably the reference counter of a never-freed object - confirm */
3878 NULL, /* presumably data memory - none */
3879 NULL, /* presumably conversion table - none */
3880 &_ISO2022KRStaticData, /* static descriptor */
3881 FALSE, /* presumably "shared data is cached" flag - confirm */
3882 &_ISO2022KRImpl, /* function table */
3883 0
3884 };
3885
3886 /*************** CN ***************/
/*
 * Function table for ISO-2022-CN conversion.
 *
 * Only the conversion routines are CN-specific. A single to-Unicode routine
 * and a single from-Unicode routine each fill both the plain and the
 * with-offsets slots. The remaining handlers are the ones shared by all
 * ISO-2022 tables in this file.
 *
 * NOTE(review): the per-slot labels in the trailing comments are read off
 * the handler names; the authoritative slot order is the UConverterImpl
 * declaration, which is not visible here - confirm against it.
 */
3887 static const UConverterImpl _ISO2022CNImpl={
3888
3889 UCNV_ISO_2022, /* converter type */
3890
3891 NULL, /* presumably load - not provided */
3892 NULL, /* presumably unload - not provided */
3893
3894 _ISO2022Open, /* open */
3895 _ISO2022Close, /* close */
3896 _ISO2022Reset, /* reset */
3897
3898 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* to Unicode */
3899 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* to Unicode, with offsets (same routine) */
3900 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* from Unicode */
3901 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* from Unicode, with offsets (same routine) */
3902 NULL, /* presumably getNextUChar - not provided */
3903
3904 NULL, /* presumably getStarters - not provided */
3905 _ISO2022getName, /* get name */
3906 _ISO_2022_WriteSub, /* write substitution */
3907 _ISO_2022_SafeClone, /* safe clone */
3908 _ISO_2022_GetUnicodeSet /* get Unicode set */
3909 };
/*
 * Static descriptor for the "ISO_2022_CN" converter.
 *
 * NOTE(review): the initializer is positional. Field labels in the trailing
 * comments are assumptions based on the values; confirm them against the
 * UConverterStaticData declaration, which is not visible here.
 */
3910 static const UConverterStaticData _ISO2022CNStaticData={
3911 sizeof(UConverterStaticData), /* size of this structure */
3912 "ISO_2022_CN", /* converter name */
3913 0, /* presumably the codepage number (none assigned) - confirm */
3914 UCNV_IBM, /* platform */
3915 UCNV_ISO_2022, /* conversion type */
3916 1, /* presumably min bytes per char - confirm */
3917 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3918 { 0x1a, 0, 0, 0 }, /* presumably the substitution byte sequence - confirm */
3919 1, /* presumably the length of the substitution sequence - confirm */
3920 FALSE, /* presumably "has to-Unicode fallback" - confirm */
3921 FALSE, /* presumably "has from-Unicode fallback" - confirm */
3922 0,
3923 0,
3924 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3925 };
/*
 * Shared-data record for ISO-2022-CN: ties _ISO2022CNStaticData to the
 * _ISO2022CNImpl function table. File-local (static).
 *
 * NOTE(review): the initializer is positional. Labels on the fields other
 * than the two pointers are assumptions; confirm them against the
 * UConverterSharedData declaration, which is not visible here.
 */
3926 static const UConverterSharedData _ISO2022CNData={
3927 sizeof(UConverterSharedData), /* size of this structure */
3928 ~((uint32_t) 0), /* all bits set; presumably the reference counter of a never-freed object - confirm */
3929 NULL, /* presumably data memory - none */
3930 NULL, /* presumably conversion table - none */
3931 &_ISO2022CNStaticData, /* static descriptor */
3932 FALSE, /* presumably "shared data is cached" flag - confirm */
3933 &_ISO2022CNImpl, /* function table */
3934 0
3935 };
3936
3937
3938
3939 #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */
3940