1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
*   08/29/2000  Ram     Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
38 #include "ucnv_imp.h"
39 #include "ucnv_bld.h"
40 #include "ucnv_cnv.h"
41 #include "ucnvmbcs.h"
42 #include "cstring.h"
43 #include "cmemory.h"
44 #include "uassert.h"
45
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression in every context (e.g. as a sizeof operand).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78 #endif
79
/* Shift-In as a one-byte, NUL-terminated string. */
static const char SHIFT_IN_STR[]  = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";

/* C0 control characters and SPACE */
#define CR      0x0d
#define LF      0x0a
#define H_TAB   0x09
#define V_TAB   0x0b
#define SPACE   0x20

/* First and last code point of the half-width Katakana range. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
93
94 /*
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
101 */
/* Native (GR) byte ranges of 94- and 96-character sets. */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
108
109 /*
110 * ISO 2022 control codes must not be converted from Unicode
111 * because they would mess up the byte stream.
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113 * corresponding to SO, SI, and ESC.
114 */
/* Tests bit number c of the mask 0x0800c000 (bits 0xe, 0xf, 0x1b); c is evaluated twice. */
#define IS_2022_CONTROL(c) (((c)<0x20) && ((((uint32_t)0x0800c000>>(c))&1)!=0))
116
117 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups deliberately reuse the same small integers
 * (for example ISO8859_1 == GB2312_1 == 1).
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155
156 /* is the StateEnum charset value for a DBCS charset? */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* charset mask: a single bit at position cs */
#define CSM(cs) (((uint16_t)1)<<(cs))
160
161 /*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 * all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
170 enum { MAX_JA_VERSION=4 };
171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
177 };
178
/* Kind of converter recorded in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
187
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
193
194 #define UCNV_OPTIONS_VERSION_MASK 0xf
195 #define UCNV_2022_MAX_CONVERTERS 10
196
197 typedef struct{
198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
199 UConverter *currentConverter;
200 Cnv2022Type currentType;
201 ISO2022State toU2022State, fromU2022State;
202 uint32_t key;
203 uint32_t version;
204 #ifdef U_ENABLE_GENERIC_ISO_2022
205 UBool isFirstBuffer;
206 #endif
207 UBool isEmptySegment;
208 char name[30];
209 char locale[3];
210 }UConverterDataISO2022;
211
212 /* Protos */
213 /* ISO-2022 ----------------------------------------------------------------- */
214
215 /*Forward declaration */
216 U_CFUNC void
217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
218 UErrorCode * err);
219 U_CFUNC void
220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
221 UErrorCode * err);
222
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes matched so far against the escape sequence tables. */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
232
233 /*
234 * The way these state transition arrays work is:
235 * ex : ESC$B is the sequence for JISX208
236 * a) First Iteration: char is ESC
237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
238 * int x = normalize_esq_chars_2022[27] which is equal to 1
239 * ii) Search for this value in escSeqStateTable_Key_2022[]
240 * value of x is stored at escSeqStateTable_Key_2022[0]
241 * iii) Save this index as offset
242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
244 * b) Switch on this state and continue to next char
245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
246 * which is normalize_esq_chars_2022[36] == 4
247 * ii) x is currently 1(from above)
248 * x<<=5 -- x is now 32
249 * x+=normalize_esq_chars_2022[36]
250 * now x is 36
251 * iii) Search for this value in escSeqStateTable_Key_2022[]
252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
255 * c) Switch on this state and continue to next char
256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
257 * ii) x is currently 36 (from above)
258 * x<<=5 -- x is now 1152
259 * x+=normalize_esq_chars_2022[66]
260 * now x is 1161
261 * iii) Search for this value in escSeqStateTable_Key_2022[]
*             value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
*             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
*         v)  Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
266 */
267
268
269 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small code (1..29) used to build escape sequence
 * keys, 5 bits per byte; 0 means the byte cannot occur in any recognized
 * escape sequence. Laid out with 16 entries per row.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,   /* 0x1b ESC */
    /* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,   /* $ % &   ( ) * +   - . / */
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,   /* @ A .. O */
    /* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,   /* R, Z */
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xa0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xb0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xc0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xd0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xe0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xf0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
300
301 #ifdef U_ENABLE_GENERIC_ISO_2022
302 /*
303 * When the generic ISO-2022 converter is completely removed, not just disabled
304 * per #ifdef, then the following state table and the associated tables that are
305 * dimensioned with MAX_STATES_2022 should be trimmed.
306 *
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
308 * the associated escape sequences starting with ESC ( B should be removed.
309 * This includes the ones with key values 1097 and all of the ones above 1000000.
310 *
311 * For the latter, the tables can simply be truncated.
312 * For the former, since the tables must be kept parallel, it is probably best
313 * to simply duplicate an adjacent table cell, parallel in all tables.
314 *
315 * It may make sense to restructure the tables, especially by using small search
316 * tables for the variants instead of indexing them parallel to the table here.
317 */
318 #endif
319
#define MAX_STATES_2022 74

/*
 * Keys of all recognized escape sequence prefixes, in strictly ascending
 * order (getKey_2022() binary-searches this table). The index of a key is
 * the offset into the parallel tables dimensioned with MAX_STATES_2022.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,
    /*  5 */ 57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,
    /* 15 */ 1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,
    /* 25 */ 1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,
    /* 35 */ 36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,
    /* 45 */ 36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,
    /* 55 */ 37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624,
    /* 65 */ 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
333
334 #ifdef U_ENABLE_GENERIC_ISO_2022
335
336 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
337 /* 0 1 2 3 4 5 6 7 8 9 */
338
339 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
340 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
341 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
342 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
343 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
344 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
345 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
346 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
347 };
348
349 #endif
350
351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
352 /* 0 1 2 3 4 5 6 7 8 9 */
353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
361 };
362
363
364 /* Type def for refactoring changeState_2022 code*/
/* ISO-2022 variant selector passed to changeState_2022(). */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
373
374 /*********** ISO 2022 Converter Protos ***********/
375 static void
376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
377
378 static void
379 _ISO2022Close(UConverter *converter);
380
381 static void
382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
383
384 static const char*
385 _ISO2022getName(const UConverter* cnv);
386
387 static void
388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
389
390 static UConverter *
391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
392
393 #ifdef U_ENABLE_GENERIC_ISO_2022
394 static void
395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
396 #endif
397
398 namespace {
399
400 /*const UConverterSharedData _ISO2022Data;*/
401 extern const UConverterSharedData _ISO2022JPData;
402 extern const UConverterSharedData _ISO2022KRData;
403 extern const UConverterSharedData _ISO2022CNData;
404
405 } // namespace
406
407 /*************** Converter implementations ******************/
408
409 /* The purpose of this function is to get around gcc compiler warnings. */
410 static inline void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)411 fromUWriteUInt8(UConverter *cnv,
412 const char *bytes, int32_t length,
413 uint8_t **target, const char *targetLimit,
414 int32_t **offsets,
415 int32_t sourceIndex,
416 UErrorCode *pErrorCode)
417 {
418 char *targetChars = (char *)*target;
419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
420 offsets, sourceIndex, pErrorCode);
421 *target = (uint8_t*)targetChars;
422
423 }
424
425 static inline void
setInitialStateToUnicodeKR(UConverter *,UConverterDataISO2022 * myConverterData)426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
427 if(myConverterData->version == 1) {
428 UConverter *cnv = myConverterData->currentConverter;
429
430 cnv->toUnicodeStatus=0; /* offset */
431 cnv->mode=0; /* state */
432 cnv->toULength=0; /* byteIndex */
433 }
434 }
435
436 static inline void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
438 /* in ISO-2022-KR the designator sequence appears only once
439 * in a file so we append it only once
440 */
441 if( converter->charErrorBufferLength==0){
442
443 converter->charErrorBufferLength = 4;
444 converter->charErrorBuffer[0] = 0x1b;
445 converter->charErrorBuffer[1] = 0x24;
446 converter->charErrorBuffer[2] = 0x29;
447 converter->charErrorBuffer[3] = 0x43;
448 }
449 if(myConverterData->version == 1) {
450 UConverter *cnv = myConverterData->currentConverter;
451
452 cnv->fromUChar32=0;
453 cnv->fromUnicodeStatus=1; /* prevLength */
454 }
455 }
456
457 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
459
460 char myLocale[6]={' ',' ',' ',' ',' ',' '};
461
462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
463 if(cnv->extraInfo != NULL) {
464 UConverterNamePieces stackPieces;
465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
467 uint32_t version;
468
469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
470
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
472 myConverterData->currentType = ASCII1;
473 cnv->fromUnicodeStatus =FALSE;
474 if(pArgs->locale){
475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
476 }
477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
478 myConverterData->version = version;
479 /* Begin Google-specific change. */
480 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
481 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
482 if((myLocale[0]=='j' &&
483 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
484 myLocale[1]=='s') &&
485 (myLocale[2]=='_' || myLocale[2]=='\0')))
486 {
487 size_t len=0;
488 /* open the required converters and cache them */
489 if(version>MAX_JA_VERSION) {
490 /* prevent indexing beyond jpCharsetMasks[] */
491 myConverterData->version = version = 0;
492 }
493 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
494 myConverterData->myConverterArray[ISO8859_7] =
495 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
496 }
497 if (myLocale[1]=='k') { /* Use KDDI's version. */
498 myConverterData->myConverterArray[JISX208] =
499 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
500 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */
501 myConverterData->myConverterArray[JISX208] =
502 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
503 } else {
504 /*
505 * Change for http://b/issue?id=937017 :
506 * Restore JIS X 0208 ISO-2022-JP mappings from before
507 * sharing the table with the Shift-JIS converter
508 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797).
509 * TODO(mscherer): Create and use a new, unified Google Shift-JIS
510 * table for both Shift-JIS and ISO-2022-JP.
511 */
512 myConverterData->myConverterArray[JISX208] =
513 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode);
514 }
515 /* End Google-specific change. */
516 /* END android-changed */
517
518 if(jpCharsetMasks[version]&CSM(JISX212)) {
519 myConverterData->myConverterArray[JISX212] =
520 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
521 }
522 if(jpCharsetMasks[version]&CSM(GB2312)) {
523 myConverterData->myConverterArray[GB2312] =
524 /* BEGIN android-changed */
525 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
526 /* END android-changed */
527 }
528 if(jpCharsetMasks[version]&CSM(KSC5601)) {
529 myConverterData->myConverterArray[KSC5601] =
530 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
531 }
532
533 /* set the function pointers to appropriate funtions */
534 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
535 uprv_strcpy(myConverterData->locale,"ja");
536
537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
538 len = uprv_strlen(myConverterData->name);
539 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
540 myConverterData->name[len+1]='\0';
541 }
542 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
543 (myLocale[2]=='_' || myLocale[2]=='\0'))
544 {
545 const char *cnvName;
546 if(version==1) {
547 cnvName="icu-internal-25546";
548 } else {
549 /* BEGIN android-changed */
550 cnvName="ksc_5601";
551 /* END android-changed */
552 myConverterData->version=version=0;
553 }
554 if(pArgs->onlyTestIsLoadable) {
555 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
556 uprv_free(cnv->extraInfo);
557 cnv->extraInfo=NULL;
558 return;
559 } else {
560 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
561 if (U_FAILURE(*errorCode)) {
562 _ISO2022Close(cnv);
563 return;
564 }
565
566 if(version==1) {
567 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
568 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
569 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
570 }else{
571 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
572 }
573
574 /* initialize the state variables */
575 setInitialStateToUnicodeKR(cnv, myConverterData);
576 setInitialStateFromUnicodeKR(cnv, myConverterData);
577
578 /* set the function pointers to appropriate funtions */
579 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
580 uprv_strcpy(myConverterData->locale,"ko");
581 }
582 }
583 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
584 (myLocale[2]=='_' || myLocale[2]=='\0'))
585 {
586
587 /* open the required converters and cache them */
588 /* BEGIN android-changed */
589 myConverterData->myConverterArray[GB2312_1] =
590 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
591 if(version==1) {
592 myConverterData->myConverterArray[ISO_IR_165] =
593 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
594 }
595 myConverterData->myConverterArray[CNS_11643] =
596 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
597 /* END android-changed */
598
599
600 /* set the function pointers to appropriate funtions */
601 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
602 uprv_strcpy(myConverterData->locale,"cn");
603
604 if (version==0){
605 myConverterData->version = 0;
606 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
607 }else if (version==1){
608 myConverterData->version = 1;
609 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
610 }else {
611 myConverterData->version = 2;
612 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
613 }
614 }
615 else{
616 #ifdef U_ENABLE_GENERIC_ISO_2022
617 myConverterData->isFirstBuffer = TRUE;
618
619 /* append the UTF-8 escape sequence */
620 cnv->charErrorBufferLength = 3;
621 cnv->charErrorBuffer[0] = 0x1b;
622 cnv->charErrorBuffer[1] = 0x25;
623 cnv->charErrorBuffer[2] = 0x42;
624
625 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
626 /* initialize the state variables */
627 uprv_strcpy(myConverterData->name,"ISO_2022");
628 #else
629 *errorCode = U_UNSUPPORTED_ERROR;
630 return;
631 #endif
632 }
633
634 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
635
636 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
637 _ISO2022Close(cnv);
638 }
639 } else {
640 *errorCode = U_MEMORY_ALLOCATION_ERROR;
641 }
642 }
643
644
645 static void
_ISO2022Close(UConverter * converter)646 _ISO2022Close(UConverter *converter) {
647 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
648 UConverterSharedData **array = myData->myConverterArray;
649 int32_t i;
650
651 if (converter->extraInfo != NULL) {
652 /*close the array of converter pointers and free the memory*/
653 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
654 if(array[i]!=NULL) {
655 ucnv_unloadSharedDataIfReady(array[i]);
656 }
657 }
658
659 ucnv_close(myData->currentConverter);
660
661 if(!converter->isExtraLocal){
662 uprv_free (converter->extraInfo);
663 converter->extraInfo = NULL;
664 }
665 }
666 }
667
668 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)669 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
670 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
671 if(choice<=UCNV_RESET_TO_UNICODE) {
672 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
673 myConverterData->key = 0;
674 myConverterData->isEmptySegment = FALSE;
675 }
676 if(choice!=UCNV_RESET_TO_UNICODE) {
677 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
678 }
679 #ifdef U_ENABLE_GENERIC_ISO_2022
680 if(myConverterData->locale[0] == 0){
681 if(choice<=UCNV_RESET_TO_UNICODE) {
682 myConverterData->isFirstBuffer = TRUE;
683 myConverterData->key = 0;
684 if (converter->mode == UCNV_SO){
685 ucnv_close (myConverterData->currentConverter);
686 myConverterData->currentConverter=NULL;
687 }
688 converter->mode = UCNV_SI;
689 }
690 if(choice!=UCNV_RESET_TO_UNICODE) {
691 /* re-append UTF-8 escape sequence */
692 converter->charErrorBufferLength = 3;
693 converter->charErrorBuffer[0] = 0x1b;
694 converter->charErrorBuffer[1] = 0x28;
695 converter->charErrorBuffer[2] = 0x42;
696 }
697 }
698 else
699 #endif
700 {
701 /* reset the state variables */
702 if(myConverterData->locale[0] == 'k'){
703 if(choice<=UCNV_RESET_TO_UNICODE) {
704 setInitialStateToUnicodeKR(converter, myConverterData);
705 }
706 if(choice!=UCNV_RESET_TO_UNICODE) {
707 setInitialStateFromUnicodeKR(converter, myConverterData);
708 }
709 }
710 }
711 }
712
713 static const char*
_ISO2022getName(const UConverter * cnv)714 _ISO2022getName(const UConverter* cnv){
715 if(cnv->extraInfo){
716 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
717 return myData->name;
718 }
719 return NULL;
720 }
721
722
723 /*************** to unicode *******************/
724 /****************************************************************************
725 * Recognized escape sequences are
726 * <ESC>(B ASCII
727 * <ESC>.A ISO-8859-1
728 * <ESC>.F ISO-8859-7
729 * <ESC>(J JISX-201
730 * <ESC>(I JISX-201
731 * <ESC>$B JISX-208
732 * <ESC>$@ JISX-208
733 * <ESC>$(D JISX-212
734 * <ESC>$A GB2312
735 * <ESC>$(C KSC5601
736 */
737 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
738 /* 0 1 2 3 4 5 6 7 8 9 */
739 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
740 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
741 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
742 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
743 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
744 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
745 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
746 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
747 };
748
749 /*************** to unicode *******************/
750 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
751 /* 0 1 2 3 4 5 6 7 8 9 */
752 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
753 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
754 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
755 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
756 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
757 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
758 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
759 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
760 };
761
762
763 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)764 getKey_2022(char c,int32_t* key,int32_t* offset){
765 int32_t togo;
766 int32_t low = 0;
767 int32_t hi = MAX_STATES_2022;
768 int32_t oldmid=0;
769
770 togo = normalize_esq_chars_2022[(uint8_t)c];
771 if(togo == 0) {
772 /* not a valid character anywhere in an escape sequence */
773 *key = 0;
774 *offset = 0;
775 return INVALID_2022;
776 }
777 togo = (*key << 5) + togo;
778
779 while (hi != low) /*binary search*/{
780
781 register int32_t mid = (hi+low) >> 1; /*Finds median*/
782
783 if (mid == oldmid)
784 break;
785
786 if (escSeqStateTable_Key_2022[mid] > togo){
787 hi = mid;
788 }
789 else if (escSeqStateTable_Key_2022[mid] < togo){
790 low = mid;
791 }
792 else /*we found it*/{
793 *key = togo;
794 *offset = mid;
795 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
796 }
797 oldmid = mid;
798
799 }
800
801 *key = 0;
802 *offset = 0;
803 return INVALID_2022;
804 }
805
/*runs through a state machine to determine the escape sequence - codepage correspondence
 */
/*
 * Consumes escape-sequence bytes from *source (up to sourceLimit), feeding each
 * byte to getKey_2022() and appending it to _this->toUBytes[].
 *
 * - If the input ends while the bytes seen so far are still a valid prefix, the
 *   partial key is stored in myData2022->key and the function returns without
 *   touching *err, so that a later call can continue the sequence.
 * - If the bytes do not form a known sequence, *err is set to
 *   U_ILLEGAL_ESCAPE_SEQUENCE and every byte except the first one is backed out
 *   (see the Ticket 5691 comment below).
 * - If the sequence is complete, it is applied according to var. Sequences that
 *   the variant/version does not accept set U_UNSUPPORTED_ESCAPE_SEQUENCE;
 *   SS2/SS3 without a previously designated charset set U_ILLEGAL_ESCAPE_SEQUENCE.
 *
 * On success _this->toULength is reset to 0.
 */
808 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)809 changeState_2022(UConverter* _this,
810 const char** source,
811 const char* sourceLimit,
812 Variant2022 var,
813 UErrorCode* err){
814 UCNV_TableStates_2022 value;
815 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
/* resume from the key saved by an earlier call (0 when starting a new sequence) */
816 uint32_t key = myData2022->key;
817 int32_t offset = 0;
/* number of bytes already buffered in toUBytes[] before this call; needed when backing out */
818 int8_t initialToULength = _this->toULength;
819 char c;
820
821 value = VALID_NON_TERMINAL_2022;
822 while (*source < sourceLimit) {
/* consume one byte and remember it so that a failed sequence can be reported or replayed */
823 c = *(*source)++;
824 _this->toUBytes[_this->toULength++]=(uint8_t)c;
825 value = getKey_2022(c,(int32_t *) &key, &offset);
826
827 switch (value){
828
829 case VALID_NON_TERMINAL_2022 :
830 /* continue with the loop */
831 break;
832
833 case VALID_TERMINAL_2022:
834 key = 0;
835 goto DONE;
836
837 case INVALID_2022:
838 goto DONE;
839
840 case VALID_MAYBE_TERMINAL_2022:
841 #ifdef U_ENABLE_GENERIC_ISO_2022
842 /* ESC ( B is ambiguous only for ISO_2022 itself */
843 if(var == ISO_2022) {
844 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
845 _this->toULength = 0;
846
847 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
848
849 /* continue with the loop */
850 value = VALID_NON_TERMINAL_2022;
851 break;
852 } else
853 #endif
854 {
855 /* not ISO_2022 itself, finish here */
856 value = VALID_TERMINAL_2022;
857 key = 0;
858 goto DONE;
859 }
860 }
861 }
862
863 DONE:
/* persist the key; it is non-zero only while the sequence may still continue */
864 myData2022->key = key;
865
866 if (value == VALID_NON_TERMINAL_2022) {
867 /* indicate that the escape sequence is incomplete: key!=0 */
868 return;
869 } else if (value == INVALID_2022 ) {
870 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
871 } else /* value == VALID_TERMINAL_2022 */ {
/* offset is the index of the matched table entry; each variant maps it to its own state */
872 switch(var){
873 #ifdef U_ENABLE_GENERIC_ISO_2022
874 case ISO_2022:
875 {
876 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
877 if(chosenConverterName == NULL) {
878 /* SS2 or SS3 */
879 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
880 _this->toUCallbackReason = UCNV_UNASSIGNED;
881 return;
882 }
883
884 _this->mode = UCNV_SI;
885 ucnv_close(myData2022->currentConverter);
886 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
887 if(U_SUCCESS(*err)) {
888 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
889 _this->mode = UCNV_SO;
890 }
891 break;
892 }
893 #endif
894 case ISO_2022_JP:
895 {
896 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
897 switch(tempState) {
898 case INVALID_STATE:
899 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
900 break;
901 case SS2_STATE:
902 if(myData2022->toU2022State.cs[2]!=0) {
/* remember the current g in prevG (only when it is below 2) before switching to G2 */
903 if(myData2022->toU2022State.g<2) {
904 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
905 }
906 myData2022->toU2022State.g=2;
907 } else {
908 /* illegal to have SS2 before a matching designator */
909 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
910 }
911 break;
912 /* case SS3_STATE: not used in ISO-2022-JP-x */
913 case ISO8859_1:
914 case ISO8859_7:
/* the charset must be enabled in the mask of this ISO-2022-JP version */
915 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
916 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917 } else {
918 /* G2 charset for SS2 */
919 myData2022->toU2022State.cs[2]=(int8_t)tempState;
920 }
921 break;
922 default:
923 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
924 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
925 } else {
926 /* G0 charset */
927 myData2022->toU2022State.cs[0]=(int8_t)tempState;
928 }
929 break;
930 }
931 }
932 break;
933 case ISO_2022_CN:
934 {
935 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
936 switch(tempState) {
937 case INVALID_STATE:
938 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
939 break;
940 case SS2_STATE:
941 if(myData2022->toU2022State.cs[2]!=0) {
942 if(myData2022->toU2022State.g<2) {
943 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
944 }
945 myData2022->toU2022State.g=2;
946 } else {
947 /* illegal to have SS2 before a matching designator */
948 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
949 }
950 break;
951 case SS3_STATE:
952 if(myData2022->toU2022State.cs[3]!=0) {
953 if(myData2022->toU2022State.g<2) {
954 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
955 }
956 myData2022->toU2022State.g=3;
957 } else {
958 /* illegal to have SS3 before a matching designator */
959 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
960 }
961 break;
962 case ISO_IR_165:
/* ISO-IR-165 is rejected for version 0; otherwise it is stored in cs[1] like GB2312_1 */
963 if(myData2022->version==0) {
964 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
965 break;
966 }
967 /*fall through*/
968 case GB2312_1:
969 /*fall through*/
970 case CNS_11643_1:
971 myData2022->toU2022State.cs[1]=(int8_t)tempState;
972 break;
973 case CNS_11643_2:
974 myData2022->toU2022State.cs[2]=(int8_t)tempState;
975 break;
976 default:
977 /* other CNS 11643 planes */
978 if(myData2022->version==0) {
979 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
980 } else {
981 myData2022->toU2022State.cs[3]=(int8_t)tempState;
982 }
983 break;
984 }
985 }
986 break;
987 case ISO_2022_KR:
/* ISO-2022-KR accepts only the escape sequence stored at table offset 0x30 */
988 if(offset==0x30){
989 /* nothing to be done, just accept this one escape sequence */
990 } else {
991 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
992 }
993 break;
994
995 default:
996 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
997 break;
998 }
999 }
/* a complete, accepted sequence has been consumed: drop the buffered bytes */
1000 if(U_SUCCESS(*err)) {
1001 _this->toULength = 0;
1002 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1003 if(_this->toULength>1) {
1004 /*
1005 * Ticket 5691: consistent illegal sequences:
1006 * - We include at least the first byte (ESC) in the illegal sequence.
1007 * - If any of the non-initial bytes could be the start of a character,
1008 * we stop the illegal sequence before the first one of those.
1009 * In escape sequences, all following bytes are "printable", that is,
1010 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1011 * they are valid single/lead bytes.
1012 * For simplicity, we always only report the initial ESC byte as the
1013 * illegal sequence and back out all other bytes we looked at.
1014 */
1015 /* Back out some bytes. */
1016 int8_t backOutDistance=_this->toULength-1;
1017 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1018 if(backOutDistance<=bytesFromThisBuffer) {
1019 /* same as initialToULength<=1 */
1020 *source-=backOutDistance;
1021 } else {
1022 /* Back out bytes from the previous buffer: Need to replay them. */
1023 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1024 /* same as -(initialToULength-1) */
1025 /* preToULength is negative! */
1026 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1027 *source-=bytesFromThisBuffer;
1028 }
/* only the first buffered byte remains as the reported illegal sequence */
1029 _this->toULength=1;
1030 }
1031 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
/* unsupported sequences are flagged with the "unassigned" callback reason */
1032 _this->toUCallbackReason = UCNV_UNASSIGNED;
1033 }
1034 }
1035
1036 /*Checks the characters of the buffer against valid 2022 escape sequences
1037 *if the match we return a pointer to the initial start of the sequence otherwise
1038 *we return sourceLimit
1039 */
1040 /*for 2022 looks ahead in the stream
1041 *to determine the longest possible convertible
1042 *data stream
1043 */
1044 static inline const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool)1045 getEndOfBuffer_2022(const char** source,
1046 const char* sourceLimit,
1047 UBool /*flush*/){
1048
1049 const char* mySource = *source;
1050
1051 #ifdef U_ENABLE_GENERIC_ISO_2022
1052 if (*source >= sourceLimit)
1053 return sourceLimit;
1054
1055 do{
1056
1057 if (*mySource == ESC_2022){
1058 int8_t i;
1059 int32_t key = 0;
1060 int32_t offset;
1061 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1062
1063 /* Kludge: I could not
1064 * figure out the reason for validating an escape sequence
1065 * twice - once here and once in changeState_2022().
1066 * is it possible to have an ESC character in a ISO2022
1067 * byte stream which is valid in a code page? Is it legal?
1068 */
1069 for (i=0;
1070 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1071 i++) {
1072 value = getKey_2022(*(mySource+i), &key, &offset);
1073 }
1074 if (value > 0 || *mySource==ESC_2022)
1075 return mySource;
1076
1077 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1078 return sourceLimit;
1079 }
1080 }while (++mySource < sourceLimit);
1081
1082 return sourceLimit;
1083 #else
1084 while(mySource < sourceLimit && *mySource != ESC_2022) {
1085 ++mySource;
1086 }
1087 return mySource;
1088 #endif
1089 }
1090
1091
1092 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1093 * any future change in _MBCSFromUChar32() function should be reflected here.
1094 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1095 */
1096 static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1097 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1098 UChar32 c,
1099 uint32_t* value,
1100 UBool useFallback,
1101 int outputType)
1102 {
1103 const int32_t *cx;
1104 const uint16_t *table;
1105 uint32_t stage2Entry;
1106 uint32_t myValue;
1107 int32_t length;
1108 const uint8_t *p;
1109 /*
1110 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1111 * Use internal version of ucnv_open() that verifies that the new structures are available,
1112 * else U_INTERNAL_PROGRAM_ERROR.
1113 */
1114 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1115 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1116 table=sharedData->mbcs.fromUnicodeTable;
1117 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1118 /* get the bytes and the length for the output */
1119 if(outputType==MBCS_OUTPUT_2){
1120 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1121 if(myValue<=0xff) {
1122 length=1;
1123 } else {
1124 length=2;
1125 }
1126 } else /* outputType==MBCS_OUTPUT_3 */ {
1127 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1128 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1129 if(myValue<=0xff) {
1130 length=1;
1131 } else if(myValue<=0xffff) {
1132 length=2;
1133 } else {
1134 length=3;
1135 }
1136 }
1137 /* is this code point assigned, or do we use fallbacks? */
1138 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1139 /* assigned */
1140 *value=myValue;
1141 return length;
1142 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1143 /*
1144 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1145 * There is no way with this data structure for fallback output
1146 * to be a zero byte.
1147 */
1148 *value=myValue;
1149 return -length;
1150 }
1151 }
1152
1153 cx=sharedData->mbcs.extIndexes;
1154 if(cx!=NULL) {
1155 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1156 }
1157
1158 /* unassigned */
1159 return 0;
1160 }
1161
1162 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1163 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1164 * @param retval pointer to output byte
1165 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1166 */
1167 static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1168 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1169 UChar32 c,
1170 uint32_t* retval,
1171 UBool useFallback)
1172 {
1173 const uint16_t *table;
1174 int32_t value;
1175 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1176 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1177 return 0;
1178 }
1179 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1180 table=sharedData->mbcs.fromUnicodeTable;
1181 /* get the byte for the output */
1182 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1183 /* is this code point assigned, or do we use fallbacks? */
1184 *retval=(uint32_t)(value&0xff);
1185 if(value>=0xf00) {
1186 return 1; /* roundtrip */
1187 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1188 return -1; /* fallback taken */
1189 } else {
1190 return 0; /* no mapping */
1191 }
1192 }
1193
/*
 * Accept only values whose two low bytes both lie in A1..FE (strict EUC DBCS)
 * and move them to the ISO 2022 range 21..7E by subtracting 0x80 from each of
 * those bytes.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint8_t lead = (uint8_t)(value >> 8);
    const uint8_t trail = (uint8_t)value;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
1210
1211 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/* NOTE: everything up to the matching #endif is compiled out by the #if 0 above. */
1212 /*
1213 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1214 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1215 * unchanged.
1216 */
1217 static inline uint32_t
1218 _2022ToGR94DBCS(uint32_t value) {
/* shift both bytes up by 0x80, then keep the result only if each byte is in A1..FE */
1219 uint32_t returnValue = value + 0x8080;
1220 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1221 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
1222 return returnValue;
1223 } else {
1224 return value;
1225 }
1226 }
1227 #endif
1228
1229 #ifdef U_ENABLE_GENERIC_ISO_2022
1230
1231 /**********************************************************************************
1232 * ISO-2022 Converter
1233 *
1234 *
1235 */
1236
/*
 * toUnicode implementation of the generic ISO-2022 converter (only compiled
 * when U_ENABLE_GENERIC_ISO_2022 is defined).
 *
 * Repeats until args->source reaches args->sourceLimit:
 *  1. If no escape sequence is pending (myData->key == 0), find the next ESC
 *     with getEndOfBuffer_2022() and convert the bytes before it with
 *     myData->currentConverter via ucnv_toUnicode(); an "ASCII" converter is
 *     opened first if there is no current converter yet.
 *  2. Hand the escape sequence to changeState_2022().
 *
 * Returns early on errors, on target overflow (after moving the sub-converter's
 * overflow buffer into this converter), when the whole input has been consumed,
 * or when offsets are requested and something was converted or consumed.
 */
1237 static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)1238 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1239 UErrorCode* err){
1240 const char* mySourceLimit, *realSourceLimit;
1241 const char* sourceStart;
1242 const UChar* myTargetStart;
1243 UConverter* saveThis;
1244 UConverterDataISO2022* myData;
1245 int8_t length;
1246
/* args->converter is temporarily replaced below; saveThis keeps the ISO-2022 converter itself */
1247 saveThis = args->converter;
1248 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1249
1250 realSourceLimit = args->sourceLimit;
1251 while (args->source < realSourceLimit) {
1252 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1253 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1254 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1255
1256 if(args->source < mySourceLimit) {
/* no converter selected yet: open an "ASCII" converter as the current one */
1257 if(myData->currentConverter==NULL) {
1258 myData->currentConverter = ucnv_open("ASCII",err);
1259 if(U_FAILURE(*err)){
1260 return;
1261 }
1262
1263 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1264 saveThis->mode = UCNV_SO;
1265 }
1266
1267 /* convert to before the ESC or until the end of the buffer */
1268 myData->isFirstBuffer=FALSE;
1269 sourceStart = args->source;
1270 myTargetStart = args->target;
1271 args->converter = myData->currentConverter;
/* flush is only passed on when this segment extends to the real end of the input */
1272 ucnv_toUnicode(args->converter,
1273 &args->target,
1274 args->targetLimit,
1275 &args->source,
1276 mySourceLimit,
1277 args->offsets,
1278 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1279 err);
1280 args->converter = saveThis;
1281
1282 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1283 /* move the overflow buffer */
1284 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1285 myData->currentConverter->UCharErrorBufferLength = 0;
1286 if(length > 0) {
1287 uprv_memcpy(saveThis->UCharErrorBuffer,
1288 myData->currentConverter->UCharErrorBuffer,
1289 length*U_SIZEOF_UCHAR);
1290 }
1291 return;
1292 }
1293
1294 /*
1295 * At least one of:
1296 * -Error while converting
1297 * -Done with entire buffer
1298 * -Need to write offsets or update the current offset
1299 * (leave that up to the code in ucnv.c)
1300 *
1301 * or else we just stopped at an ESC byte and continue with changeState_2022()
1302 */
1303 if (U_FAILURE(*err) ||
1304 (args->source == realSourceLimit) ||
1305 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1306 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1307 ) {
1308 /* copy partial or error input for truncated detection and error handling */
1309 if(U_FAILURE(*err)) {
1310 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1311 if(length > 0) {
1312 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1313 }
1314 } else {
1315 length = saveThis->toULength = myData->currentConverter->toULength;
1316 if(length > 0) {
1317 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1318 if(args->source < mySourceLimit) {
1319 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1320 }
1321 }
1322 }
1323 return;
1324 }
1325 }
1326 }
1327
/* at an ESC byte, or continuing a pending escape sequence: let the state machine consume it */
1328 sourceStart = args->source;
1329 changeState_2022(args->converter,
1330 &(args->source),
1331 realSourceLimit,
1332 ISO_2022,
1333 err);
1334 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1335 /* let the ucnv.c code update its current offset */
1336 return;
1337 }
1338 }
1339 }
1340
1341 #endif
1342
1343 /*
1344 * To Unicode Callback helper function
1345 */
1346 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1347 toUnicodeCallback(UConverter *cnv,
1348 const uint32_t sourceChar, const uint32_t targetUniChar,
1349 UErrorCode* err){
1350 if(sourceChar>0xff){
1351 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1352 cnv->toUBytes[1] = (uint8_t)sourceChar;
1353 cnv->toULength = 2;
1354 }
1355 else{
1356 cnv->toUBytes[0] =(char) sourceChar;
1357 cnv->toULength = 1;
1358 }
1359
1360 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1361 *err = U_INVALID_CHAR_FOUND;
1362 }
1363 else{
1364 *err = U_ILLEGAL_CHAR_FOUND;
1365 }
1366 }
1367
1368 /**************************************ISO-2022-JP*************************************************/
1369
1370 /************************************** IMPORTANT **************************************************
1371 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1372 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1373 * The converter iterates over each Unicode codepoint
1374 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1375 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1376 * would do as far as possible.
1377 *
1378 * If the implementation of these macros or structure of sharedData struct change in the future, make
1379 * sure that ISO-2022 is also changed.
1380 ***************************************************************************************************
1381 */
1382
1383 /***************************************************************************************************
1384 * Rules for ISO-2022-jp encoding
1385 * (i) Escape sequences must be fully contained within a line they should not
1386 * span new lines or CRs
1387 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1388 * JIS-Roman character escape sequence should follow before the line terminates
1389 * (iii) If the first character on the line is represented by two bytes then a two
1390 * byte character escape sequence should precede it
1391 * (iv) If no escape sequence is encountered then the characters are ASCII
1392 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1393 * and invoked with SS2 (ESC N).
1394 * (vi) If there is any G0 designation in text, there must be a switch to
1395 * ASCII or to JIS X 0201-Roman before a space character (but not
1396 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1397 * characters such as tab or CRLF.
1398 * (vii) Supported encodings:
1399 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1400 *
1401 * source : RFC-1554
1402 *
1403 * JISX201, JISX208,JISX212 : new .cnv data files created
1404 * KSC5601 : alias to ibm-949 mapping table
1405 * GB2312 : alias to ibm-1386 mapping table
1406 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1407 * ISO-8859-7 : alias to ibm-9409 mapping table
1408 */
1409
1410 /* preference order of JP charsets */
/*
 * The fromUnicode code walks this list in order to append the charsets that are
 * allowed but not yet tried to its choices[], after the current G0/G2 charsets.
 */
1411 static const StateEnum jpCharsetPref[]={
1412 ASCII,
1413 JISX201,
1414 ISO8859_1,
1415 ISO8859_7,
1416 JISX208,
1417 JISX212,
1418 GB2312,
1419 KSC5601,
1420 HWKANA_7BIT
1421 };
1422
1423 /*
1424 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1425 * not in order of jpCharsetPref[]!
1426 */
/* Each entry is a NUL-terminated designation sequence of at most 4 bytes, stored in a 6-byte slot. */
1427 static const char escSeqChars[][6] ={
1428 "\x1B\x28\x42", /* <ESC>(B ASCII */
1429 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1430 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1431 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1432 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1433 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1434 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1435 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1436 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1437
1438 };
/* Byte lengths of the escSeqChars[] entries, in the same index order. */
1439 static const int8_t escSeqCharsLen[] ={
1440 3, /* length of <ESC>(B ASCII */
1441 3, /* length of <ESC>.A ISO-8859-1 */
1442 3, /* length of <ESC>.F ISO-8859-7 */
1443 3, /* length of <ESC>(J JISX-201 */
1444 3, /* length of <ESC>$B JISX-208 */
1445 4, /* length of <ESC>$(D JISX-212 */
1446 3, /* length of <ESC>$A GB2312 */
1447 4, /* length of <ESC>$(C KSC5601 */
1448 3 /* length of <ESC>(I HWKANA_7BIT */
1449 };
1450
1451 /*
1452 * The iteration over various code pages works this way:
1453 * i) Get the currentState from myConverterData->currentState
1454 * ii) Check if the character is mapped to a valid character in the currentState
1455 * Yes -> a) set the initIterState to currentState
1456 * b) remain in this state until an invalid character is found
1457 * No -> a) go to the next code page and find the character
1458 * iii) Before changing the state, increment the current state and check if it
1459 * is equal to the initIterState
1460 * Yes -> A character that cannot be represented in any of the supported encodings
1461 * break and return a U_INVALID_CHARACTER error
1462 * No -> Continue and find the character in next code page
1463 *
1464 *
1465 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1466 */
1467
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only 0x5c (-> U+00A5) and 0x7e (-> U+203E) differ from the input value;
 * every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1481
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the byte values 0x5c and 0x7e, so U+005C and U+007E
 * themselves are unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1496
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t row;
    uint8_t low;

    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;

    /* lead byte: remove the Shift-JIS lead offset, then double it */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000; /* else branch: 0xe000 <= row <= 0xef00 */
    row <<= 1;

    if(low > 0x9e) {
        /* trail bytes 9f..fc keep the doubled row as is */
        return row | (low - 0x7e);
    }

    /* trail bytes up to 9e use the row below; offset 0x1f up to 7e, 0x20 above */
    row -= 0x100;
    return row | (low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1532
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1 & 1) == 0) {
        /* even lead byte: trail bytes 21..7e map to 9f..fc */
        lead = c1;
        trail = ((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) ? (uint8_t)(c2 + 0x7e) : 0;
    } else {
        /* odd lead byte: round up to the pair's even value before halving */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0; /* invalid */
        }
    }

    /* halve the lead byte and add the Shift-JIS lead offset for its range */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1569
1570 /*
1571 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1572 * Katakana.
1573 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1574 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1575 * These were the only fallbacks in ICU's jisx-208.ucm file.
1576 */
/*
 * Indexed by (code point - HWKANA_START), one entry per code point from
 * HWKANA_START to HWKANA_END; each value is the two-byte fallback that the
 * JISX208 case of the fromUnicode code uses as its target value.
 */
1577 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1578 0x2123, /* U+FF61 */
1579 0x2156,
1580 0x2157,
1581 0x2122,
1582 0x2126,
1583 0x2572,
1584 0x2521,
1585 0x2523,
1586 0x2525,
1587 0x2527,
1588 0x2529,
1589 0x2563,
1590 0x2565,
1591 0x2567,
1592 0x2543,
1593 0x213C, /* U+FF70 */
1594 0x2522,
1595 0x2524,
1596 0x2526,
1597 0x2528,
1598 0x252A,
1599 0x252B,
1600 0x252D,
1601 0x252F,
1602 0x2531,
1603 0x2533,
1604 0x2535,
1605 0x2537,
1606 0x2539,
1607 0x253B,
1608 0x253D,
1609 0x253F, /* U+FF80 */
1610 0x2541,
1611 0x2544,
1612 0x2546,
1613 0x2548,
1614 0x254A,
1615 0x254B,
1616 0x254C,
1617 0x254D,
1618 0x254E,
1619 0x254F,
1620 0x2552,
1621 0x2555,
1622 0x2558,
1623 0x255B,
1624 0x255E,
1625 0x255F, /* U+FF90 */
1626 0x2560,
1627 0x2561,
1628 0x2562,
1629 0x2564,
1630 0x2566,
1631 0x2568,
1632 0x2569,
1633 0x256A,
1634 0x256B,
1635 0x256C,
1636 0x256D,
1637 0x256F,
1638 0x2573,
1639 0x212B,
1640 0x212C /* U+FF9F */
1641 };
1642
1643 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1644 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1645 UConverter *cnv = args->converter;
1646 UConverterDataISO2022 *converterData;
1647 ISO2022State *pFromU2022State;
1648 uint8_t *target = (uint8_t *) args->target;
1649 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1650 const UChar* source = args->source;
1651 const UChar* sourceLimit = args->sourceLimit;
1652 int32_t* offsets = args->offsets;
1653 UChar32 sourceChar;
1654 char buffer[8];
1655 int32_t len, outLen;
1656 int8_t choices[10];
1657 int32_t choiceCount;
1658 uint32_t targetValue = 0;
1659 UBool useFallback;
1660
1661 int32_t i;
1662 int8_t cs, g;
1663
1664 /* set up the state */
1665 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1666 pFromU2022State = &converterData->fromU2022State;
1667
1668 choiceCount = 0;
1669
1670 /* check if the last codepoint of previous buffer was a lead surrogate*/
1671 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1672 goto getTrail;
1673 }
1674
1675 while(source < sourceLimit) {
1676 if(target < targetLimit) {
1677
1678 sourceChar = *(source++);
1679 /*check if the char is a First surrogate*/
1680 if(U16_IS_SURROGATE(sourceChar)) {
1681 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1682 getTrail:
1683 /*look ahead to find the trail surrogate*/
1684 if(source < sourceLimit) {
1685 /* test the following code unit */
1686 UChar trail=(UChar) *source;
1687 if(U16_IS_TRAIL(trail)) {
1688 source++;
1689 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1690 cnv->fromUChar32=0x00;
1691 /* convert this supplementary code point */
1692 /* exit this condition tree */
1693 } else {
1694 /* this is an unmatched lead code unit (1st surrogate) */
1695 /* callback(illegal) */
1696 *err=U_ILLEGAL_CHAR_FOUND;
1697 cnv->fromUChar32=sourceChar;
1698 break;
1699 }
1700 } else {
1701 /* no more input */
1702 cnv->fromUChar32=sourceChar;
1703 break;
1704 }
1705 } else {
1706 /* this is an unmatched trail code unit (2nd surrogate) */
1707 /* callback(illegal) */
1708 *err=U_ILLEGAL_CHAR_FOUND;
1709 cnv->fromUChar32=sourceChar;
1710 break;
1711 }
1712 }
1713
1714 /* do not convert SO/SI/ESC */
1715 if(IS_2022_CONTROL(sourceChar)) {
1716 /* callback(illegal) */
1717 *err=U_ILLEGAL_CHAR_FOUND;
1718 cnv->fromUChar32=sourceChar;
1719 break;
1720 }
1721
1722 /* do the conversion */
1723
1724 if(choiceCount == 0) {
1725 uint16_t csm;
1726
1727 /*
1728 * The csm variable keeps track of which charsets are allowed
1729 * and not used yet while building the choices[].
1730 */
1731 csm = jpCharsetMasks[converterData->version];
1732 choiceCount = 0;
1733
1734 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1735 if(converterData->version == 3 || converterData->version == 4) {
1736 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1737 }
1738 /* Do not try single-byte half-width Katakana for other versions. */
1739 csm &= ~CSM(HWKANA_7BIT);
1740
1741 /* try the current G0 charset */
1742 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1743 csm &= ~CSM(cs);
1744
1745 /* try the current G2 charset */
1746 if((cs = pFromU2022State->cs[2]) != 0) {
1747 choices[choiceCount++] = cs;
1748 csm &= ~CSM(cs);
1749 }
1750
1751 /* try all the other possible charsets */
1752 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1753 cs = (int8_t)jpCharsetPref[i];
1754 if(CSM(cs) & csm) {
1755 choices[choiceCount++] = cs;
1756 csm &= ~CSM(cs);
1757 }
1758 }
1759 }
1760
1761 cs = g = 0;
1762 /*
1763 * len==0: no mapping found yet
1764 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1765 * len>0: found a roundtrip result, done
1766 */
1767 len = 0;
1768 /*
1769 * We will turn off useFallback after finding a fallback,
1770 * but we still get fallbacks from PUA code points as usual.
1771 * Therefore, we will also need to check that we don't overwrite
1772 * an early fallback with a later one.
1773 */
1774 useFallback = cnv->useFallback;
1775
1776 for(i = 0; i < choiceCount && len <= 0; ++i) {
1777 uint32_t value;
1778 int32_t len2;
1779 int8_t cs0 = choices[i];
1780 switch(cs0) {
1781 case ASCII:
1782 if(sourceChar <= 0x7f) {
1783 targetValue = (uint32_t)sourceChar;
1784 len = 1;
1785 cs = cs0;
1786 g = 0;
1787 }
1788 break;
1789 case ISO8859_1:
1790 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1791 targetValue = (uint32_t)sourceChar - 0x80;
1792 len = 1;
1793 cs = cs0;
1794 g = 2;
1795 }
1796 break;
1797 case HWKANA_7BIT:
1798 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1799 if(converterData->version==3) {
1800 /* JIS7: use G1 (SO) */
1801 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1802 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1803 len = 1;
1804 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1805 g = 1;
1806 } else if(converterData->version==4) {
1807 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1808 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1809 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1810 len = 1;
1811
1812 cs = pFromU2022State->cs[0];
1813 if(IS_JP_DBCS(cs)) {
1814 /* switch from a DBCS charset to JISX201 */
1815 cs = (int8_t)JISX201;
1816 }
1817 /* else stay in the current G0 charset */
1818 g = 0;
1819 }
1820 /* else do not use HWKANA_7BIT with other versions */
1821 }
1822 break;
1823 case JISX201:
1824 /* G0 SBCS */
1825 value = jisx201FromU(sourceChar);
1826 if(value <= 0x7f) {
1827 targetValue = value;
1828 len = 1;
1829 cs = cs0;
1830 g = 0;
1831 useFallback = FALSE;
1832 }
1833 break;
1834 case JISX208:
1835 /* G0 DBCS from Shift-JIS table */
1836 len2 = MBCS_FROM_UCHAR32_ISO2022(
1837 converterData->myConverterArray[cs0],
1838 sourceChar, &value,
1839 useFallback, MBCS_OUTPUT_2);
1840 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1841 value = _2022FromSJIS(value);
1842 if(value != 0) {
1843 targetValue = value;
1844 len = len2;
1845 cs = cs0;
1846 g = 0;
1847 useFallback = FALSE;
1848 }
1849 } else if(len == 0 && useFallback &&
1850 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1851 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1852 len = -2;
1853 cs = cs0;
1854 g = 0;
1855 useFallback = FALSE;
1856 }
1857 break;
1858 case ISO8859_7:
1859 /* G0 SBCS forced to 7-bit output */
1860 len2 = MBCS_SINGLE_FROM_UCHAR32(
1861 converterData->myConverterArray[cs0],
1862 sourceChar, &value,
1863 useFallback);
1864 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1865 targetValue = value - 0x80;
1866 len = len2;
1867 cs = cs0;
1868 g = 2;
1869 useFallback = FALSE;
1870 }
1871 break;
1872 default:
1873 /* G0 DBCS */
1874 len2 = MBCS_FROM_UCHAR32_ISO2022(
1875 converterData->myConverterArray[cs0],
1876 sourceChar, &value,
1877 useFallback, MBCS_OUTPUT_2);
1878 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1879 if(cs0 == KSC5601) {
1880 /*
1881 * Check for valid bytes for the encoding scheme.
1882 * This is necessary because the sub-converter (windows-949)
1883 * has a broader encoding scheme than is valid for 2022.
1884 */
1885 value = _2022FromGR94DBCS(value);
1886 if(value == 0) {
1887 break;
1888 }
1889 }
1890 targetValue = value;
1891 len = len2;
1892 cs = cs0;
1893 g = 0;
1894 useFallback = FALSE;
1895 }
1896 break;
1897 }
1898 }
1899
1900 if(len != 0) {
1901 if(len < 0) {
1902 len = -len; /* fallback */
1903 }
1904 outLen = 0; /* count output bytes */
1905
1906 /* write SI if necessary (only for JIS7) */
1907 if(pFromU2022State->g == 1 && g == 0) {
1908 buffer[outLen++] = UCNV_SI;
1909 pFromU2022State->g = 0;
1910 }
1911
1912 /* write the designation sequence if necessary */
1913 if(cs != pFromU2022State->cs[g]) {
1914 int32_t escLen = escSeqCharsLen[cs];
1915 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1916 outLen += escLen;
1917 pFromU2022State->cs[g] = cs;
1918
1919 /* invalidate the choices[] */
1920 choiceCount = 0;
1921 }
1922
1923 /* write the shift sequence if necessary */
1924 if(g != pFromU2022State->g) {
1925 switch(g) {
1926 /* case 0 handled before writing escapes */
1927 case 1:
1928 buffer[outLen++] = UCNV_SO;
1929 pFromU2022State->g = 1;
1930 break;
1931 default: /* case 2 */
1932 buffer[outLen++] = 0x1b;
1933 buffer[outLen++] = 0x4e;
1934 break;
1935 /* no case 3: no SS3 in ISO-2022-JP-x */
1936 }
1937 }
1938
1939 /* write the output bytes */
1940 if(len == 1) {
1941 buffer[outLen++] = (char)targetValue;
1942 } else /* len == 2 */ {
1943 buffer[outLen++] = (char)(targetValue >> 8);
1944 buffer[outLen++] = (char)targetValue;
1945 }
1946 } else {
1947 /*
1948 * if we cannot find the character after checking all codepages
1949 * then this is an error
1950 */
1951 *err = U_INVALID_CHAR_FOUND;
1952 cnv->fromUChar32=sourceChar;
1953 break;
1954 }
1955
1956 if(sourceChar == CR || sourceChar == LF) {
1957 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1958 pFromU2022State->cs[2] = 0;
1959 choiceCount = 0;
1960 }
1961
1962 /* output outLen>0 bytes in buffer[] */
1963 if(outLen == 1) {
1964 *target++ = buffer[0];
1965 if(offsets) {
1966 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1967 }
1968 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1969 *target++ = buffer[0];
1970 *target++ = buffer[1];
1971 if(offsets) {
1972 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1973 *offsets++ = sourceIndex;
1974 *offsets++ = sourceIndex;
1975 }
1976 } else {
1977 fromUWriteUInt8(
1978 cnv,
1979 buffer, outLen,
1980 &target, (const char *)targetLimit,
1981 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1982 err);
1983 if(U_FAILURE(*err)) {
1984 break;
1985 }
1986 }
1987 } /* end if(myTargetIndex<myTargetLength) */
1988 else{
1989 *err =U_BUFFER_OVERFLOW_ERROR;
1990 break;
1991 }
1992
1993 }/* end while(mySourceIndex<mySourceLength) */
1994
1995 /*
1996 * the end of the input stream and detection of truncated input
1997 * are handled by the framework, but for ISO-2022-JP conversion
1998 * we need to be in ASCII mode at the very end
1999 *
2000 * conditions:
2001 * successful
2002 * in SO mode or not in ASCII mode
2003 * end of input and no truncated input
2004 */
2005 if( U_SUCCESS(*err) &&
2006 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2007 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2008 ) {
2009 int32_t sourceIndex;
2010
2011 outLen = 0;
2012
2013 if(pFromU2022State->g != 0) {
2014 buffer[outLen++] = UCNV_SI;
2015 pFromU2022State->g = 0;
2016 }
2017
2018 if(pFromU2022State->cs[0] != ASCII) {
2019 int32_t escLen = escSeqCharsLen[ASCII];
2020 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2021 outLen += escLen;
2022 pFromU2022State->cs[0] = (int8_t)ASCII;
2023 }
2024
2025 /* get the source index of the last input character */
2026 /*
2027 * TODO this would be simpler and more reliable if we used a pair
2028 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2029 * so that we could simply use the prevSourceIndex here;
2030 * this code gives an incorrect result for the rare case of an unmatched
2031 * trail surrogate that is alone in the last buffer of the text stream
2032 */
2033 sourceIndex=(int32_t)(source-args->source);
2034 if(sourceIndex>0) {
2035 --sourceIndex;
2036 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2037 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2038 ) {
2039 --sourceIndex;
2040 }
2041 } else {
2042 sourceIndex=-1;
2043 }
2044
2045 fromUWriteUInt8(
2046 cnv,
2047 buffer, outLen,
2048 &target, (const char *)targetLimit,
2049 &offsets, sourceIndex,
2050 err);
2051 }
2052
2053 /*save the state and return */
2054 args->source = source;
2055 args->target = (char*)target;
2056 }
2057
2058 /*************** to unicode *******************/
2059
2060 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2061 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2062 UErrorCode* err){
2063 char tempBuf[2];
2064 const char *mySource = (char *) args->source;
2065 UChar *myTarget = args->target;
2066 const char *mySourceLimit = args->sourceLimit;
2067 uint32_t targetUniChar = 0x0000;
2068 uint32_t mySourceChar = 0x0000;
2069 uint32_t tmpSourceChar = 0x0000;
2070 UConverterDataISO2022* myData;
2071 ISO2022State *pToU2022State;
2072 StateEnum cs;
2073
2074 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2075 pToU2022State = &myData->toU2022State;
2076
2077 if(myData->key != 0) {
2078 /* continue with a partial escape sequence */
2079 goto escape;
2080 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2081 /* continue with a partial double-byte character */
2082 mySourceChar = args->converter->toUBytes[0];
2083 args->converter->toULength = 0;
2084 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2085 targetUniChar = missingCharMarker;
2086 goto getTrailByte;
2087 }
2088
2089 while(mySource < mySourceLimit){
2090
2091 targetUniChar =missingCharMarker;
2092
2093 if(myTarget < args->targetLimit){
2094
2095 mySourceChar= (unsigned char) *mySource++;
2096
2097 switch(mySourceChar) {
2098 case UCNV_SI:
2099 if(myData->version==3) {
2100 pToU2022State->g=0;
2101 continue;
2102 } else {
2103 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2104 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2105 break;
2106 }
2107
2108 case UCNV_SO:
2109 if(myData->version==3) {
2110 /* JIS7: switch to G1 half-width Katakana */
2111 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2112 pToU2022State->g=1;
2113 continue;
2114 } else {
2115 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2116 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2117 break;
2118 }
2119
2120 case ESC_2022:
2121 mySource--;
2122 escape:
2123 {
2124 const char * mySourceBefore = mySource;
2125 int8_t toULengthBefore = args->converter->toULength;
2126
2127 changeState_2022(args->converter,&(mySource),
2128 mySourceLimit, ISO_2022_JP,err);
2129
2130 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2131 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2132 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2133 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2134 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2135 }
2136 }
2137
2138 /* invalid or illegal escape sequence */
2139 if(U_FAILURE(*err)){
2140 args->target = myTarget;
2141 args->source = mySource;
2142 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2143 return;
2144 }
2145 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2146 if(myData->key==0) {
2147 myData->isEmptySegment = TRUE;
2148 }
2149 continue;
2150
2151 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2152
2153 case CR:
2154 /*falls through*/
2155 case LF:
2156 /* automatically reset to single-byte mode */
2157 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2158 pToU2022State->cs[0] = (int8_t)ASCII;
2159 }
2160 pToU2022State->cs[2] = 0;
2161 pToU2022State->g = 0;
2162 /* falls through */
2163 default:
2164 /* convert one or two bytes */
2165 myData->isEmptySegment = FALSE;
2166 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2167 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2168 !IS_JP_DBCS(cs)
2169 ) {
2170 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2171 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2172
2173 /* return from a single-shift state to the previous one */
2174 if(pToU2022State->g >= 2) {
2175 pToU2022State->g=pToU2022State->prevG;
2176 }
2177 } else switch(cs) {
2178 case ASCII:
2179 if(mySourceChar <= 0x7f) {
2180 targetUniChar = mySourceChar;
2181 }
2182 break;
2183 case ISO8859_1:
2184 if(mySourceChar <= 0x7f) {
2185 targetUniChar = mySourceChar + 0x80;
2186 }
2187 /* return from a single-shift state to the previous one */
2188 pToU2022State->g=pToU2022State->prevG;
2189 break;
2190 case ISO8859_7:
2191 if(mySourceChar <= 0x7f) {
2192 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2193 targetUniChar =
2194 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2195 myData->myConverterArray[cs],
2196 mySourceChar + 0x80);
2197 }
2198 /* return from a single-shift state to the previous one */
2199 pToU2022State->g=pToU2022State->prevG;
2200 break;
2201 case JISX201:
2202 if(mySourceChar <= 0x7f) {
2203 targetUniChar = jisx201ToU(mySourceChar);
2204 }
2205 break;
2206 case HWKANA_7BIT:
2207 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2208 /* 7-bit halfwidth Katakana */
2209 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2210 }
2211 break;
2212 default:
2213 /* G0 DBCS */
2214 if(mySource < mySourceLimit) {
2215 int leadIsOk, trailIsOk;
2216 uint8_t trailByte;
2217 getTrailByte:
2218 trailByte = (uint8_t)*mySource;
2219 /*
2220 * Ticket 5691: consistent illegal sequences:
2221 * - We include at least the first byte in the illegal sequence.
2222 * - If any of the non-initial bytes could be the start of a character,
2223 * we stop the illegal sequence before the first one of those.
2224 *
2225 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2226 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2227 * Otherwise we convert or report the pair of bytes.
2228 */
2229 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2230 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2231 if (leadIsOk && trailIsOk) {
2232 ++mySource;
2233 tmpSourceChar = (mySourceChar << 8) | trailByte;
2234 if(cs == JISX208) {
2235 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2236 mySourceChar = tmpSourceChar;
2237 } else {
2238 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2239 mySourceChar = tmpSourceChar;
2240 if (cs == KSC5601) {
2241 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2242 }
2243 tempBuf[0] = (char)(tmpSourceChar >> 8);
2244 tempBuf[1] = (char)(tmpSourceChar);
2245 }
2246 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2247 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2248 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2249 ++mySource;
2250 /* add another bit so that the code below writes 2 bytes in case of error */
2251 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2252 }
2253 } else {
2254 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2255 args->converter->toULength = 1;
2256 goto endloop;
2257 }
2258 } /* End of inner switch */
2259 break;
2260 } /* End of outer switch */
2261 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2262 if(args->offsets){
2263 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2264 }
2265 *(myTarget++)=(UChar)targetUniChar;
2266 }
2267 else if(targetUniChar > missingCharMarker){
2268 /* disassemble the surrogate pair and write to output*/
2269 targetUniChar-=0x0010000;
2270 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2271 if(args->offsets){
2272 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2273 }
2274 ++myTarget;
2275 if(myTarget< args->targetLimit){
2276 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2277 if(args->offsets){
2278 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2279 }
2280 ++myTarget;
2281 }else{
2282 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2283 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2284 }
2285
2286 }
2287 else{
2288 /* Call the callback function*/
2289 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2290 break;
2291 }
2292 }
2293 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2294 *err =U_BUFFER_OVERFLOW_ERROR;
2295 break;
2296 }
2297 }
2298 endloop:
2299 args->target = myTarget;
2300 args->source = mySource;
2301 }
2302
2303
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2312 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2313 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2314
2315 UConverter* saveConv = args->converter;
2316 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2317 args->converter=myConverterData->currentConverter;
2318
2319 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2320 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2321 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2322
2323 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2324 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2325 uprv_memcpy(
2326 saveConv->charErrorBuffer,
2327 myConverterData->currentConverter->charErrorBuffer,
2328 myConverterData->currentConverter->charErrorBufferLength);
2329 }
2330 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2331 myConverterData->currentConverter->charErrorBufferLength = 0;
2332 }
2333 args->converter=saveConv;
2334 }
2335
2336 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2337 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2338
2339 const UChar *source = args->source;
2340 const UChar *sourceLimit = args->sourceLimit;
2341 unsigned char *target = (unsigned char *) args->target;
2342 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2343 int32_t* offsets = args->offsets;
2344 uint32_t targetByteUnit = 0x0000;
2345 UChar32 sourceChar = 0x0000;
2346 UBool isTargetByteDBCS;
2347 UBool oldIsTargetByteDBCS;
2348 UConverterDataISO2022 *converterData;
2349 UConverterSharedData* sharedData;
2350 UBool useFallback;
2351 int32_t length =0;
2352
2353 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2354 /* if the version is 1 then the user is requesting
2355 * conversion with ibm-25546 pass the arguments to
2356 * MBCS converter and return
2357 */
2358 if(converterData->version==1){
2359 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2360 return;
2361 }
2362
2363 /* initialize data */
2364 sharedData = converterData->currentConverter->sharedData;
2365 useFallback = args->converter->useFallback;
2366 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2367 oldIsTargetByteDBCS = isTargetByteDBCS;
2368
2369 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2370 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2371 goto getTrail;
2372 }
2373 while(source < sourceLimit){
2374
2375 targetByteUnit = missingCharMarker;
2376
2377 if(target < (unsigned char*) args->targetLimit){
2378 sourceChar = *source++;
2379
2380 /* do not convert SO/SI/ESC */
2381 if(IS_2022_CONTROL(sourceChar)) {
2382 /* callback(illegal) */
2383 *err=U_ILLEGAL_CHAR_FOUND;
2384 args->converter->fromUChar32=sourceChar;
2385 break;
2386 }
2387
2388 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2389 if(length < 0) {
2390 length = -length; /* fallback */
2391 }
2392 /* only DBCS or SBCS characters are expected*/
2393 /* DB characters with high bit set to 1 are expected */
2394 if( length > 2 || length==0 ||
2395 (length == 1 && targetByteUnit > 0x7f) ||
2396 (length == 2 &&
2397 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2398 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2399 ) {
2400 targetByteUnit=missingCharMarker;
2401 }
2402 if (targetByteUnit != missingCharMarker){
2403
2404 oldIsTargetByteDBCS = isTargetByteDBCS;
2405 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2406 /* append the shift sequence */
2407 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2408
2409 if (isTargetByteDBCS)
2410 *target++ = UCNV_SO;
2411 else
2412 *target++ = UCNV_SI;
2413 if(offsets)
2414 *(offsets++) = (int32_t)(source - args->source-1);
2415 }
2416 /* write the targetUniChar to target */
2417 if(targetByteUnit <= 0x00FF){
2418 if( target < targetLimit){
2419 *(target++) = (unsigned char) targetByteUnit;
2420 if(offsets){
2421 *(offsets++) = (int32_t)(source - args->source-1);
2422 }
2423
2424 }else{
2425 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2426 *err = U_BUFFER_OVERFLOW_ERROR;
2427 }
2428 }else{
2429 if(target < targetLimit){
2430 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2431 if(offsets){
2432 *(offsets++) = (int32_t)(source - args->source-1);
2433 }
2434 if(target < targetLimit){
2435 *(target++) =(unsigned char) (targetByteUnit -0x80);
2436 if(offsets){
2437 *(offsets++) = (int32_t)(source - args->source-1);
2438 }
2439 }else{
2440 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2441 *err = U_BUFFER_OVERFLOW_ERROR;
2442 }
2443 }else{
2444 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2445 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2446 *err = U_BUFFER_OVERFLOW_ERROR;
2447 }
2448 }
2449
2450 }
2451 else{
2452 /* oops.. the code point is unassingned
2453 * set the error and reason
2454 */
2455
2456 /*check if the char is a First surrogate*/
2457 if(U16_IS_SURROGATE(sourceChar)) {
2458 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2459 getTrail:
2460 /*look ahead to find the trail surrogate*/
2461 if(source < sourceLimit) {
2462 /* test the following code unit */
2463 UChar trail=(UChar) *source;
2464 if(U16_IS_TRAIL(trail)) {
2465 source++;
2466 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2467 *err = U_INVALID_CHAR_FOUND;
2468 /* convert this surrogate code point */
2469 /* exit this condition tree */
2470 } else {
2471 /* this is an unmatched lead code unit (1st surrogate) */
2472 /* callback(illegal) */
2473 *err=U_ILLEGAL_CHAR_FOUND;
2474 }
2475 } else {
2476 /* no more input */
2477 *err = U_ZERO_ERROR;
2478 }
2479 } else {
2480 /* this is an unmatched trail code unit (2nd surrogate) */
2481 /* callback(illegal) */
2482 *err=U_ILLEGAL_CHAR_FOUND;
2483 }
2484 } else {
2485 /* callback(unassigned) for a BMP code point */
2486 *err = U_INVALID_CHAR_FOUND;
2487 }
2488
2489 args->converter->fromUChar32=sourceChar;
2490 break;
2491 }
2492 } /* end if(myTargetIndex<myTargetLength) */
2493 else{
2494 *err =U_BUFFER_OVERFLOW_ERROR;
2495 break;
2496 }
2497
2498 }/* end while(mySourceIndex<mySourceLength) */
2499
2500 /*
2501 * the end of the input stream and detection of truncated input
2502 * are handled by the framework, but for ISO-2022-KR conversion
2503 * we need to be in ASCII mode at the very end
2504 *
2505 * conditions:
2506 * successful
2507 * not in ASCII mode
2508 * end of input and no truncated input
2509 */
2510 if( U_SUCCESS(*err) &&
2511 isTargetByteDBCS &&
2512 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2513 ) {
2514 int32_t sourceIndex;
2515
2516 /* we are switching to ASCII */
2517 isTargetByteDBCS=FALSE;
2518
2519 /* get the source index of the last input character */
2520 /*
2521 * TODO this would be simpler and more reliable if we used a pair
2522 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2523 * so that we could simply use the prevSourceIndex here;
2524 * this code gives an incorrect result for the rare case of an unmatched
2525 * trail surrogate that is alone in the last buffer of the text stream
2526 */
2527 sourceIndex=(int32_t)(source-args->source);
2528 if(sourceIndex>0) {
2529 --sourceIndex;
2530 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2531 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2532 ) {
2533 --sourceIndex;
2534 }
2535 } else {
2536 sourceIndex=-1;
2537 }
2538
2539 fromUWriteUInt8(
2540 args->converter,
2541 SHIFT_IN_STR, 1,
2542 &target, (const char *)targetLimit,
2543 &offsets, sourceIndex,
2544 err);
2545 }
2546
2547 /*save the state and return */
2548 args->source = source;
2549 args->target = (char*)target;
2550 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2551 }
2552
2553 /************************ To Unicode ***************************************/
2554
2555 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2556 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2557 UErrorCode* err){
2558 char const* sourceStart;
2559 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2560
2561 UConverterToUnicodeArgs subArgs;
2562 int32_t minArgsSize;
2563
2564 /* set up the subconverter arguments */
2565 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2566 minArgsSize = args->size;
2567 } else {
2568 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2569 }
2570
2571 uprv_memcpy(&subArgs, args, minArgsSize);
2572 subArgs.size = (uint16_t)minArgsSize;
2573 subArgs.converter = myData->currentConverter;
2574
2575 /* remember the original start of the input for offsets */
2576 sourceStart = args->source;
2577
2578 if(myData->key != 0) {
2579 /* continue with a partial escape sequence */
2580 goto escape;
2581 }
2582
2583 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2584 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2585 subArgs.source = args->source;
2586 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2587 if(subArgs.source != subArgs.sourceLimit) {
2588 /*
2589 * get the current partial byte sequence
2590 *
2591 * it needs to be moved between the public and the subconverter
2592 * so that the conversion framework, which only sees the public
2593 * converter, can handle truncated and illegal input etc.
2594 */
2595 if(args->converter->toULength > 0) {
2596 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2597 }
2598 subArgs.converter->toULength = args->converter->toULength;
2599
2600 /*
2601 * Convert up to the end of the input, or to before the next escape character.
2602 * Does not handle conversion extensions because the preToU[] state etc.
2603 * is not copied.
2604 */
2605 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2606
2607 if(args->offsets != NULL && sourceStart != args->source) {
2608 /* update offsets to base them on the actual start of the input */
2609 int32_t *offsets = args->offsets;
2610 UChar *target = args->target;
2611 int32_t delta = (int32_t)(args->source - sourceStart);
2612 while(target < subArgs.target) {
2613 if(*offsets >= 0) {
2614 *offsets += delta;
2615 }
2616 ++offsets;
2617 ++target;
2618 }
2619 }
2620 args->source = subArgs.source;
2621 args->target = subArgs.target;
2622 args->offsets = subArgs.offsets;
2623
2624 /* copy input/error/overflow buffers */
2625 if(subArgs.converter->toULength > 0) {
2626 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2627 }
2628 args->converter->toULength = subArgs.converter->toULength;
2629
2630 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2631 if(subArgs.converter->UCharErrorBufferLength > 0) {
2632 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2633 subArgs.converter->UCharErrorBufferLength);
2634 }
2635 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2636 subArgs.converter->UCharErrorBufferLength = 0;
2637 }
2638 }
2639
2640 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2641 return;
2642 }
2643
2644 escape:
2645 changeState_2022(args->converter,
2646 &(args->source),
2647 args->sourceLimit,
2648 ISO_2022_KR,
2649 err);
2650 }
2651 }
2652
2653 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2654 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2655 UErrorCode* err){
2656 char tempBuf[2];
2657 const char *mySource = ( char *) args->source;
2658 UChar *myTarget = args->target;
2659 const char *mySourceLimit = args->sourceLimit;
2660 UChar32 targetUniChar = 0x0000;
2661 UChar mySourceChar = 0x0000;
2662 UConverterDataISO2022* myData;
2663 UConverterSharedData* sharedData ;
2664 UBool useFallback;
2665
2666 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2667 if(myData->version==1){
2668 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2669 return;
2670 }
2671
2672 /* initialize state */
2673 sharedData = myData->currentConverter->sharedData;
2674 useFallback = args->converter->useFallback;
2675
2676 if(myData->key != 0) {
2677 /* continue with a partial escape sequence */
2678 goto escape;
2679 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2680 /* continue with a partial double-byte character */
2681 mySourceChar = args->converter->toUBytes[0];
2682 args->converter->toULength = 0;
2683 goto getTrailByte;
2684 }
2685
2686 while(mySource< mySourceLimit){
2687
2688 if(myTarget < args->targetLimit){
2689
2690 mySourceChar= (unsigned char) *mySource++;
2691
2692 if(mySourceChar==UCNV_SI){
2693 myData->toU2022State.g = 0;
2694 if (myData->isEmptySegment) {
2695 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2696 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2697 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2698 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2699 args->converter->toULength = 1;
2700 args->target = myTarget;
2701 args->source = mySource;
2702 return;
2703 }
2704 /*consume the source */
2705 continue;
2706 }else if(mySourceChar==UCNV_SO){
2707 myData->toU2022State.g = 1;
2708 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2709 /*consume the source */
2710 continue;
2711 }else if(mySourceChar==ESC_2022){
2712 mySource--;
2713 escape:
2714 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2715 changeState_2022(args->converter,&(mySource),
2716 mySourceLimit, ISO_2022_KR, err);
2717 if(U_FAILURE(*err)){
2718 args->target = myTarget;
2719 args->source = mySource;
2720 return;
2721 }
2722 continue;
2723 }
2724
2725 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2726 if(myData->toU2022State.g == 1) {
2727 if(mySource < mySourceLimit) {
2728 int leadIsOk, trailIsOk;
2729 uint8_t trailByte;
2730 getTrailByte:
2731 targetUniChar = missingCharMarker;
2732 trailByte = (uint8_t)*mySource;
2733 /*
2734 * Ticket 5691: consistent illegal sequences:
2735 * - We include at least the first byte in the illegal sequence.
2736 * - If any of the non-initial bytes could be the start of a character,
2737 * we stop the illegal sequence before the first one of those.
2738 *
2739 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2740 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2741 * Otherwise we convert or report the pair of bytes.
2742 */
2743 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2744 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2745 if (leadIsOk && trailIsOk) {
2746 ++mySource;
2747 tempBuf[0] = (char)(mySourceChar + 0x80);
2748 tempBuf[1] = (char)(trailByte + 0x80);
2749 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2750 mySourceChar = (mySourceChar << 8) | trailByte;
2751 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2752 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2753 ++mySource;
2754 /* add another bit so that the code below writes 2 bytes in case of error */
2755 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2756 }
2757 } else {
2758 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2759 args->converter->toULength = 1;
2760 break;
2761 }
2762 }
2763 else if(mySourceChar <= 0x7f) {
2764 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2765 } else {
2766 targetUniChar = 0xffff;
2767 }
2768 if(targetUniChar < 0xfffe){
2769 if(args->offsets) {
2770 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2771 }
2772 *(myTarget++)=(UChar)targetUniChar;
2773 }
2774 else {
2775 /* Call the callback function*/
2776 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2777 break;
2778 }
2779 }
2780 else{
2781 *err =U_BUFFER_OVERFLOW_ERROR;
2782 break;
2783 }
2784 }
2785 args->target = myTarget;
2786 args->source = mySource;
2787 }
2788
2789 /*************************** END ISO2022-KR *********************************/
2790
2791 /*************************** ISO-2022-CN *********************************
2792 *
2793 * Rules for ISO-2022-CN Encoding:
2794 * i) The designator sequence must appear once on a line before any instance
2795 * of character set it designates.
2796 * ii) If two lines contain characters from the same character set, both lines
2797 * must include the designator sequence.
2798 * iii) Once the designator sequence is known, a shifting sequence has to be found
2799 * to invoke the shifting
2800 * iv) All lines start in ASCII and end in ASCII.
2801 * v) Four shifting sequences are employed for this purpose:
2802 *
*      Sequence    ASCII Eq    Charsets
*      ----------  --------    ---------
*      SI          <SI>        US-ASCII
*      SO          <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
*      SS2         <ESC>N      CNS-11643-1992 Plane 2
*      SS3         <ESC>O      CNS-11643-1992 Planes 3-7
2809 *
2810 * vi)
2811 * SOdesignator : ESC "$" ")" finalchar_for_SO
2812 * SS2designator : ESC "$" "*" finalchar_for_SS2
2813 * SS3designator : ESC "$" "+" finalchar_for_SS3
2814 *
2815 * ESC $ ) A Indicates the bytes following SO are Chinese
2816 * characters as defined in GB 2312-80, until
2817 * another SOdesignation appears
2818 *
2819 *
2820 * ESC $ ) E Indicates the bytes following SO are as defined
2821 * in ISO-IR-165 (for details, see section 2.1),
2822 * until another SOdesignation appears
2823 *
2824 * ESC $ ) G Indicates the bytes following SO are as defined
2825 * in CNS 11643-plane-1, until another
2826 * SOdesignation appears
2827 *
2828 * ESC $ * H Indicates the two bytes immediately following
2829 * SS2 is a Chinese character as defined in CNS
2830 * 11643-plane-2, until another SS2designation
2831 * appears
*                  (Meaning <ESC>N must precede every 2 byte
*                  sequence.)
2834 *
2835 * ESC $ + I Indicates the immediate two bytes following SS3
2836 * is a Chinese character as defined in CNS
2837 * 11643-plane-3, until another SS3designation
2838 * appears
*                  (Meaning <ESC>O must precede every 2 byte
*                  sequence.)
2841 *
2842 * ESC $ + J Indicates the immediate two bytes following SS3
2843 * is a Chinese character as defined in CNS
2844 * 11643-plane-4, until another SS3designation
2845 * appears
*                  (In English: <ESC>O must precede every 2 byte
*                  sequence.)
2848 *
2849 * ESC $ + K Indicates the immediate two bytes following SS3
2850 * is a Chinese character as defined in CNS
2851 * 11643-plane-5, until another SS3designation
2852 * appears
2853 *
2854 * ESC $ + L Indicates the immediate two bytes following SS3
2855 * is a Chinese character as defined in CNS
2856 * 11643-plane-6, until another SS3designation
2857 * appears
2858 *
2859 * ESC $ + M Indicates the immediate two bytes following SS3
2860 * is a Chinese character as defined in CNS
2861 * 11643-plane-7, until another SS3designation
2862 * appears
2863 *
2864 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2865 * has its own designation information before any Chinese characters
2866 * appear
2867 *
2868 */
2869
/*
 * Designation escape sequences for the ISO-2022-CN family.
 * Every sequence is four bytes: ESC (0x1B), '$' (0x24), an intermediate byte
 * selecting the graphic set being designated (0x29 ')' for SO, 0x2A '*' for
 * SS2, 0x2B '+' for SS3) and a final byte naming the charset.
 * The bytes are written as hex escapes, not as character literals.
 */
/* The following are defined this way to make the strings truly readonly */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";               /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";               /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";   /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";   /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";   /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";   /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";   /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";   /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";   /* ESC $ + M */

/********************** ISO2022-CN Data **************************/
/*
 * Lookup table of the sequences above, used by the CN fromUnicode code to
 * fetch the designator for a charset.  That code indexes it directly with
 * the charset value for values below CNS_11643, and with
 * CNS_11643 + (cs - CNS_11643_1) for the CNS planes.
 * NOTE(review): the numeric index comments assume GB2312_1 == 1,
 * ISO_IR_165 == 2 and CNS_11643 == 3; the enum is declared outside this
 * chunk - confirm.
 */
static const char* const escSeqCharsCN[10] ={
        SHIFT_IN_STR,                   /* 0 ASCII */
        GB_2312_80_STR,                 /* 1 GB2312_1 */
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
        CNS_11643_1992_Plane_1_STR,     /* first CNS entry: plane 1 */
        CNS_11643_1992_Plane_2_STR,     /* plane 2 */
        CNS_11643_1992_Plane_3_STR,     /* plane 3 */
        CNS_11643_1992_Plane_4_STR,     /* plane 4 */
        CNS_11643_1992_Plane_5_STR,     /* plane 5 */
        CNS_11643_1992_Plane_6_STR,     /* plane 6 */
        CNS_11643_1992_Plane_7_STR      /* plane 7 */
};
2894
2895 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2896 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2897 UConverter *cnv = args->converter;
2898 UConverterDataISO2022 *converterData;
2899 ISO2022State *pFromU2022State;
2900 uint8_t *target = (uint8_t *) args->target;
2901 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2902 const UChar* source = args->source;
2903 const UChar* sourceLimit = args->sourceLimit;
2904 int32_t* offsets = args->offsets;
2905 UChar32 sourceChar;
2906 char buffer[8];
2907 int32_t len;
2908 int8_t choices[3];
2909 int32_t choiceCount;
2910 uint32_t targetValue = 0;
2911 UBool useFallback;
2912
2913 /* set up the state */
2914 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2915 pFromU2022State = &converterData->fromU2022State;
2916
2917 choiceCount = 0;
2918
2919 /* check if the last codepoint of previous buffer was a lead surrogate*/
2920 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2921 goto getTrail;
2922 }
2923
2924 while( source < sourceLimit){
2925 if(target < targetLimit){
2926
2927 sourceChar = *(source++);
2928 /*check if the char is a First surrogate*/
2929 if(U16_IS_SURROGATE(sourceChar)) {
2930 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2931 getTrail:
2932 /*look ahead to find the trail surrogate*/
2933 if(source < sourceLimit) {
2934 /* test the following code unit */
2935 UChar trail=(UChar) *source;
2936 if(U16_IS_TRAIL(trail)) {
2937 source++;
2938 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2939 cnv->fromUChar32=0x00;
2940 /* convert this supplementary code point */
2941 /* exit this condition tree */
2942 } else {
2943 /* this is an unmatched lead code unit (1st surrogate) */
2944 /* callback(illegal) */
2945 *err=U_ILLEGAL_CHAR_FOUND;
2946 cnv->fromUChar32=sourceChar;
2947 break;
2948 }
2949 } else {
2950 /* no more input */
2951 cnv->fromUChar32=sourceChar;
2952 break;
2953 }
2954 } else {
2955 /* this is an unmatched trail code unit (2nd surrogate) */
2956 /* callback(illegal) */
2957 *err=U_ILLEGAL_CHAR_FOUND;
2958 cnv->fromUChar32=sourceChar;
2959 break;
2960 }
2961 }
2962
2963 /* do the conversion */
2964 if(sourceChar <= 0x007f ){
2965 /* do not convert SO/SI/ESC */
2966 if(IS_2022_CONTROL(sourceChar)) {
2967 /* callback(illegal) */
2968 *err=U_ILLEGAL_CHAR_FOUND;
2969 cnv->fromUChar32=sourceChar;
2970 break;
2971 }
2972
2973 /* US-ASCII */
2974 if(pFromU2022State->g == 0) {
2975 buffer[0] = (char)sourceChar;
2976 len = 1;
2977 } else {
2978 buffer[0] = UCNV_SI;
2979 buffer[1] = (char)sourceChar;
2980 len = 2;
2981 pFromU2022State->g = 0;
2982 choiceCount = 0;
2983 }
2984 if(sourceChar == CR || sourceChar == LF) {
2985 /* reset the state at the end of a line */
2986 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2987 choiceCount = 0;
2988 }
2989 }
2990 else{
2991 /* convert U+0080..U+10ffff */
2992 int32_t i;
2993 int8_t cs, g;
2994
2995 if(choiceCount == 0) {
2996 /* try the current SO/G1 converter first */
2997 choices[0] = pFromU2022State->cs[1];
2998
2999 /* default to GB2312_1 if none is designated yet */
3000 if(choices[0] == 0) {
3001 choices[0] = GB2312_1;
3002 }
3003
3004 if(converterData->version == 0) {
3005 /* ISO-2022-CN */
3006
3007 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3008 if(choices[0] == GB2312_1) {
3009 choices[1] = (int8_t)CNS_11643_1;
3010 } else {
3011 choices[1] = (int8_t)GB2312_1;
3012 }
3013
3014 choiceCount = 2;
3015 } else if (converterData->version == 1) {
3016 /* ISO-2022-CN-EXT */
3017
3018 /* try one of the other converters */
3019 switch(choices[0]) {
3020 case GB2312_1:
3021 choices[1] = (int8_t)CNS_11643_1;
3022 choices[2] = (int8_t)ISO_IR_165;
3023 break;
3024 case ISO_IR_165:
3025 choices[1] = (int8_t)GB2312_1;
3026 choices[2] = (int8_t)CNS_11643_1;
3027 break;
3028 default: /* CNS_11643_x */
3029 choices[1] = (int8_t)GB2312_1;
3030 choices[2] = (int8_t)ISO_IR_165;
3031 break;
3032 }
3033
3034 choiceCount = 3;
3035 } else {
3036 choices[0] = (int8_t)CNS_11643_1;
3037 choices[1] = (int8_t)GB2312_1;
3038 }
3039 }
3040
3041 cs = g = 0;
3042 /*
3043 * len==0: no mapping found yet
3044 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3045 * len>0: found a roundtrip result, done
3046 */
3047 len = 0;
3048 /*
3049 * We will turn off useFallback after finding a fallback,
3050 * but we still get fallbacks from PUA code points as usual.
3051 * Therefore, we will also need to check that we don't overwrite
3052 * an early fallback with a later one.
3053 */
3054 useFallback = cnv->useFallback;
3055
3056 for(i = 0; i < choiceCount && len <= 0; ++i) {
3057 int8_t cs0 = choices[i];
3058 if(cs0 > 0) {
3059 uint32_t value;
3060 int32_t len2;
3061 if(cs0 >= CNS_11643_0) {
3062 len2 = MBCS_FROM_UCHAR32_ISO2022(
3063 converterData->myConverterArray[CNS_11643],
3064 sourceChar,
3065 &value,
3066 useFallback,
3067 MBCS_OUTPUT_3);
3068 if(len2 == 3 || (len2 == -3 && len == 0)) {
3069 targetValue = value;
3070 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3071 if(len2 >= 0) {
3072 len = 2;
3073 } else {
3074 len = -2;
3075 useFallback = FALSE;
3076 }
3077 if(cs == CNS_11643_1) {
3078 g = 1;
3079 } else if(cs == CNS_11643_2) {
3080 g = 2;
3081 } else /* plane 3..7 */ if(converterData->version == 1) {
3082 g = 3;
3083 } else {
3084 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3085 len = 0;
3086 }
3087 }
3088 } else {
3089 /* GB2312_1 or ISO-IR-165 */
3090 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3091 len2 = MBCS_FROM_UCHAR32_ISO2022(
3092 converterData->myConverterArray[cs0],
3093 sourceChar,
3094 &value,
3095 useFallback,
3096 MBCS_OUTPUT_2);
3097 if(len2 == 2 || (len2 == -2 && len == 0)) {
3098 targetValue = value;
3099 len = len2;
3100 cs = cs0;
3101 g = 1;
3102 useFallback = FALSE;
3103 }
3104 }
3105 }
3106 }
3107
3108 if(len != 0) {
3109 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3110
3111 /* write the designation sequence if necessary */
3112 if(cs != pFromU2022State->cs[g]) {
3113 if(cs < CNS_11643) {
3114 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3115 } else {
3116 U_ASSERT(cs >= CNS_11643_1);
3117 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3118 }
3119 len = 4;
3120 pFromU2022State->cs[g] = cs;
3121 if(g == 1) {
3122 /* changing the SO/G1 charset invalidates the choices[] */
3123 choiceCount = 0;
3124 }
3125 }
3126
3127 /* write the shift sequence if necessary */
3128 if(g != pFromU2022State->g) {
3129 switch(g) {
3130 case 1:
3131 buffer[len++] = UCNV_SO;
3132
3133 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3134 pFromU2022State->g = 1;
3135 break;
3136 case 2:
3137 buffer[len++] = 0x1b;
3138 buffer[len++] = 0x4e;
3139 break;
3140 default: /* case 3 */
3141 buffer[len++] = 0x1b;
3142 buffer[len++] = 0x4f;
3143 break;
3144 }
3145 }
3146
3147 /* write the two output bytes */
3148 buffer[len++] = (char)(targetValue >> 8);
3149 buffer[len++] = (char)targetValue;
3150 } else {
3151 /* if we cannot find the character after checking all codepages
3152 * then this is an error
3153 */
3154 *err = U_INVALID_CHAR_FOUND;
3155 cnv->fromUChar32=sourceChar;
3156 break;
3157 }
3158 }
3159
3160 /* output len>0 bytes in buffer[] */
3161 if(len == 1) {
3162 *target++ = buffer[0];
3163 if(offsets) {
3164 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3165 }
3166 } else if(len == 2 && (target + 2) <= targetLimit) {
3167 *target++ = buffer[0];
3168 *target++ = buffer[1];
3169 if(offsets) {
3170 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3171 *offsets++ = sourceIndex;
3172 *offsets++ = sourceIndex;
3173 }
3174 } else {
3175 fromUWriteUInt8(
3176 cnv,
3177 buffer, len,
3178 &target, (const char *)targetLimit,
3179 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3180 err);
3181 if(U_FAILURE(*err)) {
3182 break;
3183 }
3184 }
3185 } /* end if(myTargetIndex<myTargetLength) */
3186 else{
3187 *err =U_BUFFER_OVERFLOW_ERROR;
3188 break;
3189 }
3190
3191 }/* end while(mySourceIndex<mySourceLength) */
3192
3193 /*
3194 * the end of the input stream and detection of truncated input
3195 * are handled by the framework, but for ISO-2022-CN conversion
3196 * we need to be in ASCII mode at the very end
3197 *
3198 * conditions:
3199 * successful
3200 * not in ASCII mode
3201 * end of input and no truncated input
3202 */
3203 if( U_SUCCESS(*err) &&
3204 pFromU2022State->g!=0 &&
3205 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3206 ) {
3207 int32_t sourceIndex;
3208
3209 /* we are switching to ASCII */
3210 pFromU2022State->g=0;
3211
3212 /* get the source index of the last input character */
3213 /*
3214 * TODO this would be simpler and more reliable if we used a pair
3215 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3216 * so that we could simply use the prevSourceIndex here;
3217 * this code gives an incorrect result for the rare case of an unmatched
3218 * trail surrogate that is alone in the last buffer of the text stream
3219 */
3220 sourceIndex=(int32_t)(source-args->source);
3221 if(sourceIndex>0) {
3222 --sourceIndex;
3223 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3224 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3225 ) {
3226 --sourceIndex;
3227 }
3228 } else {
3229 sourceIndex=-1;
3230 }
3231
3232 fromUWriteUInt8(
3233 cnv,
3234 SHIFT_IN_STR, 1,
3235 &target, (const char *)targetLimit,
3236 &offsets, sourceIndex,
3237 err);
3238 }
3239
3240 /*save the state and return */
3241 args->source = source;
3242 args->target = (char*)target;
3243 }
3244
3245
/*
 * Converts ISO-2022-CN / ISO-2022-CN-EXT bytes to UTF-16, optionally
 * recording source offsets.
 *
 * args->source and args->target are advanced past what was consumed and
 * produced; if args->offsets is non-NULL, each output UChar gets the index
 * of the first source byte of its character.
 *
 * Resumable state:
 *   myData->key != 0                   a partial escape sequence is pending
 *   args->converter->toULength == 1    the lead byte of a double-byte
 *                                      character is stored in toUBytes[0]
 *   myData->toU2022State               current shift state (g, prevG) and
 *                                      the charset designated to each G set
 *   myData->isEmptySegment             TRUE between an SO and the first
 *                                      character that follows it
 *
 * Errors: U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO...SI / SO...designator
 * segment (plus whatever changeState_2022 reports), U_BUFFER_OVERFLOW_ERROR
 * when the target is full, and whatever toUnicodeCallback sets for
 * unconvertible input.
 */
static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                                               UErrorCode* err){
    char tempBuf[3];
    const char *mySource = (char *) args->source;
    UChar *myTarget = args->target;
    const char *mySourceLimit = args->sourceLimit;
    uint32_t targetUniChar = 0x0000;
    uint32_t mySourceChar = 0x0000;
    UConverterDataISO2022* myData;
    ISO2022State *pToU2022State;

    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
    pToU2022State = &myData->toU2022State;

    if(myData->key != 0) {
        /* continue with a partial escape sequence */
        goto escape;
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
        /* continue with a partial double-byte character */
        mySourceChar = args->converter->toUBytes[0];
        args->converter->toULength = 0;
        targetUniChar = missingCharMarker;
        goto getTrailByte;
    }

    while(mySource < mySourceLimit){

        /* stays at missingCharMarker unless a mapping is found below */
        targetUniChar =missingCharMarker;

        if(myTarget < args->targetLimit){

            mySourceChar= (unsigned char) *mySource++;

            switch(mySourceChar){
            case UCNV_SI:
                /* shift in: back to ASCII */
                pToU2022State->g=0;
                if (myData->isEmptySegment) {
                    /* SO immediately followed by SI: report the SI byte as irregular */
                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
                    args->converter->toUBytes[0] = mySourceChar;
                    args->converter->toULength = 1;
                    args->target = myTarget;
                    args->source = mySource;
                    return;
                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    /* leaves the switch with targetUniChar==missingCharMarker, so the callback path below runs */
                    break;
                }

            case ESC_2022:
                /* back up so that changeState_2022 sees the ESC byte itself */
                mySource--;
escape:
                {
                    /* remember where we were so the length of an irregular sequence can be computed */
                    const char * mySourceBefore = mySource;
                    int8_t toULengthBefore = args->converter->toULength;

                    changeState_2022(args->converter,&(mySource),
                        mySourceLimit, ISO_2022_CN,err);

                    /* After SO there must be at least one character before a designator (designator error handled separately) */
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
                    }
                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;

            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */

            case CR:
                /*falls through*/
            case LF:
                /* end of line: all designations and the shift state are dropped */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
                /* falls through */
            default:
                /* convert one or two bytes */
                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
                        StateEnum tempState;
                        int32_t tempBufLen;
                        int leadIsOk, trailIsOk;
                        uint8_t trailByte;
getTrailByte:
                        trailByte = (uint8_t)*mySource;
                        /*
                         * Ticket 5691: consistent illegal sequences:
                         * - We include at least the first byte in the illegal sequence.
                         * - If any of the non-initial bytes could be the start of a character,
                         *   we stop the illegal sequence before the first one of those.
                         *
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
                         * Otherwise we convert or report the pair of bytes.
                         */
                        /* unsigned wrap-around makes each of these a single-compare 0x21..0x7e range test */
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
                        if (leadIsOk && trailIsOk) {
                            ++mySource;
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
                            if(tempState >= CNS_11643_0) {
                                /* CNS: one shared table, addressed with a leading plane byte 0x80+plane */
                                cnv = myData->myConverterArray[CNS_11643];
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
                                tempBuf[1] = (char) (mySourceChar);
                                tempBuf[2] = (char) trailByte;
                                tempBufLen = 3;

                            }else{
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
                                cnv = myData->myConverterArray[tempState];
                                tempBuf[0] = (char) (mySourceChar);
                                tempBuf[1] = (char) trailByte;
                                tempBufLen = 2;
                            }
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                            /* keep both bytes so offsets and error reporting cover the pair */
                            mySourceChar = (mySourceChar << 8) | trailByte;
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
                            ++mySource;
                            /* add another bit so that the code below writes 2 bytes in case of error */
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
                        }
                        if(pToU2022State->g>=2) {
                            /* return from a single-shift state to the previous one */
                            pToU2022State->g=pToU2022State->prevG;
                        }
                    } else {
                        /* input ends after the lead byte: store it and resume on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
                        args->converter->toULength = 1;
                        goto endloop;
                    }
                }
                else{
                    /* ASCII mode: bytes above 0x7f stay unmapped and go to the callback */
                    if(mySourceChar <= 0x7f) {
                        targetUniChar = (UChar) mySourceChar;
                    }
                }
                break;
            }
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
                /* BMP result: one UChar */
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                *(myTarget++)=(UChar)targetUniChar;
            }
            else if(targetUniChar > missingCharMarker){
                /* disassemble the surrogate pair and write to output*/
                targetUniChar-=0x0010000;
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
                if(args->offsets){
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                }
                ++myTarget;
                if(myTarget< args->targetLimit){
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                    if(args->offsets){
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
                    }
                    ++myTarget;
                }else{
                    /* no room for the trail surrogate: park it in the converter's UChar error buffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
                }

            }
            else{
                /* Call the callback function*/
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
                break;
            }
        }
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }
endloop:
    /* publish how far we got */
    args->target = myTarget;
    args->source = mySource;
}
3447
3448 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3449 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3450 UConverter *cnv = args->converter;
3451 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3452 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3453 char *p, *subchar;
3454 char buffer[8];
3455 int32_t length;
3456
3457 subchar=(char *)cnv->subChars;
3458 length=cnv->subCharLen; /* assume length==1 for most variants */
3459
3460 p = buffer;
3461 switch(myConverterData->locale[0]){
3462 case 'j':
3463 {
3464 int8_t cs;
3465
3466 if(pFromU2022State->g == 1) {
3467 /* JIS7: switch from G1 to G0 */
3468 pFromU2022State->g = 0;
3469 *p++ = UCNV_SI;
3470 }
3471
3472 cs = pFromU2022State->cs[0];
3473 if(cs != ASCII && cs != JISX201) {
3474 /* not in ASCII or JIS X 0201: switch to ASCII */
3475 pFromU2022State->cs[0] = (int8_t)ASCII;
3476 *p++ = '\x1b';
3477 *p++ = '\x28';
3478 *p++ = '\x42';
3479 }
3480
3481 *p++ = subchar[0];
3482 break;
3483 }
3484 case 'c':
3485 if(pFromU2022State->g != 0) {
3486 /* not in ASCII mode: switch to ASCII */
3487 pFromU2022State->g = 0;
3488 *p++ = UCNV_SI;
3489 }
3490 *p++ = subchar[0];
3491 break;
3492 case 'k':
3493 if(myConverterData->version == 0) {
3494 if(length == 1) {
3495 if((UBool)args->converter->fromUnicodeStatus) {
3496 /* in DBCS mode: switch to SBCS */
3497 args->converter->fromUnicodeStatus = 0;
3498 *p++ = UCNV_SI;
3499 }
3500 *p++ = subchar[0];
3501 } else /* length == 2*/ {
3502 if(!(UBool)args->converter->fromUnicodeStatus) {
3503 /* in SBCS mode: switch to DBCS */
3504 args->converter->fromUnicodeStatus = 1;
3505 *p++ = UCNV_SO;
3506 }
3507 *p++ = subchar[0];
3508 *p++ = subchar[1];
3509 }
3510 break;
3511 } else {
3512 /* save the subconverter's substitution string */
3513 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3514 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3515
3516 /* set our substitution string into the subconverter */
3517 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3518 myConverterData->currentConverter->subCharLen = (int8_t)length;
3519
3520 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3521 args->converter = myConverterData->currentConverter;
3522 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3523 ucnv_cbFromUWriteSub(args, 0, err);
3524 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3525 args->converter = cnv;
3526
3527 /* restore the subconverter's substitution string */
3528 myConverterData->currentConverter->subChars = currentSubChars;
3529 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3530
3531 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3532 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3533 uprv_memcpy(
3534 cnv->charErrorBuffer,
3535 myConverterData->currentConverter->charErrorBuffer,
3536 myConverterData->currentConverter->charErrorBufferLength);
3537 }
3538 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3539 myConverterData->currentConverter->charErrorBufferLength = 0;
3540 }
3541 return;
3542 }
3543 default:
3544 /* not expected */
3545 break;
3546 }
3547 ucnv_cbFromUWriteBytes(args,
3548 buffer, (int32_t)(p - buffer),
3549 offsetIndex, err);
3550 }
3551
/*
 * Structure for cloning an ISO 2022 converter into a single memory block.
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
 * and then ucnv_safeClone() of the sub-converter may additionally align
 * currentConverter inside the cloneStruct, for which we need the deadSpace
 * after currentConverter.
 * This is because UAlignedMemory may be larger than the actually
 * necessary alignment size for the platform.
 * The other cloneStruct fields will not be moved around,
 * and are aligned properly with cloneStruct's alignment.
 */
struct cloneStruct
{
    UConverter cnv;                 /* the cloned outer converter; must be first, its address is the clone */
    UConverter currentConverter;    /* storage for the cloned sub-converter, if there is one */
    UAlignedMemory deadSpace;       /* slack so the sub-converter clone can be re-aligned */
    UConverterDataISO2022 mydata;   /* copy of the ISO 2022 extra data; cnv.extraInfo points here */
};
3570
3571
3572 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3573 _ISO_2022_SafeClone(
3574 const UConverter *cnv,
3575 void *stackBuffer,
3576 int32_t *pBufferSize,
3577 UErrorCode *status)
3578 {
3579 struct cloneStruct * localClone;
3580 UConverterDataISO2022 *cnvData;
3581 int32_t i, size;
3582
3583 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3584 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3585 return NULL;
3586 }
3587
3588 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3589 localClone = (struct cloneStruct *)stackBuffer;
3590
3591 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3592
3593 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3594 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3595 localClone->cnv.isExtraLocal = TRUE;
3596
3597 /* share the subconverters */
3598
3599 if(cnvData->currentConverter != NULL) {
3600 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3601 localClone->mydata.currentConverter =
3602 ucnv_safeClone(cnvData->currentConverter,
3603 &localClone->currentConverter,
3604 &size, status);
3605 if(U_FAILURE(*status)) {
3606 return NULL;
3607 }
3608 }
3609
3610 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3611 if(cnvData->myConverterArray[i] != NULL) {
3612 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3613 }
3614 }
3615
3616 return &localClone->cnv;
3617 }
3618
3619 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3620 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3621 const USetAdder *sa,
3622 UConverterUnicodeSet which,
3623 UErrorCode *pErrorCode)
3624 {
3625 int32_t i;
3626 UConverterDataISO2022* cnvData;
3627
3628 if (U_FAILURE(*pErrorCode)) {
3629 return;
3630 }
3631 #ifdef U_ENABLE_GENERIC_ISO_2022
3632 if (cnv->sharedData == &_ISO2022Data) {
3633 /* We use UTF-8 in this case */
3634 sa->addRange(sa->set, 0, 0xd7FF);
3635 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3636 return;
3637 }
3638 #endif
3639
3640 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3641
3642 /* open a set and initialize it with code points that are algorithmically round-tripped */
3643 switch(cnvData->locale[0]){
3644 case 'j':
3645 /* include JIS X 0201 which is hardcoded */
3646 sa->add(sa->set, 0xa5);
3647 sa->add(sa->set, 0x203e);
3648 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3649 /* include Latin-1 for some variants of JP */
3650 sa->addRange(sa->set, 0, 0xff);
3651 } else {
3652 /* include ASCII for JP */
3653 sa->addRange(sa->set, 0, 0x7f);
3654 }
3655 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3656 /*
3657 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3658 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3659 * use half-width Katakana.
3660 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3661 * half-width Katakana via the ESC ( I sequence.
3662 * However, we only emit (fromUnicode) half-width Katakana according to the
3663 * definition of each variant.
3664 *
3665 * When including fallbacks,
3666 * we need to include half-width Katakana Unicode code points for all JP variants because
3667 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3668 */
3669 /* include half-width Katakana for JP */
3670 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3671 }
3672 break;
3673 case 'c':
3674 case 'z':
3675 /* include ASCII for CN */
3676 sa->addRange(sa->set, 0, 0x7f);
3677 break;
3678 case 'k':
3679 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3680 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3681 cnvData->currentConverter, sa, which, pErrorCode);
3682 /* the loop over myConverterArray[] will simply not find another converter */
3683 break;
3684 default:
3685 break;
3686 }
3687
3688 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3689 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3690 cnvData->version==0 && i==CNS_11643
3691 ) {
3692 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3693 ucnv_MBCSGetUnicodeSetForBytes(
3694 cnvData->myConverterArray[i],
3695 sa, UCNV_ROUNDTRIP_SET,
3696 0, 0x81, 0x82,
3697 pErrorCode);
3698 }
3699 #endif
3700
3701 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3702 UConverterSetFilter filter;
3703 if(cnvData->myConverterArray[i]!=NULL) {
3704 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3705 cnvData->version==0 && i==CNS_11643
3706 ) {
3707 /*
3708 * Version-specific for CN:
3709 * CN version 0 does not map CNS planes 3..7 although
3710 * they are all available in the CNS conversion table;
3711 * CN version 1 (-EXT) does map them all.
3712 * The two versions create different Unicode sets.
3713 */
3714 filter=UCNV_SET_FILTER_2022_CN;
3715 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3716 /*
3717 * Only add code points that map to Shift-JIS codes
3718 * corresponding to JIS X 0208.
3719 */
3720 filter=UCNV_SET_FILTER_SJIS;
3721 } else if(i==KSC5601) {
3722 /*
3723 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3724 * are broader than GR94.
3725 */
3726 filter=UCNV_SET_FILTER_GR94DBCS;
3727 } else {
3728 filter=UCNV_SET_FILTER_NONE;
3729 }
3730 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3731 }
3732 }
3733
3734 /*
3735 * ISO 2022 converters must not convert SO/SI/ESC despite what
3736 * sub-converters do by themselves.
3737 * Remove these characters from the set.
3738 */
3739 sa->remove(sa->set, 0x0e);
3740 sa->remove(sa->set, 0x0f);
3741 sa->remove(sa->set, 0x1b);
3742
3743 /* ISO 2022 converters do not convert C1 controls either */
3744 sa->removeRange(sa->set, 0x80, 0x9f);
3745 }
3746
3747 static const UConverterImpl _ISO2022Impl={
3748 UCNV_ISO_2022,
3749
3750 NULL,
3751 NULL,
3752
3753 _ISO2022Open,
3754 _ISO2022Close,
3755 _ISO2022Reset,
3756
3757 #ifdef U_ENABLE_GENERIC_ISO_2022
3758 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3759 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3760 ucnv_fromUnicode_UTF8,
3761 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3762 #else
3763 NULL,
3764 NULL,
3765 NULL,
3766 NULL,
3767 #endif
3768 NULL,
3769
3770 NULL,
3771 _ISO2022getName,
3772 _ISO_2022_WriteSub,
3773 _ISO_2022_SafeClone,
3774 _ISO_2022_GetUnicodeSet,
3775
3776 NULL,
3777 NULL
3778 };
3779 static const UConverterStaticData _ISO2022StaticData={
3780 sizeof(UConverterStaticData),
3781 "ISO_2022",
3782 2022,
3783 UCNV_IBM,
3784 UCNV_ISO_2022,
3785 1,
3786 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3787 { 0x1a, 0, 0, 0 },
3788 1,
3789 FALSE,
3790 FALSE,
3791 0,
3792 0,
3793 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3794 };
3795 const UConverterSharedData _ISO2022Data={
3796 sizeof(UConverterSharedData),
3797 ~((uint32_t) 0),
3798 NULL,
3799 NULL,
3800 &_ISO2022StaticData,
3801 FALSE,
3802 &_ISO2022Impl,
3803 0, UCNV_MBCS_TABLE_INITIALIZER
3804 };
3805
3806 /*************JP****************/
3807 static const UConverterImpl _ISO2022JPImpl={
3808 UCNV_ISO_2022,
3809
3810 NULL,
3811 NULL,
3812
3813 _ISO2022Open,
3814 _ISO2022Close,
3815 _ISO2022Reset,
3816
3817 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3818 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3819 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3820 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3821 NULL,
3822
3823 NULL,
3824 _ISO2022getName,
3825 _ISO_2022_WriteSub,
3826 _ISO_2022_SafeClone,
3827 _ISO_2022_GetUnicodeSet,
3828
3829 NULL,
3830 NULL
3831 };
3832 static const UConverterStaticData _ISO2022JPStaticData={
3833 sizeof(UConverterStaticData),
3834 "ISO_2022_JP",
3835 0,
3836 UCNV_IBM,
3837 UCNV_ISO_2022,
3838 1,
3839 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3840 { 0x1a, 0, 0, 0 },
3841 1,
3842 FALSE,
3843 FALSE,
3844 0,
3845 0,
3846 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3847 };
3848
3849 namespace {
3850
3851 const UConverterSharedData _ISO2022JPData={
3852 sizeof(UConverterSharedData),
3853 ~((uint32_t) 0),
3854 NULL,
3855 NULL,
3856 &_ISO2022JPStaticData,
3857 FALSE,
3858 &_ISO2022JPImpl,
3859 0, UCNV_MBCS_TABLE_INITIALIZER
3860 };
3861
3862 } // namespace
3863
3864 /************* KR ***************/
3865 static const UConverterImpl _ISO2022KRImpl={
3866 UCNV_ISO_2022,
3867
3868 NULL,
3869 NULL,
3870
3871 _ISO2022Open,
3872 _ISO2022Close,
3873 _ISO2022Reset,
3874
3875 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3876 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3877 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3878 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3879 NULL,
3880
3881 NULL,
3882 _ISO2022getName,
3883 _ISO_2022_WriteSub,
3884 _ISO_2022_SafeClone,
3885 _ISO_2022_GetUnicodeSet,
3886
3887 NULL,
3888 NULL
3889 };
3890 static const UConverterStaticData _ISO2022KRStaticData={
3891 sizeof(UConverterStaticData),
3892 "ISO_2022_KR",
3893 0,
3894 UCNV_IBM,
3895 UCNV_ISO_2022,
3896 1,
3897 3, /* max 3 bytes per UChar: SO+DBCS */
3898 { 0x1a, 0, 0, 0 },
3899 1,
3900 FALSE,
3901 FALSE,
3902 0,
3903 0,
3904 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3905 };
3906
3907 namespace {
3908
3909 const UConverterSharedData _ISO2022KRData={
3910 sizeof(UConverterSharedData),
3911 ~((uint32_t) 0),
3912 NULL,
3913 NULL,
3914 &_ISO2022KRStaticData,
3915 FALSE,
3916 &_ISO2022KRImpl,
3917 0, UCNV_MBCS_TABLE_INITIALIZER
3918 };
3919
3920 } // namespace
3921
3922 /*************** CN ***************/
3923 static const UConverterImpl _ISO2022CNImpl={
3924
3925 UCNV_ISO_2022,
3926
3927 NULL,
3928 NULL,
3929
3930 _ISO2022Open,
3931 _ISO2022Close,
3932 _ISO2022Reset,
3933
3934 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3937 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3938 NULL,
3939
3940 NULL,
3941 _ISO2022getName,
3942 _ISO_2022_WriteSub,
3943 _ISO_2022_SafeClone,
3944 _ISO_2022_GetUnicodeSet,
3945
3946 NULL,
3947 NULL
3948 };
3949 static const UConverterStaticData _ISO2022CNStaticData={
3950 sizeof(UConverterStaticData),
3951 "ISO_2022_CN",
3952 0,
3953 UCNV_IBM,
3954 UCNV_ISO_2022,
3955 1,
3956 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3957 { 0x1a, 0, 0, 0 },
3958 1,
3959 FALSE,
3960 FALSE,
3961 0,
3962 0,
3963 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3964 };
3965
3966 namespace {
3967
3968 const UConverterSharedData _ISO2022CNData={
3969 sizeof(UConverterSharedData),
3970 ~((uint32_t) 0),
3971 NULL,
3972 NULL,
3973 &_ISO2022CNStaticData,
3974 FALSE,
3975 &_ISO2022CNImpl,
3976 0, UCNV_MBCS_TABLE_INITIALIZER
3977 };
3978
3979 } // namespace
3980
3981 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3982