1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
*   08/29/2000  Ram         Separated implementation of EBCDIC to
*                           ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
38 #include "ucnv_imp.h"
39 #include "ucnv_bld.h"
40 #include "ucnv_cnv.h"
41 #include "ucnvmbcs.h"
42 #include "cstring.h"
43 #include "cmemory.h"
44 #include "uassert.h"
45
/*
 * Number of elements of a true array, as an int32_t.
 * Must not be used on pointers (including array-typed function parameters).
 * The whole expansion is parenthesized so that it stays a single operand
 * next to operators such as sizeof or [].
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
52 *
53 * Reasons:
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
74 * for 7-bit ISO-2022.
75 *
76 * Markus Scherer 2003-dec-03
77 */
78 #endif
79
/* The single byte 0x0F (Shift In) as a NUL-terminated string. */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
82
/* Byte values of the whitespace/control characters referred to by name. */
#define CR      0x0d    /* carriage return */
#define LF      0x0a    /* line feed */
#define H_TAB   0x09    /* horizontal tab */
#define V_TAB   0x0b    /* vertical tab */
#define SPACE   0x20    /* space */
88
/* Inclusive bounds of the half-width Katakana code point range U+FF61..U+FF9F. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
93
/*
 * Native (GR) byte ranges of the two kinds of graphic character sets:
 *   - 94-character sets use native bytes A1..FE and appear in ISO 2022
 *     as bytes 21..7E (native value minus 0x80);
 *   - 96-character sets use native bytes A0..FF and appear in ISO 2022
 *     as bytes 20..7F (native value minus 0x80).
 * C1 control codes (native bytes 80..9F) must not be encoded
 * as bytes 00..1F (C0 control codes).
 */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
108
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * Evaluates to nonzero exactly for c == 0x0e, 0x0f or 0x1b.
 * The expansion is parenthesized as a whole so that it can be negated or
 * combined with other operators, and the range test is done on the unsigned
 * value so that a negative argument never reaches the shift.
 * Note: c is evaluated more than once; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
/*
 * Charset / state identifiers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the same small integers; a value is
 * only meaningful in the context of one variant.
 */
typedef enum {
    /* values shared by both variants */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
155
/* Nonzero if the StateEnum charset value cs lies in the range JISX208..KSC5601 (the JP DBCS charsets). */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)

/* Charset mask: the single bit whose position is the StateEnum value cs. */
#define CSM(cs) ((uint16_t)1<<(cs))
160
161 /*
162 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
163 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
164 *
165 * Note: The converter uses some leniency:
166 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
167 * all versions, not just JIS7 and JIS8.
168 * - ICU does not distinguish between different versions of JIS X 0208.
169 */
170 enum { MAX_JA_VERSION=4 };
171 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
175 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
176 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
177 };
178
/* Kind of converter currently in use; stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
187
/* Designation and shift state of one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
193
/* The version number occupies the low four bits of the converter options. */
#define UCNV_OPTIONS_VERSION_MASK   0x0f
/* Capacity of UConverterDataISO2022.myConverterArray[]. */
#define UCNV_2022_MAX_CONVERTERS    10
196
197 typedef struct{
198 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
199 UConverter *currentConverter;
200 Cnv2022Type currentType;
201 ISO2022State toU2022State, fromU2022State;
202 uint32_t key;
203 uint32_t version;
204 #ifdef U_ENABLE_GENERIC_ISO_2022
205 UBool isFirstBuffer;
206 #endif
207 UBool isEmptySegment;
208 char name[30];
209 char locale[3];
210 }UConverterDataISO2022;
211
212 /* Protos */
213 /* ISO-2022 ----------------------------------------------------------------- */
214
215 /*Forward declaration */
216 U_CFUNC void
217 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
218 UErrorCode * err);
219 U_CFUNC void
220 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
221 UErrorCode * err);
222
/* The escape control character (ESC) that starts every ISO 2022 escape sequence. */
#define ESC_2022 0x1b
224
/* Result of matching the bytes seen so far against the escape sequence table. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one ISO 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
232
233 /*
234 * The way these state transition arrays work is:
235 * ex : ESC$B is the sequence for JISX208
236 * a) First Iteration: char is ESC
237 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
238 * int x = normalize_esq_chars_2022[27] which is equal to 1
239 * ii) Search for this value in escSeqStateTable_Key_2022[]
240 * value of x is stored at escSeqStateTable_Key_2022[0]
241 * iii) Save this index as offset
242 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
243 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
244 * b) Switch on this state and continue to next char
245 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
246 * which is normalize_esq_chars_2022[36] == 4
247 * ii) x is currently 1(from above)
248 * x<<=5 -- x is now 32
249 * x+=normalize_esq_chars_2022[36]
250 * now x is 36
251 * iii) Search for this value in escSeqStateTable_Key_2022[]
252 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
253 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
254 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
255 * c) Switch on this state and continue to next char
256 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
257 * ii) x is currently 36 (from above)
258 * x<<=5 -- x is now 1152
259 * x+=normalize_esq_chars_2022[66]
260 * now x is 1161
261 * iii) Search for this value in escSeqStateTable_Key_2022[]
 *              value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
 *         iv)  Get state of this sequence from escSeqStateTable_Value_2022[24]
 *              escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *         v)   Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
266 */
267
268
269 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps each byte value to a small code (1..29) used to build escape sequence
 * keys; 0 marks a byte that cannot occur anywhere in a recognized escape
 * sequence. Laid out 16 entries per row, indexed by the byte value.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /*        x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
    /* 0x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 1x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 2x */   0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 3x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 4x */   5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 5x */   0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 6x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 7x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 8x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 9x */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Ax */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Bx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Cx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Dx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Ex */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Fx */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
300
301 #ifdef U_ENABLE_GENERIC_ISO_2022
302 /*
303 * When the generic ISO-2022 converter is completely removed, not just disabled
304 * per #ifdef, then the following state table and the associated tables that are
305 * dimensioned with MAX_STATES_2022 should be trimmed.
306 *
307 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
308 * the associated escape sequences starting with ESC ( B should be removed.
309 * This includes the ones with key values 1097 and all of the ones above 1000000.
310 *
311 * For the latter, the tables can simply be truncated.
312 * For the former, since the tables must be kept parallel, it is probably best
313 * to simply duplicate an adjacent table cell, parallel in all tables.
314 *
315 * It may make sense to restructure the tables, especially by using small search
316 * tables for the variants instead of indexing them parallel to the table here.
317 */
318 #endif
319
/* Number of entries in each of the parallel escape sequence tables. */
#define MAX_STATES_2022 74

/*
 * Escape sequence keys in strictly ascending order (getKey_2022() does a
 * binary search here). The index of a key is also the index into the
 * parallel value/result/next-state tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,       57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,     1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,     1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,    36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,    36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,    37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
333
334 #ifdef U_ENABLE_GENERIC_ISO_2022
335
336 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
337 /* 0 1 2 3 4 5 6 7 8 9 */
338
339 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
340 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
341 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
342 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
343 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
344 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
345 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
346 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
347 };
348
349 #endif
350
351 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
352 /* 0 1 2 3 4 5 6 7 8 9 */
353 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
354 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
359 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
360 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
361 };
362
363
/* Identifies the ISO-2022 variant on whose behalf changeState_2022() is called. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
373
374 /*********** ISO 2022 Converter Protos ***********/
375 static void
376 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
377
378 static void
379 _ISO2022Close(UConverter *converter);
380
381 static void
382 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
383
384 static const char*
385 _ISO2022getName(const UConverter* cnv);
386
387 static void
388 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
389
390 static UConverter *
391 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
392
393 #ifdef U_ENABLE_GENERIC_ISO_2022
394 static void
395 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
396 #endif
397
398 namespace {
399
400 /*const UConverterSharedData _ISO2022Data;*/
401 extern const UConverterSharedData _ISO2022JPData;
402 extern const UConverterSharedData _ISO2022KRData;
403 extern const UConverterSharedData _ISO2022CNData;
404
405 } // namespace
406
407 /*************** Converter implementations ******************/
408
409 /* The purpose of this function is to get around gcc compiler warnings. */
410 static inline void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)411 fromUWriteUInt8(UConverter *cnv,
412 const char *bytes, int32_t length,
413 uint8_t **target, const char *targetLimit,
414 int32_t **offsets,
415 int32_t sourceIndex,
416 UErrorCode *pErrorCode)
417 {
418 char *targetChars = (char *)*target;
419 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
420 offsets, sourceIndex, pErrorCode);
421 *target = (uint8_t*)targetChars;
422
423 }
424
425 static inline void
setInitialStateToUnicodeKR(UConverter *,UConverterDataISO2022 * myConverterData)426 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
427 if(myConverterData->version == 1) {
428 UConverter *cnv = myConverterData->currentConverter;
429
430 cnv->toUnicodeStatus=0; /* offset */
431 cnv->mode=0; /* state */
432 cnv->toULength=0; /* byteIndex */
433 }
434 }
435
436 static inline void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)437 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
438 /* in ISO-2022-KR the designator sequence appears only once
439 * in a file so we append it only once
440 */
441 if( converter->charErrorBufferLength==0){
442
443 converter->charErrorBufferLength = 4;
444 converter->charErrorBuffer[0] = 0x1b;
445 converter->charErrorBuffer[1] = 0x24;
446 converter->charErrorBuffer[2] = 0x29;
447 converter->charErrorBuffer[3] = 0x43;
448 }
449 if(myConverterData->version == 1) {
450 UConverter *cnv = myConverterData->currentConverter;
451
452 cnv->fromUChar32=0;
453 cnv->fromUnicodeStatus=1; /* prevLength */
454 }
455 }
456
457 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)458 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
459
460 char myLocale[6]={' ',' ',' ',' ',' ',' '};
461
462 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
463 if(cnv->extraInfo != NULL) {
464 UConverterNamePieces stackPieces;
465 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
467 uint32_t version;
468
469 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
470
471 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
472 myConverterData->currentType = ASCII1;
473 cnv->fromUnicodeStatus =FALSE;
474 if(pArgs->locale){
475 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
476 }
477 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
478 myConverterData->version = version;
479 /* Begin Google-specific change. */
480 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
481 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
482 if((myLocale[0]=='j' &&
483 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
484 myLocale[1]=='s') &&
485 (myLocale[2]=='_' || myLocale[2]=='\0')))
486 {
487 size_t len=0;
488 /* open the required converters and cache them */
489 if(version>MAX_JA_VERSION) {
490 /* prevent indexing beyond jpCharsetMasks[] */
491 myConverterData->version = version = 0;
492 }
493 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
494 myConverterData->myConverterArray[ISO8859_7] =
495 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
496 }
497 if (myLocale[1]=='k') { /* Use KDDI's version. */
498 myConverterData->myConverterArray[JISX208] =
499 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
500 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */
501 myConverterData->myConverterArray[JISX208] =
502 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
503 } else {
504 /*
505 * Change for http://b/issue?id=937017 :
506 * Restore JIS X 0208 ISO-2022-JP mappings from before
507 * sharing the table with the Shift-JIS converter
508 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797).
509 * TODO(mscherer): Create and use a new, unified Google Shift-JIS
510 * table for both Shift-JIS and ISO-2022-JP.
511 */
512 myConverterData->myConverterArray[JISX208] =
513 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode);
514 }
515 /* End Google-specific change. */
516 if(jpCharsetMasks[version]&CSM(JISX212)) {
517 myConverterData->myConverterArray[JISX212] =
518 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
519 }
520 if(jpCharsetMasks[version]&CSM(GB2312)) {
521 myConverterData->myConverterArray[GB2312] =
522 /* BEGIN android-changed */
523 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
524 /* END android-changed */
525 }
526 if(jpCharsetMasks[version]&CSM(KSC5601)) {
527 myConverterData->myConverterArray[KSC5601] =
528 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
529 }
530
531 /* set the function pointers to appropriate funtions */
532 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
533 uprv_strcpy(myConverterData->locale,"ja");
534
535 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
536 len = uprv_strlen(myConverterData->name);
537 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
538 myConverterData->name[len+1]='\0';
539 }
540 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541 (myLocale[2]=='_' || myLocale[2]=='\0'))
542 {
543 const char *cnvName;
544 if(version==1) {
545 cnvName="icu-internal-25546";
546 } else {
547 /* BEGIN android-changed */
548 cnvName="ksc_5601";
549 /* END android-changed */
550 myConverterData->version=version=0;
551 }
552 if(pArgs->onlyTestIsLoadable) {
553 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
554 uprv_free(cnv->extraInfo);
555 cnv->extraInfo=NULL;
556 return;
557 } else {
558 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
559 if (U_FAILURE(*errorCode)) {
560 _ISO2022Close(cnv);
561 return;
562 }
563
564 if(version==1) {
565 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
566 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
567 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
568 }else{
569 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
570 }
571
572 /* initialize the state variables */
573 setInitialStateToUnicodeKR(cnv, myConverterData);
574 setInitialStateFromUnicodeKR(cnv, myConverterData);
575
576 /* set the function pointers to appropriate funtions */
577 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
578 uprv_strcpy(myConverterData->locale,"ko");
579 }
580 }
581 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
582 (myLocale[2]=='_' || myLocale[2]=='\0'))
583 {
584
585 /* open the required converters and cache them */
586 /* BEGIN android-changed */
587 myConverterData->myConverterArray[GB2312_1] =
588 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
589 if(version==1) {
590 myConverterData->myConverterArray[ISO_IR_165] =
591 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
592 }
593 myConverterData->myConverterArray[CNS_11643] =
594 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
595 /* END android-changed */
596
597
598 /* set the function pointers to appropriate funtions */
599 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
600 uprv_strcpy(myConverterData->locale,"cn");
601
602 if (version==0){
603 myConverterData->version = 0;
604 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
605 }else if (version==1){
606 myConverterData->version = 1;
607 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
608 }else {
609 myConverterData->version = 2;
610 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
611 }
612 }
613 else{
614 #ifdef U_ENABLE_GENERIC_ISO_2022
615 myConverterData->isFirstBuffer = TRUE;
616
617 /* append the UTF-8 escape sequence */
618 cnv->charErrorBufferLength = 3;
619 cnv->charErrorBuffer[0] = 0x1b;
620 cnv->charErrorBuffer[1] = 0x25;
621 cnv->charErrorBuffer[2] = 0x42;
622
623 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
624 /* initialize the state variables */
625 uprv_strcpy(myConverterData->name,"ISO_2022");
626 #else
627 *errorCode = U_UNSUPPORTED_ERROR;
628 return;
629 #endif
630 }
631
632 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
633
634 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
635 _ISO2022Close(cnv);
636 }
637 } else {
638 *errorCode = U_MEMORY_ALLOCATION_ERROR;
639 }
640 }
641
642
643 static void
_ISO2022Close(UConverter * converter)644 _ISO2022Close(UConverter *converter) {
645 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
646 UConverterSharedData **array = myData->myConverterArray;
647 int32_t i;
648
649 if (converter->extraInfo != NULL) {
650 /*close the array of converter pointers and free the memory*/
651 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
652 if(array[i]!=NULL) {
653 ucnv_unloadSharedDataIfReady(array[i]);
654 }
655 }
656
657 ucnv_close(myData->currentConverter);
658
659 if(!converter->isExtraLocal){
660 uprv_free (converter->extraInfo);
661 converter->extraInfo = NULL;
662 }
663 }
664 }
665
666 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)667 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
668 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
669 if(choice<=UCNV_RESET_TO_UNICODE) {
670 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
671 myConverterData->key = 0;
672 myConverterData->isEmptySegment = FALSE;
673 }
674 if(choice!=UCNV_RESET_TO_UNICODE) {
675 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
676 }
677 #ifdef U_ENABLE_GENERIC_ISO_2022
678 if(myConverterData->locale[0] == 0){
679 if(choice<=UCNV_RESET_TO_UNICODE) {
680 myConverterData->isFirstBuffer = TRUE;
681 myConverterData->key = 0;
682 if (converter->mode == UCNV_SO){
683 ucnv_close (myConverterData->currentConverter);
684 myConverterData->currentConverter=NULL;
685 }
686 converter->mode = UCNV_SI;
687 }
688 if(choice!=UCNV_RESET_TO_UNICODE) {
689 /* re-append UTF-8 escape sequence */
690 converter->charErrorBufferLength = 3;
691 converter->charErrorBuffer[0] = 0x1b;
692 converter->charErrorBuffer[1] = 0x28;
693 converter->charErrorBuffer[2] = 0x42;
694 }
695 }
696 else
697 #endif
698 {
699 /* reset the state variables */
700 if(myConverterData->locale[0] == 'k'){
701 if(choice<=UCNV_RESET_TO_UNICODE) {
702 setInitialStateToUnicodeKR(converter, myConverterData);
703 }
704 if(choice!=UCNV_RESET_TO_UNICODE) {
705 setInitialStateFromUnicodeKR(converter, myConverterData);
706 }
707 }
708 }
709 }
710
711 static const char*
_ISO2022getName(const UConverter * cnv)712 _ISO2022getName(const UConverter* cnv){
713 if(cnv->extraInfo){
714 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
715 return myData->name;
716 }
717 return NULL;
718 }
719
720
721 /*************** to unicode *******************/
722 /****************************************************************************
723 * Recognized escape sequences are
724 * <ESC>(B ASCII
725 * <ESC>.A ISO-8859-1
726 * <ESC>.F ISO-8859-7
727 * <ESC>(J JISX-201
728 * <ESC>(I JISX-201
729 * <ESC>$B JISX-208
730 * <ESC>$@ JISX-208
731 * <ESC>$(D JISX-212
732 * <ESC>$A GB2312
733 * <ESC>$(C KSC5601
734 */
735 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
736 /* 0 1 2 3 4 5 6 7 8 9 */
737 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
738 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
739 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
740 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
741 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
742 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
743 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
744 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
745 };
746
747 /*************** to unicode *******************/
748 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
749 /* 0 1 2 3 4 5 6 7 8 9 */
750 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
751 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
752 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
753 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
754 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
755 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
756 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
757 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
758 };
759
760
761 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)762 getKey_2022(char c,int32_t* key,int32_t* offset){
763 int32_t togo;
764 int32_t low = 0;
765 int32_t hi = MAX_STATES_2022;
766 int32_t oldmid=0;
767
768 togo = normalize_esq_chars_2022[(uint8_t)c];
769 if(togo == 0) {
770 /* not a valid character anywhere in an escape sequence */
771 *key = 0;
772 *offset = 0;
773 return INVALID_2022;
774 }
775 togo = (*key << 5) + togo;
776
777 while (hi != low) /*binary search*/{
778
779 int32_t mid = (hi+low) >> 1; /*Finds median*/
780
781 if (mid == oldmid)
782 break;
783
784 if (escSeqStateTable_Key_2022[mid] > togo){
785 hi = mid;
786 }
787 else if (escSeqStateTable_Key_2022[mid] < togo){
788 low = mid;
789 }
790 else /*we found it*/{
791 *key = togo;
792 *offset = mid;
793 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
794 }
795 oldmid = mid;
796
797 }
798
799 *key = 0;
800 *offset = 0;
801 return INVALID_2022;
802 }
803
/* Runs through a state machine to determine the escape sequence - codepage correspondence.
 */
806 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)807 changeState_2022(UConverter* _this,
808 const char** source,
809 const char* sourceLimit,
810 Variant2022 var,
811 UErrorCode* err){
812 UCNV_TableStates_2022 value;
813 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
814 uint32_t key = myData2022->key;
815 int32_t offset = 0;
816 int8_t initialToULength = _this->toULength;
817 char c;
818
819 value = VALID_NON_TERMINAL_2022;
820 while (*source < sourceLimit) {
821 c = *(*source)++;
822 _this->toUBytes[_this->toULength++]=(uint8_t)c;
823 value = getKey_2022(c,(int32_t *) &key, &offset);
824
825 switch (value){
826
827 case VALID_NON_TERMINAL_2022 :
828 /* continue with the loop */
829 break;
830
831 case VALID_TERMINAL_2022:
832 key = 0;
833 goto DONE;
834
835 case INVALID_2022:
836 goto DONE;
837
838 case VALID_MAYBE_TERMINAL_2022:
839 #ifdef U_ENABLE_GENERIC_ISO_2022
840 /* ESC ( B is ambiguous only for ISO_2022 itself */
841 if(var == ISO_2022) {
842 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
843 _this->toULength = 0;
844
845 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
846
847 /* continue with the loop */
848 value = VALID_NON_TERMINAL_2022;
849 break;
850 } else
851 #endif
852 {
853 /* not ISO_2022 itself, finish here */
854 value = VALID_TERMINAL_2022;
855 key = 0;
856 goto DONE;
857 }
858 }
859 }
860
861 DONE:
862 myData2022->key = key;
863
864 if (value == VALID_NON_TERMINAL_2022) {
865 /* indicate that the escape sequence is incomplete: key!=0 */
866 return;
867 } else if (value == INVALID_2022 ) {
868 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
869 } else /* value == VALID_TERMINAL_2022 */ {
870 switch(var){
871 #ifdef U_ENABLE_GENERIC_ISO_2022
872 case ISO_2022:
873 {
874 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
875 if(chosenConverterName == NULL) {
876 /* SS2 or SS3 */
877 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
878 _this->toUCallbackReason = UCNV_UNASSIGNED;
879 return;
880 }
881
882 _this->mode = UCNV_SI;
883 ucnv_close(myData2022->currentConverter);
884 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
885 if(U_SUCCESS(*err)) {
886 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
887 _this->mode = UCNV_SO;
888 }
889 break;
890 }
891 #endif
892 case ISO_2022_JP:
893 {
894 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
895 switch(tempState) {
896 case INVALID_STATE:
897 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
898 break;
899 case SS2_STATE:
900 if(myData2022->toU2022State.cs[2]!=0) {
901 if(myData2022->toU2022State.g<2) {
902 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
903 }
904 myData2022->toU2022State.g=2;
905 } else {
906 /* illegal to have SS2 before a matching designator */
907 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
908 }
909 break;
910 /* case SS3_STATE: not used in ISO-2022-JP-x */
911 case ISO8859_1:
912 case ISO8859_7:
913 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
914 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
915 } else {
916 /* G2 charset for SS2 */
917 myData2022->toU2022State.cs[2]=(int8_t)tempState;
918 }
919 break;
920 default:
921 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
922 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
923 } else {
924 /* G0 charset */
925 myData2022->toU2022State.cs[0]=(int8_t)tempState;
926 }
927 break;
928 }
929 }
930 break;
931 case ISO_2022_CN:
932 {
933 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
934 switch(tempState) {
935 case INVALID_STATE:
936 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
937 break;
938 case SS2_STATE:
939 if(myData2022->toU2022State.cs[2]!=0) {
940 if(myData2022->toU2022State.g<2) {
941 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
942 }
943 myData2022->toU2022State.g=2;
944 } else {
945 /* illegal to have SS2 before a matching designator */
946 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
947 }
948 break;
949 case SS3_STATE:
950 if(myData2022->toU2022State.cs[3]!=0) {
951 if(myData2022->toU2022State.g<2) {
952 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
953 }
954 myData2022->toU2022State.g=3;
955 } else {
956 /* illegal to have SS3 before a matching designator */
957 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
958 }
959 break;
960 case ISO_IR_165:
961 if(myData2022->version==0) {
962 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
963 break;
964 }
965 /*fall through*/
966 case GB2312_1:
967 /*fall through*/
968 case CNS_11643_1:
969 myData2022->toU2022State.cs[1]=(int8_t)tempState;
970 break;
971 case CNS_11643_2:
972 myData2022->toU2022State.cs[2]=(int8_t)tempState;
973 break;
974 default:
975 /* other CNS 11643 planes */
976 if(myData2022->version==0) {
977 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
978 } else {
979 myData2022->toU2022State.cs[3]=(int8_t)tempState;
980 }
981 break;
982 }
983 }
984 break;
985 case ISO_2022_KR:
986 if(offset==0x30){
987 /* nothing to be done, just accept this one escape sequence */
988 } else {
989 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
990 }
991 break;
992
993 default:
994 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
995 break;
996 }
997 }
998 if(U_SUCCESS(*err)) {
999 _this->toULength = 0;
1000 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1001 if(_this->toULength>1) {
1002 /*
1003 * Ticket 5691: consistent illegal sequences:
1004 * - We include at least the first byte (ESC) in the illegal sequence.
1005 * - If any of the non-initial bytes could be the start of a character,
1006 * we stop the illegal sequence before the first one of those.
1007 * In escape sequences, all following bytes are "printable", that is,
1008 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1009 * they are valid single/lead bytes.
1010 * For simplicity, we always only report the initial ESC byte as the
1011 * illegal sequence and back out all other bytes we looked at.
1012 */
1013 /* Back out some bytes. */
1014 int8_t backOutDistance=_this->toULength-1;
1015 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1016 if(backOutDistance<=bytesFromThisBuffer) {
1017 /* same as initialToULength<=1 */
1018 *source-=backOutDistance;
1019 } else {
1020 /* Back out bytes from the previous buffer: Need to replay them. */
1021 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1022 /* same as -(initialToULength-1) */
1023 /* preToULength is negative! */
1024 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1025 *source-=bytesFromThisBuffer;
1026 }
1027 _this->toULength=1;
1028 }
1029 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1030 _this->toUCallbackReason = UCNV_UNASSIGNED;
1031 }
1032 }
1033
1034 /*Checks the characters of the buffer against valid 2022 escape sequences
1035 *if the match we return a pointer to the initial start of the sequence otherwise
1036 *we return sourceLimit
1037 */
1038 /*for 2022 looks ahead in the stream
1039 *to determine the longest possible convertible
1040 *data stream
1041 */
1042 static inline const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool)1043 getEndOfBuffer_2022(const char** source,
1044 const char* sourceLimit,
1045 UBool /*flush*/){
1046
1047 const char* mySource = *source;
1048
1049 #ifdef U_ENABLE_GENERIC_ISO_2022
1050 if (*source >= sourceLimit)
1051 return sourceLimit;
1052
1053 do{
1054
1055 if (*mySource == ESC_2022){
1056 int8_t i;
1057 int32_t key = 0;
1058 int32_t offset;
1059 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1060
1061 /* Kludge: I could not
1062 * figure out the reason for validating an escape sequence
1063 * twice - once here and once in changeState_2022().
1064 * is it possible to have an ESC character in a ISO2022
1065 * byte stream which is valid in a code page? Is it legal?
1066 */
1067 for (i=0;
1068 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1069 i++) {
1070 value = getKey_2022(*(mySource+i), &key, &offset);
1071 }
1072 if (value > 0 || *mySource==ESC_2022)
1073 return mySource;
1074
1075 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1076 return sourceLimit;
1077 }
1078 }while (++mySource < sourceLimit);
1079
1080 return sourceLimit;
1081 #else
1082 while(mySource < sourceLimit && *mySource != ESC_2022) {
1083 ++mySource;
1084 }
1085 return mySource;
1086 #endif
1087 }
1088
1089
1090 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1091 * any future change in _MBCSFromUChar32() function should be reflected here.
1092 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1093 */
1094 static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1095 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1096 UChar32 c,
1097 uint32_t* value,
1098 UBool useFallback,
1099 int outputType)
1100 {
1101 const int32_t *cx;
1102 const uint16_t *table;
1103 uint32_t stage2Entry;
1104 uint32_t myValue;
1105 int32_t length;
1106 const uint8_t *p;
1107 /*
1108 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1109 * Use internal version of ucnv_open() that verifies that the new structures are available,
1110 * else U_INTERNAL_PROGRAM_ERROR.
1111 */
1112 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1113 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1114 table=sharedData->mbcs.fromUnicodeTable;
1115 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1116 /* get the bytes and the length for the output */
1117 if(outputType==MBCS_OUTPUT_2){
1118 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1119 if(myValue<=0xff) {
1120 length=1;
1121 } else {
1122 length=2;
1123 }
1124 } else /* outputType==MBCS_OUTPUT_3 */ {
1125 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1126 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1127 if(myValue<=0xff) {
1128 length=1;
1129 } else if(myValue<=0xffff) {
1130 length=2;
1131 } else {
1132 length=3;
1133 }
1134 }
1135 /* is this code point assigned, or do we use fallbacks? */
1136 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1137 /* assigned */
1138 *value=myValue;
1139 return length;
1140 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1141 /*
1142 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1143 * There is no way with this data structure for fallback output
1144 * to be a zero byte.
1145 */
1146 *value=myValue;
1147 return -length;
1148 }
1149 }
1150
1151 cx=sharedData->mbcs.extIndexes;
1152 if(cx!=NULL) {
1153 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1154 }
1155
1156 /* unassigned */
1157 return 0;
1158 }
1159
1160 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1161 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1162 * @param retval pointer to output byte
1163 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1164 */
1165 static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1166 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1167 UChar32 c,
1168 uint32_t* retval,
1169 UBool useFallback)
1170 {
1171 const uint16_t *table;
1172 int32_t value;
1173 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1174 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1175 return 0;
1176 }
1177 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1178 table=sharedData->mbcs.fromUnicodeTable;
1179 /* get the byte for the output */
1180 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1181 /* is this code point assigned, or do we use fallbacks? */
1182 *retval=(uint32_t)(value&0xff);
1183 if(value>=0xf00) {
1184 return 1; /* roundtrip */
1185 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1186 return -1; /* fallback taken */
1187 } else {
1188 return 0; /* no mapping */
1189 }
1190 }
1191
/*
 * Accept only a 2-byte value whose lead and trail bytes are both in the range
 * A1..FE (strict EUC DBCS), and move it to the ISO 2022 range 21..7E by
 * subtracting 0x80 from each byte.
 * Return 0 if either byte is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint8_t lead = (uint8_t)(value >> 8);
    const uint8_t trail = (uint8_t)value;

    if(lead < 0xa1 || lead > 0xfe || trail < 0xa1 || trail > 0xfe) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1208
1209 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1210 /*
1211 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1212 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1213 * unchanged.
1214 */
1215 static inline uint32_t
1216 _2022ToGR94DBCS(uint32_t value) {
1217 uint32_t returnValue = value + 0x8080;
1218 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1219 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
1220 return returnValue;
1221 } else {
1222 return value;
1223 }
1224 }
1225 #endif
1226
1227 #ifdef U_ENABLE_GENERIC_ISO_2022
1228
1229 /**********************************************************************************
1230 * ISO-2022 Converter
1231 *
1232 *
1233 */
1234
1235 static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)1236 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1237 UErrorCode* err){
1238 const char* mySourceLimit, *realSourceLimit;
1239 const char* sourceStart;
1240 const UChar* myTargetStart;
1241 UConverter* saveThis;
1242 UConverterDataISO2022* myData;
1243 int8_t length;
1244
1245 saveThis = args->converter;
1246 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1247
1248 realSourceLimit = args->sourceLimit;
1249 while (args->source < realSourceLimit) {
1250 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1251 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1252 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1253
1254 if(args->source < mySourceLimit) {
1255 if(myData->currentConverter==NULL) {
1256 myData->currentConverter = ucnv_open("ASCII",err);
1257 if(U_FAILURE(*err)){
1258 return;
1259 }
1260
1261 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1262 saveThis->mode = UCNV_SO;
1263 }
1264
1265 /* convert to before the ESC or until the end of the buffer */
1266 myData->isFirstBuffer=FALSE;
1267 sourceStart = args->source;
1268 myTargetStart = args->target;
1269 args->converter = myData->currentConverter;
1270 ucnv_toUnicode(args->converter,
1271 &args->target,
1272 args->targetLimit,
1273 &args->source,
1274 mySourceLimit,
1275 args->offsets,
1276 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1277 err);
1278 args->converter = saveThis;
1279
1280 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1281 /* move the overflow buffer */
1282 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1283 myData->currentConverter->UCharErrorBufferLength = 0;
1284 if(length > 0) {
1285 uprv_memcpy(saveThis->UCharErrorBuffer,
1286 myData->currentConverter->UCharErrorBuffer,
1287 length*U_SIZEOF_UCHAR);
1288 }
1289 return;
1290 }
1291
1292 /*
1293 * At least one of:
1294 * -Error while converting
1295 * -Done with entire buffer
1296 * -Need to write offsets or update the current offset
1297 * (leave that up to the code in ucnv.c)
1298 *
1299 * or else we just stopped at an ESC byte and continue with changeState_2022()
1300 */
1301 if (U_FAILURE(*err) ||
1302 (args->source == realSourceLimit) ||
1303 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1304 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1305 ) {
1306 /* copy partial or error input for truncated detection and error handling */
1307 if(U_FAILURE(*err)) {
1308 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1309 if(length > 0) {
1310 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1311 }
1312 } else {
1313 length = saveThis->toULength = myData->currentConverter->toULength;
1314 if(length > 0) {
1315 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1316 if(args->source < mySourceLimit) {
1317 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1318 }
1319 }
1320 }
1321 return;
1322 }
1323 }
1324 }
1325
1326 sourceStart = args->source;
1327 changeState_2022(args->converter,
1328 &(args->source),
1329 realSourceLimit,
1330 ISO_2022,
1331 err);
1332 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1333 /* let the ucnv.c code update its current offset */
1334 return;
1335 }
1336 }
1337 }
1338
1339 #endif
1340
1341 /*
1342 * To Unicode Callback helper function
1343 */
1344 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1345 toUnicodeCallback(UConverter *cnv,
1346 const uint32_t sourceChar, const uint32_t targetUniChar,
1347 UErrorCode* err){
1348 if(sourceChar>0xff){
1349 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1350 cnv->toUBytes[1] = (uint8_t)sourceChar;
1351 cnv->toULength = 2;
1352 }
1353 else{
1354 cnv->toUBytes[0] =(char) sourceChar;
1355 cnv->toULength = 1;
1356 }
1357
1358 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1359 *err = U_INVALID_CHAR_FOUND;
1360 }
1361 else{
1362 *err = U_ILLEGAL_CHAR_FOUND;
1363 }
1364 }
1365
1366 /**************************************ISO-2022-JP*************************************************/
1367
1368 /************************************** IMPORTANT **************************************************
1369 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1370 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1371 * The converter iterates over each Unicode codepoint
1372 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1373 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1374 * would do as far as possible.
1375 *
1376 * If the implementation of these macros or structure of sharedData struct change in the future, make
1377 * sure that ISO-2022 is also changed.
1378 ***************************************************************************************************
1379 */
1380
1381 /***************************************************************************************************
1382 * Rules for ISO-2022-jp encoding
1383 * (i) Escape sequences must be fully contained within a line they should not
1384 * span new lines or CRs
1385 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1386 * JIS-Roman character escape sequence should follow before the line terminates
1387 * (iii) If the first character on the line is represented by two bytes then a two
1388 * byte character escape sequence should precede it
1389 * (iv) If no escape sequence is encountered then the characters are ASCII
1390 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1391 * and invoked with SS2 (ESC N).
1392 * (vi) If there is any G0 designation in text, there must be a switch to
1393 * ASCII or to JIS X 0201-Roman before a space character (but not
1394 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1395 * characters such as tab or CRLF.
* (vii) Supported encodings:
1397 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1398 *
1399 * source : RFC-1554
1400 *
1401 * JISX201, JISX208,JISX212 : new .cnv data files created
1402 * KSC5601 : alias to ibm-949 mapping table
1403 * GB2312 : alias to ibm-1386 mapping table
* ISO-8859-1 : Algorithmically implemented as LATIN1 case
* ISO-8859-7 : alias to ibm-9409 mapping table
1406 */
1407
1408 /* preference order of JP charsets */
1409 static const StateEnum jpCharsetPref[]={
1410 ASCII,
1411 JISX201,
1412 ISO8859_1,
1413 ISO8859_7,
1414 JISX208,
1415 JISX212,
1416 GB2312,
1417 KSC5601,
1418 HWKANA_7BIT
1419 };
1420
/*
 * Designator escape sequence for each charset.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    /* 3-byte sequence ESC ( B : ASCII */
    "\x1B\x28\x42",
    /* 3-byte sequence ESC . A : ISO-8859-1 */
    "\x1B\x2E\x41",
    /* 3-byte sequence ESC . F : ISO-8859-7 */
    "\x1B\x2E\x46",
    /* 3-byte sequence ESC ( J : JISX-201 */
    "\x1B\x28\x4A",
    /* 3-byte sequence ESC $ B : JISX-208 */
    "\x1B\x24\x42",
    /* 4-byte sequence ESC $ ( D : JISX-212 */
    "\x1B\x24\x28\x44",
    /* 3-byte sequence ESC $ A : GB2312 */
    "\x1B\x24\x41",
    /* 4-byte sequence ESC $ ( C : KSC5601 */
    "\x1B\x24\x28\x43",
    /* 3-byte sequence ESC ( I : HWKANA_7BIT */
    "\x1B\x28\x49"
};
/* Byte length of each escape sequence, in the same order as escSeqChars[]. */
static const int8_t escSeqCharsLen[] ={
    /* <ESC>(B  ASCII       */ 3,
    /* <ESC>.A  ISO-8859-1  */ 3,
    /* <ESC>.F  ISO-8859-7  */ 3,
    /* <ESC>(J  JISX-201    */ 3,
    /* <ESC>$B  JISX-208    */ 3,
    /* <ESC>$(D JISX-212    */ 4,
    /* <ESC>$A  GB2312      */ 3,
    /* <ESC>$(C KSC5601     */ 4,
    /* <ESC>(I  HWKANA_7BIT */ 3
};
1448
1449 /*
1450 * The iteration over various code pages works this way:
1451 * i) Get the currentState from myConverterData->currentState
1452 * ii) Check if the character is mapped to a valid character in the currentState
1453 * Yes -> a) set the initIterState to currentState
1454 * b) remain in this state until an invalid character is found
1455 * No -> a) go to the next code page and find the character
 * iii) Before changing the state (incrementing the current state), check if the current state
 * is equal to the initIterState
1458 * Yes -> A character that cannot be represented in any of the supported encodings
1459 * break and return a U_INVALID_CHARACTER error
1460 * No -> Continue and find the character in next code page
1461 *
1462 *
1463 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1464 */
1465
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C maps to U+00A5 and 7E maps to U+203E; every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1479
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 maps to 5C and U+203E maps to 7E; U+005C and U+007E themselves are
 * unmappable, all other values up to 7F map to themselves.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte values are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value<=0x7f) ? value : 0xfffe;
    }
}
1494
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208 (not above EFFC), and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t low;
    uint32_t row;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    low = (uint8_t)value;

    /* lead byte: undo the Shift-JIS offset, then each lead byte covers two rows */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000 /* 0xe000 <= row <= 0xef00 */;
    row <<= 1;

    if(low > 0x9e) {
        /* trail <= 0xfc: second of the two rows */
        return row | (low - 0x7e);
    }

    /* first of the two rows; trail bytes above 7E are offset by one more */
    row -= 0x100;
    return row | (low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1530
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 * The result is written to bytes[0] (lead) and bytes[1] (trail); a byte that
 * is found to be invalid is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) != 0) {
        /* odd row: first half of the Shift-JIS trail byte range */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    }

    /* two rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead += 0x70;
    } else if(lead <= 0x3f) {
        lead += 0xb0;
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1567
1568 /*
1569 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1570 * Katakana.
1571 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1572 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1573 * These were the only fallbacks in ICU's jisx-208.ucm file.
1574 */
1575 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1576 0x2123, /* U+FF61 */
1577 0x2156,
1578 0x2157,
1579 0x2122,
1580 0x2126,
1581 0x2572,
1582 0x2521,
1583 0x2523,
1584 0x2525,
1585 0x2527,
1586 0x2529,
1587 0x2563,
1588 0x2565,
1589 0x2567,
1590 0x2543,
1591 0x213C, /* U+FF70 */
1592 0x2522,
1593 0x2524,
1594 0x2526,
1595 0x2528,
1596 0x252A,
1597 0x252B,
1598 0x252D,
1599 0x252F,
1600 0x2531,
1601 0x2533,
1602 0x2535,
1603 0x2537,
1604 0x2539,
1605 0x253B,
1606 0x253D,
1607 0x253F, /* U+FF80 */
1608 0x2541,
1609 0x2544,
1610 0x2546,
1611 0x2548,
1612 0x254A,
1613 0x254B,
1614 0x254C,
1615 0x254D,
1616 0x254E,
1617 0x254F,
1618 0x2552,
1619 0x2555,
1620 0x2558,
1621 0x255B,
1622 0x255E,
1623 0x255F, /* U+FF90 */
1624 0x2560,
1625 0x2561,
1626 0x2562,
1627 0x2564,
1628 0x2566,
1629 0x2568,
1630 0x2569,
1631 0x256A,
1632 0x256B,
1633 0x256C,
1634 0x256D,
1635 0x256F,
1636 0x2573,
1637 0x212B,
1638 0x212C /* U+FF9F */
1639 };
1640
1641 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1642 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1643 UConverter *cnv = args->converter;
1644 UConverterDataISO2022 *converterData;
1645 ISO2022State *pFromU2022State;
1646 uint8_t *target = (uint8_t *) args->target;
1647 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1648 const UChar* source = args->source;
1649 const UChar* sourceLimit = args->sourceLimit;
1650 int32_t* offsets = args->offsets;
1651 UChar32 sourceChar;
1652 char buffer[8];
1653 int32_t len, outLen;
1654 int8_t choices[10];
1655 int32_t choiceCount;
1656 uint32_t targetValue = 0;
1657 UBool useFallback;
1658
1659 int32_t i;
1660 int8_t cs, g;
1661
1662 /* set up the state */
1663 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1664 pFromU2022State = &converterData->fromU2022State;
1665
1666 choiceCount = 0;
1667
1668 /* check if the last codepoint of previous buffer was a lead surrogate*/
1669 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1670 goto getTrail;
1671 }
1672
1673 while(source < sourceLimit) {
1674 if(target < targetLimit) {
1675
1676 sourceChar = *(source++);
1677 /*check if the char is a First surrogate*/
1678 if(U16_IS_SURROGATE(sourceChar)) {
1679 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1680 getTrail:
1681 /*look ahead to find the trail surrogate*/
1682 if(source < sourceLimit) {
1683 /* test the following code unit */
1684 UChar trail=(UChar) *source;
1685 if(U16_IS_TRAIL(trail)) {
1686 source++;
1687 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1688 cnv->fromUChar32=0x00;
1689 /* convert this supplementary code point */
1690 /* exit this condition tree */
1691 } else {
1692 /* this is an unmatched lead code unit (1st surrogate) */
1693 /* callback(illegal) */
1694 *err=U_ILLEGAL_CHAR_FOUND;
1695 cnv->fromUChar32=sourceChar;
1696 break;
1697 }
1698 } else {
1699 /* no more input */
1700 cnv->fromUChar32=sourceChar;
1701 break;
1702 }
1703 } else {
1704 /* this is an unmatched trail code unit (2nd surrogate) */
1705 /* callback(illegal) */
1706 *err=U_ILLEGAL_CHAR_FOUND;
1707 cnv->fromUChar32=sourceChar;
1708 break;
1709 }
1710 }
1711
1712 /* do not convert SO/SI/ESC */
1713 if(IS_2022_CONTROL(sourceChar)) {
1714 /* callback(illegal) */
1715 *err=U_ILLEGAL_CHAR_FOUND;
1716 cnv->fromUChar32=sourceChar;
1717 break;
1718 }
1719
1720 /* do the conversion */
1721
1722 if(choiceCount == 0) {
1723 uint16_t csm;
1724
1725 /*
1726 * The csm variable keeps track of which charsets are allowed
1727 * and not used yet while building the choices[].
1728 */
1729 csm = jpCharsetMasks[converterData->version];
1730 choiceCount = 0;
1731
1732 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1733 if(converterData->version == 3 || converterData->version == 4) {
1734 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1735 }
1736 /* Do not try single-byte half-width Katakana for other versions. */
1737 csm &= ~CSM(HWKANA_7BIT);
1738
1739 /* try the current G0 charset */
1740 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1741 csm &= ~CSM(cs);
1742
1743 /* try the current G2 charset */
1744 if((cs = pFromU2022State->cs[2]) != 0) {
1745 choices[choiceCount++] = cs;
1746 csm &= ~CSM(cs);
1747 }
1748
1749 /* try all the other possible charsets */
1750 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1751 cs = (int8_t)jpCharsetPref[i];
1752 if(CSM(cs) & csm) {
1753 choices[choiceCount++] = cs;
1754 csm &= ~CSM(cs);
1755 }
1756 }
1757 }
1758
1759 cs = g = 0;
1760 /*
1761 * len==0: no mapping found yet
1762 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1763 * len>0: found a roundtrip result, done
1764 */
1765 len = 0;
1766 /*
1767 * We will turn off useFallback after finding a fallback,
1768 * but we still get fallbacks from PUA code points as usual.
1769 * Therefore, we will also need to check that we don't overwrite
1770 * an early fallback with a later one.
1771 */
1772 useFallback = cnv->useFallback;
1773
1774 for(i = 0; i < choiceCount && len <= 0; ++i) {
1775 uint32_t value;
1776 int32_t len2;
1777 int8_t cs0 = choices[i];
1778 switch(cs0) {
1779 case ASCII:
1780 if(sourceChar <= 0x7f) {
1781 targetValue = (uint32_t)sourceChar;
1782 len = 1;
1783 cs = cs0;
1784 g = 0;
1785 }
1786 break;
1787 case ISO8859_1:
1788 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1789 targetValue = (uint32_t)sourceChar - 0x80;
1790 len = 1;
1791 cs = cs0;
1792 g = 2;
1793 }
1794 break;
1795 case HWKANA_7BIT:
1796 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1797 if(converterData->version==3) {
1798 /* JIS7: use G1 (SO) */
1799 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1800 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1801 len = 1;
1802 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1803 g = 1;
1804 } else if(converterData->version==4) {
1805 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1806 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1807 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1808 len = 1;
1809
1810 cs = pFromU2022State->cs[0];
1811 if(IS_JP_DBCS(cs)) {
1812 /* switch from a DBCS charset to JISX201 */
1813 cs = (int8_t)JISX201;
1814 }
1815 /* else stay in the current G0 charset */
1816 g = 0;
1817 }
1818 /* else do not use HWKANA_7BIT with other versions */
1819 }
1820 break;
1821 case JISX201:
1822 /* G0 SBCS */
1823 value = jisx201FromU(sourceChar);
1824 if(value <= 0x7f) {
1825 targetValue = value;
1826 len = 1;
1827 cs = cs0;
1828 g = 0;
1829 useFallback = FALSE;
1830 }
1831 break;
1832 case JISX208:
1833 /* G0 DBCS from Shift-JIS table */
1834 len2 = MBCS_FROM_UCHAR32_ISO2022(
1835 converterData->myConverterArray[cs0],
1836 sourceChar, &value,
1837 useFallback, MBCS_OUTPUT_2);
1838 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1839 value = _2022FromSJIS(value);
1840 if(value != 0) {
1841 targetValue = value;
1842 len = len2;
1843 cs = cs0;
1844 g = 0;
1845 useFallback = FALSE;
1846 }
1847 } else if(len == 0 && useFallback &&
1848 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1849 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1850 len = -2;
1851 cs = cs0;
1852 g = 0;
1853 useFallback = FALSE;
1854 }
1855 break;
1856 case ISO8859_7:
1857 /* G0 SBCS forced to 7-bit output */
1858 len2 = MBCS_SINGLE_FROM_UCHAR32(
1859 converterData->myConverterArray[cs0],
1860 sourceChar, &value,
1861 useFallback);
1862 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1863 targetValue = value - 0x80;
1864 len = len2;
1865 cs = cs0;
1866 g = 2;
1867 useFallback = FALSE;
1868 }
1869 break;
1870 default:
1871 /* G0 DBCS */
1872 len2 = MBCS_FROM_UCHAR32_ISO2022(
1873 converterData->myConverterArray[cs0],
1874 sourceChar, &value,
1875 useFallback, MBCS_OUTPUT_2);
1876 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1877 if(cs0 == KSC5601) {
1878 /*
1879 * Check for valid bytes for the encoding scheme.
1880 * This is necessary because the sub-converter (windows-949)
1881 * has a broader encoding scheme than is valid for 2022.
1882 */
1883 value = _2022FromGR94DBCS(value);
1884 if(value == 0) {
1885 break;
1886 }
1887 }
1888 targetValue = value;
1889 len = len2;
1890 cs = cs0;
1891 g = 0;
1892 useFallback = FALSE;
1893 }
1894 break;
1895 }
1896 }
1897
1898 if(len != 0) {
1899 if(len < 0) {
1900 len = -len; /* fallback */
1901 }
1902 outLen = 0; /* count output bytes */
1903
1904 /* write SI if necessary (only for JIS7) */
1905 if(pFromU2022State->g == 1 && g == 0) {
1906 buffer[outLen++] = UCNV_SI;
1907 pFromU2022State->g = 0;
1908 }
1909
1910 /* write the designation sequence if necessary */
1911 if(cs != pFromU2022State->cs[g]) {
1912 int32_t escLen = escSeqCharsLen[cs];
1913 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1914 outLen += escLen;
1915 pFromU2022State->cs[g] = cs;
1916
1917 /* invalidate the choices[] */
1918 choiceCount = 0;
1919 }
1920
1921 /* write the shift sequence if necessary */
1922 if(g != pFromU2022State->g) {
1923 switch(g) {
1924 /* case 0 handled before writing escapes */
1925 case 1:
1926 buffer[outLen++] = UCNV_SO;
1927 pFromU2022State->g = 1;
1928 break;
1929 default: /* case 2 */
1930 buffer[outLen++] = 0x1b;
1931 buffer[outLen++] = 0x4e;
1932 break;
1933 /* no case 3: no SS3 in ISO-2022-JP-x */
1934 }
1935 }
1936
1937 /* write the output bytes */
1938 if(len == 1) {
1939 buffer[outLen++] = (char)targetValue;
1940 } else /* len == 2 */ {
1941 buffer[outLen++] = (char)(targetValue >> 8);
1942 buffer[outLen++] = (char)targetValue;
1943 }
1944 } else {
1945 /*
1946 * if we cannot find the character after checking all codepages
1947 * then this is an error
1948 */
1949 *err = U_INVALID_CHAR_FOUND;
1950 cnv->fromUChar32=sourceChar;
1951 break;
1952 }
1953
1954 if(sourceChar == CR || sourceChar == LF) {
1955 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1956 pFromU2022State->cs[2] = 0;
1957 choiceCount = 0;
1958 }
1959
1960 /* output outLen>0 bytes in buffer[] */
1961 if(outLen == 1) {
1962 *target++ = buffer[0];
1963 if(offsets) {
1964 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1965 }
1966 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1967 *target++ = buffer[0];
1968 *target++ = buffer[1];
1969 if(offsets) {
1970 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1971 *offsets++ = sourceIndex;
1972 *offsets++ = sourceIndex;
1973 }
1974 } else {
1975 fromUWriteUInt8(
1976 cnv,
1977 buffer, outLen,
1978 &target, (const char *)targetLimit,
1979 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1980 err);
1981 if(U_FAILURE(*err)) {
1982 break;
1983 }
1984 }
1985 } /* end if(myTargetIndex<myTargetLength) */
1986 else{
1987 *err =U_BUFFER_OVERFLOW_ERROR;
1988 break;
1989 }
1990
1991 }/* end while(mySourceIndex<mySourceLength) */
1992
1993 /*
1994 * the end of the input stream and detection of truncated input
1995 * are handled by the framework, but for ISO-2022-JP conversion
1996 * we need to be in ASCII mode at the very end
1997 *
1998 * conditions:
1999 * successful
2000 * in SO mode or not in ASCII mode
2001 * end of input and no truncated input
2002 */
2003 if( U_SUCCESS(*err) &&
2004 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2005 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2006 ) {
2007 int32_t sourceIndex;
2008
2009 outLen = 0;
2010
2011 if(pFromU2022State->g != 0) {
2012 buffer[outLen++] = UCNV_SI;
2013 pFromU2022State->g = 0;
2014 }
2015
2016 if(pFromU2022State->cs[0] != ASCII) {
2017 int32_t escLen = escSeqCharsLen[ASCII];
2018 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2019 outLen += escLen;
2020 pFromU2022State->cs[0] = (int8_t)ASCII;
2021 }
2022
2023 /* get the source index of the last input character */
2024 /*
2025 * TODO this would be simpler and more reliable if we used a pair
2026 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2027 * so that we could simply use the prevSourceIndex here;
2028 * this code gives an incorrect result for the rare case of an unmatched
2029 * trail surrogate that is alone in the last buffer of the text stream
2030 */
2031 sourceIndex=(int32_t)(source-args->source);
2032 if(sourceIndex>0) {
2033 --sourceIndex;
2034 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2035 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2036 ) {
2037 --sourceIndex;
2038 }
2039 } else {
2040 sourceIndex=-1;
2041 }
2042
2043 fromUWriteUInt8(
2044 cnv,
2045 buffer, outLen,
2046 &target, (const char *)targetLimit,
2047 &offsets, sourceIndex,
2048 err);
2049 }
2050
2051 /*save the state and return */
2052 args->source = source;
2053 args->target = (char*)target;
2054 }
2055
2056 /*************** to unicode *******************/
2057
2058 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2059 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2060 UErrorCode* err){
2061 char tempBuf[2];
2062 const char *mySource = (char *) args->source;
2063 UChar *myTarget = args->target;
2064 const char *mySourceLimit = args->sourceLimit;
2065 uint32_t targetUniChar = 0x0000;
2066 uint32_t mySourceChar = 0x0000;
2067 uint32_t tmpSourceChar = 0x0000;
2068 UConverterDataISO2022* myData;
2069 ISO2022State *pToU2022State;
2070 StateEnum cs;
2071
2072 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2073 pToU2022State = &myData->toU2022State;
2074
2075 if(myData->key != 0) {
2076 /* continue with a partial escape sequence */
2077 goto escape;
2078 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2079 /* continue with a partial double-byte character */
2080 mySourceChar = args->converter->toUBytes[0];
2081 args->converter->toULength = 0;
2082 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2083 targetUniChar = missingCharMarker;
2084 goto getTrailByte;
2085 }
2086
2087 while(mySource < mySourceLimit){
2088
2089 targetUniChar =missingCharMarker;
2090
2091 if(myTarget < args->targetLimit){
2092
2093 mySourceChar= (unsigned char) *mySource++;
2094
2095 switch(mySourceChar) {
2096 case UCNV_SI:
2097 if(myData->version==3) {
2098 pToU2022State->g=0;
2099 continue;
2100 } else {
2101 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2102 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2103 break;
2104 }
2105
2106 case UCNV_SO:
2107 if(myData->version==3) {
2108 /* JIS7: switch to G1 half-width Katakana */
2109 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2110 pToU2022State->g=1;
2111 continue;
2112 } else {
2113 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2114 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2115 break;
2116 }
2117
2118 case ESC_2022:
2119 mySource--;
2120 escape:
2121 {
2122 const char * mySourceBefore = mySource;
2123 int8_t toULengthBefore = args->converter->toULength;
2124
2125 changeState_2022(args->converter,&(mySource),
2126 mySourceLimit, ISO_2022_JP,err);
2127
2128 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2129 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2130 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2131 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2132 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2133 }
2134 }
2135
2136 /* invalid or illegal escape sequence */
2137 if(U_FAILURE(*err)){
2138 args->target = myTarget;
2139 args->source = mySource;
2140 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2141 return;
2142 }
2143 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2144 if(myData->key==0) {
2145 myData->isEmptySegment = TRUE;
2146 }
2147 continue;
2148
2149 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2150
2151 case CR:
2152 /*falls through*/
2153 case LF:
2154 /* automatically reset to single-byte mode */
2155 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2156 pToU2022State->cs[0] = (int8_t)ASCII;
2157 }
2158 pToU2022State->cs[2] = 0;
2159 pToU2022State->g = 0;
2160 /* falls through */
2161 default:
2162 /* convert one or two bytes */
2163 myData->isEmptySegment = FALSE;
2164 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2165 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2166 !IS_JP_DBCS(cs)
2167 ) {
2168 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2169 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2170
2171 /* return from a single-shift state to the previous one */
2172 if(pToU2022State->g >= 2) {
2173 pToU2022State->g=pToU2022State->prevG;
2174 }
2175 } else switch(cs) {
2176 case ASCII:
2177 if(mySourceChar <= 0x7f) {
2178 targetUniChar = mySourceChar;
2179 }
2180 break;
2181 case ISO8859_1:
2182 if(mySourceChar <= 0x7f) {
2183 targetUniChar = mySourceChar + 0x80;
2184 }
2185 /* return from a single-shift state to the previous one */
2186 pToU2022State->g=pToU2022State->prevG;
2187 break;
2188 case ISO8859_7:
2189 if(mySourceChar <= 0x7f) {
2190 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2191 targetUniChar =
2192 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2193 myData->myConverterArray[cs],
2194 mySourceChar + 0x80);
2195 }
2196 /* return from a single-shift state to the previous one */
2197 pToU2022State->g=pToU2022State->prevG;
2198 break;
2199 case JISX201:
2200 if(mySourceChar <= 0x7f) {
2201 targetUniChar = jisx201ToU(mySourceChar);
2202 }
2203 break;
2204 case HWKANA_7BIT:
2205 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2206 /* 7-bit halfwidth Katakana */
2207 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2208 }
2209 break;
2210 default:
2211 /* G0 DBCS */
2212 if(mySource < mySourceLimit) {
2213 int leadIsOk, trailIsOk;
2214 uint8_t trailByte;
2215 getTrailByte:
2216 trailByte = (uint8_t)*mySource;
2217 /*
2218 * Ticket 5691: consistent illegal sequences:
2219 * - We include at least the first byte in the illegal sequence.
2220 * - If any of the non-initial bytes could be the start of a character,
2221 * we stop the illegal sequence before the first one of those.
2222 *
2223 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2224 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2225 * Otherwise we convert or report the pair of bytes.
2226 */
2227 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2228 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2229 if (leadIsOk && trailIsOk) {
2230 ++mySource;
2231 tmpSourceChar = (mySourceChar << 8) | trailByte;
2232 if(cs == JISX208) {
2233 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2234 mySourceChar = tmpSourceChar;
2235 } else {
2236 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2237 mySourceChar = tmpSourceChar;
2238 if (cs == KSC5601) {
2239 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2240 }
2241 tempBuf[0] = (char)(tmpSourceChar >> 8);
2242 tempBuf[1] = (char)(tmpSourceChar);
2243 }
2244 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2245 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2246 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2247 ++mySource;
2248 /* add another bit so that the code below writes 2 bytes in case of error */
2249 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2250 }
2251 } else {
2252 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2253 args->converter->toULength = 1;
2254 goto endloop;
2255 }
2256 } /* End of inner switch */
2257 break;
2258 } /* End of outer switch */
2259 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2260 if(args->offsets){
2261 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2262 }
2263 *(myTarget++)=(UChar)targetUniChar;
2264 }
2265 else if(targetUniChar > missingCharMarker){
2266 /* disassemble the surrogate pair and write to output*/
2267 targetUniChar-=0x0010000;
2268 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2269 if(args->offsets){
2270 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2271 }
2272 ++myTarget;
2273 if(myTarget< args->targetLimit){
2274 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2275 if(args->offsets){
2276 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2277 }
2278 ++myTarget;
2279 }else{
2280 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2281 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2282 }
2283
2284 }
2285 else{
2286 /* Call the callback function*/
2287 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2288 break;
2289 }
2290 }
2291 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2292 *err =U_BUFFER_OVERFLOW_ERROR;
2293 break;
2294 }
2295 }
2296 endloop:
2297 args->target = myTarget;
2298 args->source = mySource;
2299 }
2300
2301
2302 /***************************************************************
2303 * Rules for ISO-2022-KR encoding
2304 * i) The KSC5601 designator sequence should appear only once in a file,
2305 * at the beginning of a line before any KSC5601 characters. This usually
2306 * means that it appears by itself on the first line of the file
2307 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2308 * and SI to shift into single byte mode
2309 */
2310 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2311 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2312
2313 UConverter* saveConv = args->converter;
2314 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2315 args->converter=myConverterData->currentConverter;
2316
2317 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2318 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2319 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2320
2321 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2322 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2323 uprv_memcpy(
2324 saveConv->charErrorBuffer,
2325 myConverterData->currentConverter->charErrorBuffer,
2326 myConverterData->currentConverter->charErrorBufferLength);
2327 }
2328 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2329 myConverterData->currentConverter->charErrorBufferLength = 0;
2330 }
2331 args->converter=saveConv;
2332 }
2333
2334 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2335 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2336
2337 const UChar *source = args->source;
2338 const UChar *sourceLimit = args->sourceLimit;
2339 unsigned char *target = (unsigned char *) args->target;
2340 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2341 int32_t* offsets = args->offsets;
2342 uint32_t targetByteUnit = 0x0000;
2343 UChar32 sourceChar = 0x0000;
2344 UBool isTargetByteDBCS;
2345 UBool oldIsTargetByteDBCS;
2346 UConverterDataISO2022 *converterData;
2347 UConverterSharedData* sharedData;
2348 UBool useFallback;
2349 int32_t length =0;
2350
2351 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2352 /* if the version is 1 then the user is requesting
2353 * conversion with ibm-25546 pass the arguments to
2354 * MBCS converter and return
2355 */
2356 if(converterData->version==1){
2357 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2358 return;
2359 }
2360
2361 /* initialize data */
2362 sharedData = converterData->currentConverter->sharedData;
2363 useFallback = args->converter->useFallback;
2364 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2365 oldIsTargetByteDBCS = isTargetByteDBCS;
2366
2367 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2368 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2369 goto getTrail;
2370 }
2371 while(source < sourceLimit){
2372
2373 targetByteUnit = missingCharMarker;
2374
2375 if(target < (unsigned char*) args->targetLimit){
2376 sourceChar = *source++;
2377
2378 /* do not convert SO/SI/ESC */
2379 if(IS_2022_CONTROL(sourceChar)) {
2380 /* callback(illegal) */
2381 *err=U_ILLEGAL_CHAR_FOUND;
2382 args->converter->fromUChar32=sourceChar;
2383 break;
2384 }
2385
2386 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2387 if(length < 0) {
2388 length = -length; /* fallback */
2389 }
2390 /* only DBCS or SBCS characters are expected*/
2391 /* DB characters with high bit set to 1 are expected */
2392 if( length > 2 || length==0 ||
2393 (length == 1 && targetByteUnit > 0x7f) ||
2394 (length == 2 &&
2395 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2396 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2397 ) {
2398 targetByteUnit=missingCharMarker;
2399 }
2400 if (targetByteUnit != missingCharMarker){
2401
2402 oldIsTargetByteDBCS = isTargetByteDBCS;
2403 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2404 /* append the shift sequence */
2405 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2406
2407 if (isTargetByteDBCS)
2408 *target++ = UCNV_SO;
2409 else
2410 *target++ = UCNV_SI;
2411 if(offsets)
2412 *(offsets++) = (int32_t)(source - args->source-1);
2413 }
2414 /* write the targetUniChar to target */
2415 if(targetByteUnit <= 0x00FF){
2416 if( target < targetLimit){
2417 *(target++) = (unsigned char) targetByteUnit;
2418 if(offsets){
2419 *(offsets++) = (int32_t)(source - args->source-1);
2420 }
2421
2422 }else{
2423 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2424 *err = U_BUFFER_OVERFLOW_ERROR;
2425 }
2426 }else{
2427 if(target < targetLimit){
2428 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2429 if(offsets){
2430 *(offsets++) = (int32_t)(source - args->source-1);
2431 }
2432 if(target < targetLimit){
2433 *(target++) =(unsigned char) (targetByteUnit -0x80);
2434 if(offsets){
2435 *(offsets++) = (int32_t)(source - args->source-1);
2436 }
2437 }else{
2438 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2439 *err = U_BUFFER_OVERFLOW_ERROR;
2440 }
2441 }else{
2442 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2443 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2444 *err = U_BUFFER_OVERFLOW_ERROR;
2445 }
2446 }
2447
2448 }
2449 else{
2450 /* oops.. the code point is unassingned
2451 * set the error and reason
2452 */
2453
2454 /*check if the char is a First surrogate*/
2455 if(U16_IS_SURROGATE(sourceChar)) {
2456 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2457 getTrail:
2458 /*look ahead to find the trail surrogate*/
2459 if(source < sourceLimit) {
2460 /* test the following code unit */
2461 UChar trail=(UChar) *source;
2462 if(U16_IS_TRAIL(trail)) {
2463 source++;
2464 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2465 *err = U_INVALID_CHAR_FOUND;
2466 /* convert this surrogate code point */
2467 /* exit this condition tree */
2468 } else {
2469 /* this is an unmatched lead code unit (1st surrogate) */
2470 /* callback(illegal) */
2471 *err=U_ILLEGAL_CHAR_FOUND;
2472 }
2473 } else {
2474 /* no more input */
2475 *err = U_ZERO_ERROR;
2476 }
2477 } else {
2478 /* this is an unmatched trail code unit (2nd surrogate) */
2479 /* callback(illegal) */
2480 *err=U_ILLEGAL_CHAR_FOUND;
2481 }
2482 } else {
2483 /* callback(unassigned) for a BMP code point */
2484 *err = U_INVALID_CHAR_FOUND;
2485 }
2486
2487 args->converter->fromUChar32=sourceChar;
2488 break;
2489 }
2490 } /* end if(myTargetIndex<myTargetLength) */
2491 else{
2492 *err =U_BUFFER_OVERFLOW_ERROR;
2493 break;
2494 }
2495
2496 }/* end while(mySourceIndex<mySourceLength) */
2497
2498 /*
2499 * the end of the input stream and detection of truncated input
2500 * are handled by the framework, but for ISO-2022-KR conversion
2501 * we need to be in ASCII mode at the very end
2502 *
2503 * conditions:
2504 * successful
2505 * not in ASCII mode
2506 * end of input and no truncated input
2507 */
2508 if( U_SUCCESS(*err) &&
2509 isTargetByteDBCS &&
2510 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2511 ) {
2512 int32_t sourceIndex;
2513
2514 /* we are switching to ASCII */
2515 isTargetByteDBCS=FALSE;
2516
2517 /* get the source index of the last input character */
2518 /*
2519 * TODO this would be simpler and more reliable if we used a pair
2520 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2521 * so that we could simply use the prevSourceIndex here;
2522 * this code gives an incorrect result for the rare case of an unmatched
2523 * trail surrogate that is alone in the last buffer of the text stream
2524 */
2525 sourceIndex=(int32_t)(source-args->source);
2526 if(sourceIndex>0) {
2527 --sourceIndex;
2528 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2529 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2530 ) {
2531 --sourceIndex;
2532 }
2533 } else {
2534 sourceIndex=-1;
2535 }
2536
2537 fromUWriteUInt8(
2538 args->converter,
2539 SHIFT_IN_STR, 1,
2540 &target, (const char *)targetLimit,
2541 &offsets, sourceIndex,
2542 err);
2543 }
2544
2545 /*save the state and return */
2546 args->source = source;
2547 args->target = (char*)target;
2548 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2549 }
2550
2551 /************************ To Unicode ***************************************/
2552
2553 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2554 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2555 UErrorCode* err){
2556 char const* sourceStart;
2557 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2558
2559 UConverterToUnicodeArgs subArgs;
2560 int32_t minArgsSize;
2561
2562 /* set up the subconverter arguments */
2563 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2564 minArgsSize = args->size;
2565 } else {
2566 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2567 }
2568
2569 uprv_memcpy(&subArgs, args, minArgsSize);
2570 subArgs.size = (uint16_t)minArgsSize;
2571 subArgs.converter = myData->currentConverter;
2572
2573 /* remember the original start of the input for offsets */
2574 sourceStart = args->source;
2575
2576 if(myData->key != 0) {
2577 /* continue with a partial escape sequence */
2578 goto escape;
2579 }
2580
2581 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2582 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2583 subArgs.source = args->source;
2584 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2585 if(subArgs.source != subArgs.sourceLimit) {
2586 /*
2587 * get the current partial byte sequence
2588 *
2589 * it needs to be moved between the public and the subconverter
2590 * so that the conversion framework, which only sees the public
2591 * converter, can handle truncated and illegal input etc.
2592 */
2593 if(args->converter->toULength > 0) {
2594 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2595 }
2596 subArgs.converter->toULength = args->converter->toULength;
2597
2598 /*
2599 * Convert up to the end of the input, or to before the next escape character.
2600 * Does not handle conversion extensions because the preToU[] state etc.
2601 * is not copied.
2602 */
2603 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2604
2605 if(args->offsets != NULL && sourceStart != args->source) {
2606 /* update offsets to base them on the actual start of the input */
2607 int32_t *offsets = args->offsets;
2608 UChar *target = args->target;
2609 int32_t delta = (int32_t)(args->source - sourceStart);
2610 while(target < subArgs.target) {
2611 if(*offsets >= 0) {
2612 *offsets += delta;
2613 }
2614 ++offsets;
2615 ++target;
2616 }
2617 }
2618 args->source = subArgs.source;
2619 args->target = subArgs.target;
2620 args->offsets = subArgs.offsets;
2621
2622 /* copy input/error/overflow buffers */
2623 if(subArgs.converter->toULength > 0) {
2624 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2625 }
2626 args->converter->toULength = subArgs.converter->toULength;
2627
2628 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2629 if(subArgs.converter->UCharErrorBufferLength > 0) {
2630 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2631 subArgs.converter->UCharErrorBufferLength);
2632 }
2633 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2634 subArgs.converter->UCharErrorBufferLength = 0;
2635 }
2636 }
2637
2638 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2639 return;
2640 }
2641
2642 escape:
2643 changeState_2022(args->converter,
2644 &(args->source),
2645 args->sourceLimit,
2646 ISO_2022_KR,
2647 err);
2648 }
2649 }
2650
2651 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2652 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2653 UErrorCode* err){
2654 char tempBuf[2];
2655 const char *mySource = ( char *) args->source;
2656 UChar *myTarget = args->target;
2657 const char *mySourceLimit = args->sourceLimit;
2658 UChar32 targetUniChar = 0x0000;
2659 UChar mySourceChar = 0x0000;
2660 UConverterDataISO2022* myData;
2661 UConverterSharedData* sharedData ;
2662 UBool useFallback;
2663
2664 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2665 if(myData->version==1){
2666 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2667 return;
2668 }
2669
2670 /* initialize state */
2671 sharedData = myData->currentConverter->sharedData;
2672 useFallback = args->converter->useFallback;
2673
2674 if(myData->key != 0) {
2675 /* continue with a partial escape sequence */
2676 goto escape;
2677 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2678 /* continue with a partial double-byte character */
2679 mySourceChar = args->converter->toUBytes[0];
2680 args->converter->toULength = 0;
2681 goto getTrailByte;
2682 }
2683
2684 while(mySource< mySourceLimit){
2685
2686 if(myTarget < args->targetLimit){
2687
2688 mySourceChar= (unsigned char) *mySource++;
2689
2690 if(mySourceChar==UCNV_SI){
2691 myData->toU2022State.g = 0;
2692 if (myData->isEmptySegment) {
2693 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2694 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2695 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2696 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2697 args->converter->toULength = 1;
2698 args->target = myTarget;
2699 args->source = mySource;
2700 return;
2701 }
2702 /*consume the source */
2703 continue;
2704 }else if(mySourceChar==UCNV_SO){
2705 myData->toU2022State.g = 1;
2706 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2707 /*consume the source */
2708 continue;
2709 }else if(mySourceChar==ESC_2022){
2710 mySource--;
2711 escape:
2712 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2713 changeState_2022(args->converter,&(mySource),
2714 mySourceLimit, ISO_2022_KR, err);
2715 if(U_FAILURE(*err)){
2716 args->target = myTarget;
2717 args->source = mySource;
2718 return;
2719 }
2720 continue;
2721 }
2722
2723 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2724 if(myData->toU2022State.g == 1) {
2725 if(mySource < mySourceLimit) {
2726 int leadIsOk, trailIsOk;
2727 uint8_t trailByte;
2728 getTrailByte:
2729 targetUniChar = missingCharMarker;
2730 trailByte = (uint8_t)*mySource;
2731 /*
2732 * Ticket 5691: consistent illegal sequences:
2733 * - We include at least the first byte in the illegal sequence.
2734 * - If any of the non-initial bytes could be the start of a character,
2735 * we stop the illegal sequence before the first one of those.
2736 *
2737 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2738 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2739 * Otherwise we convert or report the pair of bytes.
2740 */
2741 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2742 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2743 if (leadIsOk && trailIsOk) {
2744 ++mySource;
2745 tempBuf[0] = (char)(mySourceChar + 0x80);
2746 tempBuf[1] = (char)(trailByte + 0x80);
2747 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2748 mySourceChar = (mySourceChar << 8) | trailByte;
2749 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2750 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2751 ++mySource;
2752 /* add another bit so that the code below writes 2 bytes in case of error */
2753 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2754 }
2755 } else {
2756 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2757 args->converter->toULength = 1;
2758 break;
2759 }
2760 }
2761 else if(mySourceChar <= 0x7f) {
2762 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2763 } else {
2764 targetUniChar = 0xffff;
2765 }
2766 if(targetUniChar < 0xfffe){
2767 if(args->offsets) {
2768 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2769 }
2770 *(myTarget++)=(UChar)targetUniChar;
2771 }
2772 else {
2773 /* Call the callback function*/
2774 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2775 break;
2776 }
2777 }
2778 else{
2779 *err =U_BUFFER_OVERFLOW_ERROR;
2780 break;
2781 }
2782 }
2783 args->target = myTarget;
2784 args->source = mySource;
2785 }
2786
2787 /*************************** END ISO2022-KR *********************************/
2788
2789 /*************************** ISO-2022-CN *********************************
2790 *
2791 * Rules for ISO-2022-CN Encoding:
2792 * i) The designator sequence must appear once on a line before any instance
2793 * of character set it designates.
2794 * ii) If two lines contain characters from the same character set, both lines
2795 * must include the designator sequence.
2796 * iii) Once the designator sequence is known, a shifting sequence has to be found
2797 * to invoke the shifting
2798 * iv) All lines start in ASCII and end in ASCII.
2799 * v) Four shifting sequences are employed for this purpose:
2800 *
2801 * Sequence ASCII Eq Charsets
2802 * ---------- ------- ---------
2803 * SI <SI> US-ASCII
2804 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2805 * SS2 <ESC>N CNS-11643-1992 Plane 2
2806 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2807 *
2808 * vi)
2809 * SOdesignator : ESC "$" ")" finalchar_for_SO
2810 * SS2designator : ESC "$" "*" finalchar_for_SS2
2811 * SS3designator : ESC "$" "+" finalchar_for_SS3
2812 *
2813 * ESC $ ) A Indicates the bytes following SO are Chinese
2814 * characters as defined in GB 2312-80, until
2815 * another SOdesignation appears
2816 *
2817 *
2818 * ESC $ ) E Indicates the bytes following SO are as defined
2819 * in ISO-IR-165 (for details, see section 2.1),
2820 * until another SOdesignation appears
2821 *
2822 * ESC $ ) G Indicates the bytes following SO are as defined
2823 * in CNS 11643-plane-1, until another
2824 * SOdesignation appears
2825 *
2826 * ESC $ * H Indicates the two bytes immediately following
2827 * SS2 is a Chinese character as defined in CNS
2828 * 11643-plane-2, until another SS2designation
2829 * appears
2830 * (Meaning <ESC>N must precede every 2 byte
2831 * sequence.)
2832 *
2833 * ESC $ + I Indicates the immediate two bytes following SS3
2834 * is a Chinese character as defined in CNS
2835 * 11643-plane-3, until another SS3designation
2836 * appears
2837 * (Meaning <ESC>O must precede every 2 byte
2838 * sequence.)
2839 *
2840 * ESC $ + J Indicates the immediate two bytes following SS3
2841 * is a Chinese character as defined in CNS
2842 * 11643-plane-4, until another SS3designation
2843 * appears
2844 * (In English: <ESC>O must precede every 2 byte
2845 * sequence.)
2846 *
2847 * ESC $ + K Indicates the immediate two bytes following SS3
2848 * is a Chinese character as defined in CNS
2849 * 11643-plane-5, until another SS3designation
2850 * appears
2851 *
2852 * ESC $ + L Indicates the immediate two bytes following SS3
2853 * is a Chinese character as defined in CNS
2854 * 11643-plane-6, until another SS3designation
2855 * appears
2856 *
2857 * ESC $ + M Indicates the immediate two bytes following SS3
2858 * is a Chinese character as defined in CNS
2859 * 11643-plane-7, until another SS3designation
2860 * appears
2861 *
2862 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2863 * has its own designation information before any Chinese characters
2864 * appear
2865 *
2866 */
2867
/*
 * ISO-2022-CN designator escape sequences, one per designatable charset.
 * Each is four bytes: ESC '$', then ')' (SO designator), '*' (SS2 designator)
 * or '+' (SS3 designator), then the final byte that selects the charset.
 *
 * The following are defined this way (const arrays rather than pointers to
 * literals) to make the strings truly readonly.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2878
2879 /********************** ISO2022-CN Data **************************/
2880 static const char* const escSeqCharsCN[10] ={
2881 SHIFT_IN_STR, /* 0 ASCII */
2882 GB_2312_80_STR, /* 1 GB2312_1 */
2883 ISO_IR_165_STR, /* 2 ISO_IR_165 */
2884 CNS_11643_1992_Plane_1_STR,
2885 CNS_11643_1992_Plane_2_STR,
2886 CNS_11643_1992_Plane_3_STR,
2887 CNS_11643_1992_Plane_4_STR,
2888 CNS_11643_1992_Plane_5_STR,
2889 CNS_11643_1992_Plane_6_STR,
2890 CNS_11643_1992_Plane_7_STR
2891 };
2892
2893 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2894 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2895 UConverter *cnv = args->converter;
2896 UConverterDataISO2022 *converterData;
2897 ISO2022State *pFromU2022State;
2898 uint8_t *target = (uint8_t *) args->target;
2899 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2900 const UChar* source = args->source;
2901 const UChar* sourceLimit = args->sourceLimit;
2902 int32_t* offsets = args->offsets;
2903 UChar32 sourceChar;
2904 char buffer[8];
2905 int32_t len;
2906 int8_t choices[3];
2907 int32_t choiceCount;
2908 uint32_t targetValue = 0;
2909 UBool useFallback;
2910
2911 /* set up the state */
2912 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2913 pFromU2022State = &converterData->fromU2022State;
2914
2915 choiceCount = 0;
2916
2917 /* check if the last codepoint of previous buffer was a lead surrogate*/
2918 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2919 goto getTrail;
2920 }
2921
2922 while( source < sourceLimit){
2923 if(target < targetLimit){
2924
2925 sourceChar = *(source++);
2926 /*check if the char is a First surrogate*/
2927 if(U16_IS_SURROGATE(sourceChar)) {
2928 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2929 getTrail:
2930 /*look ahead to find the trail surrogate*/
2931 if(source < sourceLimit) {
2932 /* test the following code unit */
2933 UChar trail=(UChar) *source;
2934 if(U16_IS_TRAIL(trail)) {
2935 source++;
2936 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2937 cnv->fromUChar32=0x00;
2938 /* convert this supplementary code point */
2939 /* exit this condition tree */
2940 } else {
2941 /* this is an unmatched lead code unit (1st surrogate) */
2942 /* callback(illegal) */
2943 *err=U_ILLEGAL_CHAR_FOUND;
2944 cnv->fromUChar32=sourceChar;
2945 break;
2946 }
2947 } else {
2948 /* no more input */
2949 cnv->fromUChar32=sourceChar;
2950 break;
2951 }
2952 } else {
2953 /* this is an unmatched trail code unit (2nd surrogate) */
2954 /* callback(illegal) */
2955 *err=U_ILLEGAL_CHAR_FOUND;
2956 cnv->fromUChar32=sourceChar;
2957 break;
2958 }
2959 }
2960
2961 /* do the conversion */
2962 if(sourceChar <= 0x007f ){
2963 /* do not convert SO/SI/ESC */
2964 if(IS_2022_CONTROL(sourceChar)) {
2965 /* callback(illegal) */
2966 *err=U_ILLEGAL_CHAR_FOUND;
2967 cnv->fromUChar32=sourceChar;
2968 break;
2969 }
2970
2971 /* US-ASCII */
2972 if(pFromU2022State->g == 0) {
2973 buffer[0] = (char)sourceChar;
2974 len = 1;
2975 } else {
2976 buffer[0] = UCNV_SI;
2977 buffer[1] = (char)sourceChar;
2978 len = 2;
2979 pFromU2022State->g = 0;
2980 choiceCount = 0;
2981 }
2982 if(sourceChar == CR || sourceChar == LF) {
2983 /* reset the state at the end of a line */
2984 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2985 choiceCount = 0;
2986 }
2987 }
2988 else{
2989 /* convert U+0080..U+10ffff */
2990 int32_t i;
2991 int8_t cs, g;
2992
2993 if(choiceCount == 0) {
2994 /* try the current SO/G1 converter first */
2995 choices[0] = pFromU2022State->cs[1];
2996
2997 /* default to GB2312_1 if none is designated yet */
2998 if(choices[0] == 0) {
2999 choices[0] = GB2312_1;
3000 }
3001
3002 if(converterData->version == 0) {
3003 /* ISO-2022-CN */
3004
3005 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3006 if(choices[0] == GB2312_1) {
3007 choices[1] = (int8_t)CNS_11643_1;
3008 } else {
3009 choices[1] = (int8_t)GB2312_1;
3010 }
3011
3012 choiceCount = 2;
3013 } else if (converterData->version == 1) {
3014 /* ISO-2022-CN-EXT */
3015
3016 /* try one of the other converters */
3017 switch(choices[0]) {
3018 case GB2312_1:
3019 choices[1] = (int8_t)CNS_11643_1;
3020 choices[2] = (int8_t)ISO_IR_165;
3021 break;
3022 case ISO_IR_165:
3023 choices[1] = (int8_t)GB2312_1;
3024 choices[2] = (int8_t)CNS_11643_1;
3025 break;
3026 default: /* CNS_11643_x */
3027 choices[1] = (int8_t)GB2312_1;
3028 choices[2] = (int8_t)ISO_IR_165;
3029 break;
3030 }
3031
3032 choiceCount = 3;
3033 } else {
3034 choices[0] = (int8_t)CNS_11643_1;
3035 choices[1] = (int8_t)GB2312_1;
3036 }
3037 }
3038
3039 cs = g = 0;
3040 /*
3041 * len==0: no mapping found yet
3042 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3043 * len>0: found a roundtrip result, done
3044 */
3045 len = 0;
3046 /*
3047 * We will turn off useFallback after finding a fallback,
3048 * but we still get fallbacks from PUA code points as usual.
3049 * Therefore, we will also need to check that we don't overwrite
3050 * an early fallback with a later one.
3051 */
3052 useFallback = cnv->useFallback;
3053
3054 for(i = 0; i < choiceCount && len <= 0; ++i) {
3055 int8_t cs0 = choices[i];
3056 if(cs0 > 0) {
3057 uint32_t value;
3058 int32_t len2;
3059 if(cs0 >= CNS_11643_0) {
3060 len2 = MBCS_FROM_UCHAR32_ISO2022(
3061 converterData->myConverterArray[CNS_11643],
3062 sourceChar,
3063 &value,
3064 useFallback,
3065 MBCS_OUTPUT_3);
3066 if(len2 == 3 || (len2 == -3 && len == 0)) {
3067 targetValue = value;
3068 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3069 if(len2 >= 0) {
3070 len = 2;
3071 } else {
3072 len = -2;
3073 useFallback = FALSE;
3074 }
3075 if(cs == CNS_11643_1) {
3076 g = 1;
3077 } else if(cs == CNS_11643_2) {
3078 g = 2;
3079 } else /* plane 3..7 */ if(converterData->version == 1) {
3080 g = 3;
3081 } else {
3082 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3083 len = 0;
3084 }
3085 }
3086 } else {
3087 /* GB2312_1 or ISO-IR-165 */
3088 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3089 len2 = MBCS_FROM_UCHAR32_ISO2022(
3090 converterData->myConverterArray[cs0],
3091 sourceChar,
3092 &value,
3093 useFallback,
3094 MBCS_OUTPUT_2);
3095 if(len2 == 2 || (len2 == -2 && len == 0)) {
3096 targetValue = value;
3097 len = len2;
3098 cs = cs0;
3099 g = 1;
3100 useFallback = FALSE;
3101 }
3102 }
3103 }
3104 }
3105
3106 if(len != 0) {
3107 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3108
3109 /* write the designation sequence if necessary */
3110 if(cs != pFromU2022State->cs[g]) {
3111 if(cs < CNS_11643) {
3112 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3113 } else {
3114 U_ASSERT(cs >= CNS_11643_1);
3115 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3116 }
3117 len = 4;
3118 pFromU2022State->cs[g] = cs;
3119 if(g == 1) {
3120 /* changing the SO/G1 charset invalidates the choices[] */
3121 choiceCount = 0;
3122 }
3123 }
3124
3125 /* write the shift sequence if necessary */
3126 if(g != pFromU2022State->g) {
3127 switch(g) {
3128 case 1:
3129 buffer[len++] = UCNV_SO;
3130
3131 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3132 pFromU2022State->g = 1;
3133 break;
3134 case 2:
3135 buffer[len++] = 0x1b;
3136 buffer[len++] = 0x4e;
3137 break;
3138 default: /* case 3 */
3139 buffer[len++] = 0x1b;
3140 buffer[len++] = 0x4f;
3141 break;
3142 }
3143 }
3144
3145 /* write the two output bytes */
3146 buffer[len++] = (char)(targetValue >> 8);
3147 buffer[len++] = (char)targetValue;
3148 } else {
3149 /* if we cannot find the character after checking all codepages
3150 * then this is an error
3151 */
3152 *err = U_INVALID_CHAR_FOUND;
3153 cnv->fromUChar32=sourceChar;
3154 break;
3155 }
3156 }
3157
3158 /* output len>0 bytes in buffer[] */
3159 if(len == 1) {
3160 *target++ = buffer[0];
3161 if(offsets) {
3162 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3163 }
3164 } else if(len == 2 && (target + 2) <= targetLimit) {
3165 *target++ = buffer[0];
3166 *target++ = buffer[1];
3167 if(offsets) {
3168 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3169 *offsets++ = sourceIndex;
3170 *offsets++ = sourceIndex;
3171 }
3172 } else {
3173 fromUWriteUInt8(
3174 cnv,
3175 buffer, len,
3176 &target, (const char *)targetLimit,
3177 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3178 err);
3179 if(U_FAILURE(*err)) {
3180 break;
3181 }
3182 }
3183 } /* end if(myTargetIndex<myTargetLength) */
3184 else{
3185 *err =U_BUFFER_OVERFLOW_ERROR;
3186 break;
3187 }
3188
3189 }/* end while(mySourceIndex<mySourceLength) */
3190
3191 /*
3192 * the end of the input stream and detection of truncated input
3193 * are handled by the framework, but for ISO-2022-CN conversion
3194 * we need to be in ASCII mode at the very end
3195 *
3196 * conditions:
3197 * successful
3198 * not in ASCII mode
3199 * end of input and no truncated input
3200 */
3201 if( U_SUCCESS(*err) &&
3202 pFromU2022State->g!=0 &&
3203 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3204 ) {
3205 int32_t sourceIndex;
3206
3207 /* we are switching to ASCII */
3208 pFromU2022State->g=0;
3209
3210 /* get the source index of the last input character */
3211 /*
3212 * TODO this would be simpler and more reliable if we used a pair
3213 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3214 * so that we could simply use the prevSourceIndex here;
3215 * this code gives an incorrect result for the rare case of an unmatched
3216 * trail surrogate that is alone in the last buffer of the text stream
3217 */
3218 sourceIndex=(int32_t)(source-args->source);
3219 if(sourceIndex>0) {
3220 --sourceIndex;
3221 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3222 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3223 ) {
3224 --sourceIndex;
3225 }
3226 } else {
3227 sourceIndex=-1;
3228 }
3229
3230 fromUWriteUInt8(
3231 cnv,
3232 SHIFT_IN_STR, 1,
3233 &target, (const char *)targetLimit,
3234 &offsets, sourceIndex,
3235 err);
3236 }
3237
3238 /*save the state and return */
3239 args->source = source;
3240 args->target = (char*)target;
3241 }
3242
3243
3244 static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)3245 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3246 UErrorCode* err){
3247 char tempBuf[3];
3248 const char *mySource = (char *) args->source;
3249 UChar *myTarget = args->target;
3250 const char *mySourceLimit = args->sourceLimit;
3251 uint32_t targetUniChar = 0x0000;
3252 uint32_t mySourceChar = 0x0000;
3253 UConverterDataISO2022* myData;
3254 ISO2022State *pToU2022State;
3255
3256 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3257 pToU2022State = &myData->toU2022State;
3258
3259 if(myData->key != 0) {
3260 /* continue with a partial escape sequence */
3261 goto escape;
3262 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3263 /* continue with a partial double-byte character */
3264 mySourceChar = args->converter->toUBytes[0];
3265 args->converter->toULength = 0;
3266 targetUniChar = missingCharMarker;
3267 goto getTrailByte;
3268 }
3269
3270 while(mySource < mySourceLimit){
3271
3272 targetUniChar =missingCharMarker;
3273
3274 if(myTarget < args->targetLimit){
3275
3276 mySourceChar= (unsigned char) *mySource++;
3277
3278 switch(mySourceChar){
3279 case UCNV_SI:
3280 pToU2022State->g=0;
3281 if (myData->isEmptySegment) {
3282 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3283 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3284 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3285 args->converter->toUBytes[0] = mySourceChar;
3286 args->converter->toULength = 1;
3287 args->target = myTarget;
3288 args->source = mySource;
3289 return;
3290 }
3291 continue;
3292
3293 case UCNV_SO:
3294 if(pToU2022State->cs[1] != 0) {
3295 pToU2022State->g=1;
3296 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3297 continue;
3298 } else {
3299 /* illegal to have SO before a matching designator */
3300 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3301 break;
3302 }
3303
3304 case ESC_2022:
3305 mySource--;
3306 escape:
3307 {
3308 const char * mySourceBefore = mySource;
3309 int8_t toULengthBefore = args->converter->toULength;
3310
3311 changeState_2022(args->converter,&(mySource),
3312 mySourceLimit, ISO_2022_CN,err);
3313
3314 /* After SO there must be at least one character before a designator (designator error handled separately) */
3315 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3316 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3317 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3318 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3319 }
3320 }
3321
3322 /* invalid or illegal escape sequence */
3323 if(U_FAILURE(*err)){
3324 args->target = myTarget;
3325 args->source = mySource;
3326 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3327 return;
3328 }
3329 continue;
3330
3331 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3332
3333 case CR:
3334 /*falls through*/
3335 case LF:
3336 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3337 /* falls through */
3338 default:
3339 /* convert one or two bytes */
3340 myData->isEmptySegment = FALSE;
3341 if(pToU2022State->g != 0) {
3342 if(mySource < mySourceLimit) {
3343 UConverterSharedData *cnv;
3344 StateEnum tempState;
3345 int32_t tempBufLen;
3346 int leadIsOk, trailIsOk;
3347 uint8_t trailByte;
3348 getTrailByte:
3349 trailByte = (uint8_t)*mySource;
3350 /*
3351 * Ticket 5691: consistent illegal sequences:
3352 * - We include at least the first byte in the illegal sequence.
3353 * - If any of the non-initial bytes could be the start of a character,
3354 * we stop the illegal sequence before the first one of those.
3355 *
3356 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3357 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3358 * Otherwise we convert or report the pair of bytes.
3359 */
3360 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3361 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3362 if (leadIsOk && trailIsOk) {
3363 ++mySource;
3364 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3365 if(tempState >= CNS_11643_0) {
3366 cnv = myData->myConverterArray[CNS_11643];
3367 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3368 tempBuf[1] = (char) (mySourceChar);
3369 tempBuf[2] = (char) trailByte;
3370 tempBufLen = 3;
3371
3372 }else{
3373 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3374 cnv = myData->myConverterArray[tempState];
3375 tempBuf[0] = (char) (mySourceChar);
3376 tempBuf[1] = (char) trailByte;
3377 tempBufLen = 2;
3378 }
3379 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3380 mySourceChar = (mySourceChar << 8) | trailByte;
3381 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3382 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3383 ++mySource;
3384 /* add another bit so that the code below writes 2 bytes in case of error */
3385 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3386 }
3387 if(pToU2022State->g>=2) {
3388 /* return from a single-shift state to the previous one */
3389 pToU2022State->g=pToU2022State->prevG;
3390 }
3391 } else {
3392 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3393 args->converter->toULength = 1;
3394 goto endloop;
3395 }
3396 }
3397 else{
3398 if(mySourceChar <= 0x7f) {
3399 targetUniChar = (UChar) mySourceChar;
3400 }
3401 }
3402 break;
3403 }
3404 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3405 if(args->offsets){
3406 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3407 }
3408 *(myTarget++)=(UChar)targetUniChar;
3409 }
3410 else if(targetUniChar > missingCharMarker){
3411 /* disassemble the surrogate pair and write to output*/
3412 targetUniChar-=0x0010000;
3413 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3414 if(args->offsets){
3415 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3416 }
3417 ++myTarget;
3418 if(myTarget< args->targetLimit){
3419 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3420 if(args->offsets){
3421 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3422 }
3423 ++myTarget;
3424 }else{
3425 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3426 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3427 }
3428
3429 }
3430 else{
3431 /* Call the callback function*/
3432 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3433 break;
3434 }
3435 }
3436 else{
3437 *err =U_BUFFER_OVERFLOW_ERROR;
3438 break;
3439 }
3440 }
3441 endloop:
3442 args->target = myTarget;
3443 args->source = mySource;
3444 }
3445
3446 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3447 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3448 UConverter *cnv = args->converter;
3449 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3450 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3451 char *p, *subchar;
3452 char buffer[8];
3453 int32_t length;
3454
3455 subchar=(char *)cnv->subChars;
3456 length=cnv->subCharLen; /* assume length==1 for most variants */
3457
3458 p = buffer;
3459 switch(myConverterData->locale[0]){
3460 case 'j':
3461 {
3462 int8_t cs;
3463
3464 if(pFromU2022State->g == 1) {
3465 /* JIS7: switch from G1 to G0 */
3466 pFromU2022State->g = 0;
3467 *p++ = UCNV_SI;
3468 }
3469
3470 cs = pFromU2022State->cs[0];
3471 if(cs != ASCII && cs != JISX201) {
3472 /* not in ASCII or JIS X 0201: switch to ASCII */
3473 pFromU2022State->cs[0] = (int8_t)ASCII;
3474 *p++ = '\x1b';
3475 *p++ = '\x28';
3476 *p++ = '\x42';
3477 }
3478
3479 *p++ = subchar[0];
3480 break;
3481 }
3482 case 'c':
3483 if(pFromU2022State->g != 0) {
3484 /* not in ASCII mode: switch to ASCII */
3485 pFromU2022State->g = 0;
3486 *p++ = UCNV_SI;
3487 }
3488 *p++ = subchar[0];
3489 break;
3490 case 'k':
3491 if(myConverterData->version == 0) {
3492 if(length == 1) {
3493 if((UBool)args->converter->fromUnicodeStatus) {
3494 /* in DBCS mode: switch to SBCS */
3495 args->converter->fromUnicodeStatus = 0;
3496 *p++ = UCNV_SI;
3497 }
3498 *p++ = subchar[0];
3499 } else /* length == 2*/ {
3500 if(!(UBool)args->converter->fromUnicodeStatus) {
3501 /* in SBCS mode: switch to DBCS */
3502 args->converter->fromUnicodeStatus = 1;
3503 *p++ = UCNV_SO;
3504 }
3505 *p++ = subchar[0];
3506 *p++ = subchar[1];
3507 }
3508 break;
3509 } else {
3510 /* save the subconverter's substitution string */
3511 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3512 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3513
3514 /* set our substitution string into the subconverter */
3515 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3516 myConverterData->currentConverter->subCharLen = (int8_t)length;
3517
3518 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3519 args->converter = myConverterData->currentConverter;
3520 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3521 ucnv_cbFromUWriteSub(args, 0, err);
3522 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3523 args->converter = cnv;
3524
3525 /* restore the subconverter's substitution string */
3526 myConverterData->currentConverter->subChars = currentSubChars;
3527 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3528
3529 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3530 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3531 uprv_memcpy(
3532 cnv->charErrorBuffer,
3533 myConverterData->currentConverter->charErrorBuffer,
3534 myConverterData->currentConverter->charErrorBufferLength);
3535 }
3536 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3537 myConverterData->currentConverter->charErrorBufferLength = 0;
3538 }
3539 return;
3540 }
3541 default:
3542 /* not expected */
3543 break;
3544 }
3545 ucnv_cbFromUWriteBytes(args,
3546 buffer, (int32_t)(p - buffer),
3547 offsetIndex, err);
3548 }
3549
3550 /*
3551 * Structure for cloning an ISO 2022 converter into a single memory block.
3552 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3553 * and then ucnv_safeClone() of the sub-converter may additionally align
3554 * currentConverter inside the cloneStruct, for which we need the deadSpace
3555 * after currentConverter.
3556 * This is because UAlignedMemory may be larger than the actually
3557 * necessary alignment size for the platform.
3558 * The other cloneStruct fields will not be moved around,
3559 * and are aligned properly with cloneStruct's alignment.
3560 */
3561 struct cloneStruct
3562 {
3563 UConverter cnv;
3564 UConverter currentConverter;
3565 UAlignedMemory deadSpace;
3566 UConverterDataISO2022 mydata;
3567 };
3568
3569
3570 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3571 _ISO_2022_SafeClone(
3572 const UConverter *cnv,
3573 void *stackBuffer,
3574 int32_t *pBufferSize,
3575 UErrorCode *status)
3576 {
3577 struct cloneStruct * localClone;
3578 UConverterDataISO2022 *cnvData;
3579 int32_t i, size;
3580
3581 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3582 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3583 return NULL;
3584 }
3585
3586 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3587 localClone = (struct cloneStruct *)stackBuffer;
3588
3589 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3590
3591 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3592 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3593 localClone->cnv.isExtraLocal = TRUE;
3594
3595 /* share the subconverters */
3596
3597 if(cnvData->currentConverter != NULL) {
3598 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3599 localClone->mydata.currentConverter =
3600 ucnv_safeClone(cnvData->currentConverter,
3601 &localClone->currentConverter,
3602 &size, status);
3603 if(U_FAILURE(*status)) {
3604 return NULL;
3605 }
3606 }
3607
3608 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3609 if(cnvData->myConverterArray[i] != NULL) {
3610 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3611 }
3612 }
3613
3614 return &localClone->cnv;
3615 }
3616
3617 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3618 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3619 const USetAdder *sa,
3620 UConverterUnicodeSet which,
3621 UErrorCode *pErrorCode)
3622 {
3623 int32_t i;
3624 UConverterDataISO2022* cnvData;
3625
3626 if (U_FAILURE(*pErrorCode)) {
3627 return;
3628 }
3629 #ifdef U_ENABLE_GENERIC_ISO_2022
3630 if (cnv->sharedData == &_ISO2022Data) {
3631 /* We use UTF-8 in this case */
3632 sa->addRange(sa->set, 0, 0xd7FF);
3633 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3634 return;
3635 }
3636 #endif
3637
3638 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3639
3640 /* open a set and initialize it with code points that are algorithmically round-tripped */
3641 switch(cnvData->locale[0]){
3642 case 'j':
3643 /* include JIS X 0201 which is hardcoded */
3644 sa->add(sa->set, 0xa5);
3645 sa->add(sa->set, 0x203e);
3646 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3647 /* include Latin-1 for some variants of JP */
3648 sa->addRange(sa->set, 0, 0xff);
3649 } else {
3650 /* include ASCII for JP */
3651 sa->addRange(sa->set, 0, 0x7f);
3652 }
3653 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3654 /*
3655 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3656 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3657 * use half-width Katakana.
3658 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3659 * half-width Katakana via the ESC ( I sequence.
3660 * However, we only emit (fromUnicode) half-width Katakana according to the
3661 * definition of each variant.
3662 *
3663 * When including fallbacks,
3664 * we need to include half-width Katakana Unicode code points for all JP variants because
3665 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3666 */
3667 /* include half-width Katakana for JP */
3668 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3669 }
3670 break;
3671 case 'c':
3672 case 'z':
3673 /* include ASCII for CN */
3674 sa->addRange(sa->set, 0, 0x7f);
3675 break;
3676 case 'k':
3677 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3678 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3679 cnvData->currentConverter, sa, which, pErrorCode);
3680 /* the loop over myConverterArray[] will simply not find another converter */
3681 break;
3682 default:
3683 break;
3684 }
3685
3686 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3687 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3688 cnvData->version==0 && i==CNS_11643
3689 ) {
3690 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3691 ucnv_MBCSGetUnicodeSetForBytes(
3692 cnvData->myConverterArray[i],
3693 sa, UCNV_ROUNDTRIP_SET,
3694 0, 0x81, 0x82,
3695 pErrorCode);
3696 }
3697 #endif
3698
3699 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3700 UConverterSetFilter filter;
3701 if(cnvData->myConverterArray[i]!=NULL) {
3702 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3703 cnvData->version==0 && i==CNS_11643
3704 ) {
3705 /*
3706 * Version-specific for CN:
3707 * CN version 0 does not map CNS planes 3..7 although
3708 * they are all available in the CNS conversion table;
3709 * CN version 1 (-EXT) does map them all.
3710 * The two versions create different Unicode sets.
3711 */
3712 filter=UCNV_SET_FILTER_2022_CN;
3713 } else if(cnvData->locale[0]=='j' && i==JISX208) {
3714 /*
3715 * Only add code points that map to Shift-JIS codes
3716 * corresponding to JIS X 0208.
3717 */
3718 filter=UCNV_SET_FILTER_SJIS;
3719 } else if(i==KSC5601) {
3720 /*
3721 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3722 * are broader than GR94.
3723 */
3724 filter=UCNV_SET_FILTER_GR94DBCS;
3725 } else {
3726 filter=UCNV_SET_FILTER_NONE;
3727 }
3728 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3729 }
3730 }
3731
3732 /*
3733 * ISO 2022 converters must not convert SO/SI/ESC despite what
3734 * sub-converters do by themselves.
3735 * Remove these characters from the set.
3736 */
3737 sa->remove(sa->set, 0x0e);
3738 sa->remove(sa->set, 0x0f);
3739 sa->remove(sa->set, 0x1b);
3740
3741 /* ISO 2022 converters do not convert C1 controls either */
3742 sa->removeRange(sa->set, 0x80, 0x9f);
3743 }
3744
3745 static const UConverterImpl _ISO2022Impl={
3746 UCNV_ISO_2022,
3747
3748 NULL,
3749 NULL,
3750
3751 _ISO2022Open,
3752 _ISO2022Close,
3753 _ISO2022Reset,
3754
3755 #ifdef U_ENABLE_GENERIC_ISO_2022
3756 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3757 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3758 ucnv_fromUnicode_UTF8,
3759 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3760 #else
3761 NULL,
3762 NULL,
3763 NULL,
3764 NULL,
3765 #endif
3766 NULL,
3767
3768 NULL,
3769 _ISO2022getName,
3770 _ISO_2022_WriteSub,
3771 _ISO_2022_SafeClone,
3772 _ISO_2022_GetUnicodeSet,
3773
3774 NULL,
3775 NULL
3776 };
3777 static const UConverterStaticData _ISO2022StaticData={
3778 sizeof(UConverterStaticData),
3779 "ISO_2022",
3780 2022,
3781 UCNV_IBM,
3782 UCNV_ISO_2022,
3783 1,
3784 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3785 { 0x1a, 0, 0, 0 },
3786 1,
3787 FALSE,
3788 FALSE,
3789 0,
3790 0,
3791 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3792 };
3793 const UConverterSharedData _ISO2022Data={
3794 sizeof(UConverterSharedData),
3795 ~((uint32_t) 0),
3796 NULL,
3797 NULL,
3798 &_ISO2022StaticData,
3799 FALSE,
3800 &_ISO2022Impl,
3801 0, UCNV_MBCS_TABLE_INITIALIZER
3802 };
3803
3804 /*************JP****************/
3805 static const UConverterImpl _ISO2022JPImpl={
3806 UCNV_ISO_2022,
3807
3808 NULL,
3809 NULL,
3810
3811 _ISO2022Open,
3812 _ISO2022Close,
3813 _ISO2022Reset,
3814
3815 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3816 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3817 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3818 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3819 NULL,
3820
3821 NULL,
3822 _ISO2022getName,
3823 _ISO_2022_WriteSub,
3824 _ISO_2022_SafeClone,
3825 _ISO_2022_GetUnicodeSet,
3826
3827 NULL,
3828 NULL
3829 };
3830 static const UConverterStaticData _ISO2022JPStaticData={
3831 sizeof(UConverterStaticData),
3832 "ISO_2022_JP",
3833 0,
3834 UCNV_IBM,
3835 UCNV_ISO_2022,
3836 1,
3837 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3838 { 0x1a, 0, 0, 0 },
3839 1,
3840 FALSE,
3841 FALSE,
3842 0,
3843 0,
3844 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3845 };
3846
3847 namespace {
3848
3849 const UConverterSharedData _ISO2022JPData={
3850 sizeof(UConverterSharedData),
3851 ~((uint32_t) 0),
3852 NULL,
3853 NULL,
3854 &_ISO2022JPStaticData,
3855 FALSE,
3856 &_ISO2022JPImpl,
3857 0, UCNV_MBCS_TABLE_INITIALIZER
3858 };
3859
3860 } // namespace
3861
3862 /************* KR ***************/
3863 static const UConverterImpl _ISO2022KRImpl={
3864 UCNV_ISO_2022,
3865
3866 NULL,
3867 NULL,
3868
3869 _ISO2022Open,
3870 _ISO2022Close,
3871 _ISO2022Reset,
3872
3873 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3874 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3875 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3876 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3877 NULL,
3878
3879 NULL,
3880 _ISO2022getName,
3881 _ISO_2022_WriteSub,
3882 _ISO_2022_SafeClone,
3883 _ISO_2022_GetUnicodeSet,
3884
3885 NULL,
3886 NULL
3887 };
3888 static const UConverterStaticData _ISO2022KRStaticData={
3889 sizeof(UConverterStaticData),
3890 "ISO_2022_KR",
3891 0,
3892 UCNV_IBM,
3893 UCNV_ISO_2022,
3894 1,
3895 3, /* max 3 bytes per UChar: SO+DBCS */
3896 { 0x1a, 0, 0, 0 },
3897 1,
3898 FALSE,
3899 FALSE,
3900 0,
3901 0,
3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3903 };
3904
3905 namespace {
3906
3907 const UConverterSharedData _ISO2022KRData={
3908 sizeof(UConverterSharedData),
3909 ~((uint32_t) 0),
3910 NULL,
3911 NULL,
3912 &_ISO2022KRStaticData,
3913 FALSE,
3914 &_ISO2022KRImpl,
3915 0, UCNV_MBCS_TABLE_INITIALIZER
3916 };
3917
3918 } // namespace
3919
3920 /*************** CN ***************/
3921 static const UConverterImpl _ISO2022CNImpl={
3922
3923 UCNV_ISO_2022,
3924
3925 NULL,
3926 NULL,
3927
3928 _ISO2022Open,
3929 _ISO2022Close,
3930 _ISO2022Reset,
3931
3932 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3933 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3934 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936 NULL,
3937
3938 NULL,
3939 _ISO2022getName,
3940 _ISO_2022_WriteSub,
3941 _ISO_2022_SafeClone,
3942 _ISO_2022_GetUnicodeSet,
3943
3944 NULL,
3945 NULL
3946 };
3947 static const UConverterStaticData _ISO2022CNStaticData={
3948 sizeof(UConverterStaticData),
3949 "ISO_2022_CN",
3950 0,
3951 UCNV_IBM,
3952 UCNV_ISO_2022,
3953 1,
3954 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3955 { 0x1a, 0, 0, 0 },
3956 1,
3957 FALSE,
3958 FALSE,
3959 0,
3960 0,
3961 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3962 };
3963
3964 namespace {
3965
3966 const UConverterSharedData _ISO2022CNData={
3967 sizeof(UConverterSharedData),
3968 ~((uint32_t) 0),
3969 NULL,
3970 NULL,
3971 &_ISO2022CNStaticData,
3972 FALSE,
3973 &_ISO2022CNImpl,
3974 0, UCNV_MBCS_TABLE_INITIALIZER
3975 };
3976
3977 } // namespace
3978
3979 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3980