1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
13 *
14 * Change history:
15 *
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
19 * function
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
22 * ucnvebdc.c
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
38 #include "ucnv_imp.h"
39 #include "ucnv_bld.h"
40 #include "ucnv_cnv.h"
41 #include "ucnvmbcs.h"
42 #include "cstring.h"
43 #include "cmemory.h"
44 #include "uassert.h"
45
46 #ifdef U_ENABLE_GENERIC_ISO_2022
47 /*
48 * I am disabling the generic ISO-2022 converter after proposing to do so on
49 * the icu mailing list two days ago.
50 *
51 * Reasons:
52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
53 * its designation sequences, single shifts with return to the previous state,
54 * switch-with-no-return to UTF-16BE or similar, etc.
55 * This is unlike the language-specific variants like ISO-2022-JP which
56 * require a much smaller repertoire of ISO-2022 features.
57 * These variants continue to be supported.
58 * 2. I believe that no one is really using the generic ISO-2022 converter
59 * but rather always one of the language-specific variants.
60 * Note that ICU's generic ISO-2022 converter has always output one escape
61 * sequence followed by UTF-8 for the whole stream.
62 * 3. Switching between subcharsets is extremely slow, because each time
63 * the previous converter is closed and a new one opened,
64 * without any kind of caching, least-recently-used list, etc.
65 * 4. The code is currently buggy, and given the above it does not seem
66 * reasonable to spend the time on maintenance.
67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
68 * This means, for example, that when ISO-8859-7 is designated, the following
69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
70 * The ICU ISO-2022 converter does not handle this - and has no information
71 * about which subconverter would have to be shifted vs. which is designed
72 * for 7-bit ISO-2022.
73 *
74 * Markus Scherer 2003-dec-03
75 */
76 #endif
77
78 #if !UCONFIG_ONLY_HTML_CONVERSION
79 static const char SHIFT_IN_STR[] = "\x0F";
80 // static const char SHIFT_OUT_STR[] = "\x0E";
81 #endif
82
83 #define CR 0x0D
84 #define LF 0x0A
85 #define H_TAB 0x09
86 #define V_TAB 0x0B
87 #define SPACE 0x20
88
89 enum {
90 HWKANA_START=0xff61,
91 HWKANA_END=0xff9f
92 };
93
94 /*
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
101 */
102 enum {
103 GR94_START=0xa1,
104 GR94_END=0xfe,
105 GR96_START=0xa0,
106 GR96_END=0xff
107 };
108
109 /*
110 * ISO 2022 control codes must not be converted from Unicode
111 * because they would mess up the byte stream.
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113 * corresponding to SO, SI, and ESC.
114 */
115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
117 /* for ISO-2022-JP and -CN implementations */
118 typedef enum {
119 /* shared values */
120 INVALID_STATE=-1,
121 ASCII = 0,
122
123 SS2_STATE=0x10,
124 SS3_STATE,
125
126 /* JP */
127 ISO8859_1 = 1 ,
128 ISO8859_7 = 2 ,
129 JISX201 = 3,
130 JISX208 = 4,
131 JISX212 = 5,
132 GB2312 =6,
133 KSC5601 =7,
134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
135
136 /* CN */
137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
138 GB2312_1=1,
139 ISO_IR_165=2,
140 CNS_11643=3,
141
142 /*
143 * these are used in StateEnum and ISO2022State variables,
144 * but CNS_11643 must be used to index into myConverterArray[]
145 */
146 CNS_11643_0=0x20,
147 CNS_11643_1,
148 CNS_11643_2,
149 CNS_11643_3,
150 CNS_11643_4,
151 CNS_11643_5,
152 CNS_11643_6,
153 CNS_11643_7
154 } StateEnum;
155
156 /* is the StateEnum charset value for a DBCS charset? */
157 #if UCONFIG_ONLY_HTML_CONVERSION
158 #define IS_JP_DBCS(cs) (JISX208==(cs))
159 #else
160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
161 #endif
162
163 #define CSM(cs) ((uint16_t)1<<(cs))
164
165 /*
166 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
167 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
168 *
169 * Note: The converter uses some leniency:
170 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
171 * all versions, not just JIS7 and JIS8.
172 * - ICU does not distinguish between different versions of JIS X 0208.
173 */
174 #if UCONFIG_ONLY_HTML_CONVERSION
175 enum { MAX_JA_VERSION=0 };
176 #else
177 enum { MAX_JA_VERSION=4 };
178 #endif
179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
180 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
181 #if !UCONFIG_ONLY_HTML_CONVERSION
182 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
183 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
185 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
186 #endif
187 };
188
189 typedef enum {
190 ASCII1=0,
191 LATIN1,
192 SBCS,
193 DBCS,
194 MBCS,
195 HWKANA
196 }Cnv2022Type;
197
198 typedef struct ISO2022State {
199 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
200 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
201 int8_t prevG; /* g before single shift (SS2 or SS3) */
202 } ISO2022State;
203
204 #define UCNV_OPTIONS_VERSION_MASK 0xf
205 #define UCNV_2022_MAX_CONVERTERS 10
206
/*
 * Per-converter ISO-2022 state. _ISO2022Open() allocates one instance, zeroes it
 * and stores it in UConverter.extraInfo; _ISO2022Close() releases it.
 */
207 typedef struct{
208 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; /* sub-converter data loaded in _ISO2022Open(); slots are addressed with StateEnum constants (e.g. JISX208, GB2312_1, CNS_11643); unused slots stay NULL */
209 UConverter *currentConverter; /* opened with ucnv_open() for the ko variant; closed in _ISO2022Close() */
210 Cnv2022Type currentType; /* set to ASCII1 when the converter is opened */
211 ISO2022State toU2022State, fromU2022State; /* per-direction shift/designation state; each is zeroed by _ISO2022Reset() for its direction */
212 uint32_t key; /* escape-sequence key carried between calls; read by changeState_2022() and cleared on a toUnicode reset */
213 uint32_t version; /* variant version: pArgs->options & UCNV_OPTIONS_VERSION_MASK */
214 #ifdef U_ENABLE_GENERIC_ISO_2022
215 UBool isFirstBuffer; /* generic converter only; set to TRUE on open and on a toUnicode reset */
216 #endif
217 UBool isEmptySegment; /* cleared on a toUnicode reset; NOTE(review): where it is set is outside this part of the file */
218 char name[30]; /* full converter name returned by _ISO2022getName(), e.g. "ISO_2022,locale=ja,version=0" */
219 char locale[3]; /* "ja", "ko" or "cn" as written by _ISO2022Open(); locale[0] selects the variant in _ISO2022Reset() */
220 }UConverterDataISO2022;
221
222 /* Protos */
223 /* ISO-2022 ----------------------------------------------------------------- */
224
225 /*Forward declaration */
226 U_CFUNC void
227 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
228 UErrorCode * err);
229 U_CFUNC void
230 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
231 UErrorCode * err);
232
233 #define ESC_2022 0x1B /*ESC*/
234
235 typedef enum
236 {
237 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
238 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
239 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
240 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
241 } UCNV_TableStates_2022;
242
243 /*
244 * The way these state transition arrays work is:
245 * ex : ESC$B is the sequence for JISX208
246 * a) First Iteration: char is ESC
247 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
248 * int x = normalize_esq_chars_2022[27] which is equal to 1
249 * ii) Search for this value in escSeqStateTable_Key_2022[]
250 * value of x is stored at escSeqStateTable_Key_2022[0]
251 * iii) Save this index as offset
252 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
253 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
254 * b) Switch on this state and continue to next char
255 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
256 * which is normalize_esq_chars_2022[36] == 4
257 * ii) x is currently 1(from above)
258 * x<<=5 -- x is now 32
259 * x+=normalize_esq_chars_2022[36]
260 * now x is 36
261 * iii) Search for this value in escSeqStateTable_Key_2022[]
262 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
263 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
264 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
265 * c) Switch on this state and continue to next char
266 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
267 * ii) x is currently 36 (from above)
268 * x<<=5 -- x is now 1152
269 * x+=normalize_esq_chars_2022[66]
270 * now x is 1161
271 * iii) Search for this value in escSeqStateTable_Key_2022[]
272 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
273 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
274 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
275 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
276 */
277
278
279 /*Below are the 3 arrays depicting a state transition table*/
280 static const int8_t normalize_esq_chars_2022[256] = {
281 /* 0 1 2 3 4 5 6 7 8 9 */
282
283 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
286 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
287 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
290 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
291 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
308 ,0 ,0 ,0 ,0 ,0 ,0
309 };
310
311 #ifdef U_ENABLE_GENERIC_ISO_2022
312 /*
313 * When the generic ISO-2022 converter is completely removed, not just disabled
314 * per #ifdef, then the following state table and the associated tables that are
315 * dimensioned with MAX_STATES_2022 should be trimmed.
316 *
317 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
318 * the associated escape sequences starting with ESC ( B should be removed.
319 * This includes the ones with key values 1097 and all of the ones above 1000000.
320 *
321 * For the latter, the tables can simply be truncated.
322 * For the former, since the tables must be kept parallel, it is probably best
323 * to simply duplicate an adjacent table cell, parallel in all tables.
324 *
325 * It may make sense to restructure the tables, especially by using small search
326 * tables for the variants instead of indexing them parallel to the table here.
327 */
328 #endif
329
330 #define MAX_STATES_2022 74
331 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
332 /* 0 1 2 3 4 5 6 7 8 9 */
333
334 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
335 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
336 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
337 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
338 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
339 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
340 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
341 ,35947631 ,35947635 ,35947636 ,35947638
342 };
343
344 #ifdef U_ENABLE_GENERIC_ISO_2022
345
346 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
347 /* 0 1 2 3 4 5 6 7 8 9 */
348
349 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
350 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
351 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
352 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
353 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
354 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
355 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
356 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
357 };
358
359 #endif
360
361 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
362 /* 0 1 2 3 4 5 6 7 8 9 */
363 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
364 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
365 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
366 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
367 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 };
372
373 /* Type def for refactoring changeState_2022 code*/
374 typedef enum{
375 #ifdef U_ENABLE_GENERIC_ISO_2022
376 ISO_2022=0,
377 #endif
378 ISO_2022_JP=1,
379 #if !UCONFIG_ONLY_HTML_CONVERSION
380 ISO_2022_KR=2,
381 ISO_2022_CN=3
382 #endif
383 } Variant2022;
384
385 /*********** ISO 2022 Converter Protos ***********/
386 static void
387 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
388
389 static void
390 _ISO2022Close(UConverter *converter);
391
392 static void
393 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
394
395 static const char*
396 _ISO2022getName(const UConverter* cnv);
397
398 static void
399 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
400
401 static UConverter *
402 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
403
404 #ifdef U_ENABLE_GENERIC_ISO_2022
405 static void
406 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
407 #endif
408
409 namespace {
410
411 /*const UConverterSharedData _ISO2022Data;*/
412 extern const UConverterSharedData _ISO2022JPData;
413
414 #if !UCONFIG_ONLY_HTML_CONVERSION
415 extern const UConverterSharedData _ISO2022KRData;
416 extern const UConverterSharedData _ISO2022CNData;
417 #endif
418
419 } // namespace
420
421 /*************** Converter implementations ******************/
422
423 /* The purpose of this function is to get around gcc compiler warnings. */
424 static inline void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)425 fromUWriteUInt8(UConverter *cnv,
426 const char *bytes, int32_t length,
427 uint8_t **target, const char *targetLimit,
428 int32_t **offsets,
429 int32_t sourceIndex,
430 UErrorCode *pErrorCode)
431 {
432 char *targetChars = (char *)*target;
433 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
434 offsets, sourceIndex, pErrorCode);
435 *target = (uint8_t*)targetChars;
436
437 }
438
439 static inline void
setInitialStateToUnicodeKR(UConverter *,UConverterDataISO2022 * myConverterData)440 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
441 if(myConverterData->version == 1) {
442 UConverter *cnv = myConverterData->currentConverter;
443
444 cnv->toUnicodeStatus=0; /* offset */
445 cnv->mode=0; /* state */
446 cnv->toULength=0; /* byteIndex */
447 }
448 }
449
450 static inline void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)451 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
452 /* in ISO-2022-KR the designator sequence appears only once
453 * in a file so we append it only once
454 */
455 if( converter->charErrorBufferLength==0){
456
457 converter->charErrorBufferLength = 4;
458 converter->charErrorBuffer[0] = 0x1b;
459 converter->charErrorBuffer[1] = 0x24;
460 converter->charErrorBuffer[2] = 0x29;
461 converter->charErrorBuffer[3] = 0x43;
462 }
463 if(myConverterData->version == 1) {
464 UConverter *cnv = myConverterData->currentConverter;
465
466 cnv->fromUChar32=0;
467 cnv->fromUnicodeStatus=1; /* prevLength */
468 }
469 }
470
471 static void
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)472 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
473
474 char myLocale[6]={' ',' ',' ',' ',' ',' '};
475
476 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
477 if(cnv->extraInfo != NULL) {
478 UConverterNamePieces stackPieces;
479 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
480 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
481 uint32_t version;
482
483 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
484
485 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
486 myConverterData->currentType = ASCII1;
487 cnv->fromUnicodeStatus =FALSE;
488 if(pArgs->locale){
489 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
490 }
491 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
492 myConverterData->version = version;
493 /* Begin Google-specific change. */
494 /* The "jk" locale ID was made up for KDDI ISO-2022-JP. */
495 /* The "js" locale ID was made up for SoftBank ISO-2022-JP. */
496 if((myLocale[0]=='j' &&
497 (myLocale[1]=='a'|| myLocale[1]=='p' || myLocale[1]=='k' ||
498 myLocale[1]=='s') &&
499 (myLocale[2]=='_' || myLocale[2]=='\0')))
500 {
501 /* open the required converters and cache them */
502 if(version>MAX_JA_VERSION) {
503 // ICU 55 fails to open a converter for an unsupported version.
504 // Previously, it fell back to version 0, but that would yield
505 // unexpected behavior.
506 *errorCode = U_MISSING_RESOURCE_ERROR;
507 return;
508 }
509 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
510 myConverterData->myConverterArray[ISO8859_7] =
511 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
512 }
513 if (myLocale[1]=='k') { /* Use KDDI's version. */
514 myConverterData->myConverterArray[JISX208] =
515 ucnv_loadSharedData("kddi-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
516 } else if (myLocale[1]=='s') { /* Use SoftBank's version. */
517 myConverterData->myConverterArray[JISX208] =
518 ucnv_loadSharedData("softbank-jisx-208-2007", &stackPieces, &stackArgs, errorCode);
519 } else {
520 /*
521 * Change for http://b/issue?id=937017 :
522 * Restore JIS X 0208 ISO-2022-JP mappings from before
523 * sharing the table with the Shift-JIS converter
524 * (CL 5963009 and http://bugs.icu-project.org/trac/ticket/5797).
525 * TODO(mscherer): Create and use a new, unified Google Shift-JIS
526 * table for both Shift-JIS and ISO-2022-JP.
527 */
528 myConverterData->myConverterArray[JISX208] =
529 ucnv_loadSharedData("jisx-208", &stackPieces, &stackArgs, errorCode);
530 }
531 /* End Google-specific change. */
532 if(jpCharsetMasks[version]&CSM(JISX212)) {
533 myConverterData->myConverterArray[JISX212] =
534 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
535 }
536 if(jpCharsetMasks[version]&CSM(GB2312)) {
537 myConverterData->myConverterArray[GB2312] =
538 /* BEGIN android-changed */
539 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
540 /* END android-changed */
541 }
542 if(jpCharsetMasks[version]&CSM(KSC5601)) {
543 myConverterData->myConverterArray[KSC5601] =
544 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
545 }
546
547 /* set the function pointers to appropriate funtions */
548 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
549 uprv_strcpy(myConverterData->locale,"ja");
550
551 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
552 size_t len = uprv_strlen(myConverterData->name);
553 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
554 myConverterData->name[len+1]='\0';
555 }
556 #if !UCONFIG_ONLY_HTML_CONVERSION
557 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
558 (myLocale[2]=='_' || myLocale[2]=='\0'))
559 {
560 if(version>1) {
561 // ICU 55 fails to open a converter for an unsupported version.
562 // Previously, it fell back to version 0, but that would yield
563 // unexpected behavior.
564 *errorCode = U_MISSING_RESOURCE_ERROR;
565 return;
566 }
567 const char *cnvName;
568 if(version==1) {
569 cnvName="icu-internal-25546";
570 } else {
571 /* BEGIN android-changed */
572 cnvName="ksc_5601";
573 /* END android-changed */
574 myConverterData->version=version=0;
575 }
576 if(pArgs->onlyTestIsLoadable) {
577 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
578 uprv_free(cnv->extraInfo);
579 cnv->extraInfo=NULL;
580 return;
581 } else {
582 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
583 if (U_FAILURE(*errorCode)) {
584 _ISO2022Close(cnv);
585 return;
586 }
587
588 if(version==1) {
589 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
590 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
591 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
592 }else{
593 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
594 }
595
596 /* initialize the state variables */
597 setInitialStateToUnicodeKR(cnv, myConverterData);
598 setInitialStateFromUnicodeKR(cnv, myConverterData);
599
600 /* set the function pointers to appropriate funtions */
601 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
602 uprv_strcpy(myConverterData->locale,"ko");
603 }
604 }
605 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
606 (myLocale[2]=='_' || myLocale[2]=='\0'))
607 {
608 if(version>2) {
609 // ICU 55 fails to open a converter for an unsupported version.
610 // Previously, it fell back to version 0, but that would yield
611 // unexpected behavior.
612 *errorCode = U_MISSING_RESOURCE_ERROR;
613 return;
614 }
615
616 /* open the required converters and cache them */
617 /* BEGIN android-changed */
618 myConverterData->myConverterArray[GB2312_1] =
619 ucnv_loadSharedData("noop-gb2312_gl", &stackPieces, &stackArgs, errorCode);
620 if(version==1) {
621 myConverterData->myConverterArray[ISO_IR_165] =
622 ucnv_loadSharedData("noop-iso-ir-165", &stackPieces, &stackArgs, errorCode);
623 }
624 myConverterData->myConverterArray[CNS_11643] =
625 ucnv_loadSharedData("noop-cns-11643", &stackPieces, &stackArgs, errorCode);
626 /* END android-changed */
627
628
629 /* set the function pointers to appropriate funtions */
630 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
631 uprv_strcpy(myConverterData->locale,"cn");
632
633 if (version==0){
634 myConverterData->version = 0;
635 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
636 }else if (version==1){
637 myConverterData->version = 1;
638 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
639 }else {
640 myConverterData->version = 2;
641 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
642 }
643 }
644 #endif // !UCONFIG_ONLY_HTML_CONVERSION
645 else{
646 #ifdef U_ENABLE_GENERIC_ISO_2022
647 myConverterData->isFirstBuffer = TRUE;
648
649 /* append the UTF-8 escape sequence */
650 cnv->charErrorBufferLength = 3;
651 cnv->charErrorBuffer[0] = 0x1b;
652 cnv->charErrorBuffer[1] = 0x25;
653 cnv->charErrorBuffer[2] = 0x42;
654
655 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
656 /* initialize the state variables */
657 uprv_strcpy(myConverterData->name,"ISO_2022");
658 #else
659 *errorCode = U_MISSING_RESOURCE_ERROR;
660 // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
661 // data loading error code.
662 return;
663 #endif
664 }
665
666 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
667
668 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
669 _ISO2022Close(cnv);
670 }
671 } else {
672 *errorCode = U_MEMORY_ALLOCATION_ERROR;
673 }
674 }
675
676
677 static void
_ISO2022Close(UConverter * converter)678 _ISO2022Close(UConverter *converter) {
679 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
680 UConverterSharedData **array = myData->myConverterArray;
681 int32_t i;
682
683 if (converter->extraInfo != NULL) {
684 /*close the array of converter pointers and free the memory*/
685 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
686 if(array[i]!=NULL) {
687 ucnv_unloadSharedDataIfReady(array[i]);
688 }
689 }
690
691 ucnv_close(myData->currentConverter);
692
693 if(!converter->isExtraLocal){
694 uprv_free (converter->extraInfo);
695 converter->extraInfo = NULL;
696 }
697 }
698 }
699
700 static void
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)701 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
702 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
703 if(choice<=UCNV_RESET_TO_UNICODE) {
704 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
705 myConverterData->key = 0;
706 myConverterData->isEmptySegment = FALSE;
707 }
708 if(choice!=UCNV_RESET_TO_UNICODE) {
709 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
710 }
711 #ifdef U_ENABLE_GENERIC_ISO_2022
712 if(myConverterData->locale[0] == 0){
713 if(choice<=UCNV_RESET_TO_UNICODE) {
714 myConverterData->isFirstBuffer = TRUE;
715 myConverterData->key = 0;
716 if (converter->mode == UCNV_SO){
717 ucnv_close (myConverterData->currentConverter);
718 myConverterData->currentConverter=NULL;
719 }
720 converter->mode = UCNV_SI;
721 }
722 if(choice!=UCNV_RESET_TO_UNICODE) {
723 /* re-append UTF-8 escape sequence */
724 converter->charErrorBufferLength = 3;
725 converter->charErrorBuffer[0] = 0x1b;
726 converter->charErrorBuffer[1] = 0x28;
727 converter->charErrorBuffer[2] = 0x42;
728 }
729 }
730 else
731 #endif
732 {
733 /* reset the state variables */
734 if(myConverterData->locale[0] == 'k'){
735 if(choice<=UCNV_RESET_TO_UNICODE) {
736 setInitialStateToUnicodeKR(converter, myConverterData);
737 }
738 if(choice!=UCNV_RESET_TO_UNICODE) {
739 setInitialStateFromUnicodeKR(converter, myConverterData);
740 }
741 }
742 }
743 }
744
745 static const char*
_ISO2022getName(const UConverter * cnv)746 _ISO2022getName(const UConverter* cnv){
747 if(cnv->extraInfo){
748 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
749 return myData->name;
750 }
751 return NULL;
752 }
753
754
755 /*************** to unicode *******************/
756 /****************************************************************************
757 * Recognized escape sequences are
758 * <ESC>(B ASCII
759 * <ESC>.A ISO-8859-1
760 * <ESC>.F ISO-8859-7
761 * <ESC>(J JISX-201
762 * <ESC>(I JISX-201
763 * <ESC>$B JISX-208
764 * <ESC>$@ JISX-208
765 * <ESC>$(D JISX-212
766 * <ESC>$A GB2312
767 * <ESC>$(C KSC5601
768 */
769 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
770 /* 0 1 2 3 4 5 6 7 8 9 */
771 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
772 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
773 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
774 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
775 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
776 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
777 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
778 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
779 };
780
#if !UCONFIG_ONLY_HTML_CONVERSION
/*************** to unicode *******************/
/*
 * ISO-2022-CN: result of each terminal escape sequence, indexed by the
 * escape sequence table offset that getKey_2022() reports.
 * changeState_2022() reads nextStateToUnicodeCN[offset]; an INVALID_STATE
 * entry makes it report U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The table has MAX_STATES_2022 entries, ten per row.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/* 0 1 2 3 4 5 6 7 8 9 */
    INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
    ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
    ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
};
#endif
795
796
797 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)798 getKey_2022(char c,int32_t* key,int32_t* offset){
799 int32_t togo;
800 int32_t low = 0;
801 int32_t hi = MAX_STATES_2022;
802 int32_t oldmid=0;
803
804 togo = normalize_esq_chars_2022[(uint8_t)c];
805 if(togo == 0) {
806 /* not a valid character anywhere in an escape sequence */
807 *key = 0;
808 *offset = 0;
809 return INVALID_2022;
810 }
811 togo = (*key << 5) + togo;
812
813 while (hi != low) /*binary search*/{
814
815 int32_t mid = (hi+low) >> 1; /*Finds median*/
816
817 if (mid == oldmid)
818 break;
819
820 if (escSeqStateTable_Key_2022[mid] > togo){
821 hi = mid;
822 }
823 else if (escSeqStateTable_Key_2022[mid] < togo){
824 low = mid;
825 }
826 else /*we found it*/{
827 *key = togo;
828 *offset = mid;
829 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
830 }
831 oldmid = mid;
832
833 }
834
835 *key = 0;
836 *offset = 0;
837 return INVALID_2022;
838 }
839
/*
 * Runs through a state machine to determine the escape sequence - codepage
 * correspondence.
 *
 * Bytes are consumed from *source (which is advanced) up to sourceLimit.
 * Each byte is appended to _this->toUBytes[] and fed to getKey_2022() together
 * with the key saved in the converter's UConverterDataISO2022. The loop ends
 * at the first terminal or invalid result, or when the input runs out.
 *
 * Outcomes:
 * - Input ran out in the middle of a sequence: the key is saved (non-zero)
 *   and the function returns without touching *err, so that a later call can
 *   continue with the remaining bytes.
 * - Invalid sequence: *err = U_ILLEGAL_ESCAPE_SEQUENCE. All bytes except the
 *   first one are backed out again (see the Ticket 5691 comment below).
 * - Complete sequence: the to-Unicode state is updated according to var.
 *   A sequence that the variant/version does not accept sets
 *   U_UNSUPPORTED_ESCAPE_SEQUENCE and toUCallbackReason = UCNV_UNASSIGNED.
 *   On success toULength is reset to 0.
 *
 * @param _this       the ISO-2022 converter; extraInfo is read as UConverterDataISO2022
 * @param source      in/out pointer to the next input byte
 * @param sourceLimit end of the input
 * @param var         the ISO-2022 variant whose state table is applied
 * @param err         receives the error code; not modified for an incomplete sequence
 */
static void
changeState_2022(UConverter* _this,
                const char** source,
                const char* sourceLimit,
                Variant2022 var,
                UErrorCode* err){
    UCNV_TableStates_2022 value;
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
    uint32_t key = myData2022->key;
    int32_t offset = 0;
    /* number of bytes that were already in toUBytes[] from an earlier buffer */
    int8_t initialToULength = _this->toULength;
    char c;

    value = VALID_NON_TERMINAL_2022;
    while (*source < sourceLimit) {
        c = *(*source)++;
        /* remember every byte so that an illegal sequence can be reported/replayed */
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
        value = getKey_2022(c,(int32_t *) &key, &offset);

        switch (value){

        case VALID_NON_TERMINAL_2022 :
            /* continue with the loop */
            break;

        case VALID_TERMINAL_2022:
            key = 0;
            goto DONE;

        case INVALID_2022:
            goto DONE;

        case VALID_MAYBE_TERMINAL_2022:
#ifdef U_ENABLE_GENERIC_ISO_2022
            /* ESC ( B is ambiguous only for ISO_2022 itself */
            if(var == ISO_2022) {
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
                _this->toULength = 0;

                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */

                /* continue with the loop */
                value = VALID_NON_TERMINAL_2022;
                break;
            } else
#endif
            {
                /* not ISO_2022 itself, finish here */
                value = VALID_TERMINAL_2022;
                key = 0;
                goto DONE;
            }
        }
    }

DONE:
    /* key is 0 after a complete or invalid sequence, non-zero for an incomplete one */
    myData2022->key = key;

    if (value == VALID_NON_TERMINAL_2022) {
        /* indicate that the escape sequence is incomplete: key!=0 */
        return;
    } else if (value == INVALID_2022 ) {
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
    } else /* value == VALID_TERMINAL_2022 */ {
        switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
        case ISO_2022:
            {
                const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
                if(chosenConverterName == NULL) {
                    /* SS2 or SS3 */
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    _this->toUCallbackReason = UCNV_UNASSIGNED;
                    return;
                }

                _this->mode = UCNV_SI;
                ucnv_close(myData2022->currentConverter);
                /* NOTE(review): myUConverter is not declared in this function - confirm that this disabled branch still compiles */
                myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
                if(U_SUCCESS(*err)) {
                    myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    _this->mode = UCNV_SO;
                }
                break;
            }
#endif
        case ISO_2022_JP:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        /* remember the current G0/G1 so that it can be restored after the single shift */
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                /* case SS3_STATE: not used in ISO-2022-JP-x */
                case ISO8859_1:
                case ISO8859_7:
                    /* only accept charsets that this ISO-2022-JP version allows */
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G2 charset for SS2 */
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    }
                    break;
                default:
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        /* G0 charset */
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
#if !UCONFIG_ONLY_HTML_CONVERSION
        case ISO_2022_CN:
            {
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
                switch(tempState) {
                case INVALID_STATE:
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    break;
                case SS2_STATE:
                    if(myData2022->toU2022State.cs[2]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=2;
                    } else {
                        /* illegal to have SS2 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case SS3_STATE:
                    if(myData2022->toU2022State.cs[3]!=0) {
                        if(myData2022->toU2022State.g<2) {
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
                        }
                        myData2022->toU2022State.g=3;
                    } else {
                        /* illegal to have SS3 before a matching designator */
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
                    }
                    break;
                case ISO_IR_165:
                    /* version 0 does not accept ISO-IR-165 */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                        break;
                    }
                    /*fall through*/
                case GB2312_1:
                    /*fall through*/
                case CNS_11643_1:
                    /* these charsets are designated to G1 */
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
                    break;
                case CNS_11643_2:
                    /* plane 2 is designated to G2 */
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
                    break;
                default:
                    /* other CNS 11643 planes */
                    if(myData2022->version==0) {
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
                    } else {
                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
                    }
                    break;
                }
            }
            break;
        case ISO_2022_KR:
            /* only the table entry at offset 0x30 is accepted for ISO-2022-KR */
            if(offset==0x30){
                /* nothing to be done, just accept this one escape sequence */
            } else {
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
            }
            break;
#endif // !UCONFIG_ONLY_HTML_CONVERSION

        default:
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
            break;
        }
    }
    if(U_SUCCESS(*err)) {
        /* the escape sequence was consumed completely */
        _this->toULength = 0;
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
        if(_this->toULength>1) {
            /*
             * Ticket 5691: consistent illegal sequences:
             * - We include at least the first byte (ESC) in the illegal sequence.
             * - If any of the non-initial bytes could be the start of a character,
             *   we stop the illegal sequence before the first one of those.
             *   In escape sequences, all following bytes are "printable", that is,
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
             *   they are valid single/lead bytes.
             *   For simplicity, we always only report the initial ESC byte as the
             *   illegal sequence and back out all other bytes we looked at.
             */
            /* Back out some bytes. */
            int8_t backOutDistance=_this->toULength-1;
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
            if(backOutDistance<=bytesFromThisBuffer) {
                /* same as initialToULength<=1 */
                *source-=backOutDistance;
            } else {
                /* Back out bytes from the previous buffer: Need to replay them. */
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
                /* same as -(initialToULength-1) */
                /* preToULength is negative! */
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
                *source-=bytesFromThisBuffer;
            }
            /* only the first byte remains as the reported illegal sequence */
            _this->toULength=1;
        }
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
}
1071
1072 #if !UCONFIG_ONLY_HTML_CONVERSION
1073 /*Checks the characters of the buffer against valid 2022 escape sequences
1074 *if the match we return a pointer to the initial start of the sequence otherwise
1075 *we return sourceLimit
1076 */
1077 /*for 2022 looks ahead in the stream
1078 *to determine the longest possible convertible
1079 *data stream
1080 */
1081 static inline const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool)1082 getEndOfBuffer_2022(const char** source,
1083 const char* sourceLimit,
1084 UBool /*flush*/){
1085
1086 const char* mySource = *source;
1087
1088 #ifdef U_ENABLE_GENERIC_ISO_2022
1089 if (*source >= sourceLimit)
1090 return sourceLimit;
1091
1092 do{
1093
1094 if (*mySource == ESC_2022){
1095 int8_t i;
1096 int32_t key = 0;
1097 int32_t offset;
1098 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1099
1100 /* Kludge: I could not
1101 * figure out the reason for validating an escape sequence
1102 * twice - once here and once in changeState_2022().
1103 * is it possible to have an ESC character in a ISO2022
1104 * byte stream which is valid in a code page? Is it legal?
1105 */
1106 for (i=0;
1107 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1108 i++) {
1109 value = getKey_2022(*(mySource+i), &key, &offset);
1110 }
1111 if (value > 0 || *mySource==ESC_2022)
1112 return mySource;
1113
1114 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1115 return sourceLimit;
1116 }
1117 }while (++mySource < sourceLimit);
1118
1119 return sourceLimit;
1120 #else
1121 while(mySource < sourceLimit && *mySource != ESC_2022) {
1122 ++mySource;
1123 }
1124 return mySource;
1125 #endif
1126 }
1127 #endif
1128
1129 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1130 * any future change in _MBCSFromUChar32() function should be reflected here.
1131 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1132 */
1133 static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1134 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1135 UChar32 c,
1136 uint32_t* value,
1137 UBool useFallback,
1138 int outputType)
1139 {
1140 const int32_t *cx;
1141 const uint16_t *table;
1142 uint32_t stage2Entry;
1143 uint32_t myValue;
1144 int32_t length;
1145 const uint8_t *p;
1146 /*
1147 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1148 * Use internal version of ucnv_open() that verifies that the new structures are available,
1149 * else U_INTERNAL_PROGRAM_ERROR.
1150 */
1151 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1152 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1153 table=sharedData->mbcs.fromUnicodeTable;
1154 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1155 /* get the bytes and the length for the output */
1156 if(outputType==MBCS_OUTPUT_2){
1157 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1158 if(myValue<=0xff) {
1159 length=1;
1160 } else {
1161 length=2;
1162 }
1163 } else /* outputType==MBCS_OUTPUT_3 */ {
1164 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1165 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1166 if(myValue<=0xff) {
1167 length=1;
1168 } else if(myValue<=0xffff) {
1169 length=2;
1170 } else {
1171 length=3;
1172 }
1173 }
1174 /* is this code point assigned, or do we use fallbacks? */
1175 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1176 /* assigned */
1177 *value=myValue;
1178 return length;
1179 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1180 /*
1181 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1182 * There is no way with this data structure for fallback output
1183 * to be a zero byte.
1184 */
1185 *value=myValue;
1186 return -length;
1187 }
1188 }
1189
1190 cx=sharedData->mbcs.extIndexes;
1191 if(cx!=NULL) {
1192 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1193 }
1194
1195 /* unassigned */
1196 return 0;
1197 }
1198
1199 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1200 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1201 * @param retval pointer to output byte
1202 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1203 */
1204 static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1205 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1206 UChar32 c,
1207 uint32_t* retval,
1208 UBool useFallback)
1209 {
1210 const uint16_t *table;
1211 int32_t value;
1212 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1213 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1214 return 0;
1215 }
1216 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1217 table=sharedData->mbcs.fromUnicodeTable;
1218 /* get the byte for the output */
1219 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1220 /* is this code point assigned, or do we use fallbacks? */
1221 *retval=(uint32_t)(value&0xff);
1222 if(value>=0xf00) {
1223 return 1; /* roundtrip */
1224 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1225 return -1; /* fallback taken */
1226 } else {
1227 return 0; /* no mapping */
1228 }
1229 }
1230
/*
 * Convert a strict EUC DBCS value to the ISO 2022 byte range.
 *
 * The value is accepted only if its low 16 bits lie in A1A1..FEFE and its
 * low byte lies in A1..FE. 0x80 is then subtracted from each of the two bytes,
 * which moves them to 21..7E.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance from the smallest valid pair / trail byte; wraps around for smaller inputs */
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;              /* not valid for ISO 2022 */
    }
    return value - 0x8080;     /* shift down to 21..7e byte range */
}
1247
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled code, kept for reference only (see the #if 0 above).
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each byte, then apply the same range test as _2022FromGR94DBCS() */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
1265
#ifdef U_ENABLE_GENERIC_ISO_2022

/**********************************************************************************
*  ISO-2022 Converter
*
*
*/

/*
 * To-Unicode conversion for the generic ISO-2022 converter.
 * Only compiled when U_ENABLE_GENERIC_ISO_2022 is defined.
 *
 * The input is processed in alternating steps:
 * - While no escape sequence is pending (key==0), the bytes up to the position
 *   returned by getEndOfBuffer_2022() are converted with
 *   myData->currentConverter (opened as "ASCII" on first use).
 * - The escape sequence at the current position is then handled by
 *   changeState_2022().
 *
 * The function returns early on any error, when the input is used up, and,
 * if offsets are requested, after every step that consumed input or produced
 * output.
 *
 * @param args conversion arguments; source/target are advanced
 * @param err  receives the error code
 */
static void
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=FALSE;
                sourceStart = args->source;
                myTargetStart = args->target;
                /* temporarily substitute the sub-converter; restored right after the call */
                args->converter = myData->currentConverter;
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* at an ESC byte, or continuing an escape sequence from an earlier call (key!=0) */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}

#endif
1379
1380 /*
1381 * To Unicode Callback helper function
1382 */
1383 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1384 toUnicodeCallback(UConverter *cnv,
1385 const uint32_t sourceChar, const uint32_t targetUniChar,
1386 UErrorCode* err){
1387 if(sourceChar>0xff){
1388 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1389 cnv->toUBytes[1] = (uint8_t)sourceChar;
1390 cnv->toULength = 2;
1391 }
1392 else{
1393 cnv->toUBytes[0] =(char) sourceChar;
1394 cnv->toULength = 1;
1395 }
1396
1397 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1398 *err = U_INVALID_CHAR_FOUND;
1399 }
1400 else{
1401 *err = U_ILLEGAL_CHAR_FOUND;
1402 }
1403 }
1404
1405 /**************************************ISO-2022-JP*************************************************/
1406
1407 /************************************** IMPORTANT **************************************************
1408 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1409 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1410 * The converter iterates over each Unicode codepoint
1411 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1412 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1413 * would do as far as possible.
1414 *
1415 * If the implementation of these macros or structure of sharedData struct change in the future, make
1416 * sure that ISO-2022 is also changed.
1417 ***************************************************************************************************
1418 */
1419
1420 /***************************************************************************************************
1421 * Rules for ISO-2022-jp encoding
1422 * (i) Escape sequences must be fully contained within a line they should not
1423 * span new lines or CRs
1424 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1425 * JIS-Roman character escape sequence should follow before the line terminates
1426 * (iii) If the first character on the line is represented by two bytes then a two
1427 * byte character escape sequence should precede it
1428 * (iv) If no escape sequence is encountered then the characters are ASCII
1429 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1430 * and invoked with SS2 (ESC N).
1431 * (vi) If there is any G0 designation in text, there must be a switch to
1432 * ASCII or to JIS X 0201-Roman before a space character (but not
1433 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1434 * characters such as tab or CRLF.
 * (vii) Supported encodings:
1436 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1437 *
1438 * source : RFC-1554
1439 *
1440 * JISX201, JISX208,JISX212 : new .cnv data files created
1441 * KSC5601 : alias to ibm-949 mapping table
1442 * GB2312 : alias to ibm-1386 mapping table
1443 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
 * ISO-8859-7 : alias to ibm-9409 mapping table
1445 */
1446
/*
 * preference order of JP charsets
 *
 * UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC() walks this array from
 * front to back when it appends the charsets that are allowed for the
 * converter version, and not yet listed, to its choices[].
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    JISX208,
    ISO8859_7,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
1459
/*
 * Designation escape sequences, one per charset.
 *
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 *
 * Each entry is a NUL-terminated string; the longest sequences listed here
 * are 4 bytes long.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Length in bytes of each escape sequence, one entry per charset in the
 * order listed in the comments below.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1487
1488 /*
1489 * The iteration over various code pages works this way:
1490 * i) Get the currentState from myConverterData->currentState
1491 * ii) Check if the character is mapped to a valid character in the currentState
1492 * Yes -> a) set the initIterState to currentState
1493 * b) remain in this state until an invalid character is found
1494 * No -> a) go to the next code page and find the character
 * iii) Before changing the state, increment the current state and check whether
 *      it is equal to the initIterState
1497 * Yes -> A character that cannot be represented in any of the supported encodings
1498 * break and return a U_INVALID_CHARACTER error
1499 * No -> Continue and find the character in next code page
1500 *
1501 *
1502 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1503 */
1504
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1518
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the byte values 5C and 7E, so U+005C and U+007E
 * themselves are unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
1533
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * Each Shift-JIS lead byte covers two JIS X 0208 rows: trail bytes up to 9E
 * belong to the first (odd) row, larger trail bytes to the second (even) row.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trailByte;
    uint32_t row;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trailByte = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF are folded into one contiguous range, then doubled */
    row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;
    row <<= 1;

    if(trailByte > 0x9e) {
        /* second row of the lead byte; trail <= 0xfc */
        return row | (trailByte - 0x7e);
    }

    /* first row of the lead byte; Shift-JIS skips the trail byte 7F */
    row -= 0x100;
    return row | (trailByte - ((trailByte <= 0x7e) ? 0x1f : 0x20));
}
1569
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first JIS X 0208 byte
 * @param c2    second JIS X 0208 byte
 * @param bytes receives the Shift-JIS lead byte in [0] and trail byte in [1];
 *              a byte explicitly detected as invalid is stored as 0
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1) != 0) {
        /* odd first byte: lower half of the Shift-JIS trail byte range */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* even first byte: upper half of the Shift-JIS trail byte range */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1606
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
 * Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * The table has one entry per code point from HWKANA_START to HWKANA_END
 * (U+FF61..U+FF9F according to the entry comments below).
 * NOTE(review): presumably indexed by (code point - HWKANA_START) - confirm at the use site.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
1679
1680 static void
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1681 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1682 UConverter *cnv = args->converter;
1683 UConverterDataISO2022 *converterData;
1684 ISO2022State *pFromU2022State;
1685 uint8_t *target = (uint8_t *) args->target;
1686 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1687 const UChar* source = args->source;
1688 const UChar* sourceLimit = args->sourceLimit;
1689 int32_t* offsets = args->offsets;
1690 UChar32 sourceChar;
1691 char buffer[8];
1692 int32_t len, outLen;
1693 int8_t choices[10];
1694 int32_t choiceCount;
1695 uint32_t targetValue = 0;
1696 UBool useFallback;
1697
1698 int32_t i;
1699 int8_t cs, g;
1700
1701 /* set up the state */
1702 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1703 pFromU2022State = &converterData->fromU2022State;
1704
1705 choiceCount = 0;
1706
1707 /* check if the last codepoint of previous buffer was a lead surrogate*/
1708 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1709 goto getTrail;
1710 }
1711
1712 while(source < sourceLimit) {
1713 if(target < targetLimit) {
1714
1715 sourceChar = *(source++);
1716 /*check if the char is a First surrogate*/
1717 if(U16_IS_SURROGATE(sourceChar)) {
1718 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1719 getTrail:
1720 /*look ahead to find the trail surrogate*/
1721 if(source < sourceLimit) {
1722 /* test the following code unit */
1723 UChar trail=(UChar) *source;
1724 if(U16_IS_TRAIL(trail)) {
1725 source++;
1726 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1727 cnv->fromUChar32=0x00;
1728 /* convert this supplementary code point */
1729 /* exit this condition tree */
1730 } else {
1731 /* this is an unmatched lead code unit (1st surrogate) */
1732 /* callback(illegal) */
1733 *err=U_ILLEGAL_CHAR_FOUND;
1734 cnv->fromUChar32=sourceChar;
1735 break;
1736 }
1737 } else {
1738 /* no more input */
1739 cnv->fromUChar32=sourceChar;
1740 break;
1741 }
1742 } else {
1743 /* this is an unmatched trail code unit (2nd surrogate) */
1744 /* callback(illegal) */
1745 *err=U_ILLEGAL_CHAR_FOUND;
1746 cnv->fromUChar32=sourceChar;
1747 break;
1748 }
1749 }
1750
1751 /* do not convert SO/SI/ESC */
1752 if(IS_2022_CONTROL(sourceChar)) {
1753 /* callback(illegal) */
1754 *err=U_ILLEGAL_CHAR_FOUND;
1755 cnv->fromUChar32=sourceChar;
1756 break;
1757 }
1758
1759 /* do the conversion */
1760
1761 if(choiceCount == 0) {
1762 uint16_t csm;
1763
1764 /*
1765 * The csm variable keeps track of which charsets are allowed
1766 * and not used yet while building the choices[].
1767 */
1768 csm = jpCharsetMasks[converterData->version];
1769 choiceCount = 0;
1770
1771 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1772 if(converterData->version == 3 || converterData->version == 4) {
1773 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1774 }
1775 /* Do not try single-byte half-width Katakana for other versions. */
1776 csm &= ~CSM(HWKANA_7BIT);
1777
1778 /* try the current G0 charset */
1779 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1780 csm &= ~CSM(cs);
1781
1782 /* try the current G2 charset */
1783 if((cs = pFromU2022State->cs[2]) != 0) {
1784 choices[choiceCount++] = cs;
1785 csm &= ~CSM(cs);
1786 }
1787
1788 /* try all the other possible charsets */
1789 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1790 cs = (int8_t)jpCharsetPref[i];
1791 if(CSM(cs) & csm) {
1792 choices[choiceCount++] = cs;
1793 csm &= ~CSM(cs);
1794 }
1795 }
1796 }
1797
1798 cs = g = 0;
1799 /*
1800 * len==0: no mapping found yet
1801 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1802 * len>0: found a roundtrip result, done
1803 */
1804 len = 0;
1805 /*
1806 * We will turn off useFallback after finding a fallback,
1807 * but we still get fallbacks from PUA code points as usual.
1808 * Therefore, we will also need to check that we don't overwrite
1809 * an early fallback with a later one.
1810 */
1811 useFallback = cnv->useFallback;
1812
1813 for(i = 0; i < choiceCount && len <= 0; ++i) {
1814 uint32_t value;
1815 int32_t len2;
1816 int8_t cs0 = choices[i];
1817 switch(cs0) {
1818 case ASCII:
1819 if(sourceChar <= 0x7f) {
1820 targetValue = (uint32_t)sourceChar;
1821 len = 1;
1822 cs = cs0;
1823 g = 0;
1824 }
1825 break;
1826 case ISO8859_1:
1827 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1828 targetValue = (uint32_t)sourceChar - 0x80;
1829 len = 1;
1830 cs = cs0;
1831 g = 2;
1832 }
1833 break;
1834 case HWKANA_7BIT:
1835 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1836 if(converterData->version==3) {
1837 /* JIS7: use G1 (SO) */
1838 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1839 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1840 len = 1;
1841 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1842 g = 1;
1843 } else if(converterData->version==4) {
1844 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1845 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1846 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1847 len = 1;
1848
1849 cs = pFromU2022State->cs[0];
1850 if(IS_JP_DBCS(cs)) {
1851 /* switch from a DBCS charset to JISX201 */
1852 cs = (int8_t)JISX201;
1853 }
1854 /* else stay in the current G0 charset */
1855 g = 0;
1856 }
1857 /* else do not use HWKANA_7BIT with other versions */
1858 }
1859 break;
1860 case JISX201:
1861 /* G0 SBCS */
1862 value = jisx201FromU(sourceChar);
1863 if(value <= 0x7f) {
1864 targetValue = value;
1865 len = 1;
1866 cs = cs0;
1867 g = 0;
1868 useFallback = FALSE;
1869 }
1870 break;
1871 case JISX208:
1872 /* G0 DBCS from Shift-JIS table */
1873 len2 = MBCS_FROM_UCHAR32_ISO2022(
1874 converterData->myConverterArray[cs0],
1875 sourceChar, &value,
1876 useFallback, MBCS_OUTPUT_2);
1877 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1878 value = _2022FromSJIS(value);
1879 if(value != 0) {
1880 targetValue = value;
1881 len = len2;
1882 cs = cs0;
1883 g = 0;
1884 useFallback = FALSE;
1885 }
1886 } else if(len == 0 && useFallback &&
1887 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1888 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1889 len = -2;
1890 cs = cs0;
1891 g = 0;
1892 useFallback = FALSE;
1893 }
1894 break;
1895 case ISO8859_7:
1896 /* G0 SBCS forced to 7-bit output */
1897 len2 = MBCS_SINGLE_FROM_UCHAR32(
1898 converterData->myConverterArray[cs0],
1899 sourceChar, &value,
1900 useFallback);
1901 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1902 targetValue = value - 0x80;
1903 len = len2;
1904 cs = cs0;
1905 g = 2;
1906 useFallback = FALSE;
1907 }
1908 break;
1909 default:
1910 /* G0 DBCS */
1911 len2 = MBCS_FROM_UCHAR32_ISO2022(
1912 converterData->myConverterArray[cs0],
1913 sourceChar, &value,
1914 useFallback, MBCS_OUTPUT_2);
1915 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1916 if(cs0 == KSC5601) {
1917 /*
1918 * Check for valid bytes for the encoding scheme.
1919 * This is necessary because the sub-converter (windows-949)
1920 * has a broader encoding scheme than is valid for 2022.
1921 */
1922 value = _2022FromGR94DBCS(value);
1923 if(value == 0) {
1924 break;
1925 }
1926 }
1927 targetValue = value;
1928 len = len2;
1929 cs = cs0;
1930 g = 0;
1931 useFallback = FALSE;
1932 }
1933 break;
1934 }
1935 }
1936
1937 if(len != 0) {
1938 if(len < 0) {
1939 len = -len; /* fallback */
1940 }
1941 outLen = 0; /* count output bytes */
1942
1943 /* write SI if necessary (only for JIS7) */
1944 if(pFromU2022State->g == 1 && g == 0) {
1945 buffer[outLen++] = UCNV_SI;
1946 pFromU2022State->g = 0;
1947 }
1948
1949 /* write the designation sequence if necessary */
1950 if(cs != pFromU2022State->cs[g]) {
1951 int32_t escLen = escSeqCharsLen[cs];
1952 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1953 outLen += escLen;
1954 pFromU2022State->cs[g] = cs;
1955
1956 /* invalidate the choices[] */
1957 choiceCount = 0;
1958 }
1959
1960 /* write the shift sequence if necessary */
1961 if(g != pFromU2022State->g) {
1962 switch(g) {
1963 /* case 0 handled before writing escapes */
1964 case 1:
1965 buffer[outLen++] = UCNV_SO;
1966 pFromU2022State->g = 1;
1967 break;
1968 default: /* case 2 */
1969 buffer[outLen++] = 0x1b;
1970 buffer[outLen++] = 0x4e;
1971 break;
1972 /* no case 3: no SS3 in ISO-2022-JP-x */
1973 }
1974 }
1975
1976 /* write the output bytes */
1977 if(len == 1) {
1978 buffer[outLen++] = (char)targetValue;
1979 } else /* len == 2 */ {
1980 buffer[outLen++] = (char)(targetValue >> 8);
1981 buffer[outLen++] = (char)targetValue;
1982 }
1983 } else {
1984 /*
1985 * if we cannot find the character after checking all codepages
1986 * then this is an error
1987 */
1988 *err = U_INVALID_CHAR_FOUND;
1989 cnv->fromUChar32=sourceChar;
1990 break;
1991 }
1992
1993 if(sourceChar == CR || sourceChar == LF) {
1994 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1995 pFromU2022State->cs[2] = 0;
1996 choiceCount = 0;
1997 }
1998
1999 /* output outLen>0 bytes in buffer[] */
2000 if(outLen == 1) {
2001 *target++ = buffer[0];
2002 if(offsets) {
2003 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2004 }
2005 } else if(outLen == 2 && (target + 2) <= targetLimit) {
2006 *target++ = buffer[0];
2007 *target++ = buffer[1];
2008 if(offsets) {
2009 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2010 *offsets++ = sourceIndex;
2011 *offsets++ = sourceIndex;
2012 }
2013 } else {
2014 fromUWriteUInt8(
2015 cnv,
2016 buffer, outLen,
2017 &target, (const char *)targetLimit,
2018 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2019 err);
2020 if(U_FAILURE(*err)) {
2021 break;
2022 }
2023 }
2024 } /* end if(myTargetIndex<myTargetLength) */
2025 else{
2026 *err =U_BUFFER_OVERFLOW_ERROR;
2027 break;
2028 }
2029
2030 }/* end while(mySourceIndex<mySourceLength) */
2031
2032 /*
2033 * the end of the input stream and detection of truncated input
2034 * are handled by the framework, but for ISO-2022-JP conversion
2035 * we need to be in ASCII mode at the very end
2036 *
2037 * conditions:
2038 * successful
2039 * in SO mode or not in ASCII mode
2040 * end of input and no truncated input
2041 */
2042 if( U_SUCCESS(*err) &&
2043 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2044 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2045 ) {
2046 int32_t sourceIndex;
2047
2048 outLen = 0;
2049
2050 if(pFromU2022State->g != 0) {
2051 buffer[outLen++] = UCNV_SI;
2052 pFromU2022State->g = 0;
2053 }
2054
2055 if(pFromU2022State->cs[0] != ASCII) {
2056 int32_t escLen = escSeqCharsLen[ASCII];
2057 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2058 outLen += escLen;
2059 pFromU2022State->cs[0] = (int8_t)ASCII;
2060 }
2061
2062 /* get the source index of the last input character */
2063 /*
2064 * TODO this would be simpler and more reliable if we used a pair
2065 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2066 * so that we could simply use the prevSourceIndex here;
2067 * this code gives an incorrect result for the rare case of an unmatched
2068 * trail surrogate that is alone in the last buffer of the text stream
2069 */
2070 sourceIndex=(int32_t)(source-args->source);
2071 if(sourceIndex>0) {
2072 --sourceIndex;
2073 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2074 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2075 ) {
2076 --sourceIndex;
2077 }
2078 } else {
2079 sourceIndex=-1;
2080 }
2081
2082 fromUWriteUInt8(
2083 cnv,
2084 buffer, outLen,
2085 &target, (const char *)targetLimit,
2086 &offsets, sourceIndex,
2087 err);
2088 }
2089
2090 /*save the state and return */
2091 args->source = source;
2092 args->target = (char*)target;
2093 }
2094
2095 /*************** to unicode *******************/
2096
2097 static void
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2098 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2099 UErrorCode* err){
2100 char tempBuf[2];
2101 const char *mySource = (char *) args->source;
2102 UChar *myTarget = args->target;
2103 const char *mySourceLimit = args->sourceLimit;
2104 uint32_t targetUniChar = 0x0000;
2105 uint32_t mySourceChar = 0x0000;
2106 uint32_t tmpSourceChar = 0x0000;
2107 UConverterDataISO2022* myData;
2108 ISO2022State *pToU2022State;
2109 StateEnum cs;
2110
2111 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2112 pToU2022State = &myData->toU2022State;
2113
2114 if(myData->key != 0) {
2115 /* continue with a partial escape sequence */
2116 goto escape;
2117 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2118 /* continue with a partial double-byte character */
2119 mySourceChar = args->converter->toUBytes[0];
2120 args->converter->toULength = 0;
2121 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2122 targetUniChar = missingCharMarker;
2123 goto getTrailByte;
2124 }
2125
2126 while(mySource < mySourceLimit){
2127
2128 targetUniChar =missingCharMarker;
2129
2130 if(myTarget < args->targetLimit){
2131
2132 mySourceChar= (unsigned char) *mySource++;
2133
2134 switch(mySourceChar) {
2135 case UCNV_SI:
2136 if(myData->version==3) {
2137 pToU2022State->g=0;
2138 continue;
2139 } else {
2140 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2141 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2142 break;
2143 }
2144
2145 case UCNV_SO:
2146 if(myData->version==3) {
2147 /* JIS7: switch to G1 half-width Katakana */
2148 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2149 pToU2022State->g=1;
2150 continue;
2151 } else {
2152 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2153 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2154 break;
2155 }
2156
2157 case ESC_2022:
2158 mySource--;
2159 escape:
2160 {
2161 const char * mySourceBefore = mySource;
2162 int8_t toULengthBefore = args->converter->toULength;
2163
2164 changeState_2022(args->converter,&(mySource),
2165 mySourceLimit, ISO_2022_JP,err);
2166
2167 /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2168 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2169 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2170 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2171 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2172 }
2173 }
2174
2175 /* invalid or illegal escape sequence */
2176 if(U_FAILURE(*err)){
2177 args->target = myTarget;
2178 args->source = mySource;
2179 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2180 return;
2181 }
2182 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2183 if(myData->key==0) {
2184 myData->isEmptySegment = TRUE;
2185 }
2186 continue;
2187
2188 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2189
2190 case CR:
2191 /*falls through*/
2192 case LF:
2193 /* automatically reset to single-byte mode */
2194 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2195 pToU2022State->cs[0] = (int8_t)ASCII;
2196 }
2197 pToU2022State->cs[2] = 0;
2198 pToU2022State->g = 0;
2199 /* falls through */
2200 default:
2201 /* convert one or two bytes */
2202 myData->isEmptySegment = FALSE;
2203 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2204 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2205 !IS_JP_DBCS(cs)
2206 ) {
2207 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2208 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2209
2210 /* return from a single-shift state to the previous one */
2211 if(pToU2022State->g >= 2) {
2212 pToU2022State->g=pToU2022State->prevG;
2213 }
2214 } else switch(cs) {
2215 case ASCII:
2216 if(mySourceChar <= 0x7f) {
2217 targetUniChar = mySourceChar;
2218 }
2219 break;
2220 case ISO8859_1:
2221 if(mySourceChar <= 0x7f) {
2222 targetUniChar = mySourceChar + 0x80;
2223 }
2224 /* return from a single-shift state to the previous one */
2225 pToU2022State->g=pToU2022State->prevG;
2226 break;
2227 case ISO8859_7:
2228 if(mySourceChar <= 0x7f) {
2229 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2230 targetUniChar =
2231 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2232 myData->myConverterArray[cs],
2233 mySourceChar + 0x80);
2234 }
2235 /* return from a single-shift state to the previous one */
2236 pToU2022State->g=pToU2022State->prevG;
2237 break;
2238 case JISX201:
2239 if(mySourceChar <= 0x7f) {
2240 targetUniChar = jisx201ToU(mySourceChar);
2241 }
2242 break;
2243 case HWKANA_7BIT:
2244 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2245 /* 7-bit halfwidth Katakana */
2246 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2247 }
2248 break;
2249 default:
2250 /* G0 DBCS */
2251 if(mySource < mySourceLimit) {
2252 int leadIsOk, trailIsOk;
2253 uint8_t trailByte;
2254 getTrailByte:
2255 trailByte = (uint8_t)*mySource;
2256 /*
2257 * Ticket 5691: consistent illegal sequences:
2258 * - We include at least the first byte in the illegal sequence.
2259 * - If any of the non-initial bytes could be the start of a character,
2260 * we stop the illegal sequence before the first one of those.
2261 *
2262 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2263 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2264 * Otherwise we convert or report the pair of bytes.
2265 */
2266 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2267 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2268 if (leadIsOk && trailIsOk) {
2269 ++mySource;
2270 tmpSourceChar = (mySourceChar << 8) | trailByte;
2271 if(cs == JISX208) {
2272 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2273 mySourceChar = tmpSourceChar;
2274 } else {
2275 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2276 mySourceChar = tmpSourceChar;
2277 if (cs == KSC5601) {
2278 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2279 }
2280 tempBuf[0] = (char)(tmpSourceChar >> 8);
2281 tempBuf[1] = (char)(tmpSourceChar);
2282 }
2283 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2284 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2285 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2286 ++mySource;
2287 /* add another bit so that the code below writes 2 bytes in case of error */
2288 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2289 }
2290 } else {
2291 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2292 args->converter->toULength = 1;
2293 goto endloop;
2294 }
2295 } /* End of inner switch */
2296 break;
2297 } /* End of outer switch */
2298 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2299 if(args->offsets){
2300 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2301 }
2302 *(myTarget++)=(UChar)targetUniChar;
2303 }
2304 else if(targetUniChar > missingCharMarker){
2305 /* disassemble the surrogate pair and write to output*/
2306 targetUniChar-=0x0010000;
2307 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2308 if(args->offsets){
2309 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2310 }
2311 ++myTarget;
2312 if(myTarget< args->targetLimit){
2313 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2314 if(args->offsets){
2315 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2316 }
2317 ++myTarget;
2318 }else{
2319 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2320 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2321 }
2322
2323 }
2324 else{
2325 /* Call the callback function*/
2326 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2327 break;
2328 }
2329 }
2330 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2331 *err =U_BUFFER_OVERFLOW_ERROR;
2332 break;
2333 }
2334 }
2335 endloop:
2336 args->target = myTarget;
2337 args->source = mySource;
2338 }
2339
2340
2341 #if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i)  The KSC5601 designator sequence should appear only once in a file,
*       at the beginning of a line before any KSC5601 characters. This usually
*       means that it appears by itself on the first line of the file.
*   ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*       and SI to shift into single-byte mode.
*/
2350 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2351 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2352
2353 UConverter* saveConv = args->converter;
2354 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2355 args->converter=myConverterData->currentConverter;
2356
2357 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2358 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2359 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2360
2361 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2362 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2363 uprv_memcpy(
2364 saveConv->charErrorBuffer,
2365 myConverterData->currentConverter->charErrorBuffer,
2366 myConverterData->currentConverter->charErrorBufferLength);
2367 }
2368 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2369 myConverterData->currentConverter->charErrorBufferLength = 0;
2370 }
2371 args->converter=saveConv;
2372 }
2373
2374 static void
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2375 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2376
2377 const UChar *source = args->source;
2378 const UChar *sourceLimit = args->sourceLimit;
2379 unsigned char *target = (unsigned char *) args->target;
2380 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2381 int32_t* offsets = args->offsets;
2382 uint32_t targetByteUnit = 0x0000;
2383 UChar32 sourceChar = 0x0000;
2384 UBool isTargetByteDBCS;
2385 UBool oldIsTargetByteDBCS;
2386 UConverterDataISO2022 *converterData;
2387 UConverterSharedData* sharedData;
2388 UBool useFallback;
2389 int32_t length =0;
2390
2391 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2392 /* if the version is 1 then the user is requesting
2393 * conversion with ibm-25546 pass the arguments to
2394 * MBCS converter and return
2395 */
2396 if(converterData->version==1){
2397 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2398 return;
2399 }
2400
2401 /* initialize data */
2402 sharedData = converterData->currentConverter->sharedData;
2403 useFallback = args->converter->useFallback;
2404 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2405 oldIsTargetByteDBCS = isTargetByteDBCS;
2406
2407 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2408 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2409 goto getTrail;
2410 }
2411 while(source < sourceLimit){
2412
2413 targetByteUnit = missingCharMarker;
2414
2415 if(target < (unsigned char*) args->targetLimit){
2416 sourceChar = *source++;
2417
2418 /* do not convert SO/SI/ESC */
2419 if(IS_2022_CONTROL(sourceChar)) {
2420 /* callback(illegal) */
2421 *err=U_ILLEGAL_CHAR_FOUND;
2422 args->converter->fromUChar32=sourceChar;
2423 break;
2424 }
2425
2426 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2427 if(length < 0) {
2428 length = -length; /* fallback */
2429 }
2430 /* only DBCS or SBCS characters are expected*/
2431 /* DB characters with high bit set to 1 are expected */
2432 if( length > 2 || length==0 ||
2433 (length == 1 && targetByteUnit > 0x7f) ||
2434 (length == 2 &&
2435 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2436 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2437 ) {
2438 targetByteUnit=missingCharMarker;
2439 }
2440 if (targetByteUnit != missingCharMarker){
2441
2442 oldIsTargetByteDBCS = isTargetByteDBCS;
2443 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2444 /* append the shift sequence */
2445 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2446
2447 if (isTargetByteDBCS)
2448 *target++ = UCNV_SO;
2449 else
2450 *target++ = UCNV_SI;
2451 if(offsets)
2452 *(offsets++) = (int32_t)(source - args->source-1);
2453 }
2454 /* write the targetUniChar to target */
2455 if(targetByteUnit <= 0x00FF){
2456 if( target < targetLimit){
2457 *(target++) = (unsigned char) targetByteUnit;
2458 if(offsets){
2459 *(offsets++) = (int32_t)(source - args->source-1);
2460 }
2461
2462 }else{
2463 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2464 *err = U_BUFFER_OVERFLOW_ERROR;
2465 }
2466 }else{
2467 if(target < targetLimit){
2468 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2469 if(offsets){
2470 *(offsets++) = (int32_t)(source - args->source-1);
2471 }
2472 if(target < targetLimit){
2473 *(target++) =(unsigned char) (targetByteUnit -0x80);
2474 if(offsets){
2475 *(offsets++) = (int32_t)(source - args->source-1);
2476 }
2477 }else{
2478 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2479 *err = U_BUFFER_OVERFLOW_ERROR;
2480 }
2481 }else{
2482 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2483 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2484 *err = U_BUFFER_OVERFLOW_ERROR;
2485 }
2486 }
2487
2488 }
2489 else{
2490 /* oops.. the code point is unassingned
2491 * set the error and reason
2492 */
2493
2494 /*check if the char is a First surrogate*/
2495 if(U16_IS_SURROGATE(sourceChar)) {
2496 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2497 getTrail:
2498 /*look ahead to find the trail surrogate*/
2499 if(source < sourceLimit) {
2500 /* test the following code unit */
2501 UChar trail=(UChar) *source;
2502 if(U16_IS_TRAIL(trail)) {
2503 source++;
2504 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2505 *err = U_INVALID_CHAR_FOUND;
2506 /* convert this surrogate code point */
2507 /* exit this condition tree */
2508 } else {
2509 /* this is an unmatched lead code unit (1st surrogate) */
2510 /* callback(illegal) */
2511 *err=U_ILLEGAL_CHAR_FOUND;
2512 }
2513 } else {
2514 /* no more input */
2515 *err = U_ZERO_ERROR;
2516 }
2517 } else {
2518 /* this is an unmatched trail code unit (2nd surrogate) */
2519 /* callback(illegal) */
2520 *err=U_ILLEGAL_CHAR_FOUND;
2521 }
2522 } else {
2523 /* callback(unassigned) for a BMP code point */
2524 *err = U_INVALID_CHAR_FOUND;
2525 }
2526
2527 args->converter->fromUChar32=sourceChar;
2528 break;
2529 }
2530 } /* end if(myTargetIndex<myTargetLength) */
2531 else{
2532 *err =U_BUFFER_OVERFLOW_ERROR;
2533 break;
2534 }
2535
2536 }/* end while(mySourceIndex<mySourceLength) */
2537
2538 /*
2539 * the end of the input stream and detection of truncated input
2540 * are handled by the framework, but for ISO-2022-KR conversion
2541 * we need to be in ASCII mode at the very end
2542 *
2543 * conditions:
2544 * successful
2545 * not in ASCII mode
2546 * end of input and no truncated input
2547 */
2548 if( U_SUCCESS(*err) &&
2549 isTargetByteDBCS &&
2550 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2551 ) {
2552 int32_t sourceIndex;
2553
2554 /* we are switching to ASCII */
2555 isTargetByteDBCS=FALSE;
2556
2557 /* get the source index of the last input character */
2558 /*
2559 * TODO this would be simpler and more reliable if we used a pair
2560 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2561 * so that we could simply use the prevSourceIndex here;
2562 * this code gives an incorrect result for the rare case of an unmatched
2563 * trail surrogate that is alone in the last buffer of the text stream
2564 */
2565 sourceIndex=(int32_t)(source-args->source);
2566 if(sourceIndex>0) {
2567 --sourceIndex;
2568 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2569 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2570 ) {
2571 --sourceIndex;
2572 }
2573 } else {
2574 sourceIndex=-1;
2575 }
2576
2577 fromUWriteUInt8(
2578 args->converter,
2579 SHIFT_IN_STR, 1,
2580 &target, (const char *)targetLimit,
2581 &offsets, sourceIndex,
2582 err);
2583 }
2584
2585 /*save the state and return */
2586 args->source = source;
2587 args->target = (char*)target;
2588 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2589 }
2590
2591 /************************ To Unicode ***************************************/
2592
2593 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2594 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2595 UErrorCode* err){
2596 char const* sourceStart;
2597 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2598
2599 UConverterToUnicodeArgs subArgs;
2600 int32_t minArgsSize;
2601
2602 /* set up the subconverter arguments */
2603 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2604 minArgsSize = args->size;
2605 } else {
2606 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2607 }
2608
2609 uprv_memcpy(&subArgs, args, minArgsSize);
2610 subArgs.size = (uint16_t)minArgsSize;
2611 subArgs.converter = myData->currentConverter;
2612
2613 /* remember the original start of the input for offsets */
2614 sourceStart = args->source;
2615
2616 if(myData->key != 0) {
2617 /* continue with a partial escape sequence */
2618 goto escape;
2619 }
2620
2621 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2622 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2623 subArgs.source = args->source;
2624 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2625 if(subArgs.source != subArgs.sourceLimit) {
2626 /*
2627 * get the current partial byte sequence
2628 *
2629 * it needs to be moved between the public and the subconverter
2630 * so that the conversion framework, which only sees the public
2631 * converter, can handle truncated and illegal input etc.
2632 */
2633 if(args->converter->toULength > 0) {
2634 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2635 }
2636 subArgs.converter->toULength = args->converter->toULength;
2637
2638 /*
2639 * Convert up to the end of the input, or to before the next escape character.
2640 * Does not handle conversion extensions because the preToU[] state etc.
2641 * is not copied.
2642 */
2643 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2644
2645 if(args->offsets != NULL && sourceStart != args->source) {
2646 /* update offsets to base them on the actual start of the input */
2647 int32_t *offsets = args->offsets;
2648 UChar *target = args->target;
2649 int32_t delta = (int32_t)(args->source - sourceStart);
2650 while(target < subArgs.target) {
2651 if(*offsets >= 0) {
2652 *offsets += delta;
2653 }
2654 ++offsets;
2655 ++target;
2656 }
2657 }
2658 args->source = subArgs.source;
2659 args->target = subArgs.target;
2660 args->offsets = subArgs.offsets;
2661
2662 /* copy input/error/overflow buffers */
2663 if(subArgs.converter->toULength > 0) {
2664 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2665 }
2666 args->converter->toULength = subArgs.converter->toULength;
2667
2668 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2669 if(subArgs.converter->UCharErrorBufferLength > 0) {
2670 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2671 subArgs.converter->UCharErrorBufferLength);
2672 }
2673 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2674 subArgs.converter->UCharErrorBufferLength = 0;
2675 }
2676 }
2677
2678 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2679 return;
2680 }
2681
2682 escape:
2683 changeState_2022(args->converter,
2684 &(args->source),
2685 args->sourceLimit,
2686 ISO_2022_KR,
2687 err);
2688 }
2689 }
2690
2691 static void
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2692 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2693 UErrorCode* err){
2694 char tempBuf[2];
2695 const char *mySource = ( char *) args->source;
2696 UChar *myTarget = args->target;
2697 const char *mySourceLimit = args->sourceLimit;
2698 UChar32 targetUniChar = 0x0000;
2699 UChar mySourceChar = 0x0000;
2700 UConverterDataISO2022* myData;
2701 UConverterSharedData* sharedData ;
2702 UBool useFallback;
2703
2704 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2705 if(myData->version==1){
2706 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2707 return;
2708 }
2709
2710 /* initialize state */
2711 sharedData = myData->currentConverter->sharedData;
2712 useFallback = args->converter->useFallback;
2713
2714 if(myData->key != 0) {
2715 /* continue with a partial escape sequence */
2716 goto escape;
2717 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2718 /* continue with a partial double-byte character */
2719 mySourceChar = args->converter->toUBytes[0];
2720 args->converter->toULength = 0;
2721 goto getTrailByte;
2722 }
2723
2724 while(mySource< mySourceLimit){
2725
2726 if(myTarget < args->targetLimit){
2727
2728 mySourceChar= (unsigned char) *mySource++;
2729
2730 if(mySourceChar==UCNV_SI){
2731 myData->toU2022State.g = 0;
2732 if (myData->isEmptySegment) {
2733 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2734 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2735 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2736 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2737 args->converter->toULength = 1;
2738 args->target = myTarget;
2739 args->source = mySource;
2740 return;
2741 }
2742 /*consume the source */
2743 continue;
2744 }else if(mySourceChar==UCNV_SO){
2745 myData->toU2022State.g = 1;
2746 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2747 /*consume the source */
2748 continue;
2749 }else if(mySourceChar==ESC_2022){
2750 mySource--;
2751 escape:
2752 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2753 changeState_2022(args->converter,&(mySource),
2754 mySourceLimit, ISO_2022_KR, err);
2755 if(U_FAILURE(*err)){
2756 args->target = myTarget;
2757 args->source = mySource;
2758 return;
2759 }
2760 continue;
2761 }
2762
2763 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2764 if(myData->toU2022State.g == 1) {
2765 if(mySource < mySourceLimit) {
2766 int leadIsOk, trailIsOk;
2767 uint8_t trailByte;
2768 getTrailByte:
2769 targetUniChar = missingCharMarker;
2770 trailByte = (uint8_t)*mySource;
2771 /*
2772 * Ticket 5691: consistent illegal sequences:
2773 * - We include at least the first byte in the illegal sequence.
2774 * - If any of the non-initial bytes could be the start of a character,
2775 * we stop the illegal sequence before the first one of those.
2776 *
2777 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2778 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2779 * Otherwise we convert or report the pair of bytes.
2780 */
2781 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2782 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2783 if (leadIsOk && trailIsOk) {
2784 ++mySource;
2785 tempBuf[0] = (char)(mySourceChar + 0x80);
2786 tempBuf[1] = (char)(trailByte + 0x80);
2787 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2788 mySourceChar = (mySourceChar << 8) | trailByte;
2789 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2790 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2791 ++mySource;
2792 /* add another bit so that the code below writes 2 bytes in case of error */
2793 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2794 }
2795 } else {
2796 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2797 args->converter->toULength = 1;
2798 break;
2799 }
2800 }
2801 else if(mySourceChar <= 0x7f) {
2802 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2803 } else {
2804 targetUniChar = 0xffff;
2805 }
2806 if(targetUniChar < 0xfffe){
2807 if(args->offsets) {
2808 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2809 }
2810 *(myTarget++)=(UChar)targetUniChar;
2811 }
2812 else {
2813 /* Call the callback function*/
2814 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2815 break;
2816 }
2817 }
2818 else{
2819 *err =U_BUFFER_OVERFLOW_ERROR;
2820 break;
2821 }
2822 }
2823 args->target = myTarget;
2824 args->source = mySource;
2825 }
2826
2827 /*************************** END ISO2022-KR *********************************/
2828
2829 /*************************** ISO-2022-CN *********************************
2830 *
2831 * Rules for ISO-2022-CN Encoding:
2832 * i) The designator sequence must appear once on a line before any instance
2833 * of character set it designates.
2834 * ii) If two lines contain characters from the same character set, both lines
2835 * must include the designator sequence.
2836 * iii) Once the designator sequence is known, a shifting sequence has to be found
2837 * to invoke the shifting
2838 * iv) All lines start in ASCII and end in ASCII.
2839 * v) Four shifting sequences are employed for this purpose:
2840 *
2841 * Sequence ASCII Eq Charsets
2842 * ---------- ------- ---------
2843 * SI <SI> US-ASCII
2844 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2845 * SS2 <ESC>N CNS-11643-1992 Plane 2
2846 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2847 *
2848 * vi)
2849 * SOdesignator : ESC "$" ")" finalchar_for_SO
2850 * SS2designator : ESC "$" "*" finalchar_for_SS2
2851 * SS3designator : ESC "$" "+" finalchar_for_SS3
2852 *
2853 * ESC $ ) A Indicates the bytes following SO are Chinese
2854 * characters as defined in GB 2312-80, until
2855 * another SOdesignation appears
2856 *
2857 *
2858 * ESC $ ) E Indicates the bytes following SO are as defined
2859 * in ISO-IR-165 (for details, see section 2.1),
2860 * until another SOdesignation appears
2861 *
2862 * ESC $ ) G Indicates the bytes following SO are as defined
2863 * in CNS 11643-plane-1, until another
2864 * SOdesignation appears
2865 *
2866 * ESC $ * H Indicates the two bytes immediately following
2867 * SS2 is a Chinese character as defined in CNS
2868 * 11643-plane-2, until another SS2designation
2869 * appears
2870 * (Meaning <ESC>N must precede every 2 byte
2871 * sequence.)
2872 *
2873 * ESC $ + I Indicates the immediate two bytes following SS3
2874 * is a Chinese character as defined in CNS
2875 * 11643-plane-3, until another SS3designation
2876 * appears
2877 * (Meaning <ESC>O must precede every 2 byte
2878 * sequence.)
2879 *
2880 * ESC $ + J Indicates the immediate two bytes following SS3
2881 * is a Chinese character as defined in CNS
2882 * 11643-plane-4, until another SS3designation
2883 * appears
2884 * (In English: <ESC>O must precede every 2 byte
2885 * sequence.)
2886 *
2887 * ESC $ + K Indicates the immediate two bytes following SS3
2888 * is a Chinese character as defined in CNS
2889 * 11643-plane-5, until another SS3designation
2890 * appears
2891 *
2892 * ESC $ + L Indicates the immediate two bytes following SS3
2893 * is a Chinese character as defined in CNS
2894 * 11643-plane-6, until another SS3designation
2895 * appears
2896 *
2897 * ESC $ + M Indicates the immediate two bytes following SS3
2898 * is a Chinese character as defined in CNS
2899 * 11643-plane-7, until another SS3designation
2900 * appears
2901 *
2902 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2903 * has its own designation information before any Chinese characters
2904 * appear
2905 *
2906 */
2907
/*
 * ISO-2022-CN designator escape sequences, one per designated charset.
 * They are declared as const arrays (rather than pointers to literals)
 * to make the strings truly readonly.
 * Each one is ESC '$' followed by ')' / '*' / '+' and a final byte.
 */
static const char GB_2312_80_STR[]             = "\x1B\x24\x29\x41";  /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = "\x1B\x24\x29\x45";  /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2918
2919 /********************** ISO2022-CN Data **************************/
2920 static const char* const escSeqCharsCN[10] ={
2921 SHIFT_IN_STR, /* 0 ASCII */
2922 GB_2312_80_STR, /* 1 GB2312_1 */
2923 ISO_IR_165_STR, /* 2 ISO_IR_165 */
2924 CNS_11643_1992_Plane_1_STR,
2925 CNS_11643_1992_Plane_2_STR,
2926 CNS_11643_1992_Plane_3_STR,
2927 CNS_11643_1992_Plane_4_STR,
2928 CNS_11643_1992_Plane_5_STR,
2929 CNS_11643_1992_Plane_6_STR,
2930 CNS_11643_1992_Plane_7_STR
2931 };
2932
2933 static void
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2934 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2935 UConverter *cnv = args->converter;
2936 UConverterDataISO2022 *converterData;
2937 ISO2022State *pFromU2022State;
2938 uint8_t *target = (uint8_t *) args->target;
2939 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2940 const UChar* source = args->source;
2941 const UChar* sourceLimit = args->sourceLimit;
2942 int32_t* offsets = args->offsets;
2943 UChar32 sourceChar;
2944 char buffer[8];
2945 int32_t len;
2946 int8_t choices[3];
2947 int32_t choiceCount;
2948 uint32_t targetValue = 0;
2949 UBool useFallback;
2950
2951 /* set up the state */
2952 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2953 pFromU2022State = &converterData->fromU2022State;
2954
2955 choiceCount = 0;
2956
2957 /* check if the last codepoint of previous buffer was a lead surrogate*/
2958 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2959 goto getTrail;
2960 }
2961
2962 while( source < sourceLimit){
2963 if(target < targetLimit){
2964
2965 sourceChar = *(source++);
2966 /*check if the char is a First surrogate*/
2967 if(U16_IS_SURROGATE(sourceChar)) {
2968 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2969 getTrail:
2970 /*look ahead to find the trail surrogate*/
2971 if(source < sourceLimit) {
2972 /* test the following code unit */
2973 UChar trail=(UChar) *source;
2974 if(U16_IS_TRAIL(trail)) {
2975 source++;
2976 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2977 cnv->fromUChar32=0x00;
2978 /* convert this supplementary code point */
2979 /* exit this condition tree */
2980 } else {
2981 /* this is an unmatched lead code unit (1st surrogate) */
2982 /* callback(illegal) */
2983 *err=U_ILLEGAL_CHAR_FOUND;
2984 cnv->fromUChar32=sourceChar;
2985 break;
2986 }
2987 } else {
2988 /* no more input */
2989 cnv->fromUChar32=sourceChar;
2990 break;
2991 }
2992 } else {
2993 /* this is an unmatched trail code unit (2nd surrogate) */
2994 /* callback(illegal) */
2995 *err=U_ILLEGAL_CHAR_FOUND;
2996 cnv->fromUChar32=sourceChar;
2997 break;
2998 }
2999 }
3000
3001 /* do the conversion */
3002 if(sourceChar <= 0x007f ){
3003 /* do not convert SO/SI/ESC */
3004 if(IS_2022_CONTROL(sourceChar)) {
3005 /* callback(illegal) */
3006 *err=U_ILLEGAL_CHAR_FOUND;
3007 cnv->fromUChar32=sourceChar;
3008 break;
3009 }
3010
3011 /* US-ASCII */
3012 if(pFromU2022State->g == 0) {
3013 buffer[0] = (char)sourceChar;
3014 len = 1;
3015 } else {
3016 buffer[0] = UCNV_SI;
3017 buffer[1] = (char)sourceChar;
3018 len = 2;
3019 pFromU2022State->g = 0;
3020 choiceCount = 0;
3021 }
3022 if(sourceChar == CR || sourceChar == LF) {
3023 /* reset the state at the end of a line */
3024 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3025 choiceCount = 0;
3026 }
3027 }
3028 else{
3029 /* convert U+0080..U+10ffff */
3030 int32_t i;
3031 int8_t cs, g;
3032
3033 if(choiceCount == 0) {
3034 /* try the current SO/G1 converter first */
3035 choices[0] = pFromU2022State->cs[1];
3036
3037 /* default to GB2312_1 if none is designated yet */
3038 if(choices[0] == 0) {
3039 choices[0] = GB2312_1;
3040 }
3041
3042 if(converterData->version == 0) {
3043 /* ISO-2022-CN */
3044
3045 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3046 if(choices[0] == GB2312_1) {
3047 choices[1] = (int8_t)CNS_11643_1;
3048 } else {
3049 choices[1] = (int8_t)GB2312_1;
3050 }
3051
3052 choiceCount = 2;
3053 } else if (converterData->version == 1) {
3054 /* ISO-2022-CN-EXT */
3055
3056 /* try one of the other converters */
3057 switch(choices[0]) {
3058 case GB2312_1:
3059 choices[1] = (int8_t)CNS_11643_1;
3060 choices[2] = (int8_t)ISO_IR_165;
3061 break;
3062 case ISO_IR_165:
3063 choices[1] = (int8_t)GB2312_1;
3064 choices[2] = (int8_t)CNS_11643_1;
3065 break;
3066 default: /* CNS_11643_x */
3067 choices[1] = (int8_t)GB2312_1;
3068 choices[2] = (int8_t)ISO_IR_165;
3069 break;
3070 }
3071
3072 choiceCount = 3;
3073 } else {
3074 choices[0] = (int8_t)CNS_11643_1;
3075 choices[1] = (int8_t)GB2312_1;
3076 }
3077 }
3078
3079 cs = g = 0;
3080 /*
3081 * len==0: no mapping found yet
3082 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3083 * len>0: found a roundtrip result, done
3084 */
3085 len = 0;
3086 /*
3087 * We will turn off useFallback after finding a fallback,
3088 * but we still get fallbacks from PUA code points as usual.
3089 * Therefore, we will also need to check that we don't overwrite
3090 * an early fallback with a later one.
3091 */
3092 useFallback = cnv->useFallback;
3093
3094 for(i = 0; i < choiceCount && len <= 0; ++i) {
3095 int8_t cs0 = choices[i];
3096 if(cs0 > 0) {
3097 uint32_t value;
3098 int32_t len2;
3099 if(cs0 >= CNS_11643_0) {
3100 len2 = MBCS_FROM_UCHAR32_ISO2022(
3101 converterData->myConverterArray[CNS_11643],
3102 sourceChar,
3103 &value,
3104 useFallback,
3105 MBCS_OUTPUT_3);
3106 if(len2 == 3 || (len2 == -3 && len == 0)) {
3107 targetValue = value;
3108 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3109 if(len2 >= 0) {
3110 len = 2;
3111 } else {
3112 len = -2;
3113 useFallback = FALSE;
3114 }
3115 if(cs == CNS_11643_1) {
3116 g = 1;
3117 } else if(cs == CNS_11643_2) {
3118 g = 2;
3119 } else /* plane 3..7 */ if(converterData->version == 1) {
3120 g = 3;
3121 } else {
3122 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3123 len = 0;
3124 }
3125 }
3126 } else {
3127 /* GB2312_1 or ISO-IR-165 */
3128 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3129 len2 = MBCS_FROM_UCHAR32_ISO2022(
3130 converterData->myConverterArray[cs0],
3131 sourceChar,
3132 &value,
3133 useFallback,
3134 MBCS_OUTPUT_2);
3135 if(len2 == 2 || (len2 == -2 && len == 0)) {
3136 targetValue = value;
3137 len = len2;
3138 cs = cs0;
3139 g = 1;
3140 useFallback = FALSE;
3141 }
3142 }
3143 }
3144 }
3145
3146 if(len != 0) {
3147 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3148
3149 /* write the designation sequence if necessary */
3150 if(cs != pFromU2022State->cs[g]) {
3151 if(cs < CNS_11643) {
3152 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3153 } else {
3154 U_ASSERT(cs >= CNS_11643_1);
3155 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3156 }
3157 len = 4;
3158 pFromU2022State->cs[g] = cs;
3159 if(g == 1) {
3160 /* changing the SO/G1 charset invalidates the choices[] */
3161 choiceCount = 0;
3162 }
3163 }
3164
3165 /* write the shift sequence if necessary */
3166 if(g != pFromU2022State->g) {
3167 switch(g) {
3168 case 1:
3169 buffer[len++] = UCNV_SO;
3170
3171 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3172 pFromU2022State->g = 1;
3173 break;
3174 case 2:
3175 buffer[len++] = 0x1b;
3176 buffer[len++] = 0x4e;
3177 break;
3178 default: /* case 3 */
3179 buffer[len++] = 0x1b;
3180 buffer[len++] = 0x4f;
3181 break;
3182 }
3183 }
3184
3185 /* write the two output bytes */
3186 buffer[len++] = (char)(targetValue >> 8);
3187 buffer[len++] = (char)targetValue;
3188 } else {
3189 /* if we cannot find the character after checking all codepages
3190 * then this is an error
3191 */
3192 *err = U_INVALID_CHAR_FOUND;
3193 cnv->fromUChar32=sourceChar;
3194 break;
3195 }
3196 }
3197
3198 /* output len>0 bytes in buffer[] */
3199 if(len == 1) {
3200 *target++ = buffer[0];
3201 if(offsets) {
3202 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3203 }
3204 } else if(len == 2 && (target + 2) <= targetLimit) {
3205 *target++ = buffer[0];
3206 *target++ = buffer[1];
3207 if(offsets) {
3208 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3209 *offsets++ = sourceIndex;
3210 *offsets++ = sourceIndex;
3211 }
3212 } else {
3213 fromUWriteUInt8(
3214 cnv,
3215 buffer, len,
3216 &target, (const char *)targetLimit,
3217 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3218 err);
3219 if(U_FAILURE(*err)) {
3220 break;
3221 }
3222 }
3223 } /* end if(myTargetIndex<myTargetLength) */
3224 else{
3225 *err =U_BUFFER_OVERFLOW_ERROR;
3226 break;
3227 }
3228
3229 }/* end while(mySourceIndex<mySourceLength) */
3230
3231 /*
3232 * the end of the input stream and detection of truncated input
3233 * are handled by the framework, but for ISO-2022-CN conversion
3234 * we need to be in ASCII mode at the very end
3235 *
3236 * conditions:
3237 * successful
3238 * not in ASCII mode
3239 * end of input and no truncated input
3240 */
3241 if( U_SUCCESS(*err) &&
3242 pFromU2022State->g!=0 &&
3243 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3244 ) {
3245 int32_t sourceIndex;
3246
3247 /* we are switching to ASCII */
3248 pFromU2022State->g=0;
3249
3250 /* get the source index of the last input character */
3251 /*
3252 * TODO this would be simpler and more reliable if we used a pair
3253 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3254 * so that we could simply use the prevSourceIndex here;
3255 * this code gives an incorrect result for the rare case of an unmatched
3256 * trail surrogate that is alone in the last buffer of the text stream
3257 */
3258 sourceIndex=(int32_t)(source-args->source);
3259 if(sourceIndex>0) {
3260 --sourceIndex;
3261 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3262 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3263 ) {
3264 --sourceIndex;
3265 }
3266 } else {
3267 sourceIndex=-1;
3268 }
3269
3270 fromUWriteUInt8(
3271 cnv,
3272 SHIFT_IN_STR, 1,
3273 &target, (const char *)targetLimit,
3274 &offsets, sourceIndex,
3275 err);
3276 }
3277
3278 /*save the state and return */
3279 args->source = source;
3280 args->target = (char*)target;
3281 }
3282
3283
3284 static void
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)3285 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3286 UErrorCode* err){
3287 char tempBuf[3];
3288 const char *mySource = (char *) args->source;
3289 UChar *myTarget = args->target;
3290 const char *mySourceLimit = args->sourceLimit;
3291 uint32_t targetUniChar = 0x0000;
3292 uint32_t mySourceChar = 0x0000;
3293 UConverterDataISO2022* myData;
3294 ISO2022State *pToU2022State;
3295
3296 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3297 pToU2022State = &myData->toU2022State;
3298
3299 if(myData->key != 0) {
3300 /* continue with a partial escape sequence */
3301 goto escape;
3302 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3303 /* continue with a partial double-byte character */
3304 mySourceChar = args->converter->toUBytes[0];
3305 args->converter->toULength = 0;
3306 targetUniChar = missingCharMarker;
3307 goto getTrailByte;
3308 }
3309
3310 while(mySource < mySourceLimit){
3311
3312 targetUniChar =missingCharMarker;
3313
3314 if(myTarget < args->targetLimit){
3315
3316 mySourceChar= (unsigned char) *mySource++;
3317
3318 switch(mySourceChar){
3319 case UCNV_SI:
3320 pToU2022State->g=0;
3321 if (myData->isEmptySegment) {
3322 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3323 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3324 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3325 args->converter->toUBytes[0] = mySourceChar;
3326 args->converter->toULength = 1;
3327 args->target = myTarget;
3328 args->source = mySource;
3329 return;
3330 }
3331 continue;
3332
3333 case UCNV_SO:
3334 if(pToU2022State->cs[1] != 0) {
3335 pToU2022State->g=1;
3336 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3337 continue;
3338 } else {
3339 /* illegal to have SO before a matching designator */
3340 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3341 break;
3342 }
3343
3344 case ESC_2022:
3345 mySource--;
3346 escape:
3347 {
3348 const char * mySourceBefore = mySource;
3349 int8_t toULengthBefore = args->converter->toULength;
3350
3351 changeState_2022(args->converter,&(mySource),
3352 mySourceLimit, ISO_2022_CN,err);
3353
3354 /* After SO there must be at least one character before a designator (designator error handled separately) */
3355 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3356 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3357 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3358 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3359 }
3360 }
3361
3362 /* invalid or illegal escape sequence */
3363 if(U_FAILURE(*err)){
3364 args->target = myTarget;
3365 args->source = mySource;
3366 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3367 return;
3368 }
3369 continue;
3370
3371 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3372
3373 case CR:
3374 /*falls through*/
3375 case LF:
3376 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3377 /* falls through */
3378 default:
3379 /* convert one or two bytes */
3380 myData->isEmptySegment = FALSE;
3381 if(pToU2022State->g != 0) {
3382 if(mySource < mySourceLimit) {
3383 UConverterSharedData *cnv;
3384 StateEnum tempState;
3385 int32_t tempBufLen;
3386 int leadIsOk, trailIsOk;
3387 uint8_t trailByte;
3388 getTrailByte:
3389 trailByte = (uint8_t)*mySource;
3390 /*
3391 * Ticket 5691: consistent illegal sequences:
3392 * - We include at least the first byte in the illegal sequence.
3393 * - If any of the non-initial bytes could be the start of a character,
3394 * we stop the illegal sequence before the first one of those.
3395 *
3396 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3397 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3398 * Otherwise we convert or report the pair of bytes.
3399 */
3400 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3401 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3402 if (leadIsOk && trailIsOk) {
3403 ++mySource;
3404 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3405 if(tempState >= CNS_11643_0) {
3406 cnv = myData->myConverterArray[CNS_11643];
3407 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3408 tempBuf[1] = (char) (mySourceChar);
3409 tempBuf[2] = (char) trailByte;
3410 tempBufLen = 3;
3411
3412 }else{
3413 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3414 cnv = myData->myConverterArray[tempState];
3415 tempBuf[0] = (char) (mySourceChar);
3416 tempBuf[1] = (char) trailByte;
3417 tempBufLen = 2;
3418 }
3419 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3420 mySourceChar = (mySourceChar << 8) | trailByte;
3421 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3422 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3423 ++mySource;
3424 /* add another bit so that the code below writes 2 bytes in case of error */
3425 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3426 }
3427 if(pToU2022State->g>=2) {
3428 /* return from a single-shift state to the previous one */
3429 pToU2022State->g=pToU2022State->prevG;
3430 }
3431 } else {
3432 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3433 args->converter->toULength = 1;
3434 goto endloop;
3435 }
3436 }
3437 else{
3438 if(mySourceChar <= 0x7f) {
3439 targetUniChar = (UChar) mySourceChar;
3440 }
3441 }
3442 break;
3443 }
3444 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3445 if(args->offsets){
3446 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3447 }
3448 *(myTarget++)=(UChar)targetUniChar;
3449 }
3450 else if(targetUniChar > missingCharMarker){
3451 /* disassemble the surrogate pair and write to output*/
3452 targetUniChar-=0x0010000;
3453 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3454 if(args->offsets){
3455 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3456 }
3457 ++myTarget;
3458 if(myTarget< args->targetLimit){
3459 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3460 if(args->offsets){
3461 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3462 }
3463 ++myTarget;
3464 }else{
3465 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3466 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3467 }
3468
3469 }
3470 else{
3471 /* Call the callback function*/
3472 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3473 break;
3474 }
3475 }
3476 else{
3477 *err =U_BUFFER_OVERFLOW_ERROR;
3478 break;
3479 }
3480 }
3481 endloop:
3482 args->target = myTarget;
3483 args->source = mySource;
3484 }
3485 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3486
3487 static void
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3488 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3489 UConverter *cnv = args->converter;
3490 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3491 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3492 char *p, *subchar;
3493 char buffer[8];
3494 int32_t length;
3495
3496 subchar=(char *)cnv->subChars;
3497 length=cnv->subCharLen; /* assume length==1 for most variants */
3498
3499 p = buffer;
3500 switch(myConverterData->locale[0]){
3501 case 'j':
3502 {
3503 int8_t cs;
3504
3505 if(pFromU2022State->g == 1) {
3506 /* JIS7: switch from G1 to G0 */
3507 pFromU2022State->g = 0;
3508 *p++ = UCNV_SI;
3509 }
3510
3511 cs = pFromU2022State->cs[0];
3512 if(cs != ASCII && cs != JISX201) {
3513 /* not in ASCII or JIS X 0201: switch to ASCII */
3514 pFromU2022State->cs[0] = (int8_t)ASCII;
3515 *p++ = '\x1b';
3516 *p++ = '\x28';
3517 *p++ = '\x42';
3518 }
3519
3520 *p++ = subchar[0];
3521 break;
3522 }
3523 case 'c':
3524 if(pFromU2022State->g != 0) {
3525 /* not in ASCII mode: switch to ASCII */
3526 pFromU2022State->g = 0;
3527 *p++ = UCNV_SI;
3528 }
3529 *p++ = subchar[0];
3530 break;
3531 case 'k':
3532 if(myConverterData->version == 0) {
3533 if(length == 1) {
3534 if((UBool)args->converter->fromUnicodeStatus) {
3535 /* in DBCS mode: switch to SBCS */
3536 args->converter->fromUnicodeStatus = 0;
3537 *p++ = UCNV_SI;
3538 }
3539 *p++ = subchar[0];
3540 } else /* length == 2*/ {
3541 if(!(UBool)args->converter->fromUnicodeStatus) {
3542 /* in SBCS mode: switch to DBCS */
3543 args->converter->fromUnicodeStatus = 1;
3544 *p++ = UCNV_SO;
3545 }
3546 *p++ = subchar[0];
3547 *p++ = subchar[1];
3548 }
3549 break;
3550 } else {
3551 /* save the subconverter's substitution string */
3552 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3553 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3554
3555 /* set our substitution string into the subconverter */
3556 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3557 myConverterData->currentConverter->subCharLen = (int8_t)length;
3558
3559 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3560 args->converter = myConverterData->currentConverter;
3561 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3562 ucnv_cbFromUWriteSub(args, 0, err);
3563 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3564 args->converter = cnv;
3565
3566 /* restore the subconverter's substitution string */
3567 myConverterData->currentConverter->subChars = currentSubChars;
3568 myConverterData->currentConverter->subCharLen = currentSubCharLen;
3569
3570 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3571 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3572 uprv_memcpy(
3573 cnv->charErrorBuffer,
3574 myConverterData->currentConverter->charErrorBuffer,
3575 myConverterData->currentConverter->charErrorBufferLength);
3576 }
3577 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3578 myConverterData->currentConverter->charErrorBufferLength = 0;
3579 }
3580 return;
3581 }
3582 default:
3583 /* not expected */
3584 break;
3585 }
3586 ucnv_cbFromUWriteBytes(args,
3587 buffer, (int32_t)(p - buffer),
3588 offsetIndex, err);
3589 }
3590
3591 /*
3592 * Structure for cloning an ISO 2022 converter into a single memory block.
3593 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3594 * and then ucnv_safeClone() of the sub-converter may additionally align
3595 * currentConverter inside the cloneStruct, for which we need the deadSpace
3596 * after currentConverter.
3597 * This is because UAlignedMemory may be larger than the actually
3598 * necessary alignment size for the platform.
3599 * The other cloneStruct fields will not be moved around,
3600 * and are aligned properly with cloneStruct's alignment.
3601 */
3602 struct cloneStruct
3603 {
3604 UConverter cnv;
3605 UConverter currentConverter;
3606 UAlignedMemory deadSpace;
3607 UConverterDataISO2022 mydata;
3608 };
3609
3610
3611 static UConverter *
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3612 _ISO_2022_SafeClone(
3613 const UConverter *cnv,
3614 void *stackBuffer,
3615 int32_t *pBufferSize,
3616 UErrorCode *status)
3617 {
3618 struct cloneStruct * localClone;
3619 UConverterDataISO2022 *cnvData;
3620 int32_t i, size;
3621
3622 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3623 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3624 return NULL;
3625 }
3626
3627 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3628 localClone = (struct cloneStruct *)stackBuffer;
3629
3630 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3631
3632 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3633 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3634 localClone->cnv.isExtraLocal = TRUE;
3635
3636 /* share the subconverters */
3637
3638 if(cnvData->currentConverter != NULL) {
3639 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3640 localClone->mydata.currentConverter =
3641 ucnv_safeClone(cnvData->currentConverter,
3642 &localClone->currentConverter,
3643 &size, status);
3644 if(U_FAILURE(*status)) {
3645 return NULL;
3646 }
3647 }
3648
3649 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3650 if(cnvData->myConverterArray[i] != NULL) {
3651 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3652 }
3653 }
3654
3655 return &localClone->cnv;
3656 }
3657
3658 static void
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3659 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3660 const USetAdder *sa,
3661 UConverterUnicodeSet which,
3662 UErrorCode *pErrorCode)
3663 {
3664 int32_t i;
3665 UConverterDataISO2022* cnvData;
3666
3667 if (U_FAILURE(*pErrorCode)) {
3668 return;
3669 }
3670 #ifdef U_ENABLE_GENERIC_ISO_2022
3671 if (cnv->sharedData == &_ISO2022Data) {
3672 /* We use UTF-8 in this case */
3673 sa->addRange(sa->set, 0, 0xd7FF);
3674 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3675 return;
3676 }
3677 #endif
3678
3679 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3680
3681 /* open a set and initialize it with code points that are algorithmically round-tripped */
3682 switch(cnvData->locale[0]){
3683 case 'j':
3684 /* include JIS X 0201 which is hardcoded */
3685 sa->add(sa->set, 0xa5);
3686 sa->add(sa->set, 0x203e);
3687 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3688 /* include Latin-1 for some variants of JP */
3689 sa->addRange(sa->set, 0, 0xff);
3690 } else {
3691 /* include ASCII for JP */
3692 sa->addRange(sa->set, 0, 0x7f);
3693 }
3694 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3695 /*
3696 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3697 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3698 * use half-width Katakana.
3699 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3700 * half-width Katakana via the ESC ( I sequence.
3701 * However, we only emit (fromUnicode) half-width Katakana according to the
3702 * definition of each variant.
3703 *
3704 * When including fallbacks,
3705 * we need to include half-width Katakana Unicode code points for all JP variants because
3706 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3707 */
3708 /* include half-width Katakana for JP */
3709 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3710 }
3711 break;
3712 #if !UCONFIG_ONLY_HTML_CONVERSION
3713 case 'c':
3714 case 'z':
3715 /* include ASCII for CN */
3716 sa->addRange(sa->set, 0, 0x7f);
3717 break;
3718 case 'k':
3719 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3720 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3721 cnvData->currentConverter, sa, which, pErrorCode);
3722 /* the loop over myConverterArray[] will simply not find another converter */
3723 break;
3724 #endif
3725 default:
3726 break;
3727 }
3728
3729 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3730 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3731 cnvData->version==0 && i==CNS_11643
3732 ) {
3733 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3734 ucnv_MBCSGetUnicodeSetForBytes(
3735 cnvData->myConverterArray[i],
3736 sa, UCNV_ROUNDTRIP_SET,
3737 0, 0x81, 0x82,
3738 pErrorCode);
3739 }
3740 #endif
3741
3742 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3743 UConverterSetFilter filter;
3744 if(cnvData->myConverterArray[i]!=NULL) {
3745 if(cnvData->locale[0]=='j' && i==JISX208) {
3746 /*
3747 * Only add code points that map to Shift-JIS codes
3748 * corresponding to JIS X 0208.
3749 */
3750 filter=UCNV_SET_FILTER_SJIS;
3751 #if !UCONFIG_ONLY_HTML_CONVERSION
3752 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3753 cnvData->version==0 && i==CNS_11643) {
3754 /*
3755 * Version-specific for CN:
3756 * CN version 0 does not map CNS planes 3..7 although
3757 * they are all available in the CNS conversion table;
3758 * CN version 1 (-EXT) does map them all.
3759 * The two versions create different Unicode sets.
3760 */
3761 filter=UCNV_SET_FILTER_2022_CN;
3762 } else if(i==KSC5601) {
3763 /*
3764 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3765 * are broader than GR94.
3766 */
3767 filter=UCNV_SET_FILTER_GR94DBCS;
3768 #endif
3769 } else {
3770 filter=UCNV_SET_FILTER_NONE;
3771 }
3772 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3773 }
3774 }
3775
3776 /*
3777 * ISO 2022 converters must not convert SO/SI/ESC despite what
3778 * sub-converters do by themselves.
3779 * Remove these characters from the set.
3780 */
3781 sa->remove(sa->set, 0x0e);
3782 sa->remove(sa->set, 0x0f);
3783 sa->remove(sa->set, 0x1b);
3784
3785 /* ISO 2022 converters do not convert C1 controls either */
3786 sa->removeRange(sa->set, 0x80, 0x9f);
3787 }
3788
3789 static const UConverterImpl _ISO2022Impl={
3790 UCNV_ISO_2022,
3791
3792 NULL,
3793 NULL,
3794
3795 _ISO2022Open,
3796 _ISO2022Close,
3797 _ISO2022Reset,
3798
3799 #ifdef U_ENABLE_GENERIC_ISO_2022
3800 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3801 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3802 ucnv_fromUnicode_UTF8,
3803 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3804 #else
3805 NULL,
3806 NULL,
3807 NULL,
3808 NULL,
3809 #endif
3810 NULL,
3811
3812 NULL,
3813 _ISO2022getName,
3814 _ISO_2022_WriteSub,
3815 _ISO_2022_SafeClone,
3816 _ISO_2022_GetUnicodeSet,
3817
3818 NULL,
3819 NULL
3820 };
3821 static const UConverterStaticData _ISO2022StaticData={
3822 sizeof(UConverterStaticData),
3823 "ISO_2022",
3824 2022,
3825 UCNV_IBM,
3826 UCNV_ISO_2022,
3827 1,
3828 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3829 { 0x1a, 0, 0, 0 },
3830 1,
3831 FALSE,
3832 FALSE,
3833 0,
3834 0,
3835 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3836 };
3837 const UConverterSharedData _ISO2022Data=
3838 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3839
3840 /*************JP****************/
3841 static const UConverterImpl _ISO2022JPImpl={
3842 UCNV_ISO_2022,
3843
3844 NULL,
3845 NULL,
3846
3847 _ISO2022Open,
3848 _ISO2022Close,
3849 _ISO2022Reset,
3850
3851 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3852 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3853 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3854 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3855 NULL,
3856
3857 NULL,
3858 _ISO2022getName,
3859 _ISO_2022_WriteSub,
3860 _ISO_2022_SafeClone,
3861 _ISO_2022_GetUnicodeSet,
3862
3863 NULL,
3864 NULL
3865 };
3866 static const UConverterStaticData _ISO2022JPStaticData={
3867 sizeof(UConverterStaticData),
3868 "ISO_2022_JP",
3869 0,
3870 UCNV_IBM,
3871 UCNV_ISO_2022,
3872 1,
3873 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3874 { 0x1a, 0, 0, 0 },
3875 1,
3876 FALSE,
3877 FALSE,
3878 0,
3879 0,
3880 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3881 };
3882
3883 namespace {
3884
3885 const UConverterSharedData _ISO2022JPData=
3886 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3887
3888 } // namespace
3889
3890 #if !UCONFIG_ONLY_HTML_CONVERSION
3891 /************* KR ***************/
3892 static const UConverterImpl _ISO2022KRImpl={
3893 UCNV_ISO_2022,
3894
3895 NULL,
3896 NULL,
3897
3898 _ISO2022Open,
3899 _ISO2022Close,
3900 _ISO2022Reset,
3901
3902 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3903 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3904 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3905 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3906 NULL,
3907
3908 NULL,
3909 _ISO2022getName,
3910 _ISO_2022_WriteSub,
3911 _ISO_2022_SafeClone,
3912 _ISO_2022_GetUnicodeSet,
3913
3914 NULL,
3915 NULL
3916 };
3917 static const UConverterStaticData _ISO2022KRStaticData={
3918 sizeof(UConverterStaticData),
3919 "ISO_2022_KR",
3920 0,
3921 UCNV_IBM,
3922 UCNV_ISO_2022,
3923 1,
3924 3, /* max 3 bytes per UChar: SO+DBCS */
3925 { 0x1a, 0, 0, 0 },
3926 1,
3927 FALSE,
3928 FALSE,
3929 0,
3930 0,
3931 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3932 };
3933
3934 namespace {
3935
3936 const UConverterSharedData _ISO2022KRData=
3937 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3938
3939 } // namespace
3940
3941 /*************** CN ***************/
3942 static const UConverterImpl _ISO2022CNImpl={
3943
3944 UCNV_ISO_2022,
3945
3946 NULL,
3947 NULL,
3948
3949 _ISO2022Open,
3950 _ISO2022Close,
3951 _ISO2022Reset,
3952
3953 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3954 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3955 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3956 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3957 NULL,
3958
3959 NULL,
3960 _ISO2022getName,
3961 _ISO_2022_WriteSub,
3962 _ISO_2022_SafeClone,
3963 _ISO_2022_GetUnicodeSet,
3964
3965 NULL,
3966 NULL
3967 };
3968 static const UConverterStaticData _ISO2022CNStaticData={
3969 sizeof(UConverterStaticData),
3970 "ISO_2022_CN",
3971 0,
3972 UCNV_IBM,
3973 UCNV_ISO_2022,
3974 1,
3975 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3976 { 0x1a, 0, 0, 0 },
3977 1,
3978 FALSE,
3979 FALSE,
3980 0,
3981 0,
3982 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3983 };
3984
3985 namespace {
3986
3987 const UConverterSharedData _ISO2022CNData=
3988 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3989
3990 } // namespace
3991 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3992
3993 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3994