1 /*
2 **********************************************************************
3 * Copyright (C) 2000-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvhz.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2000oct16
12 * created by: Ram Viswanadha
13 * 10/31/2000 Ram Implemented offsets logic function
14 *
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
20
21 #include "cmemory.h"
22 #include "unicode/ucnv.h"
23 #include "unicode/ucnv_cb.h"
24 #include "unicode/uset.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27
28 #define UCNV_TILDE 0x7E /* ~ */
29 #define UCNV_OPEN_BRACE 0x7B /* { */
30 #define UCNV_CLOSE_BRACE 0x7D /* } */
31 #define SB_ESCAPE "\x7E\x7D"
32 #define DB_ESCAPE "\x7E\x7B"
33 #define TILDE_ESCAPE "\x7E\x7E"
34 #define ESC_LEN 2
35
36
/*
 * Appends `len` bytes taken from `strToAppend` to the output of a
 * from-Unicode conversion.
 *
 *   args          pointer to the from-Unicode argument struct; its `target`,
 *                 `offsets` and `converter` members are used
 *   targetIndex   lvalue: next write position in args->target, advanced per byte
 *   targetLength  number of bytes available in args->target
 *   strToAppend   lvalue: pointer to the bytes to write, advanced per byte
 *   err           pointer to the error code; set to U_BUFFER_OVERFLOW_ERROR
 *                 when at least one byte did not fit
 *   len           lvalue: number of bytes to append; counted down (ends at -1)
 *   sourceIndex   source index just past the current code unit; the offset
 *                 recorded for each written byte is sourceIndex-1
 *
 * Bytes that do not fit into the target are stored in the converter's
 * charErrorBuffer instead.
 *
 * NOTE: when args->offsets is non-NULL the macro writes through, and advances,
 * a variable named `offsets` that must exist in the caller's scope; it is not
 * a macro parameter.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement (safe in an unbraced if/else); every use must be followed
 * by a semicolon.
 */
#define CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex) \
    do { \
        while((len)-- > 0) { \
            if((targetIndex) < (targetLength)) { \
                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                if((args)->offsets != NULL) { \
                    *(offsets++) = (sourceIndex) - 1; \
                } \
                (targetIndex)++; \
            } else { \
                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                *(err) = U_BUFFER_OVERFLOW_ERROR; \
            } \
            (strToAppend)++; \
        } \
    } while(0)
53
54
55 typedef struct{
56 UConverter* gbConverter;
57 int32_t targetIndex;
58 int32_t sourceIndex;
59 UBool isEscapeAppended;
60 UBool isStateDBCS;
61 UBool isTargetUCharDBCS;
62 }UConverterDataHZ;
63
64
65
66 static void
_HZOpen(UConverter * cnv,const char * name,const char * locale,uint32_t options,UErrorCode * errorCode)67 _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
68 cnv->toUnicodeStatus = 0;
69 cnv->fromUnicodeStatus= 0;
70 cnv->mode=0;
71 cnv->fromUChar32=0x0000;
72 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
73 if(cnv->extraInfo != NULL){
74 uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
75 ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
76 }
77 else {
78 *errorCode = U_MEMORY_ALLOCATION_ERROR;
79 return;
80 }
81 }
82
83 static void
_HZClose(UConverter * cnv)84 _HZClose(UConverter *cnv){
85 if(cnv->extraInfo != NULL) {
86 ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
87 if(!cnv->isExtraLocal) {
88 uprv_free(cnv->extraInfo);
89 }
90 cnv->extraInfo = NULL;
91 }
92 }
93
94 static void
_HZReset(UConverter * cnv,UConverterResetChoice choice)95 _HZReset(UConverter *cnv, UConverterResetChoice choice){
96 if(choice<=UCNV_RESET_TO_UNICODE) {
97 cnv->toUnicodeStatus = 0;
98 cnv->mode=0;
99 if(cnv->extraInfo != NULL){
100 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
101 }
102 }
103 if(choice!=UCNV_RESET_TO_UNICODE) {
104 cnv->fromUnicodeStatus= 0;
105 cnv->fromUChar32=0x0000;
106 if(cnv->extraInfo != NULL){
107 ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
108 ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
109 ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
110 ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
111 }
112 }
113 }
114
115 /**************************************HZ Encoding*************************************************
116 * Rules for HZ encoding
117 *
118 * In ASCII mode, a byte is interpreted as an ASCII character, unless a
119 * '~' is encountered. The character '~' is an escape character. By
120 * convention, it must be immediately followed ONLY by '~', '{' or '\n'
121 * (<LF>), with the following special meaning.
122
123 * 1. The escape sequence '~~' is interpreted as a '~'.
124 * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
125 * 3. The escape sequence '~\n' is a line-continuation marker to be
126 * consumed with no output produced.
127 * In GB mode, characters are interpreted two bytes at a time as (pure)
128 * GB codes until the escape-from-GB code '~}' is read. This code
129 * switches the mode from GB back to ASCII. (Note that the escape-
130 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
131 *
 * Source: RFC 1843 (HZ data format); see also RFC 1842 (HZ-GB-2312 for Internet messages)
133 */
134
135
136 static void
UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)137 UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
138 UErrorCode* err){
139 char tempBuf[2];
140 const char *mySource = ( char *) args->source;
141 UChar *myTarget = args->target;
142 const char *mySourceLimit = args->sourceLimit;
143 UChar32 targetUniChar = 0x0000;
144 int32_t mySourceChar = 0x0000;
145 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
146 tempBuf[0]=0;
147 tempBuf[1]=0;
148 if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
149 *err = U_ILLEGAL_ARGUMENT_ERROR;
150 return;
151 }
152
153 while(mySource< mySourceLimit){
154
155 if(myTarget < args->targetLimit){
156
157 mySourceChar= (unsigned char) *mySource++;
158
159 if(args->converter->mode == UCNV_TILDE) {
160 /* second byte after ~ */
161 args->converter->mode=0;
162 switch(mySourceChar) {
163 case 0x0A:
164 /* no output for ~\n (line-continuation marker) */
165 continue;
166 case UCNV_TILDE:
167 if(args->offsets) {
168 args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
169 }
170 *(myTarget++)=(UChar)mySourceChar;
171 continue;
172 case UCNV_OPEN_BRACE:
173 myData->isStateDBCS = TRUE;
174 continue;
175 case UCNV_CLOSE_BRACE:
176 myData->isStateDBCS = FALSE;
177 continue;
178 default:
179 /* if the first byte is equal to TILDE and the trail byte
180 * is not a valid byte then it is an error condition
181 */
182 mySourceChar = 0x7e00 | mySourceChar;
183 targetUniChar = 0xffff;
184 break;
185 }
186 } else if(myData->isStateDBCS) {
187 if(args->converter->toUnicodeStatus == 0x00){
188 /* lead byte */
189 if(mySourceChar == UCNV_TILDE) {
190 args->converter->mode = UCNV_TILDE;
191 } else {
192 /* add another bit to distinguish a 0 byte from not having seen a lead byte */
193 args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
194 }
195 continue;
196 }
197 else{
198 /* trail byte */
199 uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
200 if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) &&
201 (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
202 ) {
203 tempBuf[0] = (char) (leadByte+0x80) ;
204 tempBuf[1] = (char) (mySourceChar+0x80);
205 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
206 tempBuf, 2, args->converter->useFallback);
207 } else {
208 targetUniChar = 0xffff;
209 }
210 /* add another bit so that the code below writes 2 bytes in case of error */
211 mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
212 args->converter->toUnicodeStatus =0x00;
213 }
214 }
215 else{
216 if(mySourceChar == UCNV_TILDE) {
217 args->converter->mode = UCNV_TILDE;
218 continue;
219 } else if(mySourceChar <= 0x7f) {
220 targetUniChar = (UChar)mySourceChar; /* ASCII */
221 } else {
222 targetUniChar = 0xffff;
223 }
224 }
225 if(targetUniChar < 0xfffe){
226 if(args->offsets) {
227 args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
228 }
229
230 *(myTarget++)=(UChar)targetUniChar;
231 }
232 else /* targetUniChar>=0xfffe */ {
233 if(targetUniChar == 0xfffe){
234 *err = U_INVALID_CHAR_FOUND;
235 }
236 else{
237 *err = U_ILLEGAL_CHAR_FOUND;
238 }
239 if(mySourceChar > 0xff){
240 args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
241 args->converter->toUBytes[1] = (uint8_t)mySourceChar;
242 args->converter->toULength=2;
243 }
244 else{
245 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
246 args->converter->toULength=1;
247 }
248 break;
249 }
250 }
251 else{
252 *err =U_BUFFER_OVERFLOW_ERROR;
253 break;
254 }
255 }
256
257 args->target = myTarget;
258 args->source = mySource;
259 }
260
261
262 static void
UConverter_fromUnicode_HZ_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)263 UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
264 UErrorCode * err){
265 const UChar *mySource = args->source;
266 char *myTarget = args->target;
267 int32_t* offsets = args->offsets;
268 int32_t mySourceIndex = 0;
269 int32_t myTargetIndex = 0;
270 int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
271 int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
272 int32_t length=0;
273 uint32_t targetUniChar = 0x0000;
274 UChar32 mySourceChar = 0x0000;
275 UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
276 UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
277 UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
278 int len =0;
279 const char* escSeq=NULL;
280
281 if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
282 *err = U_ILLEGAL_ARGUMENT_ERROR;
283 return;
284 }
285 if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
286 goto getTrail;
287 }
288 /*writing the char to the output stream */
289 while (mySourceIndex < mySourceLength){
290 targetUniChar = missingCharMarker;
291 if (myTargetIndex < targetLength){
292
293 mySourceChar = (UChar) mySource[mySourceIndex++];
294
295
296 oldIsTargetUCharDBCS = isTargetUCharDBCS;
297 if(mySourceChar ==UCNV_TILDE){
298 /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
299 len = ESC_LEN;
300 escSeq = TILDE_ESCAPE;
301 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
302 continue;
303 } else if(mySourceChar <= 0x7f) {
304 length = 1;
305 targetUniChar = mySourceChar;
306 } else {
307 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
308 mySourceChar,&targetUniChar,args->converter->useFallback);
309 /* we can only use lead bytes 21..7D and trail bytes 21..7E */
310 if( length == 2 &&
311 (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
312 (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
313 ) {
314 targetUniChar -= 0x8080;
315 } else {
316 targetUniChar = missingCharMarker;
317 }
318 }
319 if (targetUniChar != missingCharMarker){
320 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
321 if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
322 /*Shifting from a double byte to single byte mode*/
323 if(!isTargetUCharDBCS){
324 len =ESC_LEN;
325 escSeq = SB_ESCAPE;
326 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
327 myConverterData->isEscapeAppended = TRUE;
328 }
329 else{ /* Shifting from a single byte to double byte mode*/
330 len =ESC_LEN;
331 escSeq = DB_ESCAPE;
332 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
333 myConverterData->isEscapeAppended = TRUE;
334
335 }
336 }
337
338 if(isTargetUCharDBCS){
339 if( myTargetIndex <targetLength){
340 myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
341 if(offsets){
342 *(offsets++) = mySourceIndex-1;
343 }
344 if(myTargetIndex < targetLength){
345 myTarget[myTargetIndex++] =(char) targetUniChar;
346 if(offsets){
347 *(offsets++) = mySourceIndex-1;
348 }
349 }else{
350 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
351 *err = U_BUFFER_OVERFLOW_ERROR;
352 }
353 }else{
354 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
355 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
356 *err = U_BUFFER_OVERFLOW_ERROR;
357 }
358
359 }else{
360 if( myTargetIndex <targetLength){
361 myTarget[myTargetIndex++] = (char) (targetUniChar );
362 if(offsets){
363 *(offsets++) = mySourceIndex-1;
364 }
365
366 }else{
367 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
368 *err = U_BUFFER_OVERFLOW_ERROR;
369 }
370 }
371
372 }
373 else{
374 /* oops.. the code point is unassigned */
375 /*Handle surrogates */
376 /*check if the char is a First surrogate*/
377 if(UTF_IS_SURROGATE(mySourceChar)) {
378 if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
379 args->converter->fromUChar32=mySourceChar;
380 getTrail:
381 /*look ahead to find the trail surrogate*/
382 if(mySourceIndex < mySourceLength) {
383 /* test the following code unit */
384 UChar trail=(UChar) args->source[mySourceIndex];
385 if(UTF_IS_SECOND_SURROGATE(trail)) {
386 ++mySourceIndex;
387 mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
388 args->converter->fromUChar32=0x00;
389 /* there are no surrogates in GB2312*/
390 *err = U_INVALID_CHAR_FOUND;
391 /* exit this condition tree */
392 } else {
393 /* this is an unmatched lead code unit (1st surrogate) */
394 /* callback(illegal) */
395 *err=U_ILLEGAL_CHAR_FOUND;
396 }
397 } else {
398 /* no more input */
399 *err = U_ZERO_ERROR;
400 }
401 } else {
402 /* this is an unmatched trail code unit (2nd surrogate) */
403 /* callback(illegal) */
404 *err=U_ILLEGAL_CHAR_FOUND;
405 }
406 } else {
407 /* callback(unassigned) for a BMP code point */
408 *err = U_INVALID_CHAR_FOUND;
409 }
410
411 args->converter->fromUChar32=mySourceChar;
412 break;
413 }
414 }
415 else{
416 *err = U_BUFFER_OVERFLOW_ERROR;
417 break;
418 }
419 targetUniChar=missingCharMarker;
420 }
421
422 args->target += myTargetIndex;
423 args->source += mySourceIndex;
424 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
425 }
426
427 static void
_HZ_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)428 _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
429 UConverter *cnv = args->converter;
430 UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
431 char *p;
432 char buffer[4];
433 p = buffer;
434
435 if( convData->isTargetUCharDBCS){
436 *p++= UCNV_TILDE;
437 *p++= UCNV_CLOSE_BRACE;
438 convData->isTargetUCharDBCS=FALSE;
439 }
440 *p++= (char)cnv->subChars[0];
441
442 ucnv_cbFromUWriteBytes(args,
443 buffer, (int32_t)(p - buffer),
444 offsetIndex, err);
445 }
446
447 /*
448 * Structure for cloning an HZ converter into a single memory block.
449 * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
450 * and then ucnv_safeClone() of the sub-converter may additionally align
451 * subCnv inside the cloneHZStruct, for which we need the deadSpace after
452 * subCnv. This is because UAlignedMemory may be larger than the actually
453 * necessary alignment size for the platform.
454 * The other cloneHZStruct fields will not be moved around,
455 * and are aligned properly with cloneHZStruct's alignment.
456 */
457 struct cloneHZStruct
458 {
459 UConverter cnv;
460 UConverter subCnv;
461 UAlignedMemory deadSpace;
462 UConverterDataHZ mydata;
463 };
464
465
466 static UConverter *
_HZ_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)467 _HZ_SafeClone(const UConverter *cnv,
468 void *stackBuffer,
469 int32_t *pBufferSize,
470 UErrorCode *status)
471 {
472 struct cloneHZStruct * localClone;
473 int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
474
475 if (U_FAILURE(*status)){
476 return 0;
477 }
478
479 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
480 *pBufferSize = bufferSizeNeeded;
481 return 0;
482 }
483
484 localClone = (struct cloneHZStruct *)stackBuffer;
485 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
486
487 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
488 localClone->cnv.extraInfo = &localClone->mydata;
489 localClone->cnv.isExtraLocal = TRUE;
490
491 /* deep-clone the sub-converter */
492 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
493 ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
494 ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
495
496 return &localClone->cnv;
497 }
498
499 static void
_HZ_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)500 _HZ_GetUnicodeSet(const UConverter *cnv,
501 const USetAdder *sa,
502 UConverterUnicodeSet which,
503 UErrorCode *pErrorCode) {
504 /* HZ converts all of ASCII */
505 sa->addRange(sa->set, 0, 0x7f);
506
507 /* add all of the code points that the sub-converter handles */
508 ucnv_MBCSGetFilteredUnicodeSetForUnicode(
509 ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
510 sa, which, UCNV_SET_FILTER_HZ,
511 pErrorCode);
512 }
513
514 static const UConverterImpl _HZImpl={
515
516 UCNV_HZ,
517
518 NULL,
519 NULL,
520
521 _HZOpen,
522 _HZClose,
523 _HZReset,
524
525 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
526 UConverter_toUnicode_HZ_OFFSETS_LOGIC,
527 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
528 UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
529 NULL,
530
531 NULL,
532 NULL,
533 _HZ_WriteSub,
534 _HZ_SafeClone,
535 _HZ_GetUnicodeSet
536 };
537
538 static const UConverterStaticData _HZStaticData={
539 sizeof(UConverterStaticData),
540 "HZ",
541 0,
542 UCNV_IBM,
543 UCNV_HZ,
544 1,
545 4,
546 { 0x1a, 0, 0, 0 },
547 1,
548 FALSE,
549 FALSE,
550 0,
551 0,
552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
553
554 };
555
556
557 const UConverterSharedData _HZData={
558 sizeof(UConverterSharedData),
559 ~((uint32_t) 0),
560 NULL,
561 NULL,
562 &_HZStaticData,
563 FALSE,
564 &_HZImpl,
565 0
566 };
567
568 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
569