1 /*
2 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
/* Build-time switches that must be defined before the headers below are
 * included.  NOTE(review): their exact effect lives in cjkcodecs.h and the
 * other included headers -- confirm there. */
#define USING_IMPORTED_MAPS
#define USING_BINARY_PAIR_SEARCH
#define EXTERN_JISX0213_PAIR
/* Both "invalid" results of the JIS X 0213:2000 emulation macros expand to
 * MAP_UNMAPPABLE, which is defined further down in this file. */
#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
12
13 #include "cjkcodecs.h"
14 #include "alg_jisx0201.h"
15 #include "emu_jisx0213_2000.h"
16 #include "mappings_jisx0213_pair.h"
17
/* STATE

   state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

   state->c[4]

        00000000
              ||
              |+----  Locked-Shift?
              +-----  ESC Throughout
*/
34
/* Control bytes that the encoder emits and the decoder reacts to. */
#define ESC 0x1B
#define SO 0x0E
#define SI 0x0F
#define LF 0x0A

/* Upper bound on the number of bytes iso2022processesc() scans while looking
 * for the final byte of an escape sequence. */
#define MAX_ESCSEQLEN 16

/* Single-byte charset identifiers; each is the final byte of the escape
 * sequence that designates the charset. */
#define CHARSET_ISO8859_1 'A'
#define CHARSET_ASCII 'B'
#define CHARSET_ISO8859_7 'F'
#define CHARSET_JISX0201_K 'I'
#define CHARSET_JISX0201_R 'J'

/* Double-byte charset identifiers: the final byte with CHARSET_DBCS or-ed in,
 * so 'A' (ISO-8859-1) and 'A'|CHARSET_DBCS (GB 2312) stay distinct. */
#define CHARSET_GB2312 ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208 ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001 ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212 ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565 ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1 ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2 ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2 ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O ('@'|CHARSET_DBCS)

/* Flag bit marking a charset identifier as double-byte. */
#define CHARSET_DBCS 0x80
/* Strip the double-byte flag to recover the escape-sequence final byte. */
#define ESCMARK(mark) ((mark) & 0x7f)

/* True when c can terminate an escape sequence: 'A'..'Z' or '@'. */
#define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
/* True when c2, the byte following ESC, starts a sequence that
 * iso2022processesc() is asked to handle. */
#define IS_ISO2022ESC(c2) \
    ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
     (c2) == '.' || (c2) == '&')
    /* This is not a complete list of ISO-2022 escape sequence headers,
     * but it is enough to implement the CJK instances of ISO-2022. */

/* Sentinel results of the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE 0xFFFF
#define MAP_MULTIPLE_AVAIL 0xFFFE /* for JIS X 0213 */

/* Bits stored in state->c[4]. */
#define F_SHIFTED 0x01
#define F_ESCTHROUGHOUT 0x02

/* Accessors for the designated charsets G0..G3, kept in state->c[0..3].
 * All of them expect a variable named `state` to be in scope. */
#define STATE_SETG(dn, v) do { ((state)->c[dn]) = (v); } while (0)
#define STATE_GETG(dn) ((state)->c[dn])

#define STATE_G0 STATE_GETG(0)
#define STATE_G1 STATE_GETG(1)
#define STATE_G2 STATE_GETG(2)
#define STATE_G3 STATE_GETG(3)
#define STATE_SETG0(v) STATE_SETG(0, v)
#define STATE_SETG1(v) STATE_SETG(1, v)
#define STATE_SETG2(v) STATE_SETG(2, v)
#define STATE_SETG3(v) STATE_SETG(3, v)

/* Accessors for the flag byte state->c[4]. */
#define STATE_SETFLAG(f) do { ((state)->c[4]) |= (f); } while (0)
#define STATE_GETFLAG(f) ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f) do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS() do { ((state)->c[4]) = 0; } while (0)

/* View the opaque `config` pointer (expected to be in scope) as this
 * module's configuration record. */
#define ISO2022_CONFIG ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag) (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT 0x01           /* decoder passes SI/SO through unchanged */
#define USE_G2 0x02             /* decoder accepts G2 designation and SS2 */
#define USE_JISX0208_EXT 0x04   /* decoder accepts the "ESC & @" prefix */
101
102 /*-*- internal data structures -*-*/
103
/* Callback that prepares a charset; the callers in this file treat a
 * non-zero result as failure. */
typedef int (*iso2022_init_func)(void);
/* Callback that decodes the character starting at `data`; returns the code
 * point or MAP_UNMAPPABLE. */
typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
/* Callback that encodes `*length` characters starting at `data`; returns the
 * charset code, MAP_UNMAPPABLE, or MAP_MULTIPLE_AVAIL.  The JIS X 0213
 * encoders may rewrite *length. */
typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);

/* One charset that a codec variant may designate. */
struct iso2022_designation {
    unsigned char mark;   /* CHARSET_* identifier; 0 terminates a table */
    unsigned char plane;  /* graphic set used by the encoder: 0 = G0, 1 = G1, ... */
    unsigned char width;  /* bytes per character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};

/* Per-variant configuration reached through the opaque `config` pointer. */
struct iso2022_config {
    int flags;  /* combination of NO_SHIFT, USE_G2, USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
121
122 /*-*- iso-2022 codec implementation -*-*/
123
CODEC_INIT(iso2022)124 CODEC_INIT(iso2022)
125 {
126 const struct iso2022_designation *desig;
127 for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128 if (desig->initializer != NULL && desig->initializer() != 0)
129 return -1;
130 return 0;
131 }
132
ENCODER_INIT(iso2022)133 ENCODER_INIT(iso2022)
134 {
135 STATE_CLEARFLAGS();
136 STATE_SETG0(CHARSET_ASCII);
137 STATE_SETG1(CHARSET_ASCII);
138 return 0;
139 }
140
ENCODER_RESET(iso2022)141 ENCODER_RESET(iso2022)
142 {
143 if (STATE_GETFLAG(F_SHIFTED)) {
144 WRITEBYTE1(SI);
145 NEXT_OUT(1);
146 STATE_CLEARFLAG(F_SHIFTED);
147 }
148 if (STATE_G0 != CHARSET_ASCII) {
149 WRITEBYTE3(ESC, '(', 'B');
150 NEXT_OUT(3);
151 STATE_SETG0(CHARSET_ASCII);
152 }
153 return 0;
154 }
155
ENCODER(iso2022)156 ENCODER(iso2022)
157 {
158 while (*inpos < inlen) {
159 const struct iso2022_designation *dsg;
160 DBCHAR encoded;
161 Py_UCS4 c = INCHAR1;
162 Py_ssize_t insize;
163
164 if (c < 0x80) {
165 if (STATE_G0 != CHARSET_ASCII) {
166 WRITEBYTE3(ESC, '(', 'B');
167 STATE_SETG0(CHARSET_ASCII);
168 NEXT_OUT(3);
169 }
170 if (STATE_GETFLAG(F_SHIFTED)) {
171 WRITEBYTE1(SI);
172 STATE_CLEARFLAG(F_SHIFTED);
173 NEXT_OUT(1);
174 }
175 WRITEBYTE1((unsigned char)c);
176 NEXT(1, 1);
177 continue;
178 }
179
180 insize = 1;
181
182 encoded = MAP_UNMAPPABLE;
183 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
184 Py_ssize_t length = 1;
185 encoded = dsg->encoder(&c, &length);
186 if (encoded == MAP_MULTIPLE_AVAIL) {
187 /* this implementation won't work for pair
188 * of non-bmp characters. */
189 if (inlen - *inpos < 2) {
190 if (!(flags & MBENC_FLUSH))
191 return MBERR_TOOFEW;
192 length = -1;
193 }
194 else
195 length = 2;
196 encoded = dsg->encoder(&c, &length);
197 if (encoded != MAP_UNMAPPABLE) {
198 insize = length;
199 break;
200 }
201 }
202 else if (encoded != MAP_UNMAPPABLE)
203 break;
204 }
205
206 if (!dsg->mark)
207 return 1;
208 assert(dsg->width == 1 || dsg->width == 2);
209
210 switch (dsg->plane) {
211 case 0: /* G0 */
212 if (STATE_GETFLAG(F_SHIFTED)) {
213 WRITEBYTE1(SI);
214 STATE_CLEARFLAG(F_SHIFTED);
215 NEXT_OUT(1);
216 }
217 if (STATE_G0 != dsg->mark) {
218 if (dsg->width == 1) {
219 WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
220 STATE_SETG0(dsg->mark);
221 NEXT_OUT(3);
222 }
223 else if (dsg->mark == CHARSET_JISX0208) {
224 WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
225 STATE_SETG0(dsg->mark);
226 NEXT_OUT(3);
227 }
228 else {
229 WRITEBYTE4(ESC, '$', '(',
230 ESCMARK(dsg->mark));
231 STATE_SETG0(dsg->mark);
232 NEXT_OUT(4);
233 }
234 }
235 break;
236 case 1: /* G1 */
237 if (STATE_G1 != dsg->mark) {
238 if (dsg->width == 1) {
239 WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
240 STATE_SETG1(dsg->mark);
241 NEXT_OUT(3);
242 }
243 else {
244 WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
245 STATE_SETG1(dsg->mark);
246 NEXT_OUT(4);
247 }
248 }
249 if (!STATE_GETFLAG(F_SHIFTED)) {
250 WRITEBYTE1(SO);
251 STATE_SETFLAG(F_SHIFTED);
252 NEXT_OUT(1);
253 }
254 break;
255 default: /* G2 and G3 is not supported: no encoding in
256 * CJKCodecs are using them yet */
257 return MBERR_INTERNAL;
258 }
259
260 if (dsg->width == 1) {
261 WRITEBYTE1((unsigned char)encoded);
262 NEXT_OUT(1);
263 }
264 else {
265 WRITEBYTE2(encoded >> 8, encoded & 0xff);
266 NEXT_OUT(2);
267 }
268 NEXT_INCHAR(insize);
269 }
270
271 return 0;
272 }
273
DECODER_INIT(iso2022)274 DECODER_INIT(iso2022)
275 {
276 STATE_CLEARFLAGS();
277 STATE_SETG0(CHARSET_ASCII);
278 STATE_SETG1(CHARSET_ASCII);
279 STATE_SETG2(CHARSET_ASCII);
280 return 0;
281 }
282
DECODER_RESET(iso2022)283 DECODER_RESET(iso2022)
284 {
285 STATE_SETG0(CHARSET_ASCII);
286 STATE_CLEARFLAG(F_SHIFTED);
287 return 0;
288 }
289
290 static Py_ssize_t
iso2022processesc(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft)291 iso2022processesc(const void *config, MultibyteCodec_State *state,
292 const unsigned char **inbuf, Py_ssize_t *inleft)
293 {
294 unsigned char charset, designation;
295 Py_ssize_t i, esclen = 0;
296
297 for (i = 1;i < MAX_ESCSEQLEN;i++) {
298 if (i >= *inleft)
299 return MBERR_TOOFEW;
300 if (IS_ESCEND((*inbuf)[i])) {
301 esclen = i + 1;
302 break;
303 }
304 else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
305 (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
306 i += 2;
307 }
308 }
309
310 switch (esclen) {
311 case 0:
312 return 1; /* unterminated escape sequence */
313 case 3:
314 if (INBYTE2 == '$') {
315 charset = INBYTE3 | CHARSET_DBCS;
316 designation = 0;
317 }
318 else {
319 charset = INBYTE3;
320 if (INBYTE2 == '(')
321 designation = 0;
322 else if (INBYTE2 == ')')
323 designation = 1;
324 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
325 designation = 2;
326 else
327 return 3;
328 }
329 break;
330 case 4:
331 if (INBYTE2 != '$')
332 return 4;
333
334 charset = INBYTE4 | CHARSET_DBCS;
335 if (INBYTE3 == '(')
336 designation = 0;
337 else if (INBYTE3 == ')')
338 designation = 1;
339 else
340 return 4;
341 break;
342 case 6: /* designation with prefix */
343 if (CONFIG_ISSET(USE_JISX0208_EXT) &&
344 (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
345 (*inbuf)[5] == 'B') {
346 charset = 'B' | CHARSET_DBCS;
347 designation = 0;
348 }
349 else
350 return 6;
351 break;
352 default:
353 return esclen;
354 }
355
356 /* raise error when the charset is not designated for this encoding */
357 if (charset != CHARSET_ASCII) {
358 const struct iso2022_designation *dsg;
359
360 for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
361 if (dsg->mark == charset)
362 break;
363 }
364 if (!dsg->mark)
365 return esclen;
366 }
367
368 STATE_SETG(designation, charset);
369 *inleft -= esclen;
370 (*inbuf) += esclen;
371 return 0;
372 }
373
/*
 * Decode one ISO-8859-7 byte `c` and emit the result through OUTCHAR().
 *
 * The expansion is an if / else-if chain WITHOUT a final else, so the use
 * site must supply the `else` that handles an unassigned byte.  `c` is
 * evaluated several times; `writer` is not referenced by the expansion
 * itself.
 *
 * The two bit masks select the bytes that map to themselves (0xa0..0xbf)
 * and the bytes that map to 0x02d0 + c (0xb4..0xd3).  The shifts are done
 * on unsigned long because the shift count reaches 31, which is undefined
 * for a signed 32-bit long.
 */
#define ISO8859_7_DECODE(c, writer) \
    if ((c) < 0xa0) { \
        OUTCHAR(c); \
    } else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0)))) { \
        OUTCHAR(c); \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \
               (0xbffffd77UL & (1UL << ((c)-0xb4))))) { \
        OUTCHAR(0x02d0 + (c)); \
    } else if ((c) == 0xa1) { \
        OUTCHAR(0x2018); \
    } else if ((c) == 0xa2) { \
        OUTCHAR(0x2019); \
    } else if ((c) == 0xaf) { \
        OUTCHAR(0x2015); \
    }
389
390 static Py_ssize_t
iso2022processg2(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft,_PyUnicodeWriter * writer)391 iso2022processg2(const void *config, MultibyteCodec_State *state,
392 const unsigned char **inbuf, Py_ssize_t *inleft,
393 _PyUnicodeWriter *writer)
394 {
395 /* not written to use encoder, decoder functions because only few
396 * encodings use G2 designations in CJKCodecs */
397 if (STATE_G2 == CHARSET_ISO8859_1) {
398 if (INBYTE3 < 0x80)
399 OUTCHAR(INBYTE3 + 0x80);
400 else
401 return 3;
402 }
403 else if (STATE_G2 == CHARSET_ISO8859_7) {
404 ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
405 else
406 return 3;
407 }
408 else if (STATE_G2 == CHARSET_ASCII) {
409 if (INBYTE3 & 0x80)
410 return 3;
411 else
412 OUTCHAR(INBYTE3);
413 }
414 else
415 return MBERR_INTERNAL;
416
417 (*inbuf) += 3;
418 *inleft -= 3;
419 return 0;
420 }
421
DECODER(iso2022)422 DECODER(iso2022)
423 {
424 const struct iso2022_designation *dsgcache = NULL;
425
426 while (inleft > 0) {
427 unsigned char c = INBYTE1;
428 Py_ssize_t err;
429
430 if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
431 /* ESC throughout mode:
432 * for non-iso2022 escape sequences */
433 OUTCHAR(c); /* assume as ISO-8859-1 */
434 NEXT_IN(1);
435 if (IS_ESCEND(c)) {
436 STATE_CLEARFLAG(F_ESCTHROUGHOUT);
437 }
438 continue;
439 }
440
441 switch (c) {
442 case ESC:
443 REQUIRE_INBUF(2);
444 if (IS_ISO2022ESC(INBYTE2)) {
445 err = iso2022processesc(config, state,
446 inbuf, &inleft);
447 if (err != 0)
448 return err;
449 }
450 else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
451 REQUIRE_INBUF(3);
452 err = iso2022processg2(config, state,
453 inbuf, &inleft, writer);
454 if (err != 0)
455 return err;
456 }
457 else {
458 OUTCHAR(ESC);
459 STATE_SETFLAG(F_ESCTHROUGHOUT);
460 NEXT_IN(1);
461 }
462 break;
463 case SI:
464 if (CONFIG_ISSET(NO_SHIFT))
465 goto bypass;
466 STATE_CLEARFLAG(F_SHIFTED);
467 NEXT_IN(1);
468 break;
469 case SO:
470 if (CONFIG_ISSET(NO_SHIFT))
471 goto bypass;
472 STATE_SETFLAG(F_SHIFTED);
473 NEXT_IN(1);
474 break;
475 case LF:
476 STATE_CLEARFLAG(F_SHIFTED);
477 OUTCHAR(LF);
478 NEXT_IN(1);
479 break;
480 default:
481 if (c < 0x20) /* C0 */
482 goto bypass;
483 else if (c >= 0x80)
484 return 1;
485 else {
486 const struct iso2022_designation *dsg;
487 unsigned char charset;
488 Py_UCS4 decoded;
489
490 if (STATE_GETFLAG(F_SHIFTED))
491 charset = STATE_G1;
492 else
493 charset = STATE_G0;
494
495 if (charset == CHARSET_ASCII) {
496 bypass:
497 OUTCHAR(c);
498 NEXT_IN(1);
499 break;
500 }
501
502 if (dsgcache != NULL &&
503 dsgcache->mark == charset)
504 dsg = dsgcache;
505 else {
506 for (dsg = CONFIG_DESIGNATIONS;
507 dsg->mark != charset
508 #ifdef Py_DEBUG
509 && dsg->mark != '\0'
510 #endif
511 ; dsg++)
512 {
513 /* noop */
514 }
515 assert(dsg->mark != '\0');
516 dsgcache = dsg;
517 }
518
519 REQUIRE_INBUF(dsg->width);
520 decoded = dsg->decoder(*inbuf);
521 if (decoded == MAP_UNMAPPABLE)
522 return dsg->width;
523
524 if (decoded < 0x10000) {
525 OUTCHAR(decoded);
526 }
527 else if (decoded < 0x30000) {
528 OUTCHAR(decoded);
529 }
530 else { /* JIS X 0213 pairs */
531 OUTCHAR2(decoded >> 16, decoded & 0xffff);
532 }
533 NEXT_IN(dsg->width);
534 }
535 break;
536 }
537 }
538 return 0;
539 }
540
541 /*-*- mapping table holders -*-*/
542
/* Each use defines a file-local table pointer named <enc>_encmap or
 * <enc>_decmap, initially NULL; the *_init() functions below pass their
 * addresses to IMPORT_MAP(). */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)
564
565 /* tw */
566
567 /*-*- mapping access functions -*-*/
568
569 static int
570 ksx1001_init(void)
571 {
572 static int initialized = 0;
573
574 if (!initialized && (
575 IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
576 IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
577 return -1;
578 initialized = 1;
579 return 0;
580 }
581
582 static Py_UCS4
ksx1001_decoder(const unsigned char * data)583 ksx1001_decoder(const unsigned char *data)
584 {
585 Py_UCS4 u;
586 if (TRYMAP_DEC(ksx1001, u, data[0], data[1]))
587 return u;
588 else
589 return MAP_UNMAPPABLE;
590 }
591
592 static DBCHAR
ksx1001_encoder(const Py_UCS4 * data,Py_ssize_t * length)593 ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
594 {
595 DBCHAR coded;
596 assert(*length == 1);
597 if (*data < 0x10000) {
598 if (TRYMAP_ENC(cp949, coded, *data)) {
599 if (!(coded & 0x8000))
600 return coded;
601 }
602 }
603 return MAP_UNMAPPABLE;
604 }
605
606 static int
jisx0208_init(void)607 jisx0208_init(void)
608 {
609 static int initialized = 0;
610
611 if (!initialized && (
612 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
613 IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
614 return -1;
615 initialized = 1;
616 return 0;
617 }
618
619 static Py_UCS4
jisx0208_decoder(const unsigned char * data)620 jisx0208_decoder(const unsigned char *data)
621 {
622 Py_UCS4 u;
623 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
624 return 0xff3c;
625 else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
626 return u;
627 else
628 return MAP_UNMAPPABLE;
629 }
630
631 static DBCHAR
jisx0208_encoder(const Py_UCS4 * data,Py_ssize_t * length)632 jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
633 {
634 DBCHAR coded;
635 assert(*length == 1);
636 if (*data < 0x10000) {
637 if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
638 return 0x2140;
639 else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
640 if (!(coded & 0x8000))
641 return coded;
642 }
643 }
644 return MAP_UNMAPPABLE;
645 }
646
647 static int
jisx0212_init(void)648 jisx0212_init(void)
649 {
650 static int initialized = 0;
651
652 if (!initialized && (
653 IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
654 IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
655 return -1;
656 initialized = 1;
657 return 0;
658 }
659
660 static Py_UCS4
jisx0212_decoder(const unsigned char * data)661 jisx0212_decoder(const unsigned char *data)
662 {
663 Py_UCS4 u;
664 if (TRYMAP_DEC(jisx0212, u, data[0], data[1]))
665 return u;
666 else
667 return MAP_UNMAPPABLE;
668 }
669
670 static DBCHAR
jisx0212_encoder(const Py_UCS4 * data,Py_ssize_t * length)671 jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
672 {
673 DBCHAR coded;
674 assert(*length == 1);
675 if (*data < 0x10000) {
676 if (TRYMAP_ENC(jisxcommon, coded, *data)) {
677 if (coded & 0x8000)
678 return coded & 0x7fff;
679 }
680 }
681 return MAP_UNMAPPABLE;
682 }
683
684 static int
jisx0213_init(void)685 jisx0213_init(void)
686 {
687 static int initialized = 0;
688
689 if (!initialized && (
690 jisx0208_init() ||
691 IMPORT_MAP(jp, jisx0213_bmp,
692 &jisx0213_bmp_encmap, NULL) ||
693 IMPORT_MAP(jp, jisx0213_1_bmp,
694 NULL, &jisx0213_1_bmp_decmap) ||
695 IMPORT_MAP(jp, jisx0213_2_bmp,
696 NULL, &jisx0213_2_bmp_decmap) ||
697 IMPORT_MAP(jp, jisx0213_emp,
698 &jisx0213_emp_encmap, NULL) ||
699 IMPORT_MAP(jp, jisx0213_1_emp,
700 NULL, &jisx0213_1_emp_decmap) ||
701 IMPORT_MAP(jp, jisx0213_2_emp,
702 NULL, &jisx0213_2_emp_decmap) ||
703 IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
704 &jisx0213_pair_decmap)))
705 return -1;
706 initialized = 1;
707 return 0;
708 }
709
/* The EMULATE_JISX0213_2000_* macros presumably test an identifier named
 * `config` against (void *)2000 (defined in emu_jisx0213_2000.h -- confirm).
 * These decoders take no config argument, so `config` is temporarily
 * defined to that constant for the two functions below. */
#define config ((void *)2000)
/* Decode two bytes as JIS X 0213:2000 plane 1.  After the emulation macro,
 * 0x2140 is special-cased to U+FF3C, then the JIS X 0208, plane-1 BMP,
 * plane-1 EMP (result or-ed with 0x20000) and pair maps are tried in that
 * order.  Returns MAP_UNMAPPABLE when nothing matches. */
static Py_UCS4
jisx0213_2000_1_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
    else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
        return 0xff3c;
    else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
        u |= 0x20000;
    else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
        ;
    else
        return MAP_UNMAPPABLE;
    return u;
}

/* Decode two bytes as JIS X 0213:2000 plane 2 using the plane-2 BMP map,
 * then the plane-2 EMP map (result or-ed with 0x20000).  Returns
 * MAP_UNMAPPABLE when neither matches. */
static Py_UCS4
jisx0213_2000_2_decoder(const unsigned char *data)
{
    Py_UCS4 u;
    /* NOTE(review): unlike the plane-1 decoder above, the emulation macro
     * is followed by a plain `if`, not `else if`.  If the macro assigns a
     * value to `u`, the lookups below run anyway and can replace it --
     * check emu_jisx0213_2000.h to see whether an `else` is missing. */
    EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
    if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
        ;
    else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
        u |= 0x20000;
    else
        return MAP_UNMAPPABLE;
    return u;
}
#undef config
745
746 static Py_UCS4
jisx0213_2004_1_decoder(const unsigned char * data)747 jisx0213_2004_1_decoder(const unsigned char *data)
748 {
749 Py_UCS4 u;
750 if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
751 return 0xff3c;
752 else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
753 ;
754 else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
755 ;
756 else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
757 u |= 0x20000;
758 else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
759 ;
760 else
761 return MAP_UNMAPPABLE;
762 return u;
763 }
764
765 static Py_UCS4
jisx0213_2004_2_decoder(const unsigned char * data)766 jisx0213_2004_2_decoder(const unsigned char *data)
767 {
768 Py_UCS4 u;
769 if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
770 ;
771 else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
772 u |= 0x20000;
773 else
774 return MAP_UNMAPPABLE;
775 return u;
776 }
777
778 static DBCHAR
jisx0213_encoder(const Py_UCS4 * data,Py_ssize_t * length,void * config)779 jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
780 {
781 DBCHAR coded;
782
783 switch (*length) {
784 case 1: /* first character */
785 if (*data >= 0x10000) {
786 if ((*data) >> 16 == 0x20000 >> 16) {
787 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
788 else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
789 return coded;
790 }
791 return MAP_UNMAPPABLE;
792 }
793
794 EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
795 else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
796 if (coded == MULTIC)
797 return MAP_MULTIPLE_AVAIL;
798 }
799 else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
800 if (coded & 0x8000)
801 return MAP_UNMAPPABLE;
802 }
803 else
804 return MAP_UNMAPPABLE;
805 return coded;
806
807 case 2: /* second character of unicode pair */
808 coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
809 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
810 if (coded != DBCINV)
811 return coded;
812 /* fall through */
813
814 case -1: /* flush unterminated */
815 *length = 1;
816 coded = find_pairencmap((ucs2_t)data[0], 0,
817 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
818 if (coded == DBCINV)
819 return MAP_UNMAPPABLE;
820 else
821 return coded;
822 break;
823
824 default:
825 return MAP_UNMAPPABLE;
826 }
827 }
828
829 static DBCHAR
jisx0213_2000_1_encoder(const Py_UCS4 * data,Py_ssize_t * length)830 jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
831 {
832 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
833 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
834 return coded;
835 else if (coded & 0x8000)
836 return MAP_UNMAPPABLE;
837 else
838 return coded;
839 }
840
841 static DBCHAR
jisx0213_2000_1_encoder_paironly(const Py_UCS4 * data,Py_ssize_t * length)842 jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
843 {
844 DBCHAR coded;
845 Py_ssize_t ilength = *length;
846
847 coded = jisx0213_encoder(data, length, (void *)2000);
848 switch (ilength) {
849 case 1:
850 if (coded == MAP_MULTIPLE_AVAIL)
851 return MAP_MULTIPLE_AVAIL;
852 else
853 return MAP_UNMAPPABLE;
854 case 2:
855 if (*length != 2)
856 return MAP_UNMAPPABLE;
857 else
858 return coded;
859 default:
860 return MAP_UNMAPPABLE;
861 }
862 }
863
864 static DBCHAR
jisx0213_2000_2_encoder(const Py_UCS4 * data,Py_ssize_t * length)865 jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
866 {
867 DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
868 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
869 return coded;
870 else if (coded & 0x8000)
871 return coded & 0x7fff;
872 else
873 return MAP_UNMAPPABLE;
874 }
875
876 static DBCHAR
jisx0213_2004_1_encoder(const Py_UCS4 * data,Py_ssize_t * length)877 jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
878 {
879 DBCHAR coded = jisx0213_encoder(data, length, NULL);
880 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
881 return coded;
882 else if (coded & 0x8000)
883 return MAP_UNMAPPABLE;
884 else
885 return coded;
886 }
887
888 static DBCHAR
jisx0213_2004_1_encoder_paironly(const Py_UCS4 * data,Py_ssize_t * length)889 jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
890 {
891 DBCHAR coded;
892 Py_ssize_t ilength = *length;
893
894 coded = jisx0213_encoder(data, length, NULL);
895 switch (ilength) {
896 case 1:
897 if (coded == MAP_MULTIPLE_AVAIL)
898 return MAP_MULTIPLE_AVAIL;
899 else
900 return MAP_UNMAPPABLE;
901 case 2:
902 if (*length != 2)
903 return MAP_UNMAPPABLE;
904 else
905 return coded;
906 default:
907 return MAP_UNMAPPABLE;
908 }
909 }
910
911 static DBCHAR
jisx0213_2004_2_encoder(const Py_UCS4 * data,Py_ssize_t * length)912 jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
913 {
914 DBCHAR coded = jisx0213_encoder(data, length, NULL);
915 if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
916 return coded;
917 else if (coded & 0x8000)
918 return coded & 0x7fff;
919 else
920 return MAP_UNMAPPABLE;
921 }
922
923 static Py_UCS4
jisx0201_r_decoder(const unsigned char * data)924 jisx0201_r_decoder(const unsigned char *data)
925 {
926 Py_UCS4 u;
927 JISX0201_R_DECODE_CHAR(*data, u)
928 else
929 return MAP_UNMAPPABLE;
930 return u;
931 }
932
933 static DBCHAR
jisx0201_r_encoder(const Py_UCS4 * data,Py_ssize_t * length)934 jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
935 {
936 DBCHAR coded;
937 JISX0201_R_ENCODE(*data, coded)
938 else
939 return MAP_UNMAPPABLE;
940 return coded;
941 }
942
943 static Py_UCS4
jisx0201_k_decoder(const unsigned char * data)944 jisx0201_k_decoder(const unsigned char *data)
945 {
946 Py_UCS4 u;
947 JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
948 else
949 return MAP_UNMAPPABLE;
950 return u;
951 }
952
953 static DBCHAR
jisx0201_k_encoder(const Py_UCS4 * data,Py_ssize_t * length)954 jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
955 {
956 DBCHAR coded;
957 JISX0201_K_ENCODE(*data, coded)
958 else
959 return MAP_UNMAPPABLE;
960 return coded - 0x80;
961 }
962
963 static int
gb2312_init(void)964 gb2312_init(void)
965 {
966 static int initialized = 0;
967
968 if (!initialized && (
969 IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
970 IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
971 return -1;
972 initialized = 1;
973 return 0;
974 }
975
976 static Py_UCS4
gb2312_decoder(const unsigned char * data)977 gb2312_decoder(const unsigned char *data)
978 {
979 Py_UCS4 u;
980 if (TRYMAP_DEC(gb2312, u, data[0], data[1]))
981 return u;
982 else
983 return MAP_UNMAPPABLE;
984 }
985
986 static DBCHAR
gb2312_encoder(const Py_UCS4 * data,Py_ssize_t * length)987 gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
988 {
989 DBCHAR coded;
990 assert(*length == 1);
991 if (*data < 0x10000) {
992 if (TRYMAP_ENC(gbcommon, coded, *data)) {
993 if (!(coded & 0x8000))
994 return coded;
995 }
996 }
997 return MAP_UNMAPPABLE;
998 }
999
1000
1001 static Py_UCS4
dummy_decoder(const unsigned char * data)1002 dummy_decoder(const unsigned char *data)
1003 {
1004 return MAP_UNMAPPABLE;
1005 }
1006
1007 static DBCHAR
dummy_encoder(const Py_UCS4 * data,Py_ssize_t * length)1008 dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
1009 {
1010 return MAP_UNMAPPABLE;
1011 }
1012
1013 /*-*- registry tables -*-*/
1014
/* Initializers for struct iso2022_designation, in field order:
 * { mark, plane, width, initializer, decoder, encoder }. */
#define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \
                              ksx1001_init, \
                              ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \
                              ksx1001_init, \
                              ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \
                              NULL, \
                              jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K { CHARSET_JISX0201_K, 0, 1, \
                              NULL, \
                              jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208 { CHARSET_JISX0208, 0, 2, \
                            jisx0208_init, \
                            jisx0208_decoder, jisx0208_encoder }
/* Same callbacks as REGISTRY_JISX0208, registered under the '@' mark. */
#define REGISTRY_JISX0208_O { CHARSET_JISX0208_O, 0, 2, \
                              jisx0208_init, \
                              jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212 { CHARSET_JISX0212, 0, 2, \
                            jisx0212_init, \
                            jisx0212_decoder, jisx0212_encoder }
#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2, \
                                   jisx0213_init, \
                                   jisx0213_2000_1_decoder, \
                                   jisx0213_2000_1_encoder }
/* Variant whose encoder only accepts two-character sequences. */
#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
                                            jisx0213_init, \
                                            jisx0213_2000_1_decoder, \
                                            jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2, \
                                   jisx0213_init, \
                                   jisx0213_2000_2_decoder, \
                                   jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2, \
                                   jisx0213_init, \
                                   jisx0213_2004_1_decoder, \
                                   jisx0213_2004_1_encoder }
/* Variant whose encoder only accepts two-character sequences. */
#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
                                            jisx0213_init, \
                                            jisx0213_2004_1_decoder, \
                                            jisx0213_2004_1_encoder_paironly }
/* Shares the CHARSET_JISX0213_2 mark with REGISTRY_JISX0213_2000_2. */
#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2, \
                                   jisx0213_init, \
                                   jisx0213_2004_2_decoder, \
                                   jisx0213_2004_2_encoder }
#define REGISTRY_GB2312 { CHARSET_GB2312, 0, 2, \
                          gb2312_init, \
                          gb2312_decoder, gb2312_encoder }
/* NOTE(review): the two CNS 11643 entries are not used by any table in
 * this file, and the cns11643_* callbacks they name are not defined in
 * the visible source -- expanding them would presumably fail to compile. */
#define REGISTRY_CNS11643_1 { CHARSET_CNS11643_1, 1, 2, \
                              cns11643_init, \
                              cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2 { CHARSET_CNS11643_2, 2, 2, \
                              cns11643_init, \
                              cns11643_2_decoder, cns11643_2_encoder }
/* G2 charsets: decoded inline by iso2022processg2(), so the callbacks are
 * the always-unmappable dummies. */
#define REGISTRY_ISO8859_1 { CHARSET_ISO8859_1, 2, 1, \
                             NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7 { CHARSET_ISO8859_7, 2, 1, \
                             NULL, dummy_decoder, dummy_encoder }
/* Terminator: mark == 0 ends every designation table. */
#define REGISTRY_SENTINEL { 0, }
/* Define iso2022_<var>_config from the flag set `attrs` and the already
 * defined table iso2022_<var>_designations. */
#define CONFIGDEF(var, attrs) \
    static const struct iso2022_config iso2022_##var##_config = { \
        attrs, iso2022_##var##_designations \
    };
1078
/* Designation tables, one per codec variant, each ended by
 * REGISTRY_SENTINEL.  Order matters for encoding: the encoder walks a
 * table from the start and uses the first entry whose encoder accepts the
 * character. */

/* ISO-2022-KR: KS X 1001 on G1; shift handling enabled (flags 0). */
static const struct iso2022_designation iso2022_kr_designations[] = {
    REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
};
CONFIGDEF(kr, 0)

/* ISO-2022-JP */
static const struct iso2022_designation iso2022_jp_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_SENTINEL
};
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)

/* ISO-2022-JP-1: adds JIS X 0212. */
static const struct iso2022_designation iso2022_jp_1_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)

/* ISO-2022-JP-2: adds KS X 1001 and GB 2312 on G0 plus the two G2
 * charsets; the only variant with USE_G2. */
static const struct iso2022_designation iso2022_jp_2_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
    REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
    REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)

/* ISO-2022-JP-2004: the pair-only entry comes first so that two-character
 * sequences are tried before JIS X 0208 gets the first character alone. */
static const struct iso2022_designation iso2022_jp_2004_designations[] = {
    REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)

/* ISO-2022-JP-3: same layout as jp_2004 with the JIS X 0213:2000
 * callbacks. */
static const struct iso2022_designation iso2022_jp_3_designations[] = {
    REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
    REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)

/* ISO-2022-JP-EXT: jp_1 plus the JIS X 0201 Katakana entry. */
static const struct iso2022_designation iso2022_jp_ext_designations[] = {
    REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
    REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1120
1121
/* This module exports no mapping tables of its own; the maps it uses are
 * imported by the *_init() functions above. */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST

/* One codec-list entry: the codec name "iso2022_<variation>", its
 * configuration record, the shared init function and the stateful method
 * set.  NOTE(review): the entry layout is defined in cjkcodecs.h. */
#define ISO2022_CODEC(variation) { \
    "iso2022_" #variation, \
    &iso2022_##variation##_config, \
    iso2022_codec_init, \
    _STATEFUL_METHODS(iso2022) \
},

BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST

I_AM_A_MODULE_FOR(iso2022)
1144