• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3  *
4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
5  */
6 
7 #define USING_IMPORTED_MAPS
8 #define USING_BINARY_PAIR_SEARCH
9 #define EXTERN_JISX0213_PAIR
10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
12 
13 #include "cjkcodecs.h"
14 #include "alg_jisx0201.h"
15 #include "emu_jisx0213_2000.h"
16 #include "mappings_jisx0213_pair.h"
17 
18 /* STATE
19 
20    state->c[0-3]
21 
22     00000000
23     ||^^^^^|
24     |+-----+----  G0-3 Character Set
25     +-----------  Is G0-3 double byte?
26 
27    state->c[4]
28 
29     00000000
30           ||
31           |+----  Locked-Shift?
32           +-----  ESC Throughout
33 */
34 
/* Control characters the codec treats specially. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound (in bytes, ESC included) on an escape sequence the decoder
 * is willing to scan. */
#define MAX_ESCSEQLEN           16

/* Bit or-ed into a character set mark to flag a double-byte set, and the
 * inverse operation that recovers the plain final byte. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)

/* Single-byte character sets, identified by their final byte. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character sets: final byte with CHARSET_DBCS set. */
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)

/* True for a byte that terminates an escape sequence ('@' or 'A'..'Z'). */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')

/* True when c2, the byte following ESC, starts an escape sequence this
 * codec interprets.  This is not a complete list of ISO-2022 escape
 * sequence headers, but it is enough to implement the CJK instances of
 * ISO-2022. */
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')

/* Sentinel results of the per-charset encode/decode hooks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */
72 
/* Bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Designation slots: state->c[dn] holds the character set mark currently
 * designated to G<dn>.  All of these expect a variable named `state` to be
 * in scope. */
#define STATE_GETG(dn)          ((state)->c[dn])
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)

#define STATE_G0                ((state)->c[0])
#define STATE_G1                ((state)->c[1])
#define STATE_G2                ((state)->c[2])
#define STATE_G3                ((state)->c[3])
#define STATE_SETG0(v)          do { ((state)->c[0]) = (v); } while (0)
#define STATE_SETG1(v)          do { ((state)->c[1]) = (v); } while (0)
#define STATE_SETG2(v)          do { ((state)->c[2]) = (v); } while (0)
#define STATE_SETG3(v)          do { ((state)->c[3]) = (v); } while (0)

/* Flag byte helpers (state->c[4]). */
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)
92 
/* Values for iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04

/* Typed view of the opaque `config` pointer that must be in scope, plus
 * shorthands for its two members. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
101 
102 /*-*- internal data structures -*-*/
103 
104 typedef int (*iso2022_init_func)(void);
105 typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data);
106 typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length);
107 
108 struct iso2022_designation {
109     unsigned char mark;
110     unsigned char plane;
111     unsigned char width;
112     iso2022_init_func initializer;
113     iso2022_decode_func decoder;
114     iso2022_encode_func encoder;
115 };
116 
/* Static description of one ISO-2022 variant. */
struct iso2022_config {
    /* bitwise OR of NO_SHIFT, USE_G2 and USE_JISX0208_EXT */
    int flags;
    /* non-ASCII designations; the table ends at the entry with mark == 0 */
    const struct iso2022_designation *designations;
};
121 
122 /*-*- iso-2022 codec implementation -*-*/
123 
CODEC_INIT(iso2022)124 CODEC_INIT(iso2022)
125 {
126     const struct iso2022_designation *desig;
127     for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128         if (desig->initializer != NULL && desig->initializer() != 0)
129             return -1;
130     return 0;
131 }
132 
ENCODER_INIT(iso2022)133 ENCODER_INIT(iso2022)
134 {
135     STATE_CLEARFLAGS();
136     STATE_SETG0(CHARSET_ASCII);
137     STATE_SETG1(CHARSET_ASCII);
138     return 0;
139 }
140 
ENCODER_RESET(iso2022)141 ENCODER_RESET(iso2022)
142 {
143     if (STATE_GETFLAG(F_SHIFTED)) {
144         WRITEBYTE1(SI);
145         NEXT_OUT(1);
146         STATE_CLEARFLAG(F_SHIFTED);
147     }
148     if (STATE_G0 != CHARSET_ASCII) {
149         WRITEBYTE3(ESC, '(', 'B');
150         NEXT_OUT(3);
151         STATE_SETG0(CHARSET_ASCII);
152     }
153     return 0;
154 }
155 
ENCODER(iso2022)156 ENCODER(iso2022)
157 {
158     while (*inpos < inlen) {
159         const struct iso2022_designation *dsg;
160         DBCHAR encoded;
161         Py_UCS4 c = INCHAR1;
162         Py_ssize_t insize;
163 
164         if (c < 0x80) {
165             if (STATE_G0 != CHARSET_ASCII) {
166                 WRITEBYTE3(ESC, '(', 'B');
167                 STATE_SETG0(CHARSET_ASCII);
168                 NEXT_OUT(3);
169             }
170             if (STATE_GETFLAG(F_SHIFTED)) {
171                 WRITEBYTE1(SI);
172                 STATE_CLEARFLAG(F_SHIFTED);
173                 NEXT_OUT(1);
174             }
175             WRITEBYTE1((unsigned char)c);
176             NEXT(1, 1);
177             continue;
178         }
179 
180         insize = 1;
181 
182         encoded = MAP_UNMAPPABLE;
183         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
184             Py_ssize_t length = 1;
185             encoded = dsg->encoder(&c, &length);
186             if (encoded == MAP_MULTIPLE_AVAIL) {
187                 /* this implementation won't work for pair
188                  * of non-bmp characters. */
189                 if (inlen - *inpos < 2) {
190                     if (!(flags & MBENC_FLUSH))
191                         return MBERR_TOOFEW;
192                     length = -1;
193                 }
194                 else
195                     length = 2;
196                 encoded = dsg->encoder(&c, &length);
197                 if (encoded != MAP_UNMAPPABLE) {
198                     insize = length;
199                     break;
200                 }
201             }
202             else if (encoded != MAP_UNMAPPABLE)
203                 break;
204         }
205 
206         if (!dsg->mark)
207             return 1;
208         assert(dsg->width == 1 || dsg->width == 2);
209 
210         switch (dsg->plane) {
211         case 0: /* G0 */
212             if (STATE_GETFLAG(F_SHIFTED)) {
213                 WRITEBYTE1(SI);
214                 STATE_CLEARFLAG(F_SHIFTED);
215                 NEXT_OUT(1);
216             }
217             if (STATE_G0 != dsg->mark) {
218                 if (dsg->width == 1) {
219                     WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
220                     STATE_SETG0(dsg->mark);
221                     NEXT_OUT(3);
222                 }
223                 else if (dsg->mark == CHARSET_JISX0208) {
224                     WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
225                     STATE_SETG0(dsg->mark);
226                     NEXT_OUT(3);
227                 }
228                 else {
229                     WRITEBYTE4(ESC, '$', '(',
230                         ESCMARK(dsg->mark));
231                     STATE_SETG0(dsg->mark);
232                     NEXT_OUT(4);
233                 }
234             }
235             break;
236         case 1: /* G1 */
237             if (STATE_G1 != dsg->mark) {
238                 if (dsg->width == 1) {
239                     WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
240                     STATE_SETG1(dsg->mark);
241                     NEXT_OUT(3);
242                 }
243                 else {
244                     WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
245                     STATE_SETG1(dsg->mark);
246                     NEXT_OUT(4);
247                 }
248             }
249             if (!STATE_GETFLAG(F_SHIFTED)) {
250                 WRITEBYTE1(SO);
251                 STATE_SETFLAG(F_SHIFTED);
252                 NEXT_OUT(1);
253             }
254             break;
255         default: /* G2 and G3 is not supported: no encoding in
256                   * CJKCodecs are using them yet */
257             return MBERR_INTERNAL;
258         }
259 
260         if (dsg->width == 1) {
261             WRITEBYTE1((unsigned char)encoded);
262             NEXT_OUT(1);
263         }
264         else {
265             WRITEBYTE2(encoded >> 8, encoded & 0xff);
266             NEXT_OUT(2);
267         }
268         NEXT_INCHAR(insize);
269     }
270 
271     return 0;
272 }
273 
DECODER_INIT(iso2022)274 DECODER_INIT(iso2022)
275 {
276     STATE_CLEARFLAGS();
277     STATE_SETG0(CHARSET_ASCII);
278     STATE_SETG1(CHARSET_ASCII);
279     STATE_SETG2(CHARSET_ASCII);
280     return 0;
281 }
282 
DECODER_RESET(iso2022)283 DECODER_RESET(iso2022)
284 {
285     STATE_SETG0(CHARSET_ASCII);
286     STATE_CLEARFLAG(F_SHIFTED);
287     return 0;
288 }
289 
290 static Py_ssize_t
iso2022processesc(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft)291 iso2022processesc(const void *config, MultibyteCodec_State *state,
292                   const unsigned char **inbuf, Py_ssize_t *inleft)
293 {
294     unsigned char charset, designation;
295     Py_ssize_t i, esclen = 0;
296 
297     for (i = 1;i < MAX_ESCSEQLEN;i++) {
298         if (i >= *inleft)
299             return MBERR_TOOFEW;
300         if (IS_ESCEND((*inbuf)[i])) {
301             esclen = i + 1;
302             break;
303         }
304         else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
305                  (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
306             i += 2;
307         }
308     }
309 
310     switch (esclen) {
311     case 0:
312         return 1; /* unterminated escape sequence */
313     case 3:
314         if (INBYTE2 == '$') {
315             charset = INBYTE3 | CHARSET_DBCS;
316             designation = 0;
317         }
318         else {
319             charset = INBYTE3;
320             if (INBYTE2 == '(')
321                 designation = 0;
322             else if (INBYTE2 == ')')
323                 designation = 1;
324             else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
325                 designation = 2;
326             else
327                 return 3;
328         }
329         break;
330     case 4:
331         if (INBYTE2 != '$')
332             return 4;
333 
334         charset = INBYTE4 | CHARSET_DBCS;
335         if (INBYTE3 == '(')
336             designation = 0;
337         else if (INBYTE3 == ')')
338             designation = 1;
339         else
340             return 4;
341         break;
342     case 6: /* designation with prefix */
343         if (CONFIG_ISSET(USE_JISX0208_EXT) &&
344             (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
345             (*inbuf)[5] == 'B') {
346             charset = 'B' | CHARSET_DBCS;
347             designation = 0;
348         }
349         else
350             return 6;
351         break;
352     default:
353         return esclen;
354     }
355 
356     /* raise error when the charset is not designated for this encoding */
357     if (charset != CHARSET_ASCII) {
358         const struct iso2022_designation *dsg;
359 
360         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
361             if (dsg->mark == charset)
362                 break;
363         }
364         if (!dsg->mark)
365             return esclen;
366     }
367 
368     STATE_SETG(designation, charset);
369     *inleft -= esclen;
370     (*inbuf) += esclen;
371     return 0;
372 }
373 
/* Emit (through OUTCHAR) the code point for the ISO-8859-7 byte `c`.
 *
 * Expands to an if / else-if chain WITHOUT a final else: the caller must
 * append its own `else` branch, which is taken for bytes that have no
 * mapping.  `c` is evaluated several times, and `writer` is not referenced
 * by this macro's own text.
 *
 * The bit masks are tested with unsigned operands because the shift count
 * reaches 31, and shifting a signed 1L that far is undefined where long is
 * 32 bits wide. */
#define ISO8859_7_DECODE(c, writer)                                  \
    if ((c) < 0xa0) {                                                \
        OUTCHAR(c);                                                  \
    } else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0)))) { \
        OUTCHAR(c);                                                  \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||         \
             (0xbffffd77UL & (1UL << ((c)-0xb4))))) {                \
        OUTCHAR(0x02d0 + (c));                                       \
    } else if ((c) == 0xa1) {                                        \
        OUTCHAR(0x2018);                                             \
    } else if ((c) == 0xa2) {                                        \
        OUTCHAR(0x2019);                                             \
    } else if ((c) == 0xaf) {                                        \
        OUTCHAR(0x2015);                                             \
    }
389 
390 static Py_ssize_t
iso2022processg2(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft,_PyUnicodeWriter * writer)391 iso2022processg2(const void *config, MultibyteCodec_State *state,
392                  const unsigned char **inbuf, Py_ssize_t *inleft,
393                  _PyUnicodeWriter *writer)
394 {
395     /* not written to use encoder, decoder functions because only few
396      * encodings use G2 designations in CJKCodecs */
397     if (STATE_G2 == CHARSET_ISO8859_1) {
398         if (INBYTE3 < 0x80)
399             OUTCHAR(INBYTE3 + 0x80);
400         else
401             return 3;
402     }
403     else if (STATE_G2 == CHARSET_ISO8859_7) {
404         ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
405         else
406             return 3;
407     }
408     else if (STATE_G2 == CHARSET_ASCII) {
409         if (INBYTE3 & 0x80)
410             return 3;
411         else
412             OUTCHAR(INBYTE3);
413     }
414     else
415         return MBERR_INTERNAL;
416 
417     (*inbuf) += 3;
418     *inleft -= 3;
419     return 0;
420 }
421 
DECODER(iso2022)422 DECODER(iso2022)
423 {
424     const struct iso2022_designation *dsgcache = NULL;
425 
426     while (inleft > 0) {
427         unsigned char c = INBYTE1;
428         Py_ssize_t err;
429 
430         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
431             /* ESC throughout mode:
432              * for non-iso2022 escape sequences */
433             OUTCHAR(c); /* assume as ISO-8859-1 */
434             NEXT_IN(1);
435             if (IS_ESCEND(c)) {
436                 STATE_CLEARFLAG(F_ESCTHROUGHOUT);
437             }
438             continue;
439         }
440 
441         switch (c) {
442         case ESC:
443             REQUIRE_INBUF(2);
444             if (IS_ISO2022ESC(INBYTE2)) {
445                 err = iso2022processesc(config, state,
446                                         inbuf, &inleft);
447                 if (err != 0)
448                     return err;
449             }
450             else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
451                 REQUIRE_INBUF(3);
452                 err = iso2022processg2(config, state,
453                                        inbuf, &inleft, writer);
454                 if (err != 0)
455                     return err;
456             }
457             else {
458                 OUTCHAR(ESC);
459                 STATE_SETFLAG(F_ESCTHROUGHOUT);
460                 NEXT_IN(1);
461             }
462             break;
463         case SI:
464             if (CONFIG_ISSET(NO_SHIFT))
465                 goto bypass;
466             STATE_CLEARFLAG(F_SHIFTED);
467             NEXT_IN(1);
468             break;
469         case SO:
470             if (CONFIG_ISSET(NO_SHIFT))
471                 goto bypass;
472             STATE_SETFLAG(F_SHIFTED);
473             NEXT_IN(1);
474             break;
475         case LF:
476             STATE_CLEARFLAG(F_SHIFTED);
477             OUTCHAR(LF);
478             NEXT_IN(1);
479             break;
480         default:
481             if (c < 0x20) /* C0 */
482                 goto bypass;
483             else if (c >= 0x80)
484                 return 1;
485             else {
486                 const struct iso2022_designation *dsg;
487                 unsigned char charset;
488                 Py_UCS4 decoded;
489 
490                 if (STATE_GETFLAG(F_SHIFTED))
491                     charset = STATE_G1;
492                 else
493                     charset = STATE_G0;
494 
495                 if (charset == CHARSET_ASCII) {
496 bypass:
497                     OUTCHAR(c);
498                     NEXT_IN(1);
499                     break;
500                 }
501 
502                 if (dsgcache != NULL &&
503                     dsgcache->mark == charset)
504                         dsg = dsgcache;
505                 else {
506                     for (dsg = CONFIG_DESIGNATIONS;
507                          dsg->mark != charset
508 #ifdef Py_DEBUG
509                             && dsg->mark != '\0'
510 #endif
511                          ; dsg++)
512                     {
513                         /* noop */
514                     }
515                     assert(dsg->mark != '\0');
516                     dsgcache = dsg;
517                 }
518 
519                 REQUIRE_INBUF(dsg->width);
520                 decoded = dsg->decoder(*inbuf);
521                 if (decoded == MAP_UNMAPPABLE)
522                     return dsg->width;
523 
524                 if (decoded < 0x10000) {
525                     OUTCHAR(decoded);
526                 }
527                 else if (decoded < 0x30000) {
528                     OUTCHAR(decoded);
529                 }
530                 else { /* JIS X 0213 pairs */
531                     OUTCHAR2(decoded >> 16, decoded & 0xffff);
532                 }
533                 NEXT_IN(dsg->width);
534             }
535             break;
536         }
537     }
538     return 0;
539 }
540 
541 /*-*- mapping table holders -*-*/
542 
543 #define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
544 #define DECMAP(enc) static const decode_map *enc##_decmap = NULL;
545 
546 /* kr */
547 ENCMAP(cp949)
DECMAP(ksx1001)548 DECMAP(ksx1001)
549 
550 /* jp */
551 ENCMAP(jisxcommon)
552 DECMAP(jisx0208)
553 DECMAP(jisx0212)
554 ENCMAP(jisx0213_bmp)
555 DECMAP(jisx0213_1_bmp)
556 DECMAP(jisx0213_2_bmp)
557 ENCMAP(jisx0213_emp)
558 DECMAP(jisx0213_1_emp)
559 DECMAP(jisx0213_2_emp)
560 
561 /* cn */
562 ENCMAP(gbcommon)
563 DECMAP(gb2312)
564 
565 /* tw */
566 
567 /*-*- mapping access functions -*-*/
568 
569 static int
570 ksx1001_init(void)
571 {
572     static int initialized = 0;
573 
574     if (!initialized && (
575                     IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
576                     IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
577         return -1;
578     initialized = 1;
579     return 0;
580 }
581 
582 static Py_UCS4
ksx1001_decoder(const unsigned char * data)583 ksx1001_decoder(const unsigned char *data)
584 {
585     Py_UCS4 u;
586     if (TRYMAP_DEC(ksx1001, u, data[0], data[1]))
587         return u;
588     else
589         return MAP_UNMAPPABLE;
590 }
591 
592 static DBCHAR
ksx1001_encoder(const Py_UCS4 * data,Py_ssize_t * length)593 ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length)
594 {
595     DBCHAR coded;
596     assert(*length == 1);
597     if (*data < 0x10000) {
598         if (TRYMAP_ENC(cp949, coded, *data)) {
599             if (!(coded & 0x8000))
600                 return coded;
601         }
602     }
603     return MAP_UNMAPPABLE;
604 }
605 
606 static int
jisx0208_init(void)607 jisx0208_init(void)
608 {
609     static int initialized = 0;
610 
611     if (!initialized && (
612                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
613                     IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
614         return -1;
615     initialized = 1;
616     return 0;
617 }
618 
619 static Py_UCS4
jisx0208_decoder(const unsigned char * data)620 jisx0208_decoder(const unsigned char *data)
621 {
622     Py_UCS4 u;
623     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
624         return 0xff3c;
625     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
626         return u;
627     else
628         return MAP_UNMAPPABLE;
629 }
630 
631 static DBCHAR
jisx0208_encoder(const Py_UCS4 * data,Py_ssize_t * length)632 jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length)
633 {
634     DBCHAR coded;
635     assert(*length == 1);
636     if (*data < 0x10000) {
637         if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
638             return 0x2140;
639         else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
640             if (!(coded & 0x8000))
641                 return coded;
642         }
643     }
644     return MAP_UNMAPPABLE;
645 }
646 
647 static int
jisx0212_init(void)648 jisx0212_init(void)
649 {
650     static int initialized = 0;
651 
652     if (!initialized && (
653                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
654                     IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
655         return -1;
656     initialized = 1;
657     return 0;
658 }
659 
660 static Py_UCS4
jisx0212_decoder(const unsigned char * data)661 jisx0212_decoder(const unsigned char *data)
662 {
663     Py_UCS4 u;
664     if (TRYMAP_DEC(jisx0212, u, data[0], data[1]))
665         return u;
666     else
667         return MAP_UNMAPPABLE;
668 }
669 
670 static DBCHAR
jisx0212_encoder(const Py_UCS4 * data,Py_ssize_t * length)671 jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length)
672 {
673     DBCHAR coded;
674     assert(*length == 1);
675     if (*data < 0x10000) {
676         if (TRYMAP_ENC(jisxcommon, coded, *data)) {
677             if (coded & 0x8000)
678                 return coded & 0x7fff;
679         }
680     }
681     return MAP_UNMAPPABLE;
682 }
683 
684 static int
jisx0213_init(void)685 jisx0213_init(void)
686 {
687     static int initialized = 0;
688 
689     if (!initialized && (
690                     jisx0208_init() ||
691                     IMPORT_MAP(jp, jisx0213_bmp,
692                                &jisx0213_bmp_encmap, NULL) ||
693                     IMPORT_MAP(jp, jisx0213_1_bmp,
694                                NULL, &jisx0213_1_bmp_decmap) ||
695                     IMPORT_MAP(jp, jisx0213_2_bmp,
696                                NULL, &jisx0213_2_bmp_decmap) ||
697                     IMPORT_MAP(jp, jisx0213_emp,
698                                &jisx0213_emp_encmap, NULL) ||
699                     IMPORT_MAP(jp, jisx0213_1_emp,
700                                NULL, &jisx0213_1_emp_decmap) ||
701                     IMPORT_MAP(jp, jisx0213_2_emp,
702                                NULL, &jisx0213_2_emp_decmap) ||
703                     IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
704                                &jisx0213_pair_decmap)))
705         return -1;
706     initialized = 1;
707     return 0;
708 }
709 
710 #define config ((void *)2000)
711 static Py_UCS4
jisx0213_2000_1_decoder(const unsigned char * data)712 jisx0213_2000_1_decoder(const unsigned char *data)
713 {
714     Py_UCS4 u;
715     EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
716     else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
717         return 0xff3c;
718     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
719         ;
720     else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
721         ;
722     else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
723         u |= 0x20000;
724     else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
725         ;
726     else
727         return MAP_UNMAPPABLE;
728     return u;
729 }
730 
731 static Py_UCS4
jisx0213_2000_2_decoder(const unsigned char * data)732 jisx0213_2000_2_decoder(const unsigned char *data)
733 {
734     Py_UCS4 u;
735     EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1])
736     if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
737         ;
738     else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
739         u |= 0x20000;
740     else
741         return MAP_UNMAPPABLE;
742     return u;
743 }
744 #undef config
745 
746 static Py_UCS4
jisx0213_2004_1_decoder(const unsigned char * data)747 jisx0213_2004_1_decoder(const unsigned char *data)
748 {
749     Py_UCS4 u;
750     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
751         return 0xff3c;
752     else if (TRYMAP_DEC(jisx0208, u, data[0], data[1]))
753         ;
754     else if (TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]))
755         ;
756     else if (TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]))
757         u |= 0x20000;
758     else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
759         ;
760     else
761         return MAP_UNMAPPABLE;
762     return u;
763 }
764 
765 static Py_UCS4
jisx0213_2004_2_decoder(const unsigned char * data)766 jisx0213_2004_2_decoder(const unsigned char *data)
767 {
768     Py_UCS4 u;
769     if (TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]))
770         ;
771     else if (TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]))
772         u |= 0x20000;
773     else
774         return MAP_UNMAPPABLE;
775     return u;
776 }
777 
778 static DBCHAR
jisx0213_encoder(const Py_UCS4 * data,Py_ssize_t * length,void * config)779 jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config)
780 {
781     DBCHAR coded;
782 
783     switch (*length) {
784     case 1: /* first character */
785         if (*data >= 0x10000) {
786             if ((*data) >> 16 == 0x20000 >> 16) {
787                 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
788                 else if (TRYMAP_ENC(jisx0213_emp, coded, (*data) & 0xffff))
789                     return coded;
790             }
791             return MAP_UNMAPPABLE;
792         }
793 
794         EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
795         else if (TRYMAP_ENC(jisx0213_bmp, coded, *data)) {
796             if (coded == MULTIC)
797                 return MAP_MULTIPLE_AVAIL;
798         }
799         else if (TRYMAP_ENC(jisxcommon, coded, *data)) {
800             if (coded & 0x8000)
801                 return MAP_UNMAPPABLE;
802         }
803         else
804             return MAP_UNMAPPABLE;
805         return coded;
806 
807     case 2: /* second character of unicode pair */
808         coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
809                                 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
810         if (coded != DBCINV)
811             return coded;
812         /* fall through */
813 
814     case -1: /* flush unterminated */
815         *length = 1;
816         coded = find_pairencmap((ucs2_t)data[0], 0,
817                                 jisx0213_pair_encmap, JISX0213_ENCPAIRS);
818         if (coded == DBCINV)
819             return MAP_UNMAPPABLE;
820         else
821             return coded;
822         break;
823 
824     default:
825         return MAP_UNMAPPABLE;
826     }
827 }
828 
829 static DBCHAR
jisx0213_2000_1_encoder(const Py_UCS4 * data,Py_ssize_t * length)830 jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
831 {
832     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
833     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
834         return coded;
835     else if (coded & 0x8000)
836         return MAP_UNMAPPABLE;
837     else
838         return coded;
839 }
840 
841 static DBCHAR
jisx0213_2000_1_encoder_paironly(const Py_UCS4 * data,Py_ssize_t * length)842 jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
843 {
844     DBCHAR coded;
845     Py_ssize_t ilength = *length;
846 
847     coded = jisx0213_encoder(data, length, (void *)2000);
848     switch (ilength) {
849     case 1:
850         if (coded == MAP_MULTIPLE_AVAIL)
851             return MAP_MULTIPLE_AVAIL;
852         else
853             return MAP_UNMAPPABLE;
854     case 2:
855         if (*length != 2)
856             return MAP_UNMAPPABLE;
857         else
858             return coded;
859     default:
860         return MAP_UNMAPPABLE;
861     }
862 }
863 
864 static DBCHAR
jisx0213_2000_2_encoder(const Py_UCS4 * data,Py_ssize_t * length)865 jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
866 {
867     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
868     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
869         return coded;
870     else if (coded & 0x8000)
871         return coded & 0x7fff;
872     else
873         return MAP_UNMAPPABLE;
874 }
875 
876 static DBCHAR
jisx0213_2004_1_encoder(const Py_UCS4 * data,Py_ssize_t * length)877 jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length)
878 {
879     DBCHAR coded = jisx0213_encoder(data, length, NULL);
880     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
881         return coded;
882     else if (coded & 0x8000)
883         return MAP_UNMAPPABLE;
884     else
885         return coded;
886 }
887 
888 static DBCHAR
jisx0213_2004_1_encoder_paironly(const Py_UCS4 * data,Py_ssize_t * length)889 jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length)
890 {
891     DBCHAR coded;
892     Py_ssize_t ilength = *length;
893 
894     coded = jisx0213_encoder(data, length, NULL);
895     switch (ilength) {
896     case 1:
897         if (coded == MAP_MULTIPLE_AVAIL)
898             return MAP_MULTIPLE_AVAIL;
899         else
900             return MAP_UNMAPPABLE;
901     case 2:
902         if (*length != 2)
903             return MAP_UNMAPPABLE;
904         else
905             return coded;
906     default:
907         return MAP_UNMAPPABLE;
908     }
909 }
910 
911 static DBCHAR
jisx0213_2004_2_encoder(const Py_UCS4 * data,Py_ssize_t * length)912 jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length)
913 {
914     DBCHAR coded = jisx0213_encoder(data, length, NULL);
915     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
916         return coded;
917     else if (coded & 0x8000)
918         return coded & 0x7fff;
919     else
920         return MAP_UNMAPPABLE;
921 }
922 
923 static Py_UCS4
jisx0201_r_decoder(const unsigned char * data)924 jisx0201_r_decoder(const unsigned char *data)
925 {
926     Py_UCS4 u;
927     JISX0201_R_DECODE_CHAR(*data, u)
928     else
929         return MAP_UNMAPPABLE;
930     return u;
931 }
932 
933 static DBCHAR
jisx0201_r_encoder(const Py_UCS4 * data,Py_ssize_t * length)934 jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length)
935 {
936     DBCHAR coded;
937     JISX0201_R_ENCODE(*data, coded)
938     else
939         return MAP_UNMAPPABLE;
940     return coded;
941 }
942 
943 static Py_UCS4
jisx0201_k_decoder(const unsigned char * data)944 jisx0201_k_decoder(const unsigned char *data)
945 {
946     Py_UCS4 u;
947     JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
948     else
949         return MAP_UNMAPPABLE;
950     return u;
951 }
952 
953 static DBCHAR
jisx0201_k_encoder(const Py_UCS4 * data,Py_ssize_t * length)954 jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length)
955 {
956     DBCHAR coded;
957     JISX0201_K_ENCODE(*data, coded)
958     else
959         return MAP_UNMAPPABLE;
960     return coded - 0x80;
961 }
962 
963 static int
gb2312_init(void)964 gb2312_init(void)
965 {
966     static int initialized = 0;
967 
968     if (!initialized && (
969                     IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
970                     IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
971         return -1;
972     initialized = 1;
973     return 0;
974 }
975 
976 static Py_UCS4
gb2312_decoder(const unsigned char * data)977 gb2312_decoder(const unsigned char *data)
978 {
979     Py_UCS4 u;
980     if (TRYMAP_DEC(gb2312, u, data[0], data[1]))
981         return u;
982     else
983         return MAP_UNMAPPABLE;
984 }
985 
986 static DBCHAR
gb2312_encoder(const Py_UCS4 * data,Py_ssize_t * length)987 gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length)
988 {
989     DBCHAR coded;
990     assert(*length == 1);
991     if (*data < 0x10000) {
992         if (TRYMAP_ENC(gbcommon, coded, *data)) {
993             if (!(coded & 0x8000))
994                 return coded;
995         }
996     }
997     return MAP_UNMAPPABLE;
998 }
999 
1000 
1001 static Py_UCS4
dummy_decoder(const unsigned char * data)1002 dummy_decoder(const unsigned char *data)
1003 {
1004     return MAP_UNMAPPABLE;
1005 }
1006 
1007 static DBCHAR
dummy_encoder(const Py_UCS4 * data,Py_ssize_t * length)1008 dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length)
1009 {
1010     return MAP_UNMAPPABLE;
1011 }
1012 
/*-*- registry tables -*-*/

/* Initializers for entries of the struct iso2022_designation tables.
 * Each one lists, in order: the CHARSET_* identifier, two small integers
 * (presumably the G0-G3 slot the set is designated to and the number of
 * bytes per character -- confirm against struct iso2022_designation),
 * the init function (NULL when none is needed), the decoder and the
 * encoder. */

#define REGISTRY_KSX1001_G0 { \
    CHARSET_KSX1001, 0, 2, \
    ksx1001_init, ksx1001_decoder, ksx1001_encoder }

#define REGISTRY_KSX1001_G1 { \
    CHARSET_KSX1001, 1, 2, \
    ksx1001_init, ksx1001_decoder, ksx1001_encoder }

#define REGISTRY_JISX0201_R { \
    CHARSET_JISX0201_R, 0, 1, \
    NULL, jisx0201_r_decoder, jisx0201_r_encoder }

#define REGISTRY_JISX0201_K { \
    CHARSET_JISX0201_K, 0, 1, \
    NULL, jisx0201_k_decoder, jisx0201_k_encoder }

#define REGISTRY_JISX0208 { \
    CHARSET_JISX0208, 0, 2, \
    jisx0208_init, jisx0208_decoder, jisx0208_encoder }

/* Same callbacks as REGISTRY_JISX0208, registered under the
 * CHARSET_JISX0208_O identifier. */
#define REGISTRY_JISX0208_O { \
    CHARSET_JISX0208_O, 0, 2, \
    jisx0208_init, jisx0208_decoder, jisx0208_encoder }

#define REGISTRY_JISX0212 { \
    CHARSET_JISX0212, 0, 2, \
    jisx0212_init, jisx0212_decoder, jisx0212_encoder }

#define REGISTRY_JISX0213_2000_1 { \
    CHARSET_JISX0213_2000_1, 0, 2, \
    jisx0213_init, jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }

/* Differs from REGISTRY_JISX0213_2000_1 only in the encoder. */
#define REGISTRY_JISX0213_2000_1_PAIRONLY { \
    CHARSET_JISX0213_2000_1, 0, 2, \
    jisx0213_init, jisx0213_2000_1_decoder, \
    jisx0213_2000_1_encoder_paironly }

#define REGISTRY_JISX0213_2000_2 { \
    CHARSET_JISX0213_2, 0, 2, \
    jisx0213_init, jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

#define REGISTRY_JISX0213_2004_1 { \
    CHARSET_JISX0213_2004_1, 0, 2, \
    jisx0213_init, jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }

/* Differs from REGISTRY_JISX0213_2004_1 only in the encoder. */
#define REGISTRY_JISX0213_2004_1_PAIRONLY { \
    CHARSET_JISX0213_2004_1, 0, 2, \
    jisx0213_init, jisx0213_2004_1_decoder, \
    jisx0213_2004_1_encoder_paironly }

/* Uses the same CHARSET_JISX0213_2 identifier as the 2000 variant. */
#define REGISTRY_JISX0213_2004_2 { \
    CHARSET_JISX0213_2, 0, 2, \
    jisx0213_init, jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

#define REGISTRY_GB2312 { \
    CHARSET_GB2312, 0, 2, \
    gb2312_init, gb2312_decoder, gb2312_encoder }

#define REGISTRY_CNS11643_1 { \
    CHARSET_CNS11643_1, 1, 2, \
    cns11643_init, cns11643_1_decoder, cns11643_1_encoder }

#define REGISTRY_CNS11643_2 { \
    CHARSET_CNS11643_2, 2, 2, \
    cns11643_init, cns11643_2_decoder, cns11643_2_encoder }

/* The two ISO 8859 entries use the dummy callbacks, which always report
 * MAP_UNMAPPABLE. */
#define REGISTRY_ISO8859_1 { \
    CHARSET_ISO8859_1, 2, 1, \
    NULL, dummy_decoder, dummy_encoder }

#define REGISTRY_ISO8859_7 { \
    CHARSET_ISO8859_7, 2, 1, \
    NULL, dummy_decoder, dummy_encoder }

/* Terminating entry of every designation table: first field is 0. */
#define REGISTRY_SENTINEL { 0, }
/* Define the static object iso2022_<var>_config, initialised with the
 * flag set `attrs` followed by the iso2022_<var>_designations table.
 * The table must be declared before the macro is used. */
#define CONFIGDEF(var, attrs) \
    static const struct iso2022_config iso2022_##var##_config = \
        { attrs, iso2022_##var##_designations };
1078 
1079 static const struct iso2022_designation iso2022_kr_designations[] = {
1080     REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1081 };
1082 CONFIGDEF(kr, 0)
1083 
1084 static const struct iso2022_designation iso2022_jp_designations[] = {
1085     REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1086     REGISTRY_SENTINEL
1087 };
1088 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1089 
1090 static const struct iso2022_designation iso2022_jp_1_designations[] = {
1091     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1092     REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1093 };
1094 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1095 
1096 static const struct iso2022_designation iso2022_jp_2_designations[] = {
1097     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1098     REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1099     REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1100 };
1101 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1102 
1103 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1104     REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1105     REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1106 };
1107 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1108 
1109 static const struct iso2022_designation iso2022_jp_3_designations[] = {
1110     REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1111     REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1112 };
1113 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1114 
1115 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1116     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1117     REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1118 };
1119 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1120 
1121 
1122 BEGIN_MAPPINGS_LIST
1123   /* no mapping table here */
1124 END_MAPPINGS_LIST
1125 
/* Expand to one codec-list entry for the given variant: the name string
 * "iso2022_<variation>", a pointer to iso2022_<variation>_config,
 * iso2022_codec_init, and the methods supplied by
 * _STATEFUL_METHODS(iso2022).  The trailing comma is part of the
 * expansion, so entries are written back to back without separators. */
#define ISO2022_CODEC(variation) { \
    "iso2022_" #variation, \
    &iso2022_##variation##_config, \
    iso2022_codec_init, \
    _STATEFUL_METHODS(iso2022) \
},
1132 
1133 BEGIN_CODECS_LIST
1134   ISO2022_CODEC(kr)
1135   ISO2022_CODEC(jp)
1136   ISO2022_CODEC(jp_1)
1137   ISO2022_CODEC(jp_2)
1138   ISO2022_CODEC(jp_2004)
1139   ISO2022_CODEC(jp_3)
1140   ISO2022_CODEC(jp_ext)
1141 END_CODECS_LIST
1142 
1143 I_AM_A_MODULE_FOR(iso2022)
1144