• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
3  *
4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
5  */
6 
7 #define USING_IMPORTED_MAPS
8 #define USING_BINARY_PAIR_SEARCH
9 #define EXTERN_JISX0213_PAIR
10 #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
11 #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
12 
13 #include "cjkcodecs.h"
14 #include "alg_jisx0201.h"
15 #include "emu_jisx0213_2000.h"
16 #include "mappings_jisx0213_pair.h"
17 
18 /* STATE
19 
20    state->c[0-3]
21 
22     00000000
23     ||^^^^^|
24     |+-----+----  G0-3 Character Set
25     +-----------  Is G0-3 double byte?
26 
27    state->c[4]
28 
29     00000000
30           ||
31           |+----  Locked-Shift?
32           +-----  ESC Throughout
33 */
34 
/* Control bytes with a special meaning in ISO-2022 streams. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Escape sequences are scanned for at most this many bytes. */
#define MAX_ESCSEQLEN           16

/* A charset mark is the final byte of its designating escape sequence;
 * double-byte sets additionally carry CHARSET_DBCS.  ESCMARK() strips
 * that bit again to recover the final byte. */
#define CHARSET_DBCS            0x80
#define ESCMARK(mark)           ((mark) & 0x7f)

/* single-byte character sets */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* double-byte character sets */
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)

/* IS_ESCEND: byte that terminates an escape sequence.
 * IS_ISO2022ESC: second byte of an escape sequence this codec handles. */
#define IS_ESCEND(c)    (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
#define IS_ISO2022ESC(c2) \
        ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
         (c2) == '.' || (c2) == '&')
    /* this is not a complete list of ISO-2022 escape sequence headers.
     * but, it's enough to implement CJK instances of iso-2022. */

/* Sentinel results of the per-charset decoder/encoder callbacks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE /* for JIS X 0213 */

/* Bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the G0-G3 slots in state->c[0..3].  The setter macros
 * expand to complete statements (note the trailing ';'), so call sites
 * use them without a semicolon. */
#define STATE_GETG(dn)          ((state)->c[dn])
#define STATE_SETG(dn, v)       ((state)->c[dn]) = (v);

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)
#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Accessors for the flag byte; the mutating ones are full statements. */
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_SETFLAG(f)        ((state)->c[4]) |= (f);
#define STATE_CLEARFLAG(f)      ((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()      ((state)->c[4]) = 0;

/* View of the opaque `config` pointer as an iso2022_config. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
101 
102 /*-*- internal data structures -*-*/
103 
/* Callback types stored in a designation entry:
 *  - init:   returns 0 on success, non-zero on failure (see CODEC_INIT);
 *  - decode: maps the bytes at `data` to a code point or MAP_UNMAPPABLE;
 *  - encode: maps the code point(s) at `data` to a byte code, or returns
 *            MAP_UNMAPPABLE / MAP_MULTIPLE_AVAIL; `length` is in/out. */
typedef int (*iso2022_init_func)(void);
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);

/* One character set that an encoding may designate.  The field order is
 * relied upon by the positional REGISTRY_* initializers. */
struct iso2022_designation {
    unsigned char mark;             /* CHARSET_* value; 0 ends a list */
    unsigned char plane;            /* 0 = G0, 1 = G1 (as used by the encoder) */
    unsigned char width;            /* bytes per character: 1 or 2 */
    iso2022_init_func initializer;  /* may be NULL */
    iso2022_decode_func decoder;
    iso2022_encode_func encoder;
};

/* Per-encoding configuration reached through the codec's `config` pointer. */
struct iso2022_config {
    int flags;                      /* NO_SHIFT | USE_G2 | USE_JISX0208_EXT */
    const struct iso2022_designation *designations; /* non-ascii desigs */
};
121 
122 /*-*- iso-2022 codec implementation -*-*/
123 
CODEC_INIT(iso2022)124 CODEC_INIT(iso2022)
125 {
126     const struct iso2022_designation *desig = CONFIG_DESIGNATIONS;
127     for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++)
128         if (desig->initializer != NULL && desig->initializer() != 0)
129             return -1;
130     return 0;
131 }
132 
ENCODER_INIT(iso2022)133 ENCODER_INIT(iso2022)
134 {
135     STATE_CLEARFLAGS()
136     STATE_SETG0(CHARSET_ASCII)
137     STATE_SETG1(CHARSET_ASCII)
138     return 0;
139 }
140 
ENCODER_RESET(iso2022)141 ENCODER_RESET(iso2022)
142 {
143     if (STATE_GETFLAG(F_SHIFTED)) {
144         WRITE1(SI)
145         NEXT_OUT(1)
146         STATE_CLEARFLAG(F_SHIFTED)
147     }
148     if (STATE_G0 != CHARSET_ASCII) {
149         WRITE3(ESC, '(', 'B')
150         NEXT_OUT(3)
151         STATE_SETG0(CHARSET_ASCII)
152     }
153     return 0;
154 }
155 
ENCODER(iso2022)156 ENCODER(iso2022)
157 {
158     while (inleft > 0) {
159         const struct iso2022_designation *dsg;
160         DBCHAR encoded;
161         ucs4_t c = **inbuf;
162         Py_ssize_t insize;
163 
164         if (c < 0x80) {
165             if (STATE_G0 != CHARSET_ASCII) {
166                 WRITE3(ESC, '(', 'B')
167                 STATE_SETG0(CHARSET_ASCII)
168                 NEXT_OUT(3)
169             }
170             if (STATE_GETFLAG(F_SHIFTED)) {
171                 WRITE1(SI)
172                 STATE_CLEARFLAG(F_SHIFTED)
173                 NEXT_OUT(1)
174             }
175             WRITE1((unsigned char)c)
176             NEXT(1, 1)
177             continue;
178         }
179 
180         DECODE_SURROGATE(c)
181         insize = GET_INSIZE(c);
182 
183         encoded = MAP_UNMAPPABLE;
184         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
185             Py_ssize_t length = 1;
186             encoded = dsg->encoder(&c, &length);
187             if (encoded == MAP_MULTIPLE_AVAIL) {
188                 /* this implementation won't work for pair
189                  * of non-bmp characters. */
190                 if (inleft < 2) {
191                     if (!(flags & MBENC_FLUSH))
192                         return MBERR_TOOFEW;
193                     length = -1;
194                 }
195                 else
196                     length = 2;
197 #if Py_UNICODE_SIZE == 2
198                 if (length == 2) {
199                     ucs4_t u4in[2];
200                     u4in[0] = (ucs4_t)IN1;
201                     u4in[1] = (ucs4_t)IN2;
202                     encoded = dsg->encoder(u4in, &length);
203                 } else
204                     encoded = dsg->encoder(&c, &length);
205 #else
206                 encoded = dsg->encoder(&c, &length);
207 #endif
208                 if (encoded != MAP_UNMAPPABLE) {
209                     insize = length;
210                     break;
211                 }
212             }
213             else if (encoded != MAP_UNMAPPABLE)
214                 break;
215         }
216 
217         if (!dsg->mark)
218             return 1;
219         assert(dsg->width == 1 || dsg->width == 2);
220 
221         switch (dsg->plane) {
222         case 0: /* G0 */
223             if (STATE_GETFLAG(F_SHIFTED)) {
224                 WRITE1(SI)
225                 STATE_CLEARFLAG(F_SHIFTED)
226                 NEXT_OUT(1)
227             }
228             if (STATE_G0 != dsg->mark) {
229                 if (dsg->width == 1) {
230                     WRITE3(ESC, '(', ESCMARK(dsg->mark))
231                     STATE_SETG0(dsg->mark)
232                     NEXT_OUT(3)
233                 }
234                 else if (dsg->mark == CHARSET_JISX0208) {
235                     WRITE3(ESC, '$', ESCMARK(dsg->mark))
236                     STATE_SETG0(dsg->mark)
237                     NEXT_OUT(3)
238                 }
239                 else {
240                     WRITE4(ESC, '$', '(',
241                         ESCMARK(dsg->mark))
242                     STATE_SETG0(dsg->mark)
243                     NEXT_OUT(4)
244                 }
245             }
246             break;
247         case 1: /* G1 */
248             if (STATE_G1 != dsg->mark) {
249                 if (dsg->width == 1) {
250                     WRITE3(ESC, ')', ESCMARK(dsg->mark))
251                     STATE_SETG1(dsg->mark)
252                     NEXT_OUT(3)
253                 }
254                 else {
255                     WRITE4(ESC, '$', ')',
256                         ESCMARK(dsg->mark))
257                     STATE_SETG1(dsg->mark)
258                     NEXT_OUT(4)
259                 }
260             }
261             if (!STATE_GETFLAG(F_SHIFTED)) {
262                 WRITE1(SO)
263                 STATE_SETFLAG(F_SHIFTED)
264                 NEXT_OUT(1)
265             }
266             break;
267         default: /* G2 and G3 is not supported: no encoding in
268                   * CJKCodecs are using them yet */
269             return MBERR_INTERNAL;
270         }
271 
272         if (dsg->width == 1) {
273             WRITE1((unsigned char)encoded)
274             NEXT_OUT(1)
275         }
276         else {
277             WRITE2(encoded >> 8, encoded & 0xff)
278             NEXT_OUT(2)
279         }
280         NEXT_IN(insize)
281     }
282 
283     return 0;
284 }
285 
DECODER_INIT(iso2022)286 DECODER_INIT(iso2022)
287 {
288     STATE_CLEARFLAGS()
289     STATE_SETG0(CHARSET_ASCII)
290     STATE_SETG1(CHARSET_ASCII)
291     STATE_SETG2(CHARSET_ASCII)
292     return 0;
293 }
294 
DECODER_RESET(iso2022)295 DECODER_RESET(iso2022)
296 {
297     STATE_SETG0(CHARSET_ASCII)
298     STATE_CLEARFLAG(F_SHIFTED)
299     return 0;
300 }
301 
302 static Py_ssize_t
iso2022processesc(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft)303 iso2022processesc(const void *config, MultibyteCodec_State *state,
304                   const unsigned char **inbuf, Py_ssize_t *inleft)
305 {
306     unsigned char charset, designation;
307     Py_ssize_t i, esclen;
308 
309     for (i = 1;i < MAX_ESCSEQLEN;i++) {
310         if (i >= *inleft)
311             return MBERR_TOOFEW;
312         if (IS_ESCEND((*inbuf)[i])) {
313             esclen = i + 1;
314             break;
315         }
316         else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
317                  (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
318             i += 2;
319     }
320 
321     if (i >= MAX_ESCSEQLEN)
322         return 1; /* unterminated escape sequence */
323 
324     switch (esclen) {
325     case 3:
326         if (IN2 == '$') {
327             charset = IN3 | CHARSET_DBCS;
328             designation = 0;
329         }
330         else {
331             charset = IN3;
332             if (IN2 == '(') designation = 0;
333             else if (IN2 == ')') designation = 1;
334             else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
335                 designation = 2;
336             else return 3;
337         }
338         break;
339     case 4:
340         if (IN2 != '$')
341             return 4;
342 
343         charset = IN4 | CHARSET_DBCS;
344         if (IN3 == '(') designation = 0;
345         else if (IN3 == ')') designation = 1;
346         else return 4;
347         break;
348     case 6: /* designation with prefix */
349         if (CONFIG_ISSET(USE_JISX0208_EXT) &&
350             (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
351             (*inbuf)[5] == 'B') {
352             charset = 'B' | CHARSET_DBCS;
353             designation = 0;
354         }
355         else
356             return 6;
357         break;
358     default:
359         return esclen;
360     }
361 
362     /* raise error when the charset is not designated for this encoding */
363     if (charset != CHARSET_ASCII) {
364         const struct iso2022_designation *dsg;
365 
366         for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++)
367             if (dsg->mark == charset)
368                 break;
369         if (!dsg->mark)
370             return esclen;
371     }
372 
373     STATE_SETG(designation, charset)
374     *inleft -= esclen;
375     (*inbuf) += esclen;
376     return 0;
377 }
378 
/* Decode the ISO-8859-7 byte `c` into `assi`.
 *
 * Expands to an open if / else-if chain with no final else, so the caller
 * appends its own `else` branch to handle bytes that have no mapping.
 * The two hex constants are bitmaps of the assigned positions in
 * 0xa0..0xbf and 0xb4..0xd3.  The shifts are done in unsigned long:
 * the shift count can reach 31, and `1L << 31` is undefined where long
 * is 32 bits wide. */
#define ISO8859_7_DECODE(c, assi)                                       \
    if ((c) < 0xa0) (assi) = (c);                                       \
    else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))        \
        (assi) = (c);                                                   \
    else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||              \
             (0xbffffd77UL & (1UL << ((c)-0xb4)))))                     \
        (assi) = 0x02d0 + (c);                                          \
    else if ((c) == 0xa1) (assi) = 0x2018;                              \
    else if ((c) == 0xa2) (assi) = 0x2019;                              \
    else if ((c) == 0xaf) (assi) = 0x2015;
389 
390 static Py_ssize_t
iso2022processg2(const void * config,MultibyteCodec_State * state,const unsigned char ** inbuf,Py_ssize_t * inleft,Py_UNICODE ** outbuf,Py_ssize_t * outleft)391 iso2022processg2(const void *config, MultibyteCodec_State *state,
392                  const unsigned char **inbuf, Py_ssize_t *inleft,
393                  Py_UNICODE **outbuf, Py_ssize_t *outleft)
394 {
395     /* not written to use encoder, decoder functions because only few
396      * encodings use G2 designations in CJKCodecs */
397     if (STATE_G2 == CHARSET_ISO8859_1) {
398         if (IN3 < 0x80)
399             OUT1(IN3 + 0x80)
400         else
401             return 3;
402     }
403     else if (STATE_G2 == CHARSET_ISO8859_7) {
404         ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
405         else return 3;
406     }
407     else if (STATE_G2 == CHARSET_ASCII) {
408         if (IN3 & 0x80) return 3;
409         else **outbuf = IN3;
410     }
411     else
412         return MBERR_INTERNAL;
413 
414     (*inbuf) += 3;
415     *inleft -= 3;
416     (*outbuf) += 1;
417     *outleft -= 1;
418     return 0;
419 }
420 
DECODER(iso2022)421 DECODER(iso2022)
422 {
423     const struct iso2022_designation *dsgcache = NULL;
424 
425     while (inleft > 0) {
426         unsigned char c = IN1;
427         Py_ssize_t err;
428 
429         if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
430             /* ESC throughout mode:
431              * for non-iso2022 escape sequences */
432             WRITE1(c) /* assume as ISO-8859-1 */
433             NEXT(1, 1)
434             if (IS_ESCEND(c)) {
435                 STATE_CLEARFLAG(F_ESCTHROUGHOUT)
436             }
437             continue;
438         }
439 
440         switch (c) {
441         case ESC:
442             REQUIRE_INBUF(2)
443             if (IS_ISO2022ESC(IN2)) {
444                 err = iso2022processesc(config, state,
445                                         inbuf, &inleft);
446                 if (err != 0)
447                     return err;
448             }
449             else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
450                 REQUIRE_INBUF(3)
451                 err = iso2022processg2(config, state,
452                     inbuf, &inleft, outbuf, &outleft);
453                 if (err != 0)
454                     return err;
455             }
456             else {
457                 WRITE1(ESC)
458                 STATE_SETFLAG(F_ESCTHROUGHOUT)
459                 NEXT(1, 1)
460             }
461             break;
462         case SI:
463             if (CONFIG_ISSET(NO_SHIFT))
464                 goto bypass;
465             STATE_CLEARFLAG(F_SHIFTED)
466             NEXT_IN(1)
467             break;
468         case SO:
469             if (CONFIG_ISSET(NO_SHIFT))
470                 goto bypass;
471             STATE_SETFLAG(F_SHIFTED)
472             NEXT_IN(1)
473             break;
474         case LF:
475             STATE_CLEARFLAG(F_SHIFTED)
476             WRITE1(LF)
477             NEXT(1, 1)
478             break;
479         default:
480             if (c < 0x20) /* C0 */
481                 goto bypass;
482             else if (c >= 0x80)
483                 return 1;
484             else {
485                 const struct iso2022_designation *dsg;
486                 unsigned char charset;
487                 ucs4_t decoded;
488 
489                 if (STATE_GETFLAG(F_SHIFTED))
490                     charset = STATE_G1;
491                 else
492                     charset = STATE_G0;
493 
494                 if (charset == CHARSET_ASCII) {
495 bypass:                                 WRITE1(c)
496                                         NEXT(1, 1)
497                                         break;
498                                 }
499 
500                                 if (dsgcache != NULL &&
501                                     dsgcache->mark == charset)
502                                         dsg = dsgcache;
503                                 else {
504                                         for (dsg = CONFIG_DESIGNATIONS;
505                                              dsg->mark != charset
506 #ifdef Py_DEBUG
507                                                 && dsg->mark != '\0'
508 #endif
509                                              ;dsg++)
510                                                 /* noop */;
511                                         assert(dsg->mark != '\0');
512                                         dsgcache = dsg;
513                                 }
514 
515                                 REQUIRE_INBUF(dsg->width)
516                                 decoded = dsg->decoder(*inbuf);
517                                 if (decoded == MAP_UNMAPPABLE)
518                                         return dsg->width;
519 
520                                 if (decoded < 0x10000) {
521                                         WRITE1(decoded)
522                                         NEXT_OUT(1)
523                                 }
524                                 else if (decoded < 0x30000) {
525                                         WRITEUCS4(decoded)
526                                 }
527                                 else { /* JIS X 0213 pairs */
528                     WRITE2(decoded >> 16, decoded & 0xffff)
529                     NEXT_OUT(2)
530                 }
531                 NEXT_IN(dsg->width)
532             }
533             break;
534         }
535     }
536     return 0;
537 }
538 
539 /*-*- mapping table holders -*-*/
540 
/* Each use declares one file-local map pointer, initially NULL, named
 * <enc>_encmap or <enc>_decmap.  The *_init functions in this file hand
 * the addresses of these pointers to IMPORT_MAP; the TRYMAP_* macros are
 * given the same <enc> names. */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)
562 
563 /* tw */
564 
565 /*-*- mapping access functions -*-*/
566 
567 static int
568 ksx1001_init(void)
569 {
570     static int initialized = 0;
571 
572     if (!initialized && (
573                     IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
574                     IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap)))
575         return -1;
576     initialized = 1;
577     return 0;
578 }
579 
580 static ucs4_t
ksx1001_decoder(const unsigned char * data)581 ksx1001_decoder(const unsigned char *data)
582 {
583     ucs4_t u;
584     TRYMAP_DEC(ksx1001, u, data[0], data[1])
585         return u;
586     else
587         return MAP_UNMAPPABLE;
588 }
589 
590 static DBCHAR
ksx1001_encoder(const ucs4_t * data,Py_ssize_t * length)591 ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
592 {
593     DBCHAR coded;
594     assert(*length == 1);
595     if (*data < 0x10000) {
596         TRYMAP_ENC(cp949, coded, *data)
597             if (!(coded & 0x8000))
598                 return coded;
599     }
600     return MAP_UNMAPPABLE;
601 }
602 
603 static int
jisx0208_init(void)604 jisx0208_init(void)
605 {
606     static int initialized = 0;
607 
608     if (!initialized && (
609                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
610                     IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap)))
611         return -1;
612     initialized = 1;
613     return 0;
614 }
615 
616 static ucs4_t
jisx0208_decoder(const unsigned char * data)617 jisx0208_decoder(const unsigned char *data)
618 {
619     ucs4_t u;
620     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
621         return 0xff3c;
622     else TRYMAP_DEC(jisx0208, u, data[0], data[1])
623         return u;
624     else
625         return MAP_UNMAPPABLE;
626 }
627 
628 static DBCHAR
jisx0208_encoder(const ucs4_t * data,Py_ssize_t * length)629 jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
630 {
631     DBCHAR coded;
632     assert(*length == 1);
633     if (*data < 0x10000) {
634         if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
635             return 0x2140;
636         else TRYMAP_ENC(jisxcommon, coded, *data) {
637             if (!(coded & 0x8000))
638                 return coded;
639         }
640     }
641     return MAP_UNMAPPABLE;
642 }
643 
644 static int
jisx0212_init(void)645 jisx0212_init(void)
646 {
647     static int initialized = 0;
648 
649     if (!initialized && (
650                     IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
651                     IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap)))
652         return -1;
653     initialized = 1;
654     return 0;
655 }
656 
657 static ucs4_t
jisx0212_decoder(const unsigned char * data)658 jisx0212_decoder(const unsigned char *data)
659 {
660     ucs4_t u;
661     TRYMAP_DEC(jisx0212, u, data[0], data[1])
662         return u;
663     else
664         return MAP_UNMAPPABLE;
665 }
666 
667 static DBCHAR
jisx0212_encoder(const ucs4_t * data,Py_ssize_t * length)668 jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
669 {
670     DBCHAR coded;
671     assert(*length == 1);
672     if (*data < 0x10000) {
673         TRYMAP_ENC(jisxcommon, coded, *data) {
674             if (coded & 0x8000)
675                 return coded & 0x7fff;
676         }
677     }
678     return MAP_UNMAPPABLE;
679 }
680 
681 static int
jisx0213_init(void)682 jisx0213_init(void)
683 {
684     static int initialized = 0;
685 
686     if (!initialized && (
687                     jisx0208_init() ||
688                     IMPORT_MAP(jp, jisx0213_bmp,
689                                &jisx0213_bmp_encmap, NULL) ||
690                     IMPORT_MAP(jp, jisx0213_1_bmp,
691                                NULL, &jisx0213_1_bmp_decmap) ||
692                     IMPORT_MAP(jp, jisx0213_2_bmp,
693                                NULL, &jisx0213_2_bmp_decmap) ||
694                     IMPORT_MAP(jp, jisx0213_emp,
695                                &jisx0213_emp_encmap, NULL) ||
696                     IMPORT_MAP(jp, jisx0213_1_emp,
697                                NULL, &jisx0213_1_emp_decmap) ||
698                     IMPORT_MAP(jp, jisx0213_2_emp,
699                                NULL, &jisx0213_2_emp_decmap) ||
700                     IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
701                                &jisx0213_pair_decmap)))
702         return -1;
703     initialized = 1;
704     return 0;
705 }
706 
707 #define config ((void *)2000)
708 static ucs4_t
jisx0213_2000_1_decoder(const unsigned char * data)709 jisx0213_2000_1_decoder(const unsigned char *data)
710 {
711     ucs4_t u;
712     EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
713     else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
714         return 0xff3c;
715     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
716     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
717     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
718         u |= 0x20000;
719     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
720     else
721         return MAP_UNMAPPABLE;
722     return u;
723 }
724 
725 static ucs4_t
jisx0213_2000_2_decoder(const unsigned char * data)726 jisx0213_2000_2_decoder(const unsigned char *data)
727 {
728     ucs4_t u;
729     EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
730     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
731     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
732         u |= 0x20000;
733     else
734         return MAP_UNMAPPABLE;
735     return u;
736 }
737 #undef config
738 
739 static ucs4_t
jisx0213_2004_1_decoder(const unsigned char * data)740 jisx0213_2004_1_decoder(const unsigned char *data)
741 {
742     ucs4_t u;
743     if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
744         return 0xff3c;
745     else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
746     else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
747     else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
748         u |= 0x20000;
749     else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
750     else
751         return MAP_UNMAPPABLE;
752     return u;
753 }
754 
755 static ucs4_t
jisx0213_2004_2_decoder(const unsigned char * data)756 jisx0213_2004_2_decoder(const unsigned char *data)
757 {
758     ucs4_t u;
759     TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
760     else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
761         u |= 0x20000;
762     else
763         return MAP_UNMAPPABLE;
764     return u;
765 }
766 
767 static DBCHAR
jisx0213_encoder(const ucs4_t * data,Py_ssize_t * length,void * config)768 jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
769 {
770     DBCHAR coded;
771 
772     switch (*length) {
773     case 1: /* first character */
774         if (*data >= 0x10000) {
775             if ((*data) >> 16 == 0x20000 >> 16) {
776                 EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
777                 else TRYMAP_ENC(jisx0213_emp, coded,
778                                 (*data) & 0xffff)
779                     return coded;
780             }
781             return MAP_UNMAPPABLE;
782         }
783 
784         EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
785         else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
786             if (coded == MULTIC)
787                 return MAP_MULTIPLE_AVAIL;
788         }
789         else TRYMAP_ENC(jisxcommon, coded, *data) {
790             if (coded & 0x8000)
791                 return MAP_UNMAPPABLE;
792         }
793         else
794             return MAP_UNMAPPABLE;
795         return coded;
796     case 2: /* second character of unicode pair */
797         coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
798                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
799         if (coded == DBCINV) {
800             *length = 1;
801             coded = find_pairencmap((ucs2_t)data[0], 0,
802                       jisx0213_pair_encmap, JISX0213_ENCPAIRS);
803             if (coded == DBCINV)
804                 return MAP_UNMAPPABLE;
805         }
806         else
807             return coded;
808     case -1: /* flush unterminated */
809         *length = 1;
810         coded = find_pairencmap((ucs2_t)data[0], 0,
811                         jisx0213_pair_encmap, JISX0213_ENCPAIRS);
812         if (coded == DBCINV)
813             return MAP_UNMAPPABLE;
814         else
815             return coded;
816     default:
817         return MAP_UNMAPPABLE;
818     }
819 }
820 
821 static DBCHAR
jisx0213_2000_1_encoder(const ucs4_t * data,Py_ssize_t * length)822 jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
823 {
824     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
825     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
826         return coded;
827     else if (coded & 0x8000)
828         return MAP_UNMAPPABLE;
829     else
830         return coded;
831 }
832 
833 static DBCHAR
jisx0213_2000_1_encoder_paironly(const ucs4_t * data,Py_ssize_t * length)834 jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
835 {
836     DBCHAR coded;
837     Py_ssize_t ilength = *length;
838 
839     coded = jisx0213_encoder(data, length, (void *)2000);
840     switch (ilength) {
841     case 1:
842         if (coded == MAP_MULTIPLE_AVAIL)
843             return MAP_MULTIPLE_AVAIL;
844         else
845             return MAP_UNMAPPABLE;
846     case 2:
847         if (*length != 2)
848             return MAP_UNMAPPABLE;
849         else
850             return coded;
851     default:
852         return MAP_UNMAPPABLE;
853     }
854 }
855 
856 static DBCHAR
jisx0213_2000_2_encoder(const ucs4_t * data,Py_ssize_t * length)857 jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
858 {
859     DBCHAR coded = jisx0213_encoder(data, length, (void *)2000);
860     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
861         return coded;
862     else if (coded & 0x8000)
863         return coded & 0x7fff;
864     else
865         return MAP_UNMAPPABLE;
866 }
867 
868 static DBCHAR
jisx0213_2004_1_encoder(const ucs4_t * data,Py_ssize_t * length)869 jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
870 {
871     DBCHAR coded = jisx0213_encoder(data, length, NULL);
872     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
873         return coded;
874     else if (coded & 0x8000)
875         return MAP_UNMAPPABLE;
876     else
877         return coded;
878 }
879 
880 static DBCHAR
jisx0213_2004_1_encoder_paironly(const ucs4_t * data,Py_ssize_t * length)881 jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
882 {
883     DBCHAR coded;
884     Py_ssize_t ilength = *length;
885 
886     coded = jisx0213_encoder(data, length, NULL);
887     switch (ilength) {
888     case 1:
889         if (coded == MAP_MULTIPLE_AVAIL)
890             return MAP_MULTIPLE_AVAIL;
891         else
892             return MAP_UNMAPPABLE;
893     case 2:
894         if (*length != 2)
895             return MAP_UNMAPPABLE;
896         else
897             return coded;
898     default:
899         return MAP_UNMAPPABLE;
900     }
901 }
902 
903 static DBCHAR
jisx0213_2004_2_encoder(const ucs4_t * data,Py_ssize_t * length)904 jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
905 {
906     DBCHAR coded = jisx0213_encoder(data, length, NULL);
907     if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
908         return coded;
909     else if (coded & 0x8000)
910         return coded & 0x7fff;
911     else
912         return MAP_UNMAPPABLE;
913 }
914 
915 static ucs4_t
jisx0201_r_decoder(const unsigned char * data)916 jisx0201_r_decoder(const unsigned char *data)
917 {
918     ucs4_t u;
919     JISX0201_R_DECODE(*data, u)
920     else return MAP_UNMAPPABLE;
921     return u;
922 }
923 
924 static DBCHAR
jisx0201_r_encoder(const ucs4_t * data,Py_ssize_t * length)925 jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
926 {
927     DBCHAR coded;
928     JISX0201_R_ENCODE(*data, coded)
929     else return MAP_UNMAPPABLE;
930     return coded;
931 }
932 
933 static ucs4_t
jisx0201_k_decoder(const unsigned char * data)934 jisx0201_k_decoder(const unsigned char *data)
935 {
936     ucs4_t u;
937     JISX0201_K_DECODE(*data ^ 0x80, u)
938     else return MAP_UNMAPPABLE;
939     return u;
940 }
941 
942 static DBCHAR
jisx0201_k_encoder(const ucs4_t * data,Py_ssize_t * length)943 jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
944 {
945     DBCHAR coded;
946     JISX0201_K_ENCODE(*data, coded)
947     else return MAP_UNMAPPABLE;
948     return coded - 0x80;
949 }
950 
951 static int
gb2312_init(void)952 gb2312_init(void)
953 {
954     static int initialized = 0;
955 
956     if (!initialized && (
957                     IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
958                     IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap)))
959         return -1;
960     initialized = 1;
961     return 0;
962 }
963 
964 static ucs4_t
gb2312_decoder(const unsigned char * data)965 gb2312_decoder(const unsigned char *data)
966 {
967     ucs4_t u;
968     TRYMAP_DEC(gb2312, u, data[0], data[1])
969         return u;
970     else
971         return MAP_UNMAPPABLE;
972 }
973 
974 static DBCHAR
gb2312_encoder(const ucs4_t * data,Py_ssize_t * length)975 gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
976 {
977     DBCHAR coded;
978     assert(*length == 1);
979     if (*data < 0x10000) {
980         TRYMAP_ENC(gbcommon, coded, *data) {
981             if (!(coded & 0x8000))
982                 return coded;
983         }
984     }
985     return MAP_UNMAPPABLE;
986 }
987 
988 
989 static ucs4_t
dummy_decoder(const unsigned char * data)990 dummy_decoder(const unsigned char *data)
991 {
992     return MAP_UNMAPPABLE;
993 }
994 
995 static DBCHAR
dummy_encoder(const ucs4_t * data,Py_ssize_t * length)996 dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
997 {
998     return MAP_UNMAPPABLE;
999 }
1000 
/*-*- registry tables -*-*/

/*
 * Each REGISTRY_* macro expands to one brace-enclosed initializer that
 * lists, in order: a CHARSET_* mark, two small integers, an init function
 * (NULL when there is none), a decoder and an encoder.
 *
 * NOTE(review): the two integers look like the G-set index (compare the
 * _G0/_G1 variants of KS X 1001) and the number of bytes per character;
 * confirm against the definition of struct iso2022_designation.
 *
 * Things worth knowing when editing these:
 *   - JISX0208 and JISX0208_O share init/decoder/encoder and differ only
 *     in the charset mark.
 *   - The *_PAIRONLY variants share mark, init and decoder with their
 *     plain counterparts and differ only in the encoder.
 *   - Both JISX0213 *_2 entries use the CHARSET_JISX0213_2 mark.
 *   - The ISO8859 entries use the dummy decoder/encoder.
 */
#define REGISTRY_KSX1001_G0                                             \
    { CHARSET_KSX1001, 0, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                             \
    { CHARSET_KSX1001, 1, 2,                                            \
      ksx1001_init, ksx1001_decoder, ksx1001_encoder }

#define REGISTRY_JISX0201_R                                             \
    { CHARSET_JISX0201_R, 0, 1,                                         \
      NULL, jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                             \
    { CHARSET_JISX0201_K, 0, 1,                                         \
      NULL, jisx0201_k_decoder, jisx0201_k_encoder }

#define REGISTRY_JISX0208                                               \
    { CHARSET_JISX0208, 0, 2,                                           \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                             \
    { CHARSET_JISX0208_O, 0, 2,                                         \
      jisx0208_init, jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212                                               \
    { CHARSET_JISX0212, 0, 2,                                           \
      jisx0212_init, jisx0212_decoder, jisx0212_encoder }

#define REGISTRY_JISX0213_2000_1                                        \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init, jisx0213_2000_1_decoder, jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                               \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init, jisx0213_2000_1_decoder,                           \
      jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init, jisx0213_2000_2_decoder, jisx0213_2000_2_encoder }

#define REGISTRY_JISX0213_2004_1                                        \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init, jisx0213_2004_1_decoder, jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                               \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init, jisx0213_2004_1_decoder,                           \
      jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init, jisx0213_2004_2_decoder, jisx0213_2004_2_encoder }

#define REGISTRY_GB2312                                                 \
    { CHARSET_GB2312, 0, 2,                                             \
      gb2312_init, gb2312_decoder, gb2312_encoder }

#define REGISTRY_CNS11643_1                                             \
    { CHARSET_CNS11643_1, 1, 2,                                         \
      cns11643_init, cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                             \
    { CHARSET_CNS11643_2, 2, 2,                                         \
      cns11643_init, cns11643_2_decoder, cns11643_2_encoder }

#define REGISTRY_ISO8859_1                                              \
    { CHARSET_ISO8859_1, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                              \
    { CHARSET_ISO8859_7, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }

/* Last element of every designation array: an entry whose first value is 0. */
#define REGISTRY_SENTINEL       { 0, }
/*
 * CONFIGDEF(name, flags) defines the object
 *     static const struct iso2022_config iso2022_<name>_config
 * initialized with `flags` followed by the array
 * iso2022_<name>_designations, which must already be declared at the
 * point of use.
 */
#define CONFIGDEF(name, flags)                                          \
    static const struct iso2022_config iso2022_##name##_config = {      \
        flags, iso2022_##name##_designations                            \
    };
1066 
1067 static const struct iso2022_designation iso2022_kr_designations[] = {
1068     REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
1069 };
1070 CONFIGDEF(kr, 0)
1071 
1072 static const struct iso2022_designation iso2022_jp_designations[] = {
1073     REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1074     REGISTRY_SENTINEL
1075 };
1076 CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
1077 
1078 static const struct iso2022_designation iso2022_jp_1_designations[] = {
1079     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1080     REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1081 };
1082 CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
1083 
1084 static const struct iso2022_designation iso2022_jp_2_designations[] = {
1085     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
1086     REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
1087     REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
1088 };
1089 CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
1090 
1091 static const struct iso2022_designation iso2022_jp_2004_designations[] = {
1092     REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
1093     REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
1094 };
1095 CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
1096 
1097 static const struct iso2022_designation iso2022_jp_3_designations[] = {
1098     REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
1099     REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
1100 };
1101 CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
1102 
1103 static const struct iso2022_designation iso2022_jp_ext_designations[] = {
1104     REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
1105     REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
1106 };
1107 CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
1108 
1109 
1110 BEGIN_MAPPINGS_LIST
1111   /* no mapping table here */
1112 END_MAPPINGS_LIST
1113 
1114 #define ISO2022_CODEC(variation) {              \
1115     "iso2022_" #variation,                      \
1116     &iso2022_##variation##_config,              \
1117     iso2022_codec_init,                         \
1118     _STATEFUL_METHODS(iso2022)                  \
1119 },
1120 
1121 BEGIN_CODECS_LIST
1122   ISO2022_CODEC(kr)
1123   ISO2022_CODEC(jp)
1124   ISO2022_CODEC(jp_1)
1125   ISO2022_CODEC(jp_2)
1126   ISO2022_CODEC(jp_2004)
1127   ISO2022_CODEC(jp_3)
1128   ISO2022_CODEC(jp_ext)
1129 END_CODECS_LIST
1130 
1131 I_AM_A_MODULE_FOR(iso2022)
1132