1 /*
2 * _codecs_jp.c: Codecs collection for Japanese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
/* Defined ahead of the cjkcodecs.h include; presumably enables the
 * find_pairencmap() helper used by the JIS X 0213 encoders below --
 * confirm against cjkcodecs.h. */
#define USING_BINARY_PAIR_SEARCH
/* Base of Unicode plane 2.  Encoders test (c >> 16 == EMPBASE >> 16) and
 * look up the low 16 bits; decoders OR this onto values from *_emp maps. */
#define EMPBASE 0x20000
9
10 #include "cjkcodecs.h"
11 #include "mappings_jp.h"
12 #include "mappings_jisx0213_pair.h"
13 #include "alg_jisx0201.h"
14 #include "emu_jisx0213_2000.h"
15
16 /*
17 * CP932 codec
18 */
19
ENCODER(cp932)20 ENCODER(cp932)
21 {
22 while (inleft > 0) {
23 Py_UNICODE c = IN1;
24 DBCHAR code;
25 unsigned char c1, c2;
26
27 if (c <= 0x80) {
28 WRITE1((unsigned char)c)
29 NEXT(1, 1)
30 continue;
31 }
32 else if (c >= 0xff61 && c <= 0xff9f) {
33 WRITE1(c - 0xfec0)
34 NEXT(1, 1)
35 continue;
36 }
37 else if (c >= 0xf8f0 && c <= 0xf8f3) {
38 /* Windows compatibility */
39 REQUIRE_OUTBUF(1)
40 if (c == 0xf8f0)
41 OUT1(0xa0)
42 else
43 OUT1(c - 0xfef1 + 0xfd)
44 NEXT(1, 1)
45 continue;
46 }
47
48 UCS4INVALID(c)
49 REQUIRE_OUTBUF(2)
50
51 TRYMAP_ENC(cp932ext, code, c) {
52 OUT1(code >> 8)
53 OUT2(code & 0xff)
54 }
55 else TRYMAP_ENC(jisxcommon, code, c) {
56 if (code & 0x8000) /* MSB set: JIS X 0212 */
57 return 1;
58
59 /* JIS X 0208 */
60 c1 = code >> 8;
61 c2 = code & 0xff;
62 c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
63 c1 = (c1 - 0x21) >> 1;
64 OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
65 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
66 }
67 else if (c >= 0xe000 && c < 0xe758) {
68 /* User-defined area */
69 c1 = (Py_UNICODE)(c - 0xe000) / 188;
70 c2 = (Py_UNICODE)(c - 0xe000) % 188;
71 OUT1(c1 + 0xf0)
72 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
73 }
74 else
75 return 1;
76
77 NEXT(1, 2)
78 }
79
80 return 0;
81 }
82
/*
 * cp932 decoder.
 *
 * Single-byte input:
 *   - c <= 0x80      -> emitted unchanged
 *   - 0xa0           -> U+F8F0
 *   - 0xa1..0xdf     -> 0xfec0 + c (U+FF61..U+FF9F)
 *   - 0xfd..0xff     -> U+F8F1..U+F8F3
 * Every other lead byte needs a trail byte.  The cp932ext table is tried
 * first; then lead bytes 0x81..0x9f / 0xe0..0xea are unfolded into a
 * JIS X 0208 row/cell pair and looked up in jisx0208; lead bytes
 * 0xf0..0xf9 map arithmetically into the user-defined area from U+E000.
 *
 * Returns 0 when inleft reaches zero and 2 when a two-byte sequence cannot
 * be decoded.  REQUIRE_OUTBUF / REQUIRE_INBUF may also return early; their
 * definitions are not in this file.
 */
DECODER(cp932)
{
    while (inleft > 0) {
        unsigned char c = IN1, c2;

        REQUIRE_OUTBUF(1)
        if (c <= 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }
        else if (c >= 0xa0 && c <= 0xdf) {
            if (c == 0xa0)
                OUT1(0xf8f0) /* Windows compatibility; encoder maps it back */
            else
                OUT1(0xfec0 + c) /* half-width katakana */
            NEXT(1, 1)
            continue;
        }
        else if (c >= 0xfd/* && c <= 0xff*/) {
            /* Windows compatibility */
            OUT1(0xf8f1 - 0xfd + c)
            NEXT(1, 1)
            continue;
        }

        REQUIRE_INBUF(2)
        c2 = IN2;

        /* The lone ';' is the (empty) body taken on a cp932ext hit; the
         * decoded value has then already been stored through **outbuf. */
        TRYMAP_DEC(cp932ext, **outbuf, c, c2);
        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
            /* Valid trail bytes are 0x40..0x7e and 0x80..0xfc; the middle
             * test rejects exactly 0x7f. */
            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                return 2;

            /* Undo the Shift_JIS folding: close the lead-byte gap and the
             * 0x7f hole, then split each lead byte into two rows of 0x5e
             * (94) cells and rebase both values to 0x21. */
            c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
            c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;

            TRYMAP_DEC(jisx0208, **outbuf, c, c2);
            else return 2;
        }
        else if (c >= 0xf0 && c <= 0xf9) {
            /* User-defined area: 188 code points per lead byte. */
            if ((c2 >= 0x40 && c2 <= 0x7e) ||
                (c2 >= 0x80 && c2 <= 0xfc))
                OUT1(0xe000 + 188 * (c - 0xf0) +
                     (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
            else
                return 2;
        }
        else
            return 2;

        NEXT(2, 1)
    }

    return 0;
}
141
142
143 /*
144 * EUC-JIS-2004 codec
145 */
146
ENCODER(euc_jis_2004)147 ENCODER(euc_jis_2004)
148 {
149 while (inleft > 0) {
150 ucs4_t c = IN1;
151 DBCHAR code;
152 Py_ssize_t insize;
153
154 if (c < 0x80) {
155 WRITE1(c)
156 NEXT(1, 1)
157 continue;
158 }
159
160 DECODE_SURROGATE(c)
161 insize = GET_INSIZE(c);
162
163 if (c <= 0xFFFF) {
164 EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
165 else TRYMAP_ENC(jisx0213_bmp, code, c) {
166 if (code == MULTIC) {
167 if (inleft < 2) {
168 if (flags & MBENC_FLUSH) {
169 code = find_pairencmap(
170 (ucs2_t)c, 0,
171 jisx0213_pair_encmap,
172 JISX0213_ENCPAIRS);
173 if (code == DBCINV)
174 return 1;
175 }
176 else
177 return MBERR_TOOFEW;
178 }
179 else {
180 code = find_pairencmap(
181 (ucs2_t)c, (*inbuf)[1],
182 jisx0213_pair_encmap,
183 JISX0213_ENCPAIRS);
184 if (code == DBCINV) {
185 code = find_pairencmap(
186 (ucs2_t)c, 0,
187 jisx0213_pair_encmap,
188 JISX0213_ENCPAIRS);
189 if (code == DBCINV)
190 return 1;
191 } else
192 insize = 2;
193 }
194 }
195 }
196 else TRYMAP_ENC(jisxcommon, code, c);
197 else if (c >= 0xff61 && c <= 0xff9f) {
198 /* JIS X 0201 half-width katakana */
199 WRITE2(0x8e, c - 0xfec0)
200 NEXT(1, 2)
201 continue;
202 }
203 else if (c == 0xff3c)
204 /* F/W REVERSE SOLIDUS (see NOTES) */
205 code = 0x2140;
206 else if (c == 0xff5e)
207 /* F/W TILDE (see NOTES) */
208 code = 0x2232;
209 else
210 return 1;
211 }
212 else if (c >> 16 == EMPBASE >> 16) {
213 EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
214 else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
215 else return insize;
216 }
217 else
218 return insize;
219
220 if (code & 0x8000) {
221 /* Codeset 2 */
222 WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
223 NEXT(insize, 3)
224 } else {
225 /* Codeset 1 */
226 WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
227 NEXT(insize, 2)
228 }
229 }
230
231 return 0;
232 }
233
/*
 * EUC-JIS-2004 decoder.
 *
 *   - c < 0x80: emitted unchanged.
 *   - 0x8e + c2 in 0xa1..0xdf: half-width katakana, 0xfec0 + c2.
 *   - 0x8f + two bytes (each XOR 0x80): tried in order against the
 *     JIS X 0213:2000 emulation macro, jisx0213_2_bmp, jisx0213_2_emp and
 *     jisx0212.
 *   - anything else is a two-byte sequence (each byte XOR 0x80): the
 *     emulation macro, two hard-coded full-width characters, jisx0208,
 *     jisx0213_1_bmp, jisx0213_1_emp, and finally jisx0213_pair, which
 *     yields two output units.
 *
 * Returns 0 when inleft reaches zero, or 2 / 3 for an undecodable two- /
 * three-byte sequence.  REQUIRE_* and WRITE* may also return early; their
 * definitions are not in this file.
 */
DECODER(euc_jis_2004)
{
    while (inleft > 0) {
        unsigned char c = IN1;
        ucs4_t code;

        REQUIRE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        if (c == 0x8e) {
            /* JIS X 0201 half-width katakana */
            unsigned char c2;

            REQUIRE_INBUF(2)
            c2 = IN2;
            if (c2 >= 0xa1 && c2 <= 0xdf) {
                OUT1(0xfec0 + c2)
                NEXT(2, 1)
            }
            else
                return 2;
        }
        else if (c == 0x8f) {
            unsigned char c2, c3;

            REQUIRE_INBUF(3)
            c2 = IN2 ^ 0x80;
            c3 = IN3 ^ 0x80;

            /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
            EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3)
            else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
            else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
                /* Plane-2 result: only the input is advanced here, so
                 * WRITEUCS4 presumably advances the output itself --
                 * confirm against cjkcodecs.h. */
                WRITEUCS4(EMPBASE | code)
                NEXT_IN(3)
                continue;
            }
            else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
            else return 3;
            NEXT(3, 1)
        }
        else {
            unsigned char c2;

            REQUIRE_INBUF(2)
            c ^= 0x80;
            c2 = IN2 ^ 0x80;

            /* JIS X 0213 Plane 1 */
            EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2)
            /* 0x2140 and 0x2232 are pinned to the full-width forms that
             * the encoder also accepts for them. */
            else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
            else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
            else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
            else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
            else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
                WRITEUCS4(EMPBASE | code)
                NEXT_IN(2)
                continue;
            }
            else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
                /* One code decodes to two output units packed into the
                 * high and low 16 bits of the table entry. */
                WRITE2(code >> 16, code & 0xffff)
                NEXT(2, 2)
                continue;
            }
            else return 2;
            NEXT(2, 1)
        }
    }

    return 0;
}
310
311
312 /*
313 * EUC-JP codec
314 */
315
/*
 * EUC-JP encoder.
 *
 *   - c < 0x80: written as one byte.
 *   - jisxcommon hit: two bytes (JIS X 0208) or, when the code has bit
 *     0x8000 set, three bytes with a 0x8f prefix (JIS X 0212).
 *   - U+FF61..U+FF9F: 0x8e followed by c - 0xfec0.
 *   - Unless STRICT_BUILD is defined: U+FF3C is encoded as 0x2140, and
 *     U+00A5 / U+203E are written as the single bytes 0x5c / 0x7e.
 *
 * Returns 0 when inleft reaches zero and 1 for an unencodable character.
 * UCS4INVALID and WRITE* may also return early; their definitions are not
 * in this file.
 */
ENCODER(euc_jp)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;

        if (c < 0x80) {
            WRITE1((unsigned char)c)
            NEXT(1, 1)
            continue;
        }

        UCS4INVALID(c)

        /* On a hit, code is set and control falls to the size dispatch
         * after the else-chain. */
        TRYMAP_ENC(jisxcommon, code, c);
        else if (c >= 0xff61 && c <= 0xff9f) {
            /* JIS X 0201 half-width katakana */
            WRITE2(0x8e, c - 0xfec0)
            NEXT(1, 2)
            continue;
        }
#ifndef STRICT_BUILD
        else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
            code = 0x2140;
        else if (c == 0xa5) { /* YEN SIGN */
            WRITE1(0x5c);
            NEXT(1, 1)
            continue;
        } else if (c == 0x203e) { /* OVERLINE */
            WRITE1(0x7e);
            NEXT(1, 1)
            continue;
        }
#endif
        else
            return 1;

        if (code & 0x8000) {
            /* JIS X 0212 */
            WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
            NEXT(1, 3)
        } else {
            /* JIS X 0208 */
            WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
            NEXT(1, 2)
        }
    }

    return 0;
}
366
/*
 * EUC-JP decoder.
 *
 *   - c < 0x80: emitted unchanged.
 *   - 0x8e + c2 in 0xa1..0xdf: half-width katakana, 0xfec0 + c2.
 *   - 0x8f + two bytes: JIS X 0212 lookup with the high bit cleared.
 *   - otherwise two bytes: JIS X 0208 lookup with the high bit cleared;
 *     unless STRICT_BUILD is defined, 0xa1 0xc0 is pinned to U+FF3C.
 *
 * Returns 0 when inleft reaches zero, or 2 / 3 for an undecodable two- /
 * three-byte sequence.  REQUIRE_* may also return early; their
 * definitions are not in this file.
 */
DECODER(euc_jp)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        REQUIRE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        if (c == 0x8e) {
            /* JIS X 0201 half-width katakana */
            unsigned char c2;

            REQUIRE_INBUF(2)
            c2 = IN2;
            if (c2 >= 0xa1 && c2 <= 0xdf) {
                OUT1(0xfec0 + c2)
                NEXT(2, 1)
            }
            else
                return 2;
        }
        else if (c == 0x8f) {
            unsigned char c2, c3;

            REQUIRE_INBUF(3)
            c2 = IN2;
            c3 = IN3;
            /* JIS X 0212 */
            TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
                NEXT(3, 1)
            }
            else
                return 3;
        }
        else {
            unsigned char c2;

            REQUIRE_INBUF(2)
            c2 = IN2;
            /* JIS X 0208 */
#ifndef STRICT_BUILD
            if (c == 0xa1 && c2 == 0xc0)
                /* FULL-WIDTH REVERSE SOLIDUS */
                **outbuf = 0xff3c;
            else
#endif
                /* The ';' after the macro is the empty body taken on a
                 * table hit; "else return 2" covers the miss. */
                TRYMAP_DEC(jisx0208, **outbuf,
                           c ^ 0x80, c2 ^ 0x80) ;
            else return 2;
            NEXT(2, 1)
        }
    }

    return 0;
}
427
428
429 /*
430 * SHIFT_JIS codec
431 */
432
/*
 * Shift_JIS encoder.
 *
 * Stage 1 tries to find a single-byte code:
 *   - STRICT_BUILD: via JISX0201_R_ENCODE;
 *   - otherwise: c < 0x80 maps to itself, U+00A5 to 0x5c, U+203E to 0x7e;
 *   - then JISX0201_K_ENCODE, then UCS4INVALID, else code = NOCHAR.
 * A code below 0x80 or in 0xa1..0xdf is written as one byte.
 *
 * Stage 2 (code == NOCHAR) looks c up in jisxcommon (plus U+FF3C ->
 * 0x2140 when STRICT_BUILD is not defined), rejects codes with bit 0x8000
 * set (JIS X 0212), and folds the JIS X 0208 code into two Shift_JIS
 * bytes.
 *
 * Returns 0 when inleft reaches zero and 1 for an unencodable character.
 * The JISX0201_* / UCS4INVALID / REQUIRE_OUTBUF macros are defined outside
 * this file and may also return early.
 */
ENCODER(shift_jis)
{
    while (inleft > 0) {
        Py_UNICODE c = IN1;
        DBCHAR code;
        unsigned char c1, c2;

#ifdef STRICT_BUILD
        JISX0201_R_ENCODE(c, code)
#else
        if (c < 0x80) code = c;
        else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */
        else if (c == 0x203e) code = 0x7e; /* OVERLINE */
#endif
        else JISX0201_K_ENCODE(c, code)
        else UCS4INVALID(c)
        else code = NOCHAR;

        if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
            REQUIRE_OUTBUF(1)

            OUT1((unsigned char)code)
            NEXT(1, 1)
            continue;
        }

        REQUIRE_OUTBUF(2)

        if (code == NOCHAR) {
            TRYMAP_ENC(jisxcommon, code, c);
#ifndef STRICT_BUILD
            else if (c == 0xff3c)
                code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */
#endif
            else
                return 1;

            if (code & 0x8000) /* MSB set: JIS X 0212 */
                return 1;
        }

        /* Fold row/cell into Shift_JIS: two rows share one lead byte,
         * with cells of odd zero-based rows shifted up by 0x5e (94). */
        c1 = code >> 8;
        c2 = code & 0xff;
        c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
        c1 = (c1 - 0x21) >> 1;
        /* Lead bytes run 0x81.. then jump to 0xe0..; trail bytes run
         * 0x40..0x7e then 0x80.., skipping 0x7f. */
        OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
        OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
        NEXT(1, 2)
    }

    return 0;
}
485
/*
 * Shift_JIS decoder.
 *
 * Single bytes are handled by the JIS X 0201 macros (STRICT_BUILD) or by
 * the identity mapping for c < 0x80 plus JISX0201_K_DECODE; that path
 * falls through to the NEXT(1, 1) at the bottom of the loop.  Lead bytes
 * 0x81..0x9f / 0xe0..0xea take a trail byte, are unfolded into a
 * JIS X 0208 row/cell pair and looked up in jisx0208.  Unless
 * STRICT_BUILD is defined, JIS code 0x2140 is pinned to U+FF3C.
 *
 * Returns 0 when inleft reaches zero and 2 for an invalid or unmapped
 * byte sequence, including an unrecognised lead byte.  REQUIRE_* may also
 * return early; their definitions are not in this file.
 */
DECODER(shift_jis)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        REQUIRE_OUTBUF(1)

#ifdef STRICT_BUILD
        JISX0201_R_DECODE(c, **outbuf)
#else
        if (c < 0x80) **outbuf = c;
#endif
        else JISX0201_K_DECODE(c, **outbuf)
        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
            unsigned char c1, c2;

            REQUIRE_INBUF(2)
            c2 = IN2;
            /* Valid trail bytes are 0x40..0x7e and 0x80..0xfc; the middle
             * test rejects exactly 0x7f. */
            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                return 2;

            /* Undo the Shift_JIS folding: close the lead-byte gap and the
             * 0x7f hole, then split each lead byte into two rows of 0x5e
             * (94) cells and rebase both values to 0x21. */
            c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
            c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21);
            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;

#ifndef STRICT_BUILD
            if (c1 == 0x21 && c2 == 0x40) {
                /* FULL-WIDTH REVERSE SOLIDUS */
                OUT1(0xff3c)
                NEXT(2, 1)
                continue;
            }
#endif
            TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
                NEXT(2, 1)
                continue;
            }
            else
                return 2;
        }
        else
            return 2;

        NEXT(1, 1) /* JIS X 0201 */
    }

    return 0;
}
535
536
537 /*
538 * SHIFT_JIS-2004 codec
539 */
540
/*
 * Shift_JIS-2004 encoder.
 *
 * JISX0201_ENCODE is tried first; a resulting code below 0x80 or in
 * 0xa1..0xdf is written as one byte.  Otherwise, while code is still
 * NOCHAR:
 *   - c <= 0xffff: the JIS X 0213:2000 emulation macro, jisx0213_bmp
 *     (including the MULTIC pair lookup), then jisxcommon with
 *     JIS X 0212 codes (bit 0x8000 set) rejected.
 *   - c in plane 2 (c >> 16 == EMPBASE >> 16): the emulation macro, then
 *     jisx0213_emp keyed by the low 16 bits.
 * A MULTIC result means the code depends on the next input unit: the
 * (c, next) pair is looked up first and (c, 0) is the fallback.  With
 * fewer than two units left the standalone lookup is done only when
 * MBENC_FLUSH is set; otherwise MBERR_TOOFEW is returned.
 *
 * The resulting code is folded into two Shift_JIS bytes, with a separate
 * row adjustment for plane-2 codes (high bit of the row byte set).
 *
 * Returns 0 when inleft reaches zero, 1 for an unencodable BMP character,
 * insize for an unencodable non-BMP character, or MBERR_TOOFEW as above.
 * DECODE_SURROGATE / GET_INSIZE presumably merge a surrogate pair and
 * report its length in input units -- confirm against cjkcodecs.h.
 */
ENCODER(shift_jis_2004)
{
    while (inleft > 0) {
        ucs4_t c = IN1;
        DBCHAR code = NOCHAR;
        int c1, c2;
        Py_ssize_t insize;

        JISX0201_ENCODE(c, code)
        else DECODE_SURROGATE(c)

        if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
            WRITE1((unsigned char)code)
            NEXT(1, 1)
            continue;
        }

        REQUIRE_OUTBUF(2)
        insize = GET_INSIZE(c);

        if (code == NOCHAR) {
            if (c <= 0xffff) {
                EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
                else TRYMAP_ENC(jisx0213_bmp, code, c) {
                    if (code == MULTIC) {
                        if (inleft < 2) {
                            if (flags & MBENC_FLUSH) {
                                /* No following unit will arrive: encode
                                 * c on its own. */
                                code = find_pairencmap
                                    ((ucs2_t)c, 0,
                                  jisx0213_pair_encmap,
                                    JISX0213_ENCPAIRS);
                                if (code == DBCINV)
                                    return 1;
                            }
                            else
                                return MBERR_TOOFEW;
                        }
                        else {
                            code = find_pairencmap(
                                (ucs2_t)c, IN2,
                                jisx0213_pair_encmap,
                                JISX0213_ENCPAIRS);
                            if (code == DBCINV) {
                                code = find_pairencmap(
                                    (ucs2_t)c, 0,
                                  jisx0213_pair_encmap,
                                    JISX0213_ENCPAIRS);
                                if (code == DBCINV)
                                    return 1;
                            }
                            else
                                insize = 2; /* the pair consumed both units */
                        }
                    }
                }
                else TRYMAP_ENC(jisxcommon, code, c) {
                    /* abandon JIS X 0212 codes */
                    if (code & 0x8000)
                        return 1;
                }
                else return 1;
            }
            else if (c >> 16 == EMPBASE >> 16) {
                EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
                else TRYMAP_ENC(jisx0213_emp, code, c&0xffff);
                else return insize;
            }
            else
                return insize;
        }

        c1 = code >> 8;
        c2 = (code & 0xff) - 0x21;

        if (c1 & 0x80) { /* Plane 2 */
            /* Three different offsets depending on the row; presumably
             * they pack the non-contiguous plane-2 rows into the index
             * space after plane 1 -- constants taken as given. */
            if (c1 >= 0xee) c1 -= 0x87;
            else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49;
            else c1 -= 0x43;
        }
        else /* Plane 1 */
            c1 -= 0x21;

        /* Two rows share one lead byte: cells of odd rows are shifted up
         * by 0x5e (94).  Lead bytes run 0x81.. then jump to 0xe0..; trail
         * bytes run 0x40..0x7e then 0x80.., skipping 0x7f. */
        if (c1 & 1) c2 += 0x5e;
        c1 >>= 1;
        OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
        OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))

        NEXT(insize, 2)
    }

    return 0;
}
633
/*
 * Shift_JIS-2004 decoder.
 *
 * JISX0201_DECODE handles the single-byte path, which falls through to
 * the NEXT(1, 1) at the bottom of the loop.  Lead bytes 0x81..0x9f /
 * 0xe0..0xfc take a trail byte and are unfolded into a zero-based row
 * index c1 and a cell c2:
 *   - c1 < 0x5e (plane 1): the JIS X 0213:2000 emulation macro, jisx0208,
 *     jisx0213_1_bmp, jisx0213_1_emp, and jisx0213_pair (two output
 *     units).
 *   - otherwise (plane 2): c1 is remapped to a plane-2 row, then the
 *     emulation macro, jisx0213_2_bmp and jisx0213_2_emp are tried.
 *
 * Returns 0 when inleft reaches zero and 2 for an invalid or unmapped
 * byte sequence, including an unrecognised lead byte.  REQUIRE_* and
 * WRITE* may also return early; their definitions are not in this file.
 */
DECODER(shift_jis_2004)
{
    while (inleft > 0) {
        unsigned char c = IN1;

        REQUIRE_OUTBUF(1)
        JISX0201_DECODE(c, **outbuf)
        else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
            unsigned char c1, c2;
            ucs4_t code;

            REQUIRE_INBUF(2)
            c2 = IN2;
            /* Valid trail bytes are 0x40..0x7e and 0x80..0xfc; the middle
             * test rejects exactly 0x7f. */
            if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
                return 2;

            /* Undo the Shift_JIS folding.  Unlike the shift_jis decoder,
             * c1 stays zero-based here so it can select the plane. */
            c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
            c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
            c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1));
            c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;

            if (c1 < 0x5e) { /* Plane 1 */
                c1 += 0x21;
                /* Each hit advances the output on its own terms; the
                 * shared NEXT_IN(2) below consumes the two input bytes. */
                EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf,
                                c1, c2)
                else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
                    NEXT_OUT(1)
                }
                else TRYMAP_DEC(jisx0213_1_bmp, **outbuf,
                                c1, c2) {
                    NEXT_OUT(1)
                }
                else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) {
                    /* No NEXT_OUT here: WRITEUCS4 presumably advances
                     * the output itself -- confirm against cjkcodecs.h. */
                    WRITEUCS4(EMPBASE | code)
                }
                else TRYMAP_DEC(jisx0213_pair, code, c1, c2) {
                    /* One code decodes to two output units packed into
                     * the high and low 16 bits of the table entry. */
                    WRITE2(code >> 16, code & 0xffff)
                    NEXT_OUT(2)
                }
                else
                    return 2;
                NEXT_IN(2)
            }
            else { /* Plane 2 */
                /* Inverse of the encoder's plane-2 row packing; the
                 * constants are taken as given. */
                if (c1 >= 0x67) c1 += 0x07;
                else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37;
                else c1 -= 0x3d;

                EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf,
                                c1, c2)
                else TRYMAP_DEC(jisx0213_2_bmp, **outbuf,
                                c1, c2) ;
                else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) {
                    WRITEUCS4(EMPBASE | code)
                    NEXT_IN(2)
                    continue;
                }
                else
                    return 2;
                NEXT(2, 1)
            }
            continue;
        }
        else
            return 2;

        NEXT(1, 1) /* JIS X 0201 */
    }

    return 0;
}
705
706
/*
 * Mapping tables listed for this module.  The DECONLY / ENCONLY / ENCDEC
 * suffix presumably states which direction(s) of each table exist --
 * confirm against cjkcodecs.h.  Within this file the DECONLY tables are
 * used only with TRYMAP_DEC and the ENCONLY tables only with TRYMAP_ENC.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(jisx0208)
  MAPPING_DECONLY(jisx0212)
  MAPPING_ENCONLY(jisxcommon)
  MAPPING_DECONLY(jisx0213_1_bmp)
  MAPPING_DECONLY(jisx0213_2_bmp)
  MAPPING_ENCONLY(jisx0213_bmp)
  MAPPING_DECONLY(jisx0213_1_emp)
  MAPPING_DECONLY(jisx0213_2_emp)
  MAPPING_ENCONLY(jisx0213_emp)
  MAPPING_ENCDEC(jisx0213_pair)
  MAPPING_ENCDEC(cp932ext)
END_MAPPINGS_LIST
720
/*
 * Codecs listed for this module.  "euc_jisx0213" and "shift_jisx0213"
 * reuse the *_2004 encoder/decoder pairs but carry (void *)2000 in their
 * second field -- presumably the config value that the
 * EMULATE_JISX0213_2000_* macros test to reproduce the 2000 edition;
 * confirm against emu_jisx0213_2000.h.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(shift_jis)
  CODEC_STATELESS(cp932)
  CODEC_STATELESS(euc_jp)
  CODEC_STATELESS(shift_jis_2004)
  CODEC_STATELESS(euc_jis_2004)
  { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) },
  { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) },
END_CODECS_LIST

/* Module boilerplate for the "jp" codec collection; the macro is defined
 * outside this file. */
I_AM_A_MODULE_FOR(jp)
732