• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3  *
4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
5  */
6 
7 #include "cjkcodecs.h"
8 #include "mappings_cn.h"
9 
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a
 * conflict with the name of the hz codec.
 */
14 #ifdef _AIX
15 #undef hz
16 #endif
17 
/* GBK and GB2312 map a few code points differently; they are listed below:
19  *
20  *              gb2312                          gbk
21  * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
22  * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
23  * A844         undefined                       U+2015 HORIZONTAL BAR
24  */
25 
/*
 * GBK_DECODE(dc1, dc2, writer): decode the two-byte GBK sequence dc1 dc2.
 *
 * Expands to an unterminated if / else-if chain, so the caller must follow
 * it with an `else` branch that handles undecodable input.  The three code
 * points that GBK maps differently from GB2312 are special-cased first;
 * everything else is looked up in the gb2312 table (with the high bit of
 * both bytes flipped off) and then in the gbkext table.
 *
 * A variable named `decoded` must be in scope where the macro is used; it
 * receives the result of the table lookups.  The `writer` argument is not
 * referenced by the expansion itself (OUTCHAR presumably picks `writer` up
 * from the enclosing scope -- confirm against cjkcodecs.h).
 *
 * dc1 and dc2 are evaluated several times, so they must be free of side
 * effects.  Every use is parenthesized so that compound expressions such as
 * `a | b` keep their meaning next to the `^ 0x80`.
 */
#define GBK_DECODE(dc1, dc2, writer)                                    \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) {                               \
        OUTCHAR(0x2014);                                                \
    }                                                                   \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) {                          \
        OUTCHAR(0x2015);                                                \
    }                                                                   \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                          \
        OUTCHAR(0x00b7);                                                \
    }                                                                   \
    else if (TRYMAP_DEC(gb2312, decoded, (dc1) ^ 0x80, (dc2) ^ 0x80)) { \
        OUTCHAR(decoded);                                               \
    }                                                                   \
    else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {               \
        OUTCHAR(decoded);                                               \
    }
42 
/*
 * GBK_ENCODE(code, assi): map the code point `code` to a GBK double-byte
 * value stored in `assi`.
 *
 * Expands to an unterminated if / else-if chain; the caller supplies the
 * trailing `else` for code points that could not be mapped.  The three code
 * points whose GBK mapping differs from GB2312 are answered directly.
 * U+30FB is deliberately kept away from the gbcommon table so that it ends
 * up in the caller's `else` branch.  `code` is evaluated several times and
 * must not have side effects.
 */
#define GBK_ENCODE(code, assi)                                       \
    if ((code) == 0x00b7) {                                          \
        (assi) = 0xa1a4;                                             \
    }                                                                \
    else if ((code) == 0x2014) {                                     \
        (assi) = 0xa1aa;                                             \
    }                                                                \
    else if ((code) == 0x2015) {                                     \
        (assi) = 0xa844;                                             \
    }                                                                \
    else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) { \
        ;                                                            \
    }
53 
/*
 * Codecs in this file use the first byte of MultibyteCodec_State.c[8]
 * to store a 0 or 1 state value.  Within this file only the hz codec
 * touches it: 0 means ASCII mode and 1 means GB mode.
 */
#define CN_STATE_OFFSET 0
59 
60 /*
61  * GB2312 codec
62  */
63 
ENCODER(gb2312)64 ENCODER(gb2312)
65 {
66     while (*inpos < inlen) {
67         Py_UCS4 c = INCHAR1;
68         DBCHAR code;
69 
70         if (c < 0x80) {
71             WRITEBYTE1((unsigned char)c);
72             NEXT(1, 1);
73             continue;
74         }
75 
76         if (c > 0xFFFF)
77             return 1;
78 
79         REQUIRE_OUTBUF(2);
80         if (TRYMAP_ENC(gbcommon, code, c))
81             ;
82         else
83             return 1;
84 
85         if (code & 0x8000) /* MSB set: GBK */
86             return 1;
87 
88         OUTBYTE1((code >> 8) | 0x80);
89         OUTBYTE2((code & 0xFF) | 0x80);
90         NEXT(1, 2);
91     }
92 
93     return 0;
94 }
95 
DECODER(gb2312)96 DECODER(gb2312)
97 {
98     while (inleft > 0) {
99         unsigned char c = **inbuf;
100         Py_UCS4 decoded;
101 
102         if (c < 0x80) {
103             OUTCHAR(c);
104             NEXT_IN(1);
105             continue;
106         }
107 
108         REQUIRE_INBUF(2);
109         if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
110             OUTCHAR(decoded);
111             NEXT_IN(2);
112         }
113         else
114             return 1;
115     }
116 
117     return 0;
118 }
119 
120 
121 /*
122  * GBK codec
123  */
124 
ENCODER(gbk)125 ENCODER(gbk)
126 {
127     while (*inpos < inlen) {
128         Py_UCS4 c = INCHAR1;
129         DBCHAR code;
130 
131         if (c < 0x80) {
132             WRITEBYTE1((unsigned char)c);
133             NEXT(1, 1);
134             continue;
135         }
136 
137         if (c > 0xFFFF)
138             return 1;
139 
140         REQUIRE_OUTBUF(2);
141 
142         GBK_ENCODE(c, code)
143         else
144             return 1;
145 
146         OUTBYTE1((code >> 8) | 0x80);
147         if (code & 0x8000)
148             OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
149         else
150             OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
151         NEXT(1, 2);
152     }
153 
154     return 0;
155 }
156 
DECODER(gbk)157 DECODER(gbk)
158 {
159     while (inleft > 0) {
160         unsigned char c = INBYTE1;
161         Py_UCS4 decoded;
162 
163         if (c < 0x80) {
164             OUTCHAR(c);
165             NEXT_IN(1);
166             continue;
167         }
168 
169         REQUIRE_INBUF(2);
170 
171         GBK_DECODE(c, INBYTE2, writer)
172         else
173             return 1;
174 
175         NEXT_IN(2);
176     }
177 
178     return 0;
179 }
180 
181 
182 /*
183  * GB18030 codec
184  */
185 
ENCODER(gb18030)186 ENCODER(gb18030)
187 {
188     while (*inpos < inlen) {
189         Py_UCS4 c = INCHAR1;
190         DBCHAR code;
191 
192         if (c < 0x80) {
193             WRITEBYTE1(c);
194             NEXT(1, 1);
195             continue;
196         }
197 
198         if (c >= 0x10000) {
199             Py_UCS4 tc = c - 0x10000;
200             assert (c <= 0x10FFFF);
201 
202             REQUIRE_OUTBUF(4);
203 
204             OUTBYTE4((unsigned char)(tc % 10) + 0x30);
205             tc /= 10;
206             OUTBYTE3((unsigned char)(tc % 126) + 0x81);
207             tc /= 126;
208             OUTBYTE2((unsigned char)(tc % 10) + 0x30);
209             tc /= 10;
210             OUTBYTE1((unsigned char)(tc + 0x90));
211 
212             NEXT(1, 4);
213             continue;
214         }
215 
216         REQUIRE_OUTBUF(2);
217 
218         GBK_ENCODE(c, code)
219         else if (TRYMAP_ENC(gb18030ext, code, c))
220             ;
221         else {
222             const struct _gb18030_to_unibmp_ranges *utrrange;
223 
224             REQUIRE_OUTBUF(4);
225 
226             for (utrrange = gb18030_to_unibmp_ranges;
227                  utrrange->first != 0;
228                  utrrange++)
229                 if (utrrange->first <= c &&
230                     c <= utrrange->last) {
231                     Py_UCS4 tc;
232 
233                     tc = c - utrrange->first +
234                          utrrange->base;
235 
236                     OUTBYTE4((unsigned char)(tc % 10) + 0x30);
237                     tc /= 10;
238                     OUTBYTE3((unsigned char)(tc % 126) + 0x81);
239                     tc /= 126;
240                     OUTBYTE2((unsigned char)(tc % 10) + 0x30);
241                     tc /= 10;
242                     OUTBYTE1((unsigned char)tc + 0x81);
243 
244                     NEXT(1, 4);
245                     break;
246                 }
247 
248             if (utrrange->first == 0)
249                 return 1;
250             continue;
251         }
252 
253         OUTBYTE1((code >> 8) | 0x80);
254         if (code & 0x8000)
255             OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
256         else
257             OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
258 
259         NEXT(1, 2);
260     }
261 
262     return 0;
263 }
264 
DECODER(gb18030)265 DECODER(gb18030)
266 {
267     while (inleft > 0) {
268         unsigned char c = INBYTE1, c2;
269         Py_UCS4 decoded;
270 
271         if (c < 0x80) {
272             OUTCHAR(c);
273             NEXT_IN(1);
274             continue;
275         }
276 
277         REQUIRE_INBUF(2);
278 
279         c2 = INBYTE2;
280         if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
281             const struct _gb18030_to_unibmp_ranges *utr;
282             unsigned char c3, c4;
283             Py_UCS4 lseq;
284 
285             REQUIRE_INBUF(4);
286             c3 = INBYTE3;
287             c4 = INBYTE4;
288             if (c  < 0x81 || c  > 0xFE ||
289                 c3 < 0x81 || c3 > 0xFE ||
290                 c4 < 0x30 || c4 > 0x39)
291                 return 1;
292             c -= 0x81;  c2 -= 0x30;
293             c3 -= 0x81; c4 -= 0x30;
294 
295             if (c < 4) { /* U+0080 - U+FFFF */
296                 lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
297                     (Py_UCS4)c3 * 10 + c4;
298                 if (lseq < 39420) {
299                     for (utr = gb18030_to_unibmp_ranges;
300                          lseq >= (utr + 1)->base;
301                          utr++) ;
302                     OUTCHAR(utr->first - utr->base + lseq);
303                     NEXT_IN(4);
304                     continue;
305                 }
306             }
307             else if (c >= 15) { /* U+10000 - U+10FFFF */
308                 lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
309                     * 1260 + (Py_UCS4)c3 * 10 + c4;
310                 if (lseq <= 0x10FFFF) {
311                     OUTCHAR(lseq);
312                     NEXT_IN(4);
313                     continue;
314                 }
315             }
316             return 1;
317         }
318 
319         GBK_DECODE(c, c2, writer)
320         else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
321             OUTCHAR(decoded);
322         else
323             return 1;
324 
325         NEXT_IN(2);
326     }
327 
328     return 0;
329 }
330 
331 
332 /*
333  * HZ codec
334  */
335 
/* Start the HZ encoder in ASCII mode (state value 0).  Always returns 0. */
ENCODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
341 
ENCODER_RESET(hz)342 ENCODER_RESET(hz)
343 {
344     if (state->c[CN_STATE_OFFSET] != 0) {
345         WRITEBYTE2('~', '}');
346         state->c[CN_STATE_OFFSET] = 0;
347         NEXT_OUT(2);
348     }
349     return 0;
350 }
351 
ENCODER(hz)352 ENCODER(hz)
353 {
354     while (*inpos < inlen) {
355         Py_UCS4 c = INCHAR1;
356         DBCHAR code;
357 
358         if (c < 0x80) {
359             if (state->c[CN_STATE_OFFSET]) {
360                 WRITEBYTE2('~', '}');
361                 NEXT_OUT(2);
362                 state->c[CN_STATE_OFFSET] = 0;
363             }
364             WRITEBYTE1((unsigned char)c);
365             NEXT(1, 1);
366             if (c == '~') {
367                 WRITEBYTE1('~');
368                 NEXT_OUT(1);
369             }
370             continue;
371         }
372 
373         if (c > 0xFFFF)
374             return 1;
375 
376         if (TRYMAP_ENC(gbcommon, code, c))
377             ;
378         else
379             return 1;
380 
381         if (code & 0x8000) /* MSB set: GBK */
382             return 1;
383 
384         if (state->c[CN_STATE_OFFSET] == 0) {
385             WRITEBYTE4('~', '{', code >> 8, code & 0xff);
386             NEXT(1, 4);
387             state->c[CN_STATE_OFFSET] = 1;
388         }
389         else {
390             WRITEBYTE2(code >> 8, code & 0xff);
391             NEXT(1, 2);
392         }
393     }
394 
395     return 0;
396 }
397 
/* Start the HZ decoder in ASCII mode (state value 0).  Always returns 0. */
DECODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
403 
/* Put the HZ decoder back into ASCII mode (state value 0).  Always
   returns 0. */
DECODER_RESET(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
409 
DECODER(hz)410 DECODER(hz)
411 {
412     while (inleft > 0) {
413         unsigned char c = INBYTE1;
414         Py_UCS4 decoded;
415 
416         if (c == '~') {
417             unsigned char c2 = INBYTE2;
418 
419             REQUIRE_INBUF(2);
420             if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
421                 OUTCHAR('~');
422             else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
423                 state->c[CN_STATE_OFFSET] = 1; /* set GB */
424             else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
425                 ; /* line-continuation */
426             else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
427                 state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
428             else
429                 return 1;
430             NEXT_IN(2);
431             continue;
432         }
433 
434         if (c & 0x80)
435             return 1;
436 
437         if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
438             OUTCHAR(c);
439             NEXT_IN(1);
440         }
441         else { /* GB mode */
442             REQUIRE_INBUF(2);
443             if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
444                 OUTCHAR(decoded);
445                 NEXT_IN(2);
446             }
447             else
448                 return 1;
449         }
450     }
451 
452     return 0;
453 }
454 
455 
/*
 * Mapping tables registered by this module.  The registration kind matches
 * how each table is used in this file: gb2312 and gbkext are only consulted
 * through TRYMAP_DEC, gbcommon only through TRYMAP_ENC, and gb18030ext
 * through both.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST

/*
 * Codecs registered by this module.  hz is the only one registered as
 * stateful; it is also the only codec in this file that defines INIT and
 * RESET hooks and keeps a mode flag in the codec state.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST

/* NOTE(review): presumably expands to the module definition/init code for
   the "cn" codec collection -- confirm against cjkcodecs.h. */
I_AM_A_MODULE_FOR(cn)
471