1 /*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7 #include "cjkcodecs.h"
8 #include "mappings_cn.h"
9
/**
 * AIX predefines "hz" as a macro with the value 100.  Undefine it so
 * that it does not conflict with the identifiers of the hz codec.
 */
14 #ifdef _AIX
15 #undef hz
16 #endif
17
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
25
/*
 * GBK_DECODE(dc1, dc2, writer): decode the two-byte sequence (dc1, dc2)
 * as GBK.
 *
 * The three code points that GBK maps differently from GB2312 are handled
 * first.  Anything else is looked up in the gb2312 table (with bit 0x80 of
 * both bytes flipped) and then in the gbkext table.  A hit is emitted with
 * OUTCHAR().
 *
 * The expansion is an if / else-if chain with no final else: the call site
 * must supply the trailing "else" that handles the no-match case.  A
 * variable named "decoded" must be in scope (the callers in this file
 * declare it as Py_UCS4).  The "writer" argument is not referenced by the
 * expansion.  dc1 and dc2 are evaluated more than once, so they must be
 * free of side effects; they are parenthesized everywhere so that any
 * expression can be passed safely.
 */
#define GBK_DECODE(dc1, dc2, writer)                                      \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) {                                 \
        OUTCHAR(0x2014);                                                  \
    }                                                                     \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) {                            \
        OUTCHAR(0x2015);                                                  \
    }                                                                     \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                            \
        OUTCHAR(0x00b7);                                                  \
    }                                                                     \
    else if (TRYMAP_DEC(gb2312, decoded, (dc1) ^ 0x80, (dc2) ^ 0x80)) {   \
        OUTCHAR(decoded);                                                 \
    }                                                                     \
    else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {                 \
        OUTCHAR(decoded);                                                 \
    }
42
/*
 * GBK_ENCODE(code, assi): find the GBK double-byte value for the character
 * `code` and store it in `assi`.
 *
 * U+2014, U+2015 and U+00B7 get the fixed GBK values 0xa1aa, 0xa844 and
 * 0xa1a4.  U+30FB is never looked up, so it always falls through to the
 * caller's else branch.  Every other character is looked up in the
 * gbcommon table.
 *
 * The expansion is a single "if" with an empty body and no else: the call
 * site must supply the trailing "else" that handles the no-match case.
 * `code` is evaluated more than once.
 */
#define GBK_ENCODE(code, assi)                                          \
    if (((code) == 0x2014) ? ((assi) = 0xa1aa, 1)                       \
        : ((code) == 0x2015) ? ((assi) = 0xa844, 1)                     \
        : ((code) == 0x00b7) ? ((assi) = 0xa1a4, 1)                     \
        : ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code))) {     \
        ;                                                               \
    }
53
/*
 * The stateful codec in this file (hz) keeps its shift state in a single
 * byte of MultibyteCodec_State.c[8], at index CN_STATE_OFFSET.  The byte
 * holds 0 while in ASCII mode and 1 while in GB mode.
 */
#define CN_STATE_OFFSET 0
59
60 /*
61 * GB2312 codec
62 */
63
ENCODER(gb2312)64 ENCODER(gb2312)
65 {
66 while (*inpos < inlen) {
67 Py_UCS4 c = INCHAR1;
68 DBCHAR code;
69
70 if (c < 0x80) {
71 WRITEBYTE1((unsigned char)c);
72 NEXT(1, 1);
73 continue;
74 }
75
76 if (c > 0xFFFF)
77 return 1;
78
79 REQUIRE_OUTBUF(2);
80 if (TRYMAP_ENC(gbcommon, code, c))
81 ;
82 else
83 return 1;
84
85 if (code & 0x8000) /* MSB set: GBK */
86 return 1;
87
88 OUTBYTE1((code >> 8) | 0x80);
89 OUTBYTE2((code & 0xFF) | 0x80);
90 NEXT(1, 2);
91 }
92
93 return 0;
94 }
95
DECODER(gb2312)96 DECODER(gb2312)
97 {
98 while (inleft > 0) {
99 unsigned char c = **inbuf;
100 Py_UCS4 decoded;
101
102 if (c < 0x80) {
103 OUTCHAR(c);
104 NEXT_IN(1);
105 continue;
106 }
107
108 REQUIRE_INBUF(2);
109 if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
110 OUTCHAR(decoded);
111 NEXT_IN(2);
112 }
113 else
114 return 1;
115 }
116
117 return 0;
118 }
119
120
121 /*
122 * GBK codec
123 */
124
ENCODER(gbk)125 ENCODER(gbk)
126 {
127 while (*inpos < inlen) {
128 Py_UCS4 c = INCHAR1;
129 DBCHAR code;
130
131 if (c < 0x80) {
132 WRITEBYTE1((unsigned char)c);
133 NEXT(1, 1);
134 continue;
135 }
136
137 if (c > 0xFFFF)
138 return 1;
139
140 REQUIRE_OUTBUF(2);
141
142 GBK_ENCODE(c, code)
143 else
144 return 1;
145
146 OUTBYTE1((code >> 8) | 0x80);
147 if (code & 0x8000)
148 OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
149 else
150 OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
151 NEXT(1, 2);
152 }
153
154 return 0;
155 }
156
DECODER(gbk)157 DECODER(gbk)
158 {
159 while (inleft > 0) {
160 unsigned char c = INBYTE1;
161 Py_UCS4 decoded;
162
163 if (c < 0x80) {
164 OUTCHAR(c);
165 NEXT_IN(1);
166 continue;
167 }
168
169 REQUIRE_INBUF(2);
170
171 GBK_DECODE(c, INBYTE2, writer)
172 else
173 return 1;
174
175 NEXT_IN(2);
176 }
177
178 return 0;
179 }
180
181
182 /*
183 * GB18030 codec
184 */
185
ENCODER(gb18030)186 ENCODER(gb18030)
187 {
188 while (*inpos < inlen) {
189 Py_UCS4 c = INCHAR1;
190 DBCHAR code;
191
192 if (c < 0x80) {
193 WRITEBYTE1(c);
194 NEXT(1, 1);
195 continue;
196 }
197
198 if (c >= 0x10000) {
199 Py_UCS4 tc = c - 0x10000;
200 assert (c <= 0x10FFFF);
201
202 REQUIRE_OUTBUF(4);
203
204 OUTBYTE4((unsigned char)(tc % 10) + 0x30);
205 tc /= 10;
206 OUTBYTE3((unsigned char)(tc % 126) + 0x81);
207 tc /= 126;
208 OUTBYTE2((unsigned char)(tc % 10) + 0x30);
209 tc /= 10;
210 OUTBYTE1((unsigned char)(tc + 0x90));
211
212 NEXT(1, 4);
213 continue;
214 }
215
216 REQUIRE_OUTBUF(2);
217
218 GBK_ENCODE(c, code)
219 else if (TRYMAP_ENC(gb18030ext, code, c))
220 ;
221 else {
222 const struct _gb18030_to_unibmp_ranges *utrrange;
223
224 REQUIRE_OUTBUF(4);
225
226 for (utrrange = gb18030_to_unibmp_ranges;
227 utrrange->first != 0;
228 utrrange++)
229 if (utrrange->first <= c &&
230 c <= utrrange->last) {
231 Py_UCS4 tc;
232
233 tc = c - utrrange->first +
234 utrrange->base;
235
236 OUTBYTE4((unsigned char)(tc % 10) + 0x30);
237 tc /= 10;
238 OUTBYTE3((unsigned char)(tc % 126) + 0x81);
239 tc /= 126;
240 OUTBYTE2((unsigned char)(tc % 10) + 0x30);
241 tc /= 10;
242 OUTBYTE1((unsigned char)tc + 0x81);
243
244 NEXT(1, 4);
245 break;
246 }
247
248 if (utrrange->first == 0)
249 return 1;
250 continue;
251 }
252
253 OUTBYTE1((code >> 8) | 0x80);
254 if (code & 0x8000)
255 OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
256 else
257 OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
258
259 NEXT(1, 2);
260 }
261
262 return 0;
263 }
264
DECODER(gb18030)265 DECODER(gb18030)
266 {
267 while (inleft > 0) {
268 unsigned char c = INBYTE1, c2;
269 Py_UCS4 decoded;
270
271 if (c < 0x80) {
272 OUTCHAR(c);
273 NEXT_IN(1);
274 continue;
275 }
276
277 REQUIRE_INBUF(2);
278
279 c2 = INBYTE2;
280 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
281 const struct _gb18030_to_unibmp_ranges *utr;
282 unsigned char c3, c4;
283 Py_UCS4 lseq;
284
285 REQUIRE_INBUF(4);
286 c3 = INBYTE3;
287 c4 = INBYTE4;
288 if (c < 0x81 || c > 0xFE ||
289 c3 < 0x81 || c3 > 0xFE ||
290 c4 < 0x30 || c4 > 0x39)
291 return 1;
292 c -= 0x81; c2 -= 0x30;
293 c3 -= 0x81; c4 -= 0x30;
294
295 if (c < 4) { /* U+0080 - U+FFFF */
296 lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
297 (Py_UCS4)c3 * 10 + c4;
298 if (lseq < 39420) {
299 for (utr = gb18030_to_unibmp_ranges;
300 lseq >= (utr + 1)->base;
301 utr++) ;
302 OUTCHAR(utr->first - utr->base + lseq);
303 NEXT_IN(4);
304 continue;
305 }
306 }
307 else if (c >= 15) { /* U+10000 - U+10FFFF */
308 lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
309 * 1260 + (Py_UCS4)c3 * 10 + c4;
310 if (lseq <= 0x10FFFF) {
311 OUTCHAR(lseq);
312 NEXT_IN(4);
313 continue;
314 }
315 }
316 return 1;
317 }
318
319 GBK_DECODE(c, c2, writer)
320 else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
321 OUTCHAR(decoded);
322 else
323 return 1;
324
325 NEXT_IN(2);
326 }
327
328 return 0;
329 }
330
331
332 /*
333 * HZ codec
334 */
335
/* Initialize the hz encoder state: state byte 0 (ASCII mode).  Returns 0. */
ENCODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
341
ENCODER_RESET(hz)342 ENCODER_RESET(hz)
343 {
344 if (state->c[CN_STATE_OFFSET] != 0) {
345 WRITEBYTE2('~', '}');
346 state->c[CN_STATE_OFFSET] = 0;
347 NEXT_OUT(2);
348 }
349 return 0;
350 }
351
ENCODER(hz)352 ENCODER(hz)
353 {
354 while (*inpos < inlen) {
355 Py_UCS4 c = INCHAR1;
356 DBCHAR code;
357
358 if (c < 0x80) {
359 if (state->c[CN_STATE_OFFSET]) {
360 WRITEBYTE2('~', '}');
361 NEXT_OUT(2);
362 state->c[CN_STATE_OFFSET] = 0;
363 }
364 WRITEBYTE1((unsigned char)c);
365 NEXT(1, 1);
366 if (c == '~') {
367 WRITEBYTE1('~');
368 NEXT_OUT(1);
369 }
370 continue;
371 }
372
373 if (c > 0xFFFF)
374 return 1;
375
376 if (TRYMAP_ENC(gbcommon, code, c))
377 ;
378 else
379 return 1;
380
381 if (code & 0x8000) /* MSB set: GBK */
382 return 1;
383
384 if (state->c[CN_STATE_OFFSET] == 0) {
385 WRITEBYTE4('~', '{', code >> 8, code & 0xff);
386 NEXT(1, 4);
387 state->c[CN_STATE_OFFSET] = 1;
388 }
389 else {
390 WRITEBYTE2(code >> 8, code & 0xff);
391 NEXT(1, 2);
392 }
393 }
394
395 return 0;
396 }
397
/* Initialize the hz decoder state: state byte 0 (ASCII mode).  Returns 0. */
DECODER_INIT(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
403
/* Reset the hz decoder: put the state byte back to 0 (ASCII mode).  Returns 0. */
DECODER_RESET(hz)
{
    state->c[CN_STATE_OFFSET] = 0;
    return 0;
}
409
DECODER(hz)410 DECODER(hz)
411 {
412 while (inleft > 0) {
413 unsigned char c = INBYTE1;
414 Py_UCS4 decoded;
415
416 if (c == '~') {
417 unsigned char c2 = INBYTE2;
418
419 REQUIRE_INBUF(2);
420 if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
421 OUTCHAR('~');
422 else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
423 state->c[CN_STATE_OFFSET] = 1; /* set GB */
424 else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
425 ; /* line-continuation */
426 else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
427 state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
428 else
429 return 1;
430 NEXT_IN(2);
431 continue;
432 }
433
434 if (c & 0x80)
435 return 1;
436
437 if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
438 OUTCHAR(c);
439 NEXT_IN(1);
440 }
441 else { /* GB mode */
442 REQUIRE_INBUF(2);
443 if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
444 OUTCHAR(decoded);
445 NEXT_IN(2);
446 }
447 else
448 return 1;
449 }
450 }
451
452 return 0;
453 }
454
455
/*
 * Mapping tables listed for this module.  Within this file, gb2312 and
 * gbkext are only consulted through TRYMAP_DEC, gbcommon only through
 * TRYMAP_ENC, and gb18030ext through both.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
462
/*
 * Codecs listed for this module.  gb2312, gbk and gb18030 define only an
 * ENCODER and a DECODER above; hz additionally defines INIT and RESET
 * functions for its shift state and is therefore listed as stateful.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
469
/* NOTE(review): presumably expands to the module boilerplate for the "cn"
   codec collection -- see the macro's definition in cjkcodecs.h. */
I_AM_A_MODULE_FOR(cn)
471