1 /*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7 #include "cjkcodecs.h"
8 #include "mappings_cn.h"
9
/*
 * AIX predefines `hz` as a macro (value 100).  Remove that definition so
 * it cannot clash with the name of the hz codec defined in this file.
 */
#if defined(_AIX)
#undef hz
#endif
17
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
25
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte sequence dc1,dc2 into
 * assi.
 *
 * The three code points that GBK assigns differently from GB2312 are
 * handled first; everything else is looked up in the gb2312 table (with
 * bit 7 stripped from both bytes) and then in the gbkext table.
 *
 * The expansion is an if/else-if chain WITHOUT a final else: the caller
 * must continue it with its own `else` clause to handle "no mapping".
 * Every use of dc1/dc2 is parenthesized so that arbitrary expressions can
 * be passed safely (e.g. `a | b` must not turn into `a | b ^ 0x80`).
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
32
/*
 * GBK_ENCODE(code, assi): encode the Unicode value `code` into the
 * double-byte value `assi`.
 *
 * The code points that GBK maps differently from GB2312 are special-cased;
 * U+30FB is deliberately excluded from the gbcommon lookup, and anything
 * else is taken from the gbcommon table.
 *
 * Like GBK_DECODE, this expands to an open-ended if/else-if chain: the
 * caller appends the `else` branch that handles "no mapping".
 */
#define GBK_ENCODE(code, assi) \
    if ((code) == 0x2014) { (assi) = 0xa1aa; } \
    else if ((code) == 0x2015) { (assi) = 0xa844; } \
    else if ((code) == 0x00b7) { (assi) = 0xa1a4; } \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code)) { }
38
39 /*
40 * GB2312 codec
41 */
42
ENCODER(gb2312)43 ENCODER(gb2312)
44 {
45 while (inleft > 0) {
46 Py_UNICODE c = IN1;
47 DBCHAR code;
48
49 if (c < 0x80) {
50 WRITE1((unsigned char)c)
51 NEXT(1, 1)
52 continue;
53 }
54 UCS4INVALID(c)
55
56 REQUIRE_OUTBUF(2)
57 TRYMAP_ENC(gbcommon, code, c);
58 else return 1;
59
60 if (code & 0x8000) /* MSB set: GBK */
61 return 1;
62
63 OUT1((code >> 8) | 0x80)
64 OUT2((code & 0xFF) | 0x80)
65 NEXT(1, 2)
66 }
67
68 return 0;
69 }
70
DECODER(gb2312)71 DECODER(gb2312)
72 {
73 while (inleft > 0) {
74 unsigned char c = **inbuf;
75
76 REQUIRE_OUTBUF(1)
77
78 if (c < 0x80) {
79 OUT1(c)
80 NEXT(1, 1)
81 continue;
82 }
83
84 REQUIRE_INBUF(2)
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86 NEXT(2, 1)
87 }
88 else return 2;
89 }
90
91 return 0;
92 }
93
94
95 /*
96 * GBK codec
97 */
98
ENCODER(gbk)99 ENCODER(gbk)
100 {
101 while (inleft > 0) {
102 Py_UNICODE c = IN1;
103 DBCHAR code;
104
105 if (c < 0x80) {
106 WRITE1((unsigned char)c)
107 NEXT(1, 1)
108 continue;
109 }
110 UCS4INVALID(c)
111
112 REQUIRE_OUTBUF(2)
113
114 GBK_ENCODE(c, code)
115 else return 1;
116
117 OUT1((code >> 8) | 0x80)
118 if (code & 0x8000)
119 OUT2((code & 0xFF)) /* MSB set: GBK */
120 else
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122 NEXT(1, 2)
123 }
124
125 return 0;
126 }
127
DECODER(gbk)128 DECODER(gbk)
129 {
130 while (inleft > 0) {
131 unsigned char c = IN1;
132
133 REQUIRE_OUTBUF(1)
134
135 if (c < 0x80) {
136 OUT1(c)
137 NEXT(1, 1)
138 continue;
139 }
140
141 REQUIRE_INBUF(2)
142
143 GBK_DECODE(c, IN2, **outbuf)
144 else return 2;
145
146 NEXT(2, 1)
147 }
148
149 return 0;
150 }
151
152
153 /*
154 * GB18030 codec
155 */
156
ENCODER(gb18030)157 ENCODER(gb18030)
158 {
159 while (inleft > 0) {
160 ucs4_t c = IN1;
161 DBCHAR code;
162
163 if (c < 0x80) {
164 WRITE1(c)
165 NEXT(1, 1)
166 continue;
167 }
168
169 DECODE_SURROGATE(c)
170 if (c > 0x10FFFF)
171 #if Py_UNICODE_SIZE == 2
172 return 2; /* surrogates pair */
173 #else
174 return 1;
175 #endif
176 else if (c >= 0x10000) {
177 ucs4_t tc = c - 0x10000;
178
179 REQUIRE_OUTBUF(4)
180
181 OUT4((unsigned char)(tc % 10) + 0x30)
182 tc /= 10;
183 OUT3((unsigned char)(tc % 126) + 0x81)
184 tc /= 126;
185 OUT2((unsigned char)(tc % 10) + 0x30)
186 tc /= 10;
187 OUT1((unsigned char)(tc + 0x90))
188
189 #if Py_UNICODE_SIZE == 2
190 NEXT(2, 4) /* surrogates pair */
191 #else
192 NEXT(1, 4)
193 #endif
194 continue;
195 }
196
197 REQUIRE_OUTBUF(2)
198
199 GBK_ENCODE(c, code)
200 else TRYMAP_ENC(gb18030ext, code, c);
201 else {
202 const struct _gb18030_to_unibmp_ranges *utrrange;
203
204 REQUIRE_OUTBUF(4)
205
206 for (utrrange = gb18030_to_unibmp_ranges;
207 utrrange->first != 0;
208 utrrange++)
209 if (utrrange->first <= c &&
210 c <= utrrange->last) {
211 Py_UNICODE tc;
212
213 tc = c - utrrange->first +
214 utrrange->base;
215
216 OUT4((unsigned char)(tc % 10) + 0x30)
217 tc /= 10;
218 OUT3((unsigned char)(tc % 126) + 0x81)
219 tc /= 126;
220 OUT2((unsigned char)(tc % 10) + 0x30)
221 tc /= 10;
222 OUT1((unsigned char)tc + 0x81)
223
224 NEXT(1, 4)
225 break;
226 }
227
228 if (utrrange->first == 0)
229 return 1;
230 continue;
231 }
232
233 OUT1((code >> 8) | 0x80)
234 if (code & 0x8000)
235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
236 else
237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
238
239 NEXT(1, 2)
240 }
241
242 return 0;
243 }
244
DECODER(gb18030)245 DECODER(gb18030)
246 {
247 while (inleft > 0) {
248 unsigned char c = IN1, c2;
249
250 REQUIRE_OUTBUF(1)
251
252 if (c < 0x80) {
253 OUT1(c)
254 NEXT(1, 1)
255 continue;
256 }
257
258 REQUIRE_INBUF(2)
259
260 c2 = IN2;
261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262 const struct _gb18030_to_unibmp_ranges *utr;
263 unsigned char c3, c4;
264 ucs4_t lseq;
265
266 REQUIRE_INBUF(4)
267 c3 = IN3;
268 c4 = IN4;
269 if (c < 0x81 || c > 0xFE ||
270 c3 < 0x81 || c3 > 0xFE ||
271 c4 < 0x30 || c4 > 0x39)
272 return 4;
273 c -= 0x81; c2 -= 0x30;
274 c3 -= 0x81; c4 -= 0x30;
275
276 if (c < 4) { /* U+0080 - U+FFFF */
277 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
278 (ucs4_t)c3 * 10 + c4;
279 if (lseq < 39420) {
280 for (utr = gb18030_to_unibmp_ranges;
281 lseq >= (utr + 1)->base;
282 utr++) ;
283 OUT1(utr->first - utr->base + lseq)
284 NEXT(4, 1)
285 continue;
286 }
287 }
288 else if (c >= 15) { /* U+10000 - U+10FFFF */
289 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
290 * 1260 + (ucs4_t)c3 * 10 + c4;
291 if (lseq <= 0x10FFFF) {
292 WRITEUCS4(lseq);
293 NEXT_IN(4)
294 continue;
295 }
296 }
297 return 4;
298 }
299
300 GBK_DECODE(c, c2, **outbuf)
301 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
302 else return 2;
303
304 NEXT(2, 1)
305 }
306
307 return 0;
308 }
309
310
311 /*
312 * HZ codec
313 */
314
/*
 * Initialise the HZ encoder state.  state->i is the shift flag used by
 * ENCODER(hz): 0 means ASCII mode, non-zero means GB mode.
 */
ENCODER_INIT(hz)
{
    state->i = 0;   /* start in ASCII mode */
    return 0;
}
320
ENCODER_RESET(hz)321 ENCODER_RESET(hz)
322 {
323 if (state->i != 0) {
324 WRITE2('~', '}')
325 state->i = 0;
326 NEXT_OUT(2)
327 }
328 return 0;
329 }
330
ENCODER(hz)331 ENCODER(hz)
332 {
333 while (inleft > 0) {
334 Py_UNICODE c = IN1;
335 DBCHAR code;
336
337 if (c < 0x80) {
338 if (state->i) {
339 WRITE2('~', '}')
340 NEXT_OUT(2)
341 state->i = 0;
342 }
343 WRITE1((unsigned char)c)
344 NEXT(1, 1)
345 if (c == '~') {
346 WRITE1('~')
347 NEXT_OUT(1)
348 }
349 continue;
350 }
351
352 UCS4INVALID(c)
353
354 TRYMAP_ENC(gbcommon, code, c);
355 else return 1;
356
357 if (code & 0x8000) /* MSB set: GBK */
358 return 1;
359
360 if (state->i == 0) {
361 WRITE4('~', '{', code >> 8, code & 0xff)
362 NEXT(1, 4)
363 state->i = 1;
364 }
365 else {
366 WRITE2(code >> 8, code & 0xff)
367 NEXT(1, 2)
368 }
369 }
370
371 return 0;
372 }
373
/*
 * Initialise the HZ decoder state.  state->i is the shift flag used by
 * DECODER(hz): 0 means ASCII mode, 1 means GB mode.
 */
DECODER_INIT(hz)
{
    state->i = 0;   /* start in ASCII mode */
    return 0;
}
379
/*
 * Reset the HZ decoder to ASCII mode (state->i == 0), discarding any
 * pending GB-mode shift.
 */
DECODER_RESET(hz)
{
    state->i = 0;   /* back to ASCII mode */
    return 0;
}
385
DECODER(hz)386 DECODER(hz)
387 {
388 while (inleft > 0) {
389 unsigned char c = IN1;
390
391 if (c == '~') {
392 unsigned char c2 = IN2;
393
394 REQUIRE_INBUF(2)
395 if (c2 == '~' && state->i == 0) {
396 WRITE1('~')
397 NEXT_OUT(1)
398 }
399 else if (c2 == '{' && state->i == 0)
400 state->i = 1; /* set GB */
401 else if (c2 == '\n' && state->i == 0)
402 ; /* line-continuation */
403 else if (c2 == '}' && state->i == 1)
404 state->i = 0; /* set ASCII */
405 else
406 return 2;
407 NEXT_IN(2)
408 continue;
409 }
410
411 if (c & 0x80)
412 return 1;
413
414 if (state->i == 0) { /* ASCII mode */
415 WRITE1(c)
416 NEXT(1, 1)
417 }
418 else { /* GB mode */
419 REQUIRE_INBUF(2)
420 REQUIRE_OUTBUF(1)
421 TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
422 NEXT(2, 1)
423 }
424 else
425 return 2;
426 }
427 }
428
429 return 0;
430 }
431
432
/*
 * Mapping tables used by the codecs in this file.  The registration kind
 * matches how each table is used above: gb2312 and gbkext are referenced
 * only through TRYMAP_DEC, gbcommon only through TRYMAP_ENC /
 * TRYMAP_ENC_COND, and gb18030ext through both.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
439
/*
 * Codecs defined in this file.  gb2312, gbk and gb18030 define only an
 * ENCODER and a DECODER; hz additionally defines INIT/RESET handlers and
 * keeps its shift state in state->i, hence CODEC_STATEFUL.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
446
/* NOTE(review): presumably expands to the module boilerplate for the "cn"
 * codec collection; the macro is defined outside this file -- confirm. */
I_AM_A_MODULE_FOR(cn)
448