1 /*
2 * cjkcodecs.h: common header for cjkcodecs
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7 #ifndef _CJKCODECS_H_
8 #define _CJKCODECS_H_
9
10 #ifndef Py_BUILD_CORE_BUILTIN
11 # define Py_BUILD_CORE_MODULE 1
12 #endif
13
14 #include "Python.h"
15 #include "multibytecodec.h"
16 #include "pycore_import.h" // _PyImport_GetModuleAttrString()
17
18
/* a unicode "undefined" code point; decode tables use it to mark entries
 * with no mapping (see the `!= UNIINV` test in _TRYMAP_DEC) */
#define UNIINV  0xFFFE

/* internal-use DBCS code points which aren't used by any charsets */
/* NOCHAR: encode-table entry with no mapping (see the `!= NOCHAR` test in
 * _TRYMAP_ENC) */
#define NOCHAR  0xFFFF
/* MULTIC: NOTE(review): not referenced anywhere in this header; presumably
 * flags a multi-code-point sequence -- confirm against the codecs that use
 * it.  It has the same numeric value as UNIINV. */
#define MULTIC  0xFFFE
/* DBCINV: returned by find_pairencmap() when no entry matches */
#define DBCINV  0xFFFD

/* shorter macros to save source size of mapping tables */
#define U UNIINV
#define N NOCHAR
#define M MULTIC
#define D DBCINV
32
33 struct dbcs_index {
34 const ucs2_t *map;
35 unsigned char bottom, top;
36 };
37 typedef struct dbcs_index decode_map;
38
39 struct widedbcs_index {
40 const Py_UCS4 *map;
41 unsigned char bottom, top;
42 };
43 typedef struct widedbcs_index widedecode_map;
44
45 struct unim_index {
46 const DBCHAR *map;
47 unsigned char bottom, top;
48 };
49 typedef struct unim_index encode_map;
50
/* Same layout as struct unim_index, but each table entry is a single
 * unsigned char. */
struct unim_index_bytebased {
    const unsigned char *map;
    unsigned char bottom;
    unsigned char top;
};
55
56 struct dbcs_map {
57 const char *charset;
58 const struct unim_index *encmap;
59 const struct dbcs_index *decmap;
60 };
61
/* Entry of a table searched by find_pairencmap(). */
struct pair_encodemap {
    /* Search key; find_pairencmap() compares it against
     * (body << 16 | modifier). */
    Py_UCS4 uniseq;
    /* Value returned by find_pairencmap() when uniseq matches. */
    DBCHAR code;
};
66
/* CJK_MOD_SPECIFIC_STATE is spliced into the end of the module state
 * struct below.  It defaults to nothing; a definition made before this
 * point is kept as-is. */
#ifndef CJK_MOD_SPECIFIC_STATE
#define CJK_MOD_SPECIFIC_STATE
#endif

/* Per-module state, filled in by add_mappings() and add_codecs() and
 * released by _cjk_free(). */
typedef struct _cjk_mod_state {
    int num_mappings;               /* number of entries in mapping_list */
    int num_codecs;                 /* number of entries in codec_list */
    struct dbcs_map *mapping_list;  /* allocated with PyMem_Calloc */
    MultibyteCodec *codec_list;     /* allocated with PyMem_Calloc */

    CJK_MOD_SPECIFIC_STATE
} cjkcodecs_module_state;
79
80 static inline cjkcodecs_module_state *
get_module_state(PyObject * mod)81 get_module_state(PyObject *mod)
82 {
83 void *state = PyModule_GetState(mod);
84 assert(state != NULL);
85 return (cjkcodecs_module_state *)state;
86 }
87
/* Expands to the header of the static function <encoding>_codec_init(),
 * which takes the codec and returns int.  Only the header is produced, so
 * the function body must follow the macro invocation.  The generated name
 * is the one CODEC_STATELESS_WINIT stores in the codec entry. */
#define CODEC_INIT(encoding)                                            \
    static int encoding##_codec_init(const MultibyteCodec *codec)
90
/* Function-header macros for the encoder half of a codec.  Each expands to
 * the header of a static function named after `encoding`; the body must
 * follow the invocation.  The parameter names are fixed here because the
 * helper macros in this file refer to them directly (`kind`, `data`,
 * `inpos`, `outbuf`, `outleft`, `codec`). */

/* <encoding>_encode_init(state, codec) -> int */
#define ENCODER_INIT(encoding)                                          \
    static int encoding##_encode_init(                                  \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
/* <encoding>_encode(state, codec, kind, data, inpos, inlen, outbuf,
 *                   outleft, flags) -> Py_ssize_t */
#define ENCODER(encoding)                                               \
    static Py_ssize_t encoding##_encode(                                \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        int kind, const void *data,                                     \
        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)
/* <encoding>_encode_reset(state, codec, outbuf, outleft) -> Py_ssize_t */
#define ENCODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_encode_reset(                          \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        unsigned char **outbuf, Py_ssize_t outleft)
104
/* Function-header macros for the decoder half of a codec.  Each expands to
 * the header of a static function named after `encoding`; the body must
 * follow the invocation.  The parameter names are fixed here because the
 * helper macros in this file refer to them directly (`inbuf`, `inleft`,
 * `writer`, `codec`). */

/* <encoding>_decode_init(state, codec) -> int */
#define DECODER_INIT(encoding)                                          \
    static int encoding##_decode_init(                                  \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
/* <encoding>_decode(state, codec, inbuf, inleft, writer) -> Py_ssize_t */
#define DECODER(encoding)                                               \
    static Py_ssize_t encoding##_decode(                                \
        MultibyteCodec_State *state, const MultibyteCodec *codec,       \
        const unsigned char **inbuf, Py_ssize_t inleft,                 \
        _PyUnicodeWriter *writer)
/* <encoding>_decode_reset(state, codec) -> Py_ssize_t */
#define DECODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_decode_reset(                          \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
116
/* Cursor-advancing helpers.  They operate on variables of the enclosing
 * function (`inbuf`/`inleft`, `inpos`, `outbuf`/`outleft`), not on
 * arguments.  NEXT_IN and NEXT_OUT expand their argument twice, so it must
 * not have side effects. */

/* Advance the input byte pointer by i and shrink inleft by i. */
#define NEXT_IN(i)                              \
    do {                                        \
        (*inbuf) += (i);                        \
        (inleft) -= (i);                        \
    } while (0)
/* Advance the input character index *inpos by i. */
#define NEXT_INCHAR(i)                          \
    do {                                        \
        (*inpos) += (i);                        \
    } while (0)
/* Advance the output byte pointer by o and shrink outleft by o. */
#define NEXT_OUT(o)                             \
    do {                                        \
        (*outbuf) += (o);                       \
        (outleft) -= (o);                       \
    } while (0)
/* NEXT_INCHAR(i) followed by NEXT_OUT(o). */
#define NEXT(i, o)                              \
    do {                                        \
        NEXT_INCHAR(i);                         \
        NEXT_OUT(o);                            \
    } while (0)
136
/* Return MBERR_TOOFEW from the enclosing function when fewer than n input
 * bytes remain.  Reads the enclosing function's `inleft`. */
#define REQUIRE_INBUF(n)                        \
    do {                                        \
        if (inleft < (n))                       \
            return MBERR_TOOFEW;                \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function when fewer than n
 * output bytes remain.  Reads the enclosing function's `outleft`. */
#define REQUIRE_OUTBUF(n)                       \
    do {                                        \
        if (outleft < (n))                      \
            return MBERR_TOOSMALL;              \
    } while (0)
148
/* Input bytes at offsets 0..3 from the current *inbuf position.  No bounds
 * check is performed here. */
#define INBYTE1 ((*inbuf)[0])
#define INBYTE2 ((*inbuf)[1])
#define INBYTE3 ((*inbuf)[2])
#define INBYTE4 ((*inbuf)[3])

/* Input characters at index *inpos and *inpos + 1, read from `data` with
 * PyUnicode_READ using `kind`.  No bounds check is performed here. */
#define INCHAR1 (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2 (PyUnicode_READ(kind, data, *inpos + 1))
156
/* Append character c to the enclosing function's `writer`.  If
 * _PyUnicodeWriter_WriteChar reports failure, return MBERR_EXCEPTION from
 * the enclosing function. */
#define OUTCHAR(c)                                                         \
    do {                                                                   \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0)                   \
            return MBERR_EXCEPTION;                                        \
    } while (0)
162
/* Append the two characters c1 and c2 to the enclosing function's
 * `writer`.  Room for both is reserved with a single
 * _PyUnicodeWriter_Prepare call, using the larger of the two as the
 * maximum character; on failure MBERR_EXCEPTION is returned from the
 * enclosing function.
 *
 * Each argument is evaluated exactly once: both are copied into locals
 * and only the locals are used afterwards, including inside Py_MAX. */
#define OUTCHAR2(c1, c2)                                                   \
    do {                                                                   \
        Py_UCS4 _c1 = (c1);                                                \
        Py_UCS4 _c2 = (c2);                                                \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)     \
            return MBERR_EXCEPTION;                                        \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                                  \
    } while (0)
173
/* Store c at offset i from the current *outbuf position.  The assert
 * checks that c fits in an unsigned char.  No room check is performed and
 * the output pointer is not advanced.  c is expanded more than once. */
#define OUTBYTEI(c, i)                     \
    do {                                   \
        assert((unsigned char)(c) == (c)); \
        ((*outbuf)[i]) = (c);              \
    } while (0)

/* Store c at output offset 0, 1, 2 or 3 respectively. */
#define OUTBYTE1(c) OUTBYTEI(c, 0)
#define OUTBYTE2(c) OUTBYTEI(c, 1)
#define OUTBYTE3(c) OUTBYTEI(c, 2)
#define OUTBYTE4(c) OUTBYTEI(c, 3)
184
/* WRITEBYTEn: check that n output bytes are available (REQUIRE_OUTBUF
 * returns MBERR_TOOSMALL from the enclosing function otherwise), then
 * store the n bytes at output offsets 0..n-1.  The output pointer is not
 * advanced here. */
#define WRITEBYTE1(c1)              \
    do {                            \
        REQUIRE_OUTBUF(1);          \
        OUTBYTE1(c1);               \
    } while (0)
#define WRITEBYTE2(c1, c2)          \
    do {                            \
        REQUIRE_OUTBUF(2);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
    } while (0)
#define WRITEBYTE3(c1, c2, c3)      \
    do {                            \
        REQUIRE_OUTBUF(3);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
        OUTBYTE3(c3);               \
    } while (0)
#define WRITEBYTE4(c1, c2, c3, c4)  \
    do {                            \
        REQUIRE_OUTBUF(4);          \
        OUTBYTE1(c1);               \
        OUTBYTE2(c2);               \
        OUTBYTE3(c3);               \
        OUTBYTE4(c4);               \
    } while (0)
211
/* Encode-table lookup.  Evaluates to true, with the table entry assigned
 * to `assi`, when row `m` has a map, `val` lies in [bottom, top] and the
 * entry is not NOCHAR.  `assi` is only assigned once the range checks have
 * passed.  `m` and `val` are expanded several times. */
#define _TRYMAP_ENC(m, assi, val)                               \
    ((m)->map != NULL && (val) >= (m)->bottom &&                \
        (val)<= (m)->top && ((assi) = (m)->map[(val) -          \
        (m)->bottom]) != NOCHAR)
/* Look `uni` up in the table <charset>_encmap: the row is selected by
 * uni >> 8 and the entry by uni & 0xff. */
#define TRYMAP_ENC(charset, assi, uni)                          \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
/* Same as TRYMAP_ENC, but the table is taken from
 * codec->modstate-><charset>_encmap; requires `codec` in scope. */
#define TRYMAP_ENC_ST(charset, assi, uni)                       \
    _TRYMAP_ENC(&(codec->modstate->charset##_encmap)[(uni) >> 8], \
        assi, (uni) & 0xff)
221
/* Decode-table lookup.  Evaluates to true, with the table entry assigned
 * to `assi`, when row `m` has a map, `val` lies in [bottom, top] and the
 * entry is not UNIINV.  `assi` is only assigned once the range checks have
 * passed.  `m` and `val` are expanded several times. */
#define _TRYMAP_DEC(m, assi, val)                             \
    ((m)->map != NULL &&                                        \
     (val) >= (m)->bottom &&                                    \
     (val)<= (m)->top &&                                        \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)
/* Look the pair (c1, c2) up in the table <charset>_decmap: the row is
 * selected by c1 and the entry by c2. */
#define TRYMAP_DEC(charset, assi, c1, c2)                     \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
/* Same as TRYMAP_DEC, but the table is taken from
 * codec->modstate-><charset>_decmap; requires `codec` in scope. */
#define TRYMAP_DEC_ST(charset, assi, c1, c2)                  \
    _TRYMAP_DEC(&(codec->modstate->charset##_decmap)[c1], assi, c2)
231
/* BEGIN_MAPPINGS_LIST(NUM) ... END_MAPPINGS_LIST together define
 * add_mappings(), which allocates st->mapping_list with room for NUM
 * entries and fills it from the MAPPING_* lines placed in between.
 *
 * add_mappings() returns 0 on success.  If the allocation fails it sets
 * MemoryError and returns -1; PyMem_Calloc does not set an exception
 * itself, so it has to be set here for the caller to see one. */
#define BEGIN_MAPPINGS_LIST(NUM)                                    \
static int                                                          \
add_mappings(cjkcodecs_module_state *st)                            \
{                                                                   \
    int idx = 0;                                                    \
    (void)idx;                                                      \
    st->num_mappings = NUM;                                         \
    st->mapping_list = PyMem_Calloc(NUM, sizeof(struct dbcs_map));  \
    if (st->mapping_list == NULL) {                                 \
        PyErr_NoMemory();                                           \
        return -1;                                                  \
    }

/* Each MAPPING_* line stores one entry in the next free slot, named after
 * `enc` and pointing at <enc>_encmap and/or <enc>_decmap.  The slot index
 * is not range-checked; END_MAPPINGS_LIST only asserts the final count. */
#define MAPPING_ENCONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, NULL};
#define MAPPING_DECONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, NULL, (void*)enc##_decmap};
#define MAPPING_ENCDEC(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, (void*)enc##_decmap};

/* Close add_mappings(): assert that exactly NUM entries were added. */
#define END_MAPPINGS_LIST                               \
    assert(st->num_mappings == idx);                    \
    return 0;                                           \
}
255
/* BEGIN_CODECS_LIST(NUM) ... END_CODECS_LIST together define add_codecs(),
 * which allocates st->codec_list with room for NUM entries and fills it
 * from the CODEC_* lines placed in between.
 *
 * add_codecs() returns 0 on success.  If the allocation fails it sets
 * MemoryError and returns -1; PyMem_Calloc does not set an exception
 * itself, so it has to be set here for the caller to see one. */
#define BEGIN_CODECS_LIST(NUM)                                  \
static int                                                      \
add_codecs(cjkcodecs_module_state *st)                          \
{                                                               \
    int idx = 0;                                                \
    (void)idx;                                                  \
    st->num_codecs = NUM;                                       \
    st->codec_list = PyMem_Calloc(NUM, sizeof(MultibyteCodec)); \
    if (st->codec_list == NULL) {                               \
        PyErr_NoMemory();                                       \
        return -1;                                              \
    }

/* Six initializers, in order: encode, encode_init, encode_reset, decode,
 * decode_init, decode_reset.  Both expansions end with a trailing comma. */
#define _STATEFUL_METHODS(enc)      \
    enc##_encode,                   \
    enc##_encode_init,              \
    enc##_encode_reset,             \
    enc##_decode,                   \
    enc##_decode_init,              \
    enc##_decode_reset,
/* Same six slots, with the init and reset functions left NULL. */
#define _STATELESS_METHODS(enc)     \
    enc##_encode, NULL, NULL,       \
    enc##_decode, NULL, NULL,

/* The next free slot of st->codec_list.  The index is not range-checked;
 * END_CODECS_LIST only asserts the final count. */
#define NEXT_CODEC \
    st->codec_list[idx++]

/* Each CODEC_* line stores one codec entry named after `enc`.  The second
 * initializer is always NULL; the third is NULL except for
 * CODEC_STATELESS_WINIT, which stores <enc>_codec_init there. */
#define CODEC_STATEFUL(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATEFUL_METHODS(enc)};
#define CODEC_STATELESS(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATELESS_METHODS(enc)};
#define CODEC_STATELESS_WINIT(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, enc##_codec_init, _STATELESS_METHODS(enc)};

/* Close add_codecs(): assert that exactly NUM entries were added, then
 * point every entry's modstate back at the module state. */
#define END_CODECS_LIST                         \
    assert(st->num_codecs == idx);              \
    for (int i = 0; i < st->num_codecs; i++) {  \
        st->codec_list[i].modstate = st;        \
    }                                           \
    return 0;                                   \
}
296
297
298
299 static PyObject *
getmultibytecodec(void)300 getmultibytecodec(void)
301 {
302 return _PyImport_GetModuleAttrString("_multibytecodec", "__create_codec");
303 }
304
305 static void
destroy_codec_capsule(PyObject * capsule)306 destroy_codec_capsule(PyObject *capsule)
307 {
308 void *ptr = PyCapsule_GetPointer(capsule, CODEC_CAPSULE);
309 codec_capsule *data = (codec_capsule *)ptr;
310 Py_DECREF(data->cjk_module);
311 PyMem_Free(ptr);
312 }
313
314 static codec_capsule *
capsulate_codec(PyObject * mod,const MultibyteCodec * codec)315 capsulate_codec(PyObject *mod, const MultibyteCodec *codec)
316 {
317 codec_capsule *data = PyMem_Malloc(sizeof(codec_capsule));
318 if (data == NULL) {
319 PyErr_NoMemory();
320 return NULL;
321 }
322 data->codec = codec;
323 data->cjk_module = Py_NewRef(mod);
324 return data;
325 }
326
327 static PyObject *
_getcodec(PyObject * self,const MultibyteCodec * codec)328 _getcodec(PyObject *self, const MultibyteCodec *codec)
329 {
330 PyObject *cofunc = getmultibytecodec();
331 if (cofunc == NULL) {
332 return NULL;
333 }
334
335 codec_capsule *data = capsulate_codec(self, codec);
336 if (data == NULL) {
337 Py_DECREF(cofunc);
338 return NULL;
339 }
340 PyObject *codecobj = PyCapsule_New(data, CODEC_CAPSULE,
341 destroy_codec_capsule);
342 if (codecobj == NULL) {
343 PyMem_Free(data);
344 Py_DECREF(cofunc);
345 return NULL;
346 }
347
348 PyObject *res = PyObject_CallOneArg(cofunc, codecobj);
349 Py_DECREF(codecobj);
350 Py_DECREF(cofunc);
351 return res;
352 }
353
354 static PyObject *
getcodec(PyObject * self,PyObject * encoding)355 getcodec(PyObject *self, PyObject *encoding)
356 {
357 if (!PyUnicode_Check(encoding)) {
358 PyErr_SetString(PyExc_TypeError,
359 "encoding name must be a string.");
360 return NULL;
361 }
362 const char *enc = PyUnicode_AsUTF8(encoding);
363 if (enc == NULL) {
364 return NULL;
365 }
366
367 cjkcodecs_module_state *st = get_module_state(self);
368 for (int i = 0; i < st->num_codecs; i++) {
369 const MultibyteCodec *codec = &st->codec_list[i];
370 if (strcmp(codec->encoding, enc) == 0) {
371 return _getcodec(self, codec);
372 }
373 }
374
375 PyErr_SetString(PyExc_LookupError,
376 "no such codec is supported.");
377 return NULL;
378 }
379
380 static int add_mappings(cjkcodecs_module_state *);
381 static int add_codecs(cjkcodecs_module_state *);
382
383 static int
register_maps(PyObject * module)384 register_maps(PyObject *module)
385 {
386 // Init module state.
387 cjkcodecs_module_state *st = get_module_state(module);
388 if (add_mappings(st) < 0) {
389 return -1;
390 }
391 if (add_codecs(st) < 0) {
392 return -1;
393 }
394
395 for (int i = 0; i < st->num_mappings; i++) {
396 const struct dbcs_map *h = &st->mapping_list[i];
397 char mhname[256] = "__map_";
398 strcpy(mhname + sizeof("__map_") - 1, h->charset);
399
400 PyObject *capsule = PyCapsule_New((void *)h, MAP_CAPSULE, NULL);
401 if (PyModule_Add(module, mhname, capsule) < 0) {
402 return -1;
403 }
404 }
405 return 0;
406 }
407
408 #ifdef USING_BINARY_PAIR_SEARCH
409 static DBCHAR
find_pairencmap(ucs2_t body,ucs2_t modifier,const struct pair_encodemap * haystack,int haystacksize)410 find_pairencmap(ucs2_t body, ucs2_t modifier,
411 const struct pair_encodemap *haystack, int haystacksize)
412 {
413 int pos, min, max;
414 Py_UCS4 value = body << 16 | modifier;
415
416 min = 0;
417 max = haystacksize;
418
419 for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
420 if (value < haystack[pos].uniseq) {
421 if (max != pos) {
422 max = pos;
423 continue;
424 }
425 }
426 else if (value > haystack[pos].uniseq) {
427 if (min != pos) {
428 min = pos;
429 continue;
430 }
431 }
432 break;
433 }
434
435 if (value == haystack[pos].uniseq) {
436 return haystack[pos].code;
437 }
438 return DBCINV;
439 }
440 #endif
441
442 #ifdef USING_IMPORTED_MAPS
/* Calls importmap() for the attribute "__map_<charset>" of the module
 * "_codecs_<locale>".  `encmap` and `decmap` are cast to const void ** and
 * passed through unchanged.  Evaluates to importmap()'s return value. */
#define IMPORT_MAP(locale, charset, encmap, decmap) \
    importmap("_codecs_" #locale, "__map_" #charset, \
              (const void**)encmap, (const void**)decmap)
446
447 static int
importmap(const char * modname,const char * symbol,const void ** encmap,const void ** decmap)448 importmap(const char *modname, const char *symbol,
449 const void **encmap, const void **decmap)
450 {
451 PyObject *o, *mod;
452
453 mod = PyImport_ImportModule(modname);
454 if (mod == NULL)
455 return -1;
456
457 o = PyObject_GetAttrString(mod, symbol);
458 if (o == NULL)
459 goto errorexit;
460 else if (!PyCapsule_IsValid(o, MAP_CAPSULE)) {
461 PyErr_SetString(PyExc_ValueError,
462 "map data must be a Capsule.");
463 goto errorexit;
464 }
465 else {
466 struct dbcs_map *map;
467 map = PyCapsule_GetPointer(o, MAP_CAPSULE);
468 if (encmap != NULL)
469 *encmap = map->encmap;
470 if (decmap != NULL)
471 *decmap = map->decmap;
472 Py_DECREF(o);
473 }
474
475 Py_DECREF(mod);
476 return 0;
477
478 errorexit:
479 Py_DECREF(mod);
480 return -1;
481 }
482 #endif
483
484 static int
_cjk_exec(PyObject * module)485 _cjk_exec(PyObject *module)
486 {
487 return register_maps(module);
488 }
489
490 static void
_cjk_free(void * mod)491 _cjk_free(void *mod)
492 {
493 cjkcodecs_module_state *st = get_module_state((PyObject *)mod);
494 PyMem_Free(st->mapping_list);
495 PyMem_Free(st->codec_list);
496 }
497
498 static struct PyMethodDef _cjk_methods[] = {
499 {"getcodec", (PyCFunction)getcodec, METH_O, ""},
500 {NULL, NULL},
501 };
502
503 static PyModuleDef_Slot _cjk_slots[] = {
504 {Py_mod_exec, _cjk_exec},
505 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
506 {Py_mod_gil, Py_MOD_GIL_NOT_USED},
507 {0, NULL}
508 };
509
/* Defines the module "_codecs_<loc>": a PyModuleDef wired to the method
 * table, slots and free hook above, with cjkcodecs_module_state as the
 * per-module state, plus the PyInit__codecs_<loc>() entry point, which
 * returns PyModuleDef_Init() of that definition. */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef _cjk_module = {                           \
        PyModuleDef_HEAD_INIT,                                          \
        .m_name = "_codecs_"#loc,                                       \
        .m_size = sizeof(cjkcodecs_module_state),                       \
        .m_methods = _cjk_methods,                                      \
        .m_slots = _cjk_slots,                                          \
        .m_free = _cjk_free,                                            \
    };                                                                  \
                                                                        \
    PyMODINIT_FUNC                                                      \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        return PyModuleDef_Init(&_cjk_module);                          \
    }
525
526 #endif
527