• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * cjkcodecs.h: common header for cjkcodecs
3  *
4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
5  */
6 
7 #ifndef _CJKCODECS_H_
8 #define _CJKCODECS_H_
9 
10 #ifndef Py_BUILD_CORE_BUILTIN
11 #  define Py_BUILD_CORE_MODULE 1
12 #endif
13 
14 #include "Python.h"
15 #include "multibytecodec.h"
16 #include "pycore_import.h"        // _PyImport_GetModuleAttrString()
17 
18 
/* Marker for "no Unicode code point"; _TRYMAP_DEC treats a table entry
   equal to this value as a failed lookup. */
#define UNIINV  0xFFFE

/* DBCS values reserved for internal use (not assigned by any charset).
   _TRYMAP_ENC treats a table entry equal to NOCHAR as a failed lookup. */
#define NOCHAR  0xFFFF
#define MULTIC  0xFFFE
#define DBCINV  0xFFFD

/* One-letter aliases that keep the mapping-table sources compact. */
#define U       UNIINV
#define N       NOCHAR
#define M       MULTIC
#define D       DBCINV
32 
33 struct dbcs_index {
34     const ucs2_t *map;
35     unsigned char bottom, top;
36 };
37 typedef struct dbcs_index decode_map;
38 
39 struct widedbcs_index {
40     const Py_UCS4 *map;
41     unsigned char bottom, top;
42 };
43 typedef struct widedbcs_index widedecode_map;
44 
45 struct unim_index {
46     const DBCHAR *map;
47     unsigned char bottom, top;
48 };
49 typedef struct unim_index encode_map;
50 
/* Encode-table row whose entries are stored as raw bytes. */
struct unim_index_bytebased {
    const unsigned char *map;   /* row data */
    unsigned char bottom;       /* lowest index covered by map */
    unsigned char top;          /* highest index covered by map */
};
55 
/* A named charset together with its encode and decode tables.  Either table
   pointer may be NULL (see MAPPING_ENCONLY / MAPPING_DECONLY). */
struct dbcs_map {
    const char              *charset;   /* charset name */
    const struct unim_index *encmap;    /* encode table, or NULL */
    const struct dbcs_index *decmap;    /* decode table, or NULL */
};
61 
62 struct pair_encodemap {
63     Py_UCS4 uniseq;
64     DBCHAR code;
65 };
66 
67 #ifndef CJK_MOD_SPECIFIC_STATE
68 #define CJK_MOD_SPECIFIC_STATE
69 #endif
70 
71 typedef struct _cjk_mod_state {
72     int num_mappings;
73     int num_codecs;
74     struct dbcs_map *mapping_list;
75     MultibyteCodec *codec_list;
76 
77     CJK_MOD_SPECIFIC_STATE
78 } cjkcodecs_module_state;
79 
80 static inline cjkcodecs_module_state *
get_module_state(PyObject * mod)81 get_module_state(PyObject *mod)
82 {
83     void *state = PyModule_GetState(mod);
84     assert(state != NULL);
85     return (cjkcodecs_module_state *)state;
86 }
87 
/*
 * Declaration heads for the per-encoding callbacks.  Each macro expands to
 * the signature of a static function named <encoding>_<role> and nothing
 * else (no body, no trailing semicolon).
 */
#define CODEC_INIT(encoding) \
    static int encoding##_codec_init(const MultibyteCodec *codec)

/* Encoder callbacks. */
#define ENCODER_INIT(encoding)                                      \
    static int encoding##_encode_init(                              \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

#define ENCODER(encoding)                                           \
    static Py_ssize_t encoding##_encode(                            \
        MultibyteCodec_State *state, const MultibyteCodec *codec,   \
        int kind, const void *data,                                 \
        Py_ssize_t *inpos, Py_ssize_t inlen,                        \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)

#define ENCODER_RESET(encoding)                                     \
    static Py_ssize_t encoding##_encode_reset(                      \
        MultibyteCodec_State *state, const MultibyteCodec *codec,   \
        unsigned char **outbuf, Py_ssize_t outleft)

/* Decoder callbacks. */
#define DECODER_INIT(encoding)                                      \
    static int encoding##_decode_init(                              \
        MultibyteCodec_State *state, const MultibyteCodec *codec)

#define DECODER(encoding)                                           \
    static Py_ssize_t encoding##_decode(                            \
        MultibyteCodec_State *state, const MultibyteCodec *codec,   \
        const unsigned char **inbuf, Py_ssize_t inleft,             \
        _PyUnicodeWriter *writer)

#define DECODER_RESET(encoding)                                     \
    static Py_ssize_t encoding##_decode_reset(                      \
        MultibyteCodec_State *state, const MultibyteCodec *codec)
116 
/*
 * Cursor helpers.  They operate on the local names declared by the
 * ENCODER()/DECODER() signatures: inbuf/inleft, inpos, outbuf/outleft.
 */

/* Decoder: consume `i` input bytes. */
#define NEXT_IN(i)                  \
    do {                            \
        (*inbuf) += (i);            \
        (inleft) -= (i);            \
    } while (0)

/* Encoder: advance the input character position by `i`. */
#define NEXT_INCHAR(i)              \
    do {                            \
        (*inpos) += (i);            \
    } while (0)

/* Encoder: account for `o` bytes written to the output buffer. */
#define NEXT_OUT(o)                 \
    do {                            \
        (*outbuf) += (o);           \
        (outleft) -= (o);           \
    } while (0)

/* Encoder: advance input by `i` characters and output by `o` bytes. */
#define NEXT(i, o)                  \
    do {                            \
        (*inpos) += (i);            \
        (*outbuf) += (o);           \
        (outleft) -= (o);           \
    } while (0)

/* Return MBERR_TOOFEW from the enclosing function unless at least `n`
   input bytes remain. */
#define REQUIRE_INBUF(n)            \
    do {                            \
        if (inleft < (n)) {         \
            return MBERR_TOOFEW;    \
        }                           \
    } while (0)

/* Return MBERR_TOOSMALL from the enclosing function unless at least `n`
   output bytes remain. */
#define REQUIRE_OUTBUF(n)           \
    do {                            \
        if (outleft < (n)) {        \
            return MBERR_TOOSMALL;  \
        }                           \
    } while (0)
148 
/* Decoder: peek at the next input bytes without consuming them. */
#define INBYTE1     ((*inbuf)[0])
#define INBYTE2     ((*inbuf)[1])
#define INBYTE3     ((*inbuf)[2])
#define INBYTE4     ((*inbuf)[3])

/* Encoder: read the character at the current input position / the one
   after it. */
#define INCHAR1     (PyUnicode_READ(kind, data, *inpos))
#define INCHAR2     (PyUnicode_READ(kind, data, *inpos + 1))
156 
/* Decoder: append one code point to `writer`; on failure return
   MBERR_EXCEPTION from the enclosing function. */
#define OUTCHAR(c)                                              \
    do {                                                        \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) {      \
            return MBERR_EXCEPTION;                             \
        }                                                       \
    } while (0)
162 
/* Decoder: append two code points to `writer` in one step; on failure return
   MBERR_EXCEPTION from the enclosing function.
   Both arguments are evaluated exactly once: they are cached in _c1/_c2 and
   only the cached copies are used afterwards (including in Py_MAX, which
   expands its operands more than once). */
#define OUTCHAR2(c1, c2)                                                   \
    do {                                                                   \
        Py_UCS4 _c1 = (c1);                                                \
        Py_UCS4 _c2 = (c2);                                                \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)     \
            return MBERR_EXCEPTION;                                        \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                                  \
    } while (0)
173 
/* Encoder: store byte `c` at offset `i` of the output cursor.  The assert
   checks that `c` fits in an unsigned char.  No space check is done here. */
#define OUTBYTEI(c, i)                          \
    do {                                        \
        assert((unsigned char)(c) == (c));      \
        (*outbuf)[i] = (c);                     \
    } while (0)

#define OUTBYTE1(c) OUTBYTEI(c, 0)
#define OUTBYTE2(c) OUTBYTEI(c, 1)
#define OUTBYTE3(c) OUTBYTEI(c, 2)
#define OUTBYTE4(c) OUTBYTEI(c, 3)

/* Encoder: check for room (REQUIRE_OUTBUF), then store 1..4 bytes.
   The output cursor is not advanced. */
#define WRITEBYTE1(c1)                  \
    do {                                \
        REQUIRE_OUTBUF(1);              \
        OUTBYTEI(c1, 0);                \
    } while (0)
#define WRITEBYTE2(c1, c2)              \
    do {                                \
        REQUIRE_OUTBUF(2);              \
        OUTBYTEI(c1, 0);                \
        OUTBYTEI(c2, 1);                \
    } while (0)
#define WRITEBYTE3(c1, c2, c3)          \
    do {                                \
        REQUIRE_OUTBUF(3);              \
        OUTBYTEI(c1, 0);                \
        OUTBYTEI(c2, 1);                \
        OUTBYTEI(c3, 2);                \
    } while (0)
#define WRITEBYTE4(c1, c2, c3, c4)      \
    do {                                \
        REQUIRE_OUTBUF(4);              \
        OUTBYTEI(c1, 0);                \
        OUTBYTEI(c2, 1);                \
        OUTBYTEI(c3, 2);                \
        OUTBYTEI(c4, 3);                \
    } while (0)
211 
/*
 * Table lookups.  Each expression is true when row `m` exists, `val` lies in
 * [bottom, top], and the stored entry is not the "unmapped" marker; as a side
 * effect the entry is assigned to `assi` once the range checks have passed.
 */
#define _TRYMAP_ENC(m, assi, val)                               \
    ((m)->map != NULL &&                                        \
     (val) >= (m)->bottom &&                                    \
     (val) <= (m)->top &&                                       \
     ((assi) = (m)->map[(val) - (m)->bottom]) != NOCHAR)

/* Encode lookup in the table <charset>_encmap: the row is selected by the
   high bits (uni >> 8), the column by the low byte. */
#define TRYMAP_ENC(charset, assi, uni) \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)

/* Same, but the table is reached through codec->modstate. */
#define TRYMAP_ENC_ST(charset, assi, uni)                           \
    _TRYMAP_ENC(&(codec->modstate->charset##_encmap)[(uni) >> 8],   \
                assi, (uni) & 0xff)

#define _TRYMAP_DEC(m, assi, val)                               \
    ((m)->map != NULL &&                                        \
     (val) >= (m)->bottom &&                                    \
     (val) <= (m)->top &&                                       \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)

/* Decode lookup in the table <charset>_decmap: row c1, column c2. */
#define TRYMAP_DEC(charset, assi, c1, c2) \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)

/* Same, but the table is reached through codec->modstate. */
#define TRYMAP_DEC_ST(charset, assi, c1, c2) \
    _TRYMAP_DEC(&(codec->modstate->charset##_decmap)[c1], assi, c2)
231 
/*
 * BEGIN_MAPPINGS_LIST(NUM) ... END_MAPPINGS_LIST together define
 *     static int add_mappings(cjkcodecs_module_state *st)
 * which allocates st->mapping_list with NUM zeroed entries and fills it
 * through the MAPPING_* lines placed between the two macros.
 * add_mappings() returns 0 on success; on allocation failure it sets
 * MemoryError and returns -1 (PyMem_Calloc itself does not set an exception).
 */
#define BEGIN_MAPPINGS_LIST(NUM)                                    \
static int                                                          \
add_mappings(cjkcodecs_module_state *st)                            \
{                                                                   \
    int idx = 0;                                                    \
    (void)idx;                                                      \
    st->num_mappings = (NUM);                                       \
    st->mapping_list = PyMem_Calloc((NUM), sizeof(struct dbcs_map));\
    if (st->mapping_list == NULL) {                                 \
        PyErr_NoMemory();                                           \
        return -1;                                                  \
    }

/* Each MAPPING_* line stores one entry named after `enc` and advances idx.
   They already end in ';', so they are written without one. */
#define MAPPING_ENCONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, NULL};
#define MAPPING_DECONLY(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, NULL, (void*)enc##_decmap};
#define MAPPING_ENCDEC(enc) \
    st->mapping_list[idx++] = (struct dbcs_map){#enc, (void*)enc##_encmap, (void*)enc##_decmap};

/* Closes add_mappings(); the assert checks that exactly NUM entries were
   stored. */
#define END_MAPPINGS_LIST               \
    assert(st->num_mappings == idx);    \
    return 0;                           \
}
255 
/*
 * BEGIN_CODECS_LIST(NUM) ... END_CODECS_LIST together define
 *     static int add_codecs(cjkcodecs_module_state *st)
 * which allocates st->codec_list with NUM zeroed entries, fills it through
 * the CODEC_* lines placed between the two macros, and finally points every
 * entry's modstate back at `st`.
 * add_codecs() returns 0 on success; on allocation failure it sets
 * MemoryError and returns -1 (PyMem_Calloc itself does not set an exception).
 */
#define BEGIN_CODECS_LIST(NUM)                                  \
static int                                                      \
add_codecs(cjkcodecs_module_state *st)                          \
{                                                               \
    int idx = 0;                                                \
    (void)idx;                                                  \
    st->num_codecs = (NUM);                                     \
    st->codec_list = PyMem_Calloc((NUM), sizeof(MultibyteCodec)); \
    if (st->codec_list == NULL) {                               \
        PyErr_NoMemory();                                       \
        return -1;                                              \
    }

/* Callback lists for the MultibyteCodec initializers below.  Stateless
   codecs have no init/reset callbacks. */
#define _STATEFUL_METHODS(enc)          \
    enc##_encode,                       \
    enc##_encode_init,                  \
    enc##_encode_reset,                 \
    enc##_decode,                       \
    enc##_decode_init,                  \
    enc##_decode_reset,
#define _STATELESS_METHODS(enc)         \
    enc##_encode, NULL, NULL,           \
    enc##_decode, NULL, NULL,

/* The next free slot of the codec list (advances idx). */
#define NEXT_CODEC \
    st->codec_list[idx++]

/* Each CODEC_* line stores one entry named after `enc`.  They already end in
   ';', so they are written without one.  Only the _WINIT form installs
   <enc>_codec_init. */
#define CODEC_STATEFUL(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATEFUL_METHODS(enc)};
#define CODEC_STATELESS(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, NULL, _STATELESS_METHODS(enc)};
#define CODEC_STATELESS_WINIT(enc) \
    NEXT_CODEC = (MultibyteCodec){#enc, NULL, enc##_codec_init, _STATELESS_METHODS(enc)};

/* Closes add_codecs(); the assert checks that exactly NUM entries were
   stored. */
#define END_CODECS_LIST                         \
    assert(st->num_codecs == idx);              \
    for (int i = 0; i < st->num_codecs; i++) {  \
        st->codec_list[i].modstate = st;        \
    }                                           \
    return 0;                                   \
}
296 
297 
298 
299 static PyObject *
getmultibytecodec(void)300 getmultibytecodec(void)
301 {
302     return _PyImport_GetModuleAttrString("_multibytecodec", "__create_codec");
303 }
304 
305 static void
destroy_codec_capsule(PyObject * capsule)306 destroy_codec_capsule(PyObject *capsule)
307 {
308     void *ptr = PyCapsule_GetPointer(capsule, CODEC_CAPSULE);
309     codec_capsule *data = (codec_capsule *)ptr;
310     Py_DECREF(data->cjk_module);
311     PyMem_Free(ptr);
312 }
313 
314 static codec_capsule *
capsulate_codec(PyObject * mod,const MultibyteCodec * codec)315 capsulate_codec(PyObject *mod, const MultibyteCodec *codec)
316 {
317     codec_capsule *data = PyMem_Malloc(sizeof(codec_capsule));
318     if (data == NULL) {
319         PyErr_NoMemory();
320         return NULL;
321     }
322     data->codec = codec;
323     data->cjk_module = Py_NewRef(mod);
324     return data;
325 }
326 
327 static PyObject *
_getcodec(PyObject * self,const MultibyteCodec * codec)328 _getcodec(PyObject *self, const MultibyteCodec *codec)
329 {
330     PyObject *cofunc = getmultibytecodec();
331     if (cofunc == NULL) {
332         return NULL;
333     }
334 
335     codec_capsule *data = capsulate_codec(self, codec);
336     if (data == NULL) {
337         Py_DECREF(cofunc);
338         return NULL;
339     }
340     PyObject *codecobj = PyCapsule_New(data, CODEC_CAPSULE,
341                                        destroy_codec_capsule);
342     if (codecobj == NULL) {
343         PyMem_Free(data);
344         Py_DECREF(cofunc);
345         return NULL;
346     }
347 
348     PyObject *res = PyObject_CallOneArg(cofunc, codecobj);
349     Py_DECREF(codecobj);
350     Py_DECREF(cofunc);
351     return res;
352 }
353 
354 static PyObject *
getcodec(PyObject * self,PyObject * encoding)355 getcodec(PyObject *self, PyObject *encoding)
356 {
357     if (!PyUnicode_Check(encoding)) {
358         PyErr_SetString(PyExc_TypeError,
359                         "encoding name must be a string.");
360         return NULL;
361     }
362     const char *enc = PyUnicode_AsUTF8(encoding);
363     if (enc == NULL) {
364         return NULL;
365     }
366 
367     cjkcodecs_module_state *st = get_module_state(self);
368     for (int i = 0; i < st->num_codecs; i++) {
369         const MultibyteCodec *codec = &st->codec_list[i];
370         if (strcmp(codec->encoding, enc) == 0) {
371             return _getcodec(self, codec);
372         }
373     }
374 
375     PyErr_SetString(PyExc_LookupError,
376                     "no such codec is supported.");
377     return NULL;
378 }
379 
380 static int add_mappings(cjkcodecs_module_state *);
381 static int add_codecs(cjkcodecs_module_state *);
382 
383 static int
register_maps(PyObject * module)384 register_maps(PyObject *module)
385 {
386     // Init module state.
387     cjkcodecs_module_state *st = get_module_state(module);
388     if (add_mappings(st) < 0) {
389         return -1;
390     }
391     if (add_codecs(st) < 0) {
392         return -1;
393     }
394 
395     for (int i = 0; i < st->num_mappings; i++) {
396         const struct dbcs_map *h = &st->mapping_list[i];
397         char mhname[256] = "__map_";
398         strcpy(mhname + sizeof("__map_") - 1, h->charset);
399 
400         PyObject *capsule = PyCapsule_New((void *)h, MAP_CAPSULE, NULL);
401         if (PyModule_Add(module, mhname, capsule) < 0) {
402             return -1;
403         }
404     }
405     return 0;
406 }
407 
408 #ifdef USING_BINARY_PAIR_SEARCH
409 static DBCHAR
find_pairencmap(ucs2_t body,ucs2_t modifier,const struct pair_encodemap * haystack,int haystacksize)410 find_pairencmap(ucs2_t body, ucs2_t modifier,
411                 const struct pair_encodemap *haystack, int haystacksize)
412 {
413     int pos, min, max;
414     Py_UCS4 value = body << 16 | modifier;
415 
416     min = 0;
417     max = haystacksize;
418 
419     for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1) {
420         if (value < haystack[pos].uniseq) {
421             if (max != pos) {
422                 max = pos;
423                 continue;
424             }
425         }
426         else if (value > haystack[pos].uniseq) {
427             if (min != pos) {
428                 min = pos;
429                 continue;
430             }
431         }
432         break;
433     }
434 
435     if (value == haystack[pos].uniseq) {
436         return haystack[pos].code;
437     }
438     return DBCINV;
439 }
440 #endif
441 
442 #ifdef USING_IMPORTED_MAPS
443 #define IMPORT_MAP(locale, charset, encmap, decmap) \
444     importmap("_codecs_" #locale, "__map_" #charset, \
445               (const void**)encmap, (const void**)decmap)
446 
447 static int
importmap(const char * modname,const char * symbol,const void ** encmap,const void ** decmap)448 importmap(const char *modname, const char *symbol,
449           const void **encmap, const void **decmap)
450 {
451     PyObject *o, *mod;
452 
453     mod = PyImport_ImportModule(modname);
454     if (mod == NULL)
455         return -1;
456 
457     o = PyObject_GetAttrString(mod, symbol);
458     if (o == NULL)
459         goto errorexit;
460     else if (!PyCapsule_IsValid(o, MAP_CAPSULE)) {
461         PyErr_SetString(PyExc_ValueError,
462                         "map data must be a Capsule.");
463         goto errorexit;
464     }
465     else {
466         struct dbcs_map *map;
467         map = PyCapsule_GetPointer(o, MAP_CAPSULE);
468         if (encmap != NULL)
469             *encmap = map->encmap;
470         if (decmap != NULL)
471             *decmap = map->decmap;
472         Py_DECREF(o);
473     }
474 
475     Py_DECREF(mod);
476     return 0;
477 
478 errorexit:
479     Py_DECREF(mod);
480     return -1;
481 }
482 #endif
483 
484 static int
_cjk_exec(PyObject * module)485 _cjk_exec(PyObject *module)
486 {
487     return register_maps(module);
488 }
489 
490 static void
_cjk_free(void * mod)491 _cjk_free(void *mod)
492 {
493     cjkcodecs_module_state *st = get_module_state((PyObject *)mod);
494     PyMem_Free(st->mapping_list);
495     PyMem_Free(st->codec_list);
496 }
497 
498 static struct PyMethodDef _cjk_methods[] = {
499     {"getcodec", (PyCFunction)getcodec, METH_O, ""},
500     {NULL, NULL},
501 };
502 
503 static PyModuleDef_Slot _cjk_slots[] = {
504     {Py_mod_exec, _cjk_exec},
505     {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
506     {Py_mod_gil, Py_MOD_GIL_NOT_USED},
507     {0, NULL}
508 };
509 
/* Emit the module definition and the PyInit__codecs_<loc> entry point for
   the extension module "_codecs_<loc>".  The module carries a
   cjkcodecs_module_state and is initialised through _cjk_slots. */
#define I_AM_A_MODULE_FOR(loc)                              \
    static struct PyModuleDef _cjk_module = {               \
        PyModuleDef_HEAD_INIT,                              \
        .m_name = "_codecs_"#loc,                           \
        .m_size = sizeof(cjkcodecs_module_state),           \
        .m_methods = _cjk_methods,                          \
        .m_free = _cjk_free,                                \
        .m_slots = _cjk_slots,                              \
    };                                                      \
                                                            \
    PyMODINIT_FUNC                                          \
    PyInit__codecs_##loc(void)                              \
    {                                                       \
        return PyModuleDef_Init(&_cjk_module);              \
    }
525 
526 #endif
527