1 /* ------------------------------------------------------------------------
2
3 _codecs -- Provides access to the codec registry and the builtin
4 codecs.
5
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
8
9 The codec registry is accessible via:
10
11 register(search_function) -> None
12
13 lookup(encoding) -> CodecInfo object
14
15 The builtin Unicode codecs use the following interface:
16
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, Unicode characters consumed)
19
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
22
23 <encoding>_encode() interfaces also accept non-Unicode object as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
26
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
29 mbcs (on win32).
30
31
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
33
34 Copyright (c) Corporation for National Research Initiatives.
35
36 ------------------------------------------------------------------------ */
37
38 #define PY_SSIZE_T_CLEAN
39 #include "Python.h"
40
41 /* --- Registry ----------------------------------------------------------- */
42
43 PyDoc_STRVAR(register__doc__,
44 "register(search_function)\n\
45 \n\
46 Register a codec search function. Search functions are expected to take\n\
47 one argument, the encoding name in all lower case letters, and return\n\
48 a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n\
49 (or a CodecInfo object).");
50
51 static
codec_register(PyObject * self,PyObject * search_function)52 PyObject *codec_register(PyObject *self, PyObject *search_function)
53 {
54 if (PyCodec_Register(search_function))
55 return NULL;
56
57 Py_RETURN_NONE;
58 }
59
60 PyDoc_STRVAR(lookup__doc__,
61 "lookup(encoding) -> CodecInfo\n\
62 \n\
63 Looks up a codec tuple in the Python codec registry and returns\n\
64 a CodecInfo object.");
65
66 static
codec_lookup(PyObject * self,PyObject * args)67 PyObject *codec_lookup(PyObject *self, PyObject *args)
68 {
69 char *encoding;
70
71 if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
72 return NULL;
73
74 return _PyCodec_Lookup(encoding);
75 }
76
77 PyDoc_STRVAR(encode__doc__,
78 "encode(obj, [encoding[,errors]]) -> object\n\
79 \n\
80 Encodes obj using the codec registered for encoding. encoding defaults\n\
81 to the default encoding. errors may be given to set a different error\n\
82 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
83 a ValueError. Other possible values are 'ignore', 'replace' and\n\
84 'xmlcharrefreplace' as well as any other name registered with\n\
85 codecs.register_error that can handle ValueErrors.");
86
87 static PyObject *
codec_encode(PyObject * self,PyObject * args)88 codec_encode(PyObject *self, PyObject *args)
89 {
90 const char *encoding = NULL;
91 const char *errors = NULL;
92 PyObject *v;
93
94 if (!PyArg_ParseTuple(args, "O|ss:encode", &v, &encoding, &errors))
95 return NULL;
96
97 #ifdef Py_USING_UNICODE
98 if (encoding == NULL)
99 encoding = PyUnicode_GetDefaultEncoding();
100 #else
101 if (encoding == NULL) {
102 PyErr_SetString(PyExc_ValueError, "no encoding specified");
103 return NULL;
104 }
105 #endif
106
107 /* Encode via the codec registry */
108 return PyCodec_Encode(v, encoding, errors);
109 }
110
111 PyDoc_STRVAR(decode__doc__,
112 "decode(obj, [encoding[,errors]]) -> object\n\
113 \n\
114 Decodes obj using the codec registered for encoding. encoding defaults\n\
115 to the default encoding. errors may be given to set a different error\n\
116 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
117 a ValueError. Other possible values are 'ignore' and 'replace'\n\
118 as well as any other name registered with codecs.register_error that is\n\
119 able to handle ValueErrors.");
120
121 static PyObject *
codec_decode(PyObject * self,PyObject * args)122 codec_decode(PyObject *self, PyObject *args)
123 {
124 const char *encoding = NULL;
125 const char *errors = NULL;
126 PyObject *v;
127
128 if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
129 return NULL;
130
131 #ifdef Py_USING_UNICODE
132 if (encoding == NULL)
133 encoding = PyUnicode_GetDefaultEncoding();
134 #else
135 if (encoding == NULL) {
136 PyErr_SetString(PyExc_ValueError, "no encoding specified");
137 return NULL;
138 }
139 #endif
140
141 /* Decode via the codec registry */
142 return PyCodec_Decode(v, encoding, errors);
143 }
144
145 /* --- Helpers ------------------------------------------------------------ */
146
147 static
codec_tuple(PyObject * unicode,Py_ssize_t len)148 PyObject *codec_tuple(PyObject *unicode,
149 Py_ssize_t len)
150 {
151 PyObject *v;
152 if (unicode == NULL)
153 return NULL;
154 v = Py_BuildValue("On", unicode, len);
155 Py_DECREF(unicode);
156 return v;
157 }
158
159 /* --- String codecs ------------------------------------------------------ */
160 static PyObject *
escape_decode(PyObject * self,PyObject * args)161 escape_decode(PyObject *self,
162 PyObject *args)
163 {
164 const char *errors = NULL;
165 const char *data;
166 Py_ssize_t size;
167
168 if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
169 &data, &size, &errors))
170 return NULL;
171 return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
172 size);
173 }
174
175 static PyObject *
escape_encode(PyObject * self,PyObject * args)176 escape_encode(PyObject *self,
177 PyObject *args)
178 {
179 PyObject *str;
180 const char *errors = NULL;
181 char *buf;
182 Py_ssize_t consumed, len;
183
184 if (!PyArg_ParseTuple(args, "S|z:escape_encode",
185 &str, &errors))
186 return NULL;
187
188 consumed = PyString_GET_SIZE(str);
189 str = PyString_Repr(str, 0);
190 if (!str)
191 return NULL;
192
193 /* The string will be quoted. Unquote, similar to unicode-escape. */
194 buf = PyString_AS_STRING (str);
195 len = PyString_GET_SIZE (str);
196 memmove(buf, buf+1, len-2);
197 if (_PyString_Resize(&str, len-2) < 0)
198 return NULL;
199
200 return codec_tuple(str, consumed);
201 }
202
203 #ifdef Py_USING_UNICODE
204 /* --- Decoder ------------------------------------------------------------ */
205
206 static PyObject *
unicode_internal_decode(PyObject * self,PyObject * args)207 unicode_internal_decode(PyObject *self,
208 PyObject *args)
209 {
210 PyObject *obj;
211 const char *errors = NULL;
212 const char *data;
213 Py_ssize_t size;
214
215 if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
216 &obj, &errors))
217 return NULL;
218
219 if (PyUnicode_Check(obj)) {
220 Py_INCREF(obj);
221 return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
222 }
223 else {
224 if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
225 return NULL;
226
227 return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
228 size);
229 }
230 }
231
232 static PyObject *
utf_7_decode(PyObject * self,PyObject * args)233 utf_7_decode(PyObject *self,
234 PyObject *args)
235 {
236 Py_buffer pbuf;
237 const char *errors = NULL;
238 int final = 0;
239 Py_ssize_t consumed;
240 PyObject *decoded = NULL;
241
242 if (!PyArg_ParseTuple(args, "s*|zi:utf_7_decode",
243 &pbuf, &errors, &final))
244 return NULL;
245 consumed = pbuf.len;
246
247 decoded = PyUnicode_DecodeUTF7Stateful(pbuf.buf, pbuf.len, errors,
248 final ? NULL : &consumed);
249 PyBuffer_Release(&pbuf);
250 if (decoded == NULL)
251 return NULL;
252 return codec_tuple(decoded, consumed);
253 }
254
255 static PyObject *
utf_8_decode(PyObject * self,PyObject * args)256 utf_8_decode(PyObject *self,
257 PyObject *args)
258 {
259 Py_buffer pbuf;
260 const char *errors = NULL;
261 int final = 0;
262 Py_ssize_t consumed;
263 PyObject *decoded = NULL;
264
265 if (!PyArg_ParseTuple(args, "s*|zi:utf_8_decode",
266 &pbuf, &errors, &final))
267 return NULL;
268 consumed = pbuf.len;
269
270 decoded = PyUnicode_DecodeUTF8Stateful(pbuf.buf, pbuf.len, errors,
271 final ? NULL : &consumed);
272 PyBuffer_Release(&pbuf);
273 if (decoded == NULL)
274 return NULL;
275 return codec_tuple(decoded, consumed);
276 }
277
278 static PyObject *
utf_16_decode(PyObject * self,PyObject * args)279 utf_16_decode(PyObject *self,
280 PyObject *args)
281 {
282 Py_buffer pbuf;
283 const char *errors = NULL;
284 int byteorder = 0;
285 int final = 0;
286 Py_ssize_t consumed;
287 PyObject *decoded;
288
289 if (!PyArg_ParseTuple(args, "s*|zi:utf_16_decode",
290 &pbuf, &errors, &final))
291 return NULL;
292 consumed = pbuf.len; /* This is overwritten unless final is true. */
293 decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
294 &byteorder, final ? NULL : &consumed);
295 PyBuffer_Release(&pbuf);
296 if (decoded == NULL)
297 return NULL;
298 return codec_tuple(decoded, consumed);
299 }
300
301 static PyObject *
utf_16_le_decode(PyObject * self,PyObject * args)302 utf_16_le_decode(PyObject *self,
303 PyObject *args)
304 {
305 Py_buffer pbuf;
306 const char *errors = NULL;
307 int byteorder = -1;
308 int final = 0;
309 Py_ssize_t consumed;
310 PyObject *decoded = NULL;
311
312 if (!PyArg_ParseTuple(args, "s*|zi:utf_16_le_decode",
313 &pbuf, &errors, &final))
314 return NULL;
315
316 consumed = pbuf.len; /* This is overwritten unless final is true. */
317 decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
318 &byteorder, final ? NULL : &consumed);
319 PyBuffer_Release(&pbuf);
320 if (decoded == NULL)
321 return NULL;
322 return codec_tuple(decoded, consumed);
323 }
324
325 static PyObject *
utf_16_be_decode(PyObject * self,PyObject * args)326 utf_16_be_decode(PyObject *self,
327 PyObject *args)
328 {
329 Py_buffer pbuf;
330 const char *errors = NULL;
331 int byteorder = 1;
332 int final = 0;
333 Py_ssize_t consumed;
334 PyObject *decoded = NULL;
335
336 if (!PyArg_ParseTuple(args, "s*|zi:utf_16_be_decode",
337 &pbuf, &errors, &final))
338 return NULL;
339
340 consumed = pbuf.len; /* This is overwritten unless final is true. */
341 decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
342 &byteorder, final ? NULL : &consumed);
343 PyBuffer_Release(&pbuf);
344 if (decoded == NULL)
345 return NULL;
346 return codec_tuple(decoded, consumed);
347 }
348
349 /* This non-standard version also provides access to the byteorder
350 parameter of the builtin UTF-16 codec.
351
352 It returns a tuple (unicode, bytesread, byteorder) with byteorder
353 being the value in effect at the end of data.
354
355 */
356
357 static PyObject *
utf_16_ex_decode(PyObject * self,PyObject * args)358 utf_16_ex_decode(PyObject *self,
359 PyObject *args)
360 {
361 Py_buffer pbuf;
362 const char *errors = NULL;
363 int byteorder = 0;
364 PyObject *unicode, *tuple;
365 int final = 0;
366 Py_ssize_t consumed;
367
368 if (!PyArg_ParseTuple(args, "s*|zii:utf_16_ex_decode",
369 &pbuf, &errors, &byteorder, &final))
370 return NULL;
371 consumed = pbuf.len; /* This is overwritten unless final is true. */
372 unicode = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
373 &byteorder, final ? NULL : &consumed);
374 PyBuffer_Release(&pbuf);
375 if (unicode == NULL)
376 return NULL;
377 tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
378 Py_DECREF(unicode);
379 return tuple;
380 }
381
382 static PyObject *
utf_32_decode(PyObject * self,PyObject * args)383 utf_32_decode(PyObject *self,
384 PyObject *args)
385 {
386 Py_buffer pbuf;
387 const char *errors = NULL;
388 int byteorder = 0;
389 int final = 0;
390 Py_ssize_t consumed;
391 PyObject *decoded;
392
393 if (!PyArg_ParseTuple(args, "s*|zi:utf_32_decode",
394 &pbuf, &errors, &final))
395 return NULL;
396 consumed = pbuf.len; /* This is overwritten unless final is true. */
397 decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
398 &byteorder, final ? NULL : &consumed);
399 PyBuffer_Release(&pbuf);
400 if (decoded == NULL)
401 return NULL;
402 return codec_tuple(decoded, consumed);
403 }
404
405 static PyObject *
utf_32_le_decode(PyObject * self,PyObject * args)406 utf_32_le_decode(PyObject *self,
407 PyObject *args)
408 {
409 Py_buffer pbuf;
410 const char *errors = NULL;
411 int byteorder = -1;
412 int final = 0;
413 Py_ssize_t consumed;
414 PyObject *decoded;
415
416 if (!PyArg_ParseTuple(args, "s*|zi:utf_32_le_decode",
417 &pbuf, &errors, &final))
418 return NULL;
419 consumed = pbuf.len; /* This is overwritten unless final is true. */
420 decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
421 &byteorder, final ? NULL : &consumed);
422 PyBuffer_Release(&pbuf);
423 if (decoded == NULL)
424 return NULL;
425 return codec_tuple(decoded, consumed);
426 }
427
428 static PyObject *
utf_32_be_decode(PyObject * self,PyObject * args)429 utf_32_be_decode(PyObject *self,
430 PyObject *args)
431 {
432 Py_buffer pbuf;
433 const char *errors = NULL;
434 int byteorder = 1;
435 int final = 0;
436 Py_ssize_t consumed;
437 PyObject *decoded;
438
439 if (!PyArg_ParseTuple(args, "s*|zi:utf_32_be_decode",
440 &pbuf, &errors, &final))
441 return NULL;
442 consumed = pbuf.len; /* This is overwritten unless final is true. */
443 decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
444 &byteorder, final ? NULL : &consumed);
445 PyBuffer_Release(&pbuf);
446 if (decoded == NULL)
447 return NULL;
448 return codec_tuple(decoded, consumed);
449 }
450
451 /* This non-standard version also provides access to the byteorder
452 parameter of the builtin UTF-32 codec.
453
454 It returns a tuple (unicode, bytesread, byteorder) with byteorder
455 being the value in effect at the end of data.
456
457 */
458
459 static PyObject *
utf_32_ex_decode(PyObject * self,PyObject * args)460 utf_32_ex_decode(PyObject *self,
461 PyObject *args)
462 {
463 Py_buffer pbuf;
464 const char *errors = NULL;
465 int byteorder = 0;
466 PyObject *unicode, *tuple;
467 int final = 0;
468 Py_ssize_t consumed;
469
470 if (!PyArg_ParseTuple(args, "s*|zii:utf_32_ex_decode",
471 &pbuf, &errors, &byteorder, &final))
472 return NULL;
473 consumed = pbuf.len; /* This is overwritten unless final is true. */
474 unicode = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
475 &byteorder, final ? NULL : &consumed);
476 PyBuffer_Release(&pbuf);
477 if (unicode == NULL)
478 return NULL;
479 tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
480 Py_DECREF(unicode);
481 return tuple;
482 }
483
484 static PyObject *
unicode_escape_decode(PyObject * self,PyObject * args)485 unicode_escape_decode(PyObject *self,
486 PyObject *args)
487 {
488 Py_buffer pbuf;
489 const char *errors = NULL;
490 PyObject *unicode;
491
492 if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
493 &pbuf, &errors))
494 return NULL;
495
496 unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, pbuf.len, errors);
497 PyBuffer_Release(&pbuf);
498 return codec_tuple(unicode, pbuf.len);
499 }
500
501 static PyObject *
raw_unicode_escape_decode(PyObject * self,PyObject * args)502 raw_unicode_escape_decode(PyObject *self,
503 PyObject *args)
504 {
505 Py_buffer pbuf;
506 const char *errors = NULL;
507 PyObject *unicode;
508
509 if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
510 &pbuf, &errors))
511 return NULL;
512
513 unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, pbuf.len, errors);
514 PyBuffer_Release(&pbuf);
515 return codec_tuple(unicode, pbuf.len);
516 }
517
518 static PyObject *
latin_1_decode(PyObject * self,PyObject * args)519 latin_1_decode(PyObject *self,
520 PyObject *args)
521 {
522 Py_buffer pbuf;
523 PyObject *unicode;
524 const char *errors = NULL;
525
526 if (!PyArg_ParseTuple(args, "s*|z:latin_1_decode",
527 &pbuf, &errors))
528 return NULL;
529
530 unicode = PyUnicode_DecodeLatin1(pbuf.buf, pbuf.len, errors);
531 PyBuffer_Release(&pbuf);
532 return codec_tuple(unicode, pbuf.len);
533 }
534
535 static PyObject *
ascii_decode(PyObject * self,PyObject * args)536 ascii_decode(PyObject *self,
537 PyObject *args)
538 {
539 Py_buffer pbuf;
540 PyObject *unicode;
541 const char *errors = NULL;
542
543 if (!PyArg_ParseTuple(args, "s*|z:ascii_decode",
544 &pbuf, &errors))
545 return NULL;
546
547 unicode = PyUnicode_DecodeASCII(pbuf.buf, pbuf.len, errors);
548 PyBuffer_Release(&pbuf);
549 return codec_tuple(unicode, pbuf.len);
550 }
551
552 static PyObject *
charmap_decode(PyObject * self,PyObject * args)553 charmap_decode(PyObject *self,
554 PyObject *args)
555 {
556 Py_buffer pbuf;
557 PyObject *unicode;
558 const char *errors = NULL;
559 PyObject *mapping = NULL;
560
561 if (!PyArg_ParseTuple(args, "s*|zO:charmap_decode",
562 &pbuf, &errors, &mapping))
563 return NULL;
564 if (mapping == Py_None)
565 mapping = NULL;
566
567 unicode = PyUnicode_DecodeCharmap(pbuf.buf, pbuf.len, mapping, errors);
568 PyBuffer_Release(&pbuf);
569 return codec_tuple(unicode, pbuf.len);
570 }
571
572 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
573
574 static PyObject *
mbcs_decode(PyObject * self,PyObject * args)575 mbcs_decode(PyObject *self,
576 PyObject *args)
577 {
578 Py_buffer pbuf;
579 const char *errors = NULL;
580 int final = 0;
581 Py_ssize_t consumed;
582 PyObject *decoded = NULL;
583
584 if (!PyArg_ParseTuple(args, "s*|zi:mbcs_decode",
585 &pbuf, &errors, &final))
586 return NULL;
587 consumed = pbuf.len;
588
589 decoded = PyUnicode_DecodeMBCSStateful(pbuf.buf, pbuf.len, errors,
590 final ? NULL : &consumed);
591 PyBuffer_Release(&pbuf);
592 if (decoded == NULL)
593 return NULL;
594 return codec_tuple(decoded, consumed);
595 }
596
597 #endif /* MS_WINDOWS */
598
599 /* --- Encoder ------------------------------------------------------------ */
600
601 static PyObject *
readbuffer_encode(PyObject * self,PyObject * args)602 readbuffer_encode(PyObject *self,
603 PyObject *args)
604 {
605 const char *data;
606 Py_ssize_t size;
607 const char *errors = NULL;
608
609 if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
610 &data, &size, &errors))
611 return NULL;
612
613 return codec_tuple(PyString_FromStringAndSize(data, size),
614 size);
615 }
616
617 static PyObject *
charbuffer_encode(PyObject * self,PyObject * args)618 charbuffer_encode(PyObject *self,
619 PyObject *args)
620 {
621 const char *data;
622 Py_ssize_t size;
623 const char *errors = NULL;
624
625 if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
626 &data, &size, &errors))
627 return NULL;
628
629 return codec_tuple(PyString_FromStringAndSize(data, size),
630 size);
631 }
632
633 static PyObject *
unicode_internal_encode(PyObject * self,PyObject * args)634 unicode_internal_encode(PyObject *self,
635 PyObject *args)
636 {
637 PyObject *obj;
638 const char *errors = NULL;
639 const char *data;
640 Py_ssize_t size;
641
642 if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
643 &obj, &errors))
644 return NULL;
645
646 if (PyUnicode_Check(obj)) {
647 data = PyUnicode_AS_DATA(obj);
648 size = PyUnicode_GET_DATA_SIZE(obj);
649 return codec_tuple(PyString_FromStringAndSize(data, size),
650 PyUnicode_GET_SIZE(obj));
651 }
652 else {
653 if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
654 return NULL;
655 return codec_tuple(PyString_FromStringAndSize(data, size),
656 size);
657 }
658 }
659
660 static PyObject *
utf_7_encode(PyObject * self,PyObject * args)661 utf_7_encode(PyObject *self,
662 PyObject *args)
663 {
664 PyObject *str, *v;
665 const char *errors = NULL;
666
667 if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
668 &str, &errors))
669 return NULL;
670
671 str = PyUnicode_FromObject(str);
672 if (str == NULL)
673 return NULL;
674 v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
675 PyUnicode_GET_SIZE(str),
676 0,
677 0,
678 errors),
679 PyUnicode_GET_SIZE(str));
680 Py_DECREF(str);
681 return v;
682 }
683
684 static PyObject *
utf_8_encode(PyObject * self,PyObject * args)685 utf_8_encode(PyObject *self,
686 PyObject *args)
687 {
688 PyObject *str, *v;
689 const char *errors = NULL;
690
691 if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
692 &str, &errors))
693 return NULL;
694
695 str = PyUnicode_FromObject(str);
696 if (str == NULL)
697 return NULL;
698 v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
699 PyUnicode_GET_SIZE(str),
700 errors),
701 PyUnicode_GET_SIZE(str));
702 Py_DECREF(str);
703 return v;
704 }
705
706 /* This version provides access to the byteorder parameter of the
707 builtin UTF-16 codecs as optional third argument. It defaults to 0
708 which means: use the native byte order and prepend the data with a
709 BOM mark.
710
711 */
712
713 static PyObject *
utf_16_encode(PyObject * self,PyObject * args)714 utf_16_encode(PyObject *self,
715 PyObject *args)
716 {
717 PyObject *str, *v;
718 const char *errors = NULL;
719 int byteorder = 0;
720
721 if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
722 &str, &errors, &byteorder))
723 return NULL;
724
725 str = PyUnicode_FromObject(str);
726 if (str == NULL)
727 return NULL;
728 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
729 PyUnicode_GET_SIZE(str),
730 errors,
731 byteorder),
732 PyUnicode_GET_SIZE(str));
733 Py_DECREF(str);
734 return v;
735 }
736
737 static PyObject *
utf_16_le_encode(PyObject * self,PyObject * args)738 utf_16_le_encode(PyObject *self,
739 PyObject *args)
740 {
741 PyObject *str, *v;
742 const char *errors = NULL;
743
744 if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
745 &str, &errors))
746 return NULL;
747
748 str = PyUnicode_FromObject(str);
749 if (str == NULL)
750 return NULL;
751 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
752 PyUnicode_GET_SIZE(str),
753 errors,
754 -1),
755 PyUnicode_GET_SIZE(str));
756 Py_DECREF(str);
757 return v;
758 }
759
760 static PyObject *
utf_16_be_encode(PyObject * self,PyObject * args)761 utf_16_be_encode(PyObject *self,
762 PyObject *args)
763 {
764 PyObject *str, *v;
765 const char *errors = NULL;
766
767 if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
768 &str, &errors))
769 return NULL;
770
771 str = PyUnicode_FromObject(str);
772 if (str == NULL)
773 return NULL;
774 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
775 PyUnicode_GET_SIZE(str),
776 errors,
777 +1),
778 PyUnicode_GET_SIZE(str));
779 Py_DECREF(str);
780 return v;
781 }
782
783 /* This version provides access to the byteorder parameter of the
784 builtin UTF-32 codecs as optional third argument. It defaults to 0
785 which means: use the native byte order and prepend the data with a
786 BOM mark.
787
788 */
789
790 static PyObject *
utf_32_encode(PyObject * self,PyObject * args)791 utf_32_encode(PyObject *self,
792 PyObject *args)
793 {
794 PyObject *str, *v;
795 const char *errors = NULL;
796 int byteorder = 0;
797
798 if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
799 &str, &errors, &byteorder))
800 return NULL;
801
802 str = PyUnicode_FromObject(str);
803 if (str == NULL)
804 return NULL;
805 v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
806 PyUnicode_GET_SIZE(str),
807 errors,
808 byteorder),
809 PyUnicode_GET_SIZE(str));
810 Py_DECREF(str);
811 return v;
812 }
813
814 static PyObject *
utf_32_le_encode(PyObject * self,PyObject * args)815 utf_32_le_encode(PyObject *self,
816 PyObject *args)
817 {
818 PyObject *str, *v;
819 const char *errors = NULL;
820
821 if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
822 &str, &errors))
823 return NULL;
824
825 str = PyUnicode_FromObject(str);
826 if (str == NULL)
827 return NULL;
828 v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
829 PyUnicode_GET_SIZE(str),
830 errors,
831 -1),
832 PyUnicode_GET_SIZE(str));
833 Py_DECREF(str);
834 return v;
835 }
836
837 static PyObject *
utf_32_be_encode(PyObject * self,PyObject * args)838 utf_32_be_encode(PyObject *self,
839 PyObject *args)
840 {
841 PyObject *str, *v;
842 const char *errors = NULL;
843
844 if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
845 &str, &errors))
846 return NULL;
847
848 str = PyUnicode_FromObject(str);
849 if (str == NULL)
850 return NULL;
851 v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
852 PyUnicode_GET_SIZE(str),
853 errors,
854 +1),
855 PyUnicode_GET_SIZE(str));
856 Py_DECREF(str);
857 return v;
858 }
859
860 static PyObject *
unicode_escape_encode(PyObject * self,PyObject * args)861 unicode_escape_encode(PyObject *self,
862 PyObject *args)
863 {
864 PyObject *str, *v;
865 const char *errors = NULL;
866
867 if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
868 &str, &errors))
869 return NULL;
870
871 str = PyUnicode_FromObject(str);
872 if (str == NULL)
873 return NULL;
874 v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
875 PyUnicode_GET_SIZE(str)),
876 PyUnicode_GET_SIZE(str));
877 Py_DECREF(str);
878 return v;
879 }
880
881 static PyObject *
raw_unicode_escape_encode(PyObject * self,PyObject * args)882 raw_unicode_escape_encode(PyObject *self,
883 PyObject *args)
884 {
885 PyObject *str, *v;
886 const char *errors = NULL;
887
888 if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
889 &str, &errors))
890 return NULL;
891
892 str = PyUnicode_FromObject(str);
893 if (str == NULL)
894 return NULL;
895 v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
896 PyUnicode_AS_UNICODE(str),
897 PyUnicode_GET_SIZE(str)),
898 PyUnicode_GET_SIZE(str));
899 Py_DECREF(str);
900 return v;
901 }
902
903 static PyObject *
latin_1_encode(PyObject * self,PyObject * args)904 latin_1_encode(PyObject *self,
905 PyObject *args)
906 {
907 PyObject *str, *v;
908 const char *errors = NULL;
909
910 if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
911 &str, &errors))
912 return NULL;
913
914 str = PyUnicode_FromObject(str);
915 if (str == NULL)
916 return NULL;
917 v = codec_tuple(PyUnicode_EncodeLatin1(
918 PyUnicode_AS_UNICODE(str),
919 PyUnicode_GET_SIZE(str),
920 errors),
921 PyUnicode_GET_SIZE(str));
922 Py_DECREF(str);
923 return v;
924 }
925
926 static PyObject *
ascii_encode(PyObject * self,PyObject * args)927 ascii_encode(PyObject *self,
928 PyObject *args)
929 {
930 PyObject *str, *v;
931 const char *errors = NULL;
932
933 if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
934 &str, &errors))
935 return NULL;
936
937 str = PyUnicode_FromObject(str);
938 if (str == NULL)
939 return NULL;
940 v = codec_tuple(PyUnicode_EncodeASCII(
941 PyUnicode_AS_UNICODE(str),
942 PyUnicode_GET_SIZE(str),
943 errors),
944 PyUnicode_GET_SIZE(str));
945 Py_DECREF(str);
946 return v;
947 }
948
949 static PyObject *
charmap_encode(PyObject * self,PyObject * args)950 charmap_encode(PyObject *self,
951 PyObject *args)
952 {
953 PyObject *str, *v;
954 const char *errors = NULL;
955 PyObject *mapping = NULL;
956
957 if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
958 &str, &errors, &mapping))
959 return NULL;
960 if (mapping == Py_None)
961 mapping = NULL;
962
963 str = PyUnicode_FromObject(str);
964 if (str == NULL)
965 return NULL;
966 v = codec_tuple(PyUnicode_EncodeCharmap(
967 PyUnicode_AS_UNICODE(str),
968 PyUnicode_GET_SIZE(str),
969 mapping,
970 errors),
971 PyUnicode_GET_SIZE(str));
972 Py_DECREF(str);
973 return v;
974 }
975
976 static PyObject*
charmap_build(PyObject * self,PyObject * args)977 charmap_build(PyObject *self, PyObject *args)
978 {
979 PyObject *map;
980 if (!PyArg_ParseTuple(args, "U:charmap_build", &map))
981 return NULL;
982 return PyUnicode_BuildEncodingMap(map);
983 }
984
985 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
986
987 static PyObject *
mbcs_encode(PyObject * self,PyObject * args)988 mbcs_encode(PyObject *self,
989 PyObject *args)
990 {
991 PyObject *str, *v;
992 const char *errors = NULL;
993
994 if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
995 &str, &errors))
996 return NULL;
997
998 str = PyUnicode_FromObject(str);
999 if (str == NULL)
1000 return NULL;
1001 v = codec_tuple(PyUnicode_EncodeMBCS(
1002 PyUnicode_AS_UNICODE(str),
1003 PyUnicode_GET_SIZE(str),
1004 errors),
1005 PyUnicode_GET_SIZE(str));
1006 Py_DECREF(str);
1007 return v;
1008 }
1009
1010 #endif /* MS_WINDOWS */
1011 #endif /* Py_USING_UNICODE */
1012
1013 /* --- Error handler registry --------------------------------------------- */
1014
1015 PyDoc_STRVAR(register_error__doc__,
1016 "register_error(errors, handler)\n\
1017 \n\
1018 Register the specified error handler under the name\n\
1019 errors. handler must be a callable object, that\n\
1020 will be called with an exception instance containing\n\
1021 information about the location of the encoding/decoding\n\
1022 error and must return a (replacement, new position) tuple.");
1023
register_error(PyObject * self,PyObject * args)1024 static PyObject *register_error(PyObject *self, PyObject *args)
1025 {
1026 const char *name;
1027 PyObject *handler;
1028
1029 if (!PyArg_ParseTuple(args, "sO:register_error",
1030 &name, &handler))
1031 return NULL;
1032 if (PyCodec_RegisterError(name, handler))
1033 return NULL;
1034 Py_RETURN_NONE;
1035 }
1036
1037 PyDoc_STRVAR(lookup_error__doc__,
1038 "lookup_error(errors) -> handler\n\
1039 \n\
1040 Return the error handler for the specified error handling name\n\
1041 or raise a LookupError, if no handler exists under this name.");
1042
lookup_error(PyObject * self,PyObject * args)1043 static PyObject *lookup_error(PyObject *self, PyObject *args)
1044 {
1045 const char *name;
1046
1047 if (!PyArg_ParseTuple(args, "s:lookup_error",
1048 &name))
1049 return NULL;
1050 return PyCodec_LookupError(name);
1051 }
1052
1053 /* --- Module API --------------------------------------------------------- */
1054
1055 static PyMethodDef _codecs_functions[] = {
1056 {"register", codec_register, METH_O,
1057 register__doc__},
1058 {"lookup", codec_lookup, METH_VARARGS,
1059 lookup__doc__},
1060 {"encode", codec_encode, METH_VARARGS,
1061 encode__doc__},
1062 {"decode", codec_decode, METH_VARARGS,
1063 decode__doc__},
1064 {"escape_encode", escape_encode, METH_VARARGS},
1065 {"escape_decode", escape_decode, METH_VARARGS},
1066 #ifdef Py_USING_UNICODE
1067 {"utf_8_encode", utf_8_encode, METH_VARARGS},
1068 {"utf_8_decode", utf_8_decode, METH_VARARGS},
1069 {"utf_7_encode", utf_7_encode, METH_VARARGS},
1070 {"utf_7_decode", utf_7_decode, METH_VARARGS},
1071 {"utf_16_encode", utf_16_encode, METH_VARARGS},
1072 {"utf_16_le_encode", utf_16_le_encode, METH_VARARGS},
1073 {"utf_16_be_encode", utf_16_be_encode, METH_VARARGS},
1074 {"utf_16_decode", utf_16_decode, METH_VARARGS},
1075 {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
1076 {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
1077 {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
1078 {"utf_32_encode", utf_32_encode, METH_VARARGS},
1079 {"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
1080 {"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
1081 {"utf_32_decode", utf_32_decode, METH_VARARGS},
1082 {"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
1083 {"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
1084 {"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
1085 {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
1086 {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
1087 {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
1088 {"unicode_internal_decode", unicode_internal_decode, METH_VARARGS},
1089 {"raw_unicode_escape_encode", raw_unicode_escape_encode, METH_VARARGS},
1090 {"raw_unicode_escape_decode", raw_unicode_escape_decode, METH_VARARGS},
1091 {"latin_1_encode", latin_1_encode, METH_VARARGS},
1092 {"latin_1_decode", latin_1_decode, METH_VARARGS},
1093 {"ascii_encode", ascii_encode, METH_VARARGS},
1094 {"ascii_decode", ascii_decode, METH_VARARGS},
1095 {"charmap_encode", charmap_encode, METH_VARARGS},
1096 {"charmap_decode", charmap_decode, METH_VARARGS},
1097 {"charmap_build", charmap_build, METH_VARARGS},
1098 {"readbuffer_encode", readbuffer_encode, METH_VARARGS},
1099 {"charbuffer_encode", charbuffer_encode, METH_VARARGS},
1100 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1101 {"mbcs_encode", mbcs_encode, METH_VARARGS},
1102 {"mbcs_decode", mbcs_decode, METH_VARARGS},
1103 #endif
1104 #endif /* Py_USING_UNICODE */
1105 {"register_error", register_error, METH_VARARGS,
1106 register_error__doc__},
1107 {"lookup_error", lookup_error, METH_VARARGS,
1108 lookup_error__doc__},
1109 {NULL, NULL} /* sentinel */
1110 };
1111
1112 PyMODINIT_FUNC
init_codecs(void)1113 init_codecs(void)
1114 {
1115 Py_InitModule("_codecs", _codecs_functions);
1116 }
1117