1 /* ------------------------------------------------------------------------
2
3 _codecs -- Provides access to the codec registry and the builtin
4 codecs.
5
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
8
9 The codec registry is accessible via:
10
11 register(search_function) -> None
12
13 lookup(encoding) -> CodecInfo object
14
15 The builtin Unicode codecs use the following interface:
16
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
        (bytes object, characters consumed)
19
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
22
23 These <encoding>s are available: utf_8, unicode_escape,
24 raw_unicode_escape, latin_1, ascii (7-bit), mbcs (on win32).
25
26
27 Written by Marc-Andre Lemburg (mal@lemburg.com).
28
29 Copyright (c) Corporation for National Research Initiatives.
30
31 ------------------------------------------------------------------------ */
32
33 #include "Python.h"
34 #include "pycore_codecs.h" // _PyCodec_Lookup()
35
36 #ifdef MS_WINDOWS
37 #include <windows.h>
38 #endif
39
40 /*[clinic input]
41 module _codecs
42 [clinic start generated code]*/
43 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e1390e3da3cb9deb]*/
44
45 #include "pycore_runtime.h"
46 #include "clinic/_codecsmodule.c.h"
47
48 /* --- Registry ----------------------------------------------------------- */
49
50 /*[clinic input]
51 _codecs.register
52 search_function: object
53 /
54
55 Register a codec search function.
56
57 Search functions are expected to take one argument, the encoding name in
58 all lower case letters, and either return None, or a tuple of functions
59 (encoder, decoder, stream_reader, stream_writer) (or a CodecInfo object).
60 [clinic start generated code]*/
61
62 static PyObject *
_codecs_register(PyObject * module,PyObject * search_function)63 _codecs_register(PyObject *module, PyObject *search_function)
64 /*[clinic end generated code: output=d1bf21e99db7d6d3 input=369578467955cae4]*/
65 {
66 if (PyCodec_Register(search_function))
67 return NULL;
68
69 Py_RETURN_NONE;
70 }
71
72 /*[clinic input]
73 _codecs.unregister
74 search_function: object
75 /
76
77 Unregister a codec search function and clear the registry's cache.
78
79 If the search function is not registered, do nothing.
80 [clinic start generated code]*/
81
82 static PyObject *
_codecs_unregister(PyObject * module,PyObject * search_function)83 _codecs_unregister(PyObject *module, PyObject *search_function)
84 /*[clinic end generated code: output=1f0edee9cf246399 input=dd7c004c652d345e]*/
85 {
86 if (PyCodec_Unregister(search_function) < 0) {
87 return NULL;
88 }
89
90 Py_RETURN_NONE;
91 }
92
93 /*[clinic input]
94 _codecs.lookup
95 encoding: str
96 /
97
98 Looks up a codec tuple in the Python codec registry and returns a CodecInfo object.
99 [clinic start generated code]*/
100
101 static PyObject *
_codecs_lookup_impl(PyObject * module,const char * encoding)102 _codecs_lookup_impl(PyObject *module, const char *encoding)
103 /*[clinic end generated code: output=9f0afa572080c36d input=3c572c0db3febe9c]*/
104 {
105 return _PyCodec_Lookup(encoding);
106 }
107
108 /*[clinic input]
109 _codecs.encode
110 obj: object
111 encoding: str(c_default="NULL") = "utf-8"
112 errors: str(c_default="NULL") = "strict"
113
114 Encodes obj using the codec registered for encoding.
115
116 The default encoding is 'utf-8'. errors may be given to set a
117 different error handling scheme. Default is 'strict' meaning that encoding
118 errors raise a ValueError. Other possible values are 'ignore', 'replace'
119 and 'backslashreplace' as well as any other name registered with
120 codecs.register_error that can handle ValueErrors.
121 [clinic start generated code]*/
122
123 static PyObject *
_codecs_encode_impl(PyObject * module,PyObject * obj,const char * encoding,const char * errors)124 _codecs_encode_impl(PyObject *module, PyObject *obj, const char *encoding,
125 const char *errors)
126 /*[clinic end generated code: output=385148eb9a067c86 input=cd5b685040ff61f0]*/
127 {
128 if (encoding == NULL)
129 encoding = PyUnicode_GetDefaultEncoding();
130
131 /* Encode via the codec registry */
132 return PyCodec_Encode(obj, encoding, errors);
133 }
134
135 /*[clinic input]
136 _codecs.decode
137 obj: object
138 encoding: str(c_default="NULL") = "utf-8"
139 errors: str(c_default="NULL") = "strict"
140
141 Decodes obj using the codec registered for encoding.
142
143 Default encoding is 'utf-8'. errors may be given to set a
144 different error handling scheme. Default is 'strict' meaning that encoding
145 errors raise a ValueError. Other possible values are 'ignore', 'replace'
146 and 'backslashreplace' as well as any other name registered with
147 codecs.register_error that can handle ValueErrors.
148 [clinic start generated code]*/
149
150 static PyObject *
_codecs_decode_impl(PyObject * module,PyObject * obj,const char * encoding,const char * errors)151 _codecs_decode_impl(PyObject *module, PyObject *obj, const char *encoding,
152 const char *errors)
153 /*[clinic end generated code: output=679882417dc3a0bd input=7702c0cc2fa1add6]*/
154 {
155 if (encoding == NULL)
156 encoding = PyUnicode_GetDefaultEncoding();
157
158 /* Decode via the codec registry */
159 return PyCodec_Decode(obj, encoding, errors);
160 }
161
162 /* --- Helpers ------------------------------------------------------------ */
163
164 static
codec_tuple(PyObject * decoded,Py_ssize_t len)165 PyObject *codec_tuple(PyObject *decoded,
166 Py_ssize_t len)
167 {
168 if (decoded == NULL)
169 return NULL;
170 return Py_BuildValue("Nn", decoded, len);
171 }
172
173 /* --- String codecs ------------------------------------------------------ */
174 /*[clinic input]
175 _codecs.escape_decode
176 data: Py_buffer(accept={str, buffer})
177 errors: str(accept={str, NoneType}) = None
178 /
179 [clinic start generated code]*/
180
181 static PyObject *
_codecs_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors)182 _codecs_escape_decode_impl(PyObject *module, Py_buffer *data,
183 const char *errors)
184 /*[clinic end generated code: output=505200ba8056979a input=77298a561c90bd82]*/
185 {
186 PyObject *decoded = PyBytes_DecodeEscape(data->buf, data->len,
187 errors, 0, NULL);
188 return codec_tuple(decoded, data->len);
189 }
190
191 /*[clinic input]
192 _codecs.escape_encode
193 data: object(subclass_of='&PyBytes_Type')
194 errors: str(accept={str, NoneType}) = None
195 /
196 [clinic start generated code]*/
197
198 static PyObject *
_codecs_escape_encode_impl(PyObject * module,PyObject * data,const char * errors)199 _codecs_escape_encode_impl(PyObject *module, PyObject *data,
200 const char *errors)
201 /*[clinic end generated code: output=4af1d477834bab34 input=8f4b144799a94245]*/
202 {
203 Py_ssize_t size;
204 Py_ssize_t newsize;
205 PyObject *v;
206
207 size = PyBytes_GET_SIZE(data);
208 if (size > PY_SSIZE_T_MAX / 4) {
209 PyErr_SetString(PyExc_OverflowError,
210 "string is too large to encode");
211 return NULL;
212 }
213 newsize = 4*size;
214 v = PyBytes_FromStringAndSize(NULL, newsize);
215
216 if (v == NULL) {
217 return NULL;
218 }
219 else {
220 Py_ssize_t i;
221 char c;
222 char *p = PyBytes_AS_STRING(v);
223
224 for (i = 0; i < size; i++) {
225 /* There's at least enough room for a hex escape */
226 assert(newsize - (p - PyBytes_AS_STRING(v)) >= 4);
227 c = PyBytes_AS_STRING(data)[i];
228 if (c == '\'' || c == '\\')
229 *p++ = '\\', *p++ = c;
230 else if (c == '\t')
231 *p++ = '\\', *p++ = 't';
232 else if (c == '\n')
233 *p++ = '\\', *p++ = 'n';
234 else if (c == '\r')
235 *p++ = '\\', *p++ = 'r';
236 else if (c < ' ' || c >= 0x7f) {
237 *p++ = '\\';
238 *p++ = 'x';
239 *p++ = Py_hexdigits[(c & 0xf0) >> 4];
240 *p++ = Py_hexdigits[c & 0xf];
241 }
242 else
243 *p++ = c;
244 }
245 *p = '\0';
246 if (_PyBytes_Resize(&v, (p - PyBytes_AS_STRING(v)))) {
247 return NULL;
248 }
249 }
250
251 return codec_tuple(v, size);
252 }
253
254 /* --- Decoder ------------------------------------------------------------ */
255 /*[clinic input]
256 _codecs.utf_7_decode
257 data: Py_buffer
258 errors: str(accept={str, NoneType}) = None
259 final: bool = False
260 /
261 [clinic start generated code]*/
262
263 static PyObject *
_codecs_utf_7_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)264 _codecs_utf_7_decode_impl(PyObject *module, Py_buffer *data,
265 const char *errors, int final)
266 /*[clinic end generated code: output=0cd3a944a32a4089 input=dbf8c8998102dc7d]*/
267 {
268 Py_ssize_t consumed = data->len;
269 PyObject *decoded = PyUnicode_DecodeUTF7Stateful(data->buf, data->len,
270 errors,
271 final ? NULL : &consumed);
272 return codec_tuple(decoded, consumed);
273 }
274
275 /*[clinic input]
276 _codecs.utf_8_decode
277 data: Py_buffer
278 errors: str(accept={str, NoneType}) = None
279 final: bool = False
280 /
281 [clinic start generated code]*/
282
283 static PyObject *
_codecs_utf_8_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)284 _codecs_utf_8_decode_impl(PyObject *module, Py_buffer *data,
285 const char *errors, int final)
286 /*[clinic end generated code: output=10f74dec8d9bb8bf input=ca06bc8a9c970e25]*/
287 {
288 Py_ssize_t consumed = data->len;
289 PyObject *decoded = PyUnicode_DecodeUTF8Stateful(data->buf, data->len,
290 errors,
291 final ? NULL : &consumed);
292 return codec_tuple(decoded, consumed);
293 }
294
295 /*[clinic input]
296 _codecs.utf_16_decode
297 data: Py_buffer
298 errors: str(accept={str, NoneType}) = None
299 final: bool = False
300 /
301 [clinic start generated code]*/
302
303 static PyObject *
_codecs_utf_16_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)304 _codecs_utf_16_decode_impl(PyObject *module, Py_buffer *data,
305 const char *errors, int final)
306 /*[clinic end generated code: output=783b442abcbcc2d0 input=5b0f52071ba6cadc]*/
307 {
308 int byteorder = 0;
309 /* This is overwritten unless final is true. */
310 Py_ssize_t consumed = data->len;
311 PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
312 errors, &byteorder,
313 final ? NULL : &consumed);
314 return codec_tuple(decoded, consumed);
315 }
316
317 /*[clinic input]
318 _codecs.utf_16_le_decode
319 data: Py_buffer
320 errors: str(accept={str, NoneType}) = None
321 final: bool = False
322 /
323 [clinic start generated code]*/
324
325 static PyObject *
_codecs_utf_16_le_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)326 _codecs_utf_16_le_decode_impl(PyObject *module, Py_buffer *data,
327 const char *errors, int final)
328 /*[clinic end generated code: output=899b9e6364379dcd input=115bd8c7b783d0bf]*/
329 {
330 int byteorder = -1;
331 /* This is overwritten unless final is true. */
332 Py_ssize_t consumed = data->len;
333 PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
334 errors, &byteorder,
335 final ? NULL : &consumed);
336 return codec_tuple(decoded, consumed);
337 }
338
339 /*[clinic input]
340 _codecs.utf_16_be_decode
341 data: Py_buffer
342 errors: str(accept={str, NoneType}) = None
343 final: bool = False
344 /
345 [clinic start generated code]*/
346
347 static PyObject *
_codecs_utf_16_be_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)348 _codecs_utf_16_be_decode_impl(PyObject *module, Py_buffer *data,
349 const char *errors, int final)
350 /*[clinic end generated code: output=49f6465ea07669c8 input=63131422b01f9cb4]*/
351 {
352 int byteorder = 1;
353 /* This is overwritten unless final is true. */
354 Py_ssize_t consumed = data->len;
355 PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
356 errors, &byteorder,
357 final ? NULL : &consumed);
358 return codec_tuple(decoded, consumed);
359 }
360
361 /* This non-standard version also provides access to the byteorder
362 parameter of the builtin UTF-16 codec.
363
364 It returns a tuple (unicode, bytesread, byteorder) with byteorder
365 being the value in effect at the end of data.
366
367 */
368 /*[clinic input]
369 _codecs.utf_16_ex_decode
370 data: Py_buffer
371 errors: str(accept={str, NoneType}) = None
372 byteorder: int = 0
373 final: bool = False
374 /
375 [clinic start generated code]*/
376
377 static PyObject *
_codecs_utf_16_ex_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int byteorder,int final)378 _codecs_utf_16_ex_decode_impl(PyObject *module, Py_buffer *data,
379 const char *errors, int byteorder, int final)
380 /*[clinic end generated code: output=0f385f251ecc1988 input=f368a51cf384bf4c]*/
381 {
382 /* This is overwritten unless final is true. */
383 Py_ssize_t consumed = data->len;
384
385 PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
386 errors, &byteorder,
387 final ? NULL : &consumed);
388 if (decoded == NULL)
389 return NULL;
390 return Py_BuildValue("Nni", decoded, consumed, byteorder);
391 }
392
393 /*[clinic input]
394 _codecs.utf_32_decode
395 data: Py_buffer
396 errors: str(accept={str, NoneType}) = None
397 final: bool = False
398 /
399 [clinic start generated code]*/
400
401 static PyObject *
_codecs_utf_32_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)402 _codecs_utf_32_decode_impl(PyObject *module, Py_buffer *data,
403 const char *errors, int final)
404 /*[clinic end generated code: output=2fc961807f7b145f input=fcdf3658c5e9b5f3]*/
405 {
406 int byteorder = 0;
407 /* This is overwritten unless final is true. */
408 Py_ssize_t consumed = data->len;
409 PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
410 errors, &byteorder,
411 final ? NULL : &consumed);
412 return codec_tuple(decoded, consumed);
413 }
414
415 /*[clinic input]
416 _codecs.utf_32_le_decode
417 data: Py_buffer
418 errors: str(accept={str, NoneType}) = None
419 final: bool = False
420 /
421 [clinic start generated code]*/
422
423 static PyObject *
_codecs_utf_32_le_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)424 _codecs_utf_32_le_decode_impl(PyObject *module, Py_buffer *data,
425 const char *errors, int final)
426 /*[clinic end generated code: output=ec8f46b67a94f3e6 input=12220556e885f817]*/
427 {
428 int byteorder = -1;
429 /* This is overwritten unless final is true. */
430 Py_ssize_t consumed = data->len;
431 PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
432 errors, &byteorder,
433 final ? NULL : &consumed);
434 return codec_tuple(decoded, consumed);
435 }
436
437 /*[clinic input]
438 _codecs.utf_32_be_decode
439 data: Py_buffer
440 errors: str(accept={str, NoneType}) = None
441 final: bool = False
442 /
443 [clinic start generated code]*/
444
445 static PyObject *
_codecs_utf_32_be_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)446 _codecs_utf_32_be_decode_impl(PyObject *module, Py_buffer *data,
447 const char *errors, int final)
448 /*[clinic end generated code: output=ff82bae862c92c4e input=2bc669b4781598db]*/
449 {
450 int byteorder = 1;
451 /* This is overwritten unless final is true. */
452 Py_ssize_t consumed = data->len;
453 PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
454 errors, &byteorder,
455 final ? NULL : &consumed);
456 return codec_tuple(decoded, consumed);
457 }
458
459 /* This non-standard version also provides access to the byteorder
460 parameter of the builtin UTF-32 codec.
461
462 It returns a tuple (unicode, bytesread, byteorder) with byteorder
463 being the value in effect at the end of data.
464
465 */
466 /*[clinic input]
467 _codecs.utf_32_ex_decode
468 data: Py_buffer
469 errors: str(accept={str, NoneType}) = None
470 byteorder: int = 0
471 final: bool = False
472 /
473 [clinic start generated code]*/
474
475 static PyObject *
_codecs_utf_32_ex_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int byteorder,int final)476 _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
477 const char *errors, int byteorder, int final)
478 /*[clinic end generated code: output=6bfb177dceaf4848 input=4a2323d0013620df]*/
479 {
480 Py_ssize_t consumed = data->len;
481 PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
482 errors, &byteorder,
483 final ? NULL : &consumed);
484 if (decoded == NULL)
485 return NULL;
486 return Py_BuildValue("Nni", decoded, consumed, byteorder);
487 }
488
489 /*[clinic input]
490 _codecs.unicode_escape_decode
491 data: Py_buffer(accept={str, buffer})
492 errors: str(accept={str, NoneType}) = None
493 final: bool = True
494 /
495 [clinic start generated code]*/
496
497 static PyObject *
_codecs_unicode_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)498 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
499 const char *errors, int final)
500 /*[clinic end generated code: output=b284f97b12c635ee input=15019f081ffe272b]*/
501 {
502 Py_ssize_t consumed = data->len;
503 PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
504 errors,
505 final ? NULL : &consumed);
506 return codec_tuple(decoded, consumed);
507 }
508
509 /*[clinic input]
510 _codecs.raw_unicode_escape_decode
511 data: Py_buffer(accept={str, buffer})
512 errors: str(accept={str, NoneType}) = None
513 final: bool = True
514 /
515 [clinic start generated code]*/
516
517 static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)518 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
519 const char *errors, int final)
520 /*[clinic end generated code: output=11dbd96301e2879e input=b93f823aa8c343ad]*/
521 {
522 Py_ssize_t consumed = data->len;
523 PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
524 errors,
525 final ? NULL : &consumed);
526 return codec_tuple(decoded, consumed);
527 }
528
529 /*[clinic input]
530 _codecs.latin_1_decode
531 data: Py_buffer
532 errors: str(accept={str, NoneType}) = None
533 /
534 [clinic start generated code]*/
535
536 static PyObject *
_codecs_latin_1_decode_impl(PyObject * module,Py_buffer * data,const char * errors)537 _codecs_latin_1_decode_impl(PyObject *module, Py_buffer *data,
538 const char *errors)
539 /*[clinic end generated code: output=07f3dfa3f72c7d8f input=76ca58fd6dcd08c7]*/
540 {
541 PyObject *decoded = PyUnicode_DecodeLatin1(data->buf, data->len, errors);
542 return codec_tuple(decoded, data->len);
543 }
544
545 /*[clinic input]
546 _codecs.ascii_decode
547 data: Py_buffer
548 errors: str(accept={str, NoneType}) = None
549 /
550 [clinic start generated code]*/
551
552 static PyObject *
_codecs_ascii_decode_impl(PyObject * module,Py_buffer * data,const char * errors)553 _codecs_ascii_decode_impl(PyObject *module, Py_buffer *data,
554 const char *errors)
555 /*[clinic end generated code: output=2627d72058d42429 input=e428a267a04b4481]*/
556 {
557 PyObject *decoded = PyUnicode_DecodeASCII(data->buf, data->len, errors);
558 return codec_tuple(decoded, data->len);
559 }
560
561 /*[clinic input]
562 _codecs.charmap_decode
563 data: Py_buffer
564 errors: str(accept={str, NoneType}) = None
565 mapping: object = None
566 /
567 [clinic start generated code]*/
568
569 static PyObject *
_codecs_charmap_decode_impl(PyObject * module,Py_buffer * data,const char * errors,PyObject * mapping)570 _codecs_charmap_decode_impl(PyObject *module, Py_buffer *data,
571 const char *errors, PyObject *mapping)
572 /*[clinic end generated code: output=2c335b09778cf895 input=15b69df43458eb40]*/
573 {
574 PyObject *decoded;
575
576 if (mapping == Py_None)
577 mapping = NULL;
578
579 decoded = PyUnicode_DecodeCharmap(data->buf, data->len, mapping, errors);
580 return codec_tuple(decoded, data->len);
581 }
582
583 #ifdef MS_WINDOWS
584
585 /*[clinic input]
586 _codecs.mbcs_decode
587 data: Py_buffer
588 errors: str(accept={str, NoneType}) = None
589 final: bool = False
590 /
591 [clinic start generated code]*/
592
593 static PyObject *
_codecs_mbcs_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)594 _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
595 const char *errors, int final)
596 /*[clinic end generated code: output=39b65b8598938c4b input=f144ad1ed6d8f5a6]*/
597 {
598 Py_ssize_t consumed = data->len;
599 PyObject *decoded = PyUnicode_DecodeMBCSStateful(data->buf, data->len,
600 errors, final ? NULL : &consumed);
601 return codec_tuple(decoded, consumed);
602 }
603
604 /*[clinic input]
605 _codecs.oem_decode
606 data: Py_buffer
607 errors: str(accept={str, NoneType}) = None
608 final: bool = False
609 /
610 [clinic start generated code]*/
611
612 static PyObject *
_codecs_oem_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)613 _codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
614 const char *errors, int final)
615 /*[clinic end generated code: output=da1617612f3fcad8 input=629bf87376d211b4]*/
616 {
617 Py_ssize_t consumed = data->len;
618 PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
619 data->buf, data->len, errors, final ? NULL : &consumed);
620 return codec_tuple(decoded, consumed);
621 }
622
623 /*[clinic input]
624 _codecs.code_page_decode
625 codepage: int
626 data: Py_buffer
627 errors: str(accept={str, NoneType}) = None
628 final: bool = False
629 /
630 [clinic start generated code]*/
631
632 static PyObject *
_codecs_code_page_decode_impl(PyObject * module,int codepage,Py_buffer * data,const char * errors,int final)633 _codecs_code_page_decode_impl(PyObject *module, int codepage,
634 Py_buffer *data, const char *errors, int final)
635 /*[clinic end generated code: output=53008ea967da3fff input=6a32589b0658c277]*/
636 {
637 Py_ssize_t consumed = data->len;
638 PyObject *decoded = PyUnicode_DecodeCodePageStateful(codepage,
639 data->buf, data->len,
640 errors,
641 final ? NULL : &consumed);
642 return codec_tuple(decoded, consumed);
643 }
644
645 #endif /* MS_WINDOWS */
646
647 /* --- Encoder ------------------------------------------------------------ */
648
649 /*[clinic input]
650 _codecs.readbuffer_encode
651 data: Py_buffer(accept={str, buffer})
652 errors: str(accept={str, NoneType}) = None
653 /
654 [clinic start generated code]*/
655
656 static PyObject *
_codecs_readbuffer_encode_impl(PyObject * module,Py_buffer * data,const char * errors)657 _codecs_readbuffer_encode_impl(PyObject *module, Py_buffer *data,
658 const char *errors)
659 /*[clinic end generated code: output=c645ea7cdb3d6e86 input=aa10cfdf252455c5]*/
660 {
661 PyObject *result = PyBytes_FromStringAndSize(data->buf, data->len);
662 return codec_tuple(result, data->len);
663 }
664
665 /*[clinic input]
666 _codecs.utf_7_encode
667 str: unicode
668 errors: str(accept={str, NoneType}) = None
669 /
670 [clinic start generated code]*/
671
672 static PyObject *
_codecs_utf_7_encode_impl(PyObject * module,PyObject * str,const char * errors)673 _codecs_utf_7_encode_impl(PyObject *module, PyObject *str,
674 const char *errors)
675 /*[clinic end generated code: output=0feda21ffc921bc8 input=2546dbbb3fa53114]*/
676 {
677 return codec_tuple(_PyUnicode_EncodeUTF7(str, 0, 0, errors),
678 PyUnicode_GET_LENGTH(str));
679 }
680
681 /*[clinic input]
682 _codecs.utf_8_encode
683 str: unicode
684 errors: str(accept={str, NoneType}) = None
685 /
686 [clinic start generated code]*/
687
688 static PyObject *
_codecs_utf_8_encode_impl(PyObject * module,PyObject * str,const char * errors)689 _codecs_utf_8_encode_impl(PyObject *module, PyObject *str,
690 const char *errors)
691 /*[clinic end generated code: output=02bf47332b9c796c input=a3e71ae01c3f93f3]*/
692 {
693 return codec_tuple(_PyUnicode_AsUTF8String(str, errors),
694 PyUnicode_GET_LENGTH(str));
695 }
696
697 /* This version provides access to the byteorder parameter of the
698 builtin UTF-16 codecs as optional third argument. It defaults to 0
   which means: use the native byte order and prepend a BOM (byte order
   mark) to the data.
701
702 */
703
704 /*[clinic input]
705 _codecs.utf_16_encode
706 str: unicode
707 errors: str(accept={str, NoneType}) = None
708 byteorder: int = 0
709 /
710 [clinic start generated code]*/
711
712 static PyObject *
_codecs_utf_16_encode_impl(PyObject * module,PyObject * str,const char * errors,int byteorder)713 _codecs_utf_16_encode_impl(PyObject *module, PyObject *str,
714 const char *errors, int byteorder)
715 /*[clinic end generated code: output=c654e13efa2e64e4 input=68cdc2eb8338555d]*/
716 {
717 return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, byteorder),
718 PyUnicode_GET_LENGTH(str));
719 }
720
721 /*[clinic input]
722 _codecs.utf_16_le_encode
723 str: unicode
724 errors: str(accept={str, NoneType}) = None
725 /
726 [clinic start generated code]*/
727
728 static PyObject *
_codecs_utf_16_le_encode_impl(PyObject * module,PyObject * str,const char * errors)729 _codecs_utf_16_le_encode_impl(PyObject *module, PyObject *str,
730 const char *errors)
731 /*[clinic end generated code: output=431b01e55f2d4995 input=83d042706eed6798]*/
732 {
733 return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, -1),
734 PyUnicode_GET_LENGTH(str));
735 }
736
737 /*[clinic input]
738 _codecs.utf_16_be_encode
739 str: unicode
740 errors: str(accept={str, NoneType}) = None
741 /
742 [clinic start generated code]*/
743
744 static PyObject *
_codecs_utf_16_be_encode_impl(PyObject * module,PyObject * str,const char * errors)745 _codecs_utf_16_be_encode_impl(PyObject *module, PyObject *str,
746 const char *errors)
747 /*[clinic end generated code: output=96886a6fd54dcae3 input=6f1e9e623b03071b]*/
748 {
749 return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, +1),
750 PyUnicode_GET_LENGTH(str));
751 }
752
753 /* This version provides access to the byteorder parameter of the
754 builtin UTF-32 codecs as optional third argument. It defaults to 0
   which means: use the native byte order and prepend a BOM (byte order
   mark) to the data.
757
758 */
759
760 /*[clinic input]
761 _codecs.utf_32_encode
762 str: unicode
763 errors: str(accept={str, NoneType}) = None
764 byteorder: int = 0
765 /
766 [clinic start generated code]*/
767
768 static PyObject *
_codecs_utf_32_encode_impl(PyObject * module,PyObject * str,const char * errors,int byteorder)769 _codecs_utf_32_encode_impl(PyObject *module, PyObject *str,
770 const char *errors, int byteorder)
771 /*[clinic end generated code: output=5c760da0c09a8b83 input=8ec4c64d983bc52b]*/
772 {
773 return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, byteorder),
774 PyUnicode_GET_LENGTH(str));
775 }
776
777 /*[clinic input]
778 _codecs.utf_32_le_encode
779 str: unicode
780 errors: str(accept={str, NoneType}) = None
781 /
782 [clinic start generated code]*/
783
784 static PyObject *
_codecs_utf_32_le_encode_impl(PyObject * module,PyObject * str,const char * errors)785 _codecs_utf_32_le_encode_impl(PyObject *module, PyObject *str,
786 const char *errors)
787 /*[clinic end generated code: output=b65cd176de8e36d6 input=f0918d41de3eb1b1]*/
788 {
789 return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, -1),
790 PyUnicode_GET_LENGTH(str));
791 }
792
793 /*[clinic input]
794 _codecs.utf_32_be_encode
795 str: unicode
796 errors: str(accept={str, NoneType}) = None
797 /
798 [clinic start generated code]*/
799
800 static PyObject *
_codecs_utf_32_be_encode_impl(PyObject * module,PyObject * str,const char * errors)801 _codecs_utf_32_be_encode_impl(PyObject *module, PyObject *str,
802 const char *errors)
803 /*[clinic end generated code: output=1d9e71a9358709e9 input=967a99a95748b557]*/
804 {
805 return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, +1),
806 PyUnicode_GET_LENGTH(str));
807 }
808
809 /*[clinic input]
810 _codecs.unicode_escape_encode
811 str: unicode
812 errors: str(accept={str, NoneType}) = None
813 /
814 [clinic start generated code]*/
815
816 static PyObject *
_codecs_unicode_escape_encode_impl(PyObject * module,PyObject * str,const char * errors)817 _codecs_unicode_escape_encode_impl(PyObject *module, PyObject *str,
818 const char *errors)
819 /*[clinic end generated code: output=66271b30bc4f7a3c input=8c4de07597054e33]*/
820 {
821 return codec_tuple(PyUnicode_AsUnicodeEscapeString(str),
822 PyUnicode_GET_LENGTH(str));
823 }
824
825 /*[clinic input]
826 _codecs.raw_unicode_escape_encode
827 str: unicode
828 errors: str(accept={str, NoneType}) = None
829 /
830 [clinic start generated code]*/
831
832 static PyObject *
_codecs_raw_unicode_escape_encode_impl(PyObject * module,PyObject * str,const char * errors)833 _codecs_raw_unicode_escape_encode_impl(PyObject *module, PyObject *str,
834 const char *errors)
835 /*[clinic end generated code: output=a66a806ed01c830a input=4aa6f280d78e4574]*/
836 {
837 return codec_tuple(PyUnicode_AsRawUnicodeEscapeString(str),
838 PyUnicode_GET_LENGTH(str));
839 }
840
841 /*[clinic input]
842 _codecs.latin_1_encode
843 str: unicode
844 errors: str(accept={str, NoneType}) = None
845 /
846 [clinic start generated code]*/
847
848 static PyObject *
_codecs_latin_1_encode_impl(PyObject * module,PyObject * str,const char * errors)849 _codecs_latin_1_encode_impl(PyObject *module, PyObject *str,
850 const char *errors)
851 /*[clinic end generated code: output=2c28c83a27884e08 input=ec3ef74bf85c5c5d]*/
852 {
853 return codec_tuple(_PyUnicode_AsLatin1String(str, errors),
854 PyUnicode_GET_LENGTH(str));
855 }
856
857 /*[clinic input]
858 _codecs.ascii_encode
859 str: unicode
860 errors: str(accept={str, NoneType}) = None
861 /
862 [clinic start generated code]*/
863
864 static PyObject *
_codecs_ascii_encode_impl(PyObject * module,PyObject * str,const char * errors)865 _codecs_ascii_encode_impl(PyObject *module, PyObject *str,
866 const char *errors)
867 /*[clinic end generated code: output=b5e035182d33befc input=93e6e602838bd3de]*/
868 {
869 return codec_tuple(_PyUnicode_AsASCIIString(str, errors),
870 PyUnicode_GET_LENGTH(str));
871 }
872
873 /*[clinic input]
874 _codecs.charmap_encode
875 str: unicode
876 errors: str(accept={str, NoneType}) = None
877 mapping: object = None
878 /
879 [clinic start generated code]*/
880
881 static PyObject *
_codecs_charmap_encode_impl(PyObject * module,PyObject * str,const char * errors,PyObject * mapping)882 _codecs_charmap_encode_impl(PyObject *module, PyObject *str,
883 const char *errors, PyObject *mapping)
884 /*[clinic end generated code: output=047476f48495a9e9 input=2a98feae73dadce8]*/
885 {
886 if (mapping == Py_None)
887 mapping = NULL;
888
889 return codec_tuple(_PyUnicode_EncodeCharmap(str, mapping, errors),
890 PyUnicode_GET_LENGTH(str));
891 }
892
893 /*[clinic input]
894 _codecs.charmap_build
895 map: unicode
896 /
897 [clinic start generated code]*/
898
899 static PyObject *
_codecs_charmap_build_impl(PyObject * module,PyObject * map)900 _codecs_charmap_build_impl(PyObject *module, PyObject *map)
901 /*[clinic end generated code: output=bb073c27031db9ac input=d91a91d1717dbc6d]*/
902 {
903 return PyUnicode_BuildEncodingMap(map);
904 }
905
906 #ifdef MS_WINDOWS
907
908 /*[clinic input]
909 _codecs.mbcs_encode
910 str: unicode
911 errors: str(accept={str, NoneType}) = None
912 /
913 [clinic start generated code]*/
914
915 static PyObject *
_codecs_mbcs_encode_impl(PyObject * module,PyObject * str,const char * errors)916 _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
917 /*[clinic end generated code: output=76e2e170c966c080 input=2e932fc289ea5a5b]*/
918 {
919 return codec_tuple(PyUnicode_EncodeCodePage(CP_ACP, str, errors),
920 PyUnicode_GET_LENGTH(str));
921 }
922
923 /*[clinic input]
924 _codecs.oem_encode
925 str: unicode
926 errors: str(accept={str, NoneType}) = None
927 /
928 [clinic start generated code]*/
929
930 static PyObject *
_codecs_oem_encode_impl(PyObject * module,PyObject * str,const char * errors)931 _codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
932 /*[clinic end generated code: output=65d5982c737de649 input=9eac86dc21eb14f2]*/
933 {
934 return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
935 PyUnicode_GET_LENGTH(str));
936 }
937
938 /*[clinic input]
939 _codecs.code_page_encode
940 code_page: int
941 str: unicode
942 errors: str(accept={str, NoneType}) = None
943 /
944 [clinic start generated code]*/
945
946 static PyObject *
_codecs_code_page_encode_impl(PyObject * module,int code_page,PyObject * str,const char * errors)947 _codecs_code_page_encode_impl(PyObject *module, int code_page, PyObject *str,
948 const char *errors)
949 /*[clinic end generated code: output=45673f6085657a9e input=7d18a33bc8cd0f94]*/
950 {
951 return codec_tuple(PyUnicode_EncodeCodePage(code_page, str, errors),
952 PyUnicode_GET_LENGTH(str));
953 }
954
955 #endif /* MS_WINDOWS */
956
957 /* --- Error handler registry --------------------------------------------- */
958
959 /*[clinic input]
960 _codecs.register_error
961 errors: str
962 handler: object
963 /
964
965 Register the specified error handler under the name errors.
966
967 handler must be a callable object, that will be called with an exception
968 instance containing information about the location of the encoding/decoding
969 error and must return a (replacement, new position) tuple.
970 [clinic start generated code]*/
971
972 static PyObject *
_codecs_register_error_impl(PyObject * module,const char * errors,PyObject * handler)973 _codecs_register_error_impl(PyObject *module, const char *errors,
974 PyObject *handler)
975 /*[clinic end generated code: output=fa2f7d1879b3067d input=5e6709203c2e33fe]*/
976 {
977 if (PyCodec_RegisterError(errors, handler))
978 return NULL;
979 Py_RETURN_NONE;
980 }
981
982 /*[clinic input]
983 _codecs.lookup_error
984 name: str
985 /
986
987 lookup_error(errors) -> handler
988
989 Return the error handler for the specified error handling name or raise a
990 LookupError, if no handler exists under this name.
991 [clinic start generated code]*/
992
993 static PyObject *
_codecs_lookup_error_impl(PyObject * module,const char * name)994 _codecs_lookup_error_impl(PyObject *module, const char *name)
995 /*[clinic end generated code: output=087f05dc0c9a98cc input=4775dd65e6235aba]*/
996 {
997 return PyCodec_LookupError(name);
998 }
999
1000 /* --- Module API --------------------------------------------------------- */
1001
1002 static PyMethodDef _codecs_functions[] = {
1003 _CODECS_REGISTER_METHODDEF
1004 _CODECS_UNREGISTER_METHODDEF
1005 _CODECS_LOOKUP_METHODDEF
1006 _CODECS_ENCODE_METHODDEF
1007 _CODECS_DECODE_METHODDEF
1008 _CODECS_ESCAPE_ENCODE_METHODDEF
1009 _CODECS_ESCAPE_DECODE_METHODDEF
1010 _CODECS_UTF_8_ENCODE_METHODDEF
1011 _CODECS_UTF_8_DECODE_METHODDEF
1012 _CODECS_UTF_7_ENCODE_METHODDEF
1013 _CODECS_UTF_7_DECODE_METHODDEF
1014 _CODECS_UTF_16_ENCODE_METHODDEF
1015 _CODECS_UTF_16_LE_ENCODE_METHODDEF
1016 _CODECS_UTF_16_BE_ENCODE_METHODDEF
1017 _CODECS_UTF_16_DECODE_METHODDEF
1018 _CODECS_UTF_16_LE_DECODE_METHODDEF
1019 _CODECS_UTF_16_BE_DECODE_METHODDEF
1020 _CODECS_UTF_16_EX_DECODE_METHODDEF
1021 _CODECS_UTF_32_ENCODE_METHODDEF
1022 _CODECS_UTF_32_LE_ENCODE_METHODDEF
1023 _CODECS_UTF_32_BE_ENCODE_METHODDEF
1024 _CODECS_UTF_32_DECODE_METHODDEF
1025 _CODECS_UTF_32_LE_DECODE_METHODDEF
1026 _CODECS_UTF_32_BE_DECODE_METHODDEF
1027 _CODECS_UTF_32_EX_DECODE_METHODDEF
1028 _CODECS_UNICODE_ESCAPE_ENCODE_METHODDEF
1029 _CODECS_UNICODE_ESCAPE_DECODE_METHODDEF
1030 _CODECS_RAW_UNICODE_ESCAPE_ENCODE_METHODDEF
1031 _CODECS_RAW_UNICODE_ESCAPE_DECODE_METHODDEF
1032 _CODECS_LATIN_1_ENCODE_METHODDEF
1033 _CODECS_LATIN_1_DECODE_METHODDEF
1034 _CODECS_ASCII_ENCODE_METHODDEF
1035 _CODECS_ASCII_DECODE_METHODDEF
1036 _CODECS_CHARMAP_ENCODE_METHODDEF
1037 _CODECS_CHARMAP_DECODE_METHODDEF
1038 _CODECS_CHARMAP_BUILD_METHODDEF
1039 _CODECS_READBUFFER_ENCODE_METHODDEF
1040 _CODECS_MBCS_ENCODE_METHODDEF
1041 _CODECS_MBCS_DECODE_METHODDEF
1042 _CODECS_OEM_ENCODE_METHODDEF
1043 _CODECS_OEM_DECODE_METHODDEF
1044 _CODECS_CODE_PAGE_ENCODE_METHODDEF
1045 _CODECS_CODE_PAGE_DECODE_METHODDEF
1046 _CODECS_REGISTER_ERROR_METHODDEF
1047 _CODECS_LOOKUP_ERROR_METHODDEF
1048 {NULL, NULL} /* sentinel */
1049 };
1050
1051 static PyModuleDef_Slot _codecs_slots[] = {
1052 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1053 {Py_mod_gil, Py_MOD_GIL_NOT_USED},
1054 {0, NULL}
1055 };
1056
1057 static struct PyModuleDef codecsmodule = {
1058 PyModuleDef_HEAD_INIT,
1059 "_codecs",
1060 NULL,
1061 0,
1062 _codecs_functions,
1063 _codecs_slots,
1064 NULL,
1065 NULL,
1066 NULL
1067 };
1068
1069 PyMODINIT_FUNC
PyInit__codecs(void)1070 PyInit__codecs(void)
1071 {
1072 return PyModuleDef_Init(&codecsmodule);
1073 }
1074