1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "pycore_interp.h" // PyInterpreterState.codec_search_path
13 #include "pycore_pystate.h" // _PyInterpreterState_GET()
14 #include "ucnhash.h"
15 #include <ctype.h>
16
17 const char *Py_hexdigits = "0123456789abcdef";
18
19 /* --- Codec Registry ----------------------------------------------------- */
20
/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
31
32 static int _PyCodecRegistry_Init(void); /* Forward */
33
/* Append search_function to the current interpreter's codec search path,
   initialising the codec registry first when the search path does not
   exist yet.

   Returns the result of PyList_Append() when the argument is accepted.
   Returns -1 if registry initialisation reports failure, if
   search_function is NULL (PyErr_BadArgument) or if it is not callable
   (TypeError). */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (PyCallable_Check(search_function)) {
        return PyList_Append(interp->codec_search_path, search_function);
    }
    PyErr_SetString(PyExc_TypeError, "argument must be callable");
    return -1;
}
52
53 extern int _Py_normalize_encoding(const char *, char *, size_t);
54
55 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
56 converted to lower case, spaces and hyphens are replaced with underscores. */
57
58 static
normalizestring(const char * string)59 PyObject *normalizestring(const char *string)
60 {
61 size_t len = strlen(string);
62 char *encoding;
63 PyObject *v;
64
65 if (len > PY_SSIZE_T_MAX) {
66 PyErr_SetString(PyExc_OverflowError, "string is too large");
67 return NULL;
68 }
69
70 encoding = PyMem_Malloc(len + 1);
71 if (encoding == NULL)
72 return PyErr_NoMemory();
73
74 if (!_Py_normalize_encoding(string, encoding, len + 1))
75 {
76 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
77 PyMem_Free(encoding);
78 return NULL;
79 }
80
81 v = PyUnicode_FromString(encoding);
82 PyMem_Free(encoding);
83 return v;
84 }
85
86 /* Lookup the given encoding and return a tuple providing the codec
87 facilities.
88
89 The encoding string is looked up converted to all lower-case
90 characters. This makes encodings looked up through this mechanism
91 effectively case-insensitive.
92
93 If no codec is found, a LookupError is set and NULL returned.
94
95 As side effect, this tries to load the encodings package, if not
96 yet done. This is part of the lazy load strategy for the encodings
97 package.
98
99 */
100
_PyCodec_Lookup(const char * encoding)101 PyObject *_PyCodec_Lookup(const char *encoding)
102 {
103 if (encoding == NULL) {
104 PyErr_BadArgument();
105 return NULL;
106 }
107
108 PyInterpreterState *interp = _PyInterpreterState_GET();
109 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
110 return NULL;
111 }
112
113 /* Convert the encoding to a normalized Python string: all
114 characters are converted to lower case, spaces and hyphens are
115 replaced with underscores. */
116 PyObject *v = normalizestring(encoding);
117 if (v == NULL) {
118 return NULL;
119 }
120 PyUnicode_InternInPlace(&v);
121
122 /* First, try to lookup the name in the registry dictionary */
123 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124 if (result != NULL) {
125 Py_INCREF(result);
126 Py_DECREF(v);
127 return result;
128 }
129 else if (PyErr_Occurred()) {
130 goto onError;
131 }
132
133 /* Next, scan the search functions in order of registration */
134 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
135 if (len < 0)
136 goto onError;
137 if (len == 0) {
138 PyErr_SetString(PyExc_LookupError,
139 "no codec search functions registered: "
140 "can't find encoding");
141 goto onError;
142 }
143
144 Py_ssize_t i;
145 for (i = 0; i < len; i++) {
146 PyObject *func;
147
148 func = PyList_GetItem(interp->codec_search_path, i);
149 if (func == NULL)
150 goto onError;
151 result = PyObject_CallOneArg(func, v);
152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
165 }
166 if (i == len) {
167 /* XXX Perhaps we should cache misses too ? */
168 PyErr_Format(PyExc_LookupError,
169 "unknown encoding: %s", encoding);
170 goto onError;
171 }
172
173 /* Cache and return the result */
174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175 Py_DECREF(result);
176 goto onError;
177 }
178 Py_DECREF(v);
179 return result;
180
181 onError:
182 Py_DECREF(v);
183 return NULL;
184 }
185
_PyCodec_Forget(const char * encoding)186 int _PyCodec_Forget(const char *encoding)
187 {
188 PyObject *v;
189 int result;
190
191 PyInterpreterState *interp = _PyInterpreterState_GET();
192 if (interp->codec_search_path == NULL) {
193 return -1;
194 }
195
196 /* Convert the encoding to a normalized Python string: all
197 characters are converted to lower case, spaces and hyphens are
198 replaced with underscores. */
199 v = normalizestring(encoding);
200 if (v == NULL) {
201 return -1;
202 }
203
204 /* Drop the named codec from the internal cache */
205 result = PyDict_DelItem(interp->codec_search_cache, v);
206 Py_DECREF(v);
207
208 return result;
209 }
210
211 /* Codec registry encoding check API. */
212
PyCodec_KnownEncoding(const char * encoding)213 int PyCodec_KnownEncoding(const char *encoding)
214 {
215 PyObject *codecs;
216
217 codecs = _PyCodec_Lookup(encoding);
218 if (!codecs) {
219 PyErr_Clear();
220 return 0;
221 }
222 else {
223 Py_DECREF(codecs);
224 return 1;
225 }
226 }
227
228 static
args_tuple(PyObject * object,const char * errors)229 PyObject *args_tuple(PyObject *object,
230 const char *errors)
231 {
232 PyObject *args;
233
234 args = PyTuple_New(1 + (errors != NULL));
235 if (args == NULL)
236 return NULL;
237 Py_INCREF(object);
238 PyTuple_SET_ITEM(args,0,object);
239 if (errors) {
240 PyObject *v;
241
242 v = PyUnicode_FromString(errors);
243 if (v == NULL) {
244 Py_DECREF(args);
245 return NULL;
246 }
247 PyTuple_SET_ITEM(args, 1, v);
248 }
249 return args;
250 }
251
252 /* Helper function to get a codec item */
253
254 static
codec_getitem(const char * encoding,int index)255 PyObject *codec_getitem(const char *encoding, int index)
256 {
257 PyObject *codecs;
258 PyObject *v;
259
260 codecs = _PyCodec_Lookup(encoding);
261 if (codecs == NULL)
262 return NULL;
263 v = PyTuple_GET_ITEM(codecs, index);
264 Py_DECREF(codecs);
265 Py_INCREF(v);
266 return v;
267 }
268
269 /* Helper functions to create an incremental codec. */
270 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)271 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272 const char *errors,
273 const char *attrname)
274 {
275 PyObject *ret, *inccodec;
276
277 inccodec = PyObject_GetAttrString(codec_info, attrname);
278 if (inccodec == NULL)
279 return NULL;
280 if (errors)
281 ret = PyObject_CallFunction(inccodec, "s", errors);
282 else
283 ret = _PyObject_CallNoArg(inccodec);
284 Py_DECREF(inccodec);
285 return ret;
286 }
287
288 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)289 PyObject *codec_getincrementalcodec(const char *encoding,
290 const char *errors,
291 const char *attrname)
292 {
293 PyObject *codec_info, *ret;
294
295 codec_info = _PyCodec_Lookup(encoding);
296 if (codec_info == NULL)
297 return NULL;
298 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299 Py_DECREF(codec_info);
300 return ret;
301 }
302
303 /* Helper function to create a stream codec. */
304
305 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)306 PyObject *codec_getstreamcodec(const char *encoding,
307 PyObject *stream,
308 const char *errors,
309 const int index)
310 {
311 PyObject *codecs, *streamcodec, *codeccls;
312
313 codecs = _PyCodec_Lookup(encoding);
314 if (codecs == NULL)
315 return NULL;
316
317 codeccls = PyTuple_GET_ITEM(codecs, index);
318 if (errors != NULL)
319 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
320 else
321 streamcodec = PyObject_CallOneArg(codeccls, stream);
322 Py_DECREF(codecs);
323 return streamcodec;
324 }
325
326 /* Helpers to work with the result of _PyCodec_Lookup
327
328 */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                   const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";
    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
335
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                   const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";
    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
342
343
344 /* Convenience APIs to query the Codec registry.
345
346 All APIs return a codec object with incremented refcount.
347
348 */
349
PyCodec_Encoder(const char * encoding)350 PyObject *PyCodec_Encoder(const char *encoding)
351 {
352 return codec_getitem(encoding, 0);
353 }
354
PyCodec_Decoder(const char * encoding)355 PyObject *PyCodec_Decoder(const char *encoding)
356 {
357 return codec_getitem(encoding, 1);
358 }
359
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)360 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
361 const char *errors)
362 {
363 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
364 }
365
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)366 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
367 const char *errors)
368 {
369 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
370 }
371
/* Create a stream reader for `stream` by calling item 2 of the codec
   tuple registered for `encoding`. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int reader_index = 2;
    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
378
/* Create a stream writer for `stream` by calling item 3 of the codec
   tuple registered for `encoding`. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int writer_index = 3;
    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
385
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception raised.
 *
 * `operation` and `encoding` are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone if it can; otherwise the original exception
     * is left alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
401
402 /* Encode an object (e.g. a Unicode object) using the given encoding
403 and return the resulting encoded object (usually a Python string).
404
405 errors is passed to the encoder factory as argument if non-NULL. */
406
407 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)408 _PyCodec_EncodeInternal(PyObject *object,
409 PyObject *encoder,
410 const char *encoding,
411 const char *errors)
412 {
413 PyObject *args = NULL, *result = NULL;
414 PyObject *v = NULL;
415
416 args = args_tuple(object, errors);
417 if (args == NULL)
418 goto onError;
419
420 result = PyObject_Call(encoder, args, NULL);
421 if (result == NULL) {
422 wrap_codec_error("encoding", encoding);
423 goto onError;
424 }
425
426 if (!PyTuple_Check(result) ||
427 PyTuple_GET_SIZE(result) != 2) {
428 PyErr_SetString(PyExc_TypeError,
429 "encoder must return a tuple (object, integer)");
430 goto onError;
431 }
432 v = PyTuple_GET_ITEM(result,0);
433 Py_INCREF(v);
434 /* We don't check or use the second (integer) entry. */
435
436 Py_DECREF(args);
437 Py_DECREF(encoder);
438 Py_DECREF(result);
439 return v;
440
441 onError:
442 Py_XDECREF(result);
443 Py_XDECREF(args);
444 Py_XDECREF(encoder);
445 return NULL;
446 }
447
448 /* Decode an object (usually a Python string) using the given encoding
449 and return an equivalent object (e.g. a Unicode object).
450
451 errors is passed to the decoder factory as argument if non-NULL. */
452
453 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)454 _PyCodec_DecodeInternal(PyObject *object,
455 PyObject *decoder,
456 const char *encoding,
457 const char *errors)
458 {
459 PyObject *args = NULL, *result = NULL;
460 PyObject *v;
461
462 args = args_tuple(object, errors);
463 if (args == NULL)
464 goto onError;
465
466 result = PyObject_Call(decoder, args, NULL);
467 if (result == NULL) {
468 wrap_codec_error("decoding", encoding);
469 goto onError;
470 }
471 if (!PyTuple_Check(result) ||
472 PyTuple_GET_SIZE(result) != 2) {
473 PyErr_SetString(PyExc_TypeError,
474 "decoder must return a tuple (object,integer)");
475 goto onError;
476 }
477 v = PyTuple_GET_ITEM(result,0);
478 Py_INCREF(v);
479 /* We don't check or use the second (integer) entry. */
480
481 Py_DECREF(args);
482 Py_DECREF(decoder);
483 Py_DECREF(result);
484 return v;
485
486 onError:
487 Py_XDECREF(args);
488 Py_XDECREF(decoder);
489 Py_XDECREF(result);
490 return NULL;
491 }
492
493 /* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`.  Returns
   NULL if the encoder cannot be found; otherwise the encoder reference
   is handed over to _PyCodec_EncodeInternal(), which releases it. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder != NULL) {
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
506
/* Decode `object` with the decoder registered for `encoding`.  Returns
   NULL if the decoder cannot be found; otherwise the decoder reference
   is handed over to _PyCodec_DecodeInternal(), which releases it. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder != NULL) {
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
519
520 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)521 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
522 const char *alternate_command)
523 {
524 _Py_IDENTIFIER(_is_text_encoding);
525 PyObject *codec;
526 PyObject *attr;
527 int is_text_codec;
528
529 codec = _PyCodec_Lookup(encoding);
530 if (codec == NULL)
531 return NULL;
532
533 /* Backwards compatibility: assume any raw tuple describes a text
534 * encoding, and the same for anything lacking the private
535 * attribute.
536 */
537 if (!PyTuple_CheckExact(codec)) {
538 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
539 Py_DECREF(codec);
540 return NULL;
541 }
542 if (attr != NULL) {
543 is_text_codec = PyObject_IsTrue(attr);
544 Py_DECREF(attr);
545 if (is_text_codec <= 0) {
546 Py_DECREF(codec);
547 if (!is_text_codec)
548 PyErr_Format(PyExc_LookupError,
549 "'%.400s' is not a text encoding; "
550 "use %s to handle arbitrary codecs",
551 encoding, alternate_command);
552 return NULL;
553 }
554 }
555 }
556
557 /* This appears to be a valid text encoding */
558 return codec;
559 }
560
561
562 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)563 PyObject *codec_getitem_checked(const char *encoding,
564 const char *alternate_command,
565 int index)
566 {
567 PyObject *codec;
568 PyObject *v;
569
570 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571 if (codec == NULL)
572 return NULL;
573
574 v = PyTuple_GET_ITEM(codec, index);
575 Py_INCREF(v);
576 Py_DECREF(codec);
577 return v;
578 }
579
_PyCodec_TextEncoder(const char * encoding)580 static PyObject * _PyCodec_TextEncoder(const char *encoding)
581 {
582 return codec_getitem_checked(encoding, "codecs.encode()", 0);
583 }
584
_PyCodec_TextDecoder(const char * encoding)585 static PyObject * _PyCodec_TextDecoder(const char *encoding)
586 {
587 return codec_getitem_checked(encoding, "codecs.decode()", 1);
588 }
589
/* Encode `object` with the text encoder registered for `encoding`.
   Returns NULL if no suitable text encoder is found; otherwise the
   encoder reference is handed over to _PyCodec_EncodeInternal(). */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder != NULL) {
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
602
/* Decode `object` with the text decoder registered for `encoding`.
   Returns NULL if no suitable text decoder is found; otherwise the
   decoder reference is handed over to _PyCodec_DecodeInternal(). */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder != NULL) {
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
615
616 /* Register the error handling callback function error under the name
617 name. This function will be called by the codec when it encounters
618 an unencodable characters/undecodable bytes and doesn't know the
619 callback name, when name is specified as the error parameter
620 in the call to the encode/decode function.
621 Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)622 int PyCodec_RegisterError(const char *name, PyObject *error)
623 {
624 PyInterpreterState *interp = _PyInterpreterState_GET();
625 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626 return -1;
627 if (!PyCallable_Check(error)) {
628 PyErr_SetString(PyExc_TypeError, "handler must be callable");
629 return -1;
630 }
631 return PyDict_SetItemString(interp->codec_error_registry,
632 name, error);
633 }
634
635 /* Lookup the error handling callback function registered under the
636 name error. As a special case NULL can be passed, in which case
637 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)638 PyObject *PyCodec_LookupError(const char *name)
639 {
640 PyObject *handler = NULL;
641
642 PyInterpreterState *interp = _PyInterpreterState_GET();
643 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644 return NULL;
645
646 if (name==NULL)
647 name = "strict";
648 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649 if (handler) {
650 Py_INCREF(handler);
651 }
652 else if (!PyErr_Occurred()) {
653 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654 }
655 return handler;
656 }
657
/* Raise TypeError naming the type of `exc`; used by the error callbacks
   when they are handed an exception they do not handle. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;
    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664
/* "strict" error callback: re-raise `exc` when it is an exception
   instance, raise TypeError otherwise.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
673
674
/* "ignore" error callback: returns the tuple (empty string, end), where
   end is the end position stored in the Unicode encode, decode or
   translate error `exc`.  Any other exception type yields TypeError. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697
698
/* "replace" error callback.

   - UnicodeEncodeError: returns (string of end-start '?' characters, end).
   - UnicodeDecodeError: returns (Py_UNICODE_REPLACEMENT_CHARACTER, end).
   - UnicodeTranslateError: returns (string of end-start replacement
     characters, end).
   Any other exception type yields TypeError. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, '?');
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_1BYTE_KIND);
        Py_UCS1 *dst = PyUnicode_1BYTE_DATA(filler);
        for (Py_ssize_t k = count; k > 0; --k) {
            *dst++ = '?';
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start) ||
            PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *dst = PyUnicode_2BYTE_DATA(filler);
        for (Py_ssize_t k = count; k > 0; --k) {
            *dst++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
751
/* "xmlcharrefreplace" error callback.

   For a UnicodeEncodeError, returns (replacement, end) where replacement
   spells every character in [start, end) as a decimal XML character
   reference "&#NNN;".  The range is shortened when needed so that the
   worst-case output size (2 + 7 + 1 characters per input character)
   cannot exceed PY_SSIZE_T_MAX.  Any other exception type yields
   TypeError. */
PyObject *
PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    Py_ssize_t start, end;
    if (PyUnicodeEncodeError_GetStart(exc, &start)) {
        return NULL;
    }
    if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
        return NULL;
    }
    PyObject *object = PyUnicodeEncodeError_GetObject(exc);
    if (object == NULL) {
        return NULL;
    }

    const Py_ssize_t max_chars = PY_SSIZE_T_MAX / (2+7+1);
    if (end - start > max_chars) {
        end = start + max_chars;
    }

    /* First pass: compute the size of the replacement.  Each reference
       is "&#", 1 to 7 decimal digits, then ";". */
    Py_ssize_t ressize = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        /* object is guaranteed to be "ready" */
        Py_UCS4 ch = PyUnicode_READ_CHAR(object, pos);
        Py_ssize_t ndigits = 1;
        for (Py_UCS4 limit = 10; ndigits < 7 && ch >= limit; limit *= 10) {
            ndigits++;
        }
        ressize += 2 + ndigits + 1;
    }

    /* allocate replacement */
    PyObject *res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: generate the replacement. */
    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(object, pos);

        /* base ends up as the place value of the most significant
           digit; at most 7 digits are emitted. */
        int digits = 1;
        Py_UCS4 base = 1;
        while (digits < 7 && ch >= base * 10) {
            base *= 10;
            digits++;
        }

        *outp++ = '&';
        *outp++ = '#';
        for (; digits > 0; --digits) {
            *outp++ = '0' + ch / base;
            ch %= base;
            base /= 10;
        }
        *outp++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(object);
    return restuple;
}
849
/* "backslashreplace" error callback.

   - UnicodeDecodeError: every byte in [start, end) is replaced by the
     four characters "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) is replaced by "\xNN", "\uNNNN" or "\UNNNNNNNN"
     depending on its code point.
   Returns the tuple (replacement, end).  The range is shortened when
   needed so that the replacement size cannot exceed PY_SSIZE_T_MAX.
   Any other exception type yields TypeError. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Py_ssize_t, not int: the size grows by up to 10 per character and
       the range may hold up to PY_SSIZE_T_MAX / 10 characters. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        /* Same clamping as in the text branch below: keep the
           multiplication by 4 from overflowing. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  Each branch emits only its
       leading hex digits; the two lowest digits are written by the
       common tail after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956
957 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
958
/* "namereplace" error callback.

   For a UnicodeEncodeError, returns (replacement, end) where every
   character in [start, end) is replaced by "\N{NAME}" when the
   unicodedata name lookup succeeds and by a "\xNN", "\uNNNN" or
   "\UNNNNNNNN" escape otherwise.  If the replacement would exceed
   PY_SSIZE_T_MAX, processing stops early and the returned end position
   is reduced accordingly.  Any other exception type yields TypeError.

   The unicodedata capsule is imported on first use and kept in
   ucnhash_CAPI. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UCS1 *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_CAPI) {
            /* load the unicode data module */
            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_CAPI) {
                /* object is an owned reference: release it on failure. */
                Py_DECREF(object);
                return NULL;
            }
        }
        /* First pass: compute the size of the replacement, stopping
           before it could overflow. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res==NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the replacement. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                const size_t namelen = strlen(buffer);
                *outp++ = 'N';
                *outp++ = '{';
                /* Copy the name without its terminator; '}' follows. */
                memcpy(outp, buffer, namelen);
                outp += namelen;
                *outp++ = '}';
                continue;
            }
            /* No name: emit a hex escape.  The two lowest digits are
               written by the common tail below. */
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1051
1052 #define ENC_UNKNOWN -1
1053 #define ENC_UTF8 0
1054 #define ENC_UTF16BE 1
1055 #define ENC_UTF16LE 2
1056 #define ENC_UTF32BE 3
1057 #define ENC_UTF32LE 4
1058
1059 static int
get_standard_encoding(const char * encoding,int * bytelength)1060 get_standard_encoding(const char *encoding, int *bytelength)
1061 {
1062 if (Py_TOLOWER(encoding[0]) == 'u' &&
1063 Py_TOLOWER(encoding[1]) == 't' &&
1064 Py_TOLOWER(encoding[2]) == 'f') {
1065 encoding += 3;
1066 if (*encoding == '-' || *encoding == '_' )
1067 encoding++;
1068 if (encoding[0] == '8' && encoding[1] == '\0') {
1069 *bytelength = 3;
1070 return ENC_UTF8;
1071 }
1072 else if (encoding[0] == '1' && encoding[1] == '6') {
1073 encoding += 2;
1074 *bytelength = 2;
1075 if (*encoding == '\0') {
1076 #ifdef WORDS_BIGENDIAN
1077 return ENC_UTF16BE;
1078 #else
1079 return ENC_UTF16LE;
1080 #endif
1081 }
1082 if (*encoding == '-' || *encoding == '_' )
1083 encoding++;
1084 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1085 if (Py_TOLOWER(encoding[0]) == 'b')
1086 return ENC_UTF16BE;
1087 if (Py_TOLOWER(encoding[0]) == 'l')
1088 return ENC_UTF16LE;
1089 }
1090 }
1091 else if (encoding[0] == '3' && encoding[1] == '2') {
1092 encoding += 2;
1093 *bytelength = 4;
1094 if (*encoding == '\0') {
1095 #ifdef WORDS_BIGENDIAN
1096 return ENC_UTF32BE;
1097 #else
1098 return ENC_UTF32LE;
1099 #endif
1100 }
1101 if (*encoding == '-' || *encoding == '_' )
1102 encoding++;
1103 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1104 if (Py_TOLOWER(encoding[0]) == 'b')
1105 return ENC_UTF32BE;
1106 if (Py_TOLOWER(encoding[0]) == 'l')
1107 return ENC_UTF32LE;
1108 }
1109 }
1110 }
1111 else if (strcmp(encoding, "CP_UTF8") == 0) {
1112 *bytelength = 3;
1113 return ENC_UTF8;
1114 }
1115 return ENC_UNKNOWN;
1116 }
1117
1118 /* This handler is declared static until someone demonstrates
1119 a need to call it directly. */
1120 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1121 PyCodec_SurrogatePassErrors(PyObject *exc)
1122 {
1123 PyObject *restuple;
1124 PyObject *object;
1125 PyObject *encode;
1126 const char *encoding;
1127 int code;
1128 int bytelength;
1129 Py_ssize_t i;
1130 Py_ssize_t start;
1131 Py_ssize_t end;
1132 PyObject *res;
1133
1134 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1135 unsigned char *outp;
1136 if (PyUnicodeEncodeError_GetStart(exc, &start))
1137 return NULL;
1138 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1139 return NULL;
1140 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1141 return NULL;
1142 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1143 Py_DECREF(object);
1144 return NULL;
1145 }
1146 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1147 Py_DECREF(object);
1148 Py_DECREF(encode);
1149 return NULL;
1150 }
1151 code = get_standard_encoding(encoding, &bytelength);
1152 Py_DECREF(encode);
1153 if (code == ENC_UNKNOWN) {
1154 /* Not supported, fail with original exception */
1155 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1156 Py_DECREF(object);
1157 return NULL;
1158 }
1159
1160 if (end - start > PY_SSIZE_T_MAX / bytelength)
1161 end = start + PY_SSIZE_T_MAX / bytelength;
1162 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1163 if (!res) {
1164 Py_DECREF(object);
1165 return NULL;
1166 }
1167 outp = (unsigned char*)PyBytes_AsString(res);
1168 for (i = start; i < end; i++) {
1169 /* object is guaranteed to be "ready" */
1170 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1171 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1172 /* Not a surrogate, fail with original exception */
1173 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1174 Py_DECREF(res);
1175 Py_DECREF(object);
1176 return NULL;
1177 }
1178 switch (code) {
1179 case ENC_UTF8:
1180 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1181 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1182 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1183 break;
1184 case ENC_UTF16LE:
1185 *outp++ = (unsigned char) ch;
1186 *outp++ = (unsigned char)(ch >> 8);
1187 break;
1188 case ENC_UTF16BE:
1189 *outp++ = (unsigned char)(ch >> 8);
1190 *outp++ = (unsigned char) ch;
1191 break;
1192 case ENC_UTF32LE:
1193 *outp++ = (unsigned char) ch;
1194 *outp++ = (unsigned char)(ch >> 8);
1195 *outp++ = (unsigned char)(ch >> 16);
1196 *outp++ = (unsigned char)(ch >> 24);
1197 break;
1198 case ENC_UTF32BE:
1199 *outp++ = (unsigned char)(ch >> 24);
1200 *outp++ = (unsigned char)(ch >> 16);
1201 *outp++ = (unsigned char)(ch >> 8);
1202 *outp++ = (unsigned char) ch;
1203 break;
1204 }
1205 }
1206 restuple = Py_BuildValue("(On)", res, end);
1207 Py_DECREF(res);
1208 Py_DECREF(object);
1209 return restuple;
1210 }
1211 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1212 const unsigned char *p;
1213 Py_UCS4 ch = 0;
1214 if (PyUnicodeDecodeError_GetStart(exc, &start))
1215 return NULL;
1216 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1217 return NULL;
1218 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1219 return NULL;
1220 p = (const unsigned char*)PyBytes_AS_STRING(object);
1221 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1222 Py_DECREF(object);
1223 return NULL;
1224 }
1225 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1226 Py_DECREF(object);
1227 Py_DECREF(encode);
1228 return NULL;
1229 }
1230 code = get_standard_encoding(encoding, &bytelength);
1231 Py_DECREF(encode);
1232 if (code == ENC_UNKNOWN) {
1233 /* Not supported, fail with original exception */
1234 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1235 Py_DECREF(object);
1236 return NULL;
1237 }
1238
1239 /* Try decoding a single surrogate character. If
1240 there are more, let the codec call us again. */
1241 p += start;
1242 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1243 switch (code) {
1244 case ENC_UTF8:
1245 if ((p[0] & 0xf0) == 0xe0 &&
1246 (p[1] & 0xc0) == 0x80 &&
1247 (p[2] & 0xc0) == 0x80) {
1248 /* it's a three-byte code */
1249 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1250 }
1251 break;
1252 case ENC_UTF16LE:
1253 ch = p[1] << 8 | p[0];
1254 break;
1255 case ENC_UTF16BE:
1256 ch = p[0] << 8 | p[1];
1257 break;
1258 case ENC_UTF32LE:
1259 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1260 break;
1261 case ENC_UTF32BE:
1262 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1263 break;
1264 }
1265 }
1266
1267 Py_DECREF(object);
1268 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1269 /* it's not a surrogate - fail */
1270 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1271 return NULL;
1272 }
1273 res = PyUnicode_FromOrdinal(ch);
1274 if (res == NULL)
1275 return NULL;
1276 return Py_BuildValue("(Nn)", res, start + bytelength);
1277 }
1278 else {
1279 wrong_exception_type(exc);
1280 return NULL;
1281 }
1282 }
1283
1284 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1285 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1286 {
1287 PyObject *restuple;
1288 PyObject *object;
1289 Py_ssize_t i;
1290 Py_ssize_t start;
1291 Py_ssize_t end;
1292 PyObject *res;
1293
1294 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1295 char *outp;
1296 if (PyUnicodeEncodeError_GetStart(exc, &start))
1297 return NULL;
1298 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1299 return NULL;
1300 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1301 return NULL;
1302 res = PyBytes_FromStringAndSize(NULL, end-start);
1303 if (!res) {
1304 Py_DECREF(object);
1305 return NULL;
1306 }
1307 outp = PyBytes_AsString(res);
1308 for (i = start; i < end; i++) {
1309 /* object is guaranteed to be "ready" */
1310 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1311 if (ch < 0xdc80 || ch > 0xdcff) {
1312 /* Not a UTF-8b surrogate, fail with original exception */
1313 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1314 Py_DECREF(res);
1315 Py_DECREF(object);
1316 return NULL;
1317 }
1318 *outp++ = ch - 0xdc00;
1319 }
1320 restuple = Py_BuildValue("(On)", res, end);
1321 Py_DECREF(res);
1322 Py_DECREF(object);
1323 return restuple;
1324 }
1325 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1326 PyObject *str;
1327 const unsigned char *p;
1328 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1329 int consumed = 0;
1330 if (PyUnicodeDecodeError_GetStart(exc, &start))
1331 return NULL;
1332 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1333 return NULL;
1334 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1335 return NULL;
1336 p = (const unsigned char*)PyBytes_AS_STRING(object);
1337 while (consumed < 4 && consumed < end-start) {
1338 /* Refuse to escape ASCII bytes. */
1339 if (p[start+consumed] < 128)
1340 break;
1341 ch[consumed] = 0xdc00 + p[start+consumed];
1342 consumed++;
1343 }
1344 Py_DECREF(object);
1345 if (!consumed) {
1346 /* codec complained about ASCII byte. */
1347 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1348 return NULL;
1349 }
1350 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1351 if (str == NULL)
1352 return NULL;
1353 return Py_BuildValue("(Nn)", str, start+consumed);
1354 }
1355 else {
1356 wrong_exception_type(exc);
1357 return NULL;
1358 }
1359 }
1360
1361
/* METH_O adapter registered for the "strict" error handler: self is unused
   and exc is passed straight to PyCodec_StrictErrors(). */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1366
1367
/* METH_O adapter registered for the "ignore" error handler: self is unused
   and exc is passed straight to PyCodec_IgnoreErrors(). */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1372
1373
/* METH_O adapter registered for the "replace" error handler: self is unused
   and exc is passed straight to PyCodec_ReplaceErrors(). */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1378
1379
/* METH_O adapter registered for the "xmlcharrefreplace" error handler: self
   is unused and exc is passed straight to
   PyCodec_XMLCharRefReplaceErrors(). */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1384
1385
/* METH_O adapter registered for the "backslashreplace" error handler: self
   is unused and exc is passed straight to
   PyCodec_BackslashReplaceErrors(). */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1390
/* METH_O adapter registered for the "namereplace" error handler: self is
   unused and exc is passed straight to PyCodec_NameReplaceErrors(). */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1395
/* METH_O adapter registered for the "surrogatepass" error handler: self is
   unused and exc is passed straight to PyCodec_SurrogatePassErrors(). */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1400
/* METH_O adapter registered for the "surrogateescape" error handler: self is
   unused and exc is passed straight to PyCodec_SurrogateEscapeErrors(). */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1405
_PyCodecRegistry_Init(void)1406 static int _PyCodecRegistry_Init(void)
1407 {
1408 static struct {
1409 const char *name;
1410 PyMethodDef def;
1411 } methods[] =
1412 {
1413 {
1414 "strict",
1415 {
1416 "strict_errors",
1417 strict_errors,
1418 METH_O,
1419 PyDoc_STR("Implements the 'strict' error handling, which "
1420 "raises a UnicodeError on coding errors.")
1421 }
1422 },
1423 {
1424 "ignore",
1425 {
1426 "ignore_errors",
1427 ignore_errors,
1428 METH_O,
1429 PyDoc_STR("Implements the 'ignore' error handling, which "
1430 "ignores malformed data and continues.")
1431 }
1432 },
1433 {
1434 "replace",
1435 {
1436 "replace_errors",
1437 replace_errors,
1438 METH_O,
1439 PyDoc_STR("Implements the 'replace' error handling, which "
1440 "replaces malformed data with a replacement marker.")
1441 }
1442 },
1443 {
1444 "xmlcharrefreplace",
1445 {
1446 "xmlcharrefreplace_errors",
1447 xmlcharrefreplace_errors,
1448 METH_O,
1449 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1450 "which replaces an unencodable character with the "
1451 "appropriate XML character reference.")
1452 }
1453 },
1454 {
1455 "backslashreplace",
1456 {
1457 "backslashreplace_errors",
1458 backslashreplace_errors,
1459 METH_O,
1460 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1461 "which replaces malformed data with a backslashed "
1462 "escape sequence.")
1463 }
1464 },
1465 {
1466 "namereplace",
1467 {
1468 "namereplace_errors",
1469 namereplace_errors,
1470 METH_O,
1471 PyDoc_STR("Implements the 'namereplace' error handling, "
1472 "which replaces an unencodable character with a "
1473 "\\N{...} escape sequence.")
1474 }
1475 },
1476 {
1477 "surrogatepass",
1478 {
1479 "surrogatepass",
1480 surrogatepass_errors,
1481 METH_O
1482 }
1483 },
1484 {
1485 "surrogateescape",
1486 {
1487 "surrogateescape",
1488 surrogateescape_errors,
1489 METH_O
1490 }
1491 }
1492 };
1493
1494 PyInterpreterState *interp = _PyInterpreterState_GET();
1495 PyObject *mod;
1496
1497 if (interp->codec_search_path != NULL)
1498 return 0;
1499
1500 interp->codec_search_path = PyList_New(0);
1501 if (interp->codec_search_path == NULL) {
1502 return -1;
1503 }
1504
1505 interp->codec_search_cache = PyDict_New();
1506 if (interp->codec_search_cache == NULL) {
1507 return -1;
1508 }
1509
1510 interp->codec_error_registry = PyDict_New();
1511 if (interp->codec_error_registry == NULL) {
1512 return -1;
1513 }
1514
1515 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517 if (!func) {
1518 return -1;
1519 }
1520
1521 int res = PyCodec_RegisterError(methods[i].name, func);
1522 Py_DECREF(func);
1523 if (res) {
1524 return -1;
1525 }
1526 }
1527
1528 mod = PyImport_ImportModuleNoBlock("encodings");
1529 if (mod == NULL) {
1530 return -1;
1531 }
1532 Py_DECREF(mod);
1533 interp->codecs_initialized = 1;
1534 return 0;
1535 }
1536