1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "pycore_call.h" // _PyObject_CallNoArgs()
13 #include "pycore_interp.h" // PyInterpreterState.codec_search_path
14 #include "pycore_lock.h" // PyMutex
15 #include "pycore_pyerrors.h" // _PyErr_FormatNote()
16 #include "pycore_pystate.h" // _PyInterpreterState_GET()
17 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
18
/* Lowercase hexadecimal digit characters, indexed by a nibble value 0..15. */
const char *Py_hexdigits = "0123456789abcdef";
20
21 /* --- Codec Registry ----------------------------------------------------- */
22
/* Append search_function to the current interpreter's codec search path.

   Returns the result of PyList_Append() (0 on success, -1 on failure).
   Returns -1 with an exception set if search_function is NULL or is not
   callable. */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    assert(interp->codecs.initialized);

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    int status;
    /* The search path is only guarded by its mutex in free-threaded builds. */
#ifdef Py_GIL_DISABLED
    PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
    status = PyList_Append(interp->codecs.search_path, search_function);
#ifdef Py_GIL_DISABLED
    PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
    return status;
}
47
48 int
PyCodec_Unregister(PyObject * search_function)49 PyCodec_Unregister(PyObject *search_function)
50 {
51 PyInterpreterState *interp = _PyInterpreterState_GET();
52 if (interp->codecs.initialized != 1) {
53 /* Do nothing if codecs state was cleared (only possible during
54 interpreter shutdown). */
55 return 0;
56 }
57
58 PyObject *codec_search_path = interp->codecs.search_path;
59 assert(PyList_CheckExact(codec_search_path));
60 for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
61 #ifdef Py_GIL_DISABLED
62 PyMutex_Lock(&interp->codecs.search_path_mutex);
63 #endif
64 PyObject *item = PyList_GetItemRef(codec_search_path, i);
65 int ret = 1;
66 if (item == search_function) {
67 // We hold a reference to the item, so its destructor can't run
68 // while we hold search_path_mutex.
69 ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
70 }
71 #ifdef Py_GIL_DISABLED
72 PyMutex_Unlock(&interp->codecs.search_path_mutex);
73 #endif
74 Py_DECREF(item);
75 if (ret != 1) {
76 assert(interp->codecs.search_cache != NULL);
77 assert(PyDict_CheckExact(interp->codecs.search_cache));
78 PyDict_Clear(interp->codecs.search_cache);
79 return ret;
80 }
81 }
82 return 0;
83 }
84
/* Implemented outside this file.  normalizestring() hands it the source
   string, an output buffer and that buffer's size, and treats a zero
   return value as failure. */
extern int _Py_normalize_encoding(const char *, char *, size_t);
86
87 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
88 converted to lower case, spaces and hyphens are replaced with underscores. */
89
90 static
normalizestring(const char * string)91 PyObject *normalizestring(const char *string)
92 {
93 size_t len = strlen(string);
94 char *encoding;
95 PyObject *v;
96
97 if (len > PY_SSIZE_T_MAX) {
98 PyErr_SetString(PyExc_OverflowError, "string is too large");
99 return NULL;
100 }
101
102 encoding = PyMem_Malloc(len + 1);
103 if (encoding == NULL)
104 return PyErr_NoMemory();
105
106 if (!_Py_normalize_encoding(string, encoding, len + 1))
107 {
108 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
109 PyMem_Free(encoding);
110 return NULL;
111 }
112
113 v = PyUnicode_FromString(encoding);
114 PyMem_Free(encoding);
115 return v;
116 }
117
118 /* Lookup the given encoding and return a tuple providing the codec
119 facilities.
120
121 The encoding string is looked up converted to all lower-case
122 characters. This makes encodings looked up through this mechanism
123 effectively case-insensitive.
124
125 If no codec is found, a LookupError is set and NULL returned.
126
127 As side effect, this tries to load the encodings package, if not
128 yet done. This is part of the lazy load strategy for the encodings
129 package.
130
131 */
132
_PyCodec_Lookup(const char * encoding)133 PyObject *_PyCodec_Lookup(const char *encoding)
134 {
135 if (encoding == NULL) {
136 PyErr_BadArgument();
137 return NULL;
138 }
139
140 PyInterpreterState *interp = _PyInterpreterState_GET();
141 assert(interp->codecs.initialized);
142
143 /* Convert the encoding to a normalized Python string: all
144 characters are converted to lower case, spaces and hyphens are
145 replaced with underscores. */
146 PyObject *v = normalizestring(encoding);
147 if (v == NULL) {
148 return NULL;
149 }
150
151 /* Intern the string. We'll make it immortal later if lookup succeeds. */
152 _PyUnicode_InternMortal(interp, &v);
153
154 /* First, try to lookup the name in the registry dictionary */
155 PyObject *result;
156 if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
157 goto onError;
158 }
159 if (result != NULL) {
160 Py_DECREF(v);
161 return result;
162 }
163
164 /* Next, scan the search functions in order of registration */
165 const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
166 if (len < 0)
167 goto onError;
168 if (len == 0) {
169 PyErr_SetString(PyExc_LookupError,
170 "no codec search functions registered: "
171 "can't find encoding");
172 goto onError;
173 }
174
175 Py_ssize_t i;
176 for (i = 0; i < len; i++) {
177 PyObject *func;
178
179 func = PyList_GetItemRef(interp->codecs.search_path, i);
180 if (func == NULL)
181 goto onError;
182 result = PyObject_CallOneArg(func, v);
183 Py_DECREF(func);
184 if (result == NULL)
185 goto onError;
186 if (result == Py_None) {
187 Py_CLEAR(result);
188 continue;
189 }
190 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
191 PyErr_SetString(PyExc_TypeError,
192 "codec search functions must return 4-tuples");
193 Py_DECREF(result);
194 goto onError;
195 }
196 break;
197 }
198 if (result == NULL) {
199 /* XXX Perhaps we should cache misses too ? */
200 PyErr_Format(PyExc_LookupError,
201 "unknown encoding: %s", encoding);
202 goto onError;
203 }
204
205 _PyUnicode_InternImmortal(interp, &v);
206
207 /* Cache and return the result */
208 if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
209 Py_DECREF(result);
210 goto onError;
211 }
212 Py_DECREF(v);
213 return result;
214
215 onError:
216 Py_DECREF(v);
217 return NULL;
218 }
219
220 /* Codec registry encoding check API. */
221
PyCodec_KnownEncoding(const char * encoding)222 int PyCodec_KnownEncoding(const char *encoding)
223 {
224 PyObject *codecs;
225
226 codecs = _PyCodec_Lookup(encoding);
227 if (!codecs) {
228 PyErr_Clear();
229 return 0;
230 }
231 else {
232 Py_DECREF(codecs);
233 return 1;
234 }
235 }
236
237 static
args_tuple(PyObject * object,const char * errors)238 PyObject *args_tuple(PyObject *object,
239 const char *errors)
240 {
241 PyObject *args;
242
243 args = PyTuple_New(1 + (errors != NULL));
244 if (args == NULL)
245 return NULL;
246 PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
247 if (errors) {
248 PyObject *v;
249
250 v = PyUnicode_FromString(errors);
251 if (v == NULL) {
252 Py_DECREF(args);
253 return NULL;
254 }
255 PyTuple_SET_ITEM(args, 1, v);
256 }
257 return args;
258 }
259
260 /* Helper function to get a codec item */
261
262 static
codec_getitem(const char * encoding,int index)263 PyObject *codec_getitem(const char *encoding, int index)
264 {
265 PyObject *codecs;
266 PyObject *v;
267
268 codecs = _PyCodec_Lookup(encoding);
269 if (codecs == NULL)
270 return NULL;
271 v = PyTuple_GET_ITEM(codecs, index);
272 Py_DECREF(codecs);
273 return Py_NewRef(v);
274 }
275
276 /* Helper functions to create an incremental codec. */
277 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)278 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279 const char *errors,
280 const char *attrname)
281 {
282 PyObject *ret, *inccodec;
283
284 inccodec = PyObject_GetAttrString(codec_info, attrname);
285 if (inccodec == NULL)
286 return NULL;
287 if (errors)
288 ret = PyObject_CallFunction(inccodec, "s", errors);
289 else
290 ret = _PyObject_CallNoArgs(inccodec);
291 Py_DECREF(inccodec);
292 return ret;
293 }
294
295 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)296 PyObject *codec_getincrementalcodec(const char *encoding,
297 const char *errors,
298 const char *attrname)
299 {
300 PyObject *codec_info, *ret;
301
302 codec_info = _PyCodec_Lookup(encoding);
303 if (codec_info == NULL)
304 return NULL;
305 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306 Py_DECREF(codec_info);
307 return ret;
308 }
309
310 /* Helper function to create a stream codec. */
311
312 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)313 PyObject *codec_getstreamcodec(const char *encoding,
314 PyObject *stream,
315 const char *errors,
316 const int index)
317 {
318 PyObject *codecs, *streamcodec, *codeccls;
319
320 codecs = _PyCodec_Lookup(encoding);
321 if (codecs == NULL)
322 return NULL;
323
324 codeccls = PyTuple_GET_ITEM(codecs, index);
325 if (errors != NULL)
326 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327 else
328 streamcodec = PyObject_CallOneArg(codeccls, stream);
329 Py_DECREF(codecs);
330 return streamcodec;
331 }
332
333 /* Helpers to work with the result of _PyCodec_Lookup
334
335 */
/* Create an incremental decoder by calling codec_info.incrementaldecoder,
   forwarding errors when it is non-NULL.  Returns a new reference or NULL
   with an exception set. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *decoder = codec_makeincrementalcodec(codec_info, errors,
                                                   "incrementaldecoder");
    return decoder;
}
342
/* Create an incremental encoder by calling codec_info.incrementalencoder,
   forwarding errors when it is non-NULL.  Returns a new reference or NULL
   with an exception set. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *encoder = codec_makeincrementalcodec(codec_info, errors,
                                                   "incrementalencoder");
    return encoder;
}
349
350
351 /* Convenience APIs to query the Codec registry.
352
353 All APIs return a codec object with incremented refcount.
354
355 */
356
PyCodec_Encoder(const char * encoding)357 PyObject *PyCodec_Encoder(const char *encoding)
358 {
359 return codec_getitem(encoding, 0);
360 }
361
PyCodec_Decoder(const char * encoding)362 PyObject *PyCodec_Decoder(const char *encoding)
363 {
364 return codec_getitem(encoding, 1);
365 }
366
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
368 const char *errors)
369 {
370 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
371 }
372
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)373 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
374 const char *errors)
375 {
376 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
377 }
378
/* Call entry 2 of the codec lookup result for encoding with stream (and
   errors when non-NULL).  Returns a new reference or NULL with an
   exception set. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_slot = 2;
    return codec_getstreamcodec(encoding, stream, errors, reader_slot);
}
385
/* Call entry 3 of the codec lookup result for encoding with stream (and
   errors when non-NULL).  Returns a new reference or NULL with an
   exception set. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_slot = 3;
    return codec_getstreamcodec(encoding, stream, errors, writer_slot);
}
392
393 /* Encode an object (e.g. a Unicode object) using the given encoding
394 and return the resulting encoded object (usually a Python string).
395
396 errors is passed to the encoder factory as argument if non-NULL. */
397
398 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)399 _PyCodec_EncodeInternal(PyObject *object,
400 PyObject *encoder,
401 const char *encoding,
402 const char *errors)
403 {
404 PyObject *args = NULL, *result = NULL;
405 PyObject *v = NULL;
406
407 args = args_tuple(object, errors);
408 if (args == NULL)
409 goto onError;
410
411 result = PyObject_Call(encoder, args, NULL);
412 if (result == NULL) {
413 _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
414 goto onError;
415 }
416
417 if (!PyTuple_Check(result) ||
418 PyTuple_GET_SIZE(result) != 2) {
419 PyErr_SetString(PyExc_TypeError,
420 "encoder must return a tuple (object, integer)");
421 goto onError;
422 }
423 v = Py_NewRef(PyTuple_GET_ITEM(result,0));
424 /* We don't check or use the second (integer) entry. */
425
426 Py_DECREF(args);
427 Py_DECREF(encoder);
428 Py_DECREF(result);
429 return v;
430
431 onError:
432 Py_XDECREF(result);
433 Py_XDECREF(args);
434 Py_XDECREF(encoder);
435 return NULL;
436 }
437
438 /* Decode an object (usually a Python string) using the given encoding
439 and return an equivalent object (e.g. a Unicode object).
440
441 errors is passed to the decoder factory as argument if non-NULL. */
442
443 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)444 _PyCodec_DecodeInternal(PyObject *object,
445 PyObject *decoder,
446 const char *encoding,
447 const char *errors)
448 {
449 PyObject *args = NULL, *result = NULL;
450 PyObject *v;
451
452 args = args_tuple(object, errors);
453 if (args == NULL)
454 goto onError;
455
456 result = PyObject_Call(decoder, args, NULL);
457 if (result == NULL) {
458 _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
459 goto onError;
460 }
461 if (!PyTuple_Check(result) ||
462 PyTuple_GET_SIZE(result) != 2) {
463 PyErr_SetString(PyExc_TypeError,
464 "decoder must return a tuple (object,integer)");
465 goto onError;
466 }
467 v = Py_NewRef(PyTuple_GET_ITEM(result,0));
468 /* We don't check or use the second (integer) entry. */
469
470 Py_DECREF(args);
471 Py_DECREF(decoder);
472 Py_DECREF(result);
473 return v;
474
475 onError:
476 Py_XDECREF(args);
477 Py_XDECREF(decoder);
478 Py_XDECREF(result);
479 return NULL;
480 }
481
482 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding.  errors is
   forwarded to the encoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (!encoder) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
495
/* Decode object with the decoder registered for encoding.  errors is
   forwarded to the decoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (!decoder) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
508
509 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)510 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
511 const char *alternate_command)
512 {
513 PyObject *codec;
514 PyObject *attr;
515 int is_text_codec;
516
517 codec = _PyCodec_Lookup(encoding);
518 if (codec == NULL)
519 return NULL;
520
521 /* Backwards compatibility: assume any raw tuple describes a text
522 * encoding, and the same for anything lacking the private
523 * attribute.
524 */
525 if (!PyTuple_CheckExact(codec)) {
526 if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
527 Py_DECREF(codec);
528 return NULL;
529 }
530 if (attr != NULL) {
531 is_text_codec = PyObject_IsTrue(attr);
532 Py_DECREF(attr);
533 if (is_text_codec <= 0) {
534 Py_DECREF(codec);
535 if (!is_text_codec)
536 PyErr_Format(PyExc_LookupError,
537 "'%.400s' is not a text encoding; "
538 "use %s to handle arbitrary codecs",
539 encoding, alternate_command);
540 return NULL;
541 }
542 }
543 }
544
545 /* This appears to be a valid text encoding */
546 return codec;
547 }
548
549
550 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)551 PyObject *codec_getitem_checked(const char *encoding,
552 const char *alternate_command,
553 int index)
554 {
555 PyObject *codec;
556 PyObject *v;
557
558 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
559 if (codec == NULL)
560 return NULL;
561
562 v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
563 Py_DECREF(codec);
564 return v;
565 }
566
_PyCodec_TextEncoder(const char * encoding)567 static PyObject * _PyCodec_TextEncoder(const char *encoding)
568 {
569 return codec_getitem_checked(encoding, "codecs.encode()", 0);
570 }
571
_PyCodec_TextDecoder(const char * encoding)572 static PyObject * _PyCodec_TextDecoder(const char *encoding)
573 {
574 return codec_getitem_checked(encoding, "codecs.decode()", 1);
575 }
576
/* Encode object with the text encoder registered for encoding; codecs that
   declare themselves non-text are rejected.  Returns a new reference, or
   NULL with an exception set. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (!encoder) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
589
/* Decode object with the text decoder registered for encoding; codecs that
   declare themselves non-text are rejected.  Returns a new reference, or
   NULL with an exception set. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (!decoder) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
602
603 /* Register the error handling callback function error under the name
604 name. This function will be called by the codec when it encounters
605 an unencodable characters/undecodable bytes and doesn't know the
606 callback name, when name is specified as the error parameter
607 in the call to the encode/decode function.
608 Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)609 int PyCodec_RegisterError(const char *name, PyObject *error)
610 {
611 PyInterpreterState *interp = _PyInterpreterState_GET();
612 assert(interp->codecs.initialized);
613 if (!PyCallable_Check(error)) {
614 PyErr_SetString(PyExc_TypeError, "handler must be callable");
615 return -1;
616 }
617 return PyDict_SetItemString(interp->codecs.error_registry,
618 name, error);
619 }
620
621 /* Lookup the error handling callback function registered under the
622 name error. As a special case NULL can be passed, in which case
623 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)624 PyObject *PyCodec_LookupError(const char *name)
625 {
626 PyInterpreterState *interp = _PyInterpreterState_GET();
627 assert(interp->codecs.initialized);
628
629 if (name==NULL)
630 name = "strict";
631 PyObject *handler;
632 if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
633 return NULL;
634 }
635 if (handler == NULL) {
636 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
637 return NULL;
638 }
639 return handler;
640 }
641
/* Set a TypeError reporting that an error callback cannot handle
   exceptions of exc's type. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;
    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
648
/* "strict" handler: re-raise exc.  Always returns NULL; a TypeError is set
   instead if exc is not an exception instance. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
657
658
/* "ignore" handler: returns the tuple (empty string, end), where end is
   the end position stored in the Unicode encode/decode/translate error
   exc.  Returns NULL with an exception set for any other exception type or
   if the end position cannot be read. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t resume_at;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &resume_at);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &resume_at);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &resume_at);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), resume_at);
}
681
682
/* "replace" handler.  Returns the tuple (replacement, end):
     - UnicodeEncodeError: one '?' per character in [start, end);
     - UnicodeDecodeError: a single U+FFFD replacement character;
     - UnicodeTranslateError: one U+FFFD per character in [start, end).
   Returns NULL with an exception set for other exception types or when
   the positions cannot be read. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filled = PyUnicode_New(count, '?');
        if (filled == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filled) == PyUnicode_1BYTE_KIND);
        Py_UCS1 *dst = PyUnicode_1BYTE_DATA(filled);
        Py_UCS1 *const stop = dst + count;
        while (dst < stop) {
            *dst++ = '?';
        }
        assert(_PyUnicode_CheckConsistency(filled, 1));
        return Py_BuildValue("(Nn)", filled, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filled = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filled == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filled) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *dst = PyUnicode_2BYTE_DATA(filled);
        Py_UCS2 *const stop = dst + count;
        while (dst < stop) {
            *dst++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(filled, 1));
        return Py_BuildValue("(Nn)", filled, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
735
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)736 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
737 {
738 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
739 PyObject *restuple;
740 PyObject *object;
741 Py_ssize_t i;
742 Py_ssize_t start;
743 Py_ssize_t end;
744 PyObject *res;
745 Py_UCS1 *outp;
746 Py_ssize_t ressize;
747 Py_UCS4 ch;
748 if (PyUnicodeEncodeError_GetStart(exc, &start))
749 return NULL;
750 if (PyUnicodeEncodeError_GetEnd(exc, &end))
751 return NULL;
752 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
753 return NULL;
754 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
755 end = start + PY_SSIZE_T_MAX / (2+7+1);
756 for (i = start, ressize = 0; i < end; ++i) {
757 /* object is guaranteed to be "ready" */
758 ch = PyUnicode_READ_CHAR(object, i);
759 if (ch<10)
760 ressize += 2+1+1;
761 else if (ch<100)
762 ressize += 2+2+1;
763 else if (ch<1000)
764 ressize += 2+3+1;
765 else if (ch<10000)
766 ressize += 2+4+1;
767 else if (ch<100000)
768 ressize += 2+5+1;
769 else if (ch<1000000)
770 ressize += 2+6+1;
771 else
772 ressize += 2+7+1;
773 }
774 /* allocate replacement */
775 res = PyUnicode_New(ressize, 127);
776 if (res == NULL) {
777 Py_DECREF(object);
778 return NULL;
779 }
780 outp = PyUnicode_1BYTE_DATA(res);
781 /* generate replacement */
782 for (i = start; i < end; ++i) {
783 int digits;
784 int base;
785 ch = PyUnicode_READ_CHAR(object, i);
786 *outp++ = '&';
787 *outp++ = '#';
788 if (ch<10) {
789 digits = 1;
790 base = 1;
791 }
792 else if (ch<100) {
793 digits = 2;
794 base = 10;
795 }
796 else if (ch<1000) {
797 digits = 3;
798 base = 100;
799 }
800 else if (ch<10000) {
801 digits = 4;
802 base = 1000;
803 }
804 else if (ch<100000) {
805 digits = 5;
806 base = 10000;
807 }
808 else if (ch<1000000) {
809 digits = 6;
810 base = 100000;
811 }
812 else {
813 digits = 7;
814 base = 1000000;
815 }
816 while (digits-->0) {
817 *outp++ = '0' + ch/base;
818 ch %= base;
819 base /= 10;
820 }
821 *outp++ = ';';
822 }
823 assert(_PyUnicode_CheckConsistency(res, 1));
824 restuple = Py_BuildValue("(Nn)", res, end);
825 Py_DECREF(object);
826 return restuple;
827 }
828 else {
829 wrong_exception_type(exc);
830 return NULL;
831 }
832 }
833
/* "backslashreplace" handler.  Returns the tuple (replacement, end):
     - UnicodeDecodeError: "\xNN" for each byte in [start, end);
     - UnicodeEncodeError / UnicodeTranslateError: "\xNN", "\uNNNN" or
       "\UNNNNNNNN" for each character in [start, end), depending on the
       code point's magnitude.
   The range is shortened when the result could exceed PY_SSIZE_T_MAX, and
   the returned end reflects that.  Returns NULL with an exception set for
   other exception types or on failure. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t: up to PY_SSIZE_T_MAX / 10 characters of 10 output
       bytes each are accumulated, which does not fit in an int. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Keep 4 * (end - start) from overflowing, like the clamp applied
           to the character branches below. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Each character needs at most '\\' + 'U' + 8 hex digits. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  The wider forms emit their extra
       leading digits first; the final two digits are shared by all forms. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
940
/* "namereplace" handler.  For a UnicodeEncodeError, returns the tuple
   (replacement, end) where each character in [start, end) is written as
   "\N{NAME}" when the ucnhash C API's getname() supplies a name, and as
   "\xNN", "\uNNNN" or "\UNNNNNNNN" otherwise.  Processing stops early if
   the result size would exceed PY_SSIZE_T_MAX; the returned end reflects
   the characters actually replaced.  Returns NULL with an exception set for
   other exception types or on failure. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UCS1 *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
        if (ucnhash_capi == NULL) {
            /* object is a strong reference owned by this function. */
            Py_DECREF(object);
            return NULL;
        }
        /* First pass: size of the replacement. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                /* '\\' + 'N' + '{' + name + '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        /* Only the characters that were sized are replaced. */
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res==NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the replacement. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                *outp++ = 'N';
                *outp++ = '{';
                strcpy((char *)outp, buffer);
                outp += strlen(buffer);
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1030
/* Encoding identifiers returned by get_standard_encoding(); ENC_UNKNOWN
   means the name was not recognized. */
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
#define ENC_UTF16LE 2
#define ENC_UTF32BE 3
#define ENC_UTF32LE 4
1037
1038 static int
get_standard_encoding(const char * encoding,int * bytelength)1039 get_standard_encoding(const char *encoding, int *bytelength)
1040 {
1041 if (Py_TOLOWER(encoding[0]) == 'u' &&
1042 Py_TOLOWER(encoding[1]) == 't' &&
1043 Py_TOLOWER(encoding[2]) == 'f') {
1044 encoding += 3;
1045 if (*encoding == '-' || *encoding == '_' )
1046 encoding++;
1047 if (encoding[0] == '8' && encoding[1] == '\0') {
1048 *bytelength = 3;
1049 return ENC_UTF8;
1050 }
1051 else if (encoding[0] == '1' && encoding[1] == '6') {
1052 encoding += 2;
1053 *bytelength = 2;
1054 if (*encoding == '\0') {
1055 #ifdef WORDS_BIGENDIAN
1056 return ENC_UTF16BE;
1057 #else
1058 return ENC_UTF16LE;
1059 #endif
1060 }
1061 if (*encoding == '-' || *encoding == '_' )
1062 encoding++;
1063 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1064 if (Py_TOLOWER(encoding[0]) == 'b')
1065 return ENC_UTF16BE;
1066 if (Py_TOLOWER(encoding[0]) == 'l')
1067 return ENC_UTF16LE;
1068 }
1069 }
1070 else if (encoding[0] == '3' && encoding[1] == '2') {
1071 encoding += 2;
1072 *bytelength = 4;
1073 if (*encoding == '\0') {
1074 #ifdef WORDS_BIGENDIAN
1075 return ENC_UTF32BE;
1076 #else
1077 return ENC_UTF32LE;
1078 #endif
1079 }
1080 if (*encoding == '-' || *encoding == '_' )
1081 encoding++;
1082 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1083 if (Py_TOLOWER(encoding[0]) == 'b')
1084 return ENC_UTF32BE;
1085 if (Py_TOLOWER(encoding[0]) == 'l')
1086 return ENC_UTF32LE;
1087 }
1088 }
1089 }
1090 else if (strcmp(encoding, "CP_UTF8") == 0) {
1091 *bytelength = 3;
1092 return ENC_UTF8;
1093 }
1094 return ENC_UNKNOWN;
1095 }
1096
1097 /* This handler is declared static until someone demonstrates
1098 a need to call it directly. */
1099 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1100 PyCodec_SurrogatePassErrors(PyObject *exc)
1101 {
1102 PyObject *restuple;
1103 PyObject *object;
1104 PyObject *encode;
1105 const char *encoding;
1106 int code;
1107 int bytelength;
1108 Py_ssize_t i;
1109 Py_ssize_t start;
1110 Py_ssize_t end;
1111 PyObject *res;
1112
1113 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1114 unsigned char *outp;
1115 if (PyUnicodeEncodeError_GetStart(exc, &start))
1116 return NULL;
1117 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1118 return NULL;
1119 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1120 return NULL;
1121 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1122 Py_DECREF(object);
1123 return NULL;
1124 }
1125 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1126 Py_DECREF(object);
1127 Py_DECREF(encode);
1128 return NULL;
1129 }
1130 code = get_standard_encoding(encoding, &bytelength);
1131 Py_DECREF(encode);
1132 if (code == ENC_UNKNOWN) {
1133 /* Not supported, fail with original exception */
1134 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1135 Py_DECREF(object);
1136 return NULL;
1137 }
1138
1139 if (end - start > PY_SSIZE_T_MAX / bytelength)
1140 end = start + PY_SSIZE_T_MAX / bytelength;
1141 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1142 if (!res) {
1143 Py_DECREF(object);
1144 return NULL;
1145 }
1146 outp = (unsigned char*)PyBytes_AsString(res);
1147 for (i = start; i < end; i++) {
1148 /* object is guaranteed to be "ready" */
1149 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1150 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1151 /* Not a surrogate, fail with original exception */
1152 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1153 Py_DECREF(res);
1154 Py_DECREF(object);
1155 return NULL;
1156 }
1157 switch (code) {
1158 case ENC_UTF8:
1159 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1160 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1161 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1162 break;
1163 case ENC_UTF16LE:
1164 *outp++ = (unsigned char) ch;
1165 *outp++ = (unsigned char)(ch >> 8);
1166 break;
1167 case ENC_UTF16BE:
1168 *outp++ = (unsigned char)(ch >> 8);
1169 *outp++ = (unsigned char) ch;
1170 break;
1171 case ENC_UTF32LE:
1172 *outp++ = (unsigned char) ch;
1173 *outp++ = (unsigned char)(ch >> 8);
1174 *outp++ = (unsigned char)(ch >> 16);
1175 *outp++ = (unsigned char)(ch >> 24);
1176 break;
1177 case ENC_UTF32BE:
1178 *outp++ = (unsigned char)(ch >> 24);
1179 *outp++ = (unsigned char)(ch >> 16);
1180 *outp++ = (unsigned char)(ch >> 8);
1181 *outp++ = (unsigned char) ch;
1182 break;
1183 }
1184 }
1185 restuple = Py_BuildValue("(On)", res, end);
1186 Py_DECREF(res);
1187 Py_DECREF(object);
1188 return restuple;
1189 }
1190 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1191 const unsigned char *p;
1192 Py_UCS4 ch = 0;
1193 if (PyUnicodeDecodeError_GetStart(exc, &start))
1194 return NULL;
1195 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1196 return NULL;
1197 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1198 return NULL;
1199 p = (const unsigned char*)PyBytes_AS_STRING(object);
1200 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1201 Py_DECREF(object);
1202 return NULL;
1203 }
1204 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1205 Py_DECREF(object);
1206 Py_DECREF(encode);
1207 return NULL;
1208 }
1209 code = get_standard_encoding(encoding, &bytelength);
1210 Py_DECREF(encode);
1211 if (code == ENC_UNKNOWN) {
1212 /* Not supported, fail with original exception */
1213 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1214 Py_DECREF(object);
1215 return NULL;
1216 }
1217
1218 /* Try decoding a single surrogate character. If
1219 there are more, let the codec call us again. */
1220 p += start;
1221 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1222 switch (code) {
1223 case ENC_UTF8:
1224 if ((p[0] & 0xf0) == 0xe0 &&
1225 (p[1] & 0xc0) == 0x80 &&
1226 (p[2] & 0xc0) == 0x80) {
1227 /* it's a three-byte code */
1228 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1229 }
1230 break;
1231 case ENC_UTF16LE:
1232 ch = p[1] << 8 | p[0];
1233 break;
1234 case ENC_UTF16BE:
1235 ch = p[0] << 8 | p[1];
1236 break;
1237 case ENC_UTF32LE:
1238 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1239 break;
1240 case ENC_UTF32BE:
1241 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1242 break;
1243 }
1244 }
1245
1246 Py_DECREF(object);
1247 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1248 /* it's not a surrogate - fail */
1249 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1250 return NULL;
1251 }
1252 res = PyUnicode_FromOrdinal(ch);
1253 if (res == NULL)
1254 return NULL;
1255 return Py_BuildValue("(Nn)", res, start + bytelength);
1256 }
1257 else {
1258 wrong_exception_type(exc);
1259 return NULL;
1260 }
1261 }
1262
1263 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1264 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1265 {
1266 PyObject *restuple;
1267 PyObject *object;
1268 Py_ssize_t i;
1269 Py_ssize_t start;
1270 Py_ssize_t end;
1271 PyObject *res;
1272
1273 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1274 char *outp;
1275 if (PyUnicodeEncodeError_GetStart(exc, &start))
1276 return NULL;
1277 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1278 return NULL;
1279 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1280 return NULL;
1281 res = PyBytes_FromStringAndSize(NULL, end-start);
1282 if (!res) {
1283 Py_DECREF(object);
1284 return NULL;
1285 }
1286 outp = PyBytes_AsString(res);
1287 for (i = start; i < end; i++) {
1288 /* object is guaranteed to be "ready" */
1289 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1290 if (ch < 0xdc80 || ch > 0xdcff) {
1291 /* Not a UTF-8b surrogate, fail with original exception */
1292 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1293 Py_DECREF(res);
1294 Py_DECREF(object);
1295 return NULL;
1296 }
1297 *outp++ = ch - 0xdc00;
1298 }
1299 restuple = Py_BuildValue("(On)", res, end);
1300 Py_DECREF(res);
1301 Py_DECREF(object);
1302 return restuple;
1303 }
1304 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1305 PyObject *str;
1306 const unsigned char *p;
1307 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1308 int consumed = 0;
1309 if (PyUnicodeDecodeError_GetStart(exc, &start))
1310 return NULL;
1311 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1312 return NULL;
1313 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1314 return NULL;
1315 p = (const unsigned char*)PyBytes_AS_STRING(object);
1316 while (consumed < 4 && consumed < end-start) {
1317 /* Refuse to escape ASCII bytes. */
1318 if (p[start+consumed] < 128)
1319 break;
1320 ch[consumed] = 0xdc00 + p[start+consumed];
1321 consumed++;
1322 }
1323 Py_DECREF(object);
1324 if (!consumed) {
1325 /* codec complained about ASCII byte. */
1326 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1327 return NULL;
1328 }
1329 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1330 if (str == NULL)
1331 return NULL;
1332 return Py_BuildValue("(Nn)", str, start+consumed);
1333 }
1334 else {
1335 wrong_exception_type(exc);
1336 return NULL;
1337 }
1338 }
1339
1340
/* METH_O entry point registered under the name "strict" in the codec error
   registry.  `self` is unused; the exception is passed straight on to
   PyCodec_StrictErrors(). */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1345
1346
/* METH_O entry point registered under the name "ignore" in the codec error
   registry.  `self` is unused; the exception is passed straight on to
   PyCodec_IgnoreErrors(). */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1351
1352
/* METH_O entry point registered under the name "replace" in the codec error
   registry.  `self` is unused; the exception is passed straight on to
   PyCodec_ReplaceErrors(). */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1357
1358
/* METH_O entry point registered under the name "xmlcharrefreplace" in the
   codec error registry.  `self` is unused; the exception is passed straight
   on to PyCodec_XMLCharRefReplaceErrors(). */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1363
1364
/* METH_O entry point registered under the name "backslashreplace" in the
   codec error registry.  `self` is unused; the exception is passed straight
   on to PyCodec_BackslashReplaceErrors(). */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1369
/* METH_O entry point registered under the name "namereplace" in the codec
   error registry.  `self` is unused; the exception is passed straight on to
   PyCodec_NameReplaceErrors(). */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1374
/* METH_O entry point registered under the name "surrogatepass" in the codec
   error registry.  `self` is unused; the exception is passed straight on to
   PyCodec_SurrogatePassErrors(). */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1379
/* METH_O entry point registered under the name "surrogateescape" in the
   codec error registry.  `self` is unused; the exception is passed straight
   on to PyCodec_SurrogateEscapeErrors(). */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1384
1385 PyStatus
_PyCodec_InitRegistry(PyInterpreterState * interp)1386 _PyCodec_InitRegistry(PyInterpreterState *interp)
1387 {
1388 static struct {
1389 const char *name;
1390 PyMethodDef def;
1391 } methods[] =
1392 {
1393 {
1394 "strict",
1395 {
1396 "strict_errors",
1397 strict_errors,
1398 METH_O,
1399 PyDoc_STR("Implements the 'strict' error handling, which "
1400 "raises a UnicodeError on coding errors.")
1401 }
1402 },
1403 {
1404 "ignore",
1405 {
1406 "ignore_errors",
1407 ignore_errors,
1408 METH_O,
1409 PyDoc_STR("Implements the 'ignore' error handling, which "
1410 "ignores malformed data and continues.")
1411 }
1412 },
1413 {
1414 "replace",
1415 {
1416 "replace_errors",
1417 replace_errors,
1418 METH_O,
1419 PyDoc_STR("Implements the 'replace' error handling, which "
1420 "replaces malformed data with a replacement marker.")
1421 }
1422 },
1423 {
1424 "xmlcharrefreplace",
1425 {
1426 "xmlcharrefreplace_errors",
1427 xmlcharrefreplace_errors,
1428 METH_O,
1429 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1430 "which replaces an unencodable character with the "
1431 "appropriate XML character reference.")
1432 }
1433 },
1434 {
1435 "backslashreplace",
1436 {
1437 "backslashreplace_errors",
1438 backslashreplace_errors,
1439 METH_O,
1440 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1441 "which replaces malformed data with a backslashed "
1442 "escape sequence.")
1443 }
1444 },
1445 {
1446 "namereplace",
1447 {
1448 "namereplace_errors",
1449 namereplace_errors,
1450 METH_O,
1451 PyDoc_STR("Implements the 'namereplace' error handling, "
1452 "which replaces an unencodable character with a "
1453 "\\N{...} escape sequence.")
1454 }
1455 },
1456 {
1457 "surrogatepass",
1458 {
1459 "surrogatepass",
1460 surrogatepass_errors,
1461 METH_O
1462 }
1463 },
1464 {
1465 "surrogateescape",
1466 {
1467 "surrogateescape",
1468 surrogateescape_errors,
1469 METH_O
1470 }
1471 }
1472 };
1473
1474 assert(interp->codecs.initialized == 0);
1475 interp->codecs.search_path = PyList_New(0);
1476 if (interp->codecs.search_path == NULL) {
1477 return PyStatus_NoMemory();
1478 }
1479 interp->codecs.search_cache = PyDict_New();
1480 if (interp->codecs.search_cache == NULL) {
1481 return PyStatus_NoMemory();
1482 }
1483 interp->codecs.error_registry = PyDict_New();
1484 if (interp->codecs.error_registry == NULL) {
1485 return PyStatus_NoMemory();
1486 }
1487 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1488 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1489 if (func == NULL) {
1490 return PyStatus_NoMemory();
1491 }
1492
1493 int res = PyDict_SetItemString(interp->codecs.error_registry,
1494 methods[i].name, func);
1495 Py_DECREF(func);
1496 if (res < 0) {
1497 return PyStatus_Error("Failed to insert into codec error registry");
1498 }
1499 }
1500
1501 interp->codecs.initialized = 1;
1502
1503 // Importing `encodings' will call back into this module to register codec
1504 // search functions, so this is done after everything else is initialized.
1505 PyObject *mod = PyImport_ImportModule("encodings");
1506 if (mod == NULL) {
1507 return PyStatus_Error("Failed to import encodings module");
1508 }
1509 Py_DECREF(mod);
1510
1511 return PyStatus_Ok();
1512 }
1513
1514 void
_PyCodec_Fini(PyInterpreterState * interp)1515 _PyCodec_Fini(PyInterpreterState *interp)
1516 {
1517 Py_CLEAR(interp->codecs.search_path);
1518 Py_CLEAR(interp->codecs.search_cache);
1519 Py_CLEAR(interp->codecs.error_registry);
1520 interp->codecs.initialized = 0;
1521 }
1522