• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "pycore_call.h"          // _PyObject_CallNoArgs()
13 #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14 #include "pycore_lock.h"          // PyMutex
15 #include "pycore_pyerrors.h"      // _PyErr_FormatNote()
16 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
17 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
18 
19 const char *Py_hexdigits = "0123456789abcdef";
20 
21 /* --- Codec Registry ----------------------------------------------------- */
22 
/* Append search_function to the current interpreter's codec search path.

   Returns the PyList_Append() result on the normal path, or -1 with an
   exception set when search_function is NULL or is not callable. */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    assert(interp->codecs.initialized);

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    int status;
#ifdef Py_GIL_DISABLED
    /* Hold the search path mutex around the list mutation. */
    PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
    status = PyList_Append(interp->codecs.search_path, search_function);
#ifdef Py_GIL_DISABLED
    PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
    return status;
}
47 
48 int
PyCodec_Unregister(PyObject * search_function)49 PyCodec_Unregister(PyObject *search_function)
50 {
51     PyInterpreterState *interp = _PyInterpreterState_GET();
52     if (interp->codecs.initialized != 1) {
53         /* Do nothing if codecs state was cleared (only possible during
54            interpreter shutdown). */
55         return 0;
56     }
57 
58     PyObject *codec_search_path = interp->codecs.search_path;
59     assert(PyList_CheckExact(codec_search_path));
60     for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
61 #ifdef Py_GIL_DISABLED
62         PyMutex_Lock(&interp->codecs.search_path_mutex);
63 #endif
64         PyObject *item = PyList_GetItemRef(codec_search_path, i);
65         int ret = 1;
66         if (item == search_function) {
67             // We hold a reference to the item, so its destructor can't run
68             // while we hold search_path_mutex.
69             ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
70         }
71 #ifdef Py_GIL_DISABLED
72         PyMutex_Unlock(&interp->codecs.search_path_mutex);
73 #endif
74         Py_DECREF(item);
75         if (ret != 1) {
76             assert(interp->codecs.search_cache != NULL);
77             assert(PyDict_CheckExact(interp->codecs.search_cache));
78             PyDict_Clear(interp->codecs.search_cache);
79             return ret;
80         }
81     }
82     return 0;
83 }
84 
85 extern int _Py_normalize_encoding(const char *, char *, size_t);
86 
87 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
88    converted to lower case, spaces and hyphens are replaced with underscores. */
89 
90 static
normalizestring(const char * string)91 PyObject *normalizestring(const char *string)
92 {
93     size_t len = strlen(string);
94     char *encoding;
95     PyObject *v;
96 
97     if (len > PY_SSIZE_T_MAX) {
98         PyErr_SetString(PyExc_OverflowError, "string is too large");
99         return NULL;
100     }
101 
102     encoding = PyMem_Malloc(len + 1);
103     if (encoding == NULL)
104         return PyErr_NoMemory();
105 
106     if (!_Py_normalize_encoding(string, encoding, len + 1))
107     {
108         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
109         PyMem_Free(encoding);
110         return NULL;
111     }
112 
113     v = PyUnicode_FromString(encoding);
114     PyMem_Free(encoding);
115     return v;
116 }
117 
118 /* Lookup the given encoding and return a tuple providing the codec
119    facilities.
120 
121    The encoding string is looked up converted to all lower-case
122    characters. This makes encodings looked up through this mechanism
123    effectively case-insensitive.
124 
125    If no codec is found, a LookupError is set and NULL returned.
126 
127    As side effect, this tries to load the encodings package, if not
128    yet done. This is part of the lazy load strategy for the encodings
129    package.
130 
131 */
132 
_PyCodec_Lookup(const char * encoding)133 PyObject *_PyCodec_Lookup(const char *encoding)
134 {
135     if (encoding == NULL) {
136         PyErr_BadArgument();
137         return NULL;
138     }
139 
140     PyInterpreterState *interp = _PyInterpreterState_GET();
141     assert(interp->codecs.initialized);
142 
143     /* Convert the encoding to a normalized Python string: all
144        characters are converted to lower case, spaces and hyphens are
145        replaced with underscores. */
146     PyObject *v = normalizestring(encoding);
147     if (v == NULL) {
148         return NULL;
149     }
150 
151     /* Intern the string. We'll make it immortal later if lookup succeeds. */
152     _PyUnicode_InternMortal(interp, &v);
153 
154     /* First, try to lookup the name in the registry dictionary */
155     PyObject *result;
156     if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
157         goto onError;
158     }
159     if (result != NULL) {
160         Py_DECREF(v);
161         return result;
162     }
163 
164     /* Next, scan the search functions in order of registration */
165     const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
166     if (len < 0)
167         goto onError;
168     if (len == 0) {
169         PyErr_SetString(PyExc_LookupError,
170                         "no codec search functions registered: "
171                         "can't find encoding");
172         goto onError;
173     }
174 
175     Py_ssize_t i;
176     for (i = 0; i < len; i++) {
177         PyObject *func;
178 
179         func = PyList_GetItemRef(interp->codecs.search_path, i);
180         if (func == NULL)
181             goto onError;
182         result = PyObject_CallOneArg(func, v);
183         Py_DECREF(func);
184         if (result == NULL)
185             goto onError;
186         if (result == Py_None) {
187             Py_CLEAR(result);
188             continue;
189         }
190         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
191             PyErr_SetString(PyExc_TypeError,
192                             "codec search functions must return 4-tuples");
193             Py_DECREF(result);
194             goto onError;
195         }
196         break;
197     }
198     if (result == NULL) {
199         /* XXX Perhaps we should cache misses too ? */
200         PyErr_Format(PyExc_LookupError,
201                      "unknown encoding: %s", encoding);
202         goto onError;
203     }
204 
205     _PyUnicode_InternImmortal(interp, &v);
206 
207     /* Cache and return the result */
208     if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
209         Py_DECREF(result);
210         goto onError;
211     }
212     Py_DECREF(v);
213     return result;
214 
215  onError:
216     Py_DECREF(v);
217     return NULL;
218 }
219 
220 /* Codec registry encoding check API. */
221 
PyCodec_KnownEncoding(const char * encoding)222 int PyCodec_KnownEncoding(const char *encoding)
223 {
224     PyObject *codecs;
225 
226     codecs = _PyCodec_Lookup(encoding);
227     if (!codecs) {
228         PyErr_Clear();
229         return 0;
230     }
231     else {
232         Py_DECREF(codecs);
233         return 1;
234     }
235 }
236 
237 static
args_tuple(PyObject * object,const char * errors)238 PyObject *args_tuple(PyObject *object,
239                      const char *errors)
240 {
241     PyObject *args;
242 
243     args = PyTuple_New(1 + (errors != NULL));
244     if (args == NULL)
245         return NULL;
246     PyTuple_SET_ITEM(args, 0, Py_NewRef(object));
247     if (errors) {
248         PyObject *v;
249 
250         v = PyUnicode_FromString(errors);
251         if (v == NULL) {
252             Py_DECREF(args);
253             return NULL;
254         }
255         PyTuple_SET_ITEM(args, 1, v);
256     }
257     return args;
258 }
259 
260 /* Helper function to get a codec item */
261 
262 static
codec_getitem(const char * encoding,int index)263 PyObject *codec_getitem(const char *encoding, int index)
264 {
265     PyObject *codecs;
266     PyObject *v;
267 
268     codecs = _PyCodec_Lookup(encoding);
269     if (codecs == NULL)
270         return NULL;
271     v = PyTuple_GET_ITEM(codecs, index);
272     Py_DECREF(codecs);
273     return Py_NewRef(v);
274 }
275 
276 /* Helper functions to create an incremental codec. */
277 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)278 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279                                      const char *errors,
280                                      const char *attrname)
281 {
282     PyObject *ret, *inccodec;
283 
284     inccodec = PyObject_GetAttrString(codec_info, attrname);
285     if (inccodec == NULL)
286         return NULL;
287     if (errors)
288         ret = PyObject_CallFunction(inccodec, "s", errors);
289     else
290         ret = _PyObject_CallNoArgs(inccodec);
291     Py_DECREF(inccodec);
292     return ret;
293 }
294 
295 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)296 PyObject *codec_getincrementalcodec(const char *encoding,
297                                     const char *errors,
298                                     const char *attrname)
299 {
300     PyObject *codec_info, *ret;
301 
302     codec_info = _PyCodec_Lookup(encoding);
303     if (codec_info == NULL)
304         return NULL;
305     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306     Py_DECREF(codec_info);
307     return ret;
308 }
309 
310 /* Helper function to create a stream codec. */
311 
312 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)313 PyObject *codec_getstreamcodec(const char *encoding,
314                                PyObject *stream,
315                                const char *errors,
316                                const int index)
317 {
318     PyObject *codecs, *streamcodec, *codeccls;
319 
320     codecs = _PyCodec_Lookup(encoding);
321     if (codecs == NULL)
322         return NULL;
323 
324     codeccls = PyTuple_GET_ITEM(codecs, index);
325     if (errors != NULL)
326         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327     else
328         streamcodec = PyObject_CallOneArg(codeccls, stream);
329     Py_DECREF(codecs);
330     return streamcodec;
331 }
332 
333 /* Helpers to work with the result of _PyCodec_Lookup
334 
335  */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info; errors is passed along when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    static const char attrname[] = "incrementaldecoder";
    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
342 
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info; errors is passed along when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    static const char attrname[] = "incrementalencoder";
    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
349 
350 
351 /* Convenience APIs to query the Codec registry.
352 
353    All APIs return a codec object with incremented refcount.
354 
355  */
356 
PyCodec_Encoder(const char * encoding)357 PyObject *PyCodec_Encoder(const char *encoding)
358 {
359     return codec_getitem(encoding, 0);
360 }
361 
PyCodec_Decoder(const char * encoding)362 PyObject *PyCodec_Decoder(const char *encoding)
363 {
364     return codec_getitem(encoding, 1);
365 }
366 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
368                                      const char *errors)
369 {
370     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
371 }
372 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)373 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
374                                      const char *errors)
375 {
376     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
377 }
378 
/* Create a stream reader for stream by calling entry 2 of the codec tuple
   registered for encoding; errors is passed along when non-NULL. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);
    return reader;
}
385 
/* Create a stream writer for stream by calling entry 3 of the codec tuple
   registered for encoding; errors is passed along when non-NULL. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);
    return writer;
}
392 
393 /* Encode an object (e.g. a Unicode object) using the given encoding
394    and return the resulting encoded object (usually a Python string).
395 
396    errors is passed to the encoder factory as argument if non-NULL. */
397 
398 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)399 _PyCodec_EncodeInternal(PyObject *object,
400                         PyObject *encoder,
401                         const char *encoding,
402                         const char *errors)
403 {
404     PyObject *args = NULL, *result = NULL;
405     PyObject *v = NULL;
406 
407     args = args_tuple(object, errors);
408     if (args == NULL)
409         goto onError;
410 
411     result = PyObject_Call(encoder, args, NULL);
412     if (result == NULL) {
413         _PyErr_FormatNote("%s with '%s' codec failed", "encoding", encoding);
414         goto onError;
415     }
416 
417     if (!PyTuple_Check(result) ||
418         PyTuple_GET_SIZE(result) != 2) {
419         PyErr_SetString(PyExc_TypeError,
420                         "encoder must return a tuple (object, integer)");
421         goto onError;
422     }
423     v = Py_NewRef(PyTuple_GET_ITEM(result,0));
424     /* We don't check or use the second (integer) entry. */
425 
426     Py_DECREF(args);
427     Py_DECREF(encoder);
428     Py_DECREF(result);
429     return v;
430 
431  onError:
432     Py_XDECREF(result);
433     Py_XDECREF(args);
434     Py_XDECREF(encoder);
435     return NULL;
436 }
437 
438 /* Decode an object (usually a Python string) using the given encoding
439    and return an equivalent object (e.g. a Unicode object).
440 
441    errors is passed to the decoder factory as argument if non-NULL. */
442 
443 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)444 _PyCodec_DecodeInternal(PyObject *object,
445                         PyObject *decoder,
446                         const char *encoding,
447                         const char *errors)
448 {
449     PyObject *args = NULL, *result = NULL;
450     PyObject *v;
451 
452     args = args_tuple(object, errors);
453     if (args == NULL)
454         goto onError;
455 
456     result = PyObject_Call(decoder, args, NULL);
457     if (result == NULL) {
458         _PyErr_FormatNote("%s with '%s' codec failed", "decoding", encoding);
459         goto onError;
460     }
461     if (!PyTuple_Check(result) ||
462         PyTuple_GET_SIZE(result) != 2) {
463         PyErr_SetString(PyExc_TypeError,
464                         "decoder must return a tuple (object,integer)");
465         goto onError;
466     }
467     v = Py_NewRef(PyTuple_GET_ITEM(result,0));
468     /* We don't check or use the second (integer) entry. */
469 
470     Py_DECREF(args);
471     Py_DECREF(decoder);
472     Py_DECREF(result);
473     return v;
474 
475  onError:
476     Py_XDECREF(args);
477     Py_XDECREF(decoder);
478     Py_XDECREF(result);
479     return NULL;
480 }
481 
482 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding; errors is passed
   to the encoder if non-NULL.  Returns NULL if the encoder lookup or the
   encoding fails. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
495 
/* Decode object with the decoder registered for encoding; errors is passed
   to the decoder if non-NULL.  Returns NULL if the decoder lookup or the
   decoding fails. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
508 
509 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)510 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
511                                        const char *alternate_command)
512 {
513     PyObject *codec;
514     PyObject *attr;
515     int is_text_codec;
516 
517     codec = _PyCodec_Lookup(encoding);
518     if (codec == NULL)
519         return NULL;
520 
521     /* Backwards compatibility: assume any raw tuple describes a text
522      * encoding, and the same for anything lacking the private
523      * attribute.
524      */
525     if (!PyTuple_CheckExact(codec)) {
526         if (PyObject_GetOptionalAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
527             Py_DECREF(codec);
528             return NULL;
529         }
530         if (attr != NULL) {
531             is_text_codec = PyObject_IsTrue(attr);
532             Py_DECREF(attr);
533             if (is_text_codec <= 0) {
534                 Py_DECREF(codec);
535                 if (!is_text_codec)
536                     PyErr_Format(PyExc_LookupError,
537                                  "'%.400s' is not a text encoding; "
538                                  "use %s to handle arbitrary codecs",
539                                  encoding, alternate_command);
540                 return NULL;
541             }
542         }
543     }
544 
545     /* This appears to be a valid text encoding */
546     return codec;
547 }
548 
549 
550 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)551 PyObject *codec_getitem_checked(const char *encoding,
552                                 const char *alternate_command,
553                                 int index)
554 {
555     PyObject *codec;
556     PyObject *v;
557 
558     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
559     if (codec == NULL)
560         return NULL;
561 
562     v = Py_NewRef(PyTuple_GET_ITEM(codec, index));
563     Py_DECREF(codec);
564     return v;
565 }
566 
_PyCodec_TextEncoder(const char * encoding)567 static PyObject * _PyCodec_TextEncoder(const char *encoding)
568 {
569     return codec_getitem_checked(encoding, "codecs.encode()", 0);
570 }
571 
_PyCodec_TextDecoder(const char * encoding)572 static PyObject * _PyCodec_TextDecoder(const char *encoding)
573 {
574     return codec_getitem_checked(encoding, "codecs.decode()", 1);
575 }
576 
/* Encode object with the encoder of a text encoding; errors is passed to
   the encoder if non-NULL.  Returns NULL if the encoder lookup (including
   the text-encoding check) or the encoding fails. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
589 
/* Decode object with the decoder of a text encoding; errors is passed to
   the decoder if non-NULL.  Returns NULL if the decoder lookup (including
   the text-encoding check) or the decoding fails. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
602 
603 /* Register the error handling callback function error under the name
604    name. This function will be called by the codec when it encounters
605    an unencodable characters/undecodable bytes and doesn't know the
606    callback name, when name is specified as the error parameter
607    in the call to the encode/decode function.
608    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)609 int PyCodec_RegisterError(const char *name, PyObject *error)
610 {
611     PyInterpreterState *interp = _PyInterpreterState_GET();
612     assert(interp->codecs.initialized);
613     if (!PyCallable_Check(error)) {
614         PyErr_SetString(PyExc_TypeError, "handler must be callable");
615         return -1;
616     }
617     return PyDict_SetItemString(interp->codecs.error_registry,
618                                 name, error);
619 }
620 
621 /* Lookup the error handling callback function registered under the
622    name error. As a special case NULL can be passed, in which case
623    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)624 PyObject *PyCodec_LookupError(const char *name)
625 {
626     PyInterpreterState *interp = _PyInterpreterState_GET();
627     assert(interp->codecs.initialized);
628 
629     if (name==NULL)
630         name = "strict";
631     PyObject *handler;
632     if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
633         return NULL;
634     }
635     if (handler == NULL) {
636         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
637         return NULL;
638     }
639     return handler;
640 }
641 
/* Set a TypeError reporting that the error callback cannot handle objects
   of exc's type. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;
    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
648 
/* "strict" error callback: raise exc itself when it is an exception
   instance, otherwise raise a TypeError.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
657 
658 
/* "ignore" error callback: return the tuple (empty string, end), where end
   is the end position stored in the Unicode encode/decode/translate error
   exc.  Any other exception type yields a TypeError and NULL. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
681 
682 
/* "replace" error callback.  Returns a (replacement, end) tuple:

   - UnicodeEncodeError: one '?' per character in [start, end);
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER;
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     character in [start, end).

   Any other exception type yields a TypeError and NULL. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)
            || PyUnicodeEncodeError_GetEnd(exc, &end))
        {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *repl = PyUnicode_New(count, '?');
        if (repl == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(repl) == PyUnicode_1BYTE_KIND);
        Py_UCS1 *dst = PyUnicode_1BYTE_DATA(repl);
        for (Py_ssize_t left = count; left > 0; left--) {
            *dst++ = '?';
        }
        assert(_PyUnicode_CheckConsistency(repl, 1));
        return Py_BuildValue("(Nn)", repl, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)
            || PyUnicodeTranslateError_GetEnd(exc, &end))
        {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *repl = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (repl == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(repl) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *dst = PyUnicode_2BYTE_DATA(repl);
        for (Py_ssize_t left = count; left > 0; left--) {
            *dst++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(repl, 1));
        return Py_BuildValue("(Nn)", repl, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
735 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)736 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
737 {
738     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
739         PyObject *restuple;
740         PyObject *object;
741         Py_ssize_t i;
742         Py_ssize_t start;
743         Py_ssize_t end;
744         PyObject *res;
745         Py_UCS1 *outp;
746         Py_ssize_t ressize;
747         Py_UCS4 ch;
748         if (PyUnicodeEncodeError_GetStart(exc, &start))
749             return NULL;
750         if (PyUnicodeEncodeError_GetEnd(exc, &end))
751             return NULL;
752         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
753             return NULL;
754         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
755             end = start + PY_SSIZE_T_MAX / (2+7+1);
756         for (i = start, ressize = 0; i < end; ++i) {
757             /* object is guaranteed to be "ready" */
758             ch = PyUnicode_READ_CHAR(object, i);
759             if (ch<10)
760                 ressize += 2+1+1;
761             else if (ch<100)
762                 ressize += 2+2+1;
763             else if (ch<1000)
764                 ressize += 2+3+1;
765             else if (ch<10000)
766                 ressize += 2+4+1;
767             else if (ch<100000)
768                 ressize += 2+5+1;
769             else if (ch<1000000)
770                 ressize += 2+6+1;
771             else
772                 ressize += 2+7+1;
773         }
774         /* allocate replacement */
775         res = PyUnicode_New(ressize, 127);
776         if (res == NULL) {
777             Py_DECREF(object);
778             return NULL;
779         }
780         outp = PyUnicode_1BYTE_DATA(res);
781         /* generate replacement */
782         for (i = start; i < end; ++i) {
783             int digits;
784             int base;
785             ch = PyUnicode_READ_CHAR(object, i);
786             *outp++ = '&';
787             *outp++ = '#';
788             if (ch<10) {
789                 digits = 1;
790                 base = 1;
791             }
792             else if (ch<100) {
793                 digits = 2;
794                 base = 10;
795             }
796             else if (ch<1000) {
797                 digits = 3;
798                 base = 100;
799             }
800             else if (ch<10000) {
801                 digits = 4;
802                 base = 1000;
803             }
804             else if (ch<100000) {
805                 digits = 5;
806                 base = 10000;
807             }
808             else if (ch<1000000) {
809                 digits = 6;
810                 base = 100000;
811             }
812             else {
813                 digits = 7;
814                 base = 1000000;
815             }
816             while (digits-->0) {
817                 *outp++ = '0' + ch/base;
818                 ch %= base;
819                 base /= 10;
820             }
821             *outp++ = ';';
822         }
823         assert(_PyUnicode_CheckConsistency(res, 1));
824         restuple = Py_BuildValue("(Nn)", res, end);
825         Py_DECREF(object);
826         return restuple;
827     }
828     else {
829         wrong_exception_type(exc);
830         return NULL;
831     }
832 }
833 
/* "backslashreplace" error callback.  Returns a (replacement, end) tuple:

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN";
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on
     its code point.

   Any other exception type yields a TypeError and NULL. */
PyObject *
PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Py_ssize_t, not int: the range is only capped at PY_SSIZE_T_MAX/10
       characters below, so the accumulated size can exceed INT_MAX. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Each undecodable byte is rendered as the 4 characters "\xNN". */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            /* Named `byte` so it does not shadow the outer Py_UCS4 `c`. */
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Every character expands to at most 1+1+8 output characters; cap the
       range so the total size stays within Py_ssize_t. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  Each branch emits the escape letter
       and its leading hex digits; the final two hex digits are shared by
       all three forms. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
940 
/* Error handler implementing "namereplace".

   Only UnicodeEncodeError instances are accepted; any other argument is
   reported through wrong_exception_type() and NULL is returned.

   Each character of the failing range [start, end) is replaced by
   \N{NAME} when the unicodedata name lookup succeeds, and otherwise by
   \xhh, \uhhhh or \Uhhhhhhhh depending on the size of the code point.

   Returns a new (replacement, end) tuple, where `end` is the position at
   which encoding should resume, or NULL with an exception set. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    Py_ssize_t ressize;
    int replsize;
    Py_UCS4 c;
    char buffer[256]; /* NAME_MAXLEN */

    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    if (PyUnicodeEncodeError_GetStart(exc, &start))
        return NULL;
    if (PyUnicodeEncodeError_GetEnd(exc, &end))
        return NULL;
    /* From here on we own a reference to `object`; every exit path below
       has to release it. */
    if (!(object = PyUnicodeEncodeError_GetObject(exc)))
        return NULL;
    _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
    if (ucnhash_capi == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* First pass: compute the size of the replacement string.  If the
       total would overflow Py_ssize_t, stop early and only replace the
       characters that fit; `end` is shortened accordingly. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
            /* '\\' 'N' '{' name '}' */
            replsize = 1+1+1+(int)strlen(buffer)+1;
        }
        else if (c >= 0x10000) {
            replsize = 1+1+8;
        }
        else if (c >= 0x100) {
            replsize = 1+1+4;
        }
        else
            replsize = 1+1+2;
        if (ressize > PY_SSIZE_T_MAX - replsize)
            break;
        ressize += replsize;
    }
    end = i;

    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: emit the escapes. */
    for (i = start, outp = PyUnicode_1BYTE_DATA(res);
        i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
            size_t namelen = strlen(buffer);
            *outp++ = 'N';
            *outp++ = '{';
            memcpy(outp, buffer, namelen);
            outp += namelen;
            *outp++ = '}';
            continue;
        }
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        /* The two low nibbles are common to all three hex forms. */
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
    assert(_PyUnicode_CheckConsistency(res, 1));
    /* "N" hands the reference to `res` over to the tuple. */
    restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(object);
    return restuple;
}
1030 
/* Codes returned by get_standard_encoding() for the encodings that the
   "surrogatepass" handler knows how to process. */
enum {
    ENC_UNKNOWN = -1,   /* encoding name not recognised */
    ENC_UTF8    = 0,
    ENC_UTF16BE = 1,
    ENC_UTF16LE = 2,
    ENC_UTF32BE = 3,
    ENC_UTF32LE = 4
};
1037 
1038 static int
get_standard_encoding(const char * encoding,int * bytelength)1039 get_standard_encoding(const char *encoding, int *bytelength)
1040 {
1041     if (Py_TOLOWER(encoding[0]) == 'u' &&
1042         Py_TOLOWER(encoding[1]) == 't' &&
1043         Py_TOLOWER(encoding[2]) == 'f') {
1044         encoding += 3;
1045         if (*encoding == '-' || *encoding == '_' )
1046             encoding++;
1047         if (encoding[0] == '8' && encoding[1] == '\0') {
1048             *bytelength = 3;
1049             return ENC_UTF8;
1050         }
1051         else if (encoding[0] == '1' && encoding[1] == '6') {
1052             encoding += 2;
1053             *bytelength = 2;
1054             if (*encoding == '\0') {
1055 #ifdef WORDS_BIGENDIAN
1056                 return ENC_UTF16BE;
1057 #else
1058                 return ENC_UTF16LE;
1059 #endif
1060             }
1061             if (*encoding == '-' || *encoding == '_' )
1062                 encoding++;
1063             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1064                 if (Py_TOLOWER(encoding[0]) == 'b')
1065                     return ENC_UTF16BE;
1066                 if (Py_TOLOWER(encoding[0]) == 'l')
1067                     return ENC_UTF16LE;
1068             }
1069         }
1070         else if (encoding[0] == '3' && encoding[1] == '2') {
1071             encoding += 2;
1072             *bytelength = 4;
1073             if (*encoding == '\0') {
1074 #ifdef WORDS_BIGENDIAN
1075                 return ENC_UTF32BE;
1076 #else
1077                 return ENC_UTF32LE;
1078 #endif
1079             }
1080             if (*encoding == '-' || *encoding == '_' )
1081                 encoding++;
1082             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1083                 if (Py_TOLOWER(encoding[0]) == 'b')
1084                     return ENC_UTF32BE;
1085                 if (Py_TOLOWER(encoding[0]) == 'l')
1086                     return ENC_UTF32LE;
1087             }
1088         }
1089     }
1090     else if (strcmp(encoding, "CP_UTF8") == 0) {
1091         *bytelength = 3;
1092         return ENC_UTF8;
1093     }
1094     return ENC_UNKNOWN;
1095 }
1096 
1097 /* This handler is declared static until someone demonstrates
1098    a need to call it directly. */
1099 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1100 PyCodec_SurrogatePassErrors(PyObject *exc)
1101 {
1102     PyObject *restuple;
1103     PyObject *object;
1104     PyObject *encode;
1105     const char *encoding;
1106     int code;
1107     int bytelength;
1108     Py_ssize_t i;
1109     Py_ssize_t start;
1110     Py_ssize_t end;
1111     PyObject *res;
1112 
1113     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1114         unsigned char *outp;
1115         if (PyUnicodeEncodeError_GetStart(exc, &start))
1116             return NULL;
1117         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1118             return NULL;
1119         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1120             return NULL;
1121         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1122             Py_DECREF(object);
1123             return NULL;
1124         }
1125         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1126             Py_DECREF(object);
1127             Py_DECREF(encode);
1128             return NULL;
1129         }
1130         code = get_standard_encoding(encoding, &bytelength);
1131         Py_DECREF(encode);
1132         if (code == ENC_UNKNOWN) {
1133             /* Not supported, fail with original exception */
1134             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1135             Py_DECREF(object);
1136             return NULL;
1137         }
1138 
1139         if (end - start > PY_SSIZE_T_MAX / bytelength)
1140             end = start + PY_SSIZE_T_MAX / bytelength;
1141         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1142         if (!res) {
1143             Py_DECREF(object);
1144             return NULL;
1145         }
1146         outp = (unsigned char*)PyBytes_AsString(res);
1147         for (i = start; i < end; i++) {
1148             /* object is guaranteed to be "ready" */
1149             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1150             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1151                 /* Not a surrogate, fail with original exception */
1152                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1153                 Py_DECREF(res);
1154                 Py_DECREF(object);
1155                 return NULL;
1156             }
1157             switch (code) {
1158             case ENC_UTF8:
1159                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1160                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1161                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1162                 break;
1163             case ENC_UTF16LE:
1164                 *outp++ = (unsigned char) ch;
1165                 *outp++ = (unsigned char)(ch >> 8);
1166                 break;
1167             case ENC_UTF16BE:
1168                 *outp++ = (unsigned char)(ch >> 8);
1169                 *outp++ = (unsigned char) ch;
1170                 break;
1171             case ENC_UTF32LE:
1172                 *outp++ = (unsigned char) ch;
1173                 *outp++ = (unsigned char)(ch >> 8);
1174                 *outp++ = (unsigned char)(ch >> 16);
1175                 *outp++ = (unsigned char)(ch >> 24);
1176                 break;
1177             case ENC_UTF32BE:
1178                 *outp++ = (unsigned char)(ch >> 24);
1179                 *outp++ = (unsigned char)(ch >> 16);
1180                 *outp++ = (unsigned char)(ch >> 8);
1181                 *outp++ = (unsigned char) ch;
1182                 break;
1183             }
1184         }
1185         restuple = Py_BuildValue("(On)", res, end);
1186         Py_DECREF(res);
1187         Py_DECREF(object);
1188         return restuple;
1189     }
1190     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1191         const unsigned char *p;
1192         Py_UCS4 ch = 0;
1193         if (PyUnicodeDecodeError_GetStart(exc, &start))
1194             return NULL;
1195         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1196             return NULL;
1197         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1198             return NULL;
1199         p = (const unsigned char*)PyBytes_AS_STRING(object);
1200         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1201             Py_DECREF(object);
1202             return NULL;
1203         }
1204         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1205             Py_DECREF(object);
1206             Py_DECREF(encode);
1207             return NULL;
1208         }
1209         code = get_standard_encoding(encoding, &bytelength);
1210         Py_DECREF(encode);
1211         if (code == ENC_UNKNOWN) {
1212             /* Not supported, fail with original exception */
1213             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1214             Py_DECREF(object);
1215             return NULL;
1216         }
1217 
1218         /* Try decoding a single surrogate character. If
1219            there are more, let the codec call us again. */
1220         p += start;
1221         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1222             switch (code) {
1223             case ENC_UTF8:
1224                 if ((p[0] & 0xf0) == 0xe0 &&
1225                     (p[1] & 0xc0) == 0x80 &&
1226                     (p[2] & 0xc0) == 0x80) {
1227                     /* it's a three-byte code */
1228                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1229                 }
1230                 break;
1231             case ENC_UTF16LE:
1232                 ch = p[1] << 8 | p[0];
1233                 break;
1234             case ENC_UTF16BE:
1235                 ch = p[0] << 8 | p[1];
1236                 break;
1237             case ENC_UTF32LE:
1238                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1239                 break;
1240             case ENC_UTF32BE:
1241                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1242                 break;
1243             }
1244         }
1245 
1246         Py_DECREF(object);
1247         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1248             /* it's not a surrogate - fail */
1249             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1250             return NULL;
1251         }
1252         res = PyUnicode_FromOrdinal(ch);
1253         if (res == NULL)
1254             return NULL;
1255         return Py_BuildValue("(Nn)", res, start + bytelength);
1256     }
1257     else {
1258         wrong_exception_type(exc);
1259         return NULL;
1260     }
1261 }
1262 
1263 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1264 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1265 {
1266     PyObject *restuple;
1267     PyObject *object;
1268     Py_ssize_t i;
1269     Py_ssize_t start;
1270     Py_ssize_t end;
1271     PyObject *res;
1272 
1273     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1274         char *outp;
1275         if (PyUnicodeEncodeError_GetStart(exc, &start))
1276             return NULL;
1277         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1278             return NULL;
1279         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1280             return NULL;
1281         res = PyBytes_FromStringAndSize(NULL, end-start);
1282         if (!res) {
1283             Py_DECREF(object);
1284             return NULL;
1285         }
1286         outp = PyBytes_AsString(res);
1287         for (i = start; i < end; i++) {
1288             /* object is guaranteed to be "ready" */
1289             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1290             if (ch < 0xdc80 || ch > 0xdcff) {
1291                 /* Not a UTF-8b surrogate, fail with original exception */
1292                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1293                 Py_DECREF(res);
1294                 Py_DECREF(object);
1295                 return NULL;
1296             }
1297             *outp++ = ch - 0xdc00;
1298         }
1299         restuple = Py_BuildValue("(On)", res, end);
1300         Py_DECREF(res);
1301         Py_DECREF(object);
1302         return restuple;
1303     }
1304     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1305         PyObject *str;
1306         const unsigned char *p;
1307         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1308         int consumed = 0;
1309         if (PyUnicodeDecodeError_GetStart(exc, &start))
1310             return NULL;
1311         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1312             return NULL;
1313         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1314             return NULL;
1315         p = (const unsigned char*)PyBytes_AS_STRING(object);
1316         while (consumed < 4 && consumed < end-start) {
1317             /* Refuse to escape ASCII bytes. */
1318             if (p[start+consumed] < 128)
1319                 break;
1320             ch[consumed] = 0xdc00 + p[start+consumed];
1321             consumed++;
1322         }
1323         Py_DECREF(object);
1324         if (!consumed) {
1325             /* codec complained about ASCII byte. */
1326             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1327             return NULL;
1328         }
1329         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1330         if (str == NULL)
1331             return NULL;
1332         return Py_BuildValue("(Nn)", str, start+consumed);
1333     }
1334     else {
1335         wrong_exception_type(exc);
1336         return NULL;
1337     }
1338 }
1339 
1340 
/* METH_O adapter for the "strict" error handler; forwards `exc` to
   PyCodec_StrictErrors().  `self` is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1345 
1346 
/* METH_O adapter for the "ignore" error handler; forwards `exc` to
   PyCodec_IgnoreErrors().  `self` is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1351 
1352 
/* METH_O adapter for the "replace" error handler; forwards `exc` to
   PyCodec_ReplaceErrors().  `self` is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1357 
1358 
/* METH_O adapter for the "xmlcharrefreplace" error handler; forwards
   `exc` to PyCodec_XMLCharRefReplaceErrors().  `self` is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1363 
1364 
/* METH_O adapter for the "backslashreplace" error handler; forwards
   `exc` to PyCodec_BackslashReplaceErrors().  `self` is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1369 
/* METH_O adapter for the "namereplace" error handler; forwards `exc` to
   PyCodec_NameReplaceErrors().  `self` is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1374 
/* METH_O adapter for the "surrogatepass" error handler; forwards `exc`
   to PyCodec_SurrogatePassErrors().  `self` is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1379 
/* METH_O adapter for the "surrogateescape" error handler; forwards `exc`
   to PyCodec_SurrogateEscapeErrors().  `self` is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1384 
1385 PyStatus
_PyCodec_InitRegistry(PyInterpreterState * interp)1386 _PyCodec_InitRegistry(PyInterpreterState *interp)
1387 {
1388     static struct {
1389         const char *name;
1390         PyMethodDef def;
1391     } methods[] =
1392     {
1393         {
1394             "strict",
1395             {
1396                 "strict_errors",
1397                 strict_errors,
1398                 METH_O,
1399                 PyDoc_STR("Implements the 'strict' error handling, which "
1400                           "raises a UnicodeError on coding errors.")
1401             }
1402         },
1403         {
1404             "ignore",
1405             {
1406                 "ignore_errors",
1407                 ignore_errors,
1408                 METH_O,
1409                 PyDoc_STR("Implements the 'ignore' error handling, which "
1410                           "ignores malformed data and continues.")
1411             }
1412         },
1413         {
1414             "replace",
1415             {
1416                 "replace_errors",
1417                 replace_errors,
1418                 METH_O,
1419                 PyDoc_STR("Implements the 'replace' error handling, which "
1420                           "replaces malformed data with a replacement marker.")
1421             }
1422         },
1423         {
1424             "xmlcharrefreplace",
1425             {
1426                 "xmlcharrefreplace_errors",
1427                 xmlcharrefreplace_errors,
1428                 METH_O,
1429                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1430                           "which replaces an unencodable character with the "
1431                           "appropriate XML character reference.")
1432             }
1433         },
1434         {
1435             "backslashreplace",
1436             {
1437                 "backslashreplace_errors",
1438                 backslashreplace_errors,
1439                 METH_O,
1440                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1441                           "which replaces malformed data with a backslashed "
1442                           "escape sequence.")
1443             }
1444         },
1445         {
1446             "namereplace",
1447             {
1448                 "namereplace_errors",
1449                 namereplace_errors,
1450                 METH_O,
1451                 PyDoc_STR("Implements the 'namereplace' error handling, "
1452                           "which replaces an unencodable character with a "
1453                           "\\N{...} escape sequence.")
1454             }
1455         },
1456         {
1457             "surrogatepass",
1458             {
1459                 "surrogatepass",
1460                 surrogatepass_errors,
1461                 METH_O
1462             }
1463         },
1464         {
1465             "surrogateescape",
1466             {
1467                 "surrogateescape",
1468                 surrogateescape_errors,
1469                 METH_O
1470             }
1471         }
1472     };
1473 
1474     assert(interp->codecs.initialized == 0);
1475     interp->codecs.search_path = PyList_New(0);
1476     if (interp->codecs.search_path == NULL) {
1477         return PyStatus_NoMemory();
1478     }
1479     interp->codecs.search_cache = PyDict_New();
1480     if (interp->codecs.search_cache == NULL) {
1481         return PyStatus_NoMemory();
1482     }
1483     interp->codecs.error_registry = PyDict_New();
1484     if (interp->codecs.error_registry == NULL) {
1485         return PyStatus_NoMemory();
1486     }
1487     for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1488         PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1489         if (func == NULL) {
1490             return PyStatus_NoMemory();
1491         }
1492 
1493         int res = PyDict_SetItemString(interp->codecs.error_registry,
1494                                        methods[i].name, func);
1495         Py_DECREF(func);
1496         if (res < 0) {
1497             return PyStatus_Error("Failed to insert into codec error registry");
1498         }
1499     }
1500 
1501     interp->codecs.initialized = 1;
1502 
1503     // Importing `encodings' will call back into this module to register codec
1504     // search functions, so this is done after everything else is initialized.
1505     PyObject *mod = PyImport_ImportModule("encodings");
1506     if (mod == NULL) {
1507         return PyStatus_Error("Failed to import encodings module");
1508     }
1509     Py_DECREF(mod);
1510 
1511     return PyStatus_Ok();
1512 }
1513 
1514 void
_PyCodec_Fini(PyInterpreterState * interp)1515 _PyCodec_Fini(PyInterpreterState *interp)
1516 {
1517     Py_CLEAR(interp->codecs.search_path);
1518     Py_CLEAR(interp->codecs.search_cache);
1519     Py_CLEAR(interp->codecs.error_registry);
1520     interp->codecs.initialized = 0;
1521 }
1522