• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
13 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
14 #include "ucnhash.h"
15 #include <ctype.h>
16 
17 const char *Py_hexdigits = "0123456789abcdef";
18 
19 /* --- Codec Registry ----------------------------------------------------- */
20 
21 /* Import the standard encodings package which will register the first
22    codec search function.
23 
24    This is done in a lazy way so that the Unicode implementation does
25    not downgrade startup time of scripts not needing it.
26 
27    ImportErrors are silently ignored by this function. Only one try is
28    made.
29 
30 */
31 
32 static int _PyCodecRegistry_Init(void); /* Forward */
33 
/* Append search_function to the current interpreter's codec search path.

   The registry is initialised first if the search path does not exist
   yet.  Returns -1 if that initialisation reports failure, if
   search_function is NULL, or if it is not callable; otherwise returns
   the result of PyList_Append(). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    /* Registry setup happens before the argument is validated. */
    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
        return -1;
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
52 
53 extern int _Py_normalize_encoding(const char *, char *, size_t);
54 
55 /* Convert a string to a normalized Python string (decoded from UTF-8): all characters are
56    converted to lower case, spaces and hyphens are replaced with underscores. */
57 
58 static
normalizestring(const char * string)59 PyObject *normalizestring(const char *string)
60 {
61     size_t len = strlen(string);
62     char *encoding;
63     PyObject *v;
64 
65     if (len > PY_SSIZE_T_MAX) {
66         PyErr_SetString(PyExc_OverflowError, "string is too large");
67         return NULL;
68     }
69 
70     encoding = PyMem_Malloc(len + 1);
71     if (encoding == NULL)
72         return PyErr_NoMemory();
73 
74     if (!_Py_normalize_encoding(string, encoding, len + 1))
75     {
76         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
77         PyMem_Free(encoding);
78         return NULL;
79     }
80 
81     v = PyUnicode_FromString(encoding);
82     PyMem_Free(encoding);
83     return v;
84 }
85 
86 /* Lookup the given encoding and return a tuple providing the codec
87    facilities.
88 
89    The encoding string is looked up converted to all lower-case
90    characters. This makes encodings looked up through this mechanism
91    effectively case-insensitive.
92 
93    If no codec is found, a LookupError is set and NULL returned.
94 
95    As side effect, this tries to load the encodings package, if not
96    yet done. This is part of the lazy load strategy for the encodings
97    package.
98 
99 */
100 
_PyCodec_Lookup(const char * encoding)101 PyObject *_PyCodec_Lookup(const char *encoding)
102 {
103     if (encoding == NULL) {
104         PyErr_BadArgument();
105         return NULL;
106     }
107 
108     PyInterpreterState *interp = _PyInterpreterState_GET();
109     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
110         return NULL;
111     }
112 
113     /* Convert the encoding to a normalized Python string: all
114        characters are converted to lower case, spaces and hyphens are
115        replaced with underscores. */
116     PyObject *v = normalizestring(encoding);
117     if (v == NULL) {
118         return NULL;
119     }
120     PyUnicode_InternInPlace(&v);
121 
122     /* First, try to lookup the name in the registry dictionary */
123     PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124     if (result != NULL) {
125         Py_INCREF(result);
126         Py_DECREF(v);
127         return result;
128     }
129     else if (PyErr_Occurred()) {
130         goto onError;
131     }
132 
133     /* Next, scan the search functions in order of registration */
134     const Py_ssize_t len = PyList_Size(interp->codec_search_path);
135     if (len < 0)
136         goto onError;
137     if (len == 0) {
138         PyErr_SetString(PyExc_LookupError,
139                         "no codec search functions registered: "
140                         "can't find encoding");
141         goto onError;
142     }
143 
144     Py_ssize_t i;
145     for (i = 0; i < len; i++) {
146         PyObject *func;
147 
148         func = PyList_GetItem(interp->codec_search_path, i);
149         if (func == NULL)
150             goto onError;
151         result = PyObject_CallOneArg(func, v);
152         if (result == NULL)
153             goto onError;
154         if (result == Py_None) {
155             Py_DECREF(result);
156             continue;
157         }
158         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159             PyErr_SetString(PyExc_TypeError,
160                             "codec search functions must return 4-tuples");
161             Py_DECREF(result);
162             goto onError;
163         }
164         break;
165     }
166     if (i == len) {
167         /* XXX Perhaps we should cache misses too ? */
168         PyErr_Format(PyExc_LookupError,
169                      "unknown encoding: %s", encoding);
170         goto onError;
171     }
172 
173     /* Cache and return the result */
174     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175         Py_DECREF(result);
176         goto onError;
177     }
178     Py_DECREF(v);
179     return result;
180 
181  onError:
182     Py_DECREF(v);
183     return NULL;
184 }
185 
_PyCodec_Forget(const char * encoding)186 int _PyCodec_Forget(const char *encoding)
187 {
188     PyObject *v;
189     int result;
190 
191     PyInterpreterState *interp = _PyInterpreterState_GET();
192     if (interp->codec_search_path == NULL) {
193         return -1;
194     }
195 
196     /* Convert the encoding to a normalized Python string: all
197        characters are converted to lower case, spaces and hyphens are
198        replaced with underscores. */
199     v = normalizestring(encoding);
200     if (v == NULL) {
201         return -1;
202     }
203 
204     /* Drop the named codec from the internal cache */
205     result = PyDict_DelItem(interp->codec_search_cache, v);
206     Py_DECREF(v);
207 
208     return result;
209 }
210 
211 /* Codec registry encoding check API. */
212 
PyCodec_KnownEncoding(const char * encoding)213 int PyCodec_KnownEncoding(const char *encoding)
214 {
215     PyObject *codecs;
216 
217     codecs = _PyCodec_Lookup(encoding);
218     if (!codecs) {
219         PyErr_Clear();
220         return 0;
221     }
222     else {
223         Py_DECREF(codecs);
224         return 1;
225     }
226 }
227 
228 static
args_tuple(PyObject * object,const char * errors)229 PyObject *args_tuple(PyObject *object,
230                      const char *errors)
231 {
232     PyObject *args;
233 
234     args = PyTuple_New(1 + (errors != NULL));
235     if (args == NULL)
236         return NULL;
237     Py_INCREF(object);
238     PyTuple_SET_ITEM(args,0,object);
239     if (errors) {
240         PyObject *v;
241 
242         v = PyUnicode_FromString(errors);
243         if (v == NULL) {
244             Py_DECREF(args);
245             return NULL;
246         }
247         PyTuple_SET_ITEM(args, 1, v);
248     }
249     return args;
250 }
251 
252 /* Helper function to get a codec item */
253 
254 static
codec_getitem(const char * encoding,int index)255 PyObject *codec_getitem(const char *encoding, int index)
256 {
257     PyObject *codecs;
258     PyObject *v;
259 
260     codecs = _PyCodec_Lookup(encoding);
261     if (codecs == NULL)
262         return NULL;
263     v = PyTuple_GET_ITEM(codecs, index);
264     Py_DECREF(codecs);
265     Py_INCREF(v);
266     return v;
267 }
268 
269 /* Helper functions to create an incremental codec. */
270 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)271 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272                                      const char *errors,
273                                      const char *attrname)
274 {
275     PyObject *ret, *inccodec;
276 
277     inccodec = PyObject_GetAttrString(codec_info, attrname);
278     if (inccodec == NULL)
279         return NULL;
280     if (errors)
281         ret = PyObject_CallFunction(inccodec, "s", errors);
282     else
283         ret = _PyObject_CallNoArg(inccodec);
284     Py_DECREF(inccodec);
285     return ret;
286 }
287 
288 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)289 PyObject *codec_getincrementalcodec(const char *encoding,
290                                     const char *errors,
291                                     const char *attrname)
292 {
293     PyObject *codec_info, *ret;
294 
295     codec_info = _PyCodec_Lookup(encoding);
296     if (codec_info == NULL)
297         return NULL;
298     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299     Py_DECREF(codec_info);
300     return ret;
301 }
302 
303 /* Helper function to create a stream codec. */
304 
305 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)306 PyObject *codec_getstreamcodec(const char *encoding,
307                                PyObject *stream,
308                                const char *errors,
309                                const int index)
310 {
311     PyObject *codecs, *streamcodec, *codeccls;
312 
313     codecs = _PyCodec_Lookup(encoding);
314     if (codecs == NULL)
315         return NULL;
316 
317     codeccls = PyTuple_GET_ITEM(codecs, index);
318     if (errors != NULL)
319         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
320     else
321         streamcodec = PyObject_CallOneArg(codeccls, stream);
322     Py_DECREF(codecs);
323     return streamcodec;
324 }
325 
326 /* Helpers to work with the result of _PyCodec_Lookup
327 
328  */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
335 
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info (see codec_makeincrementalcodec()). */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
342 
343 
344 /* Convenience APIs to query the Codec registry.
345 
346    All APIs return a codec object with incremented refcount.
347 
348  */
349 
PyCodec_Encoder(const char * encoding)350 PyObject *PyCodec_Encoder(const char *encoding)
351 {
352     return codec_getitem(encoding, 0);
353 }
354 
PyCodec_Decoder(const char * encoding)355 PyObject *PyCodec_Decoder(const char *encoding)
356 {
357     return codec_getitem(encoding, 1);
358 }
359 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)360 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
361                                      const char *errors)
362 {
363     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
364 }
365 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)366 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
367                                      const char *errors)
368 {
369     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
370 }
371 
/* Wrap `stream` using item 2 of the codec tuple registered for
   `encoding`; `errors` is forwarded when non-NULL. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAM_READER_INDEX = 2 };

    return codec_getstreamcodec(encoding, stream, errors,
                                STREAM_READER_INDEX);
}
378 
/* Wrap `stream` using item 3 of the codec tuple registered for
   `encoding`; `errors` is forwarded when non-NULL. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAM_WRITER_INDEX = 3 };

    return codec_getstreamcodec(encoding, stream, errors,
                                STREAM_WRITER_INDEX);
}
385 
386 /* Helper that tries to ensure the reported exception chain indicates the
387  * codec that was invoked to trigger the failure without changing the type
388  * of the exception raised.
389  */
/* Annotate the active exception with the codec operation that failed.

   Delegates to _PyErr_TrySetFromCause(), passing a message built from
   `operation` and `encoding`. */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    static const char message_format[] = "%s with '%s' codec failed";

    _PyErr_TrySetFromCause(message_format, operation, encoding);
}
401 
402 /* Encode an object (e.g. a Unicode object) using the given encoding
403    and return the resulting encoded object (usually a Python string).
404 
405    errors is passed to the encoder factory as argument if non-NULL. */
406 
407 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)408 _PyCodec_EncodeInternal(PyObject *object,
409                         PyObject *encoder,
410                         const char *encoding,
411                         const char *errors)
412 {
413     PyObject *args = NULL, *result = NULL;
414     PyObject *v = NULL;
415 
416     args = args_tuple(object, errors);
417     if (args == NULL)
418         goto onError;
419 
420     result = PyObject_Call(encoder, args, NULL);
421     if (result == NULL) {
422         wrap_codec_error("encoding", encoding);
423         goto onError;
424     }
425 
426     if (!PyTuple_Check(result) ||
427         PyTuple_GET_SIZE(result) != 2) {
428         PyErr_SetString(PyExc_TypeError,
429                         "encoder must return a tuple (object, integer)");
430         goto onError;
431     }
432     v = PyTuple_GET_ITEM(result,0);
433     Py_INCREF(v);
434     /* We don't check or use the second (integer) entry. */
435 
436     Py_DECREF(args);
437     Py_DECREF(encoder);
438     Py_DECREF(result);
439     return v;
440 
441  onError:
442     Py_XDECREF(result);
443     Py_XDECREF(args);
444     Py_XDECREF(encoder);
445     return NULL;
446 }
447 
448 /* Decode an object (usually a Python string) using the given encoding
449    and return an equivalent object (e.g. a Unicode object).
450 
451    errors is passed to the decoder factory as argument if non-NULL. */
452 
453 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)454 _PyCodec_DecodeInternal(PyObject *object,
455                         PyObject *decoder,
456                         const char *encoding,
457                         const char *errors)
458 {
459     PyObject *args = NULL, *result = NULL;
460     PyObject *v;
461 
462     args = args_tuple(object, errors);
463     if (args == NULL)
464         goto onError;
465 
466     result = PyObject_Call(decoder, args, NULL);
467     if (result == NULL) {
468         wrap_codec_error("decoding", encoding);
469         goto onError;
470     }
471     if (!PyTuple_Check(result) ||
472         PyTuple_GET_SIZE(result) != 2) {
473         PyErr_SetString(PyExc_TypeError,
474                         "decoder must return a tuple (object,integer)");
475         goto onError;
476     }
477     v = PyTuple_GET_ITEM(result,0);
478     Py_INCREF(v);
479     /* We don't check or use the second (integer) entry. */
480 
481     Py_DECREF(args);
482     Py_DECREF(decoder);
483     Py_DECREF(result);
484     return v;
485 
486  onError:
487     Py_XDECREF(args);
488     Py_XDECREF(decoder);
489     Py_XDECREF(result);
490     return NULL;
491 }
492 
493 /* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`.

   `errors` is forwarded to the encoder when non-NULL.  Returns a new
   reference, or NULL if the lookup or the encoding fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
506 
/* Decode `object` with the decoder registered for `encoding`.

   `errors` is forwarded to the decoder when non-NULL.  Returns a new
   reference, or NULL if the lookup or the decoding fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
519 
520 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)521 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
522                                        const char *alternate_command)
523 {
524     _Py_IDENTIFIER(_is_text_encoding);
525     PyObject *codec;
526     PyObject *attr;
527     int is_text_codec;
528 
529     codec = _PyCodec_Lookup(encoding);
530     if (codec == NULL)
531         return NULL;
532 
533     /* Backwards compatibility: assume any raw tuple describes a text
534      * encoding, and the same for anything lacking the private
535      * attribute.
536      */
537     if (!PyTuple_CheckExact(codec)) {
538         if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
539             Py_DECREF(codec);
540             return NULL;
541         }
542         if (attr != NULL) {
543             is_text_codec = PyObject_IsTrue(attr);
544             Py_DECREF(attr);
545             if (is_text_codec <= 0) {
546                 Py_DECREF(codec);
547                 if (!is_text_codec)
548                     PyErr_Format(PyExc_LookupError,
549                                  "'%.400s' is not a text encoding; "
550                                  "use %s to handle arbitrary codecs",
551                                  encoding, alternate_command);
552                 return NULL;
553             }
554         }
555     }
556 
557     /* This appears to be a valid text encoding */
558     return codec;
559 }
560 
561 
562 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)563 PyObject *codec_getitem_checked(const char *encoding,
564                                 const char *alternate_command,
565                                 int index)
566 {
567     PyObject *codec;
568     PyObject *v;
569 
570     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571     if (codec == NULL)
572         return NULL;
573 
574     v = PyTuple_GET_ITEM(codec, index);
575     Py_INCREF(v);
576     Py_DECREF(codec);
577     return v;
578 }
579 
_PyCodec_TextEncoder(const char * encoding)580 static PyObject * _PyCodec_TextEncoder(const char *encoding)
581 {
582     return codec_getitem_checked(encoding, "codecs.encode()", 0);
583 }
584 
_PyCodec_TextDecoder(const char * encoding)585 static PyObject * _PyCodec_TextDecoder(const char *encoding)
586 {
587     return codec_getitem_checked(encoding, "codecs.decode()", 1);
588 }
589 
/* Encode `object` with `encoding`, which must be a text encoding.

   Returns a new reference, or NULL if the checked lookup or the
   encoding itself fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    /* _PyCodec_EncodeInternal() releases the encoder reference. */
    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
602 
/* Decode `object` with `encoding`, which must be a text encoding.

   Returns a new reference, or NULL if the checked lookup or the
   decoding itself fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    /* _PyCodec_DecodeInternal() releases the decoder reference. */
    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
615 
616 /* Register the error handling callback function `error` under the name
617    `name`. This function will be called by the codec when it encounters
618    unencodable characters/undecodable bytes and doesn't know the
619    callback name, when name is specified as the error parameter
620    in the call to the encode/decode function.
621    Return 0 on success, -1 on error. */
PyCodec_RegisterError(const char * name,PyObject * error)622 int PyCodec_RegisterError(const char *name, PyObject *error)
623 {
624     PyInterpreterState *interp = _PyInterpreterState_GET();
625     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626         return -1;
627     if (!PyCallable_Check(error)) {
628         PyErr_SetString(PyExc_TypeError, "handler must be callable");
629         return -1;
630     }
631     return PyDict_SetItemString(interp->codec_error_registry,
632                                 name, error);
633 }
634 
635 /* Lookup the error handling callback function registered under the
636    name error. As a special case NULL can be passed, in which case
637    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)638 PyObject *PyCodec_LookupError(const char *name)
639 {
640     PyObject *handler = NULL;
641 
642     PyInterpreterState *interp = _PyInterpreterState_GET();
643     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644         return NULL;
645 
646     if (name==NULL)
647         name = "strict";
648     handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649     if (handler) {
650         Py_INCREF(handler);
651     }
652     else if (!PyErr_Occurred()) {
653         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654     }
655     return handler;
656 }
657 
/* Raise TypeError naming the type of `exc`; used by the error callbacks
   when handed an exception they do not support. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664 
/* "strict" error callback: re-raise `exc` itself.

   Always returns NULL.  If `exc` is not an exception instance a
   TypeError is raised instead. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
673 
674 
/* "ignore" error callback: replace the offending range with nothing.

   Accepts UnicodeEncodeError, UnicodeDecodeError and
   UnicodeTranslateError instances.  Returns the tuple
   (empty str, end position), or NULL with an exception set. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    /* Pick the accessor that matches the exception type. */
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697 
698 
/* "replace" error callback.

   - UnicodeEncodeError: one '?' per offending character.
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     offending character.

   Returns the tuple (replacement str, end position), or NULL with an
   exception set. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, '?');
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_1BYTE_KIND);
        Py_UCS1 *cells = PyUnicode_1BYTE_DATA(filler);
        for (Py_ssize_t k = 0; k < count; k++) {
            cells[k] = '?';
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        /* One replacement character regardless of the range length. */
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start) ||
            PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *cells = PyUnicode_2BYTE_DATA(filler);
        for (Py_ssize_t k = 0; k < count; k++) {
            cells[k] = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
751 
/* "xmlcharrefreplace" error callback.

   Only UnicodeEncodeError is supported.  Each offending character is
   replaced by "&#" + its code point in decimal (1 to 7 digits) + ";".
   The handled range is capped so that the worst-case output length
   (10 characters per input character) cannot exceed PY_SSIZE_T_MAX.

   Returns the tuple (replacement str, end position), or NULL with an
   exception set. */
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    Py_ssize_t start, end;
    if (PyUnicodeEncodeError_GetStart(exc, &start) ||
        PyUnicodeEncodeError_GetEnd(exc, &end)) {
        return NULL;
    }
    PyObject *text = PyUnicodeEncodeError_GetObject(exc);
    if (text == NULL) {
        return NULL;
    }

    /* Cap the range: "&#" + 7 digits + ";" is the longest reference. */
    if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
        end = start + PY_SSIZE_T_MAX / (2+7+1);
    }

    /* Pass 1: measure the replacement. */
    Py_ssize_t total = 0;
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        /* object is guaranteed to be "ready" */
        const Py_UCS4 ch = PyUnicode_READ_CHAR(text, pos);
        int digits = 1;
        Py_UCS4 divisor = 1;
        while (digits < 7 && ch / divisor >= 10) {
            divisor *= 10;
            digits++;
        }
        total += 2 + digits + 1;
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(text);
        return NULL;
    }

    /* Pass 2: emit "&#<decimal>;" for every character in the range. */
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < end; ++pos) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(text, pos);
        int digits = 1;
        Py_UCS4 divisor = 1;
        while (digits < 7 && ch / divisor >= 10) {
            divisor *= 10;
            digits++;
        }

        *out++ = '&';
        *out++ = '#';
        /* Most significant digit first. */
        for (; digits > 0; digits--) {
            *out++ = '0' + ch / divisor;
            ch %= divisor;
            divisor /= 10;
        }
        *out++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));

    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(text);
    return restuple;
}
849 
/* "backslashreplace" error callback.

   - UnicodeDecodeError: every offending byte becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every offending
     character becomes "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on
     its code point.

   In both cases the handled range is capped so that the size of the
   replacement string cannot overflow Py_ssize_t.

   Returns the tuple (replacement str, end position), or NULL with an
   exception set. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t: the sum can reach PY_SSIZE_T_MAX, far beyond
       what an int can hold on 64-bit platforms. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Each byte expands to 4 characters; cap the range so that the
           multiplication below cannot overflow (mirrors the cap used
           for the encode/translate case). */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case is "\U" + 8 hex digits = 10 characters per input
       character; cap the range so ressize stays within Py_ssize_t. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* Pass 1: measure the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Pass 2: emit the escapes.  The two low hex digits are common to
       all three forms and are written after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956 
957 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
958 
PyCodec_NameReplaceErrors(PyObject * exc)959 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960 {
961     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
962         PyObject *restuple;
963         PyObject *object;
964         Py_ssize_t i;
965         Py_ssize_t start;
966         Py_ssize_t end;
967         PyObject *res;
968         Py_UCS1 *outp;
969         Py_ssize_t ressize;
970         int replsize;
971         Py_UCS4 c;
972         char buffer[256]; /* NAME_MAXLEN */
973         if (PyUnicodeEncodeError_GetStart(exc, &start))
974             return NULL;
975         if (PyUnicodeEncodeError_GetEnd(exc, &end))
976             return NULL;
977         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
978             return NULL;
979         if (!ucnhash_CAPI) {
980             /* load the unicode data module */
981             ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
982                                             PyUnicodeData_CAPSULE_NAME, 1);
983             if (!ucnhash_CAPI)
984                 return NULL;
985         }
986         for (i = start, ressize = 0; i < end; ++i) {
987             /* object is guaranteed to be "ready" */
988             c = PyUnicode_READ_CHAR(object, i);
989             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
990                 replsize = 1+1+1+(int)strlen(buffer)+1;
991             }
992             else if (c >= 0x10000) {
993                 replsize = 1+1+8;
994             }
995             else if (c >= 0x100) {
996                 replsize = 1+1+4;
997             }
998             else
999                 replsize = 1+1+2;
1000             if (ressize > PY_SSIZE_T_MAX - replsize)
1001                 break;
1002             ressize += replsize;
1003         }
1004         end = i;
1005         res = PyUnicode_New(ressize, 127);
1006         if (res==NULL)
1007             return NULL;
1008         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1009             i < end; ++i) {
1010             c = PyUnicode_READ_CHAR(object, i);
1011             *outp++ = '\\';
1012             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1013                 *outp++ = 'N';
1014                 *outp++ = '{';
1015                 strcpy((char *)outp, buffer);
1016                 outp += strlen(buffer);
1017                 *outp++ = '}';
1018                 continue;
1019             }
1020             if (c >= 0x00010000) {
1021                 *outp++ = 'U';
1022                 *outp++ = Py_hexdigits[(c>>28)&0xf];
1023                 *outp++ = Py_hexdigits[(c>>24)&0xf];
1024                 *outp++ = Py_hexdigits[(c>>20)&0xf];
1025                 *outp++ = Py_hexdigits[(c>>16)&0xf];
1026                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1027                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1028             }
1029             else if (c >= 0x100) {
1030                 *outp++ = 'u';
1031                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1032                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1033             }
1034             else
1035                 *outp++ = 'x';
1036             *outp++ = Py_hexdigits[(c>>4)&0xf];
1037             *outp++ = Py_hexdigits[c&0xf];
1038         }
1039 
1040         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1041         assert(_PyUnicode_CheckConsistency(res, 1));
1042         restuple = Py_BuildValue("(Nn)", res, end);
1043         Py_DECREF(object);
1044         return restuple;
1045     }
1046     else {
1047         wrong_exception_type(exc);
1048         return NULL;
1049     }
1050 }
1051 
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Interpret what follows "utf16"/"utf32" in an encoding name.

   suffix points just past the digits.  An empty suffix selects the
   platform's native byte order (per WORDS_BIGENDIAN); an optional '-' or
   '_' followed by a case-insensitive "be" or "le" selects be_code or
   le_code.  Anything else yields ENC_UNKNOWN. */
static int
utf_byteorder_from_suffix(const char *suffix, int be_code, int le_code)
{
    int order;

    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return be_code;
#else
        return le_code;
#endif
    }
    if (*suffix == '-' || *suffix == '_')
        suffix++;
    /* Look at suffix[0] before suffix[1]: for a name such as "utf-16-" the
       remainder is empty, and reading suffix[1] first would read past the
       terminating NUL. */
    order = Py_TOLOWER(suffix[0]);
    if ((order == 'b' || order == 'l') &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        return (order == 'b') ? be_code : le_code;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name onto one of the ENC_* constants.

   Recognized spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32", the latter two with an optional byte-order suffix
   (see utf_byteorder_from_suffix()), plus the exact name "CP_UTF8".

   On success *bytelength is set to the number of bytes used for one
   surrogate code point in that encoding (3, 2 or 4).  When ENC_UNKNOWN is
   returned, *bytelength may or may not have been written and must not be
   relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return utf_byteorder_from_suffix(encoding + 2,
                                             ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return utf_byteorder_from_suffix(encoding + 2,
                                             ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1117 
1118 /* This handler is declared static until someone demonstrates
1119    a need to call it directly. */
1120 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1121 PyCodec_SurrogatePassErrors(PyObject *exc)
1122 {
1123     PyObject *restuple;
1124     PyObject *object;
1125     PyObject *encode;
1126     const char *encoding;
1127     int code;
1128     int bytelength;
1129     Py_ssize_t i;
1130     Py_ssize_t start;
1131     Py_ssize_t end;
1132     PyObject *res;
1133 
1134     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1135         unsigned char *outp;
1136         if (PyUnicodeEncodeError_GetStart(exc, &start))
1137             return NULL;
1138         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1139             return NULL;
1140         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1141             return NULL;
1142         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1143             Py_DECREF(object);
1144             return NULL;
1145         }
1146         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1147             Py_DECREF(object);
1148             Py_DECREF(encode);
1149             return NULL;
1150         }
1151         code = get_standard_encoding(encoding, &bytelength);
1152         Py_DECREF(encode);
1153         if (code == ENC_UNKNOWN) {
1154             /* Not supported, fail with original exception */
1155             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1156             Py_DECREF(object);
1157             return NULL;
1158         }
1159 
1160         if (end - start > PY_SSIZE_T_MAX / bytelength)
1161             end = start + PY_SSIZE_T_MAX / bytelength;
1162         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1163         if (!res) {
1164             Py_DECREF(object);
1165             return NULL;
1166         }
1167         outp = (unsigned char*)PyBytes_AsString(res);
1168         for (i = start; i < end; i++) {
1169             /* object is guaranteed to be "ready" */
1170             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1171             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1172                 /* Not a surrogate, fail with original exception */
1173                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1174                 Py_DECREF(res);
1175                 Py_DECREF(object);
1176                 return NULL;
1177             }
1178             switch (code) {
1179             case ENC_UTF8:
1180                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1181                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1182                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1183                 break;
1184             case ENC_UTF16LE:
1185                 *outp++ = (unsigned char) ch;
1186                 *outp++ = (unsigned char)(ch >> 8);
1187                 break;
1188             case ENC_UTF16BE:
1189                 *outp++ = (unsigned char)(ch >> 8);
1190                 *outp++ = (unsigned char) ch;
1191                 break;
1192             case ENC_UTF32LE:
1193                 *outp++ = (unsigned char) ch;
1194                 *outp++ = (unsigned char)(ch >> 8);
1195                 *outp++ = (unsigned char)(ch >> 16);
1196                 *outp++ = (unsigned char)(ch >> 24);
1197                 break;
1198             case ENC_UTF32BE:
1199                 *outp++ = (unsigned char)(ch >> 24);
1200                 *outp++ = (unsigned char)(ch >> 16);
1201                 *outp++ = (unsigned char)(ch >> 8);
1202                 *outp++ = (unsigned char) ch;
1203                 break;
1204             }
1205         }
1206         restuple = Py_BuildValue("(On)", res, end);
1207         Py_DECREF(res);
1208         Py_DECREF(object);
1209         return restuple;
1210     }
1211     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1212         const unsigned char *p;
1213         Py_UCS4 ch = 0;
1214         if (PyUnicodeDecodeError_GetStart(exc, &start))
1215             return NULL;
1216         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1217             return NULL;
1218         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1219             return NULL;
1220         p = (const unsigned char*)PyBytes_AS_STRING(object);
1221         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1222             Py_DECREF(object);
1223             return NULL;
1224         }
1225         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1226             Py_DECREF(object);
1227             Py_DECREF(encode);
1228             return NULL;
1229         }
1230         code = get_standard_encoding(encoding, &bytelength);
1231         Py_DECREF(encode);
1232         if (code == ENC_UNKNOWN) {
1233             /* Not supported, fail with original exception */
1234             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1235             Py_DECREF(object);
1236             return NULL;
1237         }
1238 
1239         /* Try decoding a single surrogate character. If
1240            there are more, let the codec call us again. */
1241         p += start;
1242         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1243             switch (code) {
1244             case ENC_UTF8:
1245                 if ((p[0] & 0xf0) == 0xe0 &&
1246                     (p[1] & 0xc0) == 0x80 &&
1247                     (p[2] & 0xc0) == 0x80) {
1248                     /* it's a three-byte code */
1249                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1250                 }
1251                 break;
1252             case ENC_UTF16LE:
1253                 ch = p[1] << 8 | p[0];
1254                 break;
1255             case ENC_UTF16BE:
1256                 ch = p[0] << 8 | p[1];
1257                 break;
1258             case ENC_UTF32LE:
1259                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1260                 break;
1261             case ENC_UTF32BE:
1262                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1263                 break;
1264             }
1265         }
1266 
1267         Py_DECREF(object);
1268         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1269             /* it's not a surrogate - fail */
1270             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1271             return NULL;
1272         }
1273         res = PyUnicode_FromOrdinal(ch);
1274         if (res == NULL)
1275             return NULL;
1276         return Py_BuildValue("(Nn)", res, start + bytelength);
1277     }
1278     else {
1279         wrong_exception_type(exc);
1280         return NULL;
1281     }
1282 }
1283 
1284 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1285 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1286 {
1287     PyObject *restuple;
1288     PyObject *object;
1289     Py_ssize_t i;
1290     Py_ssize_t start;
1291     Py_ssize_t end;
1292     PyObject *res;
1293 
1294     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1295         char *outp;
1296         if (PyUnicodeEncodeError_GetStart(exc, &start))
1297             return NULL;
1298         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1299             return NULL;
1300         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1301             return NULL;
1302         res = PyBytes_FromStringAndSize(NULL, end-start);
1303         if (!res) {
1304             Py_DECREF(object);
1305             return NULL;
1306         }
1307         outp = PyBytes_AsString(res);
1308         for (i = start; i < end; i++) {
1309             /* object is guaranteed to be "ready" */
1310             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1311             if (ch < 0xdc80 || ch > 0xdcff) {
1312                 /* Not a UTF-8b surrogate, fail with original exception */
1313                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1314                 Py_DECREF(res);
1315                 Py_DECREF(object);
1316                 return NULL;
1317             }
1318             *outp++ = ch - 0xdc00;
1319         }
1320         restuple = Py_BuildValue("(On)", res, end);
1321         Py_DECREF(res);
1322         Py_DECREF(object);
1323         return restuple;
1324     }
1325     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1326         PyObject *str;
1327         const unsigned char *p;
1328         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1329         int consumed = 0;
1330         if (PyUnicodeDecodeError_GetStart(exc, &start))
1331             return NULL;
1332         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1333             return NULL;
1334         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1335             return NULL;
1336         p = (const unsigned char*)PyBytes_AS_STRING(object);
1337         while (consumed < 4 && consumed < end-start) {
1338             /* Refuse to escape ASCII bytes. */
1339             if (p[start+consumed] < 128)
1340                 break;
1341             ch[consumed] = 0xdc00 + p[start+consumed];
1342             consumed++;
1343         }
1344         Py_DECREF(object);
1345         if (!consumed) {
1346             /* codec complained about ASCII byte. */
1347             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1348             return NULL;
1349         }
1350         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1351         if (str == NULL)
1352             return NULL;
1353         return Py_BuildValue("(Nn)", str, start+consumed);
1354     }
1355     else {
1356         wrong_exception_type(exc);
1357         return NULL;
1358     }
1359 }
1360 
1361 
/* METH_O adapter registered as the "strict" error handler by
   _PyCodecRegistry_Init(): forwards exc to PyCodec_StrictErrors().
   self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1366 
1367 
/* METH_O adapter registered as the "ignore" error handler by
   _PyCodecRegistry_Init(): forwards exc to PyCodec_IgnoreErrors().
   self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1372 
1373 
/* METH_O adapter registered as the "replace" error handler by
   _PyCodecRegistry_Init(): forwards exc to PyCodec_ReplaceErrors().
   self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1378 
1379 
/* METH_O adapter registered as the "xmlcharrefreplace" error handler by
   _PyCodecRegistry_Init(): forwards exc to
   PyCodec_XMLCharRefReplaceErrors().  self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1384 
1385 
/* METH_O adapter registered as the "backslashreplace" error handler by
   _PyCodecRegistry_Init(): forwards exc to
   PyCodec_BackslashReplaceErrors().  self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1390 
/* METH_O adapter registered as the "namereplace" error handler by
   _PyCodecRegistry_Init(): forwards exc to PyCodec_NameReplaceErrors().
   self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1395 
/* METH_O adapter registered as the "surrogatepass" error handler by
   _PyCodecRegistry_Init(): forwards exc to PyCodec_SurrogatePassErrors().
   self is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1400 
/* METH_O adapter registered as the "surrogateescape" error handler by
   _PyCodecRegistry_Init(): forwards exc to
   PyCodec_SurrogateEscapeErrors().  self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1405 
_PyCodecRegistry_Init(void)1406 static int _PyCodecRegistry_Init(void)
1407 {
1408     static struct {
1409         const char *name;
1410         PyMethodDef def;
1411     } methods[] =
1412     {
1413         {
1414             "strict",
1415             {
1416                 "strict_errors",
1417                 strict_errors,
1418                 METH_O,
1419                 PyDoc_STR("Implements the 'strict' error handling, which "
1420                           "raises a UnicodeError on coding errors.")
1421             }
1422         },
1423         {
1424             "ignore",
1425             {
1426                 "ignore_errors",
1427                 ignore_errors,
1428                 METH_O,
1429                 PyDoc_STR("Implements the 'ignore' error handling, which "
1430                           "ignores malformed data and continues.")
1431             }
1432         },
1433         {
1434             "replace",
1435             {
1436                 "replace_errors",
1437                 replace_errors,
1438                 METH_O,
1439                 PyDoc_STR("Implements the 'replace' error handling, which "
1440                           "replaces malformed data with a replacement marker.")
1441             }
1442         },
1443         {
1444             "xmlcharrefreplace",
1445             {
1446                 "xmlcharrefreplace_errors",
1447                 xmlcharrefreplace_errors,
1448                 METH_O,
1449                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1450                           "which replaces an unencodable character with the "
1451                           "appropriate XML character reference.")
1452             }
1453         },
1454         {
1455             "backslashreplace",
1456             {
1457                 "backslashreplace_errors",
1458                 backslashreplace_errors,
1459                 METH_O,
1460                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1461                           "which replaces malformed data with a backslashed "
1462                           "escape sequence.")
1463             }
1464         },
1465         {
1466             "namereplace",
1467             {
1468                 "namereplace_errors",
1469                 namereplace_errors,
1470                 METH_O,
1471                 PyDoc_STR("Implements the 'namereplace' error handling, "
1472                           "which replaces an unencodable character with a "
1473                           "\\N{...} escape sequence.")
1474             }
1475         },
1476         {
1477             "surrogatepass",
1478             {
1479                 "surrogatepass",
1480                 surrogatepass_errors,
1481                 METH_O
1482             }
1483         },
1484         {
1485             "surrogateescape",
1486             {
1487                 "surrogateescape",
1488                 surrogateescape_errors,
1489                 METH_O
1490             }
1491         }
1492     };
1493 
1494     PyInterpreterState *interp = _PyInterpreterState_GET();
1495     PyObject *mod;
1496 
1497     if (interp->codec_search_path != NULL)
1498         return 0;
1499 
1500     interp->codec_search_path = PyList_New(0);
1501     if (interp->codec_search_path == NULL) {
1502         return -1;
1503     }
1504 
1505     interp->codec_search_cache = PyDict_New();
1506     if (interp->codec_search_cache == NULL) {
1507         return -1;
1508     }
1509 
1510     interp->codec_error_registry = PyDict_New();
1511     if (interp->codec_error_registry == NULL) {
1512         return -1;
1513     }
1514 
1515     for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516         PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517         if (!func) {
1518             return -1;
1519         }
1520 
1521         int res = PyCodec_RegisterError(methods[i].name, func);
1522         Py_DECREF(func);
1523         if (res) {
1524             return -1;
1525         }
1526     }
1527 
1528     mod = PyImport_ImportModuleNoBlock("encodings");
1529     if (mod == NULL) {
1530         return -1;
1531     }
1532     Py_DECREF(mod);
1533     interp->codecs_initialized = 1;
1534     return 0;
1535 }
1536