• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
13 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
14 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
15 #include <ctype.h>
16 
17 const char *Py_hexdigits = "0123456789abcdef";
18 
19 /* --- Codec Registry ----------------------------------------------------- */
20 
21 /* Import the standard encodings package which will register the first
22    codec search function.
23 
24    This is done in a lazy way so that the Unicode implementation does
25    not downgrade startup time of scripts not needing it.
26 
27    ImportErrors are silently ignored by this function. Only one try is
28    made.
29 
30 */
31 
32 static int _PyCodecRegistry_Init(void); /* Forward */
33 
/* Append search_function to the interpreter's codec search path.

   The codec registry is set up first if the search path does not exist yet.
   Returns the result of PyList_Append() on the search path, or -1 when
   _PyCodecRegistry_Init() reports failure, when search_function is NULL
   (PyErr_BadArgument) or when it is not callable (TypeError). */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init() != 0) {
        return -1;
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
52 
53 int
PyCodec_Unregister(PyObject * search_function)54 PyCodec_Unregister(PyObject *search_function)
55 {
56     PyInterpreterState *interp = PyInterpreterState_Get();
57     PyObject *codec_search_path = interp->codec_search_path;
58     /* Do nothing if codec_search_path is not created yet or was cleared. */
59     if (codec_search_path == NULL) {
60         return 0;
61     }
62 
63     assert(PyList_CheckExact(codec_search_path));
64     Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
65     for (Py_ssize_t i = 0; i < n; i++) {
66         PyObject *item = PyList_GET_ITEM(codec_search_path, i);
67         if (item == search_function) {
68             if (interp->codec_search_cache != NULL) {
69                 assert(PyDict_CheckExact(interp->codec_search_cache));
70                 PyDict_Clear(interp->codec_search_cache);
71             }
72             return PyList_SetSlice(codec_search_path, i, i+1, NULL);
73         }
74     }
75     return 0;
76 }
77 
78 extern int _Py_normalize_encoding(const char *, char *, size_t);
79 
80 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
81    converted to lower case, spaces and hyphens are replaced with underscores. */
82 
83 static
normalizestring(const char * string)84 PyObject *normalizestring(const char *string)
85 {
86     size_t len = strlen(string);
87     char *encoding;
88     PyObject *v;
89 
90     if (len > PY_SSIZE_T_MAX) {
91         PyErr_SetString(PyExc_OverflowError, "string is too large");
92         return NULL;
93     }
94 
95     encoding = PyMem_Malloc(len + 1);
96     if (encoding == NULL)
97         return PyErr_NoMemory();
98 
99     if (!_Py_normalize_encoding(string, encoding, len + 1))
100     {
101         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
102         PyMem_Free(encoding);
103         return NULL;
104     }
105 
106     v = PyUnicode_FromString(encoding);
107     PyMem_Free(encoding);
108     return v;
109 }
110 
111 /* Lookup the given encoding and return a tuple providing the codec
112    facilities.
113 
114    The encoding string is looked up converted to all lower-case
115    characters. This makes encodings looked up through this mechanism
116    effectively case-insensitive.
117 
118    If no codec is found, a LookupError is set and NULL returned.
119 
120    As side effect, this tries to load the encodings package, if not
121    yet done. This is part of the lazy load strategy for the encodings
122    package.
123 
124 */
125 
_PyCodec_Lookup(const char * encoding)126 PyObject *_PyCodec_Lookup(const char *encoding)
127 {
128     if (encoding == NULL) {
129         PyErr_BadArgument();
130         return NULL;
131     }
132 
133     PyInterpreterState *interp = _PyInterpreterState_GET();
134     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
135         return NULL;
136     }
137 
138     /* Convert the encoding to a normalized Python string: all
139        characters are converted to lower case, spaces and hyphens are
140        replaced with underscores. */
141     PyObject *v = normalizestring(encoding);
142     if (v == NULL) {
143         return NULL;
144     }
145     PyUnicode_InternInPlace(&v);
146 
147     /* First, try to lookup the name in the registry dictionary */
148     PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
149     if (result != NULL) {
150         Py_INCREF(result);
151         Py_DECREF(v);
152         return result;
153     }
154     else if (PyErr_Occurred()) {
155         goto onError;
156     }
157 
158     /* Next, scan the search functions in order of registration */
159     const Py_ssize_t len = PyList_Size(interp->codec_search_path);
160     if (len < 0)
161         goto onError;
162     if (len == 0) {
163         PyErr_SetString(PyExc_LookupError,
164                         "no codec search functions registered: "
165                         "can't find encoding");
166         goto onError;
167     }
168 
169     Py_ssize_t i;
170     for (i = 0; i < len; i++) {
171         PyObject *func;
172 
173         func = PyList_GetItem(interp->codec_search_path, i);
174         if (func == NULL)
175             goto onError;
176         result = PyObject_CallOneArg(func, v);
177         if (result == NULL)
178             goto onError;
179         if (result == Py_None) {
180             Py_DECREF(result);
181             continue;
182         }
183         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
184             PyErr_SetString(PyExc_TypeError,
185                             "codec search functions must return 4-tuples");
186             Py_DECREF(result);
187             goto onError;
188         }
189         break;
190     }
191     if (i == len) {
192         /* XXX Perhaps we should cache misses too ? */
193         PyErr_Format(PyExc_LookupError,
194                      "unknown encoding: %s", encoding);
195         goto onError;
196     }
197 
198     /* Cache and return the result */
199     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
200         Py_DECREF(result);
201         goto onError;
202     }
203     Py_DECREF(v);
204     return result;
205 
206  onError:
207     Py_DECREF(v);
208     return NULL;
209 }
210 
211 /* Codec registry encoding check API. */
212 
PyCodec_KnownEncoding(const char * encoding)213 int PyCodec_KnownEncoding(const char *encoding)
214 {
215     PyObject *codecs;
216 
217     codecs = _PyCodec_Lookup(encoding);
218     if (!codecs) {
219         PyErr_Clear();
220         return 0;
221     }
222     else {
223         Py_DECREF(codecs);
224         return 1;
225     }
226 }
227 
228 static
args_tuple(PyObject * object,const char * errors)229 PyObject *args_tuple(PyObject *object,
230                      const char *errors)
231 {
232     PyObject *args;
233 
234     args = PyTuple_New(1 + (errors != NULL));
235     if (args == NULL)
236         return NULL;
237     Py_INCREF(object);
238     PyTuple_SET_ITEM(args,0,object);
239     if (errors) {
240         PyObject *v;
241 
242         v = PyUnicode_FromString(errors);
243         if (v == NULL) {
244             Py_DECREF(args);
245             return NULL;
246         }
247         PyTuple_SET_ITEM(args, 1, v);
248     }
249     return args;
250 }
251 
252 /* Helper function to get a codec item */
253 
254 static
codec_getitem(const char * encoding,int index)255 PyObject *codec_getitem(const char *encoding, int index)
256 {
257     PyObject *codecs;
258     PyObject *v;
259 
260     codecs = _PyCodec_Lookup(encoding);
261     if (codecs == NULL)
262         return NULL;
263     v = PyTuple_GET_ITEM(codecs, index);
264     Py_DECREF(codecs);
265     Py_INCREF(v);
266     return v;
267 }
268 
269 /* Helper functions to create an incremental codec. */
270 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)271 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272                                      const char *errors,
273                                      const char *attrname)
274 {
275     PyObject *ret, *inccodec;
276 
277     inccodec = PyObject_GetAttrString(codec_info, attrname);
278     if (inccodec == NULL)
279         return NULL;
280     if (errors)
281         ret = PyObject_CallFunction(inccodec, "s", errors);
282     else
283         ret = _PyObject_CallNoArg(inccodec);
284     Py_DECREF(inccodec);
285     return ret;
286 }
287 
288 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)289 PyObject *codec_getincrementalcodec(const char *encoding,
290                                     const char *errors,
291                                     const char *attrname)
292 {
293     PyObject *codec_info, *ret;
294 
295     codec_info = _PyCodec_Lookup(encoding);
296     if (codec_info == NULL)
297         return NULL;
298     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299     Py_DECREF(codec_info);
300     return ret;
301 }
302 
303 /* Helper function to create a stream codec. */
304 
305 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)306 PyObject *codec_getstreamcodec(const char *encoding,
307                                PyObject *stream,
308                                const char *errors,
309                                const int index)
310 {
311     PyObject *codecs, *streamcodec, *codeccls;
312 
313     codecs = _PyCodec_Lookup(encoding);
314     if (codecs == NULL)
315         return NULL;
316 
317     codeccls = PyTuple_GET_ITEM(codecs, index);
318     if (errors != NULL)
319         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
320     else
321         streamcodec = PyObject_CallOneArg(codeccls, stream);
322     Py_DECREF(codecs);
323     return streamcodec;
324 }
325 
326 /* Helpers to work with the result of _PyCodec_Lookup
327 
328  */
/* Create an incremental decoder by calling codec_info's
   "incrementaldecoder" attribute; errors is forwarded when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    PyObject *decoder = codec_makeincrementalcodec(codec_info, errors,
                                                   "incrementaldecoder");
    return decoder;
}
335 
/* Create an incremental encoder by calling codec_info's
   "incrementalencoder" attribute; errors is forwarded when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    PyObject *encoder = codec_makeincrementalcodec(codec_info, errors,
                                                   "incrementalencoder");
    return encoder;
}
342 
343 
344 /* Convenience APIs to query the Codec registry.
345 
346    All APIs return a codec object with incremented refcount.
347 
348  */
349 
PyCodec_Encoder(const char * encoding)350 PyObject *PyCodec_Encoder(const char *encoding)
351 {
352     return codec_getitem(encoding, 0);
353 }
354 
PyCodec_Decoder(const char * encoding)355 PyObject *PyCodec_Decoder(const char *encoding)
356 {
357     return codec_getitem(encoding, 1);
358 }
359 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)360 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
361                                      const char *errors)
362 {
363     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
364 }
365 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)366 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
367                                      const char *errors)
368 {
369     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
370 }
371 
/* Create a stream reader for stream by calling entry 2 of the codec tuple
   registered for encoding; errors is forwarded when non-NULL. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);
    return reader;
}
378 
/* Create a stream writer for stream by calling entry 3 of the codec tuple
   registered for encoding; errors is forwarded when non-NULL. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);
    return writer;
}
385 
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of the
 * exception raised.
 *
 * operation and encoding are interpolated into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause() replaces the active exception with a suitably
     * updated clone if it can; otherwise the original exception is left
     * alone.  Its return value is deliberately not used here.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
401 
402 /* Encode an object (e.g. a Unicode object) using the given encoding
403    and return the resulting encoded object (usually a Python string).
404 
405    errors is passed to the encoder factory as argument if non-NULL. */
406 
407 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)408 _PyCodec_EncodeInternal(PyObject *object,
409                         PyObject *encoder,
410                         const char *encoding,
411                         const char *errors)
412 {
413     PyObject *args = NULL, *result = NULL;
414     PyObject *v = NULL;
415 
416     args = args_tuple(object, errors);
417     if (args == NULL)
418         goto onError;
419 
420     result = PyObject_Call(encoder, args, NULL);
421     if (result == NULL) {
422         wrap_codec_error("encoding", encoding);
423         goto onError;
424     }
425 
426     if (!PyTuple_Check(result) ||
427         PyTuple_GET_SIZE(result) != 2) {
428         PyErr_SetString(PyExc_TypeError,
429                         "encoder must return a tuple (object, integer)");
430         goto onError;
431     }
432     v = PyTuple_GET_ITEM(result,0);
433     Py_INCREF(v);
434     /* We don't check or use the second (integer) entry. */
435 
436     Py_DECREF(args);
437     Py_DECREF(encoder);
438     Py_DECREF(result);
439     return v;
440 
441  onError:
442     Py_XDECREF(result);
443     Py_XDECREF(args);
444     Py_XDECREF(encoder);
445     return NULL;
446 }
447 
448 /* Decode an object (usually a Python string) using the given encoding
449    and return an equivalent object (e.g. a Unicode object).
450 
451    errors is passed to the decoder factory as argument if non-NULL. */
452 
453 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)454 _PyCodec_DecodeInternal(PyObject *object,
455                         PyObject *decoder,
456                         const char *encoding,
457                         const char *errors)
458 {
459     PyObject *args = NULL, *result = NULL;
460     PyObject *v;
461 
462     args = args_tuple(object, errors);
463     if (args == NULL)
464         goto onError;
465 
466     result = PyObject_Call(decoder, args, NULL);
467     if (result == NULL) {
468         wrap_codec_error("decoding", encoding);
469         goto onError;
470     }
471     if (!PyTuple_Check(result) ||
472         PyTuple_GET_SIZE(result) != 2) {
473         PyErr_SetString(PyExc_TypeError,
474                         "decoder must return a tuple (object,integer)");
475         goto onError;
476     }
477     v = PyTuple_GET_ITEM(result,0);
478     Py_INCREF(v);
479     /* We don't check or use the second (integer) entry. */
480 
481     Py_DECREF(args);
482     Py_DECREF(decoder);
483     Py_DECREF(result);
484     return v;
485 
486  onError:
487     Py_XDECREF(args);
488     Py_XDECREF(decoder);
489     Py_XDECREF(result);
490     return NULL;
491 }
492 
493 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding.  errors is
   forwarded to the encoder when non-NULL.  Returns NULL if the encoder
   cannot be obtained, otherwise the result of _PyCodec_EncodeInternal(),
   which consumes the encoder reference. */
PyObject *
PyCodec_Encode(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
506 
/* Decode object with the decoder registered for encoding.  errors is
   forwarded to the decoder when non-NULL.  Returns NULL if the decoder
   cannot be obtained, otherwise the result of _PyCodec_DecodeInternal(),
   which consumes the decoder reference. */
PyObject *
PyCodec_Decode(PyObject *object, const char *encoding, const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
519 
520 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)521 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
522                                        const char *alternate_command)
523 {
524     _Py_IDENTIFIER(_is_text_encoding);
525     PyObject *codec;
526     PyObject *attr;
527     int is_text_codec;
528 
529     codec = _PyCodec_Lookup(encoding);
530     if (codec == NULL)
531         return NULL;
532 
533     /* Backwards compatibility: assume any raw tuple describes a text
534      * encoding, and the same for anything lacking the private
535      * attribute.
536      */
537     if (!PyTuple_CheckExact(codec)) {
538         if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
539             Py_DECREF(codec);
540             return NULL;
541         }
542         if (attr != NULL) {
543             is_text_codec = PyObject_IsTrue(attr);
544             Py_DECREF(attr);
545             if (is_text_codec <= 0) {
546                 Py_DECREF(codec);
547                 if (!is_text_codec)
548                     PyErr_Format(PyExc_LookupError,
549                                  "'%.400s' is not a text encoding; "
550                                  "use %s to handle arbitrary codecs",
551                                  encoding, alternate_command);
552                 return NULL;
553             }
554         }
555     }
556 
557     /* This appears to be a valid text encoding */
558     return codec;
559 }
560 
561 
562 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)563 PyObject *codec_getitem_checked(const char *encoding,
564                                 const char *alternate_command,
565                                 int index)
566 {
567     PyObject *codec;
568     PyObject *v;
569 
570     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571     if (codec == NULL)
572         return NULL;
573 
574     v = PyTuple_GET_ITEM(codec, index);
575     Py_INCREF(v);
576     Py_DECREF(codec);
577     return v;
578 }
579 
_PyCodec_TextEncoder(const char * encoding)580 static PyObject * _PyCodec_TextEncoder(const char *encoding)
581 {
582     return codec_getitem_checked(encoding, "codecs.encode()", 0);
583 }
584 
_PyCodec_TextDecoder(const char * encoding)585 static PyObject * _PyCodec_TextDecoder(const char *encoding)
586 {
587     return codec_getitem_checked(encoding, "codecs.decode()", 1);
588 }
589 
/* Encode object with the text encoder registered for encoding.  errors is
   forwarded to the encoder when non-NULL.  Returns NULL if no text encoder
   can be obtained, otherwise the result of _PyCodec_EncodeInternal(), which
   consumes the encoder reference. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
602 
/* Decode object with the text decoder registered for encoding.  errors is
   forwarded to the decoder when non-NULL.  Returns NULL if no text decoder
   can be obtained, otherwise the result of _PyCodec_DecodeInternal(), which
   consumes the decoder reference. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
615 
616 /* Register the error handling callback function error under the name
617    name. This function will be called by the codec when it encounters
618    an unencodable characters/undecodable bytes and doesn't know the
619    callback name, when name is specified as the error parameter
620    in the call to the encode/decode function.
621    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)622 int PyCodec_RegisterError(const char *name, PyObject *error)
623 {
624     PyInterpreterState *interp = _PyInterpreterState_GET();
625     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626         return -1;
627     if (!PyCallable_Check(error)) {
628         PyErr_SetString(PyExc_TypeError, "handler must be callable");
629         return -1;
630     }
631     return PyDict_SetItemString(interp->codec_error_registry,
632                                 name, error);
633 }
634 
635 /* Lookup the error handling callback function registered under the
636    name error. As a special case NULL can be passed, in which case
637    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)638 PyObject *PyCodec_LookupError(const char *name)
639 {
640     PyObject *handler = NULL;
641 
642     PyInterpreterState *interp = _PyInterpreterState_GET();
643     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644         return NULL;
645 
646     if (name==NULL)
647         name = "strict";
648     handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649     if (handler) {
650         Py_INCREF(handler);
651     }
652     else if (!PyErr_Occurred()) {
653         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654     }
655     return handler;
656 }
657 
/* Set a TypeError stating that an error callback cannot handle objects of
   exc's type; the type name is truncated to 200 characters. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664 
/* "strict" error handler: raise exc itself when it is an exception
   instance, otherwise set a TypeError.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
673 
674 
/* "ignore" error handler: return the tuple (empty string, end), where end
   is the end position stored in exc.

   exc must be a UnicodeEncodeError, UnicodeDecodeError or
   UnicodeTranslateError; any other type sets TypeError.  Returns NULL with
   an exception set on failure. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    /* "N" hands the new empty string over to the tuple. */
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697 
698 
/* "replace" error handler.  Returns a (replacement, end) tuple:

   - UnicodeEncodeError: one '?' per position in [start, end).
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     position in [start, end).

   Any other exception type sets TypeError.  Returns NULL with an exception
   set on failure. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *marks = PyUnicode_New(count, '?');
        if (marks == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(marks) == PyUnicode_1BYTE_KIND);

        Py_UCS1 *dst = PyUnicode_1BYTE_DATA(marks);
        for (Py_ssize_t k = 0; k < count; k++) {
            dst[k] = '?';
        }
        assert(_PyUnicode_CheckConsistency(marks, 1));
        return Py_BuildValue("(Nn)", marks, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *marks = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (marks == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(marks) == PyUnicode_2BYTE_KIND);

        Py_UCS2 *dst = PyUnicode_2BYTE_DATA(marks);
        for (Py_ssize_t k = 0; k < count; k++) {
            dst[k] = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(marks, 1));
        return Py_BuildValue("(Nn)", marks, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
751 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)752 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753 {
754     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
755         PyObject *restuple;
756         PyObject *object;
757         Py_ssize_t i;
758         Py_ssize_t start;
759         Py_ssize_t end;
760         PyObject *res;
761         Py_UCS1 *outp;
762         Py_ssize_t ressize;
763         Py_UCS4 ch;
764         if (PyUnicodeEncodeError_GetStart(exc, &start))
765             return NULL;
766         if (PyUnicodeEncodeError_GetEnd(exc, &end))
767             return NULL;
768         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769             return NULL;
770         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771             end = start + PY_SSIZE_T_MAX / (2+7+1);
772         for (i = start, ressize = 0; i < end; ++i) {
773             /* object is guaranteed to be "ready" */
774             ch = PyUnicode_READ_CHAR(object, i);
775             if (ch<10)
776                 ressize += 2+1+1;
777             else if (ch<100)
778                 ressize += 2+2+1;
779             else if (ch<1000)
780                 ressize += 2+3+1;
781             else if (ch<10000)
782                 ressize += 2+4+1;
783             else if (ch<100000)
784                 ressize += 2+5+1;
785             else if (ch<1000000)
786                 ressize += 2+6+1;
787             else
788                 ressize += 2+7+1;
789         }
790         /* allocate replacement */
791         res = PyUnicode_New(ressize, 127);
792         if (res == NULL) {
793             Py_DECREF(object);
794             return NULL;
795         }
796         outp = PyUnicode_1BYTE_DATA(res);
797         /* generate replacement */
798         for (i = start; i < end; ++i) {
799             int digits;
800             int base;
801             ch = PyUnicode_READ_CHAR(object, i);
802             *outp++ = '&';
803             *outp++ = '#';
804             if (ch<10) {
805                 digits = 1;
806                 base = 1;
807             }
808             else if (ch<100) {
809                 digits = 2;
810                 base = 10;
811             }
812             else if (ch<1000) {
813                 digits = 3;
814                 base = 100;
815             }
816             else if (ch<10000) {
817                 digits = 4;
818                 base = 1000;
819             }
820             else if (ch<100000) {
821                 digits = 5;
822                 base = 10000;
823             }
824             else if (ch<1000000) {
825                 digits = 6;
826                 base = 100000;
827             }
828             else {
829                 digits = 7;
830                 base = 1000000;
831             }
832             while (digits-->0) {
833                 *outp++ = '0' + ch/base;
834                 ch %= base;
835                 base /= 10;
836             }
837             *outp++ = ';';
838         }
839         assert(_PyUnicode_CheckConsistency(res, 1));
840         restuple = Py_BuildValue("(Nn)", res, end);
841         Py_DECREF(object);
842         return restuple;
843     }
844     else {
845         wrong_exception_type(exc);
846         return NULL;
847     }
848 }
849 
/* "backslashreplace" error handler: replace the offending range with
   backslash escape sequences.

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN", "\uNNNN" or "\UNNNNNNNN", depending on
     the size of its code point.  The handled range is clamped to
     PY_SSIZE_T_MAX / 10 characters.

   Any other exception type sets TypeError.  Returns a (replacement, end)
   tuple, with end reflecting the clamp, or NULL with an exception set. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t, not int: the range below is only clamped to
       PY_SSIZE_T_MAX / 10 characters, each contributing up to 10 output
       characters, so the total can exceed INT_MAX. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Every byte is written as the four characters "\xNN". */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case per character: "\U" + 8 hex digits. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  The two lowest hex digits are common
       to all three forms and are emitted after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956 
957 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
958 
PyCodec_NameReplaceErrors(PyObject * exc)959 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960 {
961     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
962         PyObject *restuple;
963         PyObject *object;
964         Py_ssize_t i;
965         Py_ssize_t start;
966         Py_ssize_t end;
967         PyObject *res;
968         Py_UCS1 *outp;
969         Py_ssize_t ressize;
970         int replsize;
971         Py_UCS4 c;
972         char buffer[256]; /* NAME_MAXLEN */
973         if (PyUnicodeEncodeError_GetStart(exc, &start))
974             return NULL;
975         if (PyUnicodeEncodeError_GetEnd(exc, &end))
976             return NULL;
977         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
978             return NULL;
979         if (!ucnhash_capi) {
980             /* load the unicode data module */
981             ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
982                                             PyUnicodeData_CAPSULE_NAME, 1);
983             if (!ucnhash_capi) {
984                 return NULL;
985             }
986         }
987         for (i = start, ressize = 0; i < end; ++i) {
988             /* object is guaranteed to be "ready" */
989             c = PyUnicode_READ_CHAR(object, i);
990             if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
991                 replsize = 1+1+1+(int)strlen(buffer)+1;
992             }
993             else if (c >= 0x10000) {
994                 replsize = 1+1+8;
995             }
996             else if (c >= 0x100) {
997                 replsize = 1+1+4;
998             }
999             else
1000                 replsize = 1+1+2;
1001             if (ressize > PY_SSIZE_T_MAX - replsize)
1002                 break;
1003             ressize += replsize;
1004         }
1005         end = i;
1006         res = PyUnicode_New(ressize, 127);
1007         if (res==NULL)
1008             return NULL;
1009         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010             i < end; ++i) {
1011             c = PyUnicode_READ_CHAR(object, i);
1012             *outp++ = '\\';
1013             if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1014                 *outp++ = 'N';
1015                 *outp++ = '{';
1016                 strcpy((char *)outp, buffer);
1017                 outp += strlen(buffer);
1018                 *outp++ = '}';
1019                 continue;
1020             }
1021             if (c >= 0x00010000) {
1022                 *outp++ = 'U';
1023                 *outp++ = Py_hexdigits[(c>>28)&0xf];
1024                 *outp++ = Py_hexdigits[(c>>24)&0xf];
1025                 *outp++ = Py_hexdigits[(c>>20)&0xf];
1026                 *outp++ = Py_hexdigits[(c>>16)&0xf];
1027                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1028                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1029             }
1030             else if (c >= 0x100) {
1031                 *outp++ = 'u';
1032                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1033                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1034             }
1035             else
1036                 *outp++ = 'x';
1037             *outp++ = Py_hexdigits[(c>>4)&0xf];
1038             *outp++ = Py_hexdigits[c&0xf];
1039         }
1040 
1041         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1042         assert(_PyUnicode_CheckConsistency(res, 1));
1043         restuple = Py_BuildValue("(Nn)", res, end);
1044         Py_DECREF(object);
1045         return restuple;
1046     }
1047     else {
1048         wrong_exception_type(exc);
1049         return NULL;
1050     }
1051 }
1052 
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify the text that follows "utf16"/"utf32" in an encoding name.

   An empty suffix selects the native byte order; otherwise an optional
   '-' or '_' followed by "be" or "le" (any case) selects an explicit one.
   Returns 'b' (big endian), 'l' (little endian) or 0 if the suffix is not
   recognised. */
static int
get_utf_byte_order(const char *suffix)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return 'b';
#else
        return 'l';
#endif
    }
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    /* Look at suffix[0] before suffix[1]: for a name that ends right after
       the separator (e.g. "utf-16-") suffix[0] is the terminator and
       suffix[1] would lie outside the string. */
    int order = Py_TOLOWER(suffix[0]);
    if ((order == 'b' || order == 'l') &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        return order;
    }
    return 0;
}

/* Map an encoding name to one of the ENC_* constants.

   Recognised spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; the 16/32 variants may carry a byte-order suffix (see
   get_utf_byte_order()).  The exact string "CP_UTF8" is also accepted as
   UTF-8.

   On a successful match *bytelength is set to 3 for UTF-8, 2 for UTF-16
   and 4 for UTF-32 (the number of bytes the surrogatepass handler reads or
   writes per surrogate).  Returns ENC_UNKNOWN for anything else; in that
   case *bytelength must not be relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            switch (get_utf_byte_order(encoding + 2)) {
            case 'b':
                return ENC_UTF16BE;
            case 'l':
                return ENC_UTF16LE;
            default:
                break;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            switch (get_utf_byte_order(encoding + 2)) {
            case 'b':
                return ENC_UTF32BE;
            case 'l':
                return ENC_UTF32LE;
            default:
                break;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118 
1119 /* This handler is declared static until someone demonstrates
1120    a need to call it directly. */
1121 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1122 PyCodec_SurrogatePassErrors(PyObject *exc)
1123 {
1124     PyObject *restuple;
1125     PyObject *object;
1126     PyObject *encode;
1127     const char *encoding;
1128     int code;
1129     int bytelength;
1130     Py_ssize_t i;
1131     Py_ssize_t start;
1132     Py_ssize_t end;
1133     PyObject *res;
1134 
1135     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1136         unsigned char *outp;
1137         if (PyUnicodeEncodeError_GetStart(exc, &start))
1138             return NULL;
1139         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140             return NULL;
1141         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142             return NULL;
1143         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144             Py_DECREF(object);
1145             return NULL;
1146         }
1147         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148             Py_DECREF(object);
1149             Py_DECREF(encode);
1150             return NULL;
1151         }
1152         code = get_standard_encoding(encoding, &bytelength);
1153         Py_DECREF(encode);
1154         if (code == ENC_UNKNOWN) {
1155             /* Not supported, fail with original exception */
1156             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157             Py_DECREF(object);
1158             return NULL;
1159         }
1160 
1161         if (end - start > PY_SSIZE_T_MAX / bytelength)
1162             end = start + PY_SSIZE_T_MAX / bytelength;
1163         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1164         if (!res) {
1165             Py_DECREF(object);
1166             return NULL;
1167         }
1168         outp = (unsigned char*)PyBytes_AsString(res);
1169         for (i = start; i < end; i++) {
1170             /* object is guaranteed to be "ready" */
1171             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1172             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1173                 /* Not a surrogate, fail with original exception */
1174                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175                 Py_DECREF(res);
1176                 Py_DECREF(object);
1177                 return NULL;
1178             }
1179             switch (code) {
1180             case ENC_UTF8:
1181                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184                 break;
1185             case ENC_UTF16LE:
1186                 *outp++ = (unsigned char) ch;
1187                 *outp++ = (unsigned char)(ch >> 8);
1188                 break;
1189             case ENC_UTF16BE:
1190                 *outp++ = (unsigned char)(ch >> 8);
1191                 *outp++ = (unsigned char) ch;
1192                 break;
1193             case ENC_UTF32LE:
1194                 *outp++ = (unsigned char) ch;
1195                 *outp++ = (unsigned char)(ch >> 8);
1196                 *outp++ = (unsigned char)(ch >> 16);
1197                 *outp++ = (unsigned char)(ch >> 24);
1198                 break;
1199             case ENC_UTF32BE:
1200                 *outp++ = (unsigned char)(ch >> 24);
1201                 *outp++ = (unsigned char)(ch >> 16);
1202                 *outp++ = (unsigned char)(ch >> 8);
1203                 *outp++ = (unsigned char) ch;
1204                 break;
1205             }
1206         }
1207         restuple = Py_BuildValue("(On)", res, end);
1208         Py_DECREF(res);
1209         Py_DECREF(object);
1210         return restuple;
1211     }
1212     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1213         const unsigned char *p;
1214         Py_UCS4 ch = 0;
1215         if (PyUnicodeDecodeError_GetStart(exc, &start))
1216             return NULL;
1217         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218             return NULL;
1219         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220             return NULL;
1221         p = (const unsigned char*)PyBytes_AS_STRING(object);
1222         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223             Py_DECREF(object);
1224             return NULL;
1225         }
1226         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227             Py_DECREF(object);
1228             Py_DECREF(encode);
1229             return NULL;
1230         }
1231         code = get_standard_encoding(encoding, &bytelength);
1232         Py_DECREF(encode);
1233         if (code == ENC_UNKNOWN) {
1234             /* Not supported, fail with original exception */
1235             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236             Py_DECREF(object);
1237             return NULL;
1238         }
1239 
1240         /* Try decoding a single surrogate character. If
1241            there are more, let the codec call us again. */
1242         p += start;
1243         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244             switch (code) {
1245             case ENC_UTF8:
1246                 if ((p[0] & 0xf0) == 0xe0 &&
1247                     (p[1] & 0xc0) == 0x80 &&
1248                     (p[2] & 0xc0) == 0x80) {
1249                     /* it's a three-byte code */
1250                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251                 }
1252                 break;
1253             case ENC_UTF16LE:
1254                 ch = p[1] << 8 | p[0];
1255                 break;
1256             case ENC_UTF16BE:
1257                 ch = p[0] << 8 | p[1];
1258                 break;
1259             case ENC_UTF32LE:
1260                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261                 break;
1262             case ENC_UTF32BE:
1263                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264                 break;
1265             }
1266         }
1267 
1268         Py_DECREF(object);
1269         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270             /* it's not a surrogate - fail */
1271             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272             return NULL;
1273         }
1274         res = PyUnicode_FromOrdinal(ch);
1275         if (res == NULL)
1276             return NULL;
1277         return Py_BuildValue("(Nn)", res, start + bytelength);
1278     }
1279     else {
1280         wrong_exception_type(exc);
1281         return NULL;
1282     }
1283 }
1284 
1285 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1286 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1287 {
1288     PyObject *restuple;
1289     PyObject *object;
1290     Py_ssize_t i;
1291     Py_ssize_t start;
1292     Py_ssize_t end;
1293     PyObject *res;
1294 
1295     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1296         char *outp;
1297         if (PyUnicodeEncodeError_GetStart(exc, &start))
1298             return NULL;
1299         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300             return NULL;
1301         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302             return NULL;
1303         res = PyBytes_FromStringAndSize(NULL, end-start);
1304         if (!res) {
1305             Py_DECREF(object);
1306             return NULL;
1307         }
1308         outp = PyBytes_AsString(res);
1309         for (i = start; i < end; i++) {
1310             /* object is guaranteed to be "ready" */
1311             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1312             if (ch < 0xdc80 || ch > 0xdcff) {
1313                 /* Not a UTF-8b surrogate, fail with original exception */
1314                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315                 Py_DECREF(res);
1316                 Py_DECREF(object);
1317                 return NULL;
1318             }
1319             *outp++ = ch - 0xdc00;
1320         }
1321         restuple = Py_BuildValue("(On)", res, end);
1322         Py_DECREF(res);
1323         Py_DECREF(object);
1324         return restuple;
1325     }
1326     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1327         PyObject *str;
1328         const unsigned char *p;
1329         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1330         int consumed = 0;
1331         if (PyUnicodeDecodeError_GetStart(exc, &start))
1332             return NULL;
1333         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334             return NULL;
1335         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336             return NULL;
1337         p = (const unsigned char*)PyBytes_AS_STRING(object);
1338         while (consumed < 4 && consumed < end-start) {
1339             /* Refuse to escape ASCII bytes. */
1340             if (p[start+consumed] < 128)
1341                 break;
1342             ch[consumed] = 0xdc00 + p[start+consumed];
1343             consumed++;
1344         }
1345         Py_DECREF(object);
1346         if (!consumed) {
1347             /* codec complained about ASCII byte. */
1348             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349             return NULL;
1350         }
1351         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352         if (str == NULL)
1353             return NULL;
1354         return Py_BuildValue("(Nn)", str, start+consumed);
1355     }
1356     else {
1357         wrong_exception_type(exc);
1358         return NULL;
1359     }
1360 }
1361 
1362 
/* Callable registered for the "strict" error handler: `self` is unused and
   the exception is forwarded to PyCodec_StrictErrors(). */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1367 
1368 
/* Callable registered for the "ignore" error handler: `self` is unused and
   the exception is forwarded to PyCodec_IgnoreErrors(). */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1373 
1374 
/* Callable registered for the "replace" error handler: `self` is unused and
   the exception is forwarded to PyCodec_ReplaceErrors(). */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1379 
1380 
/* Callable registered for the "xmlcharrefreplace" error handler: `self` is
   unused and the exception is forwarded to
   PyCodec_XMLCharRefReplaceErrors(). */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1385 
1386 
/* Callable registered for the "backslashreplace" error handler: `self` is
   unused and the exception is forwarded to
   PyCodec_BackslashReplaceErrors(). */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1391 
/* Callable registered for the "namereplace" error handler: `self` is unused
   and the exception is forwarded to PyCodec_NameReplaceErrors(). */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1396 
/* Callable registered for the "surrogatepass" error handler: `self` is
   unused and the exception is forwarded to PyCodec_SurrogatePassErrors(). */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1401 
/* Callable registered for the "surrogateescape" error handler: `self` is
   unused and the exception is forwarded to
   PyCodec_SurrogateEscapeErrors(). */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1406 
_PyCodecRegistry_Init(void)1407 static int _PyCodecRegistry_Init(void)
1408 {
1409     static struct {
1410         const char *name;
1411         PyMethodDef def;
1412     } methods[] =
1413     {
1414         {
1415             "strict",
1416             {
1417                 "strict_errors",
1418                 strict_errors,
1419                 METH_O,
1420                 PyDoc_STR("Implements the 'strict' error handling, which "
1421                           "raises a UnicodeError on coding errors.")
1422             }
1423         },
1424         {
1425             "ignore",
1426             {
1427                 "ignore_errors",
1428                 ignore_errors,
1429                 METH_O,
1430                 PyDoc_STR("Implements the 'ignore' error handling, which "
1431                           "ignores malformed data and continues.")
1432             }
1433         },
1434         {
1435             "replace",
1436             {
1437                 "replace_errors",
1438                 replace_errors,
1439                 METH_O,
1440                 PyDoc_STR("Implements the 'replace' error handling, which "
1441                           "replaces malformed data with a replacement marker.")
1442             }
1443         },
1444         {
1445             "xmlcharrefreplace",
1446             {
1447                 "xmlcharrefreplace_errors",
1448                 xmlcharrefreplace_errors,
1449                 METH_O,
1450                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451                           "which replaces an unencodable character with the "
1452                           "appropriate XML character reference.")
1453             }
1454         },
1455         {
1456             "backslashreplace",
1457             {
1458                 "backslashreplace_errors",
1459                 backslashreplace_errors,
1460                 METH_O,
1461                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1462                           "which replaces malformed data with a backslashed "
1463                           "escape sequence.")
1464             }
1465         },
1466         {
1467             "namereplace",
1468             {
1469                 "namereplace_errors",
1470                 namereplace_errors,
1471                 METH_O,
1472                 PyDoc_STR("Implements the 'namereplace' error handling, "
1473                           "which replaces an unencodable character with a "
1474                           "\\N{...} escape sequence.")
1475             }
1476         },
1477         {
1478             "surrogatepass",
1479             {
1480                 "surrogatepass",
1481                 surrogatepass_errors,
1482                 METH_O
1483             }
1484         },
1485         {
1486             "surrogateescape",
1487             {
1488                 "surrogateescape",
1489                 surrogateescape_errors,
1490                 METH_O
1491             }
1492         }
1493     };
1494 
1495     PyInterpreterState *interp = _PyInterpreterState_GET();
1496     PyObject *mod;
1497 
1498     if (interp->codec_search_path != NULL)
1499         return 0;
1500 
1501     interp->codec_search_path = PyList_New(0);
1502     if (interp->codec_search_path == NULL) {
1503         return -1;
1504     }
1505 
1506     interp->codec_search_cache = PyDict_New();
1507     if (interp->codec_search_cache == NULL) {
1508         return -1;
1509     }
1510 
1511     interp->codec_error_registry = PyDict_New();
1512     if (interp->codec_error_registry == NULL) {
1513         return -1;
1514     }
1515 
1516     for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1517         PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518         if (!func) {
1519             return -1;
1520         }
1521 
1522         int res = PyCodec_RegisterError(methods[i].name, func);
1523         Py_DECREF(func);
1524         if (res) {
1525             return -1;
1526         }
1527     }
1528 
1529     mod = PyImport_ImportModuleNoBlock("encodings");
1530     if (mod == NULL) {
1531         return -1;
1532     }
1533     Py_DECREF(mod);
1534     interp->codecs_initialized = 1;
1535     return 0;
1536 }
1537