• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "ucnhash.h"
13 #include <ctype.h>
14 
15 const char *Py_hexdigits = "0123456789abcdef";
16 
17 /* --- Codec Registry ----------------------------------------------------- */
18 
19 /* Import the standard encodings package which will register the first
20    codec search function.
21 
22    This is done in a lazy way so that the Unicode implementation does
23    not downgrade startup time of scripts not needing it.
24 
25    ImportErrors are silently ignored by this function. Only one try is
26    made.
27 
28 */
29 
30 static int _PyCodecRegistry_Init(void); /* Forward */
31 
/* Append search_function to the current interpreter's codec search path.

   The registry is initialized first when codec_search_path is still NULL.
   Returns -1 when that initialization fails, when search_function is NULL
   (PyErr_BadArgument) or when it is not callable (TypeError); otherwise
   returns whatever PyList_Append() returns. */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    if (state->codec_search_path == NULL && _PyCodecRegistry_Init())
        return -1;

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}
50 
51 /* Convert a string to a normalized Python string: all characters are
52    converted to lower case, spaces are replaced with underscores. */
53 
54 static
normalizestring(const char * string)55 PyObject *normalizestring(const char *string)
56 {
57     size_t i;
58     size_t len = strlen(string);
59     char *p;
60     PyObject *v;
61 
62     if (len > PY_SSIZE_T_MAX) {
63         PyErr_SetString(PyExc_OverflowError, "string is too large");
64         return NULL;
65     }
66 
67     p = PyMem_Malloc(len + 1);
68     if (p == NULL)
69         return PyErr_NoMemory();
70     for (i = 0; i < len; i++) {
71         char ch = string[i];
72         if (ch == ' ')
73             ch = '-';
74         else
75             ch = Py_TOLOWER(Py_CHARMASK(ch));
76         p[i] = ch;
77     }
78     p[i] = '\0';
79     v = PyUnicode_FromString(p);
80     if (v == NULL)
81         return NULL;
82     PyMem_Free(p);
83     return v;
84 }
85 
86 /* Lookup the given encoding and return a tuple providing the codec
87    facilities.
88 
89    The encoding string is looked up converted to all lower-case
90    characters. This makes encodings looked up through this mechanism
91    effectively case-insensitive.
92 
93    If no codec is found, a LookupError is set and NULL returned.
94 
95    As side effect, this tries to load the encodings package, if not
96    yet done. This is part of the lazy load strategy for the encodings
97    package.
98 
99 */
100 
_PyCodec_Lookup(const char * encoding)101 PyObject *_PyCodec_Lookup(const char *encoding)
102 {
103     PyInterpreterState *interp;
104     PyObject *result, *args = NULL, *v;
105     Py_ssize_t i, len;
106 
107     if (encoding == NULL) {
108         PyErr_BadArgument();
109         goto onError;
110     }
111 
112     interp = PyThreadState_GET()->interp;
113     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
114         goto onError;
115 
116     /* Convert the encoding to a normalized Python string: all
117        characters are converted to lower case, spaces and hyphens are
118        replaced with underscores. */
119     v = normalizestring(encoding);
120     if (v == NULL)
121         goto onError;
122     PyUnicode_InternInPlace(&v);
123 
124     /* First, try to lookup the name in the registry dictionary */
125     result = PyDict_GetItem(interp->codec_search_cache, v);
126     if (result != NULL) {
127         Py_INCREF(result);
128         Py_DECREF(v);
129         return result;
130     }
131 
132     /* Next, scan the search functions in order of registration */
133     args = PyTuple_New(1);
134     if (args == NULL)
135         goto onError;
136     PyTuple_SET_ITEM(args,0,v);
137 
138     len = PyList_Size(interp->codec_search_path);
139     if (len < 0)
140         goto onError;
141     if (len == 0) {
142         PyErr_SetString(PyExc_LookupError,
143                         "no codec search functions registered: "
144                         "can't find encoding");
145         goto onError;
146     }
147 
148     for (i = 0; i < len; i++) {
149         PyObject *func;
150 
151         func = PyList_GetItem(interp->codec_search_path, i);
152         if (func == NULL)
153             goto onError;
154         result = PyEval_CallObject(func, args);
155         if (result == NULL)
156             goto onError;
157         if (result == Py_None) {
158             Py_DECREF(result);
159             continue;
160         }
161         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162             PyErr_SetString(PyExc_TypeError,
163                             "codec search functions must return 4-tuples");
164             Py_DECREF(result);
165             goto onError;
166         }
167         break;
168     }
169     if (i == len) {
170         /* XXX Perhaps we should cache misses too ? */
171         PyErr_Format(PyExc_LookupError,
172                      "unknown encoding: %s", encoding);
173         goto onError;
174     }
175 
176     /* Cache and return the result */
177     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
178         Py_DECREF(result);
179         goto onError;
180     }
181     Py_DECREF(args);
182     return result;
183 
184  onError:
185     Py_XDECREF(args);
186     return NULL;
187 }
188 
_PyCodec_Forget(const char * encoding)189 int _PyCodec_Forget(const char *encoding)
190 {
191     PyInterpreterState *interp;
192     PyObject *v;
193     int result;
194 
195     interp = PyThreadState_GET()->interp;
196     if (interp->codec_search_path == NULL) {
197         return -1;
198     }
199 
200     /* Convert the encoding to a normalized Python string: all
201        characters are converted to lower case, spaces and hyphens are
202        replaced with underscores. */
203     v = normalizestring(encoding);
204     if (v == NULL) {
205         return -1;
206     }
207 
208     /* Drop the named codec from the internal cache */
209     result = PyDict_DelItem(interp->codec_search_cache, v);
210     Py_DECREF(v);
211 
212     return result;
213 }
214 
215 /* Codec registry encoding check API. */
216 
PyCodec_KnownEncoding(const char * encoding)217 int PyCodec_KnownEncoding(const char *encoding)
218 {
219     PyObject *codecs;
220 
221     codecs = _PyCodec_Lookup(encoding);
222     if (!codecs) {
223         PyErr_Clear();
224         return 0;
225     }
226     else {
227         Py_DECREF(codecs);
228         return 1;
229     }
230 }
231 
232 static
args_tuple(PyObject * object,const char * errors)233 PyObject *args_tuple(PyObject *object,
234                      const char *errors)
235 {
236     PyObject *args;
237 
238     args = PyTuple_New(1 + (errors != NULL));
239     if (args == NULL)
240         return NULL;
241     Py_INCREF(object);
242     PyTuple_SET_ITEM(args,0,object);
243     if (errors) {
244         PyObject *v;
245 
246         v = PyUnicode_FromString(errors);
247         if (v == NULL) {
248             Py_DECREF(args);
249             return NULL;
250         }
251         PyTuple_SET_ITEM(args, 1, v);
252     }
253     return args;
254 }
255 
256 /* Helper function to get a codec item */
257 
258 static
codec_getitem(const char * encoding,int index)259 PyObject *codec_getitem(const char *encoding, int index)
260 {
261     PyObject *codecs;
262     PyObject *v;
263 
264     codecs = _PyCodec_Lookup(encoding);
265     if (codecs == NULL)
266         return NULL;
267     v = PyTuple_GET_ITEM(codecs, index);
268     Py_DECREF(codecs);
269     Py_INCREF(v);
270     return v;
271 }
272 
273 /* Helper functions to create an incremental codec. */
274 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)275 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276                                      const char *errors,
277                                      const char *attrname)
278 {
279     PyObject *ret, *inccodec;
280 
281     inccodec = PyObject_GetAttrString(codec_info, attrname);
282     if (inccodec == NULL)
283         return NULL;
284     if (errors)
285         ret = PyObject_CallFunction(inccodec, "s", errors);
286     else
287         ret = PyObject_CallFunction(inccodec, NULL);
288     Py_DECREF(inccodec);
289     return ret;
290 }
291 
292 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)293 PyObject *codec_getincrementalcodec(const char *encoding,
294                                     const char *errors,
295                                     const char *attrname)
296 {
297     PyObject *codec_info, *ret;
298 
299     codec_info = _PyCodec_Lookup(encoding);
300     if (codec_info == NULL)
301         return NULL;
302     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303     Py_DECREF(codec_info);
304     return ret;
305 }
306 
307 /* Helper function to create a stream codec. */
308 
309 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)310 PyObject *codec_getstreamcodec(const char *encoding,
311                                PyObject *stream,
312                                const char *errors,
313                                const int index)
314 {
315     PyObject *codecs, *streamcodec, *codeccls;
316 
317     codecs = _PyCodec_Lookup(encoding);
318     if (codecs == NULL)
319         return NULL;
320 
321     codeccls = PyTuple_GET_ITEM(codecs, index);
322     if (errors != NULL)
323         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
324     else
325         streamcodec = PyObject_CallFunction(codeccls, "O", stream);
326     Py_DECREF(codecs);
327     return streamcodec;
328 }
329 
330 /* Helpers to work with the result of _PyCodec_Lookup
331 
332  */
/* Instantiate codec_info's "incrementaldecoder" factory, forwarding
   errors when it is non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *decoder;

    decoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementaldecoder");
    return decoder;
}
339 
/* Instantiate codec_info's "incrementalencoder" factory, forwarding
   errors when it is non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *encoder;

    encoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementalencoder");
    return encoder;
}
346 
347 
348 /* Convenience APIs to query the Codec registry.
349 
350    All APIs return a codec object with incremented refcount.
351 
352  */
353 
PyCodec_Encoder(const char * encoding)354 PyObject *PyCodec_Encoder(const char *encoding)
355 {
356     return codec_getitem(encoding, 0);
357 }
358 
PyCodec_Decoder(const char * encoding)359 PyObject *PyCodec_Decoder(const char *encoding)
360 {
361     return codec_getitem(encoding, 1);
362 }
363 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)364 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
365                                      const char *errors)
366 {
367     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
368 }
369 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)370 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
371                                      const char *errors)
372 {
373     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
374 }
375 
/* Call item 2 of the codec tuple for encoding with stream (and errors,
   when non-NULL) and return the result. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);

    return reader;
}
382 
/* Call item 3 of the codec tuple for encoding with stream (and errors,
   when non-NULL) and return the result. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);

    return writer;
}
389 
390 /* Helper that tries to ensure the reported exception chain indicates the
391  * codec that was invoked to trigger the failure without changing the type
392  * of the exception raised.
393  */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Delegate to _PyErr_TrySetFromCause with a message naming the
     * operation and the codec involved.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
405 
406 /* Encode an object (e.g. a Unicode object) using the given encoding
407    and return the resulting encoded object (usually a Python string).
408 
409    errors is passed to the encoder factory as argument if non-NULL. */
410 
411 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)412 _PyCodec_EncodeInternal(PyObject *object,
413                         PyObject *encoder,
414                         const char *encoding,
415                         const char *errors)
416 {
417     PyObject *args = NULL, *result = NULL;
418     PyObject *v = NULL;
419 
420     args = args_tuple(object, errors);
421     if (args == NULL)
422         goto onError;
423 
424     result = PyEval_CallObject(encoder, args);
425     if (result == NULL) {
426         wrap_codec_error("encoding", encoding);
427         goto onError;
428     }
429 
430     if (!PyTuple_Check(result) ||
431         PyTuple_GET_SIZE(result) != 2) {
432         PyErr_SetString(PyExc_TypeError,
433                         "encoder must return a tuple (object, integer)");
434         goto onError;
435     }
436     v = PyTuple_GET_ITEM(result,0);
437     Py_INCREF(v);
438     /* We don't check or use the second (integer) entry. */
439 
440     Py_DECREF(args);
441     Py_DECREF(encoder);
442     Py_DECREF(result);
443     return v;
444 
445  onError:
446     Py_XDECREF(result);
447     Py_XDECREF(args);
448     Py_XDECREF(encoder);
449     return NULL;
450 }
451 
452 /* Decode an object (usually a Python string) using the given encoding
453    and return an equivalent object (e.g. a Unicode object).
454 
455    errors is passed to the decoder factory as argument if non-NULL. */
456 
457 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)458 _PyCodec_DecodeInternal(PyObject *object,
459                         PyObject *decoder,
460                         const char *encoding,
461                         const char *errors)
462 {
463     PyObject *args = NULL, *result = NULL;
464     PyObject *v;
465 
466     args = args_tuple(object, errors);
467     if (args == NULL)
468         goto onError;
469 
470     result = PyEval_CallObject(decoder,args);
471     if (result == NULL) {
472         wrap_codec_error("decoding", encoding);
473         goto onError;
474     }
475     if (!PyTuple_Check(result) ||
476         PyTuple_GET_SIZE(result) != 2) {
477         PyErr_SetString(PyExc_TypeError,
478                         "decoder must return a tuple (object,integer)");
479         goto onError;
480     }
481     v = PyTuple_GET_ITEM(result,0);
482     Py_INCREF(v);
483     /* We don't check or use the second (integer) entry. */
484 
485     Py_DECREF(args);
486     Py_DECREF(decoder);
487     Py_DECREF(result);
488     return v;
489 
490  onError:
491     Py_XDECREF(args);
492     Py_XDECREF(decoder);
493     Py_XDECREF(result);
494     return NULL;
495 }
496 
497 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding.  Returns NULL
   when the encoder cannot be obtained or the encode call fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *codec_encoder = PyCodec_Encoder(encoding);

    if (codec_encoder == NULL)
        return NULL;

    /* _PyCodec_EncodeInternal releases the reference to codec_encoder. */
    return _PyCodec_EncodeInternal(object, codec_encoder, encoding, errors);
}
510 
/* Decode object with the decoder registered for encoding.  Returns NULL
   when the decoder cannot be obtained or the decode call fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *codec_decoder = PyCodec_Decoder(encoding);

    if (codec_decoder == NULL)
        return NULL;

    /* _PyCodec_DecodeInternal releases the reference to codec_decoder. */
    return _PyCodec_DecodeInternal(object, codec_decoder, encoding, errors);
}
523 
524 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)525 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526                                        const char *alternate_command)
527 {
528     _Py_IDENTIFIER(_is_text_encoding);
529     PyObject *codec;
530     PyObject *attr;
531     int is_text_codec;
532 
533     codec = _PyCodec_Lookup(encoding);
534     if (codec == NULL)
535         return NULL;
536 
537     /* Backwards compatibility: assume any raw tuple describes a text
538      * encoding, and the same for anything lacking the private
539      * attribute.
540      */
541     if (!PyTuple_CheckExact(codec)) {
542         attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543         if (attr == NULL) {
544             if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545                 PyErr_Clear();
546             } else {
547                 Py_DECREF(codec);
548                 return NULL;
549             }
550         } else {
551             is_text_codec = PyObject_IsTrue(attr);
552             Py_DECREF(attr);
553             if (is_text_codec <= 0) {
554                 Py_DECREF(codec);
555                 if (!is_text_codec)
556                     PyErr_Format(PyExc_LookupError,
557                                  "'%.400s' is not a text encoding; "
558                                  "use %s to handle arbitrary codecs",
559                                  encoding, alternate_command);
560                 return NULL;
561             }
562         }
563     }
564 
565     /* This appears to be a valid text encoding */
566     return codec;
567 }
568 
569 
570 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)571 PyObject *codec_getitem_checked(const char *encoding,
572                                 const char *alternate_command,
573                                 int index)
574 {
575     PyObject *codec;
576     PyObject *v;
577 
578     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
579     if (codec == NULL)
580         return NULL;
581 
582     v = PyTuple_GET_ITEM(codec, index);
583     Py_INCREF(v);
584     Py_DECREF(codec);
585     return v;
586 }
587 
_PyCodec_TextEncoder(const char * encoding)588 static PyObject * _PyCodec_TextEncoder(const char *encoding)
589 {
590     return codec_getitem_checked(encoding, "codecs.encode()", 0);
591 }
592 
_PyCodec_TextDecoder(const char * encoding)593 static PyObject * _PyCodec_TextDecoder(const char *encoding)
594 {
595     return codec_getitem_checked(encoding, "codecs.decode()", 1);
596 }
597 
/* Encode object using the text-checked encoder for encoding.  Returns
   NULL when the encoder cannot be obtained or the encode call fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *text_encoder = _PyCodec_TextEncoder(encoding);

    if (text_encoder == NULL)
        return NULL;

    /* _PyCodec_EncodeInternal releases the reference to text_encoder. */
    return _PyCodec_EncodeInternal(object, text_encoder, encoding, errors);
}
610 
/* Decode object using the text-checked decoder for encoding.  Returns
   NULL when the decoder cannot be obtained or the decode call fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *text_decoder = _PyCodec_TextDecoder(encoding);

    if (text_decoder == NULL)
        return NULL;

    /* _PyCodec_DecodeInternal releases the reference to text_decoder. */
    return _PyCodec_DecodeInternal(object, text_decoder, encoding, errors);
}
623 
624 /* Register the error handling callback function error under the name
625    name. This function will be called by the codec when it encounters
626    an unencodable characters/undecodable bytes and doesn't know the
627    callback name, when name is specified as the error parameter
628    in the call to the encode/decode function.
629    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)630 int PyCodec_RegisterError(const char *name, PyObject *error)
631 {
632     PyInterpreterState *interp = PyThreadState_GET()->interp;
633     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
634         return -1;
635     if (!PyCallable_Check(error)) {
636         PyErr_SetString(PyExc_TypeError, "handler must be callable");
637         return -1;
638     }
639     return PyDict_SetItemString(interp->codec_error_registry,
640                                 name, error);
641 }
642 
643 /* Lookup the error handling callback function registered under the
644    name error. As a special case NULL can be passed, in which case
645    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)646 PyObject *PyCodec_LookupError(const char *name)
647 {
648     PyObject *handler = NULL;
649 
650     PyInterpreterState *interp = PyThreadState_GET()->interp;
651     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
652         return NULL;
653 
654     if (name==NULL)
655         name = "strict";
656     handler = PyDict_GetItemString(interp->codec_error_registry, name);
657     if (!handler)
658         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
659     else
660         Py_INCREF(handler);
661     return handler;
662 }
663 
/* Set a TypeError naming the type of exc, for error callbacks that were
   handed an exception they do not handle. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
670 
/* "strict" error callback: re-raise exc when it is an exception
   instance, otherwise raise TypeError.  Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
679 
680 
/* "ignore" error callback: return ("", end), where end is the end
   position stored on the Unicode encode/decode/translate error exc.
   Any other exception type yields a TypeError. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t resume_at;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &resume_at);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &resume_at);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &resume_at);
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed)
        return NULL;
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), resume_at);
}
703 
704 
PyCodec_ReplaceErrors(PyObject * exc)705 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
706 {
707     Py_ssize_t start, end, i, len;
708 
709     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
710         PyObject *res;
711         int kind;
712         void *data;
713         if (PyUnicodeEncodeError_GetStart(exc, &start))
714             return NULL;
715         if (PyUnicodeEncodeError_GetEnd(exc, &end))
716             return NULL;
717         len = end - start;
718         res = PyUnicode_New(len, '?');
719         if (res == NULL)
720             return NULL;
721         kind = PyUnicode_KIND(res);
722         data = PyUnicode_DATA(res);
723         for (i = 0; i < len; ++i)
724             PyUnicode_WRITE(kind, data, i, '?');
725         assert(_PyUnicode_CheckConsistency(res, 1));
726         return Py_BuildValue("(Nn)", res, end);
727     }
728     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
729         if (PyUnicodeDecodeError_GetEnd(exc, &end))
730             return NULL;
731         return Py_BuildValue("(Cn)",
732                              (int)Py_UNICODE_REPLACEMENT_CHARACTER,
733                              end);
734     }
735     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
736         PyObject *res;
737         int kind;
738         void *data;
739         if (PyUnicodeTranslateError_GetStart(exc, &start))
740             return NULL;
741         if (PyUnicodeTranslateError_GetEnd(exc, &end))
742             return NULL;
743         len = end - start;
744         res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
745         if (res == NULL)
746             return NULL;
747         kind = PyUnicode_KIND(res);
748         data = PyUnicode_DATA(res);
749         for (i=0; i < len; i++)
750             PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
751         assert(_PyUnicode_CheckConsistency(res, 1));
752         return Py_BuildValue("(Nn)", res, end);
753     }
754     else {
755         wrong_exception_type(exc);
756         return NULL;
757     }
758 }
759 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)760 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
761 {
762     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
763         PyObject *restuple;
764         PyObject *object;
765         Py_ssize_t i;
766         Py_ssize_t start;
767         Py_ssize_t end;
768         PyObject *res;
769         unsigned char *outp;
770         Py_ssize_t ressize;
771         Py_UCS4 ch;
772         if (PyUnicodeEncodeError_GetStart(exc, &start))
773             return NULL;
774         if (PyUnicodeEncodeError_GetEnd(exc, &end))
775             return NULL;
776         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
777             return NULL;
778         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
779             end = start + PY_SSIZE_T_MAX / (2+7+1);
780         for (i = start, ressize = 0; i < end; ++i) {
781             /* object is guaranteed to be "ready" */
782             ch = PyUnicode_READ_CHAR(object, i);
783             if (ch<10)
784                 ressize += 2+1+1;
785             else if (ch<100)
786                 ressize += 2+2+1;
787             else if (ch<1000)
788                 ressize += 2+3+1;
789             else if (ch<10000)
790                 ressize += 2+4+1;
791             else if (ch<100000)
792                 ressize += 2+5+1;
793             else if (ch<1000000)
794                 ressize += 2+6+1;
795             else
796                 ressize += 2+7+1;
797         }
798         /* allocate replacement */
799         res = PyUnicode_New(ressize, 127);
800         if (res == NULL) {
801             Py_DECREF(object);
802             return NULL;
803         }
804         outp = PyUnicode_1BYTE_DATA(res);
805         /* generate replacement */
806         for (i = start; i < end; ++i) {
807             int digits;
808             int base;
809             ch = PyUnicode_READ_CHAR(object, i);
810             *outp++ = '&';
811             *outp++ = '#';
812             if (ch<10) {
813                 digits = 1;
814                 base = 1;
815             }
816             else if (ch<100) {
817                 digits = 2;
818                 base = 10;
819             }
820             else if (ch<1000) {
821                 digits = 3;
822                 base = 100;
823             }
824             else if (ch<10000) {
825                 digits = 4;
826                 base = 1000;
827             }
828             else if (ch<100000) {
829                 digits = 5;
830                 base = 10000;
831             }
832             else if (ch<1000000) {
833                 digits = 6;
834                 base = 100000;
835             }
836             else {
837                 digits = 7;
838                 base = 1000000;
839             }
840             while (digits-->0) {
841                 *outp++ = '0' + ch/base;
842                 ch %= base;
843                 base /= 10;
844             }
845             *outp++ = ';';
846         }
847         assert(_PyUnicode_CheckConsistency(res, 1));
848         restuple = Py_BuildValue("(Nn)", res, end);
849         Py_DECREF(object);
850         return restuple;
851     }
852     else {
853         wrong_exception_type(exc);
854         return NULL;
855     }
856 }
857 
/* "backslashreplace" error callback.

   Decode errors: every byte in [start, end) becomes "\xNN".
   Encode and translate errors: every character in [start, end) becomes
   "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on its code point.
   Returns (replacement, end); any other exception type yields a
   TypeError. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Py_ssize_t, not int: the total is bounded only by the
       PY_SSIZE_T_MAX clamp below, which an int could not hold. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* Each undecodable byte expands to the 4 characters "\xNN". */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Clamp the range so the worst case of 1+1+8 output characters per
       input character stays within PY_SSIZE_T_MAX. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    outp = PyUnicode_1BYTE_DATA(res);
    /* Second pass: emit the escapes.  The wider forms write their extra
       leading hex digits first; the last two digits are shared. */
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
967 
968 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
969 
PyCodec_NameReplaceErrors(PyObject * exc)970 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
971 {
972     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
973         PyObject *restuple;
974         PyObject *object;
975         Py_ssize_t i;
976         Py_ssize_t start;
977         Py_ssize_t end;
978         PyObject *res;
979         unsigned char *outp;
980         Py_ssize_t ressize;
981         int replsize;
982         Py_UCS4 c;
983         char buffer[256]; /* NAME_MAXLEN */
984         if (PyUnicodeEncodeError_GetStart(exc, &start))
985             return NULL;
986         if (PyUnicodeEncodeError_GetEnd(exc, &end))
987             return NULL;
988         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
989             return NULL;
990         if (!ucnhash_CAPI) {
991             /* load the unicode data module */
992             ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
993                                             PyUnicodeData_CAPSULE_NAME, 1);
994             if (!ucnhash_CAPI)
995                 return NULL;
996         }
997         for (i = start, ressize = 0; i < end; ++i) {
998             /* object is guaranteed to be "ready" */
999             c = PyUnicode_READ_CHAR(object, i);
1000             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1001                 replsize = 1+1+1+(int)strlen(buffer)+1;
1002             }
1003             else if (c >= 0x10000) {
1004                 replsize = 1+1+8;
1005             }
1006             else if (c >= 0x100) {
1007                 replsize = 1+1+4;
1008             }
1009             else
1010                 replsize = 1+1+2;
1011             if (ressize > PY_SSIZE_T_MAX - replsize)
1012                 break;
1013             ressize += replsize;
1014         }
1015         end = i;
1016         res = PyUnicode_New(ressize, 127);
1017         if (res==NULL)
1018             return NULL;
1019         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1020             i < end; ++i) {
1021             c = PyUnicode_READ_CHAR(object, i);
1022             *outp++ = '\\';
1023             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1024                 *outp++ = 'N';
1025                 *outp++ = '{';
1026                 strcpy((char *)outp, buffer);
1027                 outp += strlen(buffer);
1028                 *outp++ = '}';
1029                 continue;
1030             }
1031             if (c >= 0x00010000) {
1032                 *outp++ = 'U';
1033                 *outp++ = Py_hexdigits[(c>>28)&0xf];
1034                 *outp++ = Py_hexdigits[(c>>24)&0xf];
1035                 *outp++ = Py_hexdigits[(c>>20)&0xf];
1036                 *outp++ = Py_hexdigits[(c>>16)&0xf];
1037                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1038                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1039             }
1040             else if (c >= 0x100) {
1041                 *outp++ = 'u';
1042                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1043                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1044             }
1045             else
1046                 *outp++ = 'x';
1047             *outp++ = Py_hexdigits[(c>>4)&0xf];
1048             *outp++ = Py_hexdigits[c&0xf];
1049         }
1050 
1051         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1052         assert(_PyUnicode_CheckConsistency(res, 1));
1053         restuple = Py_BuildValue("(Nn)", res, end);
1054         Py_DECREF(object);
1055         return restuple;
1056     }
1057     else {
1058         wrong_exception_type(exc);
1059         return NULL;
1060     }
1061 }
1062 
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Match the byte-order part of a UTF-16/UTF-32 encoding name: an optional
   '-' or '_' followed by "be" or "le" (any case) and the end of the string.
   Returns be_code or le_code on a match, ENC_UNKNOWN otherwise. */
static int
match_byteorder_suffix(const char *suffix, int be_code, int le_code)
{
    int order;

    if (*suffix == '-' || *suffix == '_')
        suffix++;
    /* Inspect suffix[0] first: if it is the terminating NUL (a name such
       as "utf-16-"), suffix[1] lies past the end of the string and must
       not be read. */
    order = Py_TOLOWER(suffix[0]);
    if (order != 'b' && order != 'l')
        return ENC_UNKNOWN;
    if (Py_TOLOWER(suffix[1]) != 'e' || suffix[2] != '\0')
        return ENC_UNKNOWN;
    return order == 'b' ? be_code : le_code;
}

/* Classify an encoding name as one of the ENC_* constants.

   Recognized spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; the latter two may carry an optional byte-order
   suffix ("be"/"le", optionally preceded by '-' or '_').  Without a suffix
   the byte order is chosen by WORDS_BIGENDIAN.  The exact name "CP_UTF8"
   is accepted as UTF-8 as well.

   *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for UTF-32.  It is
   left untouched when the name does not get as far as one of those
   families, so callers should only rely on it when the result is not
   ENC_UNKNOWN. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            return match_byteorder_suffix(encoding, ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            return match_byteorder_suffix(encoding, ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1128 
1129 /* This handler is declared static until someone demonstrates
1130    a need to call it directly. */
1131 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1132 PyCodec_SurrogatePassErrors(PyObject *exc)
1133 {
1134     PyObject *restuple;
1135     PyObject *object;
1136     PyObject *encode;
1137     char *encoding;
1138     int code;
1139     int bytelength;
1140     Py_ssize_t i;
1141     Py_ssize_t start;
1142     Py_ssize_t end;
1143     PyObject *res;
1144 
1145     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1146         unsigned char *outp;
1147         if (PyUnicodeEncodeError_GetStart(exc, &start))
1148             return NULL;
1149         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1150             return NULL;
1151         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1152             return NULL;
1153         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1154             Py_DECREF(object);
1155             return NULL;
1156         }
1157         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1158             Py_DECREF(object);
1159             Py_DECREF(encode);
1160             return NULL;
1161         }
1162         code = get_standard_encoding(encoding, &bytelength);
1163         Py_DECREF(encode);
1164         if (code == ENC_UNKNOWN) {
1165             /* Not supported, fail with original exception */
1166             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1167             Py_DECREF(object);
1168             return NULL;
1169         }
1170 
1171         if (end - start > PY_SSIZE_T_MAX / bytelength)
1172             end = start + PY_SSIZE_T_MAX / bytelength;
1173         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1174         if (!res) {
1175             Py_DECREF(object);
1176             return NULL;
1177         }
1178         outp = (unsigned char*)PyBytes_AsString(res);
1179         for (i = start; i < end; i++) {
1180             /* object is guaranteed to be "ready" */
1181             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1182             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1183                 /* Not a surrogate, fail with original exception */
1184                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1185                 Py_DECREF(res);
1186                 Py_DECREF(object);
1187                 return NULL;
1188             }
1189             switch (code) {
1190             case ENC_UTF8:
1191                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1192                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1193                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1194                 break;
1195             case ENC_UTF16LE:
1196                 *outp++ = (unsigned char) ch;
1197                 *outp++ = (unsigned char)(ch >> 8);
1198                 break;
1199             case ENC_UTF16BE:
1200                 *outp++ = (unsigned char)(ch >> 8);
1201                 *outp++ = (unsigned char) ch;
1202                 break;
1203             case ENC_UTF32LE:
1204                 *outp++ = (unsigned char) ch;
1205                 *outp++ = (unsigned char)(ch >> 8);
1206                 *outp++ = (unsigned char)(ch >> 16);
1207                 *outp++ = (unsigned char)(ch >> 24);
1208                 break;
1209             case ENC_UTF32BE:
1210                 *outp++ = (unsigned char)(ch >> 24);
1211                 *outp++ = (unsigned char)(ch >> 16);
1212                 *outp++ = (unsigned char)(ch >> 8);
1213                 *outp++ = (unsigned char) ch;
1214                 break;
1215             }
1216         }
1217         restuple = Py_BuildValue("(On)", res, end);
1218         Py_DECREF(res);
1219         Py_DECREF(object);
1220         return restuple;
1221     }
1222     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1223         unsigned char *p;
1224         Py_UCS4 ch = 0;
1225         if (PyUnicodeDecodeError_GetStart(exc, &start))
1226             return NULL;
1227         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1228             return NULL;
1229         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1230             return NULL;
1231         if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1232             Py_DECREF(object);
1233             return NULL;
1234         }
1235         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1236             Py_DECREF(object);
1237             return NULL;
1238         }
1239         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1240             Py_DECREF(object);
1241             Py_DECREF(encode);
1242             return NULL;
1243         }
1244         code = get_standard_encoding(encoding, &bytelength);
1245         Py_DECREF(encode);
1246         if (code == ENC_UNKNOWN) {
1247             /* Not supported, fail with original exception */
1248             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1249             Py_DECREF(object);
1250             return NULL;
1251         }
1252 
1253         /* Try decoding a single surrogate character. If
1254            there are more, let the codec call us again. */
1255         p += start;
1256         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1257             switch (code) {
1258             case ENC_UTF8:
1259                 if ((p[0] & 0xf0) == 0xe0 &&
1260                     (p[1] & 0xc0) == 0x80 &&
1261                     (p[2] & 0xc0) == 0x80) {
1262                     /* it's a three-byte code */
1263                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1264                 }
1265                 break;
1266             case ENC_UTF16LE:
1267                 ch = p[1] << 8 | p[0];
1268                 break;
1269             case ENC_UTF16BE:
1270                 ch = p[0] << 8 | p[1];
1271                 break;
1272             case ENC_UTF32LE:
1273                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1274                 break;
1275             case ENC_UTF32BE:
1276                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1277                 break;
1278             }
1279         }
1280 
1281         Py_DECREF(object);
1282         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1283             /* it's not a surrogate - fail */
1284             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1285             return NULL;
1286         }
1287         res = PyUnicode_FromOrdinal(ch);
1288         if (res == NULL)
1289             return NULL;
1290         return Py_BuildValue("(Nn)", res, start + bytelength);
1291     }
1292     else {
1293         wrong_exception_type(exc);
1294         return NULL;
1295     }
1296 }
1297 
1298 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1299 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1300 {
1301     PyObject *restuple;
1302     PyObject *object;
1303     Py_ssize_t i;
1304     Py_ssize_t start;
1305     Py_ssize_t end;
1306     PyObject *res;
1307 
1308     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1309         char *outp;
1310         if (PyUnicodeEncodeError_GetStart(exc, &start))
1311             return NULL;
1312         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1313             return NULL;
1314         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1315             return NULL;
1316         res = PyBytes_FromStringAndSize(NULL, end-start);
1317         if (!res) {
1318             Py_DECREF(object);
1319             return NULL;
1320         }
1321         outp = PyBytes_AsString(res);
1322         for (i = start; i < end; i++) {
1323             /* object is guaranteed to be "ready" */
1324             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1325             if (ch < 0xdc80 || ch > 0xdcff) {
1326                 /* Not a UTF-8b surrogate, fail with original exception */
1327                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1328                 Py_DECREF(res);
1329                 Py_DECREF(object);
1330                 return NULL;
1331             }
1332             *outp++ = ch - 0xdc00;
1333         }
1334         restuple = Py_BuildValue("(On)", res, end);
1335         Py_DECREF(res);
1336         Py_DECREF(object);
1337         return restuple;
1338     }
1339     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1340         PyObject *str;
1341         unsigned char *p;
1342         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1343         int consumed = 0;
1344         if (PyUnicodeDecodeError_GetStart(exc, &start))
1345             return NULL;
1346         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1347             return NULL;
1348         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1349             return NULL;
1350         if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1351             Py_DECREF(object);
1352             return NULL;
1353         }
1354         while (consumed < 4 && consumed < end-start) {
1355             /* Refuse to escape ASCII bytes. */
1356             if (p[start+consumed] < 128)
1357                 break;
1358             ch[consumed] = 0xdc00 + p[start+consumed];
1359             consumed++;
1360         }
1361         Py_DECREF(object);
1362         if (!consumed) {
1363             /* codec complained about ASCII byte. */
1364             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1365             return NULL;
1366         }
1367         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1368         if (str == NULL)
1369             return NULL;
1370         return Py_BuildValue("(Nn)", str, start+consumed);
1371     }
1372     else {
1373         wrong_exception_type(exc);
1374         return NULL;
1375     }
1376 }
1377 
1378 
/* Callable registered under the name "strict": forwards the exception to
   PyCodec_StrictErrors().  self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_StrictErrors(exc);
}
1383 
1384 
/* Callable registered under the name "ignore": forwards the exception to
   PyCodec_IgnoreErrors().  self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_IgnoreErrors(exc);
}
1389 
1390 
/* Callable registered under the name "replace": forwards the exception to
   PyCodec_ReplaceErrors().  self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_ReplaceErrors(exc);
}
1395 
1396 
/* Callable registered under the name "xmlcharrefreplace": forwards the
   exception to PyCodec_XMLCharRefReplaceErrors().  self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1401 
1402 
/* Callable registered under the name "backslashreplace": forwards the
   exception to PyCodec_BackslashReplaceErrors().  self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_BackslashReplaceErrors(exc);
}
1407 
/* Callable registered under the name "namereplace": forwards the
   exception to PyCodec_NameReplaceErrors().  self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_NameReplaceErrors(exc);
}
1412 
/* Callable registered under the name "surrogatepass": forwards the
   exception to PyCodec_SurrogatePassErrors().  self is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogatePassErrors(exc);
}
1417 
/* Callable registered under the name "surrogateescape": forwards the
   exception to PyCodec_SurrogateEscapeErrors().  self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogateEscapeErrors(exc);
}
1422 
_PyCodecRegistry_Init(void)1423 static int _PyCodecRegistry_Init(void)
1424 {
1425     static struct {
1426         char *name;
1427         PyMethodDef def;
1428     } methods[] =
1429     {
1430         {
1431             "strict",
1432             {
1433                 "strict_errors",
1434                 strict_errors,
1435                 METH_O,
1436                 PyDoc_STR("Implements the 'strict' error handling, which "
1437                           "raises a UnicodeError on coding errors.")
1438             }
1439         },
1440         {
1441             "ignore",
1442             {
1443                 "ignore_errors",
1444                 ignore_errors,
1445                 METH_O,
1446                 PyDoc_STR("Implements the 'ignore' error handling, which "
1447                           "ignores malformed data and continues.")
1448             }
1449         },
1450         {
1451             "replace",
1452             {
1453                 "replace_errors",
1454                 replace_errors,
1455                 METH_O,
1456                 PyDoc_STR("Implements the 'replace' error handling, which "
1457                           "replaces malformed data with a replacement marker.")
1458             }
1459         },
1460         {
1461             "xmlcharrefreplace",
1462             {
1463                 "xmlcharrefreplace_errors",
1464                 xmlcharrefreplace_errors,
1465                 METH_O,
1466                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1467                           "which replaces an unencodable character with the "
1468                           "appropriate XML character reference.")
1469             }
1470         },
1471         {
1472             "backslashreplace",
1473             {
1474                 "backslashreplace_errors",
1475                 backslashreplace_errors,
1476                 METH_O,
1477                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1478                           "which replaces malformed data with a backslashed "
1479                           "escape sequence.")
1480             }
1481         },
1482         {
1483             "namereplace",
1484             {
1485                 "namereplace_errors",
1486                 namereplace_errors,
1487                 METH_O,
1488                 PyDoc_STR("Implements the 'namereplace' error handling, "
1489                           "which replaces an unencodable character with a "
1490                           "\\N{...} escape sequence.")
1491             }
1492         },
1493         {
1494             "surrogatepass",
1495             {
1496                 "surrogatepass",
1497                 surrogatepass_errors,
1498                 METH_O
1499             }
1500         },
1501         {
1502             "surrogateescape",
1503             {
1504                 "surrogateescape",
1505                 surrogateescape_errors,
1506                 METH_O
1507             }
1508         }
1509     };
1510 
1511     PyInterpreterState *interp = PyThreadState_GET()->interp;
1512     PyObject *mod;
1513     unsigned i;
1514 
1515     if (interp->codec_search_path != NULL)
1516         return 0;
1517 
1518     interp->codec_search_path = PyList_New(0);
1519     interp->codec_search_cache = PyDict_New();
1520     interp->codec_error_registry = PyDict_New();
1521 
1522     if (interp->codec_error_registry) {
1523         for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1524             PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1525             int res;
1526             if (!func)
1527                 Py_FatalError("can't initialize codec error registry");
1528             res = PyCodec_RegisterError(methods[i].name, func);
1529             Py_DECREF(func);
1530             if (res)
1531                 Py_FatalError("can't initialize codec error registry");
1532         }
1533     }
1534 
1535     if (interp->codec_search_path == NULL ||
1536         interp->codec_search_cache == NULL ||
1537         interp->codec_error_registry == NULL)
1538         Py_FatalError("can't initialize codec registry");
1539 
1540     mod = PyImport_ImportModuleNoBlock("encodings");
1541     if (mod == NULL) {
1542         return -1;
1543     }
1544     Py_DECREF(mod);
1545     interp->codecs_initialized = 1;
1546     return 0;
1547 }
1548