• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "internal/pystate.h"
13 #include "ucnhash.h"
14 #include <ctype.h>
15 
16 const char *Py_hexdigits = "0123456789abcdef";
17 
18 /* --- Codec Registry ----------------------------------------------------- */
19 
20 /* Import the standard encodings package which will register the first
21    codec search function.
22 
23    This is done lazily so that the Unicode implementation does
24    not slow down the startup of scripts that do not need it.
25 
26    ImportErrors are silently ignored by this function. Only one try is
27    made.
28 
29 */
30 
31 static int _PyCodecRegistry_Init(void); /* Forward */
32 
/* Append search_function to the current interpreter's codec search path.

   _PyCodecRegistry_Init() is called first when the search path has not
   been created yet.  Returns -1 if that initialisation fails, if
   search_function is NULL (PyErr_BadArgument) or if it is not callable
   (TypeError); otherwise returns the result of PyList_Append(). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    if (state->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init() != 0) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    if (PyCallable_Check(search_function) == 0) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}
51 
52 /* Convert a string to a normalized Python string: all characters are
53    converted to lower case, spaces are replaced with underscores. */
54 
55 static
normalizestring(const char * string)56 PyObject *normalizestring(const char *string)
57 {
58     size_t i;
59     size_t len = strlen(string);
60     char *p;
61     PyObject *v;
62 
63     if (len > PY_SSIZE_T_MAX) {
64         PyErr_SetString(PyExc_OverflowError, "string is too large");
65         return NULL;
66     }
67 
68     p = PyMem_Malloc(len + 1);
69     if (p == NULL)
70         return PyErr_NoMemory();
71     for (i = 0; i < len; i++) {
72         char ch = string[i];
73         if (ch == ' ')
74             ch = '-';
75         else
76             ch = Py_TOLOWER(Py_CHARMASK(ch));
77         p[i] = ch;
78     }
79     p[i] = '\0';
80     v = PyUnicode_FromString(p);
81     PyMem_Free(p);
82     return v;
83 }
84 
85 /* Lookup the given encoding and return a tuple providing the codec
86    facilities.
87 
88    The encoding string is looked up converted to all lower-case
89    characters. This makes encodings looked up through this mechanism
90    effectively case-insensitive.
91 
92    If no codec is found, a LookupError is set and NULL returned.
93 
94    As side effect, this tries to load the encodings package, if not
95    yet done. This is part of the lazy load strategy for the encodings
96    package.
97 
98 */
99 
_PyCodec_Lookup(const char * encoding)100 PyObject *_PyCodec_Lookup(const char *encoding)
101 {
102     PyInterpreterState *interp;
103     PyObject *result, *args = NULL, *v;
104     Py_ssize_t i, len;
105 
106     if (encoding == NULL) {
107         PyErr_BadArgument();
108         goto onError;
109     }
110 
111     interp = PyThreadState_GET()->interp;
112     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
113         goto onError;
114 
115     /* Convert the encoding to a normalized Python string: all
116        characters are converted to lower case, spaces and hyphens are
117        replaced with underscores. */
118     v = normalizestring(encoding);
119     if (v == NULL)
120         goto onError;
121     PyUnicode_InternInPlace(&v);
122 
123     /* First, try to lookup the name in the registry dictionary */
124     result = PyDict_GetItem(interp->codec_search_cache, v);
125     if (result != NULL) {
126         Py_INCREF(result);
127         Py_DECREF(v);
128         return result;
129     }
130 
131     /* Next, scan the search functions in order of registration */
132     args = PyTuple_New(1);
133     if (args == NULL) {
134         Py_DECREF(v);
135         return NULL;
136     }
137     PyTuple_SET_ITEM(args,0,v);
138 
139     len = PyList_Size(interp->codec_search_path);
140     if (len < 0)
141         goto onError;
142     if (len == 0) {
143         PyErr_SetString(PyExc_LookupError,
144                         "no codec search functions registered: "
145                         "can't find encoding");
146         goto onError;
147     }
148 
149     for (i = 0; i < len; i++) {
150         PyObject *func;
151 
152         func = PyList_GetItem(interp->codec_search_path, i);
153         if (func == NULL)
154             goto onError;
155         result = PyEval_CallObject(func, args);
156         if (result == NULL)
157             goto onError;
158         if (result == Py_None) {
159             Py_DECREF(result);
160             continue;
161         }
162         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
163             PyErr_SetString(PyExc_TypeError,
164                             "codec search functions must return 4-tuples");
165             Py_DECREF(result);
166             goto onError;
167         }
168         break;
169     }
170     if (i == len) {
171         /* XXX Perhaps we should cache misses too ? */
172         PyErr_Format(PyExc_LookupError,
173                      "unknown encoding: %s", encoding);
174         goto onError;
175     }
176 
177     /* Cache and return the result */
178     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
179         Py_DECREF(result);
180         goto onError;
181     }
182     Py_DECREF(args);
183     return result;
184 
185  onError:
186     Py_XDECREF(args);
187     return NULL;
188 }
189 
_PyCodec_Forget(const char * encoding)190 int _PyCodec_Forget(const char *encoding)
191 {
192     PyInterpreterState *interp;
193     PyObject *v;
194     int result;
195 
196     interp = PyThreadState_GET()->interp;
197     if (interp->codec_search_path == NULL) {
198         return -1;
199     }
200 
201     /* Convert the encoding to a normalized Python string: all
202        characters are converted to lower case, spaces and hyphens are
203        replaced with underscores. */
204     v = normalizestring(encoding);
205     if (v == NULL) {
206         return -1;
207     }
208 
209     /* Drop the named codec from the internal cache */
210     result = PyDict_DelItem(interp->codec_search_cache, v);
211     Py_DECREF(v);
212 
213     return result;
214 }
215 
216 /* Codec registry encoding check API. */
217 
PyCodec_KnownEncoding(const char * encoding)218 int PyCodec_KnownEncoding(const char *encoding)
219 {
220     PyObject *codecs;
221 
222     codecs = _PyCodec_Lookup(encoding);
223     if (!codecs) {
224         PyErr_Clear();
225         return 0;
226     }
227     else {
228         Py_DECREF(codecs);
229         return 1;
230     }
231 }
232 
233 static
args_tuple(PyObject * object,const char * errors)234 PyObject *args_tuple(PyObject *object,
235                      const char *errors)
236 {
237     PyObject *args;
238 
239     args = PyTuple_New(1 + (errors != NULL));
240     if (args == NULL)
241         return NULL;
242     Py_INCREF(object);
243     PyTuple_SET_ITEM(args,0,object);
244     if (errors) {
245         PyObject *v;
246 
247         v = PyUnicode_FromString(errors);
248         if (v == NULL) {
249             Py_DECREF(args);
250             return NULL;
251         }
252         PyTuple_SET_ITEM(args, 1, v);
253     }
254     return args;
255 }
256 
257 /* Helper function to get a codec item */
258 
259 static
codec_getitem(const char * encoding,int index)260 PyObject *codec_getitem(const char *encoding, int index)
261 {
262     PyObject *codecs;
263     PyObject *v;
264 
265     codecs = _PyCodec_Lookup(encoding);
266     if (codecs == NULL)
267         return NULL;
268     v = PyTuple_GET_ITEM(codecs, index);
269     Py_DECREF(codecs);
270     Py_INCREF(v);
271     return v;
272 }
273 
274 /* Helper functions to create an incremental codec. */
275 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)276 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
277                                      const char *errors,
278                                      const char *attrname)
279 {
280     PyObject *ret, *inccodec;
281 
282     inccodec = PyObject_GetAttrString(codec_info, attrname);
283     if (inccodec == NULL)
284         return NULL;
285     if (errors)
286         ret = PyObject_CallFunction(inccodec, "s", errors);
287     else
288         ret = _PyObject_CallNoArg(inccodec);
289     Py_DECREF(inccodec);
290     return ret;
291 }
292 
293 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)294 PyObject *codec_getincrementalcodec(const char *encoding,
295                                     const char *errors,
296                                     const char *attrname)
297 {
298     PyObject *codec_info, *ret;
299 
300     codec_info = _PyCodec_Lookup(encoding);
301     if (codec_info == NULL)
302         return NULL;
303     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
304     Py_DECREF(codec_info);
305     return ret;
306 }
307 
308 /* Helper function to create a stream codec. */
309 
310 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)311 PyObject *codec_getstreamcodec(const char *encoding,
312                                PyObject *stream,
313                                const char *errors,
314                                const int index)
315 {
316     PyObject *codecs, *streamcodec, *codeccls;
317 
318     codecs = _PyCodec_Lookup(encoding);
319     if (codecs == NULL)
320         return NULL;
321 
322     codeccls = PyTuple_GET_ITEM(codecs, index);
323     if (errors != NULL)
324         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
325     else
326         streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
327     Py_DECREF(codecs);
328     return streamcodec;
329 }
330 
331 /* Helpers to work with the result of _PyCodec_Lookup
332 
333  */
/* Create an incremental decoder from an already looked-up codec_info by
   calling its "incrementaldecoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_attr = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
340 
/* Create an incremental encoder from an already looked-up codec_info by
   calling its "incrementalencoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_attr = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
347 
348 
349 /* Convenience APIs to query the Codec registry.
350 
351    All APIs return a codec object with incremented refcount.
352 
353  */
354 
PyCodec_Encoder(const char * encoding)355 PyObject *PyCodec_Encoder(const char *encoding)
356 {
357     return codec_getitem(encoding, 0);
358 }
359 
PyCodec_Decoder(const char * encoding)360 PyObject *PyCodec_Decoder(const char *encoding)
361 {
362     return codec_getitem(encoding, 1);
363 }
364 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)365 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
366                                      const char *errors)
367 {
368     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
369 }
370 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)371 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
372                                      const char *errors)
373 {
374     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
375 }
376 
/* Create a stream codec for stream from entry 2 of the codec tuple
   registered for encoding. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_slot = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_slot);
}
383 
/* Create a stream codec for stream from entry 3 of the codec tuple
   registered for encoding. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_slot = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_slot);
}
390 
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception that was raised.
 *
 * The work is delegated to _PyErr_TrySetFromCause(): it replaces the
 * active exception with a suitably updated clone if it can, and leaves
 * the original exception alone otherwise.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
406 
407 /* Encode an object (e.g. a Unicode object) using the given encoding
408    and return the resulting encoded object (usually a Python string).
409 
410    errors is passed to the encoder factory as argument if non-NULL. */
411 
412 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)413 _PyCodec_EncodeInternal(PyObject *object,
414                         PyObject *encoder,
415                         const char *encoding,
416                         const char *errors)
417 {
418     PyObject *args = NULL, *result = NULL;
419     PyObject *v = NULL;
420 
421     args = args_tuple(object, errors);
422     if (args == NULL)
423         goto onError;
424 
425     result = PyEval_CallObject(encoder, args);
426     if (result == NULL) {
427         wrap_codec_error("encoding", encoding);
428         goto onError;
429     }
430 
431     if (!PyTuple_Check(result) ||
432         PyTuple_GET_SIZE(result) != 2) {
433         PyErr_SetString(PyExc_TypeError,
434                         "encoder must return a tuple (object, integer)");
435         goto onError;
436     }
437     v = PyTuple_GET_ITEM(result,0);
438     Py_INCREF(v);
439     /* We don't check or use the second (integer) entry. */
440 
441     Py_DECREF(args);
442     Py_DECREF(encoder);
443     Py_DECREF(result);
444     return v;
445 
446  onError:
447     Py_XDECREF(result);
448     Py_XDECREF(args);
449     Py_XDECREF(encoder);
450     return NULL;
451 }
452 
453 /* Decode an object (usually a Python string) using the given encoding
454    and return an equivalent object (e.g. a Unicode object).
455 
456    errors is passed to the decoder factory as argument if non-NULL. */
457 
458 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)459 _PyCodec_DecodeInternal(PyObject *object,
460                         PyObject *decoder,
461                         const char *encoding,
462                         const char *errors)
463 {
464     PyObject *args = NULL, *result = NULL;
465     PyObject *v;
466 
467     args = args_tuple(object, errors);
468     if (args == NULL)
469         goto onError;
470 
471     result = PyEval_CallObject(decoder,args);
472     if (result == NULL) {
473         wrap_codec_error("decoding", encoding);
474         goto onError;
475     }
476     if (!PyTuple_Check(result) ||
477         PyTuple_GET_SIZE(result) != 2) {
478         PyErr_SetString(PyExc_TypeError,
479                         "decoder must return a tuple (object,integer)");
480         goto onError;
481     }
482     v = PyTuple_GET_ITEM(result,0);
483     Py_INCREF(v);
484     /* We don't check or use the second (integer) entry. */
485 
486     Py_DECREF(args);
487     Py_DECREF(decoder);
488     Py_DECREF(result);
489     return v;
490 
491  onError:
492     Py_XDECREF(args);
493     Py_XDECREF(decoder);
494     Py_XDECREF(result);
495     return NULL;
496 }
497 
498 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding; errors is
   forwarded to the encoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* The internal helper releases our reference to encoder. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
511 
/* Decode object with the decoder registered for encoding; errors is
   forwarded to the decoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* The internal helper releases our reference to decoder. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
524 
525 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)526 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
527                                        const char *alternate_command)
528 {
529     _Py_IDENTIFIER(_is_text_encoding);
530     PyObject *codec;
531     PyObject *attr;
532     int is_text_codec;
533 
534     codec = _PyCodec_Lookup(encoding);
535     if (codec == NULL)
536         return NULL;
537 
538     /* Backwards compatibility: assume any raw tuple describes a text
539      * encoding, and the same for anything lacking the private
540      * attribute.
541      */
542     if (!PyTuple_CheckExact(codec)) {
543         if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
544             Py_DECREF(codec);
545             return NULL;
546         }
547         if (attr != NULL) {
548             is_text_codec = PyObject_IsTrue(attr);
549             Py_DECREF(attr);
550             if (is_text_codec <= 0) {
551                 Py_DECREF(codec);
552                 if (!is_text_codec)
553                     PyErr_Format(PyExc_LookupError,
554                                  "'%.400s' is not a text encoding; "
555                                  "use %s to handle arbitrary codecs",
556                                  encoding, alternate_command);
557                 return NULL;
558             }
559         }
560     }
561 
562     /* This appears to be a valid text encoding */
563     return codec;
564 }
565 
566 
567 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)568 PyObject *codec_getitem_checked(const char *encoding,
569                                 const char *alternate_command,
570                                 int index)
571 {
572     PyObject *codec;
573     PyObject *v;
574 
575     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
576     if (codec == NULL)
577         return NULL;
578 
579     v = PyTuple_GET_ITEM(codec, index);
580     Py_INCREF(v);
581     Py_DECREF(codec);
582     return v;
583 }
584 
_PyCodec_TextEncoder(const char * encoding)585 static PyObject * _PyCodec_TextEncoder(const char *encoding)
586 {
587     return codec_getitem_checked(encoding, "codecs.encode()", 0);
588 }
589 
_PyCodec_TextDecoder(const char * encoding)590 static PyObject * _PyCodec_TextDecoder(const char *encoding)
591 {
592     return codec_getitem_checked(encoding, "codecs.decode()", 1);
593 }
594 
/* Encode object with the text encoder registered for encoding; codecs
   flagged as non-text are refused by the encoder lookup.  Returns a new
   reference, or NULL with an exception set. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* The internal helper releases our reference to encoder. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
607 
/* Decode object with the text decoder registered for encoding; codecs
   flagged as non-text are refused by the decoder lookup.  Returns a new
   reference, or NULL with an exception set. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* The internal helper releases our reference to decoder. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
620 
621 /* Register the error handling callback function error under the name
622    name. This function will be called by the codec when it encounters
623    an unencodable characters/undecodable bytes and doesn't know the
624    callback name, when name is specified as the error parameter
625    in the call to the encode/decode function.
626    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)627 int PyCodec_RegisterError(const char *name, PyObject *error)
628 {
629     PyInterpreterState *interp = PyThreadState_GET()->interp;
630     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
631         return -1;
632     if (!PyCallable_Check(error)) {
633         PyErr_SetString(PyExc_TypeError, "handler must be callable");
634         return -1;
635     }
636     return PyDict_SetItemString(interp->codec_error_registry,
637                                 name, error);
638 }
639 
640 /* Lookup the error handling callback function registered under the
641    name error. As a special case NULL can be passed, in which case
642    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)643 PyObject *PyCodec_LookupError(const char *name)
644 {
645     PyObject *handler = NULL;
646 
647     PyInterpreterState *interp = PyThreadState_GET()->interp;
648     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
649         return NULL;
650 
651     if (name==NULL)
652         name = "strict";
653     handler = PyDict_GetItemString(interp->codec_error_registry, name);
654     if (!handler)
655         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
656     else
657         Py_INCREF(handler);
658     return handler;
659 }
660 
/* Set a TypeError naming the type of exc; used by the error callbacks in
   this file when they are handed an object they do not handle. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
667 
/* Error callback that re-raises: exc becomes the current exception.  If
   exc is not an exception instance a TypeError is set instead.  Always
   returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
676 
677 
/* Error callback that skips the offending input: returns the tuple
   (empty string, end), where end is read from the Unicode
   encode/decode/translate error passed in.  Any other object yields a
   TypeError and NULL. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int status;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        status = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        status = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        status = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (status) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
700 
701 
PyCodec_ReplaceErrors(PyObject * exc)702 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
703 {
704     Py_ssize_t start, end, i, len;
705 
706     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
707         PyObject *res;
708         int kind;
709         void *data;
710         if (PyUnicodeEncodeError_GetStart(exc, &start))
711             return NULL;
712         if (PyUnicodeEncodeError_GetEnd(exc, &end))
713             return NULL;
714         len = end - start;
715         res = PyUnicode_New(len, '?');
716         if (res == NULL)
717             return NULL;
718         kind = PyUnicode_KIND(res);
719         data = PyUnicode_DATA(res);
720         for (i = 0; i < len; ++i)
721             PyUnicode_WRITE(kind, data, i, '?');
722         assert(_PyUnicode_CheckConsistency(res, 1));
723         return Py_BuildValue("(Nn)", res, end);
724     }
725     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
726         if (PyUnicodeDecodeError_GetEnd(exc, &end))
727             return NULL;
728         return Py_BuildValue("(Cn)",
729                              (int)Py_UNICODE_REPLACEMENT_CHARACTER,
730                              end);
731     }
732     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
733         PyObject *res;
734         int kind;
735         void *data;
736         if (PyUnicodeTranslateError_GetStart(exc, &start))
737             return NULL;
738         if (PyUnicodeTranslateError_GetEnd(exc, &end))
739             return NULL;
740         len = end - start;
741         res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
742         if (res == NULL)
743             return NULL;
744         kind = PyUnicode_KIND(res);
745         data = PyUnicode_DATA(res);
746         for (i=0; i < len; i++)
747             PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
748         assert(_PyUnicode_CheckConsistency(res, 1));
749         return Py_BuildValue("(Nn)", res, end);
750     }
751     else {
752         wrong_exception_type(exc);
753         return NULL;
754     }
755 }
756 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)757 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
758 {
759     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
760         PyObject *restuple;
761         PyObject *object;
762         Py_ssize_t i;
763         Py_ssize_t start;
764         Py_ssize_t end;
765         PyObject *res;
766         unsigned char *outp;
767         Py_ssize_t ressize;
768         Py_UCS4 ch;
769         if (PyUnicodeEncodeError_GetStart(exc, &start))
770             return NULL;
771         if (PyUnicodeEncodeError_GetEnd(exc, &end))
772             return NULL;
773         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
774             return NULL;
775         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
776             end = start + PY_SSIZE_T_MAX / (2+7+1);
777         for (i = start, ressize = 0; i < end; ++i) {
778             /* object is guaranteed to be "ready" */
779             ch = PyUnicode_READ_CHAR(object, i);
780             if (ch<10)
781                 ressize += 2+1+1;
782             else if (ch<100)
783                 ressize += 2+2+1;
784             else if (ch<1000)
785                 ressize += 2+3+1;
786             else if (ch<10000)
787                 ressize += 2+4+1;
788             else if (ch<100000)
789                 ressize += 2+5+1;
790             else if (ch<1000000)
791                 ressize += 2+6+1;
792             else
793                 ressize += 2+7+1;
794         }
795         /* allocate replacement */
796         res = PyUnicode_New(ressize, 127);
797         if (res == NULL) {
798             Py_DECREF(object);
799             return NULL;
800         }
801         outp = PyUnicode_1BYTE_DATA(res);
802         /* generate replacement */
803         for (i = start; i < end; ++i) {
804             int digits;
805             int base;
806             ch = PyUnicode_READ_CHAR(object, i);
807             *outp++ = '&';
808             *outp++ = '#';
809             if (ch<10) {
810                 digits = 1;
811                 base = 1;
812             }
813             else if (ch<100) {
814                 digits = 2;
815                 base = 10;
816             }
817             else if (ch<1000) {
818                 digits = 3;
819                 base = 100;
820             }
821             else if (ch<10000) {
822                 digits = 4;
823                 base = 1000;
824             }
825             else if (ch<100000) {
826                 digits = 5;
827                 base = 10000;
828             }
829             else if (ch<1000000) {
830                 digits = 6;
831                 base = 100000;
832             }
833             else {
834                 digits = 7;
835                 base = 1000000;
836             }
837             while (digits-->0) {
838                 *outp++ = '0' + ch/base;
839                 ch %= base;
840                 base /= 10;
841             }
842             *outp++ = ';';
843         }
844         assert(_PyUnicode_CheckConsistency(res, 1));
845         restuple = Py_BuildValue("(Nn)", res, end);
846         Py_DECREF(object);
847         return restuple;
848     }
849     else {
850         wrong_exception_type(exc);
851         return NULL;
852     }
853 }
854 
/* Error callback that replaces the offending input with backslash escape
   sequences and returns the tuple (replacement, end).

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on
     its code point.

   Any other object yields a TypeError and NULL.  When the range is so
   long that the replacement size could not be represented, end is
   lowered and the lowered value is the one returned in the tuple. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Must be Py_ssize_t: up to 10 output characters are accumulated for
       each of up to PY_SSIZE_T_MAX/10 input characters, which overflows
       an int and would under-size the buffer written to below. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Four output characters per byte: keep 4 * (end - start) from
           overflowing, the same way the character branch below limits
           its range. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* At most 1+1+8 output characters per input character. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    outp = PyUnicode_1BYTE_DATA(res);
    /* Second pass: write the escapes.  The two lowest hex digits are
       common to all three forms and are emitted after the if/else. */
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
961 
962 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
963 
PyCodec_NameReplaceErrors(PyObject * exc)964 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
965 {
966     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
967         PyObject *restuple;
968         PyObject *object;
969         Py_ssize_t i;
970         Py_ssize_t start;
971         Py_ssize_t end;
972         PyObject *res;
973         unsigned char *outp;
974         Py_ssize_t ressize;
975         int replsize;
976         Py_UCS4 c;
977         char buffer[256]; /* NAME_MAXLEN */
978         if (PyUnicodeEncodeError_GetStart(exc, &start))
979             return NULL;
980         if (PyUnicodeEncodeError_GetEnd(exc, &end))
981             return NULL;
982         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
983             return NULL;
984         if (!ucnhash_CAPI) {
985             /* load the unicode data module */
986             ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
987                                             PyUnicodeData_CAPSULE_NAME, 1);
988             if (!ucnhash_CAPI)
989                 return NULL;
990         }
991         for (i = start, ressize = 0; i < end; ++i) {
992             /* object is guaranteed to be "ready" */
993             c = PyUnicode_READ_CHAR(object, i);
994             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
995                 replsize = 1+1+1+(int)strlen(buffer)+1;
996             }
997             else if (c >= 0x10000) {
998                 replsize = 1+1+8;
999             }
1000             else if (c >= 0x100) {
1001                 replsize = 1+1+4;
1002             }
1003             else
1004                 replsize = 1+1+2;
1005             if (ressize > PY_SSIZE_T_MAX - replsize)
1006                 break;
1007             ressize += replsize;
1008         }
1009         end = i;
1010         res = PyUnicode_New(ressize, 127);
1011         if (res==NULL)
1012             return NULL;
1013         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1014             i < end; ++i) {
1015             c = PyUnicode_READ_CHAR(object, i);
1016             *outp++ = '\\';
1017             if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1018                 *outp++ = 'N';
1019                 *outp++ = '{';
1020                 strcpy((char *)outp, buffer);
1021                 outp += strlen(buffer);
1022                 *outp++ = '}';
1023                 continue;
1024             }
1025             if (c >= 0x00010000) {
1026                 *outp++ = 'U';
1027                 *outp++ = Py_hexdigits[(c>>28)&0xf];
1028                 *outp++ = Py_hexdigits[(c>>24)&0xf];
1029                 *outp++ = Py_hexdigits[(c>>20)&0xf];
1030                 *outp++ = Py_hexdigits[(c>>16)&0xf];
1031                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1032                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1033             }
1034             else if (c >= 0x100) {
1035                 *outp++ = 'u';
1036                 *outp++ = Py_hexdigits[(c>>12)&0xf];
1037                 *outp++ = Py_hexdigits[(c>>8)&0xf];
1038             }
1039             else
1040                 *outp++ = 'x';
1041             *outp++ = Py_hexdigits[(c>>4)&0xf];
1042             *outp++ = Py_hexdigits[c&0xf];
1043         }
1044 
1045         assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1046         assert(_PyUnicode_CheckConsistency(res, 1));
1047         restuple = Py_BuildValue("(Nn)", res, end);
1048         Py_DECREF(object);
1049         return restuple;
1050     }
1051     else {
1052         wrong_exception_type(exc);
1053         return NULL;
1054     }
1055 }
1056 
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the UTF encodings above.

   Accepted spellings are "utf" (any letter case), an optional '-' or '_',
   then "8", "16" or "32"; "16" and "32" may be followed by an optional
   '-' or '_' and "be" / "le" (any letter case).  Without a byte-order
   suffix the platform's native order (WORDS_BIGENDIAN) is used.  The exact
   string "CP_UTF8" is also recognised as UTF-8.

   encoding must be a NUL-terminated string.  On a match the ENC_* constant
   is returned and *bytelength receives 3 for UTF-8, 2 for UTF-16 and 4 for
   UTF-32.  Otherwise ENC_UNKNOWN is returned; *bytelength may already have
   been written in that case and must not be relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* After skipping a trailing separator (e.g. "utf-16-"),
               encoding may point at the terminating NUL; check encoding[0]
               first so encoding[1] is never read past the end. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: do not look beyond the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1122 
1123 /* This handler is declared static until someone demonstrates
1124    a need to call it directly. */
1125 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1126 PyCodec_SurrogatePassErrors(PyObject *exc)
1127 {
1128     PyObject *restuple;
1129     PyObject *object;
1130     PyObject *encode;
1131     const char *encoding;
1132     int code;
1133     int bytelength;
1134     Py_ssize_t i;
1135     Py_ssize_t start;
1136     Py_ssize_t end;
1137     PyObject *res;
1138 
1139     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1140         unsigned char *outp;
1141         if (PyUnicodeEncodeError_GetStart(exc, &start))
1142             return NULL;
1143         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1144             return NULL;
1145         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1146             return NULL;
1147         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1148             Py_DECREF(object);
1149             return NULL;
1150         }
1151         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1152             Py_DECREF(object);
1153             Py_DECREF(encode);
1154             return NULL;
1155         }
1156         code = get_standard_encoding(encoding, &bytelength);
1157         Py_DECREF(encode);
1158         if (code == ENC_UNKNOWN) {
1159             /* Not supported, fail with original exception */
1160             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1161             Py_DECREF(object);
1162             return NULL;
1163         }
1164 
1165         if (end - start > PY_SSIZE_T_MAX / bytelength)
1166             end = start + PY_SSIZE_T_MAX / bytelength;
1167         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1168         if (!res) {
1169             Py_DECREF(object);
1170             return NULL;
1171         }
1172         outp = (unsigned char*)PyBytes_AsString(res);
1173         for (i = start; i < end; i++) {
1174             /* object is guaranteed to be "ready" */
1175             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1176             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1177                 /* Not a surrogate, fail with original exception */
1178                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1179                 Py_DECREF(res);
1180                 Py_DECREF(object);
1181                 return NULL;
1182             }
1183             switch (code) {
1184             case ENC_UTF8:
1185                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1186                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1187                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1188                 break;
1189             case ENC_UTF16LE:
1190                 *outp++ = (unsigned char) ch;
1191                 *outp++ = (unsigned char)(ch >> 8);
1192                 break;
1193             case ENC_UTF16BE:
1194                 *outp++ = (unsigned char)(ch >> 8);
1195                 *outp++ = (unsigned char) ch;
1196                 break;
1197             case ENC_UTF32LE:
1198                 *outp++ = (unsigned char) ch;
1199                 *outp++ = (unsigned char)(ch >> 8);
1200                 *outp++ = (unsigned char)(ch >> 16);
1201                 *outp++ = (unsigned char)(ch >> 24);
1202                 break;
1203             case ENC_UTF32BE:
1204                 *outp++ = (unsigned char)(ch >> 24);
1205                 *outp++ = (unsigned char)(ch >> 16);
1206                 *outp++ = (unsigned char)(ch >> 8);
1207                 *outp++ = (unsigned char) ch;
1208                 break;
1209             }
1210         }
1211         restuple = Py_BuildValue("(On)", res, end);
1212         Py_DECREF(res);
1213         Py_DECREF(object);
1214         return restuple;
1215     }
1216     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1217         const unsigned char *p;
1218         Py_UCS4 ch = 0;
1219         if (PyUnicodeDecodeError_GetStart(exc, &start))
1220             return NULL;
1221         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1222             return NULL;
1223         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1224             return NULL;
1225         p = (const unsigned char*)PyBytes_AS_STRING(object);
1226         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1227             Py_DECREF(object);
1228             return NULL;
1229         }
1230         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1231             Py_DECREF(object);
1232             Py_DECREF(encode);
1233             return NULL;
1234         }
1235         code = get_standard_encoding(encoding, &bytelength);
1236         Py_DECREF(encode);
1237         if (code == ENC_UNKNOWN) {
1238             /* Not supported, fail with original exception */
1239             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1240             Py_DECREF(object);
1241             return NULL;
1242         }
1243 
1244         /* Try decoding a single surrogate character. If
1245            there are more, let the codec call us again. */
1246         p += start;
1247         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1248             switch (code) {
1249             case ENC_UTF8:
1250                 if ((p[0] & 0xf0) == 0xe0 &&
1251                     (p[1] & 0xc0) == 0x80 &&
1252                     (p[2] & 0xc0) == 0x80) {
1253                     /* it's a three-byte code */
1254                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1255                 }
1256                 break;
1257             case ENC_UTF16LE:
1258                 ch = p[1] << 8 | p[0];
1259                 break;
1260             case ENC_UTF16BE:
1261                 ch = p[0] << 8 | p[1];
1262                 break;
1263             case ENC_UTF32LE:
1264                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1265                 break;
1266             case ENC_UTF32BE:
1267                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1268                 break;
1269             }
1270         }
1271 
1272         Py_DECREF(object);
1273         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1274             /* it's not a surrogate - fail */
1275             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1276             return NULL;
1277         }
1278         res = PyUnicode_FromOrdinal(ch);
1279         if (res == NULL)
1280             return NULL;
1281         return Py_BuildValue("(Nn)", res, start + bytelength);
1282     }
1283     else {
1284         wrong_exception_type(exc);
1285         return NULL;
1286     }
1287 }
1288 
1289 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1290 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1291 {
1292     PyObject *restuple;
1293     PyObject *object;
1294     Py_ssize_t i;
1295     Py_ssize_t start;
1296     Py_ssize_t end;
1297     PyObject *res;
1298 
1299     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1300         char *outp;
1301         if (PyUnicodeEncodeError_GetStart(exc, &start))
1302             return NULL;
1303         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1304             return NULL;
1305         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1306             return NULL;
1307         res = PyBytes_FromStringAndSize(NULL, end-start);
1308         if (!res) {
1309             Py_DECREF(object);
1310             return NULL;
1311         }
1312         outp = PyBytes_AsString(res);
1313         for (i = start; i < end; i++) {
1314             /* object is guaranteed to be "ready" */
1315             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1316             if (ch < 0xdc80 || ch > 0xdcff) {
1317                 /* Not a UTF-8b surrogate, fail with original exception */
1318                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1319                 Py_DECREF(res);
1320                 Py_DECREF(object);
1321                 return NULL;
1322             }
1323             *outp++ = ch - 0xdc00;
1324         }
1325         restuple = Py_BuildValue("(On)", res, end);
1326         Py_DECREF(res);
1327         Py_DECREF(object);
1328         return restuple;
1329     }
1330     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1331         PyObject *str;
1332         const unsigned char *p;
1333         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1334         int consumed = 0;
1335         if (PyUnicodeDecodeError_GetStart(exc, &start))
1336             return NULL;
1337         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1338             return NULL;
1339         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1340             return NULL;
1341         p = (const unsigned char*)PyBytes_AS_STRING(object);
1342         while (consumed < 4 && consumed < end-start) {
1343             /* Refuse to escape ASCII bytes. */
1344             if (p[start+consumed] < 128)
1345                 break;
1346             ch[consumed] = 0xdc00 + p[start+consumed];
1347             consumed++;
1348         }
1349         Py_DECREF(object);
1350         if (!consumed) {
1351             /* codec complained about ASCII byte. */
1352             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1353             return NULL;
1354         }
1355         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1356         if (str == NULL)
1357             return NULL;
1358         return Py_BuildValue("(Nn)", str, start+consumed);
1359     }
1360     else {
1361         wrong_exception_type(exc);
1362         return NULL;
1363     }
1364 }
1365 
1366 
/* METH_O adapter for PyCodec_StrictErrors(); self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1371 
1372 
/* METH_O adapter for PyCodec_IgnoreErrors(); self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1377 
1378 
/* METH_O adapter for PyCodec_ReplaceErrors(); self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1383 
1384 
/* METH_O adapter for PyCodec_XMLCharRefReplaceErrors(); self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1389 
1390 
/* METH_O adapter for PyCodec_BackslashReplaceErrors(); self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1395 
/* METH_O adapter for PyCodec_NameReplaceErrors(); self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1400 
/* METH_O adapter for PyCodec_SurrogatePassErrors(); self is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1405 
/* METH_O adapter for PyCodec_SurrogateEscapeErrors(); self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1410 
_PyCodecRegistry_Init(void)1411 static int _PyCodecRegistry_Init(void)
1412 {
1413     static struct {
1414         char *name;
1415         PyMethodDef def;
1416     } methods[] =
1417     {
1418         {
1419             "strict",
1420             {
1421                 "strict_errors",
1422                 strict_errors,
1423                 METH_O,
1424                 PyDoc_STR("Implements the 'strict' error handling, which "
1425                           "raises a UnicodeError on coding errors.")
1426             }
1427         },
1428         {
1429             "ignore",
1430             {
1431                 "ignore_errors",
1432                 ignore_errors,
1433                 METH_O,
1434                 PyDoc_STR("Implements the 'ignore' error handling, which "
1435                           "ignores malformed data and continues.")
1436             }
1437         },
1438         {
1439             "replace",
1440             {
1441                 "replace_errors",
1442                 replace_errors,
1443                 METH_O,
1444                 PyDoc_STR("Implements the 'replace' error handling, which "
1445                           "replaces malformed data with a replacement marker.")
1446             }
1447         },
1448         {
1449             "xmlcharrefreplace",
1450             {
1451                 "xmlcharrefreplace_errors",
1452                 xmlcharrefreplace_errors,
1453                 METH_O,
1454                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1455                           "which replaces an unencodable character with the "
1456                           "appropriate XML character reference.")
1457             }
1458         },
1459         {
1460             "backslashreplace",
1461             {
1462                 "backslashreplace_errors",
1463                 backslashreplace_errors,
1464                 METH_O,
1465                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1466                           "which replaces malformed data with a backslashed "
1467                           "escape sequence.")
1468             }
1469         },
1470         {
1471             "namereplace",
1472             {
1473                 "namereplace_errors",
1474                 namereplace_errors,
1475                 METH_O,
1476                 PyDoc_STR("Implements the 'namereplace' error handling, "
1477                           "which replaces an unencodable character with a "
1478                           "\\N{...} escape sequence.")
1479             }
1480         },
1481         {
1482             "surrogatepass",
1483             {
1484                 "surrogatepass",
1485                 surrogatepass_errors,
1486                 METH_O
1487             }
1488         },
1489         {
1490             "surrogateescape",
1491             {
1492                 "surrogateescape",
1493                 surrogateescape_errors,
1494                 METH_O
1495             }
1496         }
1497     };
1498 
1499     PyInterpreterState *interp = PyThreadState_GET()->interp;
1500     PyObject *mod;
1501     unsigned i;
1502 
1503     if (interp->codec_search_path != NULL)
1504         return 0;
1505 
1506     interp->codec_search_path = PyList_New(0);
1507     interp->codec_search_cache = PyDict_New();
1508     interp->codec_error_registry = PyDict_New();
1509 
1510     if (interp->codec_error_registry) {
1511         for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1512             PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1513             int res;
1514             if (!func)
1515                 Py_FatalError("can't initialize codec error registry");
1516             res = PyCodec_RegisterError(methods[i].name, func);
1517             Py_DECREF(func);
1518             if (res)
1519                 Py_FatalError("can't initialize codec error registry");
1520         }
1521     }
1522 
1523     if (interp->codec_search_path == NULL ||
1524         interp->codec_search_cache == NULL ||
1525         interp->codec_error_registry == NULL)
1526         Py_FatalError("can't initialize codec registry");
1527 
1528     mod = PyImport_ImportModuleNoBlock("encodings");
1529     if (mod == NULL) {
1530         return -1;
1531     }
1532     Py_DECREF(mod);
1533     interp->codecs_initialized = 1;
1534     return 0;
1535 }
1536