1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "internal/pystate.h"
13 #include "ucnhash.h"
14 #include <ctype.h>
15
/* Lowercase hexadecimal digit characters, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
17
18 /* --- Codec Registry ----------------------------------------------------- */
19
20 /* Import the standard encodings package which will register the first
21 codec search function.
22
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
26 ImportErrors are silently ignored by this function. Only one try is
27 made.
28
29 */
30
31 static int _PyCodecRegistry_Init(void); /* Forward */
32
/* Append search_function to the current interpreter's codec search path.

   The registry is initialized first when the search path does not exist
   yet.  Returns the result of PyList_Append(); returns -1 if registry
   initialization fails, if search_function is NULL (PyErr_BadArgument)
   or if it is not callable (TypeError). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = PyThreadState_GET()->interp;

    /* The registry is set up before the argument is validated. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init() != 0) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}
51
52 /* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
55 static
normalizestring(const char * string)56 PyObject *normalizestring(const char *string)
57 {
58 size_t i;
59 size_t len = strlen(string);
60 char *p;
61 PyObject *v;
62
63 if (len > PY_SSIZE_T_MAX) {
64 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
66 }
67
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
70 return PyErr_NoMemory();
71 for (i = 0; i < len; i++) {
72 char ch = string[i];
73 if (ch == ' ')
74 ch = '-';
75 else
76 ch = Py_TOLOWER(Py_CHARMASK(ch));
77 p[i] = ch;
78 }
79 p[i] = '\0';
80 v = PyUnicode_FromString(p);
81 PyMem_Free(p);
82 return v;
83 }
84
85 /* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
92 If no codec is found, a LookupError is set and NULL returned.
93
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98 */
99
_PyCodec_Lookup(const char * encoding)100 PyObject *_PyCodec_Lookup(const char *encoding)
101 {
102 PyInterpreterState *interp;
103 PyObject *result, *args = NULL, *v;
104 Py_ssize_t i, len;
105
106 if (encoding == NULL) {
107 PyErr_BadArgument();
108 goto onError;
109 }
110
111 interp = PyThreadState_GET()->interp;
112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
113 goto onError;
114
115 /* Convert the encoding to a normalized Python string: all
116 characters are converted to lower case, spaces and hyphens are
117 replaced with underscores. */
118 v = normalizestring(encoding);
119 if (v == NULL)
120 goto onError;
121 PyUnicode_InternInPlace(&v);
122
123 /* First, try to lookup the name in the registry dictionary */
124 result = PyDict_GetItem(interp->codec_search_cache, v);
125 if (result != NULL) {
126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
129 }
130
131 /* Next, scan the search functions in order of registration */
132 args = PyTuple_New(1);
133 if (args == NULL) {
134 Py_DECREF(v);
135 return NULL;
136 }
137 PyTuple_SET_ITEM(args,0,v);
138
139 len = PyList_Size(interp->codec_search_path);
140 if (len < 0)
141 goto onError;
142 if (len == 0) {
143 PyErr_SetString(PyExc_LookupError,
144 "no codec search functions registered: "
145 "can't find encoding");
146 goto onError;
147 }
148
149 for (i = 0; i < len; i++) {
150 PyObject *func;
151
152 func = PyList_GetItem(interp->codec_search_path, i);
153 if (func == NULL)
154 goto onError;
155 result = PyEval_CallObject(func, args);
156 if (result == NULL)
157 goto onError;
158 if (result == Py_None) {
159 Py_DECREF(result);
160 continue;
161 }
162 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
163 PyErr_SetString(PyExc_TypeError,
164 "codec search functions must return 4-tuples");
165 Py_DECREF(result);
166 goto onError;
167 }
168 break;
169 }
170 if (i == len) {
171 /* XXX Perhaps we should cache misses too ? */
172 PyErr_Format(PyExc_LookupError,
173 "unknown encoding: %s", encoding);
174 goto onError;
175 }
176
177 /* Cache and return the result */
178 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
179 Py_DECREF(result);
180 goto onError;
181 }
182 Py_DECREF(args);
183 return result;
184
185 onError:
186 Py_XDECREF(args);
187 return NULL;
188 }
189
_PyCodec_Forget(const char * encoding)190 int _PyCodec_Forget(const char *encoding)
191 {
192 PyInterpreterState *interp;
193 PyObject *v;
194 int result;
195
196 interp = PyThreadState_GET()->interp;
197 if (interp->codec_search_path == NULL) {
198 return -1;
199 }
200
201 /* Convert the encoding to a normalized Python string: all
202 characters are converted to lower case, spaces and hyphens are
203 replaced with underscores. */
204 v = normalizestring(encoding);
205 if (v == NULL) {
206 return -1;
207 }
208
209 /* Drop the named codec from the internal cache */
210 result = PyDict_DelItem(interp->codec_search_cache, v);
211 Py_DECREF(v);
212
213 return result;
214 }
215
216 /* Codec registry encoding check API. */
217
PyCodec_KnownEncoding(const char * encoding)218 int PyCodec_KnownEncoding(const char *encoding)
219 {
220 PyObject *codecs;
221
222 codecs = _PyCodec_Lookup(encoding);
223 if (!codecs) {
224 PyErr_Clear();
225 return 0;
226 }
227 else {
228 Py_DECREF(codecs);
229 return 1;
230 }
231 }
232
233 static
args_tuple(PyObject * object,const char * errors)234 PyObject *args_tuple(PyObject *object,
235 const char *errors)
236 {
237 PyObject *args;
238
239 args = PyTuple_New(1 + (errors != NULL));
240 if (args == NULL)
241 return NULL;
242 Py_INCREF(object);
243 PyTuple_SET_ITEM(args,0,object);
244 if (errors) {
245 PyObject *v;
246
247 v = PyUnicode_FromString(errors);
248 if (v == NULL) {
249 Py_DECREF(args);
250 return NULL;
251 }
252 PyTuple_SET_ITEM(args, 1, v);
253 }
254 return args;
255 }
256
257 /* Helper function to get a codec item */
258
259 static
codec_getitem(const char * encoding,int index)260 PyObject *codec_getitem(const char *encoding, int index)
261 {
262 PyObject *codecs;
263 PyObject *v;
264
265 codecs = _PyCodec_Lookup(encoding);
266 if (codecs == NULL)
267 return NULL;
268 v = PyTuple_GET_ITEM(codecs, index);
269 Py_DECREF(codecs);
270 Py_INCREF(v);
271 return v;
272 }
273
274 /* Helper functions to create an incremental codec. */
275 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)276 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
277 const char *errors,
278 const char *attrname)
279 {
280 PyObject *ret, *inccodec;
281
282 inccodec = PyObject_GetAttrString(codec_info, attrname);
283 if (inccodec == NULL)
284 return NULL;
285 if (errors)
286 ret = PyObject_CallFunction(inccodec, "s", errors);
287 else
288 ret = _PyObject_CallNoArg(inccodec);
289 Py_DECREF(inccodec);
290 return ret;
291 }
292
293 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)294 PyObject *codec_getincrementalcodec(const char *encoding,
295 const char *errors,
296 const char *attrname)
297 {
298 PyObject *codec_info, *ret;
299
300 codec_info = _PyCodec_Lookup(encoding);
301 if (codec_info == NULL)
302 return NULL;
303 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
304 Py_DECREF(codec_info);
305 return ret;
306 }
307
308 /* Helper function to create a stream codec. */
309
310 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)311 PyObject *codec_getstreamcodec(const char *encoding,
312 PyObject *stream,
313 const char *errors,
314 const int index)
315 {
316 PyObject *codecs, *streamcodec, *codeccls;
317
318 codecs = _PyCodec_Lookup(encoding);
319 if (codecs == NULL)
320 return NULL;
321
322 codeccls = PyTuple_GET_ITEM(codecs, index);
323 if (errors != NULL)
324 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
325 else
326 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
327 Py_DECREF(codecs);
328 return streamcodec;
329 }
330
331 /* Helpers to work with the result of _PyCodec_Lookup
332
333 */
/* Create an incremental decoder from an already looked-up codec_info
   via its "incrementaldecoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
340
/* Create an incremental encoder from an already looked-up codec_info
   via its "incrementalencoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
347
348
349 /* Convenience APIs to query the Codec registry.
350
351 All APIs return a codec object with incremented refcount.
352
353 */
354
PyCodec_Encoder(const char * encoding)355 PyObject *PyCodec_Encoder(const char *encoding)
356 {
357 return codec_getitem(encoding, 0);
358 }
359
PyCodec_Decoder(const char * encoding)360 PyObject *PyCodec_Decoder(const char *encoding)
361 {
362 return codec_getitem(encoding, 1);
363 }
364
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)365 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
366 const char *errors)
367 {
368 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
369 }
370
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)371 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
372 const char *errors)
373 {
374 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
375 }
376
/* Call the factory at item 2 of the codec tuple for `encoding` on
   `stream`. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_index = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
383
/* Call the factory at item 3 of the codec tuple for `encoding` on
   `stream`. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_index = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
390
/* Try to make the reported exception chain name the codec whose
 * invocation triggered the failure, without changing the type of the
 * exception raised.
 *
 * `operation` and `encoding` are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause swaps the active exception for a suitably
     * updated clone when it can; otherwise the original exception is
     * left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
406
407 /* Encode an object (e.g. a Unicode object) using the given encoding
408 and return the resulting encoded object (usually a Python string).
409
410 errors is passed to the encoder factory as argument if non-NULL. */
411
412 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)413 _PyCodec_EncodeInternal(PyObject *object,
414 PyObject *encoder,
415 const char *encoding,
416 const char *errors)
417 {
418 PyObject *args = NULL, *result = NULL;
419 PyObject *v = NULL;
420
421 args = args_tuple(object, errors);
422 if (args == NULL)
423 goto onError;
424
425 result = PyEval_CallObject(encoder, args);
426 if (result == NULL) {
427 wrap_codec_error("encoding", encoding);
428 goto onError;
429 }
430
431 if (!PyTuple_Check(result) ||
432 PyTuple_GET_SIZE(result) != 2) {
433 PyErr_SetString(PyExc_TypeError,
434 "encoder must return a tuple (object, integer)");
435 goto onError;
436 }
437 v = PyTuple_GET_ITEM(result,0);
438 Py_INCREF(v);
439 /* We don't check or use the second (integer) entry. */
440
441 Py_DECREF(args);
442 Py_DECREF(encoder);
443 Py_DECREF(result);
444 return v;
445
446 onError:
447 Py_XDECREF(result);
448 Py_XDECREF(args);
449 Py_XDECREF(encoder);
450 return NULL;
451 }
452
453 /* Decode an object (usually a Python string) using the given encoding
454 and return an equivalent object (e.g. a Unicode object).
455
456 errors is passed to the decoder factory as argument if non-NULL. */
457
458 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)459 _PyCodec_DecodeInternal(PyObject *object,
460 PyObject *decoder,
461 const char *encoding,
462 const char *errors)
463 {
464 PyObject *args = NULL, *result = NULL;
465 PyObject *v;
466
467 args = args_tuple(object, errors);
468 if (args == NULL)
469 goto onError;
470
471 result = PyEval_CallObject(decoder,args);
472 if (result == NULL) {
473 wrap_codec_error("decoding", encoding);
474 goto onError;
475 }
476 if (!PyTuple_Check(result) ||
477 PyTuple_GET_SIZE(result) != 2) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoder must return a tuple (object,integer)");
480 goto onError;
481 }
482 v = PyTuple_GET_ITEM(result,0);
483 Py_INCREF(v);
484 /* We don't check or use the second (integer) entry. */
485
486 Py_DECREF(args);
487 Py_DECREF(decoder);
488 Py_DECREF(result);
489 return v;
490
491 onError:
492 Py_XDECREF(args);
493 Py_XDECREF(decoder);
494 Py_XDECREF(result);
495 return NULL;
496 }
497
498 /* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`.
   Returns a new reference or NULL with an exception set. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* The internal helper consumes the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
511
/* Decode `object` with the decoder registered for `encoding`.
   Returns a new reference or NULL with an exception set. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* The internal helper consumes the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
524
525 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)526 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
527 const char *alternate_command)
528 {
529 _Py_IDENTIFIER(_is_text_encoding);
530 PyObject *codec;
531 PyObject *attr;
532 int is_text_codec;
533
534 codec = _PyCodec_Lookup(encoding);
535 if (codec == NULL)
536 return NULL;
537
538 /* Backwards compatibility: assume any raw tuple describes a text
539 * encoding, and the same for anything lacking the private
540 * attribute.
541 */
542 if (!PyTuple_CheckExact(codec)) {
543 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
544 Py_DECREF(codec);
545 return NULL;
546 }
547 if (attr != NULL) {
548 is_text_codec = PyObject_IsTrue(attr);
549 Py_DECREF(attr);
550 if (is_text_codec <= 0) {
551 Py_DECREF(codec);
552 if (!is_text_codec)
553 PyErr_Format(PyExc_LookupError,
554 "'%.400s' is not a text encoding; "
555 "use %s to handle arbitrary codecs",
556 encoding, alternate_command);
557 return NULL;
558 }
559 }
560 }
561
562 /* This appears to be a valid text encoding */
563 return codec;
564 }
565
566
567 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)568 PyObject *codec_getitem_checked(const char *encoding,
569 const char *alternate_command,
570 int index)
571 {
572 PyObject *codec;
573 PyObject *v;
574
575 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
576 if (codec == NULL)
577 return NULL;
578
579 v = PyTuple_GET_ITEM(codec, index);
580 Py_INCREF(v);
581 Py_DECREF(codec);
582 return v;
583 }
584
_PyCodec_TextEncoder(const char * encoding)585 static PyObject * _PyCodec_TextEncoder(const char *encoding)
586 {
587 return codec_getitem_checked(encoding, "codecs.encode()", 0);
588 }
589
_PyCodec_TextDecoder(const char * encoding)590 static PyObject * _PyCodec_TextDecoder(const char *encoding)
591 {
592 return codec_getitem_checked(encoding, "codecs.decode()", 1);
593 }
594
/* Encode `object` with `encoding`, which must be a text encoding.
   Returns a new reference or NULL with an exception set. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* The internal helper consumes the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
607
/* Decode `object` with `encoding`, which must be a text encoding.
   Returns a new reference or NULL with an exception set. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* The internal helper consumes the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
620
621 /* Register the error handling callback function error under the name
622 name. This function will be called by the codec when it encounters
623 an unencodable characters/undecodable bytes and doesn't know the
624 callback name, when name is specified as the error parameter
625 in the call to the encode/decode function.
626 Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)627 int PyCodec_RegisterError(const char *name, PyObject *error)
628 {
629 PyInterpreterState *interp = PyThreadState_GET()->interp;
630 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
631 return -1;
632 if (!PyCallable_Check(error)) {
633 PyErr_SetString(PyExc_TypeError, "handler must be callable");
634 return -1;
635 }
636 return PyDict_SetItemString(interp->codec_error_registry,
637 name, error);
638 }
639
640 /* Lookup the error handling callback function registered under the
641 name error. As a special case NULL can be passed, in which case
642 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)643 PyObject *PyCodec_LookupError(const char *name)
644 {
645 PyObject *handler = NULL;
646
647 PyInterpreterState *interp = PyThreadState_GET()->interp;
648 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
649 return NULL;
650
651 if (name==NULL)
652 name = "strict";
653 handler = PyDict_GetItemString(interp->codec_error_registry, name);
654 if (!handler)
655 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
656 else
657 Py_INCREF(handler);
658 return handler;
659 }
660
/* Set a TypeError reporting that an error callback received an object
   of a type it cannot handle; the message names the type of `exc`. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
667
/* "strict" error handler: re-raise `exc`.

   Always returns NULL.  If `exc` is an exception instance it becomes
   the current exception; otherwise a TypeError is set. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
676
677
/* "ignore" error handler: skip the offending input.

   Accepts UnicodeEncodeError, UnicodeDecodeError and
   UnicodeTranslateError instances and returns the tuple
   (empty str, end), where `end` is the exception's end position.
   Any other object yields a TypeError.  Returns NULL on error. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    /* "N" hands the new empty string over to the result tuple. */
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
700
701
PyCodec_ReplaceErrors(PyObject * exc)702 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
703 {
704 Py_ssize_t start, end, i, len;
705
706 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
707 PyObject *res;
708 int kind;
709 void *data;
710 if (PyUnicodeEncodeError_GetStart(exc, &start))
711 return NULL;
712 if (PyUnicodeEncodeError_GetEnd(exc, &end))
713 return NULL;
714 len = end - start;
715 res = PyUnicode_New(len, '?');
716 if (res == NULL)
717 return NULL;
718 kind = PyUnicode_KIND(res);
719 data = PyUnicode_DATA(res);
720 for (i = 0; i < len; ++i)
721 PyUnicode_WRITE(kind, data, i, '?');
722 assert(_PyUnicode_CheckConsistency(res, 1));
723 return Py_BuildValue("(Nn)", res, end);
724 }
725 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
726 if (PyUnicodeDecodeError_GetEnd(exc, &end))
727 return NULL;
728 return Py_BuildValue("(Cn)",
729 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
730 end);
731 }
732 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
733 PyObject *res;
734 int kind;
735 void *data;
736 if (PyUnicodeTranslateError_GetStart(exc, &start))
737 return NULL;
738 if (PyUnicodeTranslateError_GetEnd(exc, &end))
739 return NULL;
740 len = end - start;
741 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
742 if (res == NULL)
743 return NULL;
744 kind = PyUnicode_KIND(res);
745 data = PyUnicode_DATA(res);
746 for (i=0; i < len; i++)
747 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
748 assert(_PyUnicode_CheckConsistency(res, 1));
749 return Py_BuildValue("(Nn)", res, end);
750 }
751 else {
752 wrong_exception_type(exc);
753 return NULL;
754 }
755 }
756
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)757 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
758 {
759 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
760 PyObject *restuple;
761 PyObject *object;
762 Py_ssize_t i;
763 Py_ssize_t start;
764 Py_ssize_t end;
765 PyObject *res;
766 unsigned char *outp;
767 Py_ssize_t ressize;
768 Py_UCS4 ch;
769 if (PyUnicodeEncodeError_GetStart(exc, &start))
770 return NULL;
771 if (PyUnicodeEncodeError_GetEnd(exc, &end))
772 return NULL;
773 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
774 return NULL;
775 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
776 end = start + PY_SSIZE_T_MAX / (2+7+1);
777 for (i = start, ressize = 0; i < end; ++i) {
778 /* object is guaranteed to be "ready" */
779 ch = PyUnicode_READ_CHAR(object, i);
780 if (ch<10)
781 ressize += 2+1+1;
782 else if (ch<100)
783 ressize += 2+2+1;
784 else if (ch<1000)
785 ressize += 2+3+1;
786 else if (ch<10000)
787 ressize += 2+4+1;
788 else if (ch<100000)
789 ressize += 2+5+1;
790 else if (ch<1000000)
791 ressize += 2+6+1;
792 else
793 ressize += 2+7+1;
794 }
795 /* allocate replacement */
796 res = PyUnicode_New(ressize, 127);
797 if (res == NULL) {
798 Py_DECREF(object);
799 return NULL;
800 }
801 outp = PyUnicode_1BYTE_DATA(res);
802 /* generate replacement */
803 for (i = start; i < end; ++i) {
804 int digits;
805 int base;
806 ch = PyUnicode_READ_CHAR(object, i);
807 *outp++ = '&';
808 *outp++ = '#';
809 if (ch<10) {
810 digits = 1;
811 base = 1;
812 }
813 else if (ch<100) {
814 digits = 2;
815 base = 10;
816 }
817 else if (ch<1000) {
818 digits = 3;
819 base = 100;
820 }
821 else if (ch<10000) {
822 digits = 4;
823 base = 1000;
824 }
825 else if (ch<100000) {
826 digits = 5;
827 base = 10000;
828 }
829 else if (ch<1000000) {
830 digits = 6;
831 base = 100000;
832 }
833 else {
834 digits = 7;
835 base = 1000000;
836 }
837 while (digits-->0) {
838 *outp++ = '0' + ch/base;
839 ch %= base;
840 base /= 10;
841 }
842 *outp++ = ';';
843 }
844 assert(_PyUnicode_CheckConsistency(res, 1));
845 restuple = Py_BuildValue("(Nn)", res, end);
846 Py_DECREF(object);
847 return restuple;
848 }
849 else {
850 wrong_exception_type(exc);
851 return NULL;
852 }
853 }
854
/* "backslashreplace" error handler.

   - UnicodeDecodeError: every byte in [start, end) of the exception's
     bytes object is replaced by "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) is replaced by "\xNN", "\uNNNN" or "\UNNNNNNNN"
     depending on its code point.
   Any other object yields a TypeError.

   Returns the tuple (replacement str, end), or NULL with an exception
   set.  The handled range is shortened when necessary so the size of
   the replacement cannot overflow Py_ssize_t. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Py_ssize_t rather than int: the range is only clamped against
       PY_SSIZE_T_MAX, so an int accumulator could overflow. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Each byte expands to 4 characters; clamp the range so that
           4 * (end - start) cannot overflow. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* The longest escape, "\UNNNNNNNN", takes 1+1+8 characters. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  The last two hex digits are
       shared by all three forms and emitted after the if/else chain. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
961
962 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
963
PyCodec_NameReplaceErrors(PyObject * exc)964 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
965 {
966 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
967 PyObject *restuple;
968 PyObject *object;
969 Py_ssize_t i;
970 Py_ssize_t start;
971 Py_ssize_t end;
972 PyObject *res;
973 unsigned char *outp;
974 Py_ssize_t ressize;
975 int replsize;
976 Py_UCS4 c;
977 char buffer[256]; /* NAME_MAXLEN */
978 if (PyUnicodeEncodeError_GetStart(exc, &start))
979 return NULL;
980 if (PyUnicodeEncodeError_GetEnd(exc, &end))
981 return NULL;
982 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
983 return NULL;
984 if (!ucnhash_CAPI) {
985 /* load the unicode data module */
986 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
987 PyUnicodeData_CAPSULE_NAME, 1);
988 if (!ucnhash_CAPI)
989 return NULL;
990 }
991 for (i = start, ressize = 0; i < end; ++i) {
992 /* object is guaranteed to be "ready" */
993 c = PyUnicode_READ_CHAR(object, i);
994 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
995 replsize = 1+1+1+(int)strlen(buffer)+1;
996 }
997 else if (c >= 0x10000) {
998 replsize = 1+1+8;
999 }
1000 else if (c >= 0x100) {
1001 replsize = 1+1+4;
1002 }
1003 else
1004 replsize = 1+1+2;
1005 if (ressize > PY_SSIZE_T_MAX - replsize)
1006 break;
1007 ressize += replsize;
1008 }
1009 end = i;
1010 res = PyUnicode_New(ressize, 127);
1011 if (res==NULL)
1012 return NULL;
1013 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1014 i < end; ++i) {
1015 c = PyUnicode_READ_CHAR(object, i);
1016 *outp++ = '\\';
1017 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1018 *outp++ = 'N';
1019 *outp++ = '{';
1020 strcpy((char *)outp, buffer);
1021 outp += strlen(buffer);
1022 *outp++ = '}';
1023 continue;
1024 }
1025 if (c >= 0x00010000) {
1026 *outp++ = 'U';
1027 *outp++ = Py_hexdigits[(c>>28)&0xf];
1028 *outp++ = Py_hexdigits[(c>>24)&0xf];
1029 *outp++ = Py_hexdigits[(c>>20)&0xf];
1030 *outp++ = Py_hexdigits[(c>>16)&0xf];
1031 *outp++ = Py_hexdigits[(c>>12)&0xf];
1032 *outp++ = Py_hexdigits[(c>>8)&0xf];
1033 }
1034 else if (c >= 0x100) {
1035 *outp++ = 'u';
1036 *outp++ = Py_hexdigits[(c>>12)&0xf];
1037 *outp++ = Py_hexdigits[(c>>8)&0xf];
1038 }
1039 else
1040 *outp++ = 'x';
1041 *outp++ = Py_hexdigits[(c>>4)&0xf];
1042 *outp++ = Py_hexdigits[c&0xf];
1043 }
1044
1045 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1046 assert(_PyUnicode_CheckConsistency(res, 1));
1047 restuple = Py_BuildValue("(Nn)", res, end);
1048 Py_DECREF(object);
1049 return restuple;
1050 }
1051 else {
1052 wrong_exception_type(exc);
1053 return NULL;
1054 }
1055 }
1056
1057 #define ENC_UNKNOWN -1
1058 #define ENC_UTF8 0
1059 #define ENC_UTF16BE 1
1060 #define ENC_UTF16LE 2
1061 #define ENC_UTF32BE 3
1062 #define ENC_UTF32LE 4
1063
1064 static int
get_standard_encoding(const char * encoding,int * bytelength)1065 get_standard_encoding(const char *encoding, int *bytelength)
1066 {
1067 if (Py_TOLOWER(encoding[0]) == 'u' &&
1068 Py_TOLOWER(encoding[1]) == 't' &&
1069 Py_TOLOWER(encoding[2]) == 'f') {
1070 encoding += 3;
1071 if (*encoding == '-' || *encoding == '_' )
1072 encoding++;
1073 if (encoding[0] == '8' && encoding[1] == '\0') {
1074 *bytelength = 3;
1075 return ENC_UTF8;
1076 }
1077 else if (encoding[0] == '1' && encoding[1] == '6') {
1078 encoding += 2;
1079 *bytelength = 2;
1080 if (*encoding == '\0') {
1081 #ifdef WORDS_BIGENDIAN
1082 return ENC_UTF16BE;
1083 #else
1084 return ENC_UTF16LE;
1085 #endif
1086 }
1087 if (*encoding == '-' || *encoding == '_' )
1088 encoding++;
1089 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1090 if (Py_TOLOWER(encoding[0]) == 'b')
1091 return ENC_UTF16BE;
1092 if (Py_TOLOWER(encoding[0]) == 'l')
1093 return ENC_UTF16LE;
1094 }
1095 }
1096 else if (encoding[0] == '3' && encoding[1] == '2') {
1097 encoding += 2;
1098 *bytelength = 4;
1099 if (*encoding == '\0') {
1100 #ifdef WORDS_BIGENDIAN
1101 return ENC_UTF32BE;
1102 #else
1103 return ENC_UTF32LE;
1104 #endif
1105 }
1106 if (*encoding == '-' || *encoding == '_' )
1107 encoding++;
1108 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1109 if (Py_TOLOWER(encoding[0]) == 'b')
1110 return ENC_UTF32BE;
1111 if (Py_TOLOWER(encoding[0]) == 'l')
1112 return ENC_UTF32LE;
1113 }
1114 }
1115 }
1116 else if (strcmp(encoding, "CP_UTF8") == 0) {
1117 *bytelength = 3;
1118 return ENC_UTF8;
1119 }
1120 return ENC_UNKNOWN;
1121 }
1122
1123 /* This handler is declared static until someone demonstrates
1124 a need to call it directly. */
1125 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1126 PyCodec_SurrogatePassErrors(PyObject *exc)
1127 {
1128 PyObject *restuple;
1129 PyObject *object;
1130 PyObject *encode;
1131 const char *encoding;
1132 int code;
1133 int bytelength;
1134 Py_ssize_t i;
1135 Py_ssize_t start;
1136 Py_ssize_t end;
1137 PyObject *res;
1138
1139 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1140 unsigned char *outp;
1141 if (PyUnicodeEncodeError_GetStart(exc, &start))
1142 return NULL;
1143 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1144 return NULL;
1145 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1146 return NULL;
1147 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1148 Py_DECREF(object);
1149 return NULL;
1150 }
1151 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1152 Py_DECREF(object);
1153 Py_DECREF(encode);
1154 return NULL;
1155 }
1156 code = get_standard_encoding(encoding, &bytelength);
1157 Py_DECREF(encode);
1158 if (code == ENC_UNKNOWN) {
1159 /* Not supported, fail with original exception */
1160 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1161 Py_DECREF(object);
1162 return NULL;
1163 }
1164
1165 if (end - start > PY_SSIZE_T_MAX / bytelength)
1166 end = start + PY_SSIZE_T_MAX / bytelength;
1167 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1168 if (!res) {
1169 Py_DECREF(object);
1170 return NULL;
1171 }
1172 outp = (unsigned char*)PyBytes_AsString(res);
1173 for (i = start; i < end; i++) {
1174 /* object is guaranteed to be "ready" */
1175 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1176 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1177 /* Not a surrogate, fail with original exception */
1178 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1179 Py_DECREF(res);
1180 Py_DECREF(object);
1181 return NULL;
1182 }
1183 switch (code) {
1184 case ENC_UTF8:
1185 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1186 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1187 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1188 break;
1189 case ENC_UTF16LE:
1190 *outp++ = (unsigned char) ch;
1191 *outp++ = (unsigned char)(ch >> 8);
1192 break;
1193 case ENC_UTF16BE:
1194 *outp++ = (unsigned char)(ch >> 8);
1195 *outp++ = (unsigned char) ch;
1196 break;
1197 case ENC_UTF32LE:
1198 *outp++ = (unsigned char) ch;
1199 *outp++ = (unsigned char)(ch >> 8);
1200 *outp++ = (unsigned char)(ch >> 16);
1201 *outp++ = (unsigned char)(ch >> 24);
1202 break;
1203 case ENC_UTF32BE:
1204 *outp++ = (unsigned char)(ch >> 24);
1205 *outp++ = (unsigned char)(ch >> 16);
1206 *outp++ = (unsigned char)(ch >> 8);
1207 *outp++ = (unsigned char) ch;
1208 break;
1209 }
1210 }
1211 restuple = Py_BuildValue("(On)", res, end);
1212 Py_DECREF(res);
1213 Py_DECREF(object);
1214 return restuple;
1215 }
1216 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1217 const unsigned char *p;
1218 Py_UCS4 ch = 0;
1219 if (PyUnicodeDecodeError_GetStart(exc, &start))
1220 return NULL;
1221 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1222 return NULL;
1223 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1224 return NULL;
1225 p = (const unsigned char*)PyBytes_AS_STRING(object);
1226 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1227 Py_DECREF(object);
1228 return NULL;
1229 }
1230 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1231 Py_DECREF(object);
1232 Py_DECREF(encode);
1233 return NULL;
1234 }
1235 code = get_standard_encoding(encoding, &bytelength);
1236 Py_DECREF(encode);
1237 if (code == ENC_UNKNOWN) {
1238 /* Not supported, fail with original exception */
1239 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1240 Py_DECREF(object);
1241 return NULL;
1242 }
1243
1244 /* Try decoding a single surrogate character. If
1245 there are more, let the codec call us again. */
1246 p += start;
1247 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1248 switch (code) {
1249 case ENC_UTF8:
1250 if ((p[0] & 0xf0) == 0xe0 &&
1251 (p[1] & 0xc0) == 0x80 &&
1252 (p[2] & 0xc0) == 0x80) {
1253 /* it's a three-byte code */
1254 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1255 }
1256 break;
1257 case ENC_UTF16LE:
1258 ch = p[1] << 8 | p[0];
1259 break;
1260 case ENC_UTF16BE:
1261 ch = p[0] << 8 | p[1];
1262 break;
1263 case ENC_UTF32LE:
1264 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1265 break;
1266 case ENC_UTF32BE:
1267 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1268 break;
1269 }
1270 }
1271
1272 Py_DECREF(object);
1273 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1274 /* it's not a surrogate - fail */
1275 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1276 return NULL;
1277 }
1278 res = PyUnicode_FromOrdinal(ch);
1279 if (res == NULL)
1280 return NULL;
1281 return Py_BuildValue("(Nn)", res, start + bytelength);
1282 }
1283 else {
1284 wrong_exception_type(exc);
1285 return NULL;
1286 }
1287 }
1288
1289 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1290 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1291 {
1292 PyObject *restuple;
1293 PyObject *object;
1294 Py_ssize_t i;
1295 Py_ssize_t start;
1296 Py_ssize_t end;
1297 PyObject *res;
1298
1299 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1300 char *outp;
1301 if (PyUnicodeEncodeError_GetStart(exc, &start))
1302 return NULL;
1303 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1304 return NULL;
1305 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1306 return NULL;
1307 res = PyBytes_FromStringAndSize(NULL, end-start);
1308 if (!res) {
1309 Py_DECREF(object);
1310 return NULL;
1311 }
1312 outp = PyBytes_AsString(res);
1313 for (i = start; i < end; i++) {
1314 /* object is guaranteed to be "ready" */
1315 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1316 if (ch < 0xdc80 || ch > 0xdcff) {
1317 /* Not a UTF-8b surrogate, fail with original exception */
1318 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1319 Py_DECREF(res);
1320 Py_DECREF(object);
1321 return NULL;
1322 }
1323 *outp++ = ch - 0xdc00;
1324 }
1325 restuple = Py_BuildValue("(On)", res, end);
1326 Py_DECREF(res);
1327 Py_DECREF(object);
1328 return restuple;
1329 }
1330 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1331 PyObject *str;
1332 const unsigned char *p;
1333 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1334 int consumed = 0;
1335 if (PyUnicodeDecodeError_GetStart(exc, &start))
1336 return NULL;
1337 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1338 return NULL;
1339 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1340 return NULL;
1341 p = (const unsigned char*)PyBytes_AS_STRING(object);
1342 while (consumed < 4 && consumed < end-start) {
1343 /* Refuse to escape ASCII bytes. */
1344 if (p[start+consumed] < 128)
1345 break;
1346 ch[consumed] = 0xdc00 + p[start+consumed];
1347 consumed++;
1348 }
1349 Py_DECREF(object);
1350 if (!consumed) {
1351 /* codec complained about ASCII byte. */
1352 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1353 return NULL;
1354 }
1355 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1356 if (str == NULL)
1357 return NULL;
1358 return Py_BuildValue("(Nn)", str, start+consumed);
1359 }
1360 else {
1361 wrong_exception_type(exc);
1362 return NULL;
1363 }
1364 }
1365
1366
/* Callable registered under "strict" by _PyCodecRegistry_Init(): self is
   unused, exc is handed straight to PyCodec_StrictErrors(). */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1371
1372
/* Callable registered under "ignore" by _PyCodecRegistry_Init(): self is
   unused, exc is handed straight to PyCodec_IgnoreErrors(). */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1377
1378
/* Callable registered under "replace" by _PyCodecRegistry_Init(): self is
   unused, exc is handed straight to PyCodec_ReplaceErrors(). */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1383
1384
/* Callable registered under "xmlcharrefreplace" by _PyCodecRegistry_Init():
   self is unused, exc is handed straight to
   PyCodec_XMLCharRefReplaceErrors(). */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1389
1390
/* Callable registered under "backslashreplace" by _PyCodecRegistry_Init():
   self is unused, exc is handed straight to
   PyCodec_BackslashReplaceErrors(). */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1395
/* Callable registered under "namereplace" by _PyCodecRegistry_Init(): self
   is unused, exc is handed straight to PyCodec_NameReplaceErrors(). */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1400
/* Callable registered under "surrogatepass" by _PyCodecRegistry_Init(): self
   is unused, exc is handed straight to PyCodec_SurrogatePassErrors(). */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1405
/* Callable registered under "surrogateescape" by _PyCodecRegistry_Init():
   self is unused, exc is handed straight to
   PyCodec_SurrogateEscapeErrors(). */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1410
_PyCodecRegistry_Init(void)1411 static int _PyCodecRegistry_Init(void)
1412 {
1413 static struct {
1414 char *name;
1415 PyMethodDef def;
1416 } methods[] =
1417 {
1418 {
1419 "strict",
1420 {
1421 "strict_errors",
1422 strict_errors,
1423 METH_O,
1424 PyDoc_STR("Implements the 'strict' error handling, which "
1425 "raises a UnicodeError on coding errors.")
1426 }
1427 },
1428 {
1429 "ignore",
1430 {
1431 "ignore_errors",
1432 ignore_errors,
1433 METH_O,
1434 PyDoc_STR("Implements the 'ignore' error handling, which "
1435 "ignores malformed data and continues.")
1436 }
1437 },
1438 {
1439 "replace",
1440 {
1441 "replace_errors",
1442 replace_errors,
1443 METH_O,
1444 PyDoc_STR("Implements the 'replace' error handling, which "
1445 "replaces malformed data with a replacement marker.")
1446 }
1447 },
1448 {
1449 "xmlcharrefreplace",
1450 {
1451 "xmlcharrefreplace_errors",
1452 xmlcharrefreplace_errors,
1453 METH_O,
1454 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1455 "which replaces an unencodable character with the "
1456 "appropriate XML character reference.")
1457 }
1458 },
1459 {
1460 "backslashreplace",
1461 {
1462 "backslashreplace_errors",
1463 backslashreplace_errors,
1464 METH_O,
1465 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1466 "which replaces malformed data with a backslashed "
1467 "escape sequence.")
1468 }
1469 },
1470 {
1471 "namereplace",
1472 {
1473 "namereplace_errors",
1474 namereplace_errors,
1475 METH_O,
1476 PyDoc_STR("Implements the 'namereplace' error handling, "
1477 "which replaces an unencodable character with a "
1478 "\\N{...} escape sequence.")
1479 }
1480 },
1481 {
1482 "surrogatepass",
1483 {
1484 "surrogatepass",
1485 surrogatepass_errors,
1486 METH_O
1487 }
1488 },
1489 {
1490 "surrogateescape",
1491 {
1492 "surrogateescape",
1493 surrogateescape_errors,
1494 METH_O
1495 }
1496 }
1497 };
1498
1499 PyInterpreterState *interp = PyThreadState_GET()->interp;
1500 PyObject *mod;
1501 unsigned i;
1502
1503 if (interp->codec_search_path != NULL)
1504 return 0;
1505
1506 interp->codec_search_path = PyList_New(0);
1507 interp->codec_search_cache = PyDict_New();
1508 interp->codec_error_registry = PyDict_New();
1509
1510 if (interp->codec_error_registry) {
1511 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1512 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1513 int res;
1514 if (!func)
1515 Py_FatalError("can't initialize codec error registry");
1516 res = PyCodec_RegisterError(methods[i].name, func);
1517 Py_DECREF(func);
1518 if (res)
1519 Py_FatalError("can't initialize codec error registry");
1520 }
1521 }
1522
1523 if (interp->codec_search_path == NULL ||
1524 interp->codec_search_cache == NULL ||
1525 interp->codec_error_registry == NULL)
1526 Py_FatalError("can't initialize codec registry");
1527
1528 mod = PyImport_ImportModuleNoBlock("encodings");
1529 if (mod == NULL) {
1530 return -1;
1531 }
1532 Py_DECREF(mod);
1533 interp->codecs_initialized = 1;
1534 return 0;
1535 }
1536