1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "ucnhash.h"
13 #include <ctype.h>
14
15 const char *Py_hexdigits = "0123456789abcdef";
16
17 /* --- Codec Registry ----------------------------------------------------- */
18
19 /* Import the standard encodings package which will register the first
20 codec search function.
21
22 This is done in a lazy way so that the Unicode implementation does
23 not downgrade startup time of scripts not needing it.
24
25 ImportErrors are silently ignored by this function. Only one try is
26 made.
27
28 */
29
30 static int _PyCodecRegistry_Init(void); /* Forward */
31
/* Append search_function to the interpreter's list of codec search
   functions, initializing the codec registry first when the search path
   has not been created yet.

   Returns the result of PyList_Append() on success. Returns -1 if the
   registry initialization fails, if search_function is NULL
   (PyErr_BadArgument) or if it is not callable (TypeError). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    if (state->codec_search_path == NULL && _PyCodecRegistry_Init() != 0)
        return -1;

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (PyCallable_Check(search_function) == 0) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}
50
51 /* Convert a string to a normalized Python string: all characters are
52 converted to lower case, spaces are replaced with underscores. */
53
54 static
normalizestring(const char * string)55 PyObject *normalizestring(const char *string)
56 {
57 size_t i;
58 size_t len = strlen(string);
59 char *p;
60 PyObject *v;
61
62 if (len > PY_SSIZE_T_MAX) {
63 PyErr_SetString(PyExc_OverflowError, "string is too large");
64 return NULL;
65 }
66
67 p = PyMem_Malloc(len + 1);
68 if (p == NULL)
69 return PyErr_NoMemory();
70 for (i = 0; i < len; i++) {
71 char ch = string[i];
72 if (ch == ' ')
73 ch = '-';
74 else
75 ch = Py_TOLOWER(Py_CHARMASK(ch));
76 p[i] = ch;
77 }
78 p[i] = '\0';
79 v = PyUnicode_FromString(p);
80 if (v == NULL)
81 return NULL;
82 PyMem_Free(p);
83 return v;
84 }
85
86 /* Lookup the given encoding and return a tuple providing the codec
87 facilities.
88
89 The encoding string is looked up converted to all lower-case
90 characters. This makes encodings looked up through this mechanism
91 effectively case-insensitive.
92
93 If no codec is found, a LookupError is set and NULL returned.
94
95 As side effect, this tries to load the encodings package, if not
96 yet done. This is part of the lazy load strategy for the encodings
97 package.
98
99 */
100
_PyCodec_Lookup(const char * encoding)101 PyObject *_PyCodec_Lookup(const char *encoding)
102 {
103 PyInterpreterState *interp;
104 PyObject *result, *args = NULL, *v;
105 Py_ssize_t i, len;
106
107 if (encoding == NULL) {
108 PyErr_BadArgument();
109 goto onError;
110 }
111
112 interp = PyThreadState_GET()->interp;
113 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
114 goto onError;
115
116 /* Convert the encoding to a normalized Python string: all
117 characters are converted to lower case, spaces and hyphens are
118 replaced with underscores. */
119 v = normalizestring(encoding);
120 if (v == NULL)
121 goto onError;
122 PyUnicode_InternInPlace(&v);
123
124 /* First, try to lookup the name in the registry dictionary */
125 result = PyDict_GetItem(interp->codec_search_cache, v);
126 if (result != NULL) {
127 Py_INCREF(result);
128 Py_DECREF(v);
129 return result;
130 }
131
132 /* Next, scan the search functions in order of registration */
133 args = PyTuple_New(1);
134 if (args == NULL)
135 goto onError;
136 PyTuple_SET_ITEM(args,0,v);
137
138 len = PyList_Size(interp->codec_search_path);
139 if (len < 0)
140 goto onError;
141 if (len == 0) {
142 PyErr_SetString(PyExc_LookupError,
143 "no codec search functions registered: "
144 "can't find encoding");
145 goto onError;
146 }
147
148 for (i = 0; i < len; i++) {
149 PyObject *func;
150
151 func = PyList_GetItem(interp->codec_search_path, i);
152 if (func == NULL)
153 goto onError;
154 result = PyEval_CallObject(func, args);
155 if (result == NULL)
156 goto onError;
157 if (result == Py_None) {
158 Py_DECREF(result);
159 continue;
160 }
161 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162 PyErr_SetString(PyExc_TypeError,
163 "codec search functions must return 4-tuples");
164 Py_DECREF(result);
165 goto onError;
166 }
167 break;
168 }
169 if (i == len) {
170 /* XXX Perhaps we should cache misses too ? */
171 PyErr_Format(PyExc_LookupError,
172 "unknown encoding: %s", encoding);
173 goto onError;
174 }
175
176 /* Cache and return the result */
177 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
178 Py_DECREF(result);
179 goto onError;
180 }
181 Py_DECREF(args);
182 return result;
183
184 onError:
185 Py_XDECREF(args);
186 return NULL;
187 }
188
_PyCodec_Forget(const char * encoding)189 int _PyCodec_Forget(const char *encoding)
190 {
191 PyInterpreterState *interp;
192 PyObject *v;
193 int result;
194
195 interp = PyThreadState_GET()->interp;
196 if (interp->codec_search_path == NULL) {
197 return -1;
198 }
199
200 /* Convert the encoding to a normalized Python string: all
201 characters are converted to lower case, spaces and hyphens are
202 replaced with underscores. */
203 v = normalizestring(encoding);
204 if (v == NULL) {
205 return -1;
206 }
207
208 /* Drop the named codec from the internal cache */
209 result = PyDict_DelItem(interp->codec_search_cache, v);
210 Py_DECREF(v);
211
212 return result;
213 }
214
215 /* Codec registry encoding check API. */
216
PyCodec_KnownEncoding(const char * encoding)217 int PyCodec_KnownEncoding(const char *encoding)
218 {
219 PyObject *codecs;
220
221 codecs = _PyCodec_Lookup(encoding);
222 if (!codecs) {
223 PyErr_Clear();
224 return 0;
225 }
226 else {
227 Py_DECREF(codecs);
228 return 1;
229 }
230 }
231
232 static
args_tuple(PyObject * object,const char * errors)233 PyObject *args_tuple(PyObject *object,
234 const char *errors)
235 {
236 PyObject *args;
237
238 args = PyTuple_New(1 + (errors != NULL));
239 if (args == NULL)
240 return NULL;
241 Py_INCREF(object);
242 PyTuple_SET_ITEM(args,0,object);
243 if (errors) {
244 PyObject *v;
245
246 v = PyUnicode_FromString(errors);
247 if (v == NULL) {
248 Py_DECREF(args);
249 return NULL;
250 }
251 PyTuple_SET_ITEM(args, 1, v);
252 }
253 return args;
254 }
255
256 /* Helper function to get a codec item */
257
258 static
codec_getitem(const char * encoding,int index)259 PyObject *codec_getitem(const char *encoding, int index)
260 {
261 PyObject *codecs;
262 PyObject *v;
263
264 codecs = _PyCodec_Lookup(encoding);
265 if (codecs == NULL)
266 return NULL;
267 v = PyTuple_GET_ITEM(codecs, index);
268 Py_DECREF(codecs);
269 Py_INCREF(v);
270 return v;
271 }
272
273 /* Helper functions to create an incremental codec. */
274 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)275 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276 const char *errors,
277 const char *attrname)
278 {
279 PyObject *ret, *inccodec;
280
281 inccodec = PyObject_GetAttrString(codec_info, attrname);
282 if (inccodec == NULL)
283 return NULL;
284 if (errors)
285 ret = PyObject_CallFunction(inccodec, "s", errors);
286 else
287 ret = PyObject_CallFunction(inccodec, NULL);
288 Py_DECREF(inccodec);
289 return ret;
290 }
291
292 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)293 PyObject *codec_getincrementalcodec(const char *encoding,
294 const char *errors,
295 const char *attrname)
296 {
297 PyObject *codec_info, *ret;
298
299 codec_info = _PyCodec_Lookup(encoding);
300 if (codec_info == NULL)
301 return NULL;
302 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303 Py_DECREF(codec_info);
304 return ret;
305 }
306
307 /* Helper function to create a stream codec. */
308
309 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)310 PyObject *codec_getstreamcodec(const char *encoding,
311 PyObject *stream,
312 const char *errors,
313 const int index)
314 {
315 PyObject *codecs, *streamcodec, *codeccls;
316
317 codecs = _PyCodec_Lookup(encoding);
318 if (codecs == NULL)
319 return NULL;
320
321 codeccls = PyTuple_GET_ITEM(codecs, index);
322 if (errors != NULL)
323 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
324 else
325 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
326 Py_DECREF(codecs);
327 return streamcodec;
328 }
329
330 /* Helpers to work with the result of _PyCodec_Lookup
331
332 */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info; errors is passed along when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_name = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
339
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info; errors is passed along when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_name = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_name);
}
346
347
348 /* Convenience APIs to query the Codec registry.
349
350 All APIs return a codec object with incremented refcount.
351
352 */
353
PyCodec_Encoder(const char * encoding)354 PyObject *PyCodec_Encoder(const char *encoding)
355 {
356 return codec_getitem(encoding, 0);
357 }
358
PyCodec_Decoder(const char * encoding)359 PyObject *PyCodec_Decoder(const char *encoding)
360 {
361 return codec_getitem(encoding, 1);
362 }
363
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)364 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
365 const char *errors)
366 {
367 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
368 }
369
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)370 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
371 const char *errors)
372 {
373 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
374 }
375
/* Look up encoding and call entry 2 of the lookup result with stream
   (and errors, when non-NULL) to create a stream reader. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_slot = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_slot);
}
382
/* Look up encoding and call entry 3 of the lookup result with stream
   (and errors, when non-NULL) to create a stream writer. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_slot = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_slot);
}
389
/* Try to make the reported exception chain name the codec whose
 * invocation failed, without changing the type of the exception that is
 * raised.
 *
 * operation describes what was attempted (the callers in this file pass
 * "encoding" or "decoding"); encoding is the codec name.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause either swaps the active exception for a
     * suitably updated clone or, when it cannot do that, leaves the
     * original exception untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
405
406 /* Encode an object (e.g. a Unicode object) using the given encoding
407 and return the resulting encoded object (usually a Python string).
408
409 errors is passed to the encoder factory as argument if non-NULL. */
410
411 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)412 _PyCodec_EncodeInternal(PyObject *object,
413 PyObject *encoder,
414 const char *encoding,
415 const char *errors)
416 {
417 PyObject *args = NULL, *result = NULL;
418 PyObject *v = NULL;
419
420 args = args_tuple(object, errors);
421 if (args == NULL)
422 goto onError;
423
424 result = PyEval_CallObject(encoder, args);
425 if (result == NULL) {
426 wrap_codec_error("encoding", encoding);
427 goto onError;
428 }
429
430 if (!PyTuple_Check(result) ||
431 PyTuple_GET_SIZE(result) != 2) {
432 PyErr_SetString(PyExc_TypeError,
433 "encoder must return a tuple (object, integer)");
434 goto onError;
435 }
436 v = PyTuple_GET_ITEM(result,0);
437 Py_INCREF(v);
438 /* We don't check or use the second (integer) entry. */
439
440 Py_DECREF(args);
441 Py_DECREF(encoder);
442 Py_DECREF(result);
443 return v;
444
445 onError:
446 Py_XDECREF(result);
447 Py_XDECREF(args);
448 Py_XDECREF(encoder);
449 return NULL;
450 }
451
452 /* Decode an object (usually a Python string) using the given encoding
453 and return an equivalent object (e.g. a Unicode object).
454
455 errors is passed to the decoder factory as argument if non-NULL. */
456
457 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)458 _PyCodec_DecodeInternal(PyObject *object,
459 PyObject *decoder,
460 const char *encoding,
461 const char *errors)
462 {
463 PyObject *args = NULL, *result = NULL;
464 PyObject *v;
465
466 args = args_tuple(object, errors);
467 if (args == NULL)
468 goto onError;
469
470 result = PyEval_CallObject(decoder,args);
471 if (result == NULL) {
472 wrap_codec_error("decoding", encoding);
473 goto onError;
474 }
475 if (!PyTuple_Check(result) ||
476 PyTuple_GET_SIZE(result) != 2) {
477 PyErr_SetString(PyExc_TypeError,
478 "decoder must return a tuple (object,integer)");
479 goto onError;
480 }
481 v = PyTuple_GET_ITEM(result,0);
482 Py_INCREF(v);
483 /* We don't check or use the second (integer) entry. */
484
485 Py_DECREF(args);
486 Py_DECREF(decoder);
487 Py_DECREF(result);
488 return v;
489
490 onError:
491 Py_XDECREF(args);
492 Py_XDECREF(decoder);
493 Py_XDECREF(result);
494 return NULL;
495 }
496
497 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding; errors is
   passed to the encoder when non-NULL. Returns NULL if the encoder
   cannot be looked up or the encoding step fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    /* _PyCodec_EncodeInternal() releases the encoder reference itself. */
    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
510
/* Decode object with the decoder registered for encoding; errors is
   passed to the decoder when non-NULL. Returns NULL if the decoder
   cannot be looked up or the decoding step fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    /* _PyCodec_DecodeInternal() releases the decoder reference itself. */
    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
523
524 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)525 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526 const char *alternate_command)
527 {
528 _Py_IDENTIFIER(_is_text_encoding);
529 PyObject *codec;
530 PyObject *attr;
531 int is_text_codec;
532
533 codec = _PyCodec_Lookup(encoding);
534 if (codec == NULL)
535 return NULL;
536
537 /* Backwards compatibility: assume any raw tuple describes a text
538 * encoding, and the same for anything lacking the private
539 * attribute.
540 */
541 if (!PyTuple_CheckExact(codec)) {
542 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543 if (attr == NULL) {
544 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545 PyErr_Clear();
546 } else {
547 Py_DECREF(codec);
548 return NULL;
549 }
550 } else {
551 is_text_codec = PyObject_IsTrue(attr);
552 Py_DECREF(attr);
553 if (is_text_codec <= 0) {
554 Py_DECREF(codec);
555 if (!is_text_codec)
556 PyErr_Format(PyExc_LookupError,
557 "'%.400s' is not a text encoding; "
558 "use %s to handle arbitrary codecs",
559 encoding, alternate_command);
560 return NULL;
561 }
562 }
563 }
564
565 /* This appears to be a valid text encoding */
566 return codec;
567 }
568
569
570 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)571 PyObject *codec_getitem_checked(const char *encoding,
572 const char *alternate_command,
573 int index)
574 {
575 PyObject *codec;
576 PyObject *v;
577
578 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
579 if (codec == NULL)
580 return NULL;
581
582 v = PyTuple_GET_ITEM(codec, index);
583 Py_INCREF(v);
584 Py_DECREF(codec);
585 return v;
586 }
587
_PyCodec_TextEncoder(const char * encoding)588 static PyObject * _PyCodec_TextEncoder(const char *encoding)
589 {
590 return codec_getitem_checked(encoding, "codecs.encode()", 0);
591 }
592
_PyCodec_TextDecoder(const char * encoding)593 static PyObject * _PyCodec_TextDecoder(const char *encoding)
594 {
595 return codec_getitem_checked(encoding, "codecs.decode()", 1);
596 }
597
/* Encode object with the text encoder registered for encoding; errors
   is passed to the encoder when non-NULL. Returns NULL if the encoding
   is unknown, is not a text encoding, or the encoding step fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    /* _PyCodec_EncodeInternal() releases the encoder reference itself. */
    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
610
/* Decode object with the text decoder registered for encoding; errors
   is passed to the decoder when non-NULL. Returns NULL if the encoding
   is unknown, is not a text encoding, or the decoding step fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    /* _PyCodec_DecodeInternal() releases the decoder reference itself. */
    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
623
624 /* Register the error handling callback function error under the name
625 name. This function will be called by the codec when it encounters
626 an unencodable characters/undecodable bytes and doesn't know the
627 callback name, when name is specified as the error parameter
628 in the call to the encode/decode function.
629 Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)630 int PyCodec_RegisterError(const char *name, PyObject *error)
631 {
632 PyInterpreterState *interp = PyThreadState_GET()->interp;
633 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
634 return -1;
635 if (!PyCallable_Check(error)) {
636 PyErr_SetString(PyExc_TypeError, "handler must be callable");
637 return -1;
638 }
639 return PyDict_SetItemString(interp->codec_error_registry,
640 name, error);
641 }
642
643 /* Lookup the error handling callback function registered under the
644 name error. As a special case NULL can be passed, in which case
645 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)646 PyObject *PyCodec_LookupError(const char *name)
647 {
648 PyObject *handler = NULL;
649
650 PyInterpreterState *interp = PyThreadState_GET()->interp;
651 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
652 return NULL;
653
654 if (name==NULL)
655 name = "strict";
656 handler = PyDict_GetItemString(interp->codec_error_registry, name);
657 if (!handler)
658 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
659 else
660 Py_INCREF(handler);
661 return handler;
662 }
663
/* Set a TypeError reporting that an error callback was handed an
   object of a type it cannot handle; the message names the type of
   exc. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
670
/* "strict" error handler: re-raise exc when it is an exception
   instance, otherwise set a TypeError. Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
679
680
/* "ignore" error handler: for a UnicodeEncodeError, UnicodeDecodeError
   or UnicodeTranslateError return the tuple (empty string, end), where
   end is the exception's end position. Any other object results in a
   TypeError. Returns NULL on failure. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t resume_at;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &resume_at);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &resume_at);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &resume_at);
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed)
        return NULL;
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), resume_at);
}
703
704
/* "replace" error handler.

   - UnicodeEncodeError: returns (s, end) where s consists of one '?'
     for each position in [start, end).
   - UnicodeDecodeError: returns (U+FFFD, end).
   - UnicodeTranslateError: returns (s, end) where s consists of one
     U+FFFD for each position in [start, end).

   Any other object results in a TypeError. Returns NULL on failure. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end, count, pos;
    Py_UCS4 fill;
    PyObject *repl;
    int kind;
    void *data;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        fill = '?';
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        /* A single replacement character stands in for the whole
           undecodable range, so only the end position is needed. */
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start) ||
            PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        fill = Py_UNICODE_REPLACEMENT_CHARACTER;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Build a string of (end - start) copies of the fill character. */
    count = end - start;
    repl = PyUnicode_New(count, fill);
    if (repl == NULL)
        return NULL;
    kind = PyUnicode_KIND(repl);
    data = PyUnicode_DATA(repl);
    for (pos = 0; pos < count; pos++)
        PyUnicode_WRITE(kind, data, pos, fill);
    assert(_PyUnicode_CheckConsistency(repl, 1));
    return Py_BuildValue("(Nn)", repl, end);
}
759
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)760 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
761 {
762 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
763 PyObject *restuple;
764 PyObject *object;
765 Py_ssize_t i;
766 Py_ssize_t start;
767 Py_ssize_t end;
768 PyObject *res;
769 unsigned char *outp;
770 Py_ssize_t ressize;
771 Py_UCS4 ch;
772 if (PyUnicodeEncodeError_GetStart(exc, &start))
773 return NULL;
774 if (PyUnicodeEncodeError_GetEnd(exc, &end))
775 return NULL;
776 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
777 return NULL;
778 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
779 end = start + PY_SSIZE_T_MAX / (2+7+1);
780 for (i = start, ressize = 0; i < end; ++i) {
781 /* object is guaranteed to be "ready" */
782 ch = PyUnicode_READ_CHAR(object, i);
783 if (ch<10)
784 ressize += 2+1+1;
785 else if (ch<100)
786 ressize += 2+2+1;
787 else if (ch<1000)
788 ressize += 2+3+1;
789 else if (ch<10000)
790 ressize += 2+4+1;
791 else if (ch<100000)
792 ressize += 2+5+1;
793 else if (ch<1000000)
794 ressize += 2+6+1;
795 else
796 ressize += 2+7+1;
797 }
798 /* allocate replacement */
799 res = PyUnicode_New(ressize, 127);
800 if (res == NULL) {
801 Py_DECREF(object);
802 return NULL;
803 }
804 outp = PyUnicode_1BYTE_DATA(res);
805 /* generate replacement */
806 for (i = start; i < end; ++i) {
807 int digits;
808 int base;
809 ch = PyUnicode_READ_CHAR(object, i);
810 *outp++ = '&';
811 *outp++ = '#';
812 if (ch<10) {
813 digits = 1;
814 base = 1;
815 }
816 else if (ch<100) {
817 digits = 2;
818 base = 10;
819 }
820 else if (ch<1000) {
821 digits = 3;
822 base = 100;
823 }
824 else if (ch<10000) {
825 digits = 4;
826 base = 1000;
827 }
828 else if (ch<100000) {
829 digits = 5;
830 base = 10000;
831 }
832 else if (ch<1000000) {
833 digits = 6;
834 base = 100000;
835 }
836 else {
837 digits = 7;
838 base = 1000000;
839 }
840 while (digits-->0) {
841 *outp++ = '0' + ch/base;
842 ch %= base;
843 base /= 10;
844 }
845 *outp++ = ';';
846 }
847 assert(_PyUnicode_CheckConsistency(res, 1));
848 restuple = Py_BuildValue("(Nn)", res, end);
849 Py_DECREF(object);
850 return restuple;
851 }
852 else {
853 wrong_exception_type(exc);
854 return NULL;
855 }
856 }
857
/* "backslashreplace" error handler.

   - UnicodeDecodeError: each byte in [start, end) of the input is
     replaced with a \xNN escape.
   - UnicodeEncodeError / UnicodeTranslateError: each character in
     [start, end) is replaced with \xNN (code point < 0x100), \uNNNN
     (code point < 0x10000) or \UNNNNNNNN (anything larger).

   Returns the tuple (replacement, end). Any other object results in a
   TypeError. Returns NULL on failure. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Py_ssize_t rather than int: the range is only clamped to
       PY_SSIZE_T_MAX / 10 characters below, so the accumulated size can
       exceed INT_MAX. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* Every byte becomes the four characters "\xNN". */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Limit the range so that the worst case (10 output characters per
       input character) cannot overflow the result size. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes. Each branch emits the prefix and
       the leading hex digits; the last two digits are common to all
       three forms. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
967
968 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
969
PyCodec_NameReplaceErrors(PyObject * exc)970 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
971 {
972 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
973 PyObject *restuple;
974 PyObject *object;
975 Py_ssize_t i;
976 Py_ssize_t start;
977 Py_ssize_t end;
978 PyObject *res;
979 unsigned char *outp;
980 Py_ssize_t ressize;
981 int replsize;
982 Py_UCS4 c;
983 char buffer[256]; /* NAME_MAXLEN */
984 if (PyUnicodeEncodeError_GetStart(exc, &start))
985 return NULL;
986 if (PyUnicodeEncodeError_GetEnd(exc, &end))
987 return NULL;
988 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
989 return NULL;
990 if (!ucnhash_CAPI) {
991 /* load the unicode data module */
992 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
993 PyUnicodeData_CAPSULE_NAME, 1);
994 if (!ucnhash_CAPI)
995 return NULL;
996 }
997 for (i = start, ressize = 0; i < end; ++i) {
998 /* object is guaranteed to be "ready" */
999 c = PyUnicode_READ_CHAR(object, i);
1000 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1001 replsize = 1+1+1+(int)strlen(buffer)+1;
1002 }
1003 else if (c >= 0x10000) {
1004 replsize = 1+1+8;
1005 }
1006 else if (c >= 0x100) {
1007 replsize = 1+1+4;
1008 }
1009 else
1010 replsize = 1+1+2;
1011 if (ressize > PY_SSIZE_T_MAX - replsize)
1012 break;
1013 ressize += replsize;
1014 }
1015 end = i;
1016 res = PyUnicode_New(ressize, 127);
1017 if (res==NULL)
1018 return NULL;
1019 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1020 i < end; ++i) {
1021 c = PyUnicode_READ_CHAR(object, i);
1022 *outp++ = '\\';
1023 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1024 *outp++ = 'N';
1025 *outp++ = '{';
1026 strcpy((char *)outp, buffer);
1027 outp += strlen(buffer);
1028 *outp++ = '}';
1029 continue;
1030 }
1031 if (c >= 0x00010000) {
1032 *outp++ = 'U';
1033 *outp++ = Py_hexdigits[(c>>28)&0xf];
1034 *outp++ = Py_hexdigits[(c>>24)&0xf];
1035 *outp++ = Py_hexdigits[(c>>20)&0xf];
1036 *outp++ = Py_hexdigits[(c>>16)&0xf];
1037 *outp++ = Py_hexdigits[(c>>12)&0xf];
1038 *outp++ = Py_hexdigits[(c>>8)&0xf];
1039 }
1040 else if (c >= 0x100) {
1041 *outp++ = 'u';
1042 *outp++ = Py_hexdigits[(c>>12)&0xf];
1043 *outp++ = Py_hexdigits[(c>>8)&0xf];
1044 }
1045 else
1046 *outp++ = 'x';
1047 *outp++ = Py_hexdigits[(c>>4)&0xf];
1048 *outp++ = Py_hexdigits[c&0xf];
1049 }
1050
1051 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1052 assert(_PyUnicode_CheckConsistency(res, 1));
1053 restuple = Py_BuildValue("(Nn)", res, end);
1054 Py_DECREF(object);
1055 return restuple;
1056 }
1057 else {
1058 wrong_exception_type(exc);
1059 return NULL;
1060 }
1061 }
1062
/* Identifiers for the encodings recognized by get_standard_encoding(). */
#define ENC_UNKNOWN (-1)
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the ENC_* constants.

   Recognized spellings are "utf" (any case), an optional '-' or '_',
   then "8", "16" or "32"; for 16 and 32 an optional '-' or '_' followed
   by "be" or "le" (any case) may follow. Without such a suffix the
   byte order is chosen at compile time via WORDS_BIGENDIAN. The exact
   string "CP_UTF8" is also accepted as UTF-8.

   On a match, *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for
   UTF-32. When ENC_UNKNOWN is returned, *bytelength may or may not have
   been written and must not be relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Make sure the string has not ended (e.g. "utf-16-") before
               looking at encoding[1], which would lie past the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: do not read past the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1128
1129 /* This handler is declared static until someone demonstrates
1130 a need to call it directly. */
1131 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1132 PyCodec_SurrogatePassErrors(PyObject *exc)
1133 {
1134 PyObject *restuple;
1135 PyObject *object;
1136 PyObject *encode;
1137 char *encoding;
1138 int code;
1139 int bytelength;
1140 Py_ssize_t i;
1141 Py_ssize_t start;
1142 Py_ssize_t end;
1143 PyObject *res;
1144
1145 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1146 unsigned char *outp;
1147 if (PyUnicodeEncodeError_GetStart(exc, &start))
1148 return NULL;
1149 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1150 return NULL;
1151 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1152 return NULL;
1153 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1154 Py_DECREF(object);
1155 return NULL;
1156 }
1157 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1158 Py_DECREF(object);
1159 Py_DECREF(encode);
1160 return NULL;
1161 }
1162 code = get_standard_encoding(encoding, &bytelength);
1163 Py_DECREF(encode);
1164 if (code == ENC_UNKNOWN) {
1165 /* Not supported, fail with original exception */
1166 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1167 Py_DECREF(object);
1168 return NULL;
1169 }
1170
1171 if (end - start > PY_SSIZE_T_MAX / bytelength)
1172 end = start + PY_SSIZE_T_MAX / bytelength;
1173 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1174 if (!res) {
1175 Py_DECREF(object);
1176 return NULL;
1177 }
1178 outp = (unsigned char*)PyBytes_AsString(res);
1179 for (i = start; i < end; i++) {
1180 /* object is guaranteed to be "ready" */
1181 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1182 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1183 /* Not a surrogate, fail with original exception */
1184 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1185 Py_DECREF(res);
1186 Py_DECREF(object);
1187 return NULL;
1188 }
1189 switch (code) {
1190 case ENC_UTF8:
1191 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1192 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1193 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1194 break;
1195 case ENC_UTF16LE:
1196 *outp++ = (unsigned char) ch;
1197 *outp++ = (unsigned char)(ch >> 8);
1198 break;
1199 case ENC_UTF16BE:
1200 *outp++ = (unsigned char)(ch >> 8);
1201 *outp++ = (unsigned char) ch;
1202 break;
1203 case ENC_UTF32LE:
1204 *outp++ = (unsigned char) ch;
1205 *outp++ = (unsigned char)(ch >> 8);
1206 *outp++ = (unsigned char)(ch >> 16);
1207 *outp++ = (unsigned char)(ch >> 24);
1208 break;
1209 case ENC_UTF32BE:
1210 *outp++ = (unsigned char)(ch >> 24);
1211 *outp++ = (unsigned char)(ch >> 16);
1212 *outp++ = (unsigned char)(ch >> 8);
1213 *outp++ = (unsigned char) ch;
1214 break;
1215 }
1216 }
1217 restuple = Py_BuildValue("(On)", res, end);
1218 Py_DECREF(res);
1219 Py_DECREF(object);
1220 return restuple;
1221 }
1222 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1223 unsigned char *p;
1224 Py_UCS4 ch = 0;
1225 if (PyUnicodeDecodeError_GetStart(exc, &start))
1226 return NULL;
1227 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1228 return NULL;
1229 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1230 return NULL;
1231 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1232 Py_DECREF(object);
1233 return NULL;
1234 }
1235 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1236 Py_DECREF(object);
1237 return NULL;
1238 }
1239 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1240 Py_DECREF(object);
1241 Py_DECREF(encode);
1242 return NULL;
1243 }
1244 code = get_standard_encoding(encoding, &bytelength);
1245 Py_DECREF(encode);
1246 if (code == ENC_UNKNOWN) {
1247 /* Not supported, fail with original exception */
1248 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1249 Py_DECREF(object);
1250 return NULL;
1251 }
1252
1253 /* Try decoding a single surrogate character. If
1254 there are more, let the codec call us again. */
1255 p += start;
1256 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1257 switch (code) {
1258 case ENC_UTF8:
1259 if ((p[0] & 0xf0) == 0xe0 &&
1260 (p[1] & 0xc0) == 0x80 &&
1261 (p[2] & 0xc0) == 0x80) {
1262 /* it's a three-byte code */
1263 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1264 }
1265 break;
1266 case ENC_UTF16LE:
1267 ch = p[1] << 8 | p[0];
1268 break;
1269 case ENC_UTF16BE:
1270 ch = p[0] << 8 | p[1];
1271 break;
1272 case ENC_UTF32LE:
1273 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1274 break;
1275 case ENC_UTF32BE:
1276 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1277 break;
1278 }
1279 }
1280
1281 Py_DECREF(object);
1282 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1283 /* it's not a surrogate - fail */
1284 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1285 return NULL;
1286 }
1287 res = PyUnicode_FromOrdinal(ch);
1288 if (res == NULL)
1289 return NULL;
1290 return Py_BuildValue("(Nn)", res, start + bytelength);
1291 }
1292 else {
1293 wrong_exception_type(exc);
1294 return NULL;
1295 }
1296 }
1297
1298 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1299 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1300 {
1301 PyObject *restuple;
1302 PyObject *object;
1303 Py_ssize_t i;
1304 Py_ssize_t start;
1305 Py_ssize_t end;
1306 PyObject *res;
1307
1308 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1309 char *outp;
1310 if (PyUnicodeEncodeError_GetStart(exc, &start))
1311 return NULL;
1312 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1313 return NULL;
1314 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1315 return NULL;
1316 res = PyBytes_FromStringAndSize(NULL, end-start);
1317 if (!res) {
1318 Py_DECREF(object);
1319 return NULL;
1320 }
1321 outp = PyBytes_AsString(res);
1322 for (i = start; i < end; i++) {
1323 /* object is guaranteed to be "ready" */
1324 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1325 if (ch < 0xdc80 || ch > 0xdcff) {
1326 /* Not a UTF-8b surrogate, fail with original exception */
1327 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1328 Py_DECREF(res);
1329 Py_DECREF(object);
1330 return NULL;
1331 }
1332 *outp++ = ch - 0xdc00;
1333 }
1334 restuple = Py_BuildValue("(On)", res, end);
1335 Py_DECREF(res);
1336 Py_DECREF(object);
1337 return restuple;
1338 }
1339 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1340 PyObject *str;
1341 unsigned char *p;
1342 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1343 int consumed = 0;
1344 if (PyUnicodeDecodeError_GetStart(exc, &start))
1345 return NULL;
1346 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1347 return NULL;
1348 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1349 return NULL;
1350 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1351 Py_DECREF(object);
1352 return NULL;
1353 }
1354 while (consumed < 4 && consumed < end-start) {
1355 /* Refuse to escape ASCII bytes. */
1356 if (p[start+consumed] < 128)
1357 break;
1358 ch[consumed] = 0xdc00 + p[start+consumed];
1359 consumed++;
1360 }
1361 Py_DECREF(object);
1362 if (!consumed) {
1363 /* codec complained about ASCII byte. */
1364 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1365 return NULL;
1366 }
1367 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1368 if (str == NULL)
1369 return NULL;
1370 return Py_BuildValue("(Nn)", str, start+consumed);
1371 }
1372 else {
1373 wrong_exception_type(exc);
1374 return NULL;
1375 }
1376 }
1377
1378
/* METH_O adapter registered under the name "strict": forwards the
   exception object to PyCodec_StrictErrors().  `self` is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1383
1384
/* METH_O adapter registered under the name "ignore": forwards the
   exception object to PyCodec_IgnoreErrors().  `self` is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1389
1390
/* METH_O adapter registered under the name "replace": forwards the
   exception object to PyCodec_ReplaceErrors().  `self` is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1395
1396
/* METH_O adapter registered under the name "xmlcharrefreplace":
   forwards the exception object to PyCodec_XMLCharRefReplaceErrors().
   `self` is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1401
1402
/* METH_O adapter registered under the name "backslashreplace":
   forwards the exception object to PyCodec_BackslashReplaceErrors().
   `self` is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1407
/* METH_O adapter registered under the name "namereplace": forwards the
   exception object to PyCodec_NameReplaceErrors().  `self` is not
   used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1412
/* METH_O adapter registered under the name "surrogatepass": forwards
   the exception object to PyCodec_SurrogatePassErrors().  `self` is
   not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1417
/* METH_O adapter registered under the name "surrogateescape": forwards
   the exception object to PyCodec_SurrogateEscapeErrors().  `self` is
   not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1422
_PyCodecRegistry_Init(void)1423 static int _PyCodecRegistry_Init(void)
1424 {
1425 static struct {
1426 char *name;
1427 PyMethodDef def;
1428 } methods[] =
1429 {
1430 {
1431 "strict",
1432 {
1433 "strict_errors",
1434 strict_errors,
1435 METH_O,
1436 PyDoc_STR("Implements the 'strict' error handling, which "
1437 "raises a UnicodeError on coding errors.")
1438 }
1439 },
1440 {
1441 "ignore",
1442 {
1443 "ignore_errors",
1444 ignore_errors,
1445 METH_O,
1446 PyDoc_STR("Implements the 'ignore' error handling, which "
1447 "ignores malformed data and continues.")
1448 }
1449 },
1450 {
1451 "replace",
1452 {
1453 "replace_errors",
1454 replace_errors,
1455 METH_O,
1456 PyDoc_STR("Implements the 'replace' error handling, which "
1457 "replaces malformed data with a replacement marker.")
1458 }
1459 },
1460 {
1461 "xmlcharrefreplace",
1462 {
1463 "xmlcharrefreplace_errors",
1464 xmlcharrefreplace_errors,
1465 METH_O,
1466 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1467 "which replaces an unencodable character with the "
1468 "appropriate XML character reference.")
1469 }
1470 },
1471 {
1472 "backslashreplace",
1473 {
1474 "backslashreplace_errors",
1475 backslashreplace_errors,
1476 METH_O,
1477 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1478 "which replaces malformed data with a backslashed "
1479 "escape sequence.")
1480 }
1481 },
1482 {
1483 "namereplace",
1484 {
1485 "namereplace_errors",
1486 namereplace_errors,
1487 METH_O,
1488 PyDoc_STR("Implements the 'namereplace' error handling, "
1489 "which replaces an unencodable character with a "
1490 "\\N{...} escape sequence.")
1491 }
1492 },
1493 {
1494 "surrogatepass",
1495 {
1496 "surrogatepass",
1497 surrogatepass_errors,
1498 METH_O
1499 }
1500 },
1501 {
1502 "surrogateescape",
1503 {
1504 "surrogateescape",
1505 surrogateescape_errors,
1506 METH_O
1507 }
1508 }
1509 };
1510
1511 PyInterpreterState *interp = PyThreadState_GET()->interp;
1512 PyObject *mod;
1513 unsigned i;
1514
1515 if (interp->codec_search_path != NULL)
1516 return 0;
1517
1518 interp->codec_search_path = PyList_New(0);
1519 interp->codec_search_cache = PyDict_New();
1520 interp->codec_error_registry = PyDict_New();
1521
1522 if (interp->codec_error_registry) {
1523 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1524 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1525 int res;
1526 if (!func)
1527 Py_FatalError("can't initialize codec error registry");
1528 res = PyCodec_RegisterError(methods[i].name, func);
1529 Py_DECREF(func);
1530 if (res)
1531 Py_FatalError("can't initialize codec error registry");
1532 }
1533 }
1534
1535 if (interp->codec_search_path == NULL ||
1536 interp->codec_search_cache == NULL ||
1537 interp->codec_error_registry == NULL)
1538 Py_FatalError("can't initialize codec registry");
1539
1540 mod = PyImport_ImportModuleNoBlock("encodings");
1541 if (mod == NULL) {
1542 return -1;
1543 }
1544 Py_DECREF(mod);
1545 interp->codecs_initialized = 1;
1546 return 0;
1547 }
1548