1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include <ctype.h>
13
14 /* --- Codec Registry ----------------------------------------------------- */
15
16 /* Import the standard encodings package which will register the first
17 codec search function.
18
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
22 ImportErrors are silently ignored by this function. Only one try is
23 made.
24
25 */
26
27 static int _PyCodecRegistry_Init(void); /* Forward */
28
/* Append search_function to the interpreter's codec search path.

   The registry is initialized on demand if it does not exist yet.
   search_function must be a non-NULL callable.

   Returns the result of PyList_Append (0 on success), or -1 if the
   registry could not be initialized or the argument is rejected. */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    /* Make sure the registry exists before touching the search path. */
    if (state->codec_search_path == NULL && _PyCodecRegistry_Init() != 0)
        return -1;

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    if (PyCallable_Check(search_function) == 0) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}
47
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
50
51 static
normalizestring(const char * string)52 PyObject *normalizestring(const char *string)
53 {
54 register size_t i;
55 size_t len = strlen(string);
56 char *p;
57 PyObject *v;
58
59 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
63
64 v = PyString_FromStringAndSize(NULL, len);
65 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
68 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
73 ch = Py_TOLOWER(Py_CHARMASK(ch));
74 p[i] = ch;
75 }
76 return v;
77 }
78
79 /* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
86 If no codec is found, a LookupError is set and NULL returned.
87
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92 */
93
_PyCodec_Lookup(const char * encoding)94 PyObject *_PyCodec_Lookup(const char *encoding)
95 {
96 PyInterpreterState *interp;
97 PyObject *result, *args = NULL, *v;
98 Py_ssize_t i, len;
99
100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
104
105 interp = PyThreadState_GET()->interp;
106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
107 goto onError;
108
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v = normalizestring(encoding);
113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
118 result = PyDict_GetItem(interp->codec_search_cache, v);
119 if (result != NULL) {
120 Py_INCREF(result);
121 Py_DECREF(v);
122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
130
131 len = PyList_Size(interp->codec_search_path);
132 if (len < 0)
133 goto onError;
134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
144 func = PyList_GetItem(interp->codec_search_path, i);
145 if (func == NULL)
146 goto onError;
147 result = PyEval_CallObject(func, args);
148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
166 goto onError;
167 }
168
169 /* Cache and return the result */
170 PyDict_SetItem(interp->codec_search_cache, v, result);
171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177 }
178
179 static
args_tuple(PyObject * object,const char * errors)180 PyObject *args_tuple(PyObject *object,
181 const char *errors)
182 {
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201 }
202
203 /* Helper function to get a codec item */
204
205 static
codec_getitem(const char * encoding,int index)206 PyObject *codec_getitem(const char *encoding, int index)
207 {
208 PyObject *codecs;
209 PyObject *v;
210
211 codecs = _PyCodec_Lookup(encoding);
212 if (codecs == NULL)
213 return NULL;
214 v = PyTuple_GET_ITEM(codecs, index);
215 Py_DECREF(codecs);
216 Py_INCREF(v);
217 return v;
218 }
219
220 /* Helper functions to create an incremental codec. */
221 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)222 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
223 const char *errors,
224 const char *attrname)
225 {
226 PyObject *ret, *inccodec;
227
228 inccodec = PyObject_GetAttrString(codec_info, attrname);
229 if (inccodec == NULL)
230 return NULL;
231 if (errors)
232 ret = PyObject_CallFunction(inccodec, "s", errors);
233 else
234 ret = PyObject_CallFunction(inccodec, NULL);
235 Py_DECREF(inccodec);
236 return ret;
237 }
238
239 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)240 PyObject *codec_getincrementalcodec(const char *encoding,
241 const char *errors,
242 const char *attrname)
243 {
244 PyObject *codec_info, *ret;
245
246 codec_info = _PyCodec_Lookup(encoding);
247 if (codec_info == NULL)
248 return NULL;
249 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
250 Py_DECREF(codec_info);
251 return ret;
252 }
253
254 /* Helper function to create a stream codec. */
255
256 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)257 PyObject *codec_getstreamcodec(const char *encoding,
258 PyObject *stream,
259 const char *errors,
260 const int index)
261 {
262 PyObject *codecs, *streamcodec, *codeccls;
263
264 codecs = _PyCodec_Lookup(encoding);
265 if (codecs == NULL)
266 return NULL;
267
268 codeccls = PyTuple_GET_ITEM(codecs, index);
269 if (errors != NULL)
270 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
271 else
272 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
273 Py_DECREF(codecs);
274 return streamcodec;
275 }
276
277 /* Helpers to work with the result of _PyCodec_Lookup
278
279 */
/* Create an incremental decoder from an already looked-up codec_info
   object by calling its "incrementaldecoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
286
/* Create an incremental encoder from an already looked-up codec_info
   object by calling its "incrementalencoder" attribute. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
293
294
295 /* Convenience APIs to query the Codec registry.
296
297 All APIs return a codec object with incremented refcount.
298
299 */
300
PyCodec_Encoder(const char * encoding)301 PyObject *PyCodec_Encoder(const char *encoding)
302 {
303 return codec_getitem(encoding, 0);
304 }
305
PyCodec_Decoder(const char * encoding)306 PyObject *PyCodec_Decoder(const char *encoding)
307 {
308 return codec_getitem(encoding, 1);
309 }
310
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)311 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
312 const char *errors)
313 {
314 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
315 }
316
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)317 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
318 const char *errors)
319 {
320 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
321 }
322
/* Instantiate the stream codec stored at index 2 of the codec tuple
   registered for encoding, passing it stream (and errors when
   non-NULL). */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_slot = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_slot);
}
329
/* Instantiate the stream codec stored at index 3 of the codec tuple
   registered for encoding, passing it stream (and errors when
   non-NULL). */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_slot = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_slot);
}
336
337 /* Encode an object (e.g. a Unicode object) using the given encoding
338 and return the resulting encoded object (usually a Python string).
339
340 errors is passed to the encoder factory as argument if non-NULL. */
341
342 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)343 _PyCodec_EncodeInternal(PyObject *object,
344 PyObject *encoder,
345 const char *encoding,
346 const char *errors)
347 {
348 PyObject *args = NULL, *result = NULL;
349 PyObject *v;
350
351 args = args_tuple(object, errors);
352 if (args == NULL)
353 goto onError;
354
355 result = PyEval_CallObject(encoder,args);
356 if (result == NULL)
357 goto onError;
358
359 if (!PyTuple_Check(result) ||
360 PyTuple_GET_SIZE(result) != 2) {
361 PyErr_SetString(PyExc_TypeError,
362 "encoder must return a tuple (object,integer)");
363 goto onError;
364 }
365 v = PyTuple_GET_ITEM(result,0);
366 Py_INCREF(v);
367 /* We don't check or use the second (integer) entry. */
368
369 Py_DECREF(args);
370 Py_DECREF(encoder);
371 Py_DECREF(result);
372 return v;
373
374 onError:
375 Py_XDECREF(result);
376 Py_XDECREF(args);
377 Py_XDECREF(encoder);
378 return NULL;
379 }
380
381 /* Decode an object (usually a Python string) using the given encoding
382 and return an equivalent object (e.g. a Unicode object).
383
384 errors is passed to the decoder factory as argument if non-NULL. */
385
386 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)387 _PyCodec_DecodeInternal(PyObject *object,
388 PyObject *decoder,
389 const char *encoding,
390 const char *errors)
391 {
392 PyObject *args = NULL, *result = NULL;
393 PyObject *v;
394
395 args = args_tuple(object, errors);
396 if (args == NULL)
397 goto onError;
398
399 result = PyEval_CallObject(decoder,args);
400 if (result == NULL)
401 goto onError;
402 if (!PyTuple_Check(result) ||
403 PyTuple_GET_SIZE(result) != 2) {
404 PyErr_SetString(PyExc_TypeError,
405 "decoder must return a tuple (object,integer)");
406 goto onError;
407 }
408 v = PyTuple_GET_ITEM(result,0);
409 Py_INCREF(v);
410 /* We don't check or use the second (integer) entry. */
411
412 Py_DECREF(args);
413 Py_DECREF(decoder);
414 Py_DECREF(result);
415 return v;
416
417 onError:
418 Py_XDECREF(args);
419 Py_XDECREF(decoder);
420 Py_XDECREF(result);
421 return NULL;
422 }
423
424 /* Generic encoding/decoding API */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    /* The internal helper releases the encoder reference obtained
       here, so nothing is released in this function. */
    PyObject *encoder = PyCodec_Encoder(encoding);

    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
437
/* Decode object with the decoder registered for encoding.  Returns
   NULL if the decoder cannot be found or decoding fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    /* The internal helper releases the decoder reference obtained
       here, so nothing is released in this function. */
    PyObject *decoder = PyCodec_Decoder(encoding);

    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
450
451 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)452 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
453 const char *alternate_command)
454 {
455 PyObject *codec;
456 PyObject *attr;
457 int is_text_codec;
458
459 codec = _PyCodec_Lookup(encoding);
460 if (codec == NULL)
461 return NULL;
462
463 /* Backwards compatibility: assume any raw tuple describes a text
464 * encoding, and the same for anything lacking the private
465 * attribute.
466 */
467 if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
468 attr = PyObject_GetAttrString(codec, "_is_text_encoding");
469 if (attr == NULL) {
470 if (!PyErr_ExceptionMatches(PyExc_AttributeError))
471 goto onError;
472 PyErr_Clear();
473 } else {
474 is_text_codec = PyObject_IsTrue(attr);
475 Py_DECREF(attr);
476 if (is_text_codec < 0)
477 goto onError;
478 if (!is_text_codec) {
479 PyObject *msg = PyString_FromFormat(
480 "'%.400s' is not a text encoding; "
481 "use %s to handle arbitrary codecs",
482 encoding, alternate_command);
483 if (msg == NULL)
484 goto onError;
485 if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
486 Py_DECREF(msg);
487 goto onError;
488 }
489 Py_DECREF(msg);
490 }
491 }
492 }
493
494 /* This appears to be a valid text encoding */
495 return codec;
496
497 onError:
498 Py_DECREF(codec);
499 return NULL;
500 }
501
502
503 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)504 PyObject *codec_getitem_checked(const char *encoding,
505 const char *alternate_command,
506 int index)
507 {
508 PyObject *codec;
509 PyObject *v;
510
511 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
512 if (codec == NULL)
513 return NULL;
514
515 v = PyTuple_GET_ITEM(codec, index);
516 Py_INCREF(v);
517 Py_DECREF(codec);
518 return v;
519 }
520
_PyCodec_TextEncoder(const char * encoding)521 static PyObject * _PyCodec_TextEncoder(const char *encoding)
522 {
523 return codec_getitem_checked(encoding, "codecs.encode()", 0);
524 }
525
_PyCodec_TextDecoder(const char * encoding)526 static PyObject * _PyCodec_TextDecoder(const char *encoding)
527 {
528 return codec_getitem_checked(encoding, "codecs.decode()", 1);
529 }
530
/* Encode object with the encoder obtained through the text encoding
   check.  Returns NULL if the encoder cannot be obtained or encoding
   fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    /* The internal helper releases the encoder reference obtained
       here. */
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    return (encoder == NULL)
        ? NULL
        : _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
543
/* Decode object with the decoder obtained through the text encoding
   check.  Returns NULL if the decoder cannot be obtained or decoding
   fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    /* The internal helper releases the decoder reference obtained
       here. */
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    return (decoder == NULL)
        ? NULL
        : _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
556
/* Register the error handling callback function error under the name
   name. This function will be called by a codec when it encounters
   unencodable characters/undecodable bytes and name is specified as
   the error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)563 int PyCodec_RegisterError(const char *name, PyObject *error)
564 {
565 PyInterpreterState *interp = PyThreadState_GET()->interp;
566 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
567 return -1;
568 if (!PyCallable_Check(error)) {
569 PyErr_SetString(PyExc_TypeError, "handler must be callable");
570 return -1;
571 }
572 return PyDict_SetItemString(interp->codec_error_registry,
573 (char *)name, error);
574 }
575
/* Look up the error handling callback function registered under
   name. As a special case NULL can be passed, in which case
   the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)579 PyObject *PyCodec_LookupError(const char *name)
580 {
581 PyObject *handler = NULL;
582
583 PyInterpreterState *interp = PyThreadState_GET()->interp;
584 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
585 return NULL;
586
587 if (name==NULL)
588 name = "strict";
589 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
590 if (!handler)
591 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
592 else
593 Py_INCREF(handler);
594 return handler;
595 }
596
/* Set a TypeError naming the class of exc, for error callbacks that
   receive an exception type they cannot handle.  If the class name
   cannot be obtained, the function returns without setting the
   TypeError. */
static void wrong_exception_type(PyObject *exc)
{
    PyObject *cls;
    PyObject *cls_name;
    PyObject *text;

    cls = PyObject_GetAttrString(exc, "__class__");
    if (cls == NULL)
        return;

    cls_name = PyObject_GetAttrString(cls, "__name__");
    Py_DECREF(cls);
    if (cls_name == NULL)
        return;

    text = PyObject_Str(cls_name);
    Py_DECREF(cls_name);
    if (text == NULL)
        return;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.400s in error callback",
                 PyString_AS_STRING(text));
    Py_DECREF(text);
}
615
/* 'strict' error callback: re-raise exc itself when it is an exception
   instance, otherwise raise a TypeError.  Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
624
625
626 #ifdef Py_USING_UNICODE
/* 'ignore' error callback: skip the faulty input.

   exc must be a UnicodeEncodeError, UnicodeDecodeError or
   UnicodeTranslateError instance.  Returns the tuple (u'', end), where
   end is the end position stored in exc, or NULL with an exception set
   (a TypeError for any other kind of object). */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    PyObject *empty;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Build the empty replacement explicitly instead of handing
       Py_BuildValue("u#") an unrelated dummy pointer and a length
       argument whose expected C type depends on PY_SSIZE_T_CLEAN. */
    empty = PyUnicode_FromUnicode(NULL, 0);
    if (empty == NULL)
        return NULL;
    /* "N" passes our reference to the tuple. */
    return Py_BuildValue("(Nn)", empty, end);
}
650
651
/* 'replace' error callback: substitute a replacement marker.

   - UnicodeEncodeError: one '?' per position in [start, end).
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     position in [start, end).

   Returns the tuple (replacement, end), or NULL with an exception set
   (a TypeError for any other kind of object). */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    PyObject *restuple;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t i;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *res;
        Py_UNICODE *p;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        res = PyUnicode_FromUnicode(NULL, end-start);
        if (res == NULL)
            return NULL;
        for (p = PyUnicode_AS_UNICODE(res), i = start;
            i<end; ++p, ++i)
            *p = '?';
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        return restuple;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
        PyObject *repl;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        /* Create the one-character replacement explicitly: the C type
           Py_BuildValue expects for a "#" length (int or Py_ssize_t)
           depends on PY_SSIZE_T_CLEAN, so passing the length through
           varargs is easy to get wrong. */
        repl = PyUnicode_FromUnicode(&res, 1);
        if (repl == NULL)
            return NULL;
        /* "N" passes our reference to the tuple. */
        return Py_BuildValue("(Nn)", repl, end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        PyObject *res;
        Py_UNICODE *p;
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        res = PyUnicode_FromUnicode(NULL, end-start);
        if (res == NULL)
            return NULL;
        for (p = PyUnicode_AS_UNICODE(res), i = start;
            i<end; ++p, ++i)
            *p = Py_UNICODE_REPLACEMENT_CHARACTER;
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
704
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)705 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
706 {
707 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
708 PyObject *restuple;
709 PyObject *object;
710 Py_ssize_t start;
711 Py_ssize_t end;
712 PyObject *res;
713 Py_UNICODE *p;
714 Py_UNICODE *startp;
715 Py_UNICODE *e;
716 Py_UNICODE *outp;
717 Py_ssize_t ressize;
718 if (PyUnicodeEncodeError_GetStart(exc, &start))
719 return NULL;
720 if (PyUnicodeEncodeError_GetEnd(exc, &end))
721 return NULL;
722 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
723 return NULL;
724 startp = PyUnicode_AS_UNICODE(object);
725 if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
726 end = start + PY_SSIZE_T_MAX / (2+7+1);
727 #ifndef Py_UNICODE_WIDE
728 if (0xD800 <= startp[end - 1] && startp[end - 1] <= 0xDBFF)
729 end--;
730 #endif
731 }
732 e = startp + end;
733 for (p = startp+start, ressize = 0; p < e;) {
734 Py_UCS4 ch = *p++;
735 #ifndef Py_UNICODE_WIDE
736 if ((0xD800 <= ch && ch <= 0xDBFF) &&
737 (p < e) &&
738 (0xDC00 <= *p && *p <= 0xDFFF)) {
739 ch = ((((ch & 0x03FF) << 10) |
740 ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
741 }
742 #endif
743 if (ch < 10)
744 ressize += 2+1+1;
745 else if (ch < 100)
746 ressize += 2+2+1;
747 else if (ch < 1000)
748 ressize += 2+3+1;
749 else if (ch < 10000)
750 ressize += 2+4+1;
751 else if (ch < 100000)
752 ressize += 2+5+1;
753 else if (ch < 1000000)
754 ressize += 2+6+1;
755 else
756 ressize += 2+7+1;
757 }
758 /* allocate replacement */
759 res = PyUnicode_FromUnicode(NULL, ressize);
760 if (res == NULL) {
761 Py_DECREF(object);
762 return NULL;
763 }
764 /* generate replacement */
765 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
766 int digits;
767 int base;
768 Py_UCS4 ch = *p++;
769 #ifndef Py_UNICODE_WIDE
770 if ((0xD800 <= ch && ch <= 0xDBFF) &&
771 (p < startp+end) &&
772 (0xDC00 <= *p && *p <= 0xDFFF)) {
773 ch = ((((ch & 0x03FF) << 10) |
774 ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
775 }
776 #endif
777 *outp++ = '&';
778 *outp++ = '#';
779 if (ch < 10) {
780 digits = 1;
781 base = 1;
782 }
783 else if (ch < 100) {
784 digits = 2;
785 base = 10;
786 }
787 else if (ch < 1000) {
788 digits = 3;
789 base = 100;
790 }
791 else if (ch < 10000) {
792 digits = 4;
793 base = 1000;
794 }
795 else if (ch < 100000) {
796 digits = 5;
797 base = 10000;
798 }
799 else if (ch < 1000000) {
800 digits = 6;
801 base = 100000;
802 }
803 else {
804 digits = 7;
805 base = 1000000;
806 }
807 while (digits-->0) {
808 *outp++ = '0' + ch/base;
809 ch %= base;
810 base /= 10;
811 }
812 *outp++ = ';';
813 }
814 restuple = Py_BuildValue("(On)", res, end);
815 Py_DECREF(res);
816 Py_DECREF(object);
817 return restuple;
818 }
819 else {
820 wrong_exception_type(exc);
821 return NULL;
822 }
823 }
824
825 static Py_UNICODE hexdigits[] = {
826 '0', '1', '2', '3', '4', '5', '6', '7',
827 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
828 };
829
/* 'backslashreplace' error callback: replace every unit in
   [start, end) of the object stored in a UnicodeEncodeError with a
   backslash escape: \xNN below 0x100, \uNNNN from 0x100 upwards and,
   on wide builds, \UNNNNNNNN from 0x10000 upwards.

   Returns the tuple (replacement, end), or NULL with an exception set
   (a TypeError if exc is not a UnicodeEncodeError). */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *input;
    PyObject *escaped;
    PyObject *result;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t outlen;
    Py_UNICODE *data;
    Py_UNICODE *cur;
    Py_UNICODE *stop;
    Py_UNICODE *out;

    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    if (PyUnicodeEncodeError_GetStart(exc, &start))
        return NULL;
    if (PyUnicodeEncodeError_GetEnd(exc, &end))
        return NULL;
    input = PyUnicodeEncodeError_GetObject(exc);
    if (input == NULL)
        return NULL;

    /* Limit the range so that the output size (at most 1+1+8 units per
       input unit) cannot exceed PY_SSIZE_T_MAX. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    data = PyUnicode_AS_UNICODE(input);
    stop = data + end;

    /* First pass: compute the size of the replacement. */
    outlen = 0;
    for (cur = data + start; cur < stop; ++cur) {
#ifdef Py_UNICODE_WIDE
        if (*cur >= 0x00010000)
            outlen += 1+1+8;
        else
#endif
        if (*cur >= 0x100)
            outlen += 1+1+4;
        else
            outlen += 1+1+2;
    }

    escaped = PyUnicode_FromUnicode(NULL, outlen);
    if (escaped == NULL) {
        Py_DECREF(input);
        return NULL;
    }

    /* Second pass: write the escapes.  Each branch emits the prefix
       letter and the leading hex digits; the two lowest digits are
       shared by all forms and written after the branches. */
    out = PyUnicode_AS_UNICODE(escaped);
    for (cur = data + start; cur < stop; ++cur) {
        const Py_UNICODE c = *cur;

        *out++ = '\\';
#ifdef Py_UNICODE_WIDE
        if (c >= 0x00010000) {
            *out++ = 'U';
            *out++ = hexdigits[(c>>28)&0xf];
            *out++ = hexdigits[(c>>24)&0xf];
            *out++ = hexdigits[(c>>20)&0xf];
            *out++ = hexdigits[(c>>16)&0xf];
            *out++ = hexdigits[(c>>12)&0xf];
            *out++ = hexdigits[(c>>8)&0xf];
        }
        else
#endif
        if (c >= 0x100) {
            *out++ = 'u';
            *out++ = hexdigits[(c>>12)&0xf];
            *out++ = hexdigits[(c>>8)&0xf];
        }
        else {
            *out++ = 'x';
        }
        *out++ = hexdigits[(c>>4)&0xf];
        *out++ = hexdigits[c&0xf];
    }

    result = Py_BuildValue("(On)", escaped, end);
    Py_DECREF(escaped);
    Py_DECREF(input);
    return result;
}
905 #endif
906
/* Method-table adapter for the 'strict' handler: forwards exc to
   PyCodec_StrictErrors; self is not used. */
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_StrictErrors(exc);

    return outcome;
}
911
912
913 #ifdef Py_USING_UNICODE
/* Method-table adapter for the 'ignore' handler: forwards exc to
   PyCodec_IgnoreErrors; self is not used. */
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_IgnoreErrors(exc);

    return outcome;
}
918
919
/* Method-table adapter for the 'replace' handler: forwards exc to
   PyCodec_ReplaceErrors; self is not used. */
static PyObject *replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_ReplaceErrors(exc);

    return outcome;
}
924
925
/* Method-table adapter for the 'xmlcharrefreplace' handler: forwards
   exc to PyCodec_XMLCharRefReplaceErrors; self is not used. */
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_XMLCharRefReplaceErrors(exc);

    return outcome;
}
930
931
/* Method-table adapter for the 'backslashreplace' handler: forwards
   exc to PyCodec_BackslashReplaceErrors; self is not used. */
static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_BackslashReplaceErrors(exc);

    return outcome;
}
936 #endif
937
_PyCodecRegistry_Init(void)938 static int _PyCodecRegistry_Init(void)
939 {
940 static struct {
941 char *name;
942 PyMethodDef def;
943 } methods[] =
944 {
945 {
946 "strict",
947 {
948 "strict_errors",
949 strict_errors,
950 METH_O,
951 PyDoc_STR("Implements the 'strict' error handling, which "
952 "raises a UnicodeError on coding errors.")
953 }
954 },
955 #ifdef Py_USING_UNICODE
956 {
957 "ignore",
958 {
959 "ignore_errors",
960 ignore_errors,
961 METH_O,
962 PyDoc_STR("Implements the 'ignore' error handling, which "
963 "ignores malformed data and continues.")
964 }
965 },
966 {
967 "replace",
968 {
969 "replace_errors",
970 replace_errors,
971 METH_O,
972 PyDoc_STR("Implements the 'replace' error handling, which "
973 "replaces malformed data with a replacement marker.")
974 }
975 },
976 {
977 "xmlcharrefreplace",
978 {
979 "xmlcharrefreplace_errors",
980 xmlcharrefreplace_errors,
981 METH_O,
982 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
983 "which replaces an unencodable character with the "
984 "appropriate XML character reference.")
985 }
986 },
987 {
988 "backslashreplace",
989 {
990 "backslashreplace_errors",
991 backslashreplace_errors,
992 METH_O,
993 PyDoc_STR("Implements the 'backslashreplace' error handling, "
994 "which replaces an unencodable character with a "
995 "backslashed escape sequence.")
996 }
997 }
998 #endif
999 };
1000
1001 PyInterpreterState *interp = PyThreadState_GET()->interp;
1002 PyObject *mod;
1003 unsigned i;
1004
1005 if (interp->codec_search_path != NULL)
1006 return 0;
1007
1008 interp->codec_search_path = PyList_New(0);
1009 interp->codec_search_cache = PyDict_New();
1010 interp->codec_error_registry = PyDict_New();
1011
1012 if (interp->codec_error_registry) {
1013 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
1014 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1015 int res;
1016 if (!func)
1017 Py_FatalError("can't initialize codec error registry");
1018 res = PyCodec_RegisterError(methods[i].name, func);
1019 Py_DECREF(func);
1020 if (res)
1021 Py_FatalError("can't initialize codec error registry");
1022 }
1023 }
1024
1025 if (interp->codec_search_path == NULL ||
1026 interp->codec_search_cache == NULL ||
1027 interp->codec_error_registry == NULL)
1028 Py_FatalError("can't initialize codec registry");
1029
1030 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
1031 if (mod == NULL) {
1032 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
1033 /* Ignore ImportErrors... this is done so that
1034 distributions can disable the encodings package. Note
1035 that other errors are not masked, e.g. SystemErrors
1036 raised to inform the user of an error in the Python
1037 configuration are still reported back to the user. */
1038 PyErr_Clear();
1039 return 0;
1040 }
1041 return -1;
1042 }
1043 Py_DECREF(mod);
1044 return 0;
1045 }
1046