1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "ucnhash.h"
44 #include "bytes_methods.h"
45 #include "stringlib/eq.h"
46
47 #ifdef MS_WINDOWS
48 #include <windows.h>
49 #endif
50
51 /*[clinic input]
52 class str "PyUnicodeObject *" "&PyUnicode_Type"
53 [clinic start generated code]*/
54 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
55
56 /* --- Globals ------------------------------------------------------------
57
58 NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
61
62 */
63
64
65 #ifdef __cplusplus
66 extern "C" {
67 #endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied by the checked accessors below: debug builds run the
   structural consistency checker, other builds only test the type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* --- Unchecked field accessors -------------------------------------------
   Plain member expressions (no assertions), so they can also be assigned
   to. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   These assert _PyUnicode_CHECK(op) before reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory directly following the PyASCIIObject header; otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((char*)((PyASCIIObject*)(op) + 1))         \
          : _PyUnicode_UTF8(op)))

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   this is the string length itself; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((PyASCIIObject*)(op))->length              \
          : _PyUnicode_UTF8_LENGTH(op)))

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready(op)))
119
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. the UTF-8 representation is not a separate buffer.  Must not
   be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data, i.e. the wchar_t representation is not a separate buffer.
   Every use refers to the macro argument `op`, so the macro works with any
   argument expression, not only with a variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separate UTF-8 memory block: it is not
   compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separate wstr memory block: its wstr
   pointer is set and either the string is not ready yet, or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic helper macro that copies a run of characters while converting
   each one from `from_type` to `to_type` (both must be valid type names).
   `begin` and `end` delimit the source run and should be of type
   "from_type *"; `to` points to the destination buffer and should be of
   type "to_type *".  The bulk of the run is copied four characters per
   iteration, the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _count = _end - _iter;               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < _end; ++_iter, ++_to)            \
            *_to = (to_type) *_iter;                    \
    } while (0)
165
/* Platform-specific overallocation factor.  Going by the percentages below,
   the value presumably acts as a divisor (2 <=> 50%, 4 <=> 25%); confirm
   against the code that uses it. */
#if !defined(MS_WINDOWS)
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
173
174 /* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
181 */
182 static PyObject *interned = NULL;
183
184 /* The empty Unicode object is shared to improve performance. */
185 static PyObject *unicode_empty = NULL;
186
187 #define _Py_INCREF_UNICODE_EMPTY() \
188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
194 Py_INCREF(unicode_empty); \
195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
197 } \
198 } while (0)
199
200 #define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
205
206 /* Forward declaration */
207 static inline int
208 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
210 /* List of static strings. */
211 static _Py_Identifier *static_strings = NULL;
212
213 /* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
215 static PyObject *unicode_latin1[256] = {NULL};
216
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0x00..0x7F; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
247
248 /* forward */
249 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
250 static PyObject* get_latin1_char(unsigned char ch);
251 static int unicode_modifiable(PyObject *unicode);
252
253
254 static PyObject *
255 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
256 static PyObject *
257 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258 static PyObject *
259 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261 static PyObject *
262 unicode_encode_call_errorhandler(const char *errors,
263 PyObject **errorHandler,const char *encoding, const char *reason,
264 PyObject *unicode, PyObject **exceptionObject,
265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
267 static void
268 raise_encode_exception(PyObject **exceptionObject,
269 const char *encoding,
270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
273
/* Fast detection of line break characters in the ASCII range.

   Lookup table indexed by code point 0x00..0x7F; an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
301
302 #include "clinic/unicodeobject.c.h"
303
/* Codec error handlers that get_error_handler() can identify by name. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler constant.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching constant for
   the other names listed in the table below (exact, case-sensitive match),
   and _Py_ERROR_OTHER for any other name. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
343 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
345 Py_UNICODE
PyUnicode_GetMax(void)346 PyUnicode_GetMax(void)
347 {
348 #ifdef Py_UNICODE_WIDE
349 return 0x10FFFF;
350 #else
351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
354 #endif
355 }
356
#ifdef Py_DEBUG
/* Debug-build checker: assert that the internal fields of the string `op`
   agree with each other (kind, compact/ascii/ready flags, data, utf8 and
   wstr pointers and their lengths).

   If check_content is non-zero and the string is not in the legacy wchar_t
   form, additionally scan the characters and assert that the narrowest
   suitable kind is used and that the data is followed by a null character.

   Always returns 1, so it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    /* kind whose character data has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that is not ready: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that is ready: data is a separate block */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data exactly when the kind is wchar_t-sized */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing representation must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472
473 static PyObject*
unicode_result_wchar(PyObject * unicode)474 unicode_result_wchar(PyObject *unicode)
475 {
476 #ifndef Py_DEBUG
477 Py_ssize_t len;
478
479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
481 Py_DECREF(unicode);
482 _Py_RETURN_UNICODE_EMPTY();
483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
487 if ((Py_UCS4)ch < 256) {
488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
495 Py_DECREF(unicode);
496 return NULL;
497 }
498 #else
499 assert(Py_REFCNT(unicode) == 1);
500
501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504 #endif
505 return unicode;
506 }
507
508 static PyObject*
unicode_result_ready(PyObject * unicode)509 unicode_result_ready(PyObject *unicode)
510 {
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
516 Py_DECREF(unicode);
517 _Py_RETURN_UNICODE_EMPTY();
518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546 }
547
548 static PyObject*
unicode_result(PyObject * unicode)549 unicode_result(PyObject *unicode)
550 {
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556 }
557
558 static PyObject*
unicode_result_unchanged(PyObject * unicode)559 unicode_result_unchanged(PyObject *unicode)
560 {
561 if (PyUnicode_CheckExact(unicode)) {
562 if (PyUnicode_READY(unicode) == -1)
563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
569 return _PyUnicode_Copy(unicode);
570 }
571
572 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)575 backslashreplace(_PyBytesWriter *writer, char *str,
576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577 {
578 Py_ssize_t size, i;
579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
599 incr = 2+8;
600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
625 }
626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
635 }
636 return str;
637 }
638
639 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)642 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644 {
645 Py_ssize_t size, i;
646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693 }
694
695 /* --- Bloom Filters ----------------------------------------------------- */
696
697 /* stuff to implement simple "bloom filters" for Unicode characters.
698 to keep things simple, we use a single bitmask, using the least 5
699 bits from each unicode characters as the bit index. */
700
701 /* the linebreak mask is set up by Unicode_Init below */
702
703 #if LONG_BIT >= 128
704 #define BLOOM_WIDTH 128
705 #elif LONG_BIT >= 64
706 #define BLOOM_WIDTH 64
707 #elif LONG_BIT >= 32
708 #define BLOOM_WIDTH 32
709 #else
710 #error "LONG_BIT is smaller than 32"
711 #endif
712
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters.  It starts with every bit set, so
   until it is replaced BLOOM() reports a possible match for any character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` is set in `mask`, i.e. if `ch` may
   be one of the characters the mask was built from.  Both arguments are
   parenthesized so that any expression can be passed (for example
   BLOOM(a | b, ch)). */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if `ch` is a line break: table lookup below 128, otherwise the
   bloom mask is consulted before calling Py_UNICODE_ISLINEBREAK().
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
722
723 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)724 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
725 {
726 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
738 /* calculate simple bloom-style bitmask for a given unicode string */
739
740 BLOOM_MASK mask;
741
742 mask = 0;
743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
756 return mask;
757
758 #undef BLOOM_UPDATE
759 }
760
761 static int
ensure_unicode(PyObject * obj)762 ensure_unicode(PyObject *obj)
763 {
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771 }
772
773 /* Compilation of templated routines */
774
775 #include "stringlib/asciilib.h"
776 #include "stringlib/fastsearch.h"
777 #include "stringlib/partition.h"
778 #include "stringlib/split.h"
779 #include "stringlib/count.h"
780 #include "stringlib/find.h"
781 #include "stringlib/find_max_char.h"
782 #include "stringlib/localeutil.h"
783 #include "stringlib/undef.h"
784
785 #include "stringlib/ucs1lib.h"
786 #include "stringlib/fastsearch.h"
787 #include "stringlib/partition.h"
788 #include "stringlib/split.h"
789 #include "stringlib/count.h"
790 #include "stringlib/find.h"
791 #include "stringlib/replace.h"
792 #include "stringlib/find_max_char.h"
793 #include "stringlib/localeutil.h"
794 #include "stringlib/undef.h"
795
796 #include "stringlib/ucs2lib.h"
797 #include "stringlib/fastsearch.h"
798 #include "stringlib/partition.h"
799 #include "stringlib/split.h"
800 #include "stringlib/count.h"
801 #include "stringlib/find.h"
802 #include "stringlib/replace.h"
803 #include "stringlib/find_max_char.h"
804 #include "stringlib/localeutil.h"
805 #include "stringlib/undef.h"
806
807 #include "stringlib/ucs4lib.h"
808 #include "stringlib/fastsearch.h"
809 #include "stringlib/partition.h"
810 #include "stringlib/split.h"
811 #include "stringlib/count.h"
812 #include "stringlib/find.h"
813 #include "stringlib/replace.h"
814 #include "stringlib/find_max_char.h"
815 #include "stringlib/localeutil.h"
816 #include "stringlib/undef.h"
817
818 #include "stringlib/unicodedefs.h"
819 #include "stringlib/fastsearch.h"
820 #include "stringlib/count.h"
821 #include "stringlib/find.h"
822 #include "stringlib/undef.h"
823
824 /* --- Unicode Object ----------------------------------------------------- */
825
826 static PyObject *
827 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
828
829 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)830 findchar(const void *s, int kind,
831 Py_ssize_t size, Py_UCS4 ch,
832 int direction)
833 {
834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
836 if ((Py_UCS1) ch != ch)
837 return -1;
838 if (direction > 0)
839 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840 else
841 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
842 case PyUnicode_2BYTE_KIND:
843 if ((Py_UCS2) ch != ch)
844 return -1;
845 if (direction > 0)
846 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847 else
848 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
849 case PyUnicode_4BYTE_KIND:
850 if (direction > 0)
851 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852 else
853 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
854 default:
855 assert(0);
856 return -1;
857 }
858 }
859
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Every byte of the characters from index old_length up to (not including)
   the current length is set to 0xff; nothing is written if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);    /* kind used as byte width */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
878
879 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)880 resize_compact(PyObject *unicode, Py_ssize_t length)
881 {
882 Py_ssize_t char_size;
883 Py_ssize_t struct_size;
884 Py_ssize_t new_size;
885 int share_wstr;
886 PyObject *new_unicode;
887 #ifdef Py_DEBUG
888 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889 #endif
890
891 assert(unicode_modifiable(unicode));
892 assert(PyUnicode_IS_READY(unicode));
893 assert(PyUnicode_IS_COMPACT(unicode));
894
895 char_size = PyUnicode_KIND(unicode);
896 if (PyUnicode_IS_ASCII(unicode))
897 struct_size = sizeof(PyASCIIObject);
898 else
899 struct_size = sizeof(PyCompactUnicodeObject);
900 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
901
902 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903 PyErr_NoMemory();
904 return NULL;
905 }
906 new_size = (struct_size + (length + 1) * char_size);
907
908 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909 PyObject_DEL(_PyUnicode_UTF8(unicode));
910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
912 }
913 _Py_DEC_REFTOTAL;
914 _Py_ForgetReference(unicode);
915
916 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
917 if (new_unicode == NULL) {
918 _Py_NewReference(unicode);
919 PyErr_NoMemory();
920 return NULL;
921 }
922 unicode = new_unicode;
923 _Py_NewReference(unicode);
924
925 _PyUnicode_LENGTH(unicode) = length;
926 if (share_wstr) {
927 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
928 if (!PyUnicode_IS_ASCII(unicode))
929 _PyUnicode_WSTR_LENGTH(unicode) = length;
930 }
931 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932 PyObject_DEL(_PyUnicode_WSTR(unicode));
933 _PyUnicode_WSTR(unicode) = NULL;
934 if (!PyUnicode_IS_ASCII(unicode))
935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
936 }
937 #ifdef Py_DEBUG
938 unicode_fill_invalid(unicode, old_length);
939 #endif
940 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941 length, 0);
942 assert(_PyUnicode_CheckConsistency(unicode, 0));
943 return unicode;
944 }
945
946 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)947 resize_inplace(PyObject *unicode, Py_ssize_t length)
948 {
949 wchar_t *wstr;
950 Py_ssize_t new_size;
951 assert(!PyUnicode_IS_COMPACT(unicode));
952 assert(Py_REFCNT(unicode) == 1);
953
954 if (PyUnicode_IS_READY(unicode)) {
955 Py_ssize_t char_size;
956 int share_wstr, share_utf8;
957 void *data;
958 #ifdef Py_DEBUG
959 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960 #endif
961
962 data = _PyUnicode_DATA_ANY(unicode);
963 char_size = PyUnicode_KIND(unicode);
964 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
966
967 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968 PyErr_NoMemory();
969 return -1;
970 }
971 new_size = (length + 1) * char_size;
972
973 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974 {
975 PyObject_DEL(_PyUnicode_UTF8(unicode));
976 _PyUnicode_UTF8(unicode) = NULL;
977 _PyUnicode_UTF8_LENGTH(unicode) = 0;
978 }
979
980 data = (PyObject *)PyObject_REALLOC(data, new_size);
981 if (data == NULL) {
982 PyErr_NoMemory();
983 return -1;
984 }
985 _PyUnicode_DATA_ANY(unicode) = data;
986 if (share_wstr) {
987 _PyUnicode_WSTR(unicode) = data;
988 _PyUnicode_WSTR_LENGTH(unicode) = length;
989 }
990 if (share_utf8) {
991 _PyUnicode_UTF8(unicode) = data;
992 _PyUnicode_UTF8_LENGTH(unicode) = length;
993 }
994 _PyUnicode_LENGTH(unicode) = length;
995 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
996 #ifdef Py_DEBUG
997 unicode_fill_invalid(unicode, old_length);
998 #endif
999 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1000 assert(_PyUnicode_CheckConsistency(unicode, 0));
1001 return 0;
1002 }
1003 }
1004 assert(_PyUnicode_WSTR(unicode) != NULL);
1005
1006 /* check for integer overflow */
1007 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1008 PyErr_NoMemory();
1009 return -1;
1010 }
1011 new_size = sizeof(wchar_t) * (length + 1);
1012 wstr = _PyUnicode_WSTR(unicode);
1013 wstr = PyObject_REALLOC(wstr, new_size);
1014 if (!wstr) {
1015 PyErr_NoMemory();
1016 return -1;
1017 }
1018 _PyUnicode_WSTR(unicode) = wstr;
1019 _PyUnicode_WSTR(unicode)[length] = 0;
1020 _PyUnicode_WSTR_LENGTH(unicode) = length;
1021 assert(_PyUnicode_CheckConsistency(unicode, 0));
1022 return 0;
1023 }
1024
1025 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1026 resize_copy(PyObject *unicode, Py_ssize_t length)
1027 {
1028 Py_ssize_t copy_length;
1029 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1030 PyObject *copy;
1031
1032 if (PyUnicode_READY(unicode) == -1)
1033 return NULL;
1034
1035 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036 if (copy == NULL)
1037 return NULL;
1038
1039 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1040 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1041 return copy;
1042 }
1043 else {
1044 PyObject *w;
1045
1046 w = (PyObject*)_PyUnicode_New(length);
1047 if (w == NULL)
1048 return NULL;
1049 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050 copy_length = Py_MIN(copy_length, length);
1051 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1052 copy_length * sizeof(wchar_t));
1053 return w;
1054 }
1055 }
1056
1057 /* We allocate one more byte to make sure the string is
1058 Ux0000 terminated; some code (e.g. new_identifier)
1059 relies on that.
1060
1061 XXX This allocator could further be enhanced by assuring that the
1062 free list never reduces its size below 1.
1063
1064 */
1065
1066 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1067 _PyUnicode_New(Py_ssize_t length)
1068 {
1069 PyUnicodeObject *unicode;
1070 size_t new_size;
1071
1072 /* Optimization for empty strings */
1073 if (length == 0 && unicode_empty != NULL) {
1074 Py_INCREF(unicode_empty);
1075 return (PyUnicodeObject*)unicode_empty;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1080 return (PyUnicodeObject *)PyErr_NoMemory();
1081 }
1082 if (length < 0) {
1083 PyErr_SetString(PyExc_SystemError,
1084 "Negative size passed to _PyUnicode_New");
1085 return NULL;
1086 }
1087
1088 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089 if (unicode == NULL)
1090 return NULL;
1091 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1092
1093 _PyUnicode_WSTR_LENGTH(unicode) = length;
1094 _PyUnicode_HASH(unicode) = -1;
1095 _PyUnicode_STATE(unicode).interned = 0;
1096 _PyUnicode_STATE(unicode).kind = 0;
1097 _PyUnicode_STATE(unicode).compact = 0;
1098 _PyUnicode_STATE(unicode).ready = 0;
1099 _PyUnicode_STATE(unicode).ascii = 0;
1100 _PyUnicode_DATA_ANY(unicode) = NULL;
1101 _PyUnicode_LENGTH(unicode) = 0;
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104
1105 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106 if (!_PyUnicode_WSTR(unicode)) {
1107 Py_DECREF(unicode);
1108 PyErr_NoMemory();
1109 return NULL;
1110 }
1111
1112 /* Initialize the first element to guard against cases where
1113 * the caller fails before initializing str -- unicode_resize()
1114 * reads str[0], and the Keep-Alive optimization can keep memory
1115 * allocated for str alive across a call to unicode_dealloc(unicode).
1116 * We don't want unicode_resize to read uninitialized memory in
1117 * that case.
1118 */
1119 _PyUnicode_WSTR(unicode)[0] = 0;
1120 _PyUnicode_WSTR(unicode)[length] = 0;
1121
1122 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1123 return unicode;
1124 }
1125
1126 static const char*
unicode_kind_name(PyObject * unicode)1127 unicode_kind_name(PyObject *unicode)
1128 {
1129 /* don't check consistency: unicode_kind_name() is called from
1130 _PyUnicode_Dump() */
1131 if (!PyUnicode_IS_COMPACT(unicode))
1132 {
1133 if (!PyUnicode_IS_READY(unicode))
1134 return "wstr";
1135 switch (PyUnicode_KIND(unicode))
1136 {
1137 case PyUnicode_1BYTE_KIND:
1138 if (PyUnicode_IS_ASCII(unicode))
1139 return "legacy ascii";
1140 else
1141 return "legacy latin1";
1142 case PyUnicode_2BYTE_KIND:
1143 return "legacy UCS2";
1144 case PyUnicode_4BYTE_KIND:
1145 return "legacy UCS4";
1146 default:
1147 return "<legacy invalid kind>";
1148 }
1149 }
1150 assert(PyUnicode_IS_READY(unicode));
1151 switch (PyUnicode_KIND(unicode)) {
1152 case PyUnicode_1BYTE_KIND:
1153 if (PyUnicode_IS_ASCII(unicode))
1154 return "ascii";
1155 else
1156 return "latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "UCS4";
1161 default:
1162 return "<invalid compact kind>";
1163 }
1164 }
1165
1166 #ifdef Py_DEBUG
1167 /* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
_PyUnicode_data(void * unicode)1175 void *_PyUnicode_data(void *unicode){
1176 printf("obj %p\n", unicode);
1177 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182 return PyUnicode_DATA(unicode);
1183 }
1184
1185 void
_PyUnicode_Dump(PyObject * op)1186 _PyUnicode_Dump(PyObject *op)
1187 {
1188 PyASCIIObject *ascii = (PyASCIIObject *)op;
1189 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191 void *data;
1192
1193 if (ascii->state.compact)
1194 {
1195 if (ascii->state.ascii)
1196 data = (ascii + 1);
1197 else
1198 data = (compact + 1);
1199 }
1200 else
1201 data = unicode->data.any;
1202 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203 unicode_kind_name(op), ascii->length);
1204
1205 if (ascii->wstr == data)
1206 printf("shared ");
1207 printf("wstr=%p", ascii->wstr);
1208
1209 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1210 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1211 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212 printf("shared ");
1213 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214 compact->utf8, compact->utf8_length);
1215 }
1216 printf(", data=%p\n", data);
1217 }
1218 #endif
1219
1220 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1221 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222 {
1223 PyObject *obj;
1224 PyCompactUnicodeObject *unicode;
1225 void *data;
1226 enum PyUnicode_Kind kind;
1227 int is_sharing, is_ascii;
1228 Py_ssize_t char_size;
1229 Py_ssize_t struct_size;
1230
1231 /* Optimization for empty strings */
1232 if (size == 0 && unicode_empty != NULL) {
1233 Py_INCREF(unicode_empty);
1234 return unicode_empty;
1235 }
1236
1237 is_ascii = 0;
1238 is_sharing = 0;
1239 struct_size = sizeof(PyCompactUnicodeObject);
1240 if (maxchar < 128) {
1241 kind = PyUnicode_1BYTE_KIND;
1242 char_size = 1;
1243 is_ascii = 1;
1244 struct_size = sizeof(PyASCIIObject);
1245 }
1246 else if (maxchar < 256) {
1247 kind = PyUnicode_1BYTE_KIND;
1248 char_size = 1;
1249 }
1250 else if (maxchar < 65536) {
1251 kind = PyUnicode_2BYTE_KIND;
1252 char_size = 2;
1253 if (sizeof(wchar_t) == 2)
1254 is_sharing = 1;
1255 }
1256 else {
1257 if (maxchar > MAX_UNICODE) {
1258 PyErr_SetString(PyExc_SystemError,
1259 "invalid maximum character passed to PyUnicode_New");
1260 return NULL;
1261 }
1262 kind = PyUnicode_4BYTE_KIND;
1263 char_size = 4;
1264 if (sizeof(wchar_t) == 4)
1265 is_sharing = 1;
1266 }
1267
1268 /* Ensure we won't overflow the size. */
1269 if (size < 0) {
1270 PyErr_SetString(PyExc_SystemError,
1271 "Negative size passed to PyUnicode_New");
1272 return NULL;
1273 }
1274 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275 return PyErr_NoMemory();
1276
1277 /* Duplicated allocation code from _PyObject_New() instead of a call to
1278 * PyObject_New() so we are able to allocate space for the object and
1279 * it's data buffer.
1280 */
1281 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282 if (obj == NULL)
1283 return PyErr_NoMemory();
1284 obj = PyObject_INIT(obj, &PyUnicode_Type);
1285 if (obj == NULL)
1286 return NULL;
1287
1288 unicode = (PyCompactUnicodeObject *)obj;
1289 if (is_ascii)
1290 data = ((PyASCIIObject*)obj) + 1;
1291 else
1292 data = unicode + 1;
1293 _PyUnicode_LENGTH(unicode) = size;
1294 _PyUnicode_HASH(unicode) = -1;
1295 _PyUnicode_STATE(unicode).interned = 0;
1296 _PyUnicode_STATE(unicode).kind = kind;
1297 _PyUnicode_STATE(unicode).compact = 1;
1298 _PyUnicode_STATE(unicode).ready = 1;
1299 _PyUnicode_STATE(unicode).ascii = is_ascii;
1300 if (is_ascii) {
1301 ((char*)data)[size] = 0;
1302 _PyUnicode_WSTR(unicode) = NULL;
1303 }
1304 else if (kind == PyUnicode_1BYTE_KIND) {
1305 ((char*)data)[size] = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1308 unicode->utf8 = NULL;
1309 unicode->utf8_length = 0;
1310 }
1311 else {
1312 unicode->utf8 = NULL;
1313 unicode->utf8_length = 0;
1314 if (kind == PyUnicode_2BYTE_KIND)
1315 ((Py_UCS2*)data)[size] = 0;
1316 else /* kind == PyUnicode_4BYTE_KIND */
1317 ((Py_UCS4*)data)[size] = 0;
1318 if (is_sharing) {
1319 _PyUnicode_WSTR_LENGTH(unicode) = size;
1320 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321 }
1322 else {
1323 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324 _PyUnicode_WSTR(unicode) = NULL;
1325 }
1326 }
1327 #ifdef Py_DEBUG
1328 unicode_fill_invalid((PyObject*)unicode, 0);
1329 #endif
1330 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1331 return obj;
1332 }
1333
1334 #if SIZEOF_WCHAR_T == 2
1335 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1336 will decode surrogate pairs, the other conversions are implemented as macros
1337 for efficiency.
1338
1339 This function assumes that unicode can hold one more code point than wstr
1340 characters for a terminating null character. */
1341 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1342 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1343 PyObject *unicode)
1344 {
1345 const wchar_t *iter;
1346 Py_UCS4 *ucs4_out;
1347
1348 assert(unicode != NULL);
1349 assert(_PyUnicode_CHECK(unicode));
1350 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1351 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1352
1353 for (iter = begin; iter < end; ) {
1354 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1355 _PyUnicode_GET_LENGTH(unicode)));
1356 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1357 && (iter+1) < end
1358 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1359 {
1360 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1361 iter += 2;
1362 }
1363 else {
1364 *ucs4_out++ = *iter;
1365 iter++;
1366 }
1367 }
1368 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1369 _PyUnicode_GET_LENGTH(unicode)));
1370
1371 }
1372 #endif
1373
1374 static int
unicode_check_modifiable(PyObject * unicode)1375 unicode_check_modifiable(PyObject *unicode)
1376 {
1377 if (!unicode_modifiable(unicode)) {
1378 PyErr_SetString(PyExc_SystemError,
1379 "Cannot modify a string currently used");
1380 return -1;
1381 }
1382 return 0;
1383 }
1384
1385 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1386 _copy_characters(PyObject *to, Py_ssize_t to_start,
1387 PyObject *from, Py_ssize_t from_start,
1388 Py_ssize_t how_many, int check_maxchar)
1389 {
1390 unsigned int from_kind, to_kind;
1391 void *from_data, *to_data;
1392
1393 assert(0 <= how_many);
1394 assert(0 <= from_start);
1395 assert(0 <= to_start);
1396 assert(PyUnicode_Check(from));
1397 assert(PyUnicode_IS_READY(from));
1398 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1399
1400 assert(PyUnicode_Check(to));
1401 assert(PyUnicode_IS_READY(to));
1402 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403
1404 if (how_many == 0)
1405 return 0;
1406
1407 from_kind = PyUnicode_KIND(from);
1408 from_data = PyUnicode_DATA(from);
1409 to_kind = PyUnicode_KIND(to);
1410 to_data = PyUnicode_DATA(to);
1411
1412 #ifdef Py_DEBUG
1413 if (!check_maxchar
1414 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415 {
1416 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417 Py_UCS4 ch;
1418 Py_ssize_t i;
1419 for (i=0; i < how_many; i++) {
1420 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421 assert(ch <= to_maxchar);
1422 }
1423 }
1424 #endif
1425
1426 if (from_kind == to_kind) {
1427 if (check_maxchar
1428 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429 {
1430 /* Writing Latin-1 characters into an ASCII string requires to
1431 check that all written characters are pure ASCII */
1432 Py_UCS4 max_char;
1433 max_char = ucs1lib_find_max_char(from_data,
1434 (Py_UCS1*)from_data + how_many);
1435 if (max_char >= 128)
1436 return -1;
1437 }
1438 memcpy((char*)to_data + to_kind * to_start,
1439 (char*)from_data + from_kind * from_start,
1440 to_kind * how_many);
1441 }
1442 else if (from_kind == PyUnicode_1BYTE_KIND
1443 && to_kind == PyUnicode_2BYTE_KIND)
1444 {
1445 _PyUnicode_CONVERT_BYTES(
1446 Py_UCS1, Py_UCS2,
1447 PyUnicode_1BYTE_DATA(from) + from_start,
1448 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449 PyUnicode_2BYTE_DATA(to) + to_start
1450 );
1451 }
1452 else if (from_kind == PyUnicode_1BYTE_KIND
1453 && to_kind == PyUnicode_4BYTE_KIND)
1454 {
1455 _PyUnicode_CONVERT_BYTES(
1456 Py_UCS1, Py_UCS4,
1457 PyUnicode_1BYTE_DATA(from) + from_start,
1458 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459 PyUnicode_4BYTE_DATA(to) + to_start
1460 );
1461 }
1462 else if (from_kind == PyUnicode_2BYTE_KIND
1463 && to_kind == PyUnicode_4BYTE_KIND)
1464 {
1465 _PyUnicode_CONVERT_BYTES(
1466 Py_UCS2, Py_UCS4,
1467 PyUnicode_2BYTE_DATA(from) + from_start,
1468 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469 PyUnicode_4BYTE_DATA(to) + to_start
1470 );
1471 }
1472 else {
1473 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474
1475 if (!check_maxchar) {
1476 if (from_kind == PyUnicode_2BYTE_KIND
1477 && to_kind == PyUnicode_1BYTE_KIND)
1478 {
1479 _PyUnicode_CONVERT_BYTES(
1480 Py_UCS2, Py_UCS1,
1481 PyUnicode_2BYTE_DATA(from) + from_start,
1482 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483 PyUnicode_1BYTE_DATA(to) + to_start
1484 );
1485 }
1486 else if (from_kind == PyUnicode_4BYTE_KIND
1487 && to_kind == PyUnicode_1BYTE_KIND)
1488 {
1489 _PyUnicode_CONVERT_BYTES(
1490 Py_UCS4, Py_UCS1,
1491 PyUnicode_4BYTE_DATA(from) + from_start,
1492 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493 PyUnicode_1BYTE_DATA(to) + to_start
1494 );
1495 }
1496 else if (from_kind == PyUnicode_4BYTE_KIND
1497 && to_kind == PyUnicode_2BYTE_KIND)
1498 {
1499 _PyUnicode_CONVERT_BYTES(
1500 Py_UCS4, Py_UCS2,
1501 PyUnicode_4BYTE_DATA(from) + from_start,
1502 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503 PyUnicode_2BYTE_DATA(to) + to_start
1504 );
1505 }
1506 else {
1507 assert(0);
1508 return -1;
1509 }
1510 }
1511 else {
1512 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1513 Py_UCS4 ch;
1514 Py_ssize_t i;
1515
1516 for (i=0; i < how_many; i++) {
1517 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1518 if (ch > to_maxchar)
1519 return -1;
1520 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521 }
1522 }
1523 }
1524 return 0;
1525 }
1526
1527 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1528 _PyUnicode_FastCopyCharacters(
1529 PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1531 {
1532 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533 }
1534
1535 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1536 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537 PyObject *from, Py_ssize_t from_start,
1538 Py_ssize_t how_many)
1539 {
1540 int err;
1541
1542 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543 PyErr_BadInternalCall();
1544 return -1;
1545 }
1546
1547 if (PyUnicode_READY(from) == -1)
1548 return -1;
1549 if (PyUnicode_READY(to) == -1)
1550 return -1;
1551
1552 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1553 PyErr_SetString(PyExc_IndexError, "string index out of range");
1554 return -1;
1555 }
1556 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1557 PyErr_SetString(PyExc_IndexError, "string index out of range");
1558 return -1;
1559 }
1560 if (how_many < 0) {
1561 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562 return -1;
1563 }
1564 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1565 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566 PyErr_Format(PyExc_SystemError,
1567 "Cannot write %zi characters at %zi "
1568 "in a string of %zi characters",
1569 how_many, to_start, PyUnicode_GET_LENGTH(to));
1570 return -1;
1571 }
1572
1573 if (how_many == 0)
1574 return 0;
1575
1576 if (unicode_check_modifiable(to))
1577 return -1;
1578
1579 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580 if (err) {
1581 PyErr_Format(PyExc_SystemError,
1582 "Cannot copy %s characters "
1583 "into a string of %s characters",
1584 unicode_kind_name(from),
1585 unicode_kind_name(to));
1586 return -1;
1587 }
1588 return how_many;
1589 }
1590
1591 /* Find the maximum code point and count the number of surrogate pairs so a
1592 correct string length can be computed before converting a string to UCS4.
1593 This function counts single surrogates as a character and not as a pair.
1594
1595 Return 0 on success, or -1 on error. */
1596 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1597 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1599 {
1600 const wchar_t *iter;
1601 Py_UCS4 ch;
1602
1603 assert(num_surrogates != NULL && maxchar != NULL);
1604 *num_surrogates = 0;
1605 *maxchar = 0;
1606
1607 for (iter = begin; iter < end; ) {
1608 #if SIZEOF_WCHAR_T == 2
1609 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610 && (iter+1) < end
1611 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612 {
1613 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614 ++(*num_surrogates);
1615 iter += 2;
1616 }
1617 else
1618 #endif
1619 {
1620 ch = *iter;
1621 iter++;
1622 }
1623 if (ch > *maxchar) {
1624 *maxchar = ch;
1625 if (*maxchar > MAX_UNICODE) {
1626 PyErr_Format(PyExc_ValueError,
1627 "character U+%x is not in range [U+0000; U+10ffff]",
1628 ch);
1629 return -1;
1630 }
1631 }
1632 }
1633 return 0;
1634 }
1635
/* Turn a legacy string that only has a wchar_t (wstr) buffer into a ready
   string with a canonical 1-, 2- or 4-byte data buffer.

   The narrowest kind able to hold the largest character is selected.
   Depending on sizeof(wchar_t), the wstr buffer is either reused as the
   canonical buffer (same element width) or converted into a newly
   allocated buffer and then freed.

   Return 0 on success.  Return -1 with an exception set if the buffer
   contains a character above MAX_UNICODE (reported by
   find_maxchar_surrogates()) or if memory cannot be allocated. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wstr buffer yields both the widest character and
       the number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: narrow into a new 1-byte buffer (plus terminator)
       and drop the wstr buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer is also used as the UTF-8
               representation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair becomes a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte buffer (applies to both variants above). */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1764
1765 static void
unicode_dealloc(PyObject * unicode)1766 unicode_dealloc(PyObject *unicode)
1767 {
1768 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1769 case SSTATE_NOT_INTERNED:
1770 break;
1771
1772 case SSTATE_INTERNED_MORTAL:
1773 /* revive dead object temporarily for DelItem */
1774 Py_REFCNT(unicode) = 3;
1775 if (PyDict_DelItem(interned, unicode) != 0)
1776 Py_FatalError(
1777 "deletion of interned string failed");
1778 break;
1779
1780 case SSTATE_INTERNED_IMMORTAL:
1781 Py_FatalError("Immortal interned string died.");
1782
1783 default:
1784 Py_FatalError("Inconsistent interned string state.");
1785 }
1786
1787 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1788 PyObject_DEL(_PyUnicode_WSTR(unicode));
1789 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1790 PyObject_DEL(_PyUnicode_UTF8(unicode));
1791 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1793
1794 Py_TYPE(unicode)->tp_free(unicode);
1795 }
1796
1797 #ifdef Py_DEBUG
1798 static int
unicode_is_singleton(PyObject * unicode)1799 unicode_is_singleton(PyObject *unicode)
1800 {
1801 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1802 if (unicode == unicode_empty)
1803 return 1;
1804 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1805 {
1806 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1807 if (ch < 256 && unicode_latin1[ch] == unicode)
1808 return 1;
1809 }
1810 return 0;
1811 }
1812 #endif
1813
1814 static int
unicode_modifiable(PyObject * unicode)1815 unicode_modifiable(PyObject *unicode)
1816 {
1817 assert(_PyUnicode_CHECK(unicode));
1818 if (Py_REFCNT(unicode) != 1)
1819 return 0;
1820 if (_PyUnicode_HASH(unicode) != -1)
1821 return 0;
1822 if (PyUnicode_CHECK_INTERNED(unicode))
1823 return 0;
1824 if (!PyUnicode_CheckExact(unicode))
1825 return 0;
1826 #ifdef Py_DEBUG
1827 /* singleton refcount is greater than 1 */
1828 assert(!unicode_is_singleton(unicode));
1829 #endif
1830 return 1;
1831 }
1832
1833 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1834 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835 {
1836 PyObject *unicode;
1837 Py_ssize_t old_length;
1838
1839 assert(p_unicode != NULL);
1840 unicode = *p_unicode;
1841
1842 assert(unicode != NULL);
1843 assert(PyUnicode_Check(unicode));
1844 assert(0 <= length);
1845
1846 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1847 old_length = PyUnicode_WSTR_LENGTH(unicode);
1848 else
1849 old_length = PyUnicode_GET_LENGTH(unicode);
1850 if (old_length == length)
1851 return 0;
1852
1853 if (length == 0) {
1854 _Py_INCREF_UNICODE_EMPTY();
1855 if (!unicode_empty)
1856 return -1;
1857 Py_SETREF(*p_unicode, unicode_empty);
1858 return 0;
1859 }
1860
1861 if (!unicode_modifiable(unicode)) {
1862 PyObject *copy = resize_copy(unicode, length);
1863 if (copy == NULL)
1864 return -1;
1865 Py_SETREF(*p_unicode, copy);
1866 return 0;
1867 }
1868
1869 if (PyUnicode_IS_COMPACT(unicode)) {
1870 PyObject *new_unicode = resize_compact(unicode, length);
1871 if (new_unicode == NULL)
1872 return -1;
1873 *p_unicode = new_unicode;
1874 return 0;
1875 }
1876 return resize_inplace(unicode, length);
1877 }
1878
1879 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1880 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1881 {
1882 PyObject *unicode;
1883 if (p_unicode == NULL) {
1884 PyErr_BadInternalCall();
1885 return -1;
1886 }
1887 unicode = *p_unicode;
1888 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1889 {
1890 PyErr_BadInternalCall();
1891 return -1;
1892 }
1893 return unicode_resize(p_unicode, length);
1894 }
1895
1896 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1897
1898 WARNING: The function doesn't copy the terminating null character and
1899 doesn't check the maximum character (may write a latin1 character in an
1900 ASCII string). */
1901 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1902 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903 const char *str, Py_ssize_t len)
1904 {
1905 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906 void *data = PyUnicode_DATA(unicode);
1907 const char *end = str + len;
1908
1909 switch (kind) {
1910 case PyUnicode_1BYTE_KIND: {
1911 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1912 #ifdef Py_DEBUG
1913 if (PyUnicode_IS_ASCII(unicode)) {
1914 Py_UCS4 maxchar = ucs1lib_find_max_char(
1915 (const Py_UCS1*)str,
1916 (const Py_UCS1*)str + len);
1917 assert(maxchar < 128);
1918 }
1919 #endif
1920 memcpy((char *) data + index, str, len);
1921 break;
1922 }
1923 case PyUnicode_2BYTE_KIND: {
1924 Py_UCS2 *start = (Py_UCS2 *)data + index;
1925 Py_UCS2 *ucs2 = start;
1926 assert(index <= PyUnicode_GET_LENGTH(unicode));
1927
1928 for (; str < end; ++ucs2, ++str)
1929 *ucs2 = (Py_UCS2)*str;
1930
1931 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1932 break;
1933 }
1934 default: {
1935 Py_UCS4 *start = (Py_UCS4 *)data + index;
1936 Py_UCS4 *ucs4 = start;
1937 assert(kind == PyUnicode_4BYTE_KIND);
1938 assert(index <= PyUnicode_GET_LENGTH(unicode));
1939
1940 for (; str < end; ++ucs4, ++str)
1941 *ucs4 = (Py_UCS4)*str;
1942
1943 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1944 }
1945 }
1946 }
1947
1948 static PyObject*
get_latin1_char(unsigned char ch)1949 get_latin1_char(unsigned char ch)
1950 {
1951 PyObject *unicode = unicode_latin1[ch];
1952 if (!unicode) {
1953 unicode = PyUnicode_New(1, ch);
1954 if (!unicode)
1955 return NULL;
1956 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1957 assert(_PyUnicode_CheckConsistency(unicode, 1));
1958 unicode_latin1[ch] = unicode;
1959 }
1960 Py_INCREF(unicode);
1961 return unicode;
1962 }
1963
1964 static PyObject*
unicode_char(Py_UCS4 ch)1965 unicode_char(Py_UCS4 ch)
1966 {
1967 PyObject *unicode;
1968
1969 assert(ch <= MAX_UNICODE);
1970
1971 if (ch < 256)
1972 return get_latin1_char(ch);
1973
1974 unicode = PyUnicode_New(1, ch);
1975 if (unicode == NULL)
1976 return NULL;
1977 switch (PyUnicode_KIND(unicode)) {
1978 case PyUnicode_1BYTE_KIND:
1979 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980 break;
1981 case PyUnicode_2BYTE_KIND:
1982 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983 break;
1984 default:
1985 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987 }
1988 assert(_PyUnicode_CheckConsistency(unicode, 1));
1989 return unicode;
1990 }
1991
1992 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)1993 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1994 {
1995 PyObject *unicode;
1996 Py_UCS4 maxchar = 0;
1997 Py_ssize_t num_surrogates;
1998
1999 if (u == NULL)
2000 return (PyObject*)_PyUnicode_New(size);
2001
2002 /* If the Unicode data is known at construction time, we can apply
2003 some optimizations which share commonly used objects. */
2004
2005 /* Optimization for empty strings */
2006 if (size == 0)
2007 _Py_RETURN_UNICODE_EMPTY();
2008
2009 /* Single character Unicode objects in the Latin-1 range are
2010 shared when using this constructor */
2011 if (size == 1 && (Py_UCS4)*u < 256)
2012 return get_latin1_char((unsigned char)*u);
2013
2014 /* If not empty and not single character, copy the Unicode data
2015 into the new object */
2016 if (find_maxchar_surrogates(u, u + size,
2017 &maxchar, &num_surrogates) == -1)
2018 return NULL;
2019
2020 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2021 if (!unicode)
2022 return NULL;
2023
2024 switch (PyUnicode_KIND(unicode)) {
2025 case PyUnicode_1BYTE_KIND:
2026 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2027 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028 break;
2029 case PyUnicode_2BYTE_KIND:
2030 #if Py_UNICODE_SIZE == 2
2031 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2032 #else
2033 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2034 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035 #endif
2036 break;
2037 case PyUnicode_4BYTE_KIND:
2038 #if SIZEOF_WCHAR_T == 2
2039 /* This is the only case which has to process surrogates, thus
2040 a simple copy loop is not enough and we need a function. */
2041 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2042 #else
2043 assert(num_surrogates == 0);
2044 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2045 #endif
2046 break;
2047 default:
2048 assert(0 && "Impossible state");
2049 }
2050
2051 return unicode_result(unicode);
2052 }
2053
2054 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2055 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2056 {
2057 if (size < 0) {
2058 PyErr_SetString(PyExc_SystemError,
2059 "Negative size passed to PyUnicode_FromStringAndSize");
2060 return NULL;
2061 }
2062 if (u != NULL)
2063 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064 else
2065 return (PyObject *)_PyUnicode_New(size);
2066 }
2067
2068 PyObject *
PyUnicode_FromString(const char * u)2069 PyUnicode_FromString(const char *u)
2070 {
2071 size_t size = strlen(u);
2072 if (size > PY_SSIZE_T_MAX) {
2073 PyErr_SetString(PyExc_OverflowError, "input too long");
2074 return NULL;
2075 }
2076 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2077 }
2078
2079 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2080 _PyUnicode_FromId(_Py_Identifier *id)
2081 {
2082 if (!id->object) {
2083 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084 strlen(id->string),
2085 NULL, NULL);
2086 if (!id->object)
2087 return NULL;
2088 PyUnicode_InternInPlace(&id->object);
2089 assert(!id->next);
2090 id->next = static_strings;
2091 static_strings = id;
2092 }
2093 return id->object;
2094 }
2095
2096 void
_PyUnicode_ClearStaticStrings()2097 _PyUnicode_ClearStaticStrings()
2098 {
2099 _Py_Identifier *tmp, *s = static_strings;
2100 while (s) {
2101 Py_CLEAR(s->object);
2102 tmp = s->next;
2103 s->next = NULL;
2104 s = tmp;
2105 }
2106 static_strings = NULL;
2107 }
2108
2109 /* Internal function, doesn't check maximum character */
2110
2111 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2112 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2113 {
2114 const unsigned char *s = (const unsigned char *)buffer;
2115 PyObject *unicode;
2116 if (size == 1) {
2117 #ifdef Py_DEBUG
2118 assert((unsigned char)s[0] < 128);
2119 #endif
2120 return get_latin1_char(s[0]);
2121 }
2122 unicode = PyUnicode_New(size, 127);
2123 if (!unicode)
2124 return NULL;
2125 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126 assert(_PyUnicode_CheckConsistency(unicode, 1));
2127 return unicode;
2128 }
2129
2130 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2131 kind_maxchar_limit(unsigned int kind)
2132 {
2133 switch (kind) {
2134 case PyUnicode_1BYTE_KIND:
2135 return 0x80;
2136 case PyUnicode_2BYTE_KIND:
2137 return 0x100;
2138 case PyUnicode_4BYTE_KIND:
2139 return 0x10000;
2140 default:
2141 assert(0 && "invalid kind");
2142 return MAX_UNICODE;
2143 }
2144 }
2145
2146 static inline Py_UCS4
align_maxchar(Py_UCS4 maxchar)2147 align_maxchar(Py_UCS4 maxchar)
2148 {
2149 if (maxchar <= 127)
2150 return 127;
2151 else if (maxchar <= 255)
2152 return 255;
2153 else if (maxchar <= 65535)
2154 return 65535;
2155 else
2156 return MAX_UNICODE;
2157 }
2158
2159 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2160 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2161 {
2162 PyObject *res;
2163 unsigned char max_char;
2164
2165 if (size == 0)
2166 _Py_RETURN_UNICODE_EMPTY();
2167 assert(size > 0);
2168 if (size == 1)
2169 return get_latin1_char(u[0]);
2170
2171 max_char = ucs1lib_find_max_char(u, u + size);
2172 res = PyUnicode_New(size, max_char);
2173 if (!res)
2174 return NULL;
2175 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2176 assert(_PyUnicode_CheckConsistency(res, 1));
2177 return res;
2178 }
2179
2180 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2181 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2182 {
2183 PyObject *res;
2184 Py_UCS2 max_char;
2185
2186 if (size == 0)
2187 _Py_RETURN_UNICODE_EMPTY();
2188 assert(size > 0);
2189 if (size == 1)
2190 return unicode_char(u[0]);
2191
2192 max_char = ucs2lib_find_max_char(u, u + size);
2193 res = PyUnicode_New(size, max_char);
2194 if (!res)
2195 return NULL;
2196 if (max_char >= 256)
2197 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2198 else {
2199 _PyUnicode_CONVERT_BYTES(
2200 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201 }
2202 assert(_PyUnicode_CheckConsistency(res, 1));
2203 return res;
2204 }
2205
2206 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2207 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2208 {
2209 PyObject *res;
2210 Py_UCS4 max_char;
2211
2212 if (size == 0)
2213 _Py_RETURN_UNICODE_EMPTY();
2214 assert(size > 0);
2215 if (size == 1)
2216 return unicode_char(u[0]);
2217
2218 max_char = ucs4lib_find_max_char(u, u + size);
2219 res = PyUnicode_New(size, max_char);
2220 if (!res)
2221 return NULL;
2222 if (max_char < 256)
2223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224 PyUnicode_1BYTE_DATA(res));
2225 else if (max_char < 0x10000)
2226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227 PyUnicode_2BYTE_DATA(res));
2228 else
2229 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2230 assert(_PyUnicode_CheckConsistency(res, 1));
2231 return res;
2232 }
2233
2234 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2235 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236 {
2237 if (size < 0) {
2238 PyErr_SetString(PyExc_ValueError, "size must be positive");
2239 return NULL;
2240 }
2241 switch (kind) {
2242 case PyUnicode_1BYTE_KIND:
2243 return _PyUnicode_FromUCS1(buffer, size);
2244 case PyUnicode_2BYTE_KIND:
2245 return _PyUnicode_FromUCS2(buffer, size);
2246 case PyUnicode_4BYTE_KIND:
2247 return _PyUnicode_FromUCS4(buffer, size);
2248 default:
2249 PyErr_SetString(PyExc_SystemError, "invalid kind");
2250 return NULL;
2251 }
2252 }
2253
2254 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2255 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256 {
2257 enum PyUnicode_Kind kind;
2258 void *startptr, *endptr;
2259
2260 assert(PyUnicode_IS_READY(unicode));
2261 assert(0 <= start);
2262 assert(end <= PyUnicode_GET_LENGTH(unicode));
2263 assert(start <= end);
2264
2265 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266 return PyUnicode_MAX_CHAR_VALUE(unicode);
2267
2268 if (start == end)
2269 return 127;
2270
2271 if (PyUnicode_IS_ASCII(unicode))
2272 return 127;
2273
2274 kind = PyUnicode_KIND(unicode);
2275 startptr = PyUnicode_DATA(unicode);
2276 endptr = (char *)startptr + end * kind;
2277 startptr = (char *)startptr + start * kind;
2278 switch(kind) {
2279 case PyUnicode_1BYTE_KIND:
2280 return ucs1lib_find_max_char(startptr, endptr);
2281 case PyUnicode_2BYTE_KIND:
2282 return ucs2lib_find_max_char(startptr, endptr);
2283 case PyUnicode_4BYTE_KIND:
2284 return ucs4lib_find_max_char(startptr, endptr);
2285 default:
2286 assert(0);
2287 return 0;
2288 }
2289 }
2290
2291 /* Ensure that a string uses the most efficient storage, if it is not the
2292 case: create a new string with of the right kind. Write NULL into *p_unicode
2293 on error. */
2294 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2295 unicode_adjust_maxchar(PyObject **p_unicode)
2296 {
2297 PyObject *unicode, *copy;
2298 Py_UCS4 max_char;
2299 Py_ssize_t len;
2300 unsigned int kind;
2301
2302 assert(p_unicode != NULL);
2303 unicode = *p_unicode;
2304 assert(PyUnicode_IS_READY(unicode));
2305 if (PyUnicode_IS_ASCII(unicode))
2306 return;
2307
2308 len = PyUnicode_GET_LENGTH(unicode);
2309 kind = PyUnicode_KIND(unicode);
2310 if (kind == PyUnicode_1BYTE_KIND) {
2311 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2312 max_char = ucs1lib_find_max_char(u, u + len);
2313 if (max_char >= 128)
2314 return;
2315 }
2316 else if (kind == PyUnicode_2BYTE_KIND) {
2317 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2318 max_char = ucs2lib_find_max_char(u, u + len);
2319 if (max_char >= 256)
2320 return;
2321 }
2322 else {
2323 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2324 assert(kind == PyUnicode_4BYTE_KIND);
2325 max_char = ucs4lib_find_max_char(u, u + len);
2326 if (max_char >= 0x10000)
2327 return;
2328 }
2329 copy = PyUnicode_New(len, max_char);
2330 if (copy != NULL)
2331 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2332 Py_DECREF(unicode);
2333 *p_unicode = copy;
2334 }
2335
2336 PyObject*
_PyUnicode_Copy(PyObject * unicode)2337 _PyUnicode_Copy(PyObject *unicode)
2338 {
2339 Py_ssize_t length;
2340 PyObject *copy;
2341
2342 if (!PyUnicode_Check(unicode)) {
2343 PyErr_BadInternalCall();
2344 return NULL;
2345 }
2346 if (PyUnicode_READY(unicode) == -1)
2347 return NULL;
2348
2349 length = PyUnicode_GET_LENGTH(unicode);
2350 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2351 if (!copy)
2352 return NULL;
2353 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354
2355 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2356 length * PyUnicode_KIND(unicode));
2357 assert(_PyUnicode_CheckConsistency(copy, 1));
2358 return copy;
2359 }
2360
2361
2362 /* Widen Unicode objects to larger buffers. Don't write terminating null
2363 character. Return NULL on error. */
2364
2365 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2366 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367 {
2368 Py_ssize_t len;
2369 void *result;
2370 unsigned int skind;
2371
2372 if (PyUnicode_READY(s) == -1)
2373 return NULL;
2374
2375 len = PyUnicode_GET_LENGTH(s);
2376 skind = PyUnicode_KIND(s);
2377 if (skind >= kind) {
2378 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2379 return NULL;
2380 }
2381 switch (kind) {
2382 case PyUnicode_2BYTE_KIND:
2383 result = PyMem_New(Py_UCS2, len);
2384 if (!result)
2385 return PyErr_NoMemory();
2386 assert(skind == PyUnicode_1BYTE_KIND);
2387 _PyUnicode_CONVERT_BYTES(
2388 Py_UCS1, Py_UCS2,
2389 PyUnicode_1BYTE_DATA(s),
2390 PyUnicode_1BYTE_DATA(s) + len,
2391 result);
2392 return result;
2393 case PyUnicode_4BYTE_KIND:
2394 result = PyMem_New(Py_UCS4, len);
2395 if (!result)
2396 return PyErr_NoMemory();
2397 if (skind == PyUnicode_2BYTE_KIND) {
2398 _PyUnicode_CONVERT_BYTES(
2399 Py_UCS2, Py_UCS4,
2400 PyUnicode_2BYTE_DATA(s),
2401 PyUnicode_2BYTE_DATA(s) + len,
2402 result);
2403 }
2404 else {
2405 assert(skind == PyUnicode_1BYTE_KIND);
2406 _PyUnicode_CONVERT_BYTES(
2407 Py_UCS1, Py_UCS4,
2408 PyUnicode_1BYTE_DATA(s),
2409 PyUnicode_1BYTE_DATA(s) + len,
2410 result);
2411 }
2412 return result;
2413 default:
2414 break;
2415 }
2416 PyErr_SetString(PyExc_SystemError, "invalid kind");
2417 return NULL;
2418 }
2419
2420 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2421 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422 int copy_null)
2423 {
2424 int kind;
2425 void *data;
2426 Py_ssize_t len, targetlen;
2427 if (PyUnicode_READY(string) == -1)
2428 return NULL;
2429 kind = PyUnicode_KIND(string);
2430 data = PyUnicode_DATA(string);
2431 len = PyUnicode_GET_LENGTH(string);
2432 targetlen = len;
2433 if (copy_null)
2434 targetlen++;
2435 if (!target) {
2436 target = PyMem_New(Py_UCS4, targetlen);
2437 if (!target) {
2438 PyErr_NoMemory();
2439 return NULL;
2440 }
2441 }
2442 else {
2443 if (targetsize < targetlen) {
2444 PyErr_Format(PyExc_SystemError,
2445 "string is longer than the buffer");
2446 if (copy_null && 0 < targetsize)
2447 target[0] = 0;
2448 return NULL;
2449 }
2450 }
2451 if (kind == PyUnicode_1BYTE_KIND) {
2452 Py_UCS1 *start = (Py_UCS1 *) data;
2453 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 Py_UCS2 *start = (Py_UCS2 *) data;
2457 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458 }
2459 else {
2460 assert(kind == PyUnicode_4BYTE_KIND);
2461 memcpy(target, data, len * sizeof(Py_UCS4));
2462 }
2463 if (copy_null)
2464 target[len] = 0;
2465 return target;
2466 }
2467
2468 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2469 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470 int copy_null)
2471 {
2472 if (target == NULL || targetsize < 0) {
2473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
2476 return as_ucs4(string, target, targetsize, copy_null);
2477 }
2478
2479 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2480 PyUnicode_AsUCS4Copy(PyObject *string)
2481 {
2482 return as_ucs4(string, NULL, 0, 1);
2483 }
2484
2485 #ifdef HAVE_WCHAR_H
2486
2487 PyObject *
PyUnicode_FromWideChar(const wchar_t * w,Py_ssize_t size)2488 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2489 {
2490 if (w == NULL) {
2491 if (size == 0)
2492 _Py_RETURN_UNICODE_EMPTY();
2493 PyErr_BadInternalCall();
2494 return NULL;
2495 }
2496
2497 if (size == -1) {
2498 size = wcslen(w);
2499 }
2500
2501 return PyUnicode_FromUnicode(w, size);
2502 }
2503
2504 #endif /* HAVE_WCHAR_H */
2505
/* Maximum number of characters required for the output of %lld or %p;
   used in this file as the size of the scratch buffers that receive
   sprintf() output.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256); the expression below
   evaluates that bound with integer arithmetic only. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2510
2511 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2512 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513 Py_ssize_t width, Py_ssize_t precision)
2514 {
2515 Py_ssize_t length, fill, arglen;
2516 Py_UCS4 maxchar;
2517
2518 if (PyUnicode_READY(str) == -1)
2519 return -1;
2520
2521 length = PyUnicode_GET_LENGTH(str);
2522 if ((precision == -1 || precision >= length)
2523 && width <= length)
2524 return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526 if (precision != -1)
2527 length = Py_MIN(precision, length);
2528
2529 arglen = Py_MAX(length, width);
2530 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532 else
2533 maxchar = writer->maxchar;
2534
2535 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536 return -1;
2537
2538 if (width > length) {
2539 fill = width - length;
2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541 return -1;
2542 writer->pos += fill;
2543 }
2544
2545 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546 str, 0, length);
2547 writer->pos += length;
2548 return 0;
2549 }
2550
2551 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2552 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553 Py_ssize_t width, Py_ssize_t precision)
2554 {
2555 /* UTF-8 */
2556 Py_ssize_t length;
2557 PyObject *unicode;
2558 int res;
2559
2560 length = strlen(str);
2561 if (precision != -1)
2562 length = Py_MIN(length, precision);
2563 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564 if (unicode == NULL)
2565 return -1;
2566
2567 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568 Py_DECREF(unicode);
2569 return res;
2570 }
2571
2572 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2573 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574 const char *f, va_list *vargs)
2575 {
2576 const char *p;
2577 Py_ssize_t len;
2578 int zeropad;
2579 Py_ssize_t width;
2580 Py_ssize_t precision;
2581 int longflag;
2582 int longlongflag;
2583 int size_tflag;
2584 Py_ssize_t fill;
2585
2586 p = f;
2587 f++;
2588 zeropad = 0;
2589 if (*f == '0') {
2590 zeropad = 1;
2591 f++;
2592 }
2593
2594 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2595 width = -1;
2596 if (Py_ISDIGIT((unsigned)*f)) {
2597 width = *f - '0';
2598 f++;
2599 while (Py_ISDIGIT((unsigned)*f)) {
2600 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2601 PyErr_SetString(PyExc_ValueError,
2602 "width too big");
2603 return NULL;
2604 }
2605 width = (width * 10) + (*f - '0');
2606 f++;
2607 }
2608 }
2609 precision = -1;
2610 if (*f == '.') {
2611 f++;
2612 if (Py_ISDIGIT((unsigned)*f)) {
2613 precision = (*f - '0');
2614 f++;
2615 while (Py_ISDIGIT((unsigned)*f)) {
2616 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617 PyErr_SetString(PyExc_ValueError,
2618 "precision too big");
2619 return NULL;
2620 }
2621 precision = (precision * 10) + (*f - '0');
2622 f++;
2623 }
2624 }
2625 if (*f == '%') {
2626 /* "%.3%s" => f points to "3" */
2627 f--;
2628 }
2629 }
2630 if (*f == '\0') {
2631 /* bogus format "%.123" => go backward, f points to "3" */
2632 f--;
2633 }
2634
2635 /* Handle %ld, %lu, %lld and %llu. */
2636 longflag = 0;
2637 longlongflag = 0;
2638 size_tflag = 0;
2639 if (*f == 'l') {
2640 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2641 longflag = 1;
2642 ++f;
2643 }
2644 else if (f[1] == 'l' &&
2645 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2646 longlongflag = 1;
2647 f += 2;
2648 }
2649 }
2650 /* handle the size_t flag. */
2651 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2652 size_tflag = 1;
2653 ++f;
2654 }
2655
2656 if (f[1] == '\0')
2657 writer->overallocate = 0;
2658
2659 switch (*f) {
2660 case 'c':
2661 {
2662 int ordinal = va_arg(*vargs, int);
2663 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2664 PyErr_SetString(PyExc_OverflowError,
2665 "character argument not in range(0x110000)");
2666 return NULL;
2667 }
2668 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2669 return NULL;
2670 break;
2671 }
2672
2673 case 'i':
2674 case 'd':
2675 case 'u':
2676 case 'x':
2677 {
2678 /* used by sprintf */
2679 char buffer[MAX_LONG_LONG_CHARS];
2680 Py_ssize_t arglen;
2681
2682 if (*f == 'u') {
2683 if (longflag)
2684 len = sprintf(buffer, "%lu",
2685 va_arg(*vargs, unsigned long));
2686 else if (longlongflag)
2687 len = sprintf(buffer, "%llu",
2688 va_arg(*vargs, unsigned long long));
2689 else if (size_tflag)
2690 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2691 va_arg(*vargs, size_t));
2692 else
2693 len = sprintf(buffer, "%u",
2694 va_arg(*vargs, unsigned int));
2695 }
2696 else if (*f == 'x') {
2697 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2698 }
2699 else {
2700 if (longflag)
2701 len = sprintf(buffer, "%li",
2702 va_arg(*vargs, long));
2703 else if (longlongflag)
2704 len = sprintf(buffer, "%lli",
2705 va_arg(*vargs, long long));
2706 else if (size_tflag)
2707 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2708 va_arg(*vargs, Py_ssize_t));
2709 else
2710 len = sprintf(buffer, "%i",
2711 va_arg(*vargs, int));
2712 }
2713 assert(len >= 0);
2714
2715 if (precision < len)
2716 precision = len;
2717
2718 arglen = Py_MAX(precision, width);
2719 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720 return NULL;
2721
2722 if (width > precision) {
2723 Py_UCS4 fillchar;
2724 fill = width - precision;
2725 fillchar = zeropad?'0':' ';
2726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727 return NULL;
2728 writer->pos += fill;
2729 }
2730 if (precision > len) {
2731 fill = precision - len;
2732 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733 return NULL;
2734 writer->pos += fill;
2735 }
2736
2737 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738 return NULL;
2739 break;
2740 }
2741
2742 case 'p':
2743 {
2744 char number[MAX_LONG_LONG_CHARS];
2745
2746 len = sprintf(number, "%p", va_arg(*vargs, void*));
2747 assert(len >= 0);
2748
2749 /* %p is ill-defined: ensure leading 0x. */
2750 if (number[1] == 'X')
2751 number[1] = 'x';
2752 else if (number[1] != 'x') {
2753 memmove(number + 2, number,
2754 strlen(number) + 1);
2755 number[0] = '0';
2756 number[1] = 'x';
2757 len += 2;
2758 }
2759
2760 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2761 return NULL;
2762 break;
2763 }
2764
2765 case 's':
2766 {
2767 /* UTF-8 */
2768 const char *s = va_arg(*vargs, const char*);
2769 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2770 return NULL;
2771 break;
2772 }
2773
2774 case 'U':
2775 {
2776 PyObject *obj = va_arg(*vargs, PyObject *);
2777 assert(obj && _PyUnicode_CHECK(obj));
2778
2779 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2780 return NULL;
2781 break;
2782 }
2783
2784 case 'V':
2785 {
2786 PyObject *obj = va_arg(*vargs, PyObject *);
2787 const char *str = va_arg(*vargs, const char *);
2788 if (obj) {
2789 assert(_PyUnicode_CHECK(obj));
2790 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2791 return NULL;
2792 }
2793 else {
2794 assert(str != NULL);
2795 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2796 return NULL;
2797 }
2798 break;
2799 }
2800
2801 case 'S':
2802 {
2803 PyObject *obj = va_arg(*vargs, PyObject *);
2804 PyObject *str;
2805 assert(obj);
2806 str = PyObject_Str(obj);
2807 if (!str)
2808 return NULL;
2809 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2810 Py_DECREF(str);
2811 return NULL;
2812 }
2813 Py_DECREF(str);
2814 break;
2815 }
2816
2817 case 'R':
2818 {
2819 PyObject *obj = va_arg(*vargs, PyObject *);
2820 PyObject *repr;
2821 assert(obj);
2822 repr = PyObject_Repr(obj);
2823 if (!repr)
2824 return NULL;
2825 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2826 Py_DECREF(repr);
2827 return NULL;
2828 }
2829 Py_DECREF(repr);
2830 break;
2831 }
2832
2833 case 'A':
2834 {
2835 PyObject *obj = va_arg(*vargs, PyObject *);
2836 PyObject *ascii;
2837 assert(obj);
2838 ascii = PyObject_ASCII(obj);
2839 if (!ascii)
2840 return NULL;
2841 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2842 Py_DECREF(ascii);
2843 return NULL;
2844 }
2845 Py_DECREF(ascii);
2846 break;
2847 }
2848
2849 case '%':
2850 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2851 return NULL;
2852 break;
2853
2854 default:
2855 /* if we stumble upon an unknown formatting code, copy the rest
2856 of the format string to the output string. (we cannot just
2857 skip the code, since there's no way to know what's in the
2858 argument list) */
2859 len = strlen(p);
2860 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2861 return NULL;
2862 f = p+len;
2863 return f;
2864 }
2865
2866 f++;
2867 return f;
2868 }
2869
2870 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2871 PyUnicode_FromFormatV(const char *format, va_list vargs)
2872 {
2873 va_list vargs2;
2874 const char *f;
2875 _PyUnicodeWriter writer;
2876
2877 _PyUnicodeWriter_Init(&writer);
2878 writer.min_length = strlen(format) + 100;
2879 writer.overallocate = 1;
2880
2881 // Copy varags to be able to pass a reference to a subfunction.
2882 va_copy(vargs2, vargs);
2883
2884 for (f = format; *f; ) {
2885 if (*f == '%') {
2886 f = unicode_fromformat_arg(&writer, f, &vargs2);
2887 if (f == NULL)
2888 goto fail;
2889 }
2890 else {
2891 const char *p;
2892 Py_ssize_t len;
2893
2894 p = f;
2895 do
2896 {
2897 if ((unsigned char)*p > 127) {
2898 PyErr_Format(PyExc_ValueError,
2899 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900 "string, got a non-ASCII byte: 0x%02x",
2901 (unsigned char)*p);
2902 goto fail;
2903 }
2904 p++;
2905 }
2906 while (*p != '\0' && *p != '%');
2907 len = p - f;
2908
2909 if (*p == '\0')
2910 writer.overallocate = 0;
2911
2912 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2913 goto fail;
2914
2915 f = p;
2916 }
2917 }
2918 va_end(vargs2);
2919 return _PyUnicodeWriter_Finish(&writer);
2920
2921 fail:
2922 va_end(vargs2);
2923 _PyUnicodeWriter_Dealloc(&writer);
2924 return NULL;
2925 }
2926
2927 PyObject *
PyUnicode_FromFormat(const char * format,...)2928 PyUnicode_FromFormat(const char *format, ...)
2929 {
2930 PyObject* ret;
2931 va_list vargs;
2932
2933 #ifdef HAVE_STDARG_PROTOTYPES
2934 va_start(vargs, format);
2935 #else
2936 va_start(vargs);
2937 #endif
2938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
2941 }
2942
2943 #ifdef HAVE_WCHAR_H
2944
2945 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
2948 - If w is NULL: return the number of wide characters (including the null
2949 character) required to convert the unicode object. Ignore size argument.
2950
2951 - Otherwise: return the number of wide characters (excluding the null
2952 character) written into w. Write at most size wide characters (including
2953 the null character). */
2954 static Py_ssize_t
unicode_aswidechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2955 unicode_aswidechar(PyObject *unicode,
2956 wchar_t *w,
2957 Py_ssize_t size)
2958 {
2959 Py_ssize_t res;
2960 const wchar_t *wstr;
2961
2962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2963 if (wstr == NULL)
2964 return -1;
2965
2966 if (w != NULL) {
2967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
2971 memcpy(w, wstr, size * sizeof(wchar_t));
2972 return res;
2973 }
2974 else
2975 return res + 1;
2976 }
2977
2978 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2979 PyUnicode_AsWideChar(PyObject *unicode,
2980 wchar_t *w,
2981 Py_ssize_t size)
2982 {
2983 if (unicode == NULL) {
2984 PyErr_BadInternalCall();
2985 return -1;
2986 }
2987 return unicode_aswidechar(unicode, w, size);
2988 }
2989
2990 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)2991 PyUnicode_AsWideCharString(PyObject *unicode,
2992 Py_ssize_t *size)
2993 {
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
3002 buflen = unicode_aswidechar(unicode, NULL, 0);
3003 if (buflen == -1)
3004 return NULL;
3005 buffer = PyMem_NEW(wchar_t, buflen);
3006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
3010 buflen = unicode_aswidechar(unicode, buffer, buflen);
3011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
3013 return NULL;
3014 }
3015 if (size != NULL)
3016 *size = buflen;
3017 return buffer;
3018 }
3019
3020 #endif /* HAVE_WCHAR_H */
3021
3022 PyObject *
PyUnicode_FromOrdinal(int ordinal)3023 PyUnicode_FromOrdinal(int ordinal)
3024 {
3025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
3029 }
3030
3031 return unicode_char((Py_UCS4)ordinal);
3032 }
3033
3034 PyObject *
PyUnicode_FromObject(PyObject * obj)3035 PyUnicode_FromObject(PyObject *obj)
3036 {
3037 /* XXX Perhaps we should make this API an alias of
3038 PyObject_Str() instead ?! */
3039 if (PyUnicode_CheckExact(obj)) {
3040 if (PyUnicode_READY(obj) == -1)
3041 return NULL;
3042 Py_INCREF(obj);
3043 return obj;
3044 }
3045 if (PyUnicode_Check(obj)) {
3046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
3048 return _PyUnicode_Copy(obj);
3049 }
3050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
3052 Py_TYPE(obj)->tp_name);
3053 return NULL;
3054 }
3055
3056 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3057 PyUnicode_FromEncodedObject(PyObject *obj,
3058 const char *encoding,
3059 const char *errors)
3060 {
3061 Py_buffer buffer;
3062 PyObject *v;
3063
3064 if (obj == NULL) {
3065 PyErr_BadInternalCall();
3066 return NULL;
3067 }
3068
3069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
3071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
3076 return v;
3077 }
3078
3079 if (PyUnicode_Check(obj)) {
3080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
3083 }
3084
3085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
3088 "decoding to str: need a bytes-like object, %.80s found",
3089 Py_TYPE(obj)->tp_name);
3090 return NULL;
3091 }
3092
3093 if (buffer.len == 0) {
3094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
3096 }
3097
3098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3099 PyBuffer_Release(&buffer);
3100 return v;
3101 }
3102
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`;
   every run of other characters between two copied characters becomes a
   single '_', while leading and trailing runs are dropped.  The output is
   always NUL-terminated on success.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminating NUL fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* lower_len - 1 would wrap around and the final NUL would be
           written outside of the buffer. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending separator run, except at the
               very start of the output. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3151
3152 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3153 PyUnicode_Decode(const char *s,
3154 Py_ssize_t size,
3155 const char *encoding,
3156 const char *errors)
3157 {
3158 PyObject *buffer = NULL, *unicode;
3159 Py_buffer info;
3160 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161
3162 if (encoding == NULL) {
3163 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164 }
3165
3166 /* Shortcuts for common default encodings */
3167 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168 char *lower = buflower;
3169
3170 /* Fast paths */
3171 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172 lower += 3;
3173 if (*lower == '_') {
3174 /* Match "utf8" and "utf_8" */
3175 lower++;
3176 }
3177
3178 if (lower[0] == '8' && lower[1] == 0) {
3179 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180 }
3181 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183 }
3184 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186 }
3187 }
3188 else {
3189 if (strcmp(lower, "ascii") == 0
3190 || strcmp(lower, "us_ascii") == 0) {
3191 return PyUnicode_DecodeASCII(s, size, errors);
3192 }
3193 #ifdef MS_WINDOWS
3194 else if (strcmp(lower, "mbcs") == 0) {
3195 return PyUnicode_DecodeMBCS(s, size, errors);
3196 }
3197 #endif
3198 else if (strcmp(lower, "latin1") == 0
3199 || strcmp(lower, "latin_1") == 0
3200 || strcmp(lower, "iso_8859_1") == 0
3201 || strcmp(lower, "iso8859_1") == 0) {
3202 return PyUnicode_DecodeLatin1(s, size, errors);
3203 }
3204 }
3205 }
3206
3207 /* Decode via the codec registry */
3208 buffer = NULL;
3209 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3210 goto onError;
3211 buffer = PyMemoryView_FromBuffer(&info);
3212 if (buffer == NULL)
3213 goto onError;
3214 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3215 if (unicode == NULL)
3216 goto onError;
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_Format(PyExc_TypeError,
3219 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220 "use codecs.decode() to decode to arbitrary types",
3221 encoding,
3222 Py_TYPE(unicode)->tp_name);
3223 Py_DECREF(unicode);
3224 goto onError;
3225 }
3226 Py_DECREF(buffer);
3227 return unicode_result(unicode);
3228
3229 onError:
3230 Py_XDECREF(buffer);
3231 return NULL;
3232 }
3233
3234 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3235 PyUnicode_AsDecodedObject(PyObject *unicode,
3236 const char *encoding,
3237 const char *errors)
3238 {
3239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
3241 return NULL;
3242 }
3243
3244 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245 "PyUnicode_AsDecodedObject() is deprecated; "
3246 "use PyCodec_Decode() to decode from str", 1) < 0)
3247 return NULL;
3248
3249 if (encoding == NULL)
3250 encoding = PyUnicode_GetDefaultEncoding();
3251
3252 /* Decode via the codec registry */
3253 return PyCodec_Decode(unicode, encoding, errors);
3254 }
3255
3256 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3257 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3258 const char *encoding,
3259 const char *errors)
3260 {
3261 PyObject *v;
3262
3263 if (!PyUnicode_Check(unicode)) {
3264 PyErr_BadArgument();
3265 goto onError;
3266 }
3267
3268 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269 "PyUnicode_AsDecodedUnicode() is deprecated; "
3270 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271 return NULL;
3272
3273 if (encoding == NULL)
3274 encoding = PyUnicode_GetDefaultEncoding();
3275
3276 /* Decode via the codec registry */
3277 v = PyCodec_Decode(unicode, encoding, errors);
3278 if (v == NULL)
3279 goto onError;
3280 if (!PyUnicode_Check(v)) {
3281 PyErr_Format(PyExc_TypeError,
3282 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283 "use codecs.decode() to decode to arbitrary types",
3284 encoding,
3285 Py_TYPE(unicode)->tp_name);
3286 Py_DECREF(v);
3287 goto onError;
3288 }
3289 return unicode_result(v);
3290
3291 onError:
3292 return NULL;
3293 }
3294
3295 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3296 PyUnicode_Encode(const Py_UNICODE *s,
3297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
3300 {
3301 PyObject *v, *unicode;
3302
3303 unicode = PyUnicode_FromUnicode(s, size);
3304 if (unicode == NULL)
3305 return NULL;
3306 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307 Py_DECREF(unicode);
3308 return v;
3309 }
3310
3311 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3312 PyUnicode_AsEncodedObject(PyObject *unicode,
3313 const char *encoding,
3314 const char *errors)
3315 {
3316 PyObject *v;
3317
3318 if (!PyUnicode_Check(unicode)) {
3319 PyErr_BadArgument();
3320 goto onError;
3321 }
3322
3323 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324 "PyUnicode_AsEncodedObject() is deprecated; "
3325 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326 "or PyCodec_Encode() for generic encoding", 1) < 0)
3327 return NULL;
3328
3329 if (encoding == NULL)
3330 encoding = PyUnicode_GetDefaultEncoding();
3331
3332 /* Encode via the codec registry */
3333 v = PyCodec_Encode(unicode, encoding, errors);
3334 if (v == NULL)
3335 goto onError;
3336 return v;
3337
3338 onError:
3339 return NULL;
3340 }
3341
/* Find the index (in wchar_t units) of the first character of `wstr` that
   wcstombs() refuses to convert.

   The string is converted one character at a time; when wchar_t is 16 bits
   wide a high/low surrogate pair is converted as a single unit.  Returns 0
   when no offending character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (two units for a surrogate pair) plus the terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1])) {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = cursor[0];
        cursor += 1;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1) {
            return unit_start - wstr;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388
3389 static int
locale_error_handler(const char * errors,int * surrogateescape)3390 locale_error_handler(const char *errors, int *surrogateescape)
3391 {
3392 _Py_error_handler error_handler = get_error_handler(errors);
3393 switch (error_handler)
3394 {
3395 case _Py_ERROR_STRICT:
3396 *surrogateescape = 0;
3397 return 0;
3398 case _Py_ERROR_SURROGATEESCAPE:
3399 *surrogateescape = 1;
3400 return 0;
3401 default:
3402 PyErr_Format(PyExc_ValueError,
3403 "only 'strict' and 'surrogateescape' error handlers "
3404 "are supported, not '%s'",
3405 errors);
3406 return -1;
3407 }
3408 }
3409
3410 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3411 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3412 {
3413 Py_ssize_t wlen, wlen2;
3414 wchar_t *wstr;
3415 PyObject *bytes = NULL;
3416 char *errmsg;
3417 PyObject *reason = NULL;
3418 PyObject *exc;
3419 size_t error_pos;
3420 int surrogateescape;
3421
3422 if (locale_error_handler(errors, &surrogateescape) < 0)
3423 return NULL;
3424
3425 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3426 if (wstr == NULL)
3427 return NULL;
3428
3429 wlen2 = wcslen(wstr);
3430 if (wlen2 != wlen) {
3431 PyMem_Free(wstr);
3432 PyErr_SetString(PyExc_ValueError, "embedded null character");
3433 return NULL;
3434 }
3435
3436 if (surrogateescape) {
3437 /* "surrogateescape" error handler */
3438 char *str;
3439
3440 str = Py_EncodeLocale(wstr, &error_pos);
3441 if (str == NULL) {
3442 if (error_pos == (size_t)-1) {
3443 PyErr_NoMemory();
3444 PyMem_Free(wstr);
3445 return NULL;
3446 }
3447 else {
3448 goto encode_error;
3449 }
3450 }
3451 PyMem_Free(wstr);
3452
3453 bytes = PyBytes_FromString(str);
3454 PyMem_Free(str);
3455 }
3456 else {
3457 /* strict mode */
3458 size_t len, len2;
3459
3460 len = wcstombs(NULL, wstr, 0);
3461 if (len == (size_t)-1) {
3462 error_pos = (size_t)-1;
3463 goto encode_error;
3464 }
3465
3466 bytes = PyBytes_FromStringAndSize(NULL, len);
3467 if (bytes == NULL) {
3468 PyMem_Free(wstr);
3469 return NULL;
3470 }
3471
3472 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3473 if (len2 == (size_t)-1 || len2 > len) {
3474 error_pos = (size_t)-1;
3475 goto encode_error;
3476 }
3477 PyMem_Free(wstr);
3478 }
3479 return bytes;
3480
3481 encode_error:
3482 errmsg = strerror(errno);
3483 assert(errmsg != NULL);
3484
3485 if (error_pos == (size_t)-1)
3486 error_pos = wcstombs_errorpos(wstr);
3487
3488 PyMem_Free(wstr);
3489 Py_XDECREF(bytes);
3490
3491 if (errmsg != NULL) {
3492 size_t errlen;
3493 wstr = Py_DecodeLocale(errmsg, &errlen);
3494 if (wstr != NULL) {
3495 reason = PyUnicode_FromWideChar(wstr, errlen);
3496 PyMem_RawFree(wstr);
3497 } else
3498 errmsg = NULL;
3499 }
3500 if (errmsg == NULL)
3501 reason = PyUnicode_FromString(
3502 "wcstombs() encountered an unencodable "
3503 "wide character");
3504 if (reason == NULL)
3505 return NULL;
3506
3507 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3508 "locale", unicode,
3509 (Py_ssize_t)error_pos,
3510 (Py_ssize_t)(error_pos+1),
3511 reason);
3512 Py_DECREF(reason);
3513 if (exc != NULL) {
3514 PyCodec_StrictErrors(exc);
3515 Py_XDECREF(exc);
3516 }
3517 return NULL;
3518 }
3519
3520 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3521 PyUnicode_EncodeFSDefault(PyObject *unicode)
3522 {
3523 #if defined(__APPLE__)
3524 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3525 #else
3526 PyInterpreterState *interp = PyThreadState_GET()->interp;
3527 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528 cannot use it to encode and decode filenames before it is loaded. Load
3529 the Python codec requires to encode at least its own filename. Use the C
3530 version of the locale codec until the codec registry is initialized and
3531 the Python codec is loaded.
3532
3533 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534 cannot only rely on it: check also interp->fscodec_initialized for
3535 subinterpreters. */
3536 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3537 return PyUnicode_AsEncodedString(unicode,
3538 Py_FileSystemDefaultEncoding,
3539 Py_FileSystemDefaultEncodeErrors);
3540 }
3541 else {
3542 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3543 }
3544 #endif
3545 }
3546
3547 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3548 PyUnicode_AsEncodedString(PyObject *unicode,
3549 const char *encoding,
3550 const char *errors)
3551 {
3552 PyObject *v;
3553 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3554
3555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
3557 return NULL;
3558 }
3559
3560 if (encoding == NULL) {
3561 return _PyUnicode_AsUTF8String(unicode, errors);
3562 }
3563
3564 /* Shortcuts for common default encodings */
3565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
3577 return _PyUnicode_AsUTF8String(unicode, errors);
3578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584 }
3585 }
3586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return _PyUnicode_AsASCIIString(unicode, errors);
3590 }
3591 #ifdef MS_WINDOWS
3592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594 }
3595 #endif
3596 else if (strcmp(lower, "latin1") == 0 ||
3597 strcmp(lower, "latin_1") == 0 ||
3598 strcmp(lower, "iso_8859_1") == 0 ||
3599 strcmp(lower, "iso8859_1") == 0) {
3600 return _PyUnicode_AsLatin1String(unicode, errors);
3601 }
3602 }
3603 }
3604
3605 /* Encode via the codec registry */
3606 v = _PyCodec_EncodeText(unicode, encoding, errors);
3607 if (v == NULL)
3608 return NULL;
3609
3610 /* The normal path */
3611 if (PyBytes_Check(v))
3612 return v;
3613
3614 /* If the codec returns a buffer, raise a warning and convert to bytes */
3615 if (PyByteArray_Check(v)) {
3616 int error;
3617 PyObject *b;
3618
3619 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3620 "encoder %s returned bytearray instead of bytes; "
3621 "use codecs.encode() to encode to arbitrary types",
3622 encoding);
3623 if (error) {
3624 Py_DECREF(v);
3625 return NULL;
3626 }
3627
3628 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629 Py_DECREF(v);
3630 return b;
3631 }
3632
3633 PyErr_Format(PyExc_TypeError,
3634 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635 "use codecs.encode() to encode to arbitrary types",
3636 encoding,
3637 Py_TYPE(v)->tp_name);
3638 Py_DECREF(v);
3639 return NULL;
3640 }
3641
3642 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3643 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3644 const char *encoding,
3645 const char *errors)
3646 {
3647 PyObject *v;
3648
3649 if (!PyUnicode_Check(unicode)) {
3650 PyErr_BadArgument();
3651 goto onError;
3652 }
3653
3654 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655 "PyUnicode_AsEncodedUnicode() is deprecated; "
3656 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657 return NULL;
3658
3659 if (encoding == NULL)
3660 encoding = PyUnicode_GetDefaultEncoding();
3661
3662 /* Encode via the codec registry */
3663 v = PyCodec_Encode(unicode, encoding, errors);
3664 if (v == NULL)
3665 goto onError;
3666 if (!PyUnicode_Check(v)) {
3667 PyErr_Format(PyExc_TypeError,
3668 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669 "use codecs.encode() to encode to arbitrary types",
3670 encoding,
3671 Py_TYPE(v)->tp_name);
3672 Py_DECREF(v);
3673 goto onError;
3674 }
3675 return v;
3676
3677 onError:
3678 return NULL;
3679 }
3680
/* Find the byte offset of the first sequence in str[0..len) that mbrtowc()
   cannot convert (invalid or incomplete).

   Returns 0 when no such sequence is found, and always returns 0 when
   mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3711
3712 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3713 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3714 const char *errors)
3715 {
3716 wchar_t smallbuf[256];
3717 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718 wchar_t *wstr;
3719 size_t wlen, wlen2;
3720 PyObject *unicode;
3721 int surrogateescape;
3722 size_t error_pos;
3723 char *errmsg;
3724 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3725 PyObject *exc;
3726
3727 if (locale_error_handler(errors, &surrogateescape) < 0)
3728 return NULL;
3729
3730 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3731 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3732 return NULL;
3733 }
3734
3735 if (surrogateescape) {
3736 /* "surrogateescape" error handler */
3737 wstr = Py_DecodeLocale(str, &wlen);
3738 if (wstr == NULL) {
3739 if (wlen == (size_t)-1)
3740 PyErr_NoMemory();
3741 else
3742 PyErr_SetFromErrno(PyExc_OSError);
3743 return NULL;
3744 }
3745
3746 unicode = PyUnicode_FromWideChar(wstr, wlen);
3747 PyMem_RawFree(wstr);
3748 }
3749 else {
3750 /* strict mode */
3751 #ifndef HAVE_BROKEN_MBSTOWCS
3752 wlen = mbstowcs(NULL, str, 0);
3753 #else
3754 wlen = len;
3755 #endif
3756 if (wlen == (size_t)-1)
3757 goto decode_error;
3758 if (wlen+1 <= smallbuf_len) {
3759 wstr = smallbuf;
3760 }
3761 else {
3762 wstr = PyMem_New(wchar_t, wlen+1);
3763 if (!wstr)
3764 return PyErr_NoMemory();
3765 }
3766
3767 wlen2 = mbstowcs(wstr, str, wlen+1);
3768 if (wlen2 == (size_t)-1) {
3769 if (wstr != smallbuf)
3770 PyMem_Free(wstr);
3771 goto decode_error;
3772 }
3773 #ifdef HAVE_BROKEN_MBSTOWCS
3774 assert(wlen2 == wlen);
3775 #endif
3776 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777 if (wstr != smallbuf)
3778 PyMem_Free(wstr);
3779 }
3780 return unicode;
3781
3782 decode_error:
3783 reason = NULL;
3784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
3788 if (errmsg != NULL) {
3789 size_t errlen;
3790 wstr = Py_DecodeLocale(errmsg, &errlen);
3791 if (wstr != NULL) {
3792 reason = PyUnicode_FromWideChar(wstr, errlen);
3793 PyMem_RawFree(wstr);
3794 }
3795 }
3796 if (reason == NULL)
3797 reason = PyUnicode_FromString(
3798 "mbstowcs() encountered an invalid multibyte sequence");
3799 if (reason == NULL)
3800 return NULL;
3801
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803 "locale", str, len,
3804 (Py_ssize_t)error_pos,
3805 (Py_ssize_t)(error_pos+1),
3806 reason);
3807 Py_DECREF(reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
3810 Py_XDECREF(exc);
3811 }
3812 return NULL;
3813 }
3814
3815 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3816 PyUnicode_DecodeLocale(const char *str, const char *errors)
3817 {
3818 Py_ssize_t size = (Py_ssize_t)strlen(str);
3819 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3820 }
3821
3822
3823 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3824 PyUnicode_DecodeFSDefault(const char *s) {
3825 Py_ssize_t size = (Py_ssize_t)strlen(s);
3826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827 }
3828
3829 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3830 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831 {
3832 #if defined(__APPLE__)
3833 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3834 #else
3835 PyInterpreterState *interp = PyThreadState_GET()->interp;
3836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837 cannot use it to encode and decode filenames before it is loaded. Load
3838 the Python codec requires to encode at least its own filename. Use the C
3839 version of the locale codec until the codec registry is initialized and
3840 the Python codec is loaded.
3841
3842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843 cannot only rely on it: check also interp->fscodec_initialized for
3844 subinterpreters. */
3845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3846 return PyUnicode_Decode(s, size,
3847 Py_FileSystemDefaultEncoding,
3848 Py_FileSystemDefaultEncodeErrors);
3849 }
3850 else {
3851 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3852 }
3853 #endif
3854 }
3855
3856
3857 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3858 PyUnicode_FSConverter(PyObject* arg, void* addr)
3859 {
3860 PyObject *path = NULL;
3861 PyObject *output = NULL;
3862 Py_ssize_t size;
3863 void *data;
3864 if (arg == NULL) {
3865 Py_DECREF(*(PyObject**)addr);
3866 *(PyObject**)addr = NULL;
3867 return 1;
3868 }
3869 path = PyOS_FSPath(arg);
3870 if (path == NULL) {
3871 return 0;
3872 }
3873 if (PyBytes_Check(path)) {
3874 output = path;
3875 }
3876 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3877 output = PyUnicode_EncodeFSDefault(path);
3878 Py_DECREF(path);
3879 if (!output) {
3880 return 0;
3881 }
3882 assert(PyBytes_Check(output));
3883 }
3884
3885 size = PyBytes_GET_SIZE(output);
3886 data = PyBytes_AS_STRING(output);
3887 if ((size_t)size != strlen(data)) {
3888 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3889 Py_DECREF(output);
3890 return 0;
3891 }
3892 *(PyObject**)addr = output;
3893 return Py_CLEANUP_SUPPORTED;
3894 }
3895
3896
3897 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3898 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899 {
3900 int is_buffer = 0;
3901 PyObject *path = NULL;
3902 PyObject *output = NULL;
3903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
3905 return 1;
3906 }
3907
3908 is_buffer = PyObject_CheckBuffer(arg);
3909 if (!is_buffer) {
3910 path = PyOS_FSPath(arg);
3911 if (path == NULL) {
3912 return 0;
3913 }
3914 }
3915 else {
3916 path = arg;
3917 Py_INCREF(arg);
3918 }
3919
3920 if (PyUnicode_Check(path)) {
3921 if (PyUnicode_READY(path) == -1) {
3922 Py_DECREF(path);
3923 return 0;
3924 }
3925 output = path;
3926 }
3927 else if (PyBytes_Check(path) || is_buffer) {
3928 PyObject *path_bytes = NULL;
3929
3930 if (!PyBytes_Check(path) &&
3931 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3932 "path should be string, bytes, or os.PathLike, not %.200s",
3933 Py_TYPE(arg)->tp_name)) {
3934 Py_DECREF(path);
3935 return 0;
3936 }
3937 path_bytes = PyBytes_FromObject(path);
3938 Py_DECREF(path);
3939 if (!path_bytes) {
3940 return 0;
3941 }
3942 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3943 PyBytes_GET_SIZE(path_bytes));
3944 Py_DECREF(path_bytes);
3945 if (!output) {
3946 return 0;
3947 }
3948 }
3949 else {
3950 PyErr_Format(PyExc_TypeError,
3951 "path should be string, bytes, or os.PathLike, not %.200s",
3952 Py_TYPE(arg)->tp_name);
3953 Py_DECREF(path);
3954 return 0;
3955 }
3956 if (PyUnicode_READY(output) == -1) {
3957 Py_DECREF(output);
3958 return 0;
3959 }
3960 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3961 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3962 PyErr_SetString(PyExc_ValueError, "embedded null character");
3963 Py_DECREF(output);
3964 return 0;
3965 }
3966 *(PyObject**)addr = output;
3967 return Py_CLEANUP_SUPPORTED;
3968 }
3969
3970
3971 char*
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3972 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3973 {
3974 PyObject *bytes;
3975
3976 if (!PyUnicode_Check(unicode)) {
3977 PyErr_BadArgument();
3978 return NULL;
3979 }
3980 if (PyUnicode_READY(unicode) == -1)
3981 return NULL;
3982
3983 if (PyUnicode_UTF8(unicode) == NULL) {
3984 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3985 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3986 if (bytes == NULL)
3987 return NULL;
3988 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3989 if (_PyUnicode_UTF8(unicode) == NULL) {
3990 PyErr_NoMemory();
3991 Py_DECREF(bytes);
3992 return NULL;
3993 }
3994 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3995 memcpy(_PyUnicode_UTF8(unicode),
3996 PyBytes_AS_STRING(bytes),
3997 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3998 Py_DECREF(bytes);
3999 }
4000
4001 if (psize)
4002 *psize = PyUnicode_UTF8_LENGTH(unicode);
4003 return PyUnicode_UTF8(unicode);
4004 }
4005
4006 char*
PyUnicode_AsUTF8(PyObject * unicode)4007 PyUnicode_AsUTF8(PyObject *unicode)
4008 {
4009 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4010 }
4011
4012 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4013 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4014 {
4015 const unsigned char *one_byte;
4016 #if SIZEOF_WCHAR_T == 4
4017 const Py_UCS2 *two_bytes;
4018 #else
4019 const Py_UCS4 *four_bytes;
4020 const Py_UCS4 *ucs4_end;
4021 Py_ssize_t num_surrogates;
4022 #endif
4023 wchar_t *w;
4024 wchar_t *wchar_end;
4025
4026 if (!PyUnicode_Check(unicode)) {
4027 PyErr_BadArgument();
4028 return NULL;
4029 }
4030 if (_PyUnicode_WSTR(unicode) == NULL) {
4031 /* Non-ASCII compact unicode object */
4032 assert(_PyUnicode_KIND(unicode) != 0);
4033 assert(PyUnicode_IS_READY(unicode));
4034
4035 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
4036 #if SIZEOF_WCHAR_T == 2
4037 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4038 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
4039 num_surrogates = 0;
4040
4041 for (; four_bytes < ucs4_end; ++four_bytes) {
4042 if (*four_bytes > 0xFFFF)
4043 ++num_surrogates;
4044 }
4045
4046 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4047 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4048 if (!_PyUnicode_WSTR(unicode)) {
4049 PyErr_NoMemory();
4050 return NULL;
4051 }
4052 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
4053
4054 w = _PyUnicode_WSTR(unicode);
4055 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4056 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4057 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4058 if (*four_bytes > 0xFFFF) {
4059 assert(*four_bytes <= MAX_UNICODE);
4060 /* encode surrogate pair in this case */
4061 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4062 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
4063 }
4064 else
4065 *w = *four_bytes;
4066
4067 if (w > wchar_end) {
4068 assert(0 && "Miscalculated string end");
4069 }
4070 }
4071 *w = 0;
4072 #else
4073 /* sizeof(wchar_t) == 4 */
4074 Py_FatalError("Impossible unicode object state, wstr and str "
4075 "should share memory already.");
4076 return NULL;
4077 #endif
4078 }
4079 else {
4080 if ((size_t)_PyUnicode_LENGTH(unicode) >
4081 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4082 PyErr_NoMemory();
4083 return NULL;
4084 }
4085 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4086 (_PyUnicode_LENGTH(unicode) + 1));
4087 if (!_PyUnicode_WSTR(unicode)) {
4088 PyErr_NoMemory();
4089 return NULL;
4090 }
4091 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4092 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4093 w = _PyUnicode_WSTR(unicode);
4094 wchar_end = w + _PyUnicode_LENGTH(unicode);
4095
4096 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4097 one_byte = PyUnicode_1BYTE_DATA(unicode);
4098 for (; w < wchar_end; ++one_byte, ++w)
4099 *w = *one_byte;
4100 /* null-terminate the wstr */
4101 *w = 0;
4102 }
4103 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
4104 #if SIZEOF_WCHAR_T == 4
4105 two_bytes = PyUnicode_2BYTE_DATA(unicode);
4106 for (; w < wchar_end; ++two_bytes, ++w)
4107 *w = *two_bytes;
4108 /* null-terminate the wstr */
4109 *w = 0;
4110 #else
4111 /* sizeof(wchar_t) == 2 */
4112 PyObject_FREE(_PyUnicode_WSTR(unicode));
4113 _PyUnicode_WSTR(unicode) = NULL;
4114 Py_FatalError("Impossible unicode object state, wstr "
4115 "and str should share memory already.");
4116 return NULL;
4117 #endif
4118 }
4119 else {
4120 assert(0 && "This should never happen.");
4121 }
4122 }
4123 }
4124 if (size != NULL)
4125 *size = PyUnicode_WSTR_LENGTH(unicode);
4126 return _PyUnicode_WSTR(unicode);
4127 }
4128
4129 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4130 PyUnicode_AsUnicode(PyObject *unicode)
4131 {
4132 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4133 }
4134
4135
4136 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4137 PyUnicode_GetSize(PyObject *unicode)
4138 {
4139 if (!PyUnicode_Check(unicode)) {
4140 PyErr_BadArgument();
4141 goto onError;
4142 }
4143 return PyUnicode_GET_SIZE(unicode);
4144
4145 onError:
4146 return -1;
4147 }
4148
4149 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4150 PyUnicode_GetLength(PyObject *unicode)
4151 {
4152 if (!PyUnicode_Check(unicode)) {
4153 PyErr_BadArgument();
4154 return -1;
4155 }
4156 if (PyUnicode_READY(unicode) == -1)
4157 return -1;
4158 return PyUnicode_GET_LENGTH(unicode);
4159 }
4160
4161 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163 {
4164 void *data;
4165 int kind;
4166
4167 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4168 PyErr_BadArgument();
4169 return (Py_UCS4)-1;
4170 }
4171 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4172 PyErr_SetString(PyExc_IndexError, "string index out of range");
4173 return (Py_UCS4)-1;
4174 }
4175 data = PyUnicode_DATA(unicode);
4176 kind = PyUnicode_KIND(unicode);
4177 return PyUnicode_READ(kind, data, index);
4178 }
4179
4180 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4181 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4182 {
4183 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4184 PyErr_BadArgument();
4185 return -1;
4186 }
4187 assert(PyUnicode_IS_READY(unicode));
4188 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4189 PyErr_SetString(PyExc_IndexError, "string index out of range");
4190 return -1;
4191 }
4192 if (unicode_check_modifiable(unicode))
4193 return -1;
4194 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4195 PyErr_SetString(PyExc_ValueError, "character out of range");
4196 return -1;
4197 }
4198 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4199 index, ch);
4200 return 0;
4201 }
4202
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4208
4209 /* create or adjust a UnicodeDecodeError */
4210 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4211 make_decode_exception(PyObject **exceptionObject,
4212 const char *encoding,
4213 const char *input, Py_ssize_t length,
4214 Py_ssize_t startpos, Py_ssize_t endpos,
4215 const char *reason)
4216 {
4217 if (*exceptionObject == NULL) {
4218 *exceptionObject = PyUnicodeDecodeError_Create(
4219 encoding, input, length, startpos, endpos, reason);
4220 }
4221 else {
4222 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4223 goto onError;
4224 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4225 goto onError;
4226 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4227 goto onError;
4228 }
4229 return;
4230
4231 onError:
4232 Py_CLEAR(*exceptionObject);
4233 }
4234
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   The output object is a wchar_t-kind string; it is resized when the
   replacement plus the remaining input would not fit.  *input, *inend,
   *endinpos, *inptr and *outpos are updated on success.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    /* Look the handler up once and cache it in *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix, leaving just the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Stop here: the object must not be accessed as bytes below, and
           the reference we were given has to be released. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that is possible. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4350
4351 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4352 unicode_decode_call_errorhandler_writer(
4353 const char *errors, PyObject **errorHandler,
4354 const char *encoding, const char *reason,
4355 const char **input, const char **inend, Py_ssize_t *startinpos,
4356 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4357 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4358 {
4359 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4360
4361 PyObject *restuple = NULL;
4362 PyObject *repunicode = NULL;
4363 Py_ssize_t insize;
4364 Py_ssize_t newpos;
4365 Py_ssize_t replen;
4366 PyObject *inputobj = NULL;
4367
4368 if (*errorHandler == NULL) {
4369 *errorHandler = PyCodec_LookupError(errors);
4370 if (*errorHandler == NULL)
4371 goto onError;
4372 }
4373
4374 make_decode_exception(exceptionObject,
4375 encoding,
4376 *input, *inend - *input,
4377 *startinpos, *endinpos,
4378 reason);
4379 if (*exceptionObject == NULL)
4380 goto onError;
4381
4382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4383 if (restuple == NULL)
4384 goto onError;
4385 if (!PyTuple_Check(restuple)) {
4386 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4387 goto onError;
4388 }
4389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4390 goto onError;
4391
4392 /* Copy back the bytes variables, which might have been modified by the
4393 callback */
4394 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4395 if (!inputobj)
4396 goto onError;
4397 if (!PyBytes_Check(inputobj)) {
4398 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4399 }
4400 *input = PyBytes_AS_STRING(inputobj);
4401 insize = PyBytes_GET_SIZE(inputobj);
4402 *inend = *input + insize;
4403 /* we can DECREF safely, as the exception has another reference,
4404 so the object won't go away. */
4405 Py_DECREF(inputobj);
4406
4407 if (newpos<0)
4408 newpos = insize+newpos;
4409 if (newpos<0 || newpos>insize) {
4410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4411 goto onError;
4412 }
4413
4414 if (PyUnicode_READY(repunicode) < 0)
4415 goto onError;
4416 replen = PyUnicode_GET_LENGTH(repunicode);
4417 if (replen > 1) {
4418 writer->min_length += replen - 1;
4419 writer->overallocate = 1;
4420 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422 goto onError;
4423 }
4424 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4425 goto onError;
4426
4427 *endinpos = newpos;
4428 *inptr = *input + newpos;
4429
4430 /* we made it! */
4431 Py_XDECREF(restuple);
4432 return 0;
4433
4434 onError:
4435 Py_XDECREF(restuple);
4436 return -1;
4437 }
4438
4439 /* --- UTF-7 Codec -------------------------------------------------------- */
4440
4441 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4442
4443 /* Three simple macros defining base-64. */
4444
/* Non-zero when c belongs to the base-64 alphabet: '+', '/', a decimal
   digit, or an ASCII letter of either case.  c may be evaluated several
   times. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
4452
/* Map a base-64 character c to its 6-bit value: 'A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, and anything else
   (i.e. '/') -> 63.  The caller must have checked IS_BASE64(c). */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) == '+') ? 62 : 63)
4460
/* Base-64 character for the low six bits of n; higher bits of n are
   masked off. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4465
/* DECODE_DIRECT: non-zero when byte c, met outside a shift sequence in
 * UTF-7 input, stands for itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
4473
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 *  0 : "Set D"
 *     alphanumeric and '(),-./:?
 *  1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 *  2 : "whitespace"
 *     ht nl cr sp
 *  3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4507
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *  c        : the code point; only 1..127 can ever be direct.
 *  directO  : non-zero to pass Set O characters (category 1) through.
 *  directWS : non-zero to pass whitespace (category 2) through.
 *
 * Every parameter is parenthesized in the expansion so that a compound
 * flag expression such as `a || b` is treated as one operand.  c may
 * be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
4519
/* Decode `size` bytes of UTF-7 data at `s` using error scheme `errors`.
   Thin wrapper: forwards to PyUnicode_DecodeUTF7Stateful() with a NULL
   `consumed` pointer and returns whatever that call returns. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
4527
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 *  s, size  : the UTF-7 bytes to decode.
 *  errors   : error handling scheme name, forwarded to the error
 *             callback helper.
 *  consumed : if NULL, input ending inside an inconsistent shift
 *             sequence is reported as "unterminated shift sequence".
 *             If non-NULL, *consumed receives the number of bytes
 *             used; when the input ends inside a shift sequence this
 *             is the offset of the '+' that opened it, and the output
 *             written since that '+' is dropped.
 *
 * Returns the decoded string, or NULL after a failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;           /* writer.pos when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending bits, newest at the bottom */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* Re-entry point used after the end-of-input error callback moved
         the read position back into the buffer. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as is and
                               go on to handle outCh on its own. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written only when the
                   terminating byte decodes directly; otherwise it is
                   discarded by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the '+' is: used for error reporting and
               for backing off in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only by goto: report [startinpos, s) to the error
           callback, which may move s and replace the input buffer. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The callback may have asked to resume before the end. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Output was written since the shift began and the
                   writer holds non-ASCII data: build the result from the
                   first shiftOutStart characters instead of truncating.
                   NOTE(review): presumably so the result is not left
                   wider than its remaining characters need -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4726
4727
4728 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4729 _PyUnicode_EncodeUTF7(PyObject *str,
4730 int base64SetO,
4731 int base64WhiteSpace,
4732 const char *errors)
4733 {
4734 int kind;
4735 void *data;
4736 Py_ssize_t len;
4737 PyObject *v;
4738 int inShift = 0;
4739 Py_ssize_t i;
4740 unsigned int base64bits = 0;
4741 unsigned long base64buffer = 0;
4742 char * out;
4743 char * start;
4744
4745 if (PyUnicode_READY(str) == -1)
4746 return NULL;
4747 kind = PyUnicode_KIND(str);
4748 data = PyUnicode_DATA(str);
4749 len = PyUnicode_GET_LENGTH(str);
4750
4751 if (len == 0)
4752 return PyBytes_FromStringAndSize(NULL, 0);
4753
4754 /* It might be possible to tighten this worst case */
4755 if (len > PY_SSIZE_T_MAX / 8)
4756 return PyErr_NoMemory();
4757 v = PyBytes_FromStringAndSize(NULL, len * 8);
4758 if (v == NULL)
4759 return NULL;
4760
4761 start = out = PyBytes_AS_STRING(v);
4762 for (i = 0; i < len; ++i) {
4763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4764
4765 if (inShift) {
4766 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767 /* shifting out */
4768 if (base64bits) { /* output remaining bits */
4769 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770 base64buffer = 0;
4771 base64bits = 0;
4772 }
4773 inShift = 0;
4774 /* Characters not in the BASE64 set implicitly unshift the sequence
4775 so no '-' is required, except if the character is itself a '-' */
4776 if (IS_BASE64(ch) || ch == '-') {
4777 *out++ = '-';
4778 }
4779 *out++ = (char) ch;
4780 }
4781 else {
4782 goto encode_char;
4783 }
4784 }
4785 else { /* not in a shift sequence */
4786 if (ch == '+') {
4787 *out++ = '+';
4788 *out++ = '-';
4789 }
4790 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791 *out++ = (char) ch;
4792 }
4793 else {
4794 *out++ = '+';
4795 inShift = 1;
4796 goto encode_char;
4797 }
4798 }
4799 continue;
4800 encode_char:
4801 if (ch >= 0x10000) {
4802 assert(ch <= MAX_UNICODE);
4803
4804 /* code first surrogate */
4805 base64bits += 16;
4806 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4807 while (base64bits >= 6) {
4808 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809 base64bits -= 6;
4810 }
4811 /* prepare second surrogate */
4812 ch = Py_UNICODE_LOW_SURROGATE(ch);
4813 }
4814 base64bits += 16;
4815 base64buffer = (base64buffer << 16) | ch;
4816 while (base64bits >= 6) {
4817 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818 base64bits -= 6;
4819 }
4820 }
4821 if (base64bits)
4822 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823 if (inShift)
4824 *out++ = '-';
4825 if (_PyBytes_Resize(&v, out - start) < 0)
4826 return NULL;
4827 return v;
4828 }
4829 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4830 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831 Py_ssize_t size,
4832 int base64SetO,
4833 int base64WhiteSpace,
4834 const char *errors)
4835 {
4836 PyObject *result;
4837 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4838 if (tmp == NULL)
4839 return NULL;
4840 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4841 base64WhiteSpace, errors);
4842 Py_DECREF(tmp);
4843 return result;
4844 }
4845
4846 #undef IS_BASE64
4847 #undef FROM_BASE64
4848 #undef TO_BASE64
4849 #undef DECODE_DIRECT
4850 #undef ENCODE_DIRECT
4851
4852 /* --- UTF-8 Codec -------------------------------------------------------- */
4853
/* Decode `size` bytes of UTF-8 data at `s` using error scheme `errors`.
   Thin wrapper: forwards to PyUnicode_DecodeUTF8Stateful() with a NULL
   `consumed` pointer and returns whatever that call returns. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4861
4862 #include "stringlib/asciilib.h"
4863 #include "stringlib/codecs.h"
4864 #include "stringlib/undef.h"
4865
4866 #include "stringlib/ucs1lib.h"
4867 #include "stringlib/codecs.h"
4868 #include "stringlib/undef.h"
4869
4870 #include "stringlib/ucs2lib.h"
4871 #include "stringlib/codecs.h"
4872 #include "stringlib/undef.h"
4873
4874 #include "stringlib/ucs4lib.h"
4875 #include "stringlib/codecs.h"
4876 #include "stringlib/undef.h"
4877
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  Every byte of the mask is 0x80, so
   (value & ASCII_CHAR_MASK) is non-zero exactly when at least one byte
   of `value` has its top bit set.  The constant is chosen to match
   SIZEOF_LONG; any size other than 4 or 8 stops the build. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4887
4888 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4889 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4890 {
4891 const char *p = start;
4892 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4893
4894 /*
4895 * Issue #17237: m68k is a bit different from most architectures in
4896 * that objects do not use "natural alignment" - for example, int and
4897 * long are only aligned at 2-byte boundaries. Therefore the assert()
4898 * won't work; also, tests have shown that skipping the "optimised
4899 * version" will even speed up m68k.
4900 */
4901 #if !defined(__m68k__)
4902 #if SIZEOF_LONG <= SIZEOF_VOID_P
4903 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4905 /* Fast path, see in STRINGLIB(utf8_decode) for
4906 an explanation. */
4907 /* Help allocation */
4908 const char *_p = p;
4909 Py_UCS1 * q = dest;
4910 while (_p < aligned_end) {
4911 unsigned long value = *(const unsigned long *) _p;
4912 if (value & ASCII_CHAR_MASK)
4913 break;
4914 *((unsigned long *)q) = value;
4915 _p += SIZEOF_LONG;
4916 q += SIZEOF_LONG;
4917 }
4918 p = _p;
4919 while (p < end) {
4920 if ((unsigned char)*p & 0x80)
4921 break;
4922 *q++ = *p++;
4923 }
4924 return p - start;
4925 }
4926 #endif
4927 #endif
4928 while (p < end) {
4929 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930 for an explanation. */
4931 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4932 /* Help allocation */
4933 const char *_p = p;
4934 while (_p < aligned_end) {
4935 unsigned long value = *(unsigned long *) _p;
4936 if (value & ASCII_CHAR_MASK)
4937 break;
4938 _p += SIZEOF_LONG;
4939 }
4940 p = _p;
4941 if (_p == end)
4942 break;
4943 }
4944 if ((unsigned char)*p & 0x80)
4945 break;
4946 ++p;
4947 }
4948 memcpy(dest, start, p - start);
4949 return p - start;
4950 }
4951
4952 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4953 PyUnicode_DecodeUTF8Stateful(const char *s,
4954 Py_ssize_t size,
4955 const char *errors,
4956 Py_ssize_t *consumed)
4957 {
4958 _PyUnicodeWriter writer;
4959 const char *starts = s;
4960 const char *end = s + size;
4961
4962 Py_ssize_t startinpos;
4963 Py_ssize_t endinpos;
4964 const char *errmsg = "";
4965 PyObject *error_handler_obj = NULL;
4966 PyObject *exc = NULL;
4967 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
4968
4969 if (size == 0) {
4970 if (consumed)
4971 *consumed = 0;
4972 _Py_RETURN_UNICODE_EMPTY();
4973 }
4974
4975 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976 if (size == 1 && (unsigned char)s[0] < 128) {
4977 if (consumed)
4978 *consumed = 1;
4979 return get_latin1_char((unsigned char)s[0]);
4980 }
4981
4982 _PyUnicodeWriter_Init(&writer);
4983 writer.min_length = size;
4984 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4985 goto onError;
4986
4987 writer.pos = ascii_decode(s, end, writer.data);
4988 s += writer.pos;
4989 while (s < end) {
4990 Py_UCS4 ch;
4991 int kind = writer.kind;
4992
4993 if (kind == PyUnicode_1BYTE_KIND) {
4994 if (PyUnicode_IS_ASCII(writer.buffer))
4995 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4996 else
4997 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4998 } else if (kind == PyUnicode_2BYTE_KIND) {
4999 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5000 } else {
5001 assert(kind == PyUnicode_4BYTE_KIND);
5002 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5003 }
5004
5005 switch (ch) {
5006 case 0:
5007 if (s == end || consumed)
5008 goto End;
5009 errmsg = "unexpected end of data";
5010 startinpos = s - starts;
5011 endinpos = end - starts;
5012 break;
5013 case 1:
5014 errmsg = "invalid start byte";
5015 startinpos = s - starts;
5016 endinpos = startinpos + 1;
5017 break;
5018 case 2:
5019 case 3:
5020 case 4:
5021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
5023 endinpos = startinpos + ch - 1;
5024 break;
5025 default:
5026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5027 goto onError;
5028 continue;
5029 }
5030
5031 if (error_handler == _Py_ERROR_UNKNOWN)
5032 error_handler = get_error_handler(errors);
5033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
5046 {
5047 Py_ssize_t i;
5048
5049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
5051 for (i=startinpos; i<endinpos; i++) {
5052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
5059 }
5060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
5069 }
5070
5071 End:
5072 if (consumed)
5073 *consumed = s - starts;
5074
5075 Py_XDECREF(error_handler_obj);
5076 Py_XDECREF(exc);
5077 return _PyUnicodeWriter_Finish(&writer);
5078
5079 onError:
5080 Py_XDECREF(error_handler_obj);
5081 Py_XDECREF(exc);
5082 _PyUnicodeWriter_Dealloc(&writer);
5083 return NULL;
5084 }
5085
5086 #if defined(__APPLE__) || defined(__ANDROID__)
5087
/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error.
   NULL is also returned, before any allocation is attempted, when
   size + 1 wchar_t elements would exceed PY_SSIZE_T_MAX bytes.  The
   result is always terminated with L'\0'. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* Pick the stringlib decoder whose unit width matches wchar_t;
           it advances s and outpos as it writes into unicode. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* NOTE(review): presumably a decoded code point the decoder
               could not store itself; with 4-byte wchar_t this branch is
               asserted to be unreachable -- confirm against the
               stringlib decoder. */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with all input used up means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            /* The byte at s is stored as 0xDC00 + byte and skipped. */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}
5139
5140 #endif /* __APPLE__ or __ANDROID__ */
5141
5142 /* Primary internal function which creates utf8 encoded bytes objects.
5143
5144 Allocation strategy: if the string is short, convert into a stack buffer
5145 and allocate exactly as much space needed at the end. Else allocate the
5146 maximum possible needed (4 result bytes per Unicode character), and return
5147 the excess memory at the end.
5148 */
5149 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5150 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5151 {
5152 enum PyUnicode_Kind kind;
5153 void *data;
5154 Py_ssize_t size;
5155
5156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160
5161 if (PyUnicode_READY(unicode) == -1)
5162 return NULL;
5163
5164 if (PyUnicode_UTF8(unicode))
5165 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166 PyUnicode_UTF8_LENGTH(unicode));
5167
5168 kind = PyUnicode_KIND(unicode);
5169 data = PyUnicode_DATA(unicode);
5170 size = PyUnicode_GET_LENGTH(unicode);
5171
5172 switch (kind) {
5173 default:
5174 assert(0);
5175 case PyUnicode_1BYTE_KIND:
5176 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177 assert(!PyUnicode_IS_ASCII(unicode));
5178 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179 case PyUnicode_2BYTE_KIND:
5180 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181 case PyUnicode_4BYTE_KIND:
5182 return ucs4lib_utf8_encoder(unicode, data, size, errors);
5183 }
5184 }
5185
5186 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5187 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188 Py_ssize_t size,
5189 const char *errors)
5190 {
5191 PyObject *v, *unicode;
5192
5193 unicode = PyUnicode_FromUnicode(s, size);
5194 if (unicode == NULL)
5195 return NULL;
5196 v = _PyUnicode_AsUTF8String(unicode, errors);
5197 Py_DECREF(unicode);
5198 return v;
5199 }
5200
/* Encode `unicode` as UTF-8.  Thin wrapper: forwards to
   _PyUnicode_AsUTF8String() with a NULL `errors` argument and returns
   whatever that call returns. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return _PyUnicode_AsUTF8String(unicode, NULL);
}
5206
5207 /* --- UTF-32 Codec ------------------------------------------------------- */
5208
/* Decode `size` bytes of UTF-32 data at `s`.  Thin wrapper: forwards
   to PyUnicode_DecodeUTF32Stateful() with a NULL `consumed` pointer;
   `errors` and `byteorder` are passed through unchanged. */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
5217
/* Decode `size` bytes of UTF-32 data at `s`.

   errors    : error handling scheme name, forwarded to the error
               callback helper.
   byteorder : may be NULL.  If given, *byteorder is the initial byte
               order (negative: little endian, positive: big endian,
               0: native order, with detection from a leading BOM).
               When it was 0 and at least four bytes are available,
               the order in effect after the BOM check is stored back.
   consumed  : if non-NULL, trailing bytes that do not form a complete
               4-byte unit are not an error and *consumed receives the
               number of bytes processed.

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are assembled as a little-endian value,
           so a big-endian BOM shows up byte-swapped (0xFFFE0000). */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (or nothing but a BOM) to decode. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 the platform's own byte order is used. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* One output character per started 4-byte group. */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: store code points directly for as long as they
               fit the current buffer width and are not surrogates.  On
               a break, q still points at the unit that stopped it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast loop stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last unit read was stored (or none was read): fewer
               than four bytes remain. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            if (ch < 0x110000) {
                /* Valid code point that is too wide for the current
                   buffer: write it through the writer and resume the
                   fast loop. */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5362
/* Encode the str object `str` as UTF-32 into a new bytes object.

   errors    : error handling scheme name, used when the per-kind
               encoder stops before the end of the string (reported as
               "surrogates not allowed").
   byteorder : 0 writes a BOM (0xFEFF) first and uses native order with
               the encoding name "utf-32"; -1 selects "utf-32-le" and
               1 selects "utf-32-be", both without a BOM.

   Returns NULL on failure; a non-str argument is reported through
   PyErr_BadArgument(). */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
    /* Non-zero when the requested order equals the platform's order. */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Guard the nsize * 4 multiplication below; (byteorder == 0)
       accounts for the extra BOM unit. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* One-byte strings are encoded in a single call and the result is
       returned as is. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;

        /* Encode as far as possible; the helper returns how many
           characters it handled. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The encoder stopped early: ask the error handler for a
           replacement for the character at pos; pos is updated to the
           position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement must be a whole number of 4-byte
               units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement must be pure ASCII. */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
        }

        /* four bytes are reserved for each surrogate */
        if (moreunits > 1) {
            /* Grow the buffer by the surplus; out is saved as an index
               and recomputed after the resize. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            Py_ssize_t morebytes = 4 * (moreunits - 1);
            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* The resize result is not checked here; v is returned as it is
       afterwards. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    /* The early exits above jump here; errorHandler and exc are still
       NULL on those paths. */
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5510
5511 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5512 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513 Py_ssize_t size,
5514 const char *errors,
5515 int byteorder)
5516 {
5517 PyObject *result;
5518 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5519 if (tmp == NULL)
5520 return NULL;
5521 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522 Py_DECREF(tmp);
5523 return result;
5524 }
5525
5526 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5527 PyUnicode_AsUTF32String(PyObject *unicode)
5528 {
5529 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5530 }
5531
5532 /* --- UTF-16 Codec ------------------------------------------------------- */
5533
5534 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5535 PyUnicode_DecodeUTF16(const char *s,
5536 Py_ssize_t size,
5537 const char *errors,
5538 int *byteorder)
5539 {
5540 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541 }
5542
5543 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5544 PyUnicode_DecodeUTF16Stateful(const char *s,
5545 Py_ssize_t size,
5546 const char *errors,
5547 int *byteorder,
5548 Py_ssize_t *consumed)
5549 {
5550 const char *starts = s;
5551 Py_ssize_t startinpos;
5552 Py_ssize_t endinpos;
5553 _PyUnicodeWriter writer;
5554 const unsigned char *q, *e;
5555 int bo = 0; /* assume native ordering by default */
5556 int native_ordering;
5557 const char *errmsg = "";
5558 PyObject *errorHandler = NULL;
5559 PyObject *exc = NULL;
5560 const char *encoding;
5561
5562 q = (unsigned char *)s;
5563 e = q + size;
5564
5565 if (byteorder)
5566 bo = *byteorder;
5567
5568 /* Check for BOM marks (U+FEFF) in the input and adjust current
5569 byte order setting accordingly. In native mode, the leading BOM
5570 mark is skipped, in all other modes, it is copied to the output
5571 stream as-is (giving a ZWNBSP character). */
5572 if (bo == 0 && size >= 2) {
5573 const Py_UCS4 bom = (q[1] << 8) | q[0];
5574 if (bom == 0xFEFF) {
5575 q += 2;
5576 bo = -1;
5577 }
5578 else if (bom == 0xFFFE) {
5579 q += 2;
5580 bo = 1;
5581 }
5582 if (byteorder)
5583 *byteorder = bo;
5584 }
5585
5586 if (q == e) {
5587 if (consumed)
5588 *consumed = size;
5589 _Py_RETURN_UNICODE_EMPTY();
5590 }
5591
5592 #if PY_LITTLE_ENDIAN
5593 native_ordering = bo <= 0;
5594 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5595 #else
5596 native_ordering = bo >= 0;
5597 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5598 #endif
5599
5600 /* Note: size will always be longer than the resulting Unicode
5601 character count */
5602 _PyUnicodeWriter_Init(&writer);
5603 writer.min_length = (e - q + 1) / 2;
5604 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5605 goto onError;
5606
5607 while (1) {
5608 Py_UCS4 ch = 0;
5609 if (e - q >= 2) {
5610 int kind = writer.kind;
5611 if (kind == PyUnicode_1BYTE_KIND) {
5612 if (PyUnicode_IS_ASCII(writer.buffer))
5613 ch = asciilib_utf16_decode(&q, e,
5614 (Py_UCS1*)writer.data, &writer.pos,
5615 native_ordering);
5616 else
5617 ch = ucs1lib_utf16_decode(&q, e,
5618 (Py_UCS1*)writer.data, &writer.pos,
5619 native_ordering);
5620 } else if (kind == PyUnicode_2BYTE_KIND) {
5621 ch = ucs2lib_utf16_decode(&q, e,
5622 (Py_UCS2*)writer.data, &writer.pos,
5623 native_ordering);
5624 } else {
5625 assert(kind == PyUnicode_4BYTE_KIND);
5626 ch = ucs4lib_utf16_decode(&q, e,
5627 (Py_UCS4*)writer.data, &writer.pos,
5628 native_ordering);
5629 }
5630 }
5631
5632 switch (ch)
5633 {
5634 case 0:
5635 /* remaining byte at the end? (size should be even) */
5636 if (q == e || consumed)
5637 goto End;
5638 errmsg = "truncated data";
5639 startinpos = ((const char *)q) - starts;
5640 endinpos = ((const char *)e) - starts;
5641 break;
5642 /* The remaining input chars are ignored if the callback
5643 chooses to skip the input */
5644 case 1:
5645 q -= 2;
5646 if (consumed)
5647 goto End;
5648 errmsg = "unexpected end of data";
5649 startinpos = ((const char *)q) - starts;
5650 endinpos = ((const char *)e) - starts;
5651 break;
5652 case 2:
5653 errmsg = "illegal encoding";
5654 startinpos = ((const char *)q) - 2 - starts;
5655 endinpos = startinpos + 2;
5656 break;
5657 case 3:
5658 errmsg = "illegal UTF-16 surrogate";
5659 startinpos = ((const char *)q) - 4 - starts;
5660 endinpos = startinpos + 2;
5661 break;
5662 default:
5663 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5664 goto onError;
5665 continue;
5666 }
5667
5668 if (unicode_decode_call_errorhandler_writer(
5669 errors,
5670 &errorHandler,
5671 encoding, errmsg,
5672 &starts,
5673 (const char **)&e,
5674 &startinpos,
5675 &endinpos,
5676 &exc,
5677 (const char **)&q,
5678 &writer))
5679 goto onError;
5680 }
5681
5682 End:
5683 if (consumed)
5684 *consumed = (const char *)q-starts;
5685
5686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
5688 return _PyUnicodeWriter_Finish(&writer);
5689
5690 onError:
5691 _PyUnicodeWriter_Dealloc(&writer);
5692 Py_XDECREF(errorHandler);
5693 Py_XDECREF(exc);
5694 return NULL;
5695 }
5696
5697 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5698 _PyUnicode_EncodeUTF16(PyObject *str,
5699 const char *errors,
5700 int byteorder)
5701 {
5702 enum PyUnicode_Kind kind;
5703 const void *data;
5704 Py_ssize_t len;
5705 PyObject *v;
5706 unsigned short *out;
5707 Py_ssize_t pairs;
5708 #if PY_BIG_ENDIAN
5709 int native_ordering = byteorder >= 0;
5710 #else
5711 int native_ordering = byteorder <= 0;
5712 #endif
5713 const char *encoding;
5714 Py_ssize_t nsize, pos;
5715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
5717 PyObject *rep = NULL;
5718
5719 if (!PyUnicode_Check(str)) {
5720 PyErr_BadArgument();
5721 return NULL;
5722 }
5723 if (PyUnicode_READY(str) == -1)
5724 return NULL;
5725 kind = PyUnicode_KIND(str);
5726 data = PyUnicode_DATA(str);
5727 len = PyUnicode_GET_LENGTH(str);
5728
5729 pairs = 0;
5730 if (kind == PyUnicode_4BYTE_KIND) {
5731 const Py_UCS4 *in = (const Py_UCS4 *)data;
5732 const Py_UCS4 *end = in + len;
5733 while (in < end) {
5734 if (*in++ >= 0x10000) {
5735 pairs++;
5736 }
5737 }
5738 }
5739 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5740 return PyErr_NoMemory();
5741 }
5742 nsize = len + pairs + (byteorder == 0);
5743 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5744 if (v == NULL) {
5745 return NULL;
5746 }
5747
5748 /* output buffer is 2-bytes aligned */
5749 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5750 out = (unsigned short *)PyBytes_AS_STRING(v);
5751 if (byteorder == 0) {
5752 *out++ = 0xFEFF;
5753 }
5754 if (len == 0) {
5755 goto done;
5756 }
5757
5758 if (kind == PyUnicode_1BYTE_KIND) {
5759 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760 goto done;
5761 }
5762
5763 if (byteorder < 0) {
5764 encoding = "utf-16-le";
5765 }
5766 else if (byteorder > 0) {
5767 encoding = "utf-16-be";
5768 }
5769 else {
5770 encoding = "utf-16";
5771 }
5772
5773 pos = 0;
5774 while (pos < len) {
5775 Py_ssize_t repsize, moreunits;
5776
5777 if (kind == PyUnicode_2BYTE_KIND) {
5778 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779 &out, native_ordering);
5780 }
5781 else {
5782 assert(kind == PyUnicode_4BYTE_KIND);
5783 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784 &out, native_ordering);
5785 }
5786 if (pos == len)
5787 break;
5788
5789 rep = unicode_encode_call_errorhandler(
5790 errors, &errorHandler,
5791 encoding, "surrogates not allowed",
5792 str, &exc, pos, pos + 1, &pos);
5793 if (!rep)
5794 goto error;
5795
5796 if (PyBytes_Check(rep)) {
5797 repsize = PyBytes_GET_SIZE(rep);
5798 if (repsize & 1) {
5799 raise_encode_exception(&exc, encoding,
5800 str, pos - 1, pos,
5801 "surrogates not allowed");
5802 goto error;
5803 }
5804 moreunits = repsize / 2;
5805 }
5806 else {
5807 assert(PyUnicode_Check(rep));
5808 if (PyUnicode_READY(rep) < 0)
5809 goto error;
5810 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811 if (!PyUnicode_IS_ASCII(rep)) {
5812 raise_encode_exception(&exc, encoding,
5813 str, pos - 1, pos,
5814 "surrogates not allowed");
5815 goto error;
5816 }
5817 }
5818
5819 /* two bytes are reserved for each surrogate */
5820 if (moreunits > 1) {
5821 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822 Py_ssize_t morebytes = 2 * (moreunits - 1);
5823 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
5828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829 goto error;
5830 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831 }
5832
5833 if (PyBytes_Check(rep)) {
5834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5835 out += moreunits;
5836 } else /* rep is unicode */ {
5837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
5840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore' handler
5847 is used. */
5848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
5853 done:
5854 return v;
5855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
5861 #undef STORECHAR
5862 }
5863
5864 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5865 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866 Py_ssize_t size,
5867 const char *errors,
5868 int byteorder)
5869 {
5870 PyObject *result;
5871 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872 if (tmp == NULL)
5873 return NULL;
5874 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875 Py_DECREF(tmp);
5876 return result;
5877 }
5878
5879 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5880 PyUnicode_AsUTF16String(PyObject *unicode)
5881 {
5882 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5883 }
5884
5885 /* --- Unicode Escape Codec ----------------------------------------------- */
5886
5887 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5888
5889 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)5890 _PyUnicode_DecodeUnicodeEscape(const char *s,
5891 Py_ssize_t size,
5892 const char *errors,
5893 const char **first_invalid_escape)
5894 {
5895 const char *starts = s;
5896 _PyUnicodeWriter writer;
5897 const char *end;
5898 PyObject *errorHandler = NULL;
5899 PyObject *exc = NULL;
5900
5901 // so we can remember if we've seen an invalid escape char or not
5902 *first_invalid_escape = NULL;
5903
5904 if (size == 0) {
5905 _Py_RETURN_UNICODE_EMPTY();
5906 }
5907 /* Escaped strings will always be longer than the resulting
5908 Unicode string, so we start with size here and then reduce the
5909 length after conversion to the true value.
5910 (but if the error callback returns a long replacement string
5911 we'll have to allocate more space) */
5912 _PyUnicodeWriter_Init(&writer);
5913 writer.min_length = size;
5914 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915 goto onError;
5916 }
5917
5918 end = s + size;
5919 while (s < end) {
5920 unsigned char c = (unsigned char) *s++;
5921 Py_UCS4 ch;
5922 int count;
5923 Py_ssize_t startinpos;
5924 Py_ssize_t endinpos;
5925 const char *message;
5926
5927 #define WRITE_ASCII_CHAR(ch) \
5928 do { \
5929 assert(ch <= 127); \
5930 assert(writer.pos < writer.size); \
5931 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5932 } while(0)
5933
5934 #define WRITE_CHAR(ch) \
5935 do { \
5936 if (ch <= writer.maxchar) { \
5937 assert(writer.pos < writer.size); \
5938 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939 } \
5940 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941 goto onError; \
5942 } \
5943 } while(0)
5944
5945 /* Non-escape characters are interpreted as Unicode ordinals */
5946 if (c != '\\') {
5947 WRITE_CHAR(c);
5948 continue;
5949 }
5950
5951 startinpos = s - starts - 1;
5952 /* \ - Escapes */
5953 if (s >= end) {
5954 message = "\\ at end of string";
5955 goto error;
5956 }
5957 c = (unsigned char) *s++;
5958
5959 assert(writer.pos < writer.size);
5960 switch (c) {
5961
5962 /* \x escapes */
5963 case '\n': continue;
5964 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965 case '\'': WRITE_ASCII_CHAR('\''); continue;
5966 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967 case 'b': WRITE_ASCII_CHAR('\b'); continue;
5968 /* FF */
5969 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970 case 't': WRITE_ASCII_CHAR('\t'); continue;
5971 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972 case 'r': WRITE_ASCII_CHAR('\r'); continue;
5973 /* VT */
5974 case 'v': WRITE_ASCII_CHAR('\013'); continue;
5975 /* BEL, not classic C */
5976 case 'a': WRITE_ASCII_CHAR('\007'); continue;
5977
5978 /* \OOO (octal) escapes */
5979 case '0': case '1': case '2': case '3':
5980 case '4': case '5': case '6': case '7':
5981 ch = c - '0';
5982 if (s < end && '0' <= *s && *s <= '7') {
5983 ch = (ch<<3) + *s++ - '0';
5984 if (s < end && '0' <= *s && *s <= '7') {
5985 ch = (ch<<3) + *s++ - '0';
5986 }
5987 }
5988 WRITE_CHAR(ch);
5989 continue;
5990
5991 /* hex escapes */
5992 /* \xXX */
5993 case 'x':
5994 count = 2;
5995 message = "truncated \\xXX escape";
5996 goto hexescape;
5997
5998 /* \uXXXX */
5999 case 'u':
6000 count = 4;
6001 message = "truncated \\uXXXX escape";
6002 goto hexescape;
6003
6004 /* \UXXXXXXXX */
6005 case 'U':
6006 count = 8;
6007 message = "truncated \\UXXXXXXXX escape";
6008 hexescape:
6009 for (ch = 0; count && s < end; ++s, --count) {
6010 c = (unsigned char)*s;
6011 ch <<= 4;
6012 if (c >= '0' && c <= '9') {
6013 ch += c - '0';
6014 }
6015 else if (c >= 'a' && c <= 'f') {
6016 ch += c - ('a' - 10);
6017 }
6018 else if (c >= 'A' && c <= 'F') {
6019 ch += c - ('A' - 10);
6020 }
6021 else {
6022 break;
6023 }
6024 }
6025 if (count) {
6026 goto error;
6027 }
6028
6029 /* when we get here, ch is a 32-bit unicode character */
6030 if (ch > MAX_UNICODE) {
6031 message = "illegal Unicode character";
6032 goto error;
6033 }
6034
6035 WRITE_CHAR(ch);
6036 continue;
6037
6038 /* \N{name} */
6039 case 'N':
6040 if (ucnhash_CAPI == NULL) {
6041 /* load the unicode data module */
6042 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043 PyUnicodeData_CAPSULE_NAME, 1);
6044 if (ucnhash_CAPI == NULL) {
6045 PyErr_SetString(
6046 PyExc_UnicodeError,
6047 "\\N escapes not supported (can't load unicodedata module)"
6048 );
6049 goto onError;
6050 }
6051 }
6052
6053 message = "malformed \\N character escape";
6054 if (*s == '{') {
6055 const char *start = ++s;
6056 size_t namelen;
6057 /* look for the closing brace */
6058 while (s < end && *s != '}')
6059 s++;
6060 namelen = s - start;
6061 if (namelen && s < end) {
6062 /* found a name. look it up in the unicode database */
6063 s++;
6064 ch = 0xffffffff; /* in case 'getcode' messes up */
6065 if (namelen <= INT_MAX &&
6066 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067 &ch, 0)) {
6068 assert(ch <= MAX_UNICODE);
6069 WRITE_CHAR(ch);
6070 continue;
6071 }
6072 message = "unknown Unicode character name";
6073 }
6074 }
6075 goto error;
6076
6077 default:
6078 if (*first_invalid_escape == NULL) {
6079 *first_invalid_escape = s-1; /* Back up one char, since we've
6080 already incremented s. */
6081 }
6082 WRITE_ASCII_CHAR('\\');
6083 WRITE_CHAR(c);
6084 continue;
6085 }
6086
6087 error:
6088 endinpos = s-starts;
6089 writer.min_length = end - s + writer.pos;
6090 if (unicode_decode_call_errorhandler_writer(
6091 errors, &errorHandler,
6092 "unicodeescape", message,
6093 &starts, &end, &startinpos, &endinpos, &exc, &s,
6094 &writer)) {
6095 goto onError;
6096 }
6097 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098 goto onError;
6099 }
6100
6101 #undef WRITE_ASCII_CHAR
6102 #undef WRITE_CHAR
6103 }
6104
6105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
6107 return _PyUnicodeWriter_Finish(&writer);
6108
6109 onError:
6110 _PyUnicodeWriter_Dealloc(&writer);
6111 Py_XDECREF(errorHandler);
6112 Py_XDECREF(exc);
6113 return NULL;
6114 }
6115
6116 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6117 PyUnicode_DecodeUnicodeEscape(const char *s,
6118 Py_ssize_t size,
6119 const char *errors)
6120 {
6121 const char *first_invalid_escape;
6122 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123 &first_invalid_escape);
6124 if (result == NULL)
6125 return NULL;
6126 if (first_invalid_escape != NULL) {
6127 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128 "invalid escape sequence '\\%c'",
6129 *first_invalid_escape) < 0) {
6130 Py_DECREF(result);
6131 return NULL;
6132 }
6133 }
6134 return result;
6135 }
6136
6137 /* Return a Unicode-Escape string version of the Unicode object. */
6138
6139 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6140 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6141 {
6142 Py_ssize_t i, len;
6143 PyObject *repr;
6144 char *p;
6145 enum PyUnicode_Kind kind;
6146 void *data;
6147 Py_ssize_t expandsize;
6148
6149 /* Initial allocation is based on the longest-possible character
6150 escape.
6151
6152 For UCS1 strings it's '\xxx', 4 bytes per source character.
6153 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6155 */
6156
6157 if (!PyUnicode_Check(unicode)) {
6158 PyErr_BadArgument();
6159 return NULL;
6160 }
6161 if (PyUnicode_READY(unicode) == -1) {
6162 return NULL;
6163 }
6164
6165 len = PyUnicode_GET_LENGTH(unicode);
6166 if (len == 0) {
6167 return PyBytes_FromStringAndSize(NULL, 0);
6168 }
6169
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173 bytes, and 1 byte characters 4. */
6174 expandsize = kind * 2 + 2;
6175 if (len > PY_SSIZE_T_MAX / expandsize) {
6176 return PyErr_NoMemory();
6177 }
6178 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179 if (repr == NULL) {
6180 return NULL;
6181 }
6182
6183 p = PyBytes_AS_STRING(repr);
6184 for (i = 0; i < len; i++) {
6185 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6186
6187 /* U+0000-U+00ff range */
6188 if (ch < 0x100) {
6189 if (ch >= ' ' && ch < 127) {
6190 if (ch != '\\') {
6191 /* Copy printable US ASCII as-is */
6192 *p++ = (char) ch;
6193 }
6194 /* Escape backslashes */
6195 else {
6196 *p++ = '\\';
6197 *p++ = '\\';
6198 }
6199 }
6200
6201 /* Map special whitespace to '\t', \n', '\r' */
6202 else if (ch == '\t') {
6203 *p++ = '\\';
6204 *p++ = 't';
6205 }
6206 else if (ch == '\n') {
6207 *p++ = '\\';
6208 *p++ = 'n';
6209 }
6210 else if (ch == '\r') {
6211 *p++ = '\\';
6212 *p++ = 'r';
6213 }
6214
6215 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216 else {
6217 *p++ = '\\';
6218 *p++ = 'x';
6219 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220 *p++ = Py_hexdigits[ch & 0x000F];
6221 }
6222 }
6223 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6224 else if (ch < 0x10000) {
6225 *p++ = '\\';
6226 *p++ = 'u';
6227 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
6232 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233 else {
6234
6235 /* Make sure that the first two digits are zero */
6236 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6237 *p++ = '\\';
6238 *p++ = 'U';
6239 *p++ = '0';
6240 *p++ = '0';
6241 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246 *p++ = Py_hexdigits[ch & 0x0000000F];
6247 }
6248 }
6249
6250 assert(p - PyBytes_AS_STRING(repr) > 0);
6251 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252 return NULL;
6253 }
6254 return repr;
6255 }
6256
6257 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6258 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259 Py_ssize_t size)
6260 {
6261 PyObject *result;
6262 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6263 if (tmp == NULL) {
6264 return NULL;
6265 }
6266
6267 result = PyUnicode_AsUnicodeEscapeString(tmp);
6268 Py_DECREF(tmp);
6269 return result;
6270 }
6271
6272 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6273
6274 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6275 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6276 Py_ssize_t size,
6277 const char *errors)
6278 {
6279 const char *starts = s;
6280 _PyUnicodeWriter writer;
6281 const char *end;
6282 PyObject *errorHandler = NULL;
6283 PyObject *exc = NULL;
6284
6285 if (size == 0) {
6286 _Py_RETURN_UNICODE_EMPTY();
6287 }
6288
6289 /* Escaped strings will always be longer than the resulting
6290 Unicode string, so we start with size here and then reduce the
6291 length after conversion to the true value. (But decoding error
6292 handler might have to resize the string) */
6293 _PyUnicodeWriter_Init(&writer);
6294 writer.min_length = size;
6295 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296 goto onError;
6297 }
6298
6299 end = s + size;
6300 while (s < end) {
6301 unsigned char c = (unsigned char) *s++;
6302 Py_UCS4 ch;
6303 int count;
6304 Py_ssize_t startinpos;
6305 Py_ssize_t endinpos;
6306 const char *message;
6307
6308 #define WRITE_CHAR(ch) \
6309 do { \
6310 if (ch <= writer.maxchar) { \
6311 assert(writer.pos < writer.size); \
6312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313 } \
6314 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315 goto onError; \
6316 } \
6317 } while(0)
6318
6319 /* Non-escape characters are interpreted as Unicode ordinals */
6320 if (c != '\\' || s >= end) {
6321 WRITE_CHAR(c);
6322 continue;
6323 }
6324
6325 c = (unsigned char) *s++;
6326 if (c == 'u') {
6327 count = 4;
6328 message = "truncated \\uXXXX escape";
6329 }
6330 else if (c == 'U') {
6331 count = 8;
6332 message = "truncated \\UXXXXXXXX escape";
6333 }
6334 else {
6335 assert(writer.pos < writer.size);
6336 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337 WRITE_CHAR(c);
6338 continue;
6339 }
6340 startinpos = s - starts - 2;
6341
6342 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343 for (ch = 0; count && s < end; ++s, --count) {
6344 c = (unsigned char)*s;
6345 ch <<= 4;
6346 if (c >= '0' && c <= '9') {
6347 ch += c - '0';
6348 }
6349 else if (c >= 'a' && c <= 'f') {
6350 ch += c - ('a' - 10);
6351 }
6352 else if (c >= 'A' && c <= 'F') {
6353 ch += c - ('A' - 10);
6354 }
6355 else {
6356 break;
6357 }
6358 }
6359 if (!count) {
6360 if (ch <= MAX_UNICODE) {
6361 WRITE_CHAR(ch);
6362 continue;
6363 }
6364 message = "\\Uxxxxxxxx out of range";
6365 }
6366
6367 endinpos = s-starts;
6368 writer.min_length = end - s + writer.pos;
6369 if (unicode_decode_call_errorhandler_writer(
6370 errors, &errorHandler,
6371 "rawunicodeescape", message,
6372 &starts, &end, &startinpos, &endinpos, &exc, &s,
6373 &writer)) {
6374 goto onError;
6375 }
6376 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377 goto onError;
6378 }
6379
6380 #undef WRITE_CHAR
6381 }
6382 Py_XDECREF(errorHandler);
6383 Py_XDECREF(exc);
6384 return _PyUnicodeWriter_Finish(&writer);
6385
6386 onError:
6387 _PyUnicodeWriter_Dealloc(&writer);
6388 Py_XDECREF(errorHandler);
6389 Py_XDECREF(exc);
6390 return NULL;
6391
6392 }
6393
6394
6395 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6396 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6397 {
6398 PyObject *repr;
6399 char *p;
6400 Py_ssize_t expandsize, pos;
6401 int kind;
6402 void *data;
6403 Py_ssize_t len;
6404
6405 if (!PyUnicode_Check(unicode)) {
6406 PyErr_BadArgument();
6407 return NULL;
6408 }
6409 if (PyUnicode_READY(unicode) == -1) {
6410 return NULL;
6411 }
6412 kind = PyUnicode_KIND(unicode);
6413 data = PyUnicode_DATA(unicode);
6414 len = PyUnicode_GET_LENGTH(unicode);
6415 if (kind == PyUnicode_1BYTE_KIND) {
6416 return PyBytes_FromStringAndSize(data, len);
6417 }
6418
6419 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420 bytes, and 1 byte characters 4. */
6421 expandsize = kind * 2 + 2;
6422
6423 if (len > PY_SSIZE_T_MAX / expandsize) {
6424 return PyErr_NoMemory();
6425 }
6426 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427 if (repr == NULL) {
6428 return NULL;
6429 }
6430 if (len == 0) {
6431 return repr;
6432 }
6433
6434 p = PyBytes_AS_STRING(repr);
6435 for (pos = 0; pos < len; pos++) {
6436 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6437
6438 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439 if (ch < 0x100) {
6440 *p++ = (char) ch;
6441 }
6442 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443 else if (ch < 0x10000) {
6444 *p++ = '\\';
6445 *p++ = 'u';
6446 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449 *p++ = Py_hexdigits[ch & 15];
6450 }
6451 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452 else {
6453 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454 *p++ = '\\';
6455 *p++ = 'U';
6456 *p++ = '0';
6457 *p++ = '0';
6458 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463 *p++ = Py_hexdigits[ch & 15];
6464 }
6465 }
6466
6467 assert(p > PyBytes_AS_STRING(repr));
6468 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469 return NULL;
6470 }
6471 return repr;
6472 }
6473
6474 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6475 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476 Py_ssize_t size)
6477 {
6478 PyObject *result;
6479 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6480 if (tmp == NULL)
6481 return NULL;
6482 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483 Py_DECREF(tmp);
6484 return result;
6485 }
6486
6487 /* --- Unicode Internal Codec ------------------------------------------- */
6488
6489 PyObject *
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)6490 _PyUnicode_DecodeUnicodeInternal(const char *s,
6491 Py_ssize_t size,
6492 const char *errors)
6493 {
6494 const char *starts = s;
6495 Py_ssize_t startinpos;
6496 Py_ssize_t endinpos;
6497 _PyUnicodeWriter writer;
6498 const char *end;
6499 const char *reason;
6500 PyObject *errorHandler = NULL;
6501 PyObject *exc = NULL;
6502
6503 if (PyErr_WarnEx(PyExc_DeprecationWarning,
6504 "unicode_internal codec has been deprecated",
6505 1))
6506 return NULL;
6507
6508 if (size == 0)
6509 _Py_RETURN_UNICODE_EMPTY();
6510
6511 _PyUnicodeWriter_Init(&writer);
6512 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513 PyErr_NoMemory();
6514 goto onError;
6515 }
6516 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6517
6518 end = s + size;
6519 while (s < end) {
6520 Py_UNICODE uch;
6521 Py_UCS4 ch;
6522 if (end - s < Py_UNICODE_SIZE) {
6523 endinpos = end-starts;
6524 reason = "truncated input";
6525 goto error;
6526 }
6527 /* We copy the raw representation one byte at a time because the
6528 pointer may be unaligned (see test_codeccallbacks). */
6529 ((char *) &uch)[0] = s[0];
6530 ((char *) &uch)[1] = s[1];
6531 #ifdef Py_UNICODE_WIDE
6532 ((char *) &uch)[2] = s[2];
6533 ((char *) &uch)[3] = s[3];
6534 #endif
6535 ch = uch;
6536 #ifdef Py_UNICODE_WIDE
6537 /* We have to sanity check the raw data, otherwise doom looms for
6538 some malformed UCS-4 data. */
6539 if (ch > 0x10ffff) {
6540 endinpos = s - starts + Py_UNICODE_SIZE;
6541 reason = "illegal code point (> 0x10FFFF)";
6542 goto error;
6543 }
6544 #endif
6545 s += Py_UNICODE_SIZE;
6546 #ifndef Py_UNICODE_WIDE
6547 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6548 {
6549 Py_UNICODE uch2;
6550 ((char *) &uch2)[0] = s[0];
6551 ((char *) &uch2)[1] = s[1];
6552 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6553 {
6554 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6555 s += Py_UNICODE_SIZE;
6556 }
6557 }
6558 #endif
6559
6560 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6561 goto onError;
6562 continue;
6563
6564 error:
6565 startinpos = s - starts;
6566 if (unicode_decode_call_errorhandler_writer(
6567 errors, &errorHandler,
6568 "unicode_internal", reason,
6569 &starts, &end, &startinpos, &endinpos, &exc, &s,
6570 &writer))
6571 goto onError;
6572 }
6573
6574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
6576 return _PyUnicodeWriter_Finish(&writer);
6577
6578 onError:
6579 _PyUnicodeWriter_Dealloc(&writer);
6580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
6582 return NULL;
6583 }
6584
6585 /* --- Latin-1 Codec ------------------------------------------------------ */
6586
6587 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6588 PyUnicode_DecodeLatin1(const char *s,
6589 Py_ssize_t size,
6590 const char *errors)
6591 {
6592 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6593 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6594 }
6595
6596 /* create or adjust a UnicodeEncodeError */
6597 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6598 make_encode_exception(PyObject **exceptionObject,
6599 const char *encoding,
6600 PyObject *unicode,
6601 Py_ssize_t startpos, Py_ssize_t endpos,
6602 const char *reason)
6603 {
6604 if (*exceptionObject == NULL) {
6605 *exceptionObject = PyObject_CallFunction(
6606 PyExc_UnicodeEncodeError, "sOnns",
6607 encoding, unicode, startpos, endpos, reason);
6608 }
6609 else {
6610 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611 goto onError;
6612 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613 goto onError;
6614 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615 goto onError;
6616 return;
6617 onError:
6618 Py_CLEAR(*exceptionObject);
6619 }
6620 }
6621
6622 /* raises a UnicodeEncodeError */
6623 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6624 raise_encode_exception(PyObject **exceptionObject,
6625 const char *encoding,
6626 PyObject *unicode,
6627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 const char *reason)
6629 {
6630 make_encode_exception(exceptionObject,
6631 encoding, unicode, startpos, endpos, reason);
6632 if (*exceptionObject != NULL)
6633 PyCodec_StrictErrors(*exceptionObject);
6634 }
6635
6636 /* error handling callback helper:
6637 build arguments, call the callback and check the arguments,
6638 put the result into newpos and return the replacement string, which
6639 has to be freed by the caller */
6640 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6641 unicode_encode_call_errorhandler(const char *errors,
6642 PyObject **errorHandler,
6643 const char *encoding, const char *reason,
6644 PyObject *unicode, PyObject **exceptionObject,
6645 Py_ssize_t startpos, Py_ssize_t endpos,
6646 Py_ssize_t *newpos)
6647 {
6648 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6649 Py_ssize_t len;
6650 PyObject *restuple;
6651 PyObject *resunicode;
6652
6653 if (*errorHandler == NULL) {
6654 *errorHandler = PyCodec_LookupError(errors);
6655 if (*errorHandler == NULL)
6656 return NULL;
6657 }
6658
6659 if (PyUnicode_READY(unicode) == -1)
6660 return NULL;
6661 len = PyUnicode_GET_LENGTH(unicode);
6662
6663 make_encode_exception(exceptionObject,
6664 encoding, unicode, startpos, endpos, reason);
6665 if (*exceptionObject == NULL)
6666 return NULL;
6667
6668 restuple = PyObject_CallFunctionObjArgs(
6669 *errorHandler, *exceptionObject, NULL);
6670 if (restuple == NULL)
6671 return NULL;
6672 if (!PyTuple_Check(restuple)) {
6673 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6674 Py_DECREF(restuple);
6675 return NULL;
6676 }
6677 if (!PyArg_ParseTuple(restuple, argparse,
6678 &resunicode, newpos)) {
6679 Py_DECREF(restuple);
6680 return NULL;
6681 }
6682 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684 Py_DECREF(restuple);
6685 return NULL;
6686 }
6687 if (*newpos<0)
6688 *newpos = len + *newpos;
6689 if (*newpos<0 || *newpos>len) {
6690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6691 Py_DECREF(restuple);
6692 return NULL;
6693 }
6694 Py_INCREF(resunicode);
6695 Py_DECREF(restuple);
6696 return resunicode;
6697 }
6698
6699 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6700 unicode_encode_ucs1(PyObject *unicode,
6701 const char *errors,
6702 const Py_UCS4 limit)
6703 {
6704 /* input state */
6705 Py_ssize_t pos=0, size;
6706 int kind;
6707 void *data;
6708 /* pointer into the output */
6709 char *str;
6710 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6712 PyObject *error_handler_obj = NULL;
6713 PyObject *exc = NULL;
6714 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6715 PyObject *rep = NULL;
6716 /* output object */
6717 _PyBytesWriter writer;
6718
6719 if (PyUnicode_READY(unicode) == -1)
6720 return NULL;
6721 size = PyUnicode_GET_LENGTH(unicode);
6722 kind = PyUnicode_KIND(unicode);
6723 data = PyUnicode_DATA(unicode);
6724 /* allocate enough for a simple encoding without
6725 replacements, if we need more, we'll resize */
6726 if (size == 0)
6727 return PyBytes_FromStringAndSize(NULL, 0);
6728
6729 _PyBytesWriter_Init(&writer);
6730 str = _PyBytesWriter_Alloc(&writer, size);
6731 if (str == NULL)
6732 return NULL;
6733
6734 while (pos < size) {
6735 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6736
6737 /* can we encode this? */
6738 if (ch < limit) {
6739 /* no overflow check, because we know that the space is enough */
6740 *str++ = (char)ch;
6741 ++pos;
6742 }
6743 else {
6744 Py_ssize_t newpos, i;
6745 /* startpos for collecting unencodable chars */
6746 Py_ssize_t collstart = pos;
6747 Py_ssize_t collend = collstart + 1;
6748 /* find all unecodable characters */
6749
6750 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6751 ++collend;
6752
6753 /* Only overallocate the buffer if it's not the last write */
6754 writer.overallocate = (collend < size);
6755
6756 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6757 if (error_handler == _Py_ERROR_UNKNOWN)
6758 error_handler = get_error_handler(errors);
6759
6760 switch (error_handler) {
6761 case _Py_ERROR_STRICT:
6762 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6763 goto onError;
6764
6765 case _Py_ERROR_REPLACE:
6766 memset(str, '?', collend - collstart);
6767 str += (collend - collstart);
6768 /* fall through ignore error handler */
6769 case _Py_ERROR_IGNORE:
6770 pos = collend;
6771 break;
6772
6773 case _Py_ERROR_BACKSLASHREPLACE:
6774 /* subtract preallocated bytes */
6775 writer.min_size -= (collend - collstart);
6776 str = backslashreplace(&writer, str,
6777 unicode, collstart, collend);
6778 if (str == NULL)
6779 goto onError;
6780 pos = collend;
6781 break;
6782
6783 case _Py_ERROR_XMLCHARREFREPLACE:
6784 /* subtract preallocated bytes */
6785 writer.min_size -= (collend - collstart);
6786 str = xmlcharrefreplace(&writer, str,
6787 unicode, collstart, collend);
6788 if (str == NULL)
6789 goto onError;
6790 pos = collend;
6791 break;
6792
6793 case _Py_ERROR_SURROGATEESCAPE:
6794 for (i = collstart; i < collend; ++i) {
6795 ch = PyUnicode_READ(kind, data, i);
6796 if (ch < 0xdc80 || 0xdcff < ch) {
6797 /* Not a UTF-8b surrogate */
6798 break;
6799 }
6800 *str++ = (char)(ch - 0xdc00);
6801 ++pos;
6802 }
6803 if (i >= collend)
6804 break;
6805 collstart = pos;
6806 assert(collstart != collend);
6807 /* fallback to general error handling */
6808
6809 default:
6810 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811 encoding, reason, unicode, &exc,
6812 collstart, collend, &newpos);
6813 if (rep == NULL)
6814 goto onError;
6815
6816 /* subtract preallocated bytes */
6817 writer.min_size -= 1;
6818
6819 if (PyBytes_Check(rep)) {
6820 /* Directly copy bytes result to output. */
6821 str = _PyBytesWriter_WriteBytes(&writer, str,
6822 PyBytes_AS_STRING(rep),
6823 PyBytes_GET_SIZE(rep));
6824 if (str == NULL)
6825 goto onError;
6826 }
6827 else {
6828 assert(PyUnicode_Check(rep));
6829
6830 if (PyUnicode_READY(rep) < 0)
6831 goto onError;
6832
6833 if (PyUnicode_IS_ASCII(rep)) {
6834 /* Fast path: all characters are smaller than limit */
6835 assert(limit >= 128);
6836 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6837 str = _PyBytesWriter_WriteBytes(&writer, str,
6838 PyUnicode_DATA(rep),
6839 PyUnicode_GET_LENGTH(rep));
6840 }
6841 else {
6842 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6843
6844 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6845 if (str == NULL)
6846 goto onError;
6847
6848 /* check if there is anything unencodable in the
6849 replacement and copy it to the output */
6850 for (i = 0; repsize-->0; ++i, ++str) {
6851 ch = PyUnicode_READ_CHAR(rep, i);
6852 if (ch >= limit) {
6853 raise_encode_exception(&exc, encoding, unicode,
6854 pos, pos+1, reason);
6855 goto onError;
6856 }
6857 *str = (char)ch;
6858 }
6859 }
6860 }
6861 pos = newpos;
6862 Py_CLEAR(rep);
6863 }
6864
6865 /* If overallocation was disabled, ensure that it was the last
6866 write. Otherwise, we missed an optimization */
6867 assert(writer.overallocate || pos == size);
6868 }
6869 }
6870
6871 Py_XDECREF(error_handler_obj);
6872 Py_XDECREF(exc);
6873 return _PyBytesWriter_Finish(&writer, str);
6874
6875 onError:
6876 Py_XDECREF(rep);
6877 _PyBytesWriter_Dealloc(&writer);
6878 Py_XDECREF(error_handler_obj);
6879 Py_XDECREF(exc);
6880 return NULL;
6881 }
6882
6883 /* Deprecated */
6884 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6885 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6886 Py_ssize_t size,
6887 const char *errors)
6888 {
6889 PyObject *result;
6890 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
6896 }
6897
6898 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6899 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6900 {
6901 if (!PyUnicode_Check(unicode)) {
6902 PyErr_BadArgument();
6903 return NULL;
6904 }
6905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
6914 return unicode_encode_ucs1(unicode, errors, 256);
6915 }
6916
6917 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6918 PyUnicode_AsLatin1String(PyObject *unicode)
6919 {
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
6921 }
6922
6923 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
6925 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6926 PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
6929 {
6930 const char *starts = s;
6931 _PyUnicodeWriter writer;
6932 int kind;
6933 void *data;
6934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
6937 const char *e;
6938 PyObject *error_handler_obj = NULL;
6939 PyObject *exc = NULL;
6940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6941
6942 if (size == 0)
6943 _Py_RETURN_UNICODE_EMPTY();
6944
6945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6946 if (size == 1 && (unsigned char)s[0] < 128)
6947 return get_latin1_char((unsigned char)s[0]);
6948
6949 _PyUnicodeWriter_Init(&writer);
6950 writer.min_length = size;
6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6952 return NULL;
6953
6954 e = s + size;
6955 data = writer.data;
6956 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6957 writer.pos = outpos;
6958 if (writer.pos == size)
6959 return _PyUnicodeWriter_Finish(&writer);
6960
6961 s += writer.pos;
6962 kind = writer.kind;
6963 while (s < e) {
6964 unsigned char c = (unsigned char)*s;
6965 if (c < 128) {
6966 PyUnicode_WRITE(kind, data, writer.pos, c);
6967 writer.pos++;
6968 ++s;
6969 continue;
6970 }
6971
6972 /* byte outsize range 0x00..0x7f: call the error handler */
6973
6974 if (error_handler == _Py_ERROR_UNKNOWN)
6975 error_handler = get_error_handler(errors);
6976
6977 switch (error_handler)
6978 {
6979 case _Py_ERROR_REPLACE:
6980 case _Py_ERROR_SURROGATEESCAPE:
6981 /* Fast-path: the error handler only writes one character,
6982 but we may switch to UCS2 at the first write */
6983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984 goto onError;
6985 kind = writer.kind;
6986 data = writer.data;
6987
6988 if (error_handler == _Py_ERROR_REPLACE)
6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990 else
6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992 writer.pos++;
6993 ++s;
6994 break;
6995
6996 case _Py_ERROR_IGNORE:
6997 ++s;
6998 break;
6999
7000 default:
7001 startinpos = s-starts;
7002 endinpos = startinpos + 1;
7003 if (unicode_decode_call_errorhandler_writer(
7004 errors, &error_handler_obj,
7005 "ascii", "ordinal not in range(128)",
7006 &starts, &e, &startinpos, &endinpos, &exc, &s,
7007 &writer))
7008 goto onError;
7009 kind = writer.kind;
7010 data = writer.data;
7011 }
7012 }
7013 Py_XDECREF(error_handler_obj);
7014 Py_XDECREF(exc);
7015 return _PyUnicodeWriter_Finish(&writer);
7016
7017 onError:
7018 _PyUnicodeWriter_Dealloc(&writer);
7019 Py_XDECREF(error_handler_obj);
7020 Py_XDECREF(exc);
7021 return NULL;
7022 }
7023
7024 /* Deprecated */
7025 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7026 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 const char *errors)
7029 {
7030 PyObject *result;
7031 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7032 if (unicode == NULL)
7033 return NULL;
7034 result = unicode_encode_ucs1(unicode, errors, 128);
7035 Py_DECREF(unicode);
7036 return result;
7037 }
7038
7039 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7040 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7041 {
7042 if (!PyUnicode_Check(unicode)) {
7043 PyErr_BadArgument();
7044 return NULL;
7045 }
7046 if (PyUnicode_READY(unicode) == -1)
7047 return NULL;
7048 /* Fast path: if it is an ASCII-only string, construct bytes object
7049 directly. Else defer to above function to raise the exception. */
7050 if (PyUnicode_IS_ASCII(unicode))
7051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052 PyUnicode_GET_LENGTH(unicode));
7053 return unicode_encode_ucs1(unicode, errors, 128);
7054 }
7055
7056 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7057 PyUnicode_AsASCIIString(PyObject *unicode)
7058 {
7059 return _PyUnicode_AsASCIIString(unicode, NULL);
7060 }
7061
7062 #ifdef MS_WINDOWS
7063
7064 /* --- MBCS codecs for Windows -------------------------------------------- */
7065
/* When size_t is wider than int, the code page codecs below process their
   input in chunks of at most INT_MAX (see the NEED_RETRY sections). */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7073
7074 static const char*
code_page_name(UINT code_page,PyObject ** obj)7075 code_page_name(UINT code_page, PyObject **obj)
7076 {
7077 *obj = NULL;
7078 if (code_page == CP_ACP)
7079 return "mbcs";
7080 if (code_page == CP_UTF7)
7081 return "CP_UTF7";
7082 if (code_page == CP_UTF8)
7083 return "CP_UTF8";
7084
7085 *obj = PyBytes_FromFormat("cp%u", code_page);
7086 if (*obj == NULL)
7087 return NULL;
7088 return PyBytes_AS_STRING(*obj);
7089 }
7090
7091 static DWORD
decode_code_page_flags(UINT code_page)7092 decode_code_page_flags(UINT code_page)
7093 {
7094 if (code_page == CP_UTF7) {
7095 /* The CP_UTF7 decoder only supports flags=0 */
7096 return 0;
7097 }
7098 else
7099 return MB_ERR_INVALID_CHARS;
7100 }
7101
7102 /*
7103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
7106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
7108 */
7109 static int
decode_code_page_strict(UINT code_page,PyObject ** v,const char * in,int insize)7110 decode_code_page_strict(UINT code_page,
7111 PyObject **v,
7112 const char *in,
7113 int insize)
7114 {
7115 const DWORD flags = decode_code_page_flags(code_page);
7116 wchar_t *out;
7117 DWORD outsize;
7118
7119 /* First get the size of the result */
7120 assert(insize > 0);
7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122 if (outsize <= 0)
7123 goto error;
7124
7125 if (*v == NULL) {
7126 /* Create unicode object */
7127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7128 *v = (PyObject*)_PyUnicode_New(outsize);
7129 if (*v == NULL)
7130 return -1;
7131 out = PyUnicode_AS_UNICODE(*v);
7132 }
7133 else {
7134 /* Extend unicode object */
7135 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7136 if (unicode_resize(v, n + outsize) < 0)
7137 return -1;
7138 out = PyUnicode_AS_UNICODE(*v) + n;
7139 }
7140
7141 /* Do the conversion */
7142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143 if (outsize <= 0)
7144 goto error;
7145 return insize;
7146
7147 error:
7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
7151 return -1;
7152 }
7153
7154 /*
7155 * Decode a byte string from a code page into unicode object with an error
7156 * handler.
7157 *
7158 * Returns consumed size if succeed, or raise an OSError or
7159 * UnicodeDecodeError exception and returns -1 on error.
7160 */
7161 static int
decode_code_page_errors(UINT code_page,PyObject ** v,const char * in,const int size,const char * errors,int final)7162 decode_code_page_errors(UINT code_page,
7163 PyObject **v,
7164 const char *in, const int size,
7165 const char *errors, int final)
7166 {
7167 const char *startin = in;
7168 const char *endin = in + size;
7169 const DWORD flags = decode_code_page_flags(code_page);
7170 /* Ideally, we should get reason from FormatMessage. This is the Windows
7171 2000 English version of the message. */
7172 const char *reason = "No mapping for the Unicode character exists "
7173 "in the target code page.";
7174 /* each step cannot decode more than 1 character, but a character can be
7175 represented as a surrogate pair */
7176 wchar_t buffer[2], *startout, *out;
7177 int insize;
7178 Py_ssize_t outsize;
7179 PyObject *errorHandler = NULL;
7180 PyObject *exc = NULL;
7181 PyObject *encoding_obj = NULL;
7182 const char *encoding;
7183 DWORD err;
7184 int ret = -1;
7185
7186 assert(size > 0);
7187
7188 encoding = code_page_name(code_page, &encoding_obj);
7189 if (encoding == NULL)
7190 return -1;
7191
7192 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7193 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194 UnicodeDecodeError. */
7195 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196 if (exc != NULL) {
7197 PyCodec_StrictErrors(exc);
7198 Py_CLEAR(exc);
7199 }
7200 goto error;
7201 }
7202
7203 if (*v == NULL) {
7204 /* Create unicode object */
7205 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206 PyErr_NoMemory();
7207 goto error;
7208 }
7209 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7210 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7211 if (*v == NULL)
7212 goto error;
7213 startout = PyUnicode_AS_UNICODE(*v);
7214 }
7215 else {
7216 /* Extend unicode object */
7217 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219 PyErr_NoMemory();
7220 goto error;
7221 }
7222 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7223 goto error;
7224 startout = PyUnicode_AS_UNICODE(*v) + n;
7225 }
7226
7227 /* Decode the byte string character per character */
7228 out = startout;
7229 while (in < endin)
7230 {
7231 /* Decode a character */
7232 insize = 1;
7233 do
7234 {
7235 outsize = MultiByteToWideChar(code_page, flags,
7236 in, insize,
7237 buffer, Py_ARRAY_LENGTH(buffer));
7238 if (outsize > 0)
7239 break;
7240 err = GetLastError();
7241 if (err != ERROR_NO_UNICODE_TRANSLATION
7242 && err != ERROR_INSUFFICIENT_BUFFER)
7243 {
7244 PyErr_SetFromWindowsErr(0);
7245 goto error;
7246 }
7247 insize++;
7248 }
7249 /* 4=maximum length of a UTF-8 sequence */
7250 while (insize <= 4 && (in + insize) <= endin);
7251
7252 if (outsize <= 0) {
7253 Py_ssize_t startinpos, endinpos, outpos;
7254
7255 /* last character in partial decode? */
7256 if (in + insize >= endin && !final)
7257 break;
7258
7259 startinpos = in - startin;
7260 endinpos = startinpos + 1;
7261 outpos = out - PyUnicode_AS_UNICODE(*v);
7262 if (unicode_decode_call_errorhandler_wchar(
7263 errors, &errorHandler,
7264 encoding, reason,
7265 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7266 v, &outpos))
7267 {
7268 goto error;
7269 }
7270 out = PyUnicode_AS_UNICODE(*v) + outpos;
7271 }
7272 else {
7273 in += insize;
7274 memcpy(out, buffer, outsize * sizeof(wchar_t));
7275 out += outsize;
7276 }
7277 }
7278
7279 /* write a NUL character at the end */
7280 *out = 0;
7281
7282 /* Extend unicode object */
7283 outsize = out - startout;
7284 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7285 if (unicode_resize(v, outsize) < 0)
7286 goto error;
7287 /* (in - startin) <= size and size is an int */
7288 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7289
7290 error:
7291 Py_XDECREF(encoding_obj);
7292 Py_XDECREF(errorHandler);
7293 Py_XDECREF(exc);
7294 return ret;
7295 }
7296
7297 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7298 decode_code_page_stateful(int code_page,
7299 const char *s, Py_ssize_t size,
7300 const char *errors, Py_ssize_t *consumed)
7301 {
7302 PyObject *v = NULL;
7303 int chunk_size, final, converted, done;
7304
7305 if (code_page < 0) {
7306 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307 return NULL;
7308 }
7309
7310 if (consumed)
7311 *consumed = 0;
7312
7313 do
7314 {
7315 #ifdef NEED_RETRY
7316 if (size > INT_MAX) {
7317 chunk_size = INT_MAX;
7318 final = 0;
7319 done = 0;
7320 }
7321 else
7322 #endif
7323 {
7324 chunk_size = (int)size;
7325 final = (consumed == NULL);
7326 done = 1;
7327 }
7328
7329 if (chunk_size == 0 && done) {
7330 if (v != NULL)
7331 break;
7332 _Py_RETURN_UNICODE_EMPTY();
7333 }
7334
7335 converted = decode_code_page_strict(code_page, &v,
7336 s, chunk_size);
7337 if (converted == -2)
7338 converted = decode_code_page_errors(code_page, &v,
7339 s, chunk_size,
7340 errors, final);
7341 assert(converted != 0 || done);
7342
7343 if (converted < 0) {
7344 Py_XDECREF(v);
7345 return NULL;
7346 }
7347
7348 if (consumed)
7349 *consumed += converted;
7350
7351 s += converted;
7352 size -= converted;
7353 } while (!done);
7354
7355 return unicode_result(v);
7356 }
7357
7358 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7359 PyUnicode_DecodeCodePageStateful(int code_page,
7360 const char *s,
7361 Py_ssize_t size,
7362 const char *errors,
7363 Py_ssize_t *consumed)
7364 {
7365 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7366 }
7367
7368 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7369 PyUnicode_DecodeMBCSStateful(const char *s,
7370 Py_ssize_t size,
7371 const char *errors,
7372 Py_ssize_t *consumed)
7373 {
7374 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7375 }
7376
7377 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7378 PyUnicode_DecodeMBCS(const char *s,
7379 Py_ssize_t size,
7380 const char *errors)
7381 {
7382 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7383 }
7384
7385 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7386 encode_code_page_flags(UINT code_page, const char *errors)
7387 {
7388 if (code_page == CP_UTF8) {
7389 return WC_ERR_INVALID_CHARS;
7390 }
7391 else if (code_page == CP_UTF7) {
7392 /* CP_UTF7 only supports flags=0 */
7393 return 0;
7394 }
7395 else {
7396 if (errors != NULL && strcmp(errors, "replace") == 0)
7397 return 0;
7398 else
7399 return WC_NO_BEST_FIT_CHARS;
7400 }
7401 }
7402
7403 /*
7404 * Encode a Unicode string to a Windows code page into a byte string in strict
7405 * mode.
7406 *
7407 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7408 * an OSError and returns -1 on other error.
7409 */
7410 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7411 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7412 PyObject *unicode, Py_ssize_t offset, int len,
7413 const char* errors)
7414 {
7415 BOOL usedDefaultChar = FALSE;
7416 BOOL *pusedDefaultChar = &usedDefaultChar;
7417 int outsize;
7418 wchar_t *p;
7419 Py_ssize_t size;
7420 const DWORD flags = encode_code_page_flags(code_page, NULL);
7421 char *out;
7422 /* Create a substring so that we can get the UTF-16 representation
7423 of just the slice under consideration. */
7424 PyObject *substring;
7425
7426 assert(len > 0);
7427
7428 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7429 pusedDefaultChar = &usedDefaultChar;
7430 else
7431 pusedDefaultChar = NULL;
7432
7433 substring = PyUnicode_Substring(unicode, offset, offset+len);
7434 if (substring == NULL)
7435 return -1;
7436 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7437 if (p == NULL) {
7438 Py_DECREF(substring);
7439 return -1;
7440 }
7441 assert(size <= INT_MAX);
7442
7443 /* First get the size of the result */
7444 outsize = WideCharToMultiByte(code_page, flags,
7445 p, (int)size,
7446 NULL, 0,
7447 NULL, pusedDefaultChar);
7448 if (outsize <= 0)
7449 goto error;
7450 /* If we used a default char, then we failed! */
7451 if (pusedDefaultChar && *pusedDefaultChar) {
7452 Py_DECREF(substring);
7453 return -2;
7454 }
7455
7456 if (*outbytes == NULL) {
7457 /* Create string object */
7458 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7459 if (*outbytes == NULL) {
7460 Py_DECREF(substring);
7461 return -1;
7462 }
7463 out = PyBytes_AS_STRING(*outbytes);
7464 }
7465 else {
7466 /* Extend string object */
7467 const Py_ssize_t n = PyBytes_Size(*outbytes);
7468 if (outsize > PY_SSIZE_T_MAX - n) {
7469 PyErr_NoMemory();
7470 Py_DECREF(substring);
7471 return -1;
7472 }
7473 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7474 Py_DECREF(substring);
7475 return -1;
7476 }
7477 out = PyBytes_AS_STRING(*outbytes) + n;
7478 }
7479
7480 /* Do the conversion */
7481 outsize = WideCharToMultiByte(code_page, flags,
7482 p, (int)size,
7483 out, outsize,
7484 NULL, pusedDefaultChar);
7485 Py_CLEAR(substring);
7486 if (outsize <= 0)
7487 goto error;
7488 if (pusedDefaultChar && *pusedDefaultChar)
7489 return -2;
7490 return 0;
7491
7492 error:
7493 Py_XDECREF(substring);
7494 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7495 return -2;
7496 PyErr_SetFromWindowsErr(0);
7497 return -1;
7498 }
7499
7500 /*
7501 * Encode a Unicode string to a Windows code page into a byte string using an
7502 * error handler.
7503 *
7504 * Returns consumed characters if succeed, or raise an OSError and returns
7505 * -1 on other error.
7506 */
7507 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7508 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7509 PyObject *unicode, Py_ssize_t unicode_offset,
7510 Py_ssize_t insize, const char* errors)
7511 {
7512 const DWORD flags = encode_code_page_flags(code_page, errors);
7513 Py_ssize_t pos = unicode_offset;
7514 Py_ssize_t endin = unicode_offset + insize;
7515 /* Ideally, we should get reason from FormatMessage. This is the Windows
7516 2000 English version of the message. */
7517 const char *reason = "invalid character";
7518 /* 4=maximum length of a UTF-8 sequence */
7519 char buffer[4];
7520 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7521 Py_ssize_t outsize;
7522 char *out;
7523 PyObject *errorHandler = NULL;
7524 PyObject *exc = NULL;
7525 PyObject *encoding_obj = NULL;
7526 const char *encoding;
7527 Py_ssize_t newpos, newoutsize;
7528 PyObject *rep;
7529 int ret = -1;
7530
7531 assert(insize > 0);
7532
7533 encoding = code_page_name(code_page, &encoding_obj);
7534 if (encoding == NULL)
7535 return -1;
7536
7537 if (errors == NULL || strcmp(errors, "strict") == 0) {
7538 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7539 then we raise a UnicodeEncodeError. */
7540 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7541 if (exc != NULL) {
7542 PyCodec_StrictErrors(exc);
7543 Py_DECREF(exc);
7544 }
7545 Py_XDECREF(encoding_obj);
7546 return -1;
7547 }
7548
7549 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7550 pusedDefaultChar = &usedDefaultChar;
7551 else
7552 pusedDefaultChar = NULL;
7553
7554 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7555 PyErr_NoMemory();
7556 goto error;
7557 }
7558 outsize = insize * Py_ARRAY_LENGTH(buffer);
7559
7560 if (*outbytes == NULL) {
7561 /* Create string object */
7562 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7563 if (*outbytes == NULL)
7564 goto error;
7565 out = PyBytes_AS_STRING(*outbytes);
7566 }
7567 else {
7568 /* Extend string object */
7569 Py_ssize_t n = PyBytes_Size(*outbytes);
7570 if (n > PY_SSIZE_T_MAX - outsize) {
7571 PyErr_NoMemory();
7572 goto error;
7573 }
7574 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7575 goto error;
7576 out = PyBytes_AS_STRING(*outbytes) + n;
7577 }
7578
7579 /* Encode the string character per character */
7580 while (pos < endin)
7581 {
7582 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7583 wchar_t chars[2];
7584 int charsize;
7585 if (ch < 0x10000) {
7586 chars[0] = (wchar_t)ch;
7587 charsize = 1;
7588 }
7589 else {
7590 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7591 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7592 charsize = 2;
7593 }
7594
7595 outsize = WideCharToMultiByte(code_page, flags,
7596 chars, charsize,
7597 buffer, Py_ARRAY_LENGTH(buffer),
7598 NULL, pusedDefaultChar);
7599 if (outsize > 0) {
7600 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7601 {
7602 pos++;
7603 memcpy(out, buffer, outsize);
7604 out += outsize;
7605 continue;
7606 }
7607 }
7608 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7609 PyErr_SetFromWindowsErr(0);
7610 goto error;
7611 }
7612
7613 rep = unicode_encode_call_errorhandler(
7614 errors, &errorHandler, encoding, reason,
7615 unicode, &exc,
7616 pos, pos + 1, &newpos);
7617 if (rep == NULL)
7618 goto error;
7619 pos = newpos;
7620
7621 if (PyBytes_Check(rep)) {
7622 outsize = PyBytes_GET_SIZE(rep);
7623 if (outsize != 1) {
7624 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7625 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7626 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7627 Py_DECREF(rep);
7628 goto error;
7629 }
7630 out = PyBytes_AS_STRING(*outbytes) + offset;
7631 }
7632 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7633 out += outsize;
7634 }
7635 else {
7636 Py_ssize_t i;
7637 enum PyUnicode_Kind kind;
7638 void *data;
7639
7640 if (PyUnicode_READY(rep) == -1) {
7641 Py_DECREF(rep);
7642 goto error;
7643 }
7644
7645 outsize = PyUnicode_GET_LENGTH(rep);
7646 if (outsize != 1) {
7647 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7648 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7649 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7650 Py_DECREF(rep);
7651 goto error;
7652 }
7653 out = PyBytes_AS_STRING(*outbytes) + offset;
7654 }
7655 kind = PyUnicode_KIND(rep);
7656 data = PyUnicode_DATA(rep);
7657 for (i=0; i < outsize; i++) {
7658 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7659 if (ch > 127) {
7660 raise_encode_exception(&exc,
7661 encoding, unicode,
7662 pos, pos + 1,
7663 "unable to encode error handler result to ASCII");
7664 Py_DECREF(rep);
7665 goto error;
7666 }
7667 *out = (unsigned char)ch;
7668 out++;
7669 }
7670 }
7671 Py_DECREF(rep);
7672 }
7673 /* write a NUL byte */
7674 *out = 0;
7675 outsize = out - PyBytes_AS_STRING(*outbytes);
7676 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7677 if (_PyBytes_Resize(outbytes, outsize) < 0)
7678 goto error;
7679 ret = 0;
7680
7681 error:
7682 Py_XDECREF(encoding_obj);
7683 Py_XDECREF(errorHandler);
7684 Py_XDECREF(exc);
7685 return ret;
7686 }
7687
7688 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7689 encode_code_page(int code_page,
7690 PyObject *unicode,
7691 const char *errors)
7692 {
7693 Py_ssize_t len;
7694 PyObject *outbytes = NULL;
7695 Py_ssize_t offset;
7696 int chunk_len, ret, done;
7697
7698 if (!PyUnicode_Check(unicode)) {
7699 PyErr_BadArgument();
7700 return NULL;
7701 }
7702
7703 if (PyUnicode_READY(unicode) == -1)
7704 return NULL;
7705 len = PyUnicode_GET_LENGTH(unicode);
7706
7707 if (code_page < 0) {
7708 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7709 return NULL;
7710 }
7711
7712 if (len == 0)
7713 return PyBytes_FromStringAndSize(NULL, 0);
7714
7715 offset = 0;
7716 do
7717 {
7718 #ifdef NEED_RETRY
7719 /* UTF-16 encoding may double the size, so use only INT_MAX/2
7720 chunks. */
7721 if (len > INT_MAX/2) {
7722 chunk_len = INT_MAX/2;
7723 done = 0;
7724 }
7725 else
7726 #endif
7727 {
7728 chunk_len = (int)len;
7729 done = 1;
7730 }
7731
7732 ret = encode_code_page_strict(code_page, &outbytes,
7733 unicode, offset, chunk_len,
7734 errors);
7735 if (ret == -2)
7736 ret = encode_code_page_errors(code_page, &outbytes,
7737 unicode, offset,
7738 chunk_len, errors);
7739 if (ret < 0) {
7740 Py_XDECREF(outbytes);
7741 return NULL;
7742 }
7743
7744 offset += chunk_len;
7745 len -= chunk_len;
7746 } while (!done);
7747
7748 return outbytes;
7749 }
7750
7751 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7752 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7753 Py_ssize_t size,
7754 const char *errors)
7755 {
7756 PyObject *unicode, *res;
7757 unicode = PyUnicode_FromUnicode(p, size);
7758 if (unicode == NULL)
7759 return NULL;
7760 res = encode_code_page(CP_ACP, unicode, errors);
7761 Py_DECREF(unicode);
7762 return res;
7763 }
7764
7765 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7766 PyUnicode_EncodeCodePage(int code_page,
7767 PyObject *unicode,
7768 const char *errors)
7769 {
7770 return encode_code_page(code_page, unicode, errors);
7771 }
7772
7773 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7774 PyUnicode_AsMBCSString(PyObject *unicode)
7775 {
7776 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7777 }
7778
7779 #undef NEED_RETRY
7780
7781 #endif /* MS_WINDOWS */
7782
7783 /* --- Character Mapping Codec -------------------------------------------- */
7784
/* Decode the `size` bytes starting at `s` through `mapping`, a str object
   used as a lookup table indexed by byte value, and append the decoded
   characters to `writer`.

   A table entry of U+FFFE, or a byte value at or beyond the end of the
   table, counts as "undefined" and is routed through the error handler
   named by `errors`.

   Return 0 on success and -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        /* Characters are stored straight into the writer's buffer without a
           per-character capacity check; this relies on the caller having
           reserved room for `size` characters (PyUnicode_DecodeCharmap
           prepares the writer that way). */
        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff; the buffer pointer and
                   maxchar are re-read afterwards because Prepare may have
                   changed them. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a UCS2 table covering all 256 byte
               values.  They run until the input is exhausted (break) or
               until a character needs the general handling below, in which
               case they jump to Error with `x` already looked up. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current buffer (this also catches
                       0xFFFE): leave the fast loop. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached by goto from the fast loops above, with `x` holding
           the character that could not be stored directly. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            /* The handler is given the addresses of starts, e and s, so it
               can update them; the loop then resumes from the new `s`. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7903
7904 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7905 charmap_decode_mapping(const char *s,
7906 Py_ssize_t size,
7907 PyObject *mapping,
7908 const char *errors,
7909 _PyUnicodeWriter *writer)
7910 {
7911 const char *starts = s;
7912 const char *e;
7913 Py_ssize_t startinpos, endinpos;
7914 PyObject *errorHandler = NULL, *exc = NULL;
7915 unsigned char ch;
7916 PyObject *key, *item = NULL;
7917
7918 e = s + size;
7919
7920 while (s < e) {
7921 ch = *s;
7922
7923 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7924 key = PyLong_FromLong((long)ch);
7925 if (key == NULL)
7926 goto onError;
7927
7928 item = PyObject_GetItem(mapping, key);
7929 Py_DECREF(key);
7930 if (item == NULL) {
7931 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7932 /* No mapping found means: mapping is undefined. */
7933 PyErr_Clear();
7934 goto Undefined;
7935 } else
7936 goto onError;
7937 }
7938
7939 /* Apply mapping */
7940 if (item == Py_None)
7941 goto Undefined;
7942 if (PyLong_Check(item)) {
7943 long value = PyLong_AS_LONG(item);
7944 if (value == 0xFFFE)
7945 goto Undefined;
7946 if (value < 0 || value > MAX_UNICODE) {
7947 PyErr_Format(PyExc_TypeError,
7948 "character mapping must be in range(0x%lx)",
7949 (unsigned long)MAX_UNICODE + 1);
7950 goto onError;
7951 }
7952
7953 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7954 goto onError;
7955 }
7956 else if (PyUnicode_Check(item)) {
7957 if (PyUnicode_READY(item) == -1)
7958 goto onError;
7959 if (PyUnicode_GET_LENGTH(item) == 1) {
7960 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7961 if (value == 0xFFFE)
7962 goto Undefined;
7963 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7964 goto onError;
7965 }
7966 else {
7967 writer->overallocate = 1;
7968 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7969 goto onError;
7970 }
7971 }
7972 else {
7973 /* wrong return value */
7974 PyErr_SetString(PyExc_TypeError,
7975 "character mapping must return integer, None or str");
7976 goto onError;
7977 }
7978 Py_CLEAR(item);
7979 ++s;
7980 continue;
7981
7982 Undefined:
7983 /* undefined mapping */
7984 Py_CLEAR(item);
7985 startinpos = s-starts;
7986 endinpos = startinpos+1;
7987 if (unicode_decode_call_errorhandler_writer(
7988 errors, &errorHandler,
7989 "charmap", "character maps to <undefined>",
7990 &starts, &e, &startinpos, &endinpos, &exc, &s,
7991 writer)) {
7992 goto onError;
7993 }
7994 }
7995 Py_XDECREF(errorHandler);
7996 Py_XDECREF(exc);
7997 return 0;
7998
7999 onError:
8000 Py_XDECREF(item);
8001 Py_XDECREF(errorHandler);
8002 Py_XDECREF(exc);
8003 return -1;
8004 }
8005
8006 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8007 PyUnicode_DecodeCharmap(const char *s,
8008 Py_ssize_t size,
8009 PyObject *mapping,
8010 const char *errors)
8011 {
8012 _PyUnicodeWriter writer;
8013
8014 /* Default to Latin-1 */
8015 if (mapping == NULL)
8016 return PyUnicode_DecodeLatin1(s, size, errors);
8017
8018 if (size == 0)
8019 _Py_RETURN_UNICODE_EMPTY();
8020 _PyUnicodeWriter_Init(&writer);
8021 writer.min_length = size;
8022 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8023 goto onError;
8024
8025 if (PyUnicode_CheckExact(mapping)) {
8026 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8027 goto onError;
8028 }
8029 else {
8030 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
8032 }
8033 return _PyUnicodeWriter_Finish(&writer);
8034
8035 onError:
8036 _PyUnicodeWriter_Dealloc(&writer);
8037 return NULL;
8038 }
8039
8040 /* Charmap encoding: the lookup table */
8041
/* Three-level trie mapping a BMP code point to a byte value, as built by
   PyUnicode_BuildEncodingMap() and read by encoding_map_lookup().

   The object is allocated with extra room after the struct so that level23
   actually holds 16*count2 + 128*count3 bytes. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11); 0xFF means no level-2 block for that range. */
    unsigned char level1[32];
    /* Number of 16-entry level-2 blocks and 128-entry level-3 blocks. */
    int count2, count3;
    /* Level-2 blocks (16*count2 bytes, 0xFF = no level-3 block) followed by
       the level-3 blocks (128*count3 bytes, 0 = unmapped).  Declared with
       one element; the real size is fixed at allocation time. */
    unsigned char level23[1];
};
8048
8049 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8050 encoding_map_size(PyObject *obj, PyObject* args)
8051 {
8052 struct encoding_map *map = (struct encoding_map*)obj;
8053 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8054 128*map->count3);
8055 }
8056
/* Method table of the EncodingMap type: a single no-argument method,
   size(), followed by the zeroed terminating entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8062
/* tp_dealloc of the EncodingMap type.  struct encoding_map contains only
   byte arrays and counters, so there are no owned references to drop and
   the memory is simply handed back with PyObject_FREE(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8068
/* Type object for the trie objects created by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are filled in; every other slot is zero,
   and tp_itemsize is 0 even though instances are allocated with a
   variable-length tail (the size is computed by the code that allocates
   them). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8112
8113 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8114 PyUnicode_BuildEncodingMap(PyObject* string)
8115 {
8116 PyObject *result;
8117 struct encoding_map *mresult;
8118 int i;
8119 int need_dict = 0;
8120 unsigned char level1[32];
8121 unsigned char level2[512];
8122 unsigned char *mlevel1, *mlevel2, *mlevel3;
8123 int count2 = 0, count3 = 0;
8124 int kind;
8125 void *data;
8126 Py_ssize_t length;
8127 Py_UCS4 ch;
8128
8129 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8130 PyErr_BadArgument();
8131 return NULL;
8132 }
8133 kind = PyUnicode_KIND(string);
8134 data = PyUnicode_DATA(string);
8135 length = PyUnicode_GET_LENGTH(string);
8136 length = Py_MIN(length, 256);
8137 memset(level1, 0xFF, sizeof level1);
8138 memset(level2, 0xFF, sizeof level2);
8139
8140 /* If there isn't a one-to-one mapping of NULL to \0,
8141 or if there are non-BMP characters, we need to use
8142 a mapping dictionary. */
8143 if (PyUnicode_READ(kind, data, 0) != 0)
8144 need_dict = 1;
8145 for (i = 1; i < length; i++) {
8146 int l1, l2;
8147 ch = PyUnicode_READ(kind, data, i);
8148 if (ch == 0 || ch > 0xFFFF) {
8149 need_dict = 1;
8150 break;
8151 }
8152 if (ch == 0xFFFE)
8153 /* unmapped character */
8154 continue;
8155 l1 = ch >> 11;
8156 l2 = ch >> 7;
8157 if (level1[l1] == 0xFF)
8158 level1[l1] = count2++;
8159 if (level2[l2] == 0xFF)
8160 level2[l2] = count3++;
8161 }
8162
8163 if (count2 >= 0xFF || count3 >= 0xFF)
8164 need_dict = 1;
8165
8166 if (need_dict) {
8167 PyObject *result = PyDict_New();
8168 PyObject *key, *value;
8169 if (!result)
8170 return NULL;
8171 for (i = 0; i < length; i++) {
8172 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8173 value = PyLong_FromLong(i);
8174 if (!key || !value)
8175 goto failed1;
8176 if (PyDict_SetItem(result, key, value) == -1)
8177 goto failed1;
8178 Py_DECREF(key);
8179 Py_DECREF(value);
8180 }
8181 return result;
8182 failed1:
8183 Py_XDECREF(key);
8184 Py_XDECREF(value);
8185 Py_DECREF(result);
8186 return NULL;
8187 }
8188
8189 /* Create a three-level trie */
8190 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8191 16*count2 + 128*count3 - 1);
8192 if (!result)
8193 return PyErr_NoMemory();
8194 PyObject_Init(result, &EncodingMapType);
8195 mresult = (struct encoding_map*)result;
8196 mresult->count2 = count2;
8197 mresult->count3 = count3;
8198 mlevel1 = mresult->level1;
8199 mlevel2 = mresult->level23;
8200 mlevel3 = mresult->level23 + 16*count2;
8201 memcpy(mlevel1, level1, 32);
8202 memset(mlevel2, 0xFF, 16*count2);
8203 memset(mlevel3, 0, 128*count3);
8204 count3 = 0;
8205 for (i = 1; i < length; i++) {
8206 int o1, o2, o3, i2, i3;
8207 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8208 if (ch == 0xFFFE)
8209 /* unmapped character */
8210 continue;
8211 o1 = ch>>11;
8212 o2 = (ch>>7) & 0xF;
8213 i2 = 16*mlevel1[o1] + o2;
8214 if (mlevel2[i2] == 0xFF)
8215 mlevel2[i2] = count3++;
8216 o3 = ch & 0x7F;
8217 i3 = 128*mlevel2[i2] + o3;
8218 mlevel3[i3] = i;
8219 }
8220 return result;
8221 }
8222
8223 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8224 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8225 {
8226 struct encoding_map *map = (struct encoding_map*)mapping;
8227 int l1 = c>>11;
8228 int l2 = (c>>7) & 0xF;
8229 int l3 = c & 0x7F;
8230 int i;
8231
8232 if (c > 0xFFFF)
8233 return -1;
8234 if (c == 0)
8235 return 0;
8236 /* level 1*/
8237 i = map->level1[l1];
8238 if (i == 0xFF) {
8239 return -1;
8240 }
8241 /* level 2*/
8242 i = map->level23[16*i+l2];
8243 if (i == 0xFF) {
8244 return -1;
8245 }
8246 /* level 3 */
8247 i = map->level23[16*map->count2 + 128*i + l3];
8248 if (i == 0) {
8249 return -1;
8250 }
8251 return i;
8252 }
8253
8254 /* Lookup the character ch in the mapping. If the character
8255 can't be found, Py_None is returned (or NULL, if another
8256 error occurred). */
8257 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8258 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8259 {
8260 PyObject *w = PyLong_FromLong((long)c);
8261 PyObject *x;
8262
8263 if (w == NULL)
8264 return NULL;
8265 x = PyObject_GetItem(mapping, w);
8266 Py_DECREF(w);
8267 if (x == NULL) {
8268 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8269 /* No mapping found means: mapping is undefined. */
8270 PyErr_Clear();
8271 x = Py_None;
8272 Py_INCREF(x);
8273 return x;
8274 } else
8275 return NULL;
8276 }
8277 else if (x == Py_None)
8278 return x;
8279 else if (PyLong_Check(x)) {
8280 long value = PyLong_AS_LONG(x);
8281 if (value < 0 || value > 255) {
8282 PyErr_SetString(PyExc_TypeError,
8283 "character mapping must be in range(256)");
8284 Py_DECREF(x);
8285 return NULL;
8286 }
8287 return x;
8288 }
8289 else if (PyBytes_Check(x))
8290 return x;
8291 else {
8292 /* wrong return value */
8293 PyErr_Format(PyExc_TypeError,
8294 "character mapping must return integer, bytes or None, not %.400s",
8295 x->ob_type->tp_name);
8296 Py_DECREF(x);
8297 return NULL;
8298 }
8299 }
8300
8301 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8302 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8303 {
8304 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8305 /* exponentially overallocate to minimize reallocations */
8306 if (requiredsize < 2*outsize)
8307 requiredsize = 2*outsize;
8308 if (_PyBytes_Resize(outobj, requiredsize))
8309 return -1;
8310 return 0;
8311 }
8312
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output,
     enc_FAILED    - the mapping has no entry for it; nothing was written,
     enc_EXCEPTION - the lookup or a buffer resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8316 /* lookup the character, put the result in the output string and adjust
8317 various state variables. Resize the output bytes object if not enough
8318 space is available. Return a new reference to the object that
8319 was put in the output buffer, or Py_None, if the mapping was undefined
8320 (in which case no character was written) or NULL, if a
8321 reallocation error occurred. The caller must decref the result */
8322 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8323 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8324 PyObject **outobj, Py_ssize_t *outpos)
8325 {
8326 PyObject *rep;
8327 char *outstart;
8328 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8329
8330 if (Py_TYPE(mapping) == &EncodingMapType) {
8331 int res = encoding_map_lookup(c, mapping);
8332 Py_ssize_t requiredsize = *outpos+1;
8333 if (res == -1)
8334 return enc_FAILED;
8335 if (outsize<requiredsize)
8336 if (charmapencode_resize(outobj, outpos, requiredsize))
8337 return enc_EXCEPTION;
8338 outstart = PyBytes_AS_STRING(*outobj);
8339 outstart[(*outpos)++] = (char)res;
8340 return enc_SUCCESS;
8341 }
8342
8343 rep = charmapencode_lookup(c, mapping);
8344 if (rep==NULL)
8345 return enc_EXCEPTION;
8346 else if (rep==Py_None) {
8347 Py_DECREF(rep);
8348 return enc_FAILED;
8349 } else {
8350 if (PyLong_Check(rep)) {
8351 Py_ssize_t requiredsize = *outpos+1;
8352 if (outsize<requiredsize)
8353 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8354 Py_DECREF(rep);
8355 return enc_EXCEPTION;
8356 }
8357 outstart = PyBytes_AS_STRING(*outobj);
8358 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8359 }
8360 else {
8361 const char *repchars = PyBytes_AS_STRING(rep);
8362 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8363 Py_ssize_t requiredsize = *outpos+repsize;
8364 if (outsize<requiredsize)
8365 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8366 Py_DECREF(rep);
8367 return enc_EXCEPTION;
8368 }
8369 outstart = PyBytes_AS_STRING(*outobj);
8370 memcpy(outstart + *outpos, repchars, repsize);
8371 *outpos += repsize;
8372 }
8373 }
8374 Py_DECREF(rep);
8375 return enc_SUCCESS;
8376 }
8377
/* Handle an unencodable character found by _PyUnicode_EncodeCharmap().

   On entry *inpos is the index in `unicode` of the first character that
   `mapping` cannot encode.  The run of consecutive unencodable characters
   starting there is collected and then dealt with according to the error
   handler named by `errors`; the handler kind is cached in *error_handler
   and, for non-built-in handlers, the callback in *error_handler_obj.
   Replacement output is appended to *res at *respos, and *inpos is moved to
   the position where encoding should resume.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: extend collendpos until a character
       with a defined mapping (or the end of the string) is reached */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* encodable: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = get_error_handler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* one '?' per unencodable character; '?' itself has to be encodable
           through the mapping, otherwise the original error is raised */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate the "&#NNN;" replacement for each character and encode
           it through the mapping one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other handler: call the registered callback, which returns a
           replacement object and the position to resume at */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is itself encoded through the
           mapping, character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8525
8526 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8527 _PyUnicode_EncodeCharmap(PyObject *unicode,
8528 PyObject *mapping,
8529 const char *errors)
8530 {
8531 /* output object */
8532 PyObject *res = NULL;
8533 /* current input position */
8534 Py_ssize_t inpos = 0;
8535 Py_ssize_t size;
8536 /* current output position */
8537 Py_ssize_t respos = 0;
8538 PyObject *error_handler_obj = NULL;
8539 PyObject *exc = NULL;
8540 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8541 void *data;
8542 int kind;
8543
8544 if (PyUnicode_READY(unicode) == -1)
8545 return NULL;
8546 size = PyUnicode_GET_LENGTH(unicode);
8547 data = PyUnicode_DATA(unicode);
8548 kind = PyUnicode_KIND(unicode);
8549
8550 /* Default to Latin-1 */
8551 if (mapping == NULL)
8552 return unicode_encode_ucs1(unicode, errors, 256);
8553
8554 /* allocate enough for a simple encoding without
8555 replacements, if we need more, we'll resize */
8556 res = PyBytes_FromStringAndSize(NULL, size);
8557 if (res == NULL)
8558 goto onError;
8559 if (size == 0)
8560 return res;
8561
8562 while (inpos<size) {
8563 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8564 /* try to encode it */
8565 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8566 if (x==enc_EXCEPTION) /* error */
8567 goto onError;
8568 if (x==enc_FAILED) { /* unencodable character */
8569 if (charmap_encoding_error(unicode, &inpos, mapping,
8570 &exc,
8571 &error_handler, &error_handler_obj, errors,
8572 &res, &respos)) {
8573 goto onError;
8574 }
8575 }
8576 else
8577 /* done with this character => adjust input position */
8578 ++inpos;
8579 }
8580
8581 /* Resize if we allocated to much */
8582 if (respos<PyBytes_GET_SIZE(res))
8583 if (_PyBytes_Resize(&res, respos) < 0)
8584 goto onError;
8585
8586 Py_XDECREF(exc);
8587 Py_XDECREF(error_handler_obj);
8588 return res;
8589
8590 onError:
8591 Py_XDECREF(res);
8592 Py_XDECREF(exc);
8593 Py_XDECREF(error_handler_obj);
8594 return NULL;
8595 }
8596
8597 /* Deprecated */
8598 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8599 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8600 Py_ssize_t size,
8601 PyObject *mapping,
8602 const char *errors)
8603 {
8604 PyObject *result;
8605 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8606 if (unicode == NULL)
8607 return NULL;
8608 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8609 Py_DECREF(unicode);
8610 return result;
8611 }
8612
8613 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8614 PyUnicode_AsCharmapString(PyObject *unicode,
8615 PyObject *mapping)
8616 {
8617 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8618 PyErr_BadArgument();
8619 return NULL;
8620 }
8621 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8622 }
8623
8624 /* create or adjust a UnicodeTranslateError */
8625 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8626 make_translate_exception(PyObject **exceptionObject,
8627 PyObject *unicode,
8628 Py_ssize_t startpos, Py_ssize_t endpos,
8629 const char *reason)
8630 {
8631 if (*exceptionObject == NULL) {
8632 *exceptionObject = _PyUnicodeTranslateError_Create(
8633 unicode, startpos, endpos, reason);
8634 }
8635 else {
8636 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8637 goto onError;
8638 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8639 goto onError;
8640 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8641 goto onError;
8642 return;
8643 onError:
8644 Py_CLEAR(*exceptionObject);
8645 }
8646 }
8647
8648 /* error handling callback helper:
8649 build arguments, call the callback and check the arguments,
8650 put the result into newpos and return the replacement string, which
8651 has to be freed by the caller */
8652 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8653 unicode_translate_call_errorhandler(const char *errors,
8654 PyObject **errorHandler,
8655 const char *reason,
8656 PyObject *unicode, PyObject **exceptionObject,
8657 Py_ssize_t startpos, Py_ssize_t endpos,
8658 Py_ssize_t *newpos)
8659 {
8660 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
8661
8662 Py_ssize_t i_newpos;
8663 PyObject *restuple;
8664 PyObject *resunicode;
8665
8666 if (*errorHandler == NULL) {
8667 *errorHandler = PyCodec_LookupError(errors);
8668 if (*errorHandler == NULL)
8669 return NULL;
8670 }
8671
8672 make_translate_exception(exceptionObject,
8673 unicode, startpos, endpos, reason);
8674 if (*exceptionObject == NULL)
8675 return NULL;
8676
8677 restuple = PyObject_CallFunctionObjArgs(
8678 *errorHandler, *exceptionObject, NULL);
8679 if (restuple == NULL)
8680 return NULL;
8681 if (!PyTuple_Check(restuple)) {
8682 PyErr_SetString(PyExc_TypeError, &argparse[4]);
8683 Py_DECREF(restuple);
8684 return NULL;
8685 }
8686 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8687 &resunicode, &i_newpos)) {
8688 Py_DECREF(restuple);
8689 return NULL;
8690 }
8691 if (i_newpos<0)
8692 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8693 else
8694 *newpos = i_newpos;
8695 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8696 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8697 Py_DECREF(restuple);
8698 return NULL;
8699 }
8700 Py_INCREF(resunicode);
8701 Py_DECREF(restuple);
8702 return resunicode;
8703 }
8704
8705 /* Lookup the character ch in the mapping and put the result in result,
8706 which must be decrefed by the caller.
8707 Return 0 on success, -1 on error */
8708 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8709 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8710 {
8711 PyObject *w = PyLong_FromLong((long)c);
8712 PyObject *x;
8713
8714 if (w == NULL)
8715 return -1;
8716 x = PyObject_GetItem(mapping, w);
8717 Py_DECREF(w);
8718 if (x == NULL) {
8719 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8720 /* No mapping found means: use 1:1 mapping. */
8721 PyErr_Clear();
8722 *result = NULL;
8723 return 0;
8724 } else
8725 return -1;
8726 }
8727 else if (x == Py_None) {
8728 *result = x;
8729 return 0;
8730 }
8731 else if (PyLong_Check(x)) {
8732 long value = PyLong_AS_LONG(x);
8733 if (value < 0 || value > MAX_UNICODE) {
8734 PyErr_Format(PyExc_ValueError,
8735 "character mapping must be in range(0x%x)",
8736 MAX_UNICODE+1);
8737 Py_DECREF(x);
8738 return -1;
8739 }
8740 *result = x;
8741 return 0;
8742 }
8743 else if (PyUnicode_Check(x)) {
8744 *result = x;
8745 return 0;
8746 }
8747 else {
8748 /* wrong return value */
8749 PyErr_SetString(PyExc_TypeError,
8750 "character mapping must return integer, None or str");
8751 Py_DECREF(x);
8752 return -1;
8753 }
8754 }
8755
8756 /* lookup the character, write the result into the writer.
8757 Return 1 if the result was written into the writer, return 0 if the mapping
8758 was undefined, raise an exception return -1 on error. */
8759 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8760 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8761 _PyUnicodeWriter *writer)
8762 {
8763 PyObject *item;
8764
8765 if (charmaptranslate_lookup(ch, mapping, &item))
8766 return -1;
8767
8768 if (item == NULL) {
8769 /* not found => default to 1:1 mapping */
8770 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8771 return -1;
8772 }
8773 return 1;
8774 }
8775
8776 if (item == Py_None) {
8777 Py_DECREF(item);
8778 return 0;
8779 }
8780
8781 if (PyLong_Check(item)) {
8782 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8783 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8784 used it */
8785 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8786 Py_DECREF(item);
8787 return -1;
8788 }
8789 Py_DECREF(item);
8790 return 1;
8791 }
8792
8793 if (!PyUnicode_Check(item)) {
8794 Py_DECREF(item);
8795 return -1;
8796 }
8797
8798 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8799 Py_DECREF(item);
8800 return -1;
8801 }
8802
8803 Py_DECREF(item);
8804 return 1;
8805 }
8806
8807 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8808 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8809 Py_UCS1 *translate)
8810 {
8811 PyObject *item = NULL;
8812 int ret = 0;
8813
8814 if (charmaptranslate_lookup(ch, mapping, &item)) {
8815 return -1;
8816 }
8817
8818 if (item == Py_None) {
8819 /* deletion */
8820 translate[ch] = 0xfe;
8821 }
8822 else if (item == NULL) {
8823 /* not found => default to 1:1 mapping */
8824 translate[ch] = ch;
8825 return 1;
8826 }
8827 else if (PyLong_Check(item)) {
8828 long replace = PyLong_AS_LONG(item);
8829 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8830 used it */
8831 if (127 < replace) {
8832 /* invalid character or character outside ASCII:
8833 skip the fast translate */
8834 goto exit;
8835 }
8836 translate[ch] = (Py_UCS1)replace;
8837 }
8838 else if (PyUnicode_Check(item)) {
8839 Py_UCS4 replace;
8840
8841 if (PyUnicode_READY(item) == -1) {
8842 Py_DECREF(item);
8843 return -1;
8844 }
8845 if (PyUnicode_GET_LENGTH(item) != 1)
8846 goto exit;
8847
8848 replace = PyUnicode_READ_CHAR(item, 0);
8849 if (replace > 127)
8850 goto exit;
8851 translate[ch] = (Py_UCS1)replace;
8852 }
8853 else {
8854 /* not None, NULL, long or unicode */
8855 goto exit;
8856 }
8857 ret = 1;
8858
8859 exit:
8860 Py_DECREF(item);
8861 return ret;
8862 }
8863
8864 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8865 was translated into writer, return 0 if the input string was partially
8866 translated into writer, raise an exception and return -1 on error. */
8867 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8868 unicode_fast_translate(PyObject *input, PyObject *mapping,
8869 _PyUnicodeWriter *writer, int ignore,
8870 Py_ssize_t *input_pos)
8871 {
8872 Py_UCS1 ascii_table[128], ch, ch2;
8873 Py_ssize_t len;
8874 Py_UCS1 *in, *end, *out;
8875 int res = 0;
8876
8877 len = PyUnicode_GET_LENGTH(input);
8878
8879 memset(ascii_table, 0xff, 128);
8880
8881 in = PyUnicode_1BYTE_DATA(input);
8882 end = in + len;
8883
8884 assert(PyUnicode_IS_ASCII(writer->buffer));
8885 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8886 out = PyUnicode_1BYTE_DATA(writer->buffer);
8887
8888 for (; in < end; in++) {
8889 ch = *in;
8890 ch2 = ascii_table[ch];
8891 if (ch2 == 0xff) {
8892 int translate = unicode_fast_translate_lookup(mapping, ch,
8893 ascii_table);
8894 if (translate < 0)
8895 return -1;
8896 if (translate == 0)
8897 goto exit;
8898 ch2 = ascii_table[ch];
8899 }
8900 if (ch2 == 0xfe) {
8901 if (ignore)
8902 continue;
8903 goto exit;
8904 }
8905 assert(ch2 < 128);
8906 *out = ch2;
8907 out++;
8908 }
8909 res = 1;
8910
8911 exit:
8912 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8913 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8914 return res;
8915 }
8916
8917 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8918 _PyUnicode_TranslateCharmap(PyObject *input,
8919 PyObject *mapping,
8920 const char *errors)
8921 {
8922 /* input object */
8923 char *data;
8924 Py_ssize_t size, i;
8925 int kind;
8926 /* output buffer */
8927 _PyUnicodeWriter writer;
8928 /* error handler */
8929 char *reason = "character maps to <undefined>";
8930 PyObject *errorHandler = NULL;
8931 PyObject *exc = NULL;
8932 int ignore;
8933 int res;
8934
8935 if (mapping == NULL) {
8936 PyErr_BadArgument();
8937 return NULL;
8938 }
8939
8940 if (PyUnicode_READY(input) == -1)
8941 return NULL;
8942 data = (char*)PyUnicode_DATA(input);
8943 kind = PyUnicode_KIND(input);
8944 size = PyUnicode_GET_LENGTH(input);
8945
8946 if (size == 0)
8947 return PyUnicode_FromObject(input);
8948
8949 /* allocate enough for a simple 1:1 translation without
8950 replacements, if we need more, we'll resize */
8951 _PyUnicodeWriter_Init(&writer);
8952 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8953 goto onError;
8954
8955 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8956
8957 if (PyUnicode_READY(input) == -1)
8958 return NULL;
8959 if (PyUnicode_IS_ASCII(input)) {
8960 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8961 if (res < 0) {
8962 _PyUnicodeWriter_Dealloc(&writer);
8963 return NULL;
8964 }
8965 if (res == 1)
8966 return _PyUnicodeWriter_Finish(&writer);
8967 }
8968 else {
8969 i = 0;
8970 }
8971
8972 while (i<size) {
8973 /* try to encode it */
8974 int translate;
8975 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8976 Py_ssize_t newpos;
8977 /* startpos for collecting untranslatable chars */
8978 Py_ssize_t collstart;
8979 Py_ssize_t collend;
8980 Py_UCS4 ch;
8981
8982 ch = PyUnicode_READ(kind, data, i);
8983 translate = charmaptranslate_output(ch, mapping, &writer);
8984 if (translate < 0)
8985 goto onError;
8986
8987 if (translate != 0) {
8988 /* it worked => adjust input pointer */
8989 ++i;
8990 continue;
8991 }
8992
8993 /* untranslatable character */
8994 collstart = i;
8995 collend = i+1;
8996
8997 /* find all untranslatable characters */
8998 while (collend < size) {
8999 PyObject *x;
9000 ch = PyUnicode_READ(kind, data, collend);
9001 if (charmaptranslate_lookup(ch, mapping, &x))
9002 goto onError;
9003 Py_XDECREF(x);
9004 if (x != Py_None)
9005 break;
9006 ++collend;
9007 }
9008
9009 if (ignore) {
9010 i = collend;
9011 }
9012 else {
9013 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9014 reason, input, &exc,
9015 collstart, collend, &newpos);
9016 if (repunicode == NULL)
9017 goto onError;
9018 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9019 Py_DECREF(repunicode);
9020 goto onError;
9021 }
9022 Py_DECREF(repunicode);
9023 i = newpos;
9024 }
9025 }
9026 Py_XDECREF(exc);
9027 Py_XDECREF(errorHandler);
9028 return _PyUnicodeWriter_Finish(&writer);
9029
9030 onError:
9031 _PyUnicodeWriter_Dealloc(&writer);
9032 Py_XDECREF(exc);
9033 Py_XDECREF(errorHandler);
9034 return NULL;
9035 }
9036
9037 /* Deprecated. Use PyUnicode_Translate instead. */
9038 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9039 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9040 Py_ssize_t size,
9041 PyObject *mapping,
9042 const char *errors)
9043 {
9044 PyObject *result;
9045 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9046 if (!unicode)
9047 return NULL;
9048 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9049 Py_DECREF(unicode);
9050 return result;
9051 }
9052
9053 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9054 PyUnicode_Translate(PyObject *str,
9055 PyObject *mapping,
9056 const char *errors)
9057 {
9058 if (ensure_unicode(str) < 0)
9059 return NULL;
9060 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9061 }
9062
9063 static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject * self)9064 fix_decimal_and_space_to_ascii(PyObject *self)
9065 {
9066 /* No need to call PyUnicode_READY(self) because this function is only
9067 called as a callback from fixup() which does it already. */
9068 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9069 const int kind = PyUnicode_KIND(self);
9070 void *data = PyUnicode_DATA(self);
9071 Py_UCS4 maxchar = 127, ch, fixed;
9072 int modified = 0;
9073 Py_ssize_t i;
9074
9075 for (i = 0; i < len; ++i) {
9076 ch = PyUnicode_READ(kind, data, i);
9077 fixed = 0;
9078 if (ch > 127) {
9079 if (Py_UNICODE_ISSPACE(ch))
9080 fixed = ' ';
9081 else {
9082 const int decimal = Py_UNICODE_TODECIMAL(ch);
9083 if (decimal >= 0)
9084 fixed = '0' + decimal;
9085 }
9086 if (fixed != 0) {
9087 modified = 1;
9088 maxchar = Py_MAX(maxchar, fixed);
9089 PyUnicode_WRITE(kind, data, i, fixed);
9090 }
9091 else
9092 maxchar = Py_MAX(maxchar, ch);
9093 }
9094 }
9095
9096 return (modified) ? maxchar : 0;
9097 }
9098
9099 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9100 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9101 {
9102 if (!PyUnicode_Check(unicode)) {
9103 PyErr_BadInternalCall();
9104 return NULL;
9105 }
9106 if (PyUnicode_READY(unicode) == -1)
9107 return NULL;
9108 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9109 /* If the string is already ASCII, just return the same string */
9110 Py_INCREF(unicode);
9111 return unicode;
9112 }
9113 return fixup(unicode, fix_decimal_and_space_to_ascii);
9114 }
9115
9116 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9117 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9118 Py_ssize_t length)
9119 {
9120 PyObject *decimal;
9121 Py_ssize_t i;
9122 Py_UCS4 maxchar;
9123 enum PyUnicode_Kind kind;
9124 void *data;
9125
9126 maxchar = 127;
9127 for (i = 0; i < length; i++) {
9128 Py_UCS4 ch = s[i];
9129 if (ch > 127) {
9130 int decimal = Py_UNICODE_TODECIMAL(ch);
9131 if (decimal >= 0)
9132 ch = '0' + decimal;
9133 maxchar = Py_MAX(maxchar, ch);
9134 }
9135 }
9136
9137 /* Copy to a new string */
9138 decimal = PyUnicode_New(length, maxchar);
9139 if (decimal == NULL)
9140 return decimal;
9141 kind = PyUnicode_KIND(decimal);
9142 data = PyUnicode_DATA(decimal);
9143 /* Iterate over code points */
9144 for (i = 0; i < length; i++) {
9145 Py_UCS4 ch = s[i];
9146 if (ch > 127) {
9147 int decimal = Py_UNICODE_TODECIMAL(ch);
9148 if (decimal >= 0)
9149 ch = '0' + decimal;
9150 }
9151 PyUnicode_WRITE(kind, data, i, ch);
9152 }
9153 return unicode_result(decimal);
9154 }
9155 /* --- Decimal Encoder ---------------------------------------------------- */
9156
9157 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9158 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9159 Py_ssize_t length,
9160 char *output,
9161 const char *errors)
9162 {
9163 PyObject *unicode;
9164 Py_ssize_t i;
9165 enum PyUnicode_Kind kind;
9166 void *data;
9167
9168 if (output == NULL) {
9169 PyErr_BadArgument();
9170 return -1;
9171 }
9172
9173 unicode = PyUnicode_FromUnicode(s, length);
9174 if (unicode == NULL)
9175 return -1;
9176
9177 if (PyUnicode_READY(unicode) == -1) {
9178 Py_DECREF(unicode);
9179 return -1;
9180 }
9181 kind = PyUnicode_KIND(unicode);
9182 data = PyUnicode_DATA(unicode);
9183
9184 for (i=0; i < length; ) {
9185 PyObject *exc;
9186 Py_UCS4 ch;
9187 int decimal;
9188 Py_ssize_t startpos;
9189
9190 ch = PyUnicode_READ(kind, data, i);
9191
9192 if (Py_UNICODE_ISSPACE(ch)) {
9193 *output++ = ' ';
9194 i++;
9195 continue;
9196 }
9197 decimal = Py_UNICODE_TODECIMAL(ch);
9198 if (decimal >= 0) {
9199 *output++ = '0' + decimal;
9200 i++;
9201 continue;
9202 }
9203 if (0 < ch && ch < 256) {
9204 *output++ = (char)ch;
9205 i++;
9206 continue;
9207 }
9208
9209 startpos = i;
9210 exc = NULL;
9211 raise_encode_exception(&exc, "decimal", unicode,
9212 startpos, startpos+1,
9213 "invalid decimal Unicode string");
9214 Py_XDECREF(exc);
9215 Py_DECREF(unicode);
9216 return -1;
9217 }
9218 /* 0-terminate the output string */
9219 *output++ = '\0';
9220 Py_DECREF(unicode);
9221 return 0;
9222 }
9223
9224 /* --- Helpers ------------------------------------------------------------ */
9225
/* Helper macro to fix up start/end slice values against a length.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped at 0.  `start` and `end` must be modifiable lvalues;
   all three arguments may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement (safe as an unbraced if/else body), and the arguments are
   parenthesized so that arbitrary expressions can be passed for `len`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9240
9241 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9242 any_find_slice(PyObject* s1, PyObject* s2,
9243 Py_ssize_t start,
9244 Py_ssize_t end,
9245 int direction)
9246 {
9247 int kind1, kind2;
9248 void *buf1, *buf2;
9249 Py_ssize_t len1, len2, result;
9250
9251 kind1 = PyUnicode_KIND(s1);
9252 kind2 = PyUnicode_KIND(s2);
9253 if (kind1 < kind2)
9254 return -1;
9255
9256 len1 = PyUnicode_GET_LENGTH(s1);
9257 len2 = PyUnicode_GET_LENGTH(s2);
9258 ADJUST_INDICES(start, end, len1);
9259 if (end - start < len2)
9260 return -1;
9261
9262 buf1 = PyUnicode_DATA(s1);
9263 buf2 = PyUnicode_DATA(s2);
9264 if (len2 == 1) {
9265 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9266 result = findchar((const char *)buf1 + kind1*start,
9267 kind1, end - start, ch, direction);
9268 if (result == -1)
9269 return -1;
9270 else
9271 return start + result;
9272 }
9273
9274 if (kind2 != kind1) {
9275 buf2 = _PyUnicode_AsKind(s2, kind1);
9276 if (!buf2)
9277 return -2;
9278 }
9279
9280 if (direction > 0) {
9281 switch (kind1) {
9282 case PyUnicode_1BYTE_KIND:
9283 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9284 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9285 else
9286 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9287 break;
9288 case PyUnicode_2BYTE_KIND:
9289 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9290 break;
9291 case PyUnicode_4BYTE_KIND:
9292 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9293 break;
9294 default:
9295 assert(0); result = -2;
9296 }
9297 }
9298 else {
9299 switch (kind1) {
9300 case PyUnicode_1BYTE_KIND:
9301 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9302 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 else
9304 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305 break;
9306 case PyUnicode_2BYTE_KIND:
9307 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308 break;
9309 case PyUnicode_4BYTE_KIND:
9310 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311 break;
9312 default:
9313 assert(0); result = -2;
9314 }
9315 }
9316
9317 if (kind2 != kind1)
9318 PyMem_Free(buf2);
9319
9320 return result;
9321 }
9322
9323 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(PyObject * unicode,Py_ssize_t index,Py_ssize_t n_buffer,void * digits,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9324 _PyUnicode_InsertThousandsGrouping(
9325 PyObject *unicode, Py_ssize_t index,
9326 Py_ssize_t n_buffer,
9327 void *digits, Py_ssize_t n_digits,
9328 Py_ssize_t min_width,
9329 const char *grouping, PyObject *thousands_sep,
9330 Py_UCS4 *maxchar)
9331 {
9332 unsigned int kind, thousands_sep_kind;
9333 char *data, *thousands_sep_data;
9334 Py_ssize_t thousands_sep_len;
9335 Py_ssize_t len;
9336
9337 if (unicode != NULL) {
9338 kind = PyUnicode_KIND(unicode);
9339 data = (char *) PyUnicode_DATA(unicode) + index * kind;
9340 }
9341 else {
9342 kind = PyUnicode_1BYTE_KIND;
9343 data = NULL;
9344 }
9345 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9346 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9347 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9348 if (unicode != NULL && thousands_sep_kind != kind) {
9349 if (thousands_sep_kind < kind) {
9350 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9351 if (!thousands_sep_data)
9352 return -1;
9353 }
9354 else {
9355 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9356 if (!data)
9357 return -1;
9358 }
9359 }
9360
9361 switch (kind) {
9362 case PyUnicode_1BYTE_KIND:
9363 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9364 len = asciilib_InsertThousandsGrouping(
9365 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9366 min_width, grouping,
9367 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9368 else
9369 len = ucs1lib_InsertThousandsGrouping(
9370 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9371 min_width, grouping,
9372 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9373 break;
9374 case PyUnicode_2BYTE_KIND:
9375 len = ucs2lib_InsertThousandsGrouping(
9376 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9377 min_width, grouping,
9378 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9379 break;
9380 case PyUnicode_4BYTE_KIND:
9381 len = ucs4lib_InsertThousandsGrouping(
9382 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9383 min_width, grouping,
9384 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9385 break;
9386 default:
9387 assert(0);
9388 return -1;
9389 }
9390 if (unicode != NULL && thousands_sep_kind != kind) {
9391 if (thousands_sep_kind < kind)
9392 PyMem_Free(thousands_sep_data);
9393 else
9394 PyMem_Free(data);
9395 }
9396 if (unicode == NULL) {
9397 *maxchar = 127;
9398 if (len != n_digits) {
9399 *maxchar = Py_MAX(*maxchar,
9400 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9401 }
9402 }
9403 return len;
9404 }
9405
9406
9407 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9408 PyUnicode_Count(PyObject *str,
9409 PyObject *substr,
9410 Py_ssize_t start,
9411 Py_ssize_t end)
9412 {
9413 Py_ssize_t result;
9414 int kind1, kind2;
9415 void *buf1 = NULL, *buf2 = NULL;
9416 Py_ssize_t len1, len2;
9417
9418 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9419 return -1;
9420
9421 kind1 = PyUnicode_KIND(str);
9422 kind2 = PyUnicode_KIND(substr);
9423 if (kind1 < kind2)
9424 return 0;
9425
9426 len1 = PyUnicode_GET_LENGTH(str);
9427 len2 = PyUnicode_GET_LENGTH(substr);
9428 ADJUST_INDICES(start, end, len1);
9429 if (end - start < len2)
9430 return 0;
9431
9432 buf1 = PyUnicode_DATA(str);
9433 buf2 = PyUnicode_DATA(substr);
9434 if (kind2 != kind1) {
9435 buf2 = _PyUnicode_AsKind(substr, kind1);
9436 if (!buf2)
9437 goto onError;
9438 }
9439
9440 switch (kind1) {
9441 case PyUnicode_1BYTE_KIND:
9442 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9443 result = asciilib_count(
9444 ((Py_UCS1*)buf1) + start, end - start,
9445 buf2, len2, PY_SSIZE_T_MAX
9446 );
9447 else
9448 result = ucs1lib_count(
9449 ((Py_UCS1*)buf1) + start, end - start,
9450 buf2, len2, PY_SSIZE_T_MAX
9451 );
9452 break;
9453 case PyUnicode_2BYTE_KIND:
9454 result = ucs2lib_count(
9455 ((Py_UCS2*)buf1) + start, end - start,
9456 buf2, len2, PY_SSIZE_T_MAX
9457 );
9458 break;
9459 case PyUnicode_4BYTE_KIND:
9460 result = ucs4lib_count(
9461 ((Py_UCS4*)buf1) + start, end - start,
9462 buf2, len2, PY_SSIZE_T_MAX
9463 );
9464 break;
9465 default:
9466 assert(0); result = 0;
9467 }
9468
9469 if (kind2 != kind1)
9470 PyMem_Free(buf2);
9471
9472 return result;
9473 onError:
9474 if (kind2 != kind1 && buf2)
9475 PyMem_Free(buf2);
9476 return -1;
9477 }
9478
9479 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9480 PyUnicode_Find(PyObject *str,
9481 PyObject *substr,
9482 Py_ssize_t start,
9483 Py_ssize_t end,
9484 int direction)
9485 {
9486 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9487 return -2;
9488
9489 return any_find_slice(str, substr, start, end, direction);
9490 }
9491
9492 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9493 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9494 Py_ssize_t start, Py_ssize_t end,
9495 int direction)
9496 {
9497 int kind;
9498 Py_ssize_t result;
9499 if (PyUnicode_READY(str) == -1)
9500 return -2;
9501 if (start < 0 || end < 0) {
9502 PyErr_SetString(PyExc_IndexError, "string index out of range");
9503 return -2;
9504 }
9505 if (end > PyUnicode_GET_LENGTH(str))
9506 end = PyUnicode_GET_LENGTH(str);
9507 if (start >= end)
9508 return -1;
9509 kind = PyUnicode_KIND(str);
9510 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9511 kind, end-start, ch, direction);
9512 if (result == -1)
9513 return -1;
9514 else
9515 return start + result;
9516 }
9517
9518 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9519 tailmatch(PyObject *self,
9520 PyObject *substring,
9521 Py_ssize_t start,
9522 Py_ssize_t end,
9523 int direction)
9524 {
9525 int kind_self;
9526 int kind_sub;
9527 void *data_self;
9528 void *data_sub;
9529 Py_ssize_t offset;
9530 Py_ssize_t i;
9531 Py_ssize_t end_sub;
9532
9533 if (PyUnicode_READY(self) == -1 ||
9534 PyUnicode_READY(substring) == -1)
9535 return -1;
9536
9537 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9538 end -= PyUnicode_GET_LENGTH(substring);
9539 if (end < start)
9540 return 0;
9541
9542 if (PyUnicode_GET_LENGTH(substring) == 0)
9543 return 1;
9544
9545 kind_self = PyUnicode_KIND(self);
9546 data_self = PyUnicode_DATA(self);
9547 kind_sub = PyUnicode_KIND(substring);
9548 data_sub = PyUnicode_DATA(substring);
9549 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9550
9551 if (direction > 0)
9552 offset = end;
9553 else
9554 offset = start;
9555
9556 if (PyUnicode_READ(kind_self, data_self, offset) ==
9557 PyUnicode_READ(kind_sub, data_sub, 0) &&
9558 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9559 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9560 /* If both are of the same kind, memcmp is sufficient */
9561 if (kind_self == kind_sub) {
9562 return ! memcmp((char *)data_self +
9563 (offset * PyUnicode_KIND(substring)),
9564 data_sub,
9565 PyUnicode_GET_LENGTH(substring) *
9566 PyUnicode_KIND(substring));
9567 }
9568 /* otherwise we have to compare each character by first accessing it */
9569 else {
9570 /* We do not need to compare 0 and len(substring)-1 because
9571 the if statement above ensured already that they are equal
9572 when we end up here. */
9573 for (i = 1; i < end_sub; ++i) {
9574 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9575 PyUnicode_READ(kind_sub, data_sub, i))
9576 return 0;
9577 }
9578 return 1;
9579 }
9580 }
9581
9582 return 0;
9583 }
9584
9585 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9586 PyUnicode_Tailmatch(PyObject *str,
9587 PyObject *substr,
9588 Py_ssize_t start,
9589 Py_ssize_t end,
9590 int direction)
9591 {
9592 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9593 return -1;
9594
9595 return tailmatch(str, substr, start, end, direction);
9596 }
9597
9598 /* Apply fixfct filter to the Unicode object self and return a
9599 reference to the modified object */
9600
9601 static PyObject *
fixup(PyObject * self,Py_UCS4 (* fixfct)(PyObject * s))9602 fixup(PyObject *self,
9603 Py_UCS4 (*fixfct)(PyObject *s))
9604 {
9605 PyObject *u;
9606 Py_UCS4 maxchar_old, maxchar_new = 0;
9607 PyObject *v;
9608
9609 u = _PyUnicode_Copy(self);
9610 if (u == NULL)
9611 return NULL;
9612 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9613
9614 /* fix functions return the new maximum character in a string,
9615 if the kind of the resulting unicode object does not change,
9616 everything is fine. Otherwise we need to change the string kind
9617 and re-run the fix function. */
9618 maxchar_new = fixfct(u);
9619
9620 if (maxchar_new == 0) {
9621 /* no changes */;
9622 if (PyUnicode_CheckExact(self)) {
9623 Py_DECREF(u);
9624 Py_INCREF(self);
9625 return self;
9626 }
9627 else
9628 return u;
9629 }
9630
9631 maxchar_new = align_maxchar(maxchar_new);
9632
9633 if (maxchar_new == maxchar_old)
9634 return u;
9635
9636 /* In case the maximum character changed, we need to
9637 convert the string to the new category. */
9638 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9639 if (v == NULL) {
9640 Py_DECREF(u);
9641 return NULL;
9642 }
9643 if (maxchar_new > maxchar_old) {
9644 /* If the maxchar increased so that the kind changed, not all
9645 characters are representable anymore and we need to fix the
9646 string again. This only happens in very few cases. */
9647 _PyUnicode_FastCopyCharacters(v, 0,
9648 self, 0, PyUnicode_GET_LENGTH(self));
9649 maxchar_old = fixfct(v);
9650 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9651 }
9652 else {
9653 _PyUnicode_FastCopyCharacters(v, 0,
9654 u, 0, PyUnicode_GET_LENGTH(self));
9655 }
9656 Py_DECREF(u);
9657 assert(_PyUnicode_CheckConsistency(v, 1));
9658 return v;
9659 }
9660
9661 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9662 ascii_upper_or_lower(PyObject *self, int lower)
9663 {
9664 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9665 char *resdata, *data = PyUnicode_DATA(self);
9666 PyObject *res;
9667
9668 res = PyUnicode_New(len, 127);
9669 if (res == NULL)
9670 return NULL;
9671 resdata = PyUnicode_DATA(res);
9672 if (lower)
9673 _Py_bytes_lower(resdata, data, len);
9674 else
9675 _Py_bytes_upper(resdata, data, len);
9676 return res;
9677 }
9678
9679 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9680 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9681 {
9682 Py_ssize_t j;
9683 int final_sigma;
9684 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9685 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9686
9687 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9688
9689 where ! is a negation and \p{xxx} is a character with property xxx.
9690 */
9691 for (j = i - 1; j >= 0; j--) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
9695 }
9696 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9697 if (final_sigma) {
9698 for (j = i + 1; j < length; j++) {
9699 c = PyUnicode_READ(kind, data, j);
9700 if (!_PyUnicode_IsCaseIgnorable(c))
9701 break;
9702 }
9703 final_sigma = j == length || !_PyUnicode_IsCased(c);
9704 }
9705 return (final_sigma) ? 0x3C2 : 0x3C3;
9706 }
9707
9708 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9709 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9710 Py_UCS4 c, Py_UCS4 *mapped)
9711 {
9712 /* Obscure special case. */
9713 if (c == 0x3A3) {
9714 mapped[0] = handle_capital_sigma(kind, data, length, i);
9715 return 1;
9716 }
9717 return _PyUnicode_ToLowerFull(c, mapped);
9718 }
9719
9720 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9721 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9722 {
9723 Py_ssize_t i, k = 0;
9724 int n_res, j;
9725 Py_UCS4 c, mapped[3];
9726
9727 c = PyUnicode_READ(kind, data, 0);
9728 n_res = _PyUnicode_ToUpperFull(c, mapped);
9729 for (j = 0; j < n_res; j++) {
9730 *maxchar = Py_MAX(*maxchar, mapped[j]);
9731 res[k++] = mapped[j];
9732 }
9733 for (i = 1; i < length; i++) {
9734 c = PyUnicode_READ(kind, data, i);
9735 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9736 for (j = 0; j < n_res; j++) {
9737 *maxchar = Py_MAX(*maxchar, mapped[j]);
9738 res[k++] = mapped[j];
9739 }
9740 }
9741 return k;
9742 }
9743
9744 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9745 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9746 Py_ssize_t i, k = 0;
9747
9748 for (i = 0; i < length; i++) {
9749 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750 int n_res, j;
9751 if (Py_UNICODE_ISUPPER(c)) {
9752 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753 }
9754 else if (Py_UNICODE_ISLOWER(c)) {
9755 n_res = _PyUnicode_ToUpperFull(c, mapped);
9756 }
9757 else {
9758 n_res = 1;
9759 mapped[0] = c;
9760 }
9761 for (j = 0; j < n_res; j++) {
9762 *maxchar = Py_MAX(*maxchar, mapped[j]);
9763 res[k++] = mapped[j];
9764 }
9765 }
9766 return k;
9767 }
9768
9769 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9770 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9771 Py_UCS4 *maxchar, int lower)
9772 {
9773 Py_ssize_t i, k = 0;
9774
9775 for (i = 0; i < length; i++) {
9776 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9777 int n_res, j;
9778 if (lower)
9779 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9780 else
9781 n_res = _PyUnicode_ToUpperFull(c, mapped);
9782 for (j = 0; j < n_res; j++) {
9783 *maxchar = Py_MAX(*maxchar, mapped[j]);
9784 res[k++] = mapped[j];
9785 }
9786 }
9787 return k;
9788 }
9789
9790 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9791 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792 {
9793 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9794 }
9795
9796 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9797 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798 {
9799 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9800 }
9801
9802 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9803 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9804 {
9805 Py_ssize_t i, k = 0;
9806
9807 for (i = 0; i < length; i++) {
9808 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9809 Py_UCS4 mapped[3];
9810 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9811 for (j = 0; j < n_res; j++) {
9812 *maxchar = Py_MAX(*maxchar, mapped[j]);
9813 res[k++] = mapped[j];
9814 }
9815 }
9816 return k;
9817 }
9818
9819 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9820 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821 {
9822 Py_ssize_t i, k = 0;
9823 int previous_is_cased;
9824
9825 previous_is_cased = 0;
9826 for (i = 0; i < length; i++) {
9827 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9828 Py_UCS4 mapped[3];
9829 int n_res, j;
9830
9831 if (previous_is_cased)
9832 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9833 else
9834 n_res = _PyUnicode_ToTitleFull(c, mapped);
9835
9836 for (j = 0; j < n_res; j++) {
9837 *maxchar = Py_MAX(*maxchar, mapped[j]);
9838 res[k++] = mapped[j];
9839 }
9840
9841 previous_is_cased = _PyUnicode_IsCased(c);
9842 }
9843 return k;
9844 }
9845
9846 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9847 case_operation(PyObject *self,
9848 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9849 {
9850 PyObject *res = NULL;
9851 Py_ssize_t length, newlength = 0;
9852 int kind, outkind;
9853 void *data, *outdata;
9854 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9855
9856 assert(PyUnicode_IS_READY(self));
9857
9858 kind = PyUnicode_KIND(self);
9859 data = PyUnicode_DATA(self);
9860 length = PyUnicode_GET_LENGTH(self);
9861 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9862 PyErr_SetString(PyExc_OverflowError, "string is too long");
9863 return NULL;
9864 }
9865 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9866 if (tmp == NULL)
9867 return PyErr_NoMemory();
9868 newlength = perform(kind, data, length, tmp, &maxchar);
9869 res = PyUnicode_New(newlength, maxchar);
9870 if (res == NULL)
9871 goto leave;
9872 tmpend = tmp + newlength;
9873 outdata = PyUnicode_DATA(res);
9874 outkind = PyUnicode_KIND(res);
9875 switch (outkind) {
9876 case PyUnicode_1BYTE_KIND:
9877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9878 break;
9879 case PyUnicode_2BYTE_KIND:
9880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9881 break;
9882 case PyUnicode_4BYTE_KIND:
9883 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9884 break;
9885 default:
9886 assert(0);
9887 break;
9888 }
9889 leave:
9890 PyMem_FREE(tmp);
9891 return res;
9892 }
9893
9894 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9895 PyUnicode_Join(PyObject *separator, PyObject *seq)
9896 {
9897 PyObject *res;
9898 PyObject *fseq;
9899 Py_ssize_t seqlen;
9900 PyObject **items;
9901
9902 fseq = PySequence_Fast(seq, "can only join an iterable");
9903 if (fseq == NULL) {
9904 return NULL;
9905 }
9906
9907 /* NOTE: the following code can't call back into Python code,
9908 * so we are sure that fseq won't be mutated.
9909 */
9910
9911 items = PySequence_Fast_ITEMS(fseq);
9912 seqlen = PySequence_Fast_GET_SIZE(fseq);
9913 res = _PyUnicode_JoinArray(separator, items, seqlen);
9914 Py_DECREF(fseq);
9915 return res;
9916 }
9917
/* Join the `seqlen` objects stored in `items` into a single string, placing
   `separator` between consecutive items.

   separator: a str, or NULL to use a single blank space.  It is only
              inspected when there is more than one item.
   items:     array of `seqlen` objects; every one must be a str (or a
              subclass), otherwise TypeError is raised.
   seqlen:    number of entries in `items`.

   Returns a new reference to the joined string, or NULL with an exception
   set (TypeError for a non-str separator/item, OverflowError when the total
   length would exceed PY_SSIZE_T_MAX, or whatever PyUnicode_READY() /
   PyUnicode_New() / PyUnicode_FromOrdinal() raised).

   The work is done in two passes: the first computes the result length and
   maximum character, the second copies the data. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL; /* owned reference to the effective separator */
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj; /* previous object seen in the pre-pass (sep first) */
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A lone non-exact item: the separator is never used, so it is
           neither validated nor referenced (sep stays NULL). */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* Seed the kind comparison below with the separator, so that a
           separator whose kind differs from the items disables memcpy. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by one separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Check before adding so that sz itself can never overflow. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The raw memcpy path is only usable while every object seen so
           far (separator included) has the same kind. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Same kind everywhere: copy raw bytes, `kind` bytes per
           character, advancing res_data as a write cursor. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Mixed kinds (or a debug build): copy through
           _PyUnicode_FastCopyCharacters(), tracking the write position
           in characters rather than bytes. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* Both pointers may still be NULL here, hence the X variants. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10089
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is rejected by assertion.

   `kind`, `value` and `length` are each evaluated exactly once and `data`
   and `start` at most once, so arbitrary expressions (including ones with
   side effects or low-precedence operators) are safe as arguments.  No
   bounds checking is done; that is the caller's job. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (int)(kind); \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        const Py_ssize_t fill_len_ = (length); \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            for (Py_ssize_t i_ = 0; i_ < fill_len_; ++i_) \
                to_[i_] = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            for (Py_ssize_t i_ = 0; i_ < fill_len_; ++i_) \
                to_[i_] = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10113
10114 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10115 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10116 Py_UCS4 fill_char)
10117 {
10118 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10119 const void *data = PyUnicode_DATA(unicode);
10120 assert(PyUnicode_IS_READY(unicode));
10121 assert(unicode_modifiable(unicode));
10122 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10123 assert(start >= 0);
10124 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10125 FILL(kind, data, fill_char, start, length);
10126 }
10127
10128 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10129 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10130 Py_UCS4 fill_char)
10131 {
10132 Py_ssize_t maxlen;
10133
10134 if (!PyUnicode_Check(unicode)) {
10135 PyErr_BadInternalCall();
10136 return -1;
10137 }
10138 if (PyUnicode_READY(unicode) == -1)
10139 return -1;
10140 if (unicode_check_modifiable(unicode))
10141 return -1;
10142
10143 if (start < 0) {
10144 PyErr_SetString(PyExc_IndexError, "string index out of range");
10145 return -1;
10146 }
10147 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10148 PyErr_SetString(PyExc_ValueError,
10149 "fill character is bigger than "
10150 "the string maximum character");
10151 return -1;
10152 }
10153
10154 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10155 length = Py_MIN(maxlen, length);
10156 if (length <= 0)
10157 return 0;
10158
10159 _PyUnicode_FastFill(unicode, start, length, fill_char);
10160 return length;
10161 }
10162
10163 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10164 pad(PyObject *self,
10165 Py_ssize_t left,
10166 Py_ssize_t right,
10167 Py_UCS4 fill)
10168 {
10169 PyObject *u;
10170 Py_UCS4 maxchar;
10171 int kind;
10172 void *data;
10173
10174 if (left < 0)
10175 left = 0;
10176 if (right < 0)
10177 right = 0;
10178
10179 if (left == 0 && right == 0)
10180 return unicode_result_unchanged(self);
10181
10182 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10183 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10184 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10185 return NULL;
10186 }
10187 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10188 maxchar = Py_MAX(maxchar, fill);
10189 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10190 if (!u)
10191 return NULL;
10192
10193 kind = PyUnicode_KIND(u);
10194 data = PyUnicode_DATA(u);
10195 if (left)
10196 FILL(kind, data, fill, 0, left);
10197 if (right)
10198 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10199 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10200 assert(_PyUnicode_CheckConsistency(u, 1));
10201 return u;
10202 }
10203
10204 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10205 PyUnicode_Splitlines(PyObject *string, int keepends)
10206 {
10207 PyObject *list;
10208
10209 if (ensure_unicode(string) < 0)
10210 return NULL;
10211
10212 switch (PyUnicode_KIND(string)) {
10213 case PyUnicode_1BYTE_KIND:
10214 if (PyUnicode_IS_ASCII(string))
10215 list = asciilib_splitlines(
10216 string, PyUnicode_1BYTE_DATA(string),
10217 PyUnicode_GET_LENGTH(string), keepends);
10218 else
10219 list = ucs1lib_splitlines(
10220 string, PyUnicode_1BYTE_DATA(string),
10221 PyUnicode_GET_LENGTH(string), keepends);
10222 break;
10223 case PyUnicode_2BYTE_KIND:
10224 list = ucs2lib_splitlines(
10225 string, PyUnicode_2BYTE_DATA(string),
10226 PyUnicode_GET_LENGTH(string), keepends);
10227 break;
10228 case PyUnicode_4BYTE_KIND:
10229 list = ucs4lib_splitlines(
10230 string, PyUnicode_4BYTE_DATA(string),
10231 PyUnicode_GET_LENGTH(string), keepends);
10232 break;
10233 default:
10234 assert(0);
10235 list = 0;
10236 }
10237 return list;
10238 }
10239
10240 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10241 split(PyObject *self,
10242 PyObject *substring,
10243 Py_ssize_t maxcount)
10244 {
10245 int kind1, kind2;
10246 void *buf1, *buf2;
10247 Py_ssize_t len1, len2;
10248 PyObject* out;
10249
10250 if (maxcount < 0)
10251 maxcount = PY_SSIZE_T_MAX;
10252
10253 if (PyUnicode_READY(self) == -1)
10254 return NULL;
10255
10256 if (substring == NULL)
10257 switch (PyUnicode_KIND(self)) {
10258 case PyUnicode_1BYTE_KIND:
10259 if (PyUnicode_IS_ASCII(self))
10260 return asciilib_split_whitespace(
10261 self, PyUnicode_1BYTE_DATA(self),
10262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
10264 else
10265 return ucs1lib_split_whitespace(
10266 self, PyUnicode_1BYTE_DATA(self),
10267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
10269 case PyUnicode_2BYTE_KIND:
10270 return ucs2lib_split_whitespace(
10271 self, PyUnicode_2BYTE_DATA(self),
10272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 case PyUnicode_4BYTE_KIND:
10275 return ucs4lib_split_whitespace(
10276 self, PyUnicode_4BYTE_DATA(self),
10277 PyUnicode_GET_LENGTH(self), maxcount
10278 );
10279 default:
10280 assert(0);
10281 return NULL;
10282 }
10283
10284 if (PyUnicode_READY(substring) == -1)
10285 return NULL;
10286
10287 kind1 = PyUnicode_KIND(self);
10288 kind2 = PyUnicode_KIND(substring);
10289 len1 = PyUnicode_GET_LENGTH(self);
10290 len2 = PyUnicode_GET_LENGTH(substring);
10291 if (kind1 < kind2 || len1 < len2) {
10292 out = PyList_New(1);
10293 if (out == NULL)
10294 return NULL;
10295 Py_INCREF(self);
10296 PyList_SET_ITEM(out, 0, self);
10297 return out;
10298 }
10299 buf1 = PyUnicode_DATA(self);
10300 buf2 = PyUnicode_DATA(substring);
10301 if (kind2 != kind1) {
10302 buf2 = _PyUnicode_AsKind(substring, kind1);
10303 if (!buf2)
10304 return NULL;
10305 }
10306
10307 switch (kind1) {
10308 case PyUnicode_1BYTE_KIND:
10309 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10310 out = asciilib_split(
10311 self, buf1, len1, buf2, len2, maxcount);
10312 else
10313 out = ucs1lib_split(
10314 self, buf1, len1, buf2, len2, maxcount);
10315 break;
10316 case PyUnicode_2BYTE_KIND:
10317 out = ucs2lib_split(
10318 self, buf1, len1, buf2, len2, maxcount);
10319 break;
10320 case PyUnicode_4BYTE_KIND:
10321 out = ucs4lib_split(
10322 self, buf1, len1, buf2, len2, maxcount);
10323 break;
10324 default:
10325 out = NULL;
10326 }
10327 if (kind2 != kind1)
10328 PyMem_Free(buf2);
10329 return out;
10330 }
10331
10332 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10333 rsplit(PyObject *self,
10334 PyObject *substring,
10335 Py_ssize_t maxcount)
10336 {
10337 int kind1, kind2;
10338 void *buf1, *buf2;
10339 Py_ssize_t len1, len2;
10340 PyObject* out;
10341
10342 if (maxcount < 0)
10343 maxcount = PY_SSIZE_T_MAX;
10344
10345 if (PyUnicode_READY(self) == -1)
10346 return NULL;
10347
10348 if (substring == NULL)
10349 switch (PyUnicode_KIND(self)) {
10350 case PyUnicode_1BYTE_KIND:
10351 if (PyUnicode_IS_ASCII(self))
10352 return asciilib_rsplit_whitespace(
10353 self, PyUnicode_1BYTE_DATA(self),
10354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
10356 else
10357 return ucs1lib_rsplit_whitespace(
10358 self, PyUnicode_1BYTE_DATA(self),
10359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 case PyUnicode_2BYTE_KIND:
10362 return ucs2lib_rsplit_whitespace(
10363 self, PyUnicode_2BYTE_DATA(self),
10364 PyUnicode_GET_LENGTH(self), maxcount
10365 );
10366 case PyUnicode_4BYTE_KIND:
10367 return ucs4lib_rsplit_whitespace(
10368 self, PyUnicode_4BYTE_DATA(self),
10369 PyUnicode_GET_LENGTH(self), maxcount
10370 );
10371 default:
10372 assert(0);
10373 return NULL;
10374 }
10375
10376 if (PyUnicode_READY(substring) == -1)
10377 return NULL;
10378
10379 kind1 = PyUnicode_KIND(self);
10380 kind2 = PyUnicode_KIND(substring);
10381 len1 = PyUnicode_GET_LENGTH(self);
10382 len2 = PyUnicode_GET_LENGTH(substring);
10383 if (kind1 < kind2 || len1 < len2) {
10384 out = PyList_New(1);
10385 if (out == NULL)
10386 return NULL;
10387 Py_INCREF(self);
10388 PyList_SET_ITEM(out, 0, self);
10389 return out;
10390 }
10391 buf1 = PyUnicode_DATA(self);
10392 buf2 = PyUnicode_DATA(substring);
10393 if (kind2 != kind1) {
10394 buf2 = _PyUnicode_AsKind(substring, kind1);
10395 if (!buf2)
10396 return NULL;
10397 }
10398
10399 switch (kind1) {
10400 case PyUnicode_1BYTE_KIND:
10401 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10402 out = asciilib_rsplit(
10403 self, buf1, len1, buf2, len2, maxcount);
10404 else
10405 out = ucs1lib_rsplit(
10406 self, buf1, len1, buf2, len2, maxcount);
10407 break;
10408 case PyUnicode_2BYTE_KIND:
10409 out = ucs2lib_rsplit(
10410 self, buf1, len1, buf2, len2, maxcount);
10411 break;
10412 case PyUnicode_4BYTE_KIND:
10413 out = ucs4lib_rsplit(
10414 self, buf1, len1, buf2, len2, maxcount);
10415 break;
10416 default:
10417 out = NULL;
10418 }
10419 if (kind2 != kind1)
10420 PyMem_Free(buf2);
10421 return out;
10422 }
10423
10424 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10425 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10426 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10427 {
10428 switch (kind) {
10429 case PyUnicode_1BYTE_KIND:
10430 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10431 return asciilib_find(buf1, len1, buf2, len2, offset);
10432 else
10433 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10434 case PyUnicode_2BYTE_KIND:
10435 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10436 case PyUnicode_4BYTE_KIND:
10437 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10438 }
10439 assert(0);
10440 return -1;
10441 }
10442
10443 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10444 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10445 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10446 {
10447 switch (kind) {
10448 case PyUnicode_1BYTE_KIND:
10449 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10450 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10451 else
10452 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10453 case PyUnicode_2BYTE_KIND:
10454 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10455 case PyUnicode_4BYTE_KIND:
10456 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10457 }
10458 assert(0);
10459 return 0;
10460 }
10461
10462 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10463 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10464 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10465 {
10466 int kind = PyUnicode_KIND(u);
10467 void *data = PyUnicode_DATA(u);
10468 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10469 if (kind == PyUnicode_1BYTE_KIND) {
10470 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10471 (Py_UCS1 *)data + len,
10472 u1, u2, maxcount);
10473 }
10474 else if (kind == PyUnicode_2BYTE_KIND) {
10475 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10476 (Py_UCS2 *)data + len,
10477 u1, u2, maxcount);
10478 }
10479 else {
10480 assert(kind == PyUnicode_4BYTE_KIND);
10481 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10482 (Py_UCS4 *)data + len,
10483 u1, u2, maxcount);
10484 }
10485 }
10486
/* Return a string in which up to `maxcount` occurrences of `str1` in `self`
   are replaced by `str2`.  A negative `maxcount` means "no limit".

   The kind, data pointer and length of all three strings are read right
   away without calling PyUnicode_READY(), so callers presumably pass ready
   strings -- TODO confirm against the callers.

   Result:
     - unicode_result_unchanged(self) when there is nothing to replace;
     - otherwise a newly built string;
     - NULL on failure (allocation failure, or OverflowError when the
       result would be too long).

   Buffers may have to be widened so that self, str1 and str2 all share one
   kind (`rkind`).  Each widened copy is tracked by a `*release*` flag and
   freed on every exit path (done / nothing / error). */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    /* Set to 1 once the matching buffer above is a widened temporary copy
       that must be released before returning. */
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* With an unlimited count an empty self is still processed below:
       replacing the empty string in it can produce a non-empty result. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* Replacing a string with the very same object changes nothing. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy self, then patch the copy starting at the first hit. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* Look for the first match before allocating anything else. */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* Drop the earlier widened copy of str1 (if any) before
                   creating one of the new kind. */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* All offsets below are in characters; multiplying by rkind
               converts them to byte offsets. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* The first replacement is already done, hence the
               pre-decrement. */
            while ( --maxcount > 0) {
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* Different lengths: count the matches first so that the result
           can be allocated with its final size. */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* Only growth can overflow; test it by division so that the
           multiplication below is known to be safe. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* Everything was replaced by nothing: return the empty
               string singleton. */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* The memcpy() sizes below are rkind * count, so the byte size
           must fit as well. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        /* i is the read position in sbuf, ires the write position in res
           (both in characters). */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self,
               alternating until n copies of str2 have been written. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* Whatever is left of self follows the last insertion. */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* May replace u; a NULL u afterwards is treated as failure. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* A buffer pointer may be NULL here when the allocation that was
       meant to replace it is what failed. */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10748
10749 /* --- Unicode Object Methods --------------------------------------------- */
10750
10751 PyDoc_STRVAR(title__doc__,
10752 "S.title() -> str\n\
10753 \n\
10754 Return a titlecased version of S, i.e. words start with title case\n\
10755 characters, all remaining cased characters have lower case.");
10756
10757 static PyObject*
unicode_title(PyObject * self)10758 unicode_title(PyObject *self)
10759 {
10760 if (PyUnicode_READY(self) == -1)
10761 return NULL;
10762 return case_operation(self, do_title);
10763 }
10764
10765 PyDoc_STRVAR(capitalize__doc__,
10766 "S.capitalize() -> str\n\
10767 \n\
10768 Return a capitalized version of S, i.e. make the first character\n\
10769 have upper case and the rest lower case.");
10770
10771 static PyObject*
unicode_capitalize(PyObject * self)10772 unicode_capitalize(PyObject *self)
10773 {
10774 if (PyUnicode_READY(self) == -1)
10775 return NULL;
10776 if (PyUnicode_GET_LENGTH(self) == 0)
10777 return unicode_result_unchanged(self);
10778 return case_operation(self, do_capitalize);
10779 }
10780
10781 PyDoc_STRVAR(casefold__doc__,
10782 "S.casefold() -> str\n\
10783 \n\
10784 Return a version of S suitable for caseless comparisons.");
10785
10786 static PyObject *
unicode_casefold(PyObject * self)10787 unicode_casefold(PyObject *self)
10788 {
10789 if (PyUnicode_READY(self) == -1)
10790 return NULL;
10791 if (PyUnicode_IS_ASCII(self))
10792 return ascii_upper_or_lower(self, 1);
10793 return case_operation(self, do_casefold);
10794 }
10795
10796
10797 /* Argument converter. Accepts a single Unicode character. */
10798
10799 static int
convert_uc(PyObject * obj,void * addr)10800 convert_uc(PyObject *obj, void *addr)
10801 {
10802 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10803
10804 if (!PyUnicode_Check(obj)) {
10805 PyErr_Format(PyExc_TypeError,
10806 "The fill character must be a unicode character, "
10807 "not %.100s", Py_TYPE(obj)->tp_name);
10808 return 0;
10809 }
10810 if (PyUnicode_READY(obj) < 0)
10811 return 0;
10812 if (PyUnicode_GET_LENGTH(obj) != 1) {
10813 PyErr_SetString(PyExc_TypeError,
10814 "The fill character must be exactly one character long");
10815 return 0;
10816 }
10817 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10818 return 1;
10819 }
10820
10821 PyDoc_STRVAR(center__doc__,
10822 "S.center(width[, fillchar]) -> str\n\
10823 \n\
10824 Return S centered in a string of length width. Padding is\n\
10825 done using the specified fill character (default is a space)");
10826
10827 static PyObject *
unicode_center(PyObject * self,PyObject * args)10828 unicode_center(PyObject *self, PyObject *args)
10829 {
10830 Py_ssize_t marg, left;
10831 Py_ssize_t width;
10832 Py_UCS4 fillchar = ' ';
10833
10834 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10835 return NULL;
10836
10837 if (PyUnicode_READY(self) == -1)
10838 return NULL;
10839
10840 if (PyUnicode_GET_LENGTH(self) >= width)
10841 return unicode_result_unchanged(self);
10842
10843 marg = width - PyUnicode_GET_LENGTH(self);
10844 left = marg / 2 + (marg & width & 1);
10845
10846 return pad(self, left, marg - left, fillchar);
10847 }
10848
/* Three-way comparison of two strings by code point.

   This function assumes that str1 and str2 are readied by the caller.

   The first min(len1, len2) characters are compared; the first differing
   pair decides the result.  If that common prefix is identical, the
   shorter string sorts first.  Returns -1, 0 or 1. */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{
/* Compare `len` elements of data1 (read as TYPE1) with data2 (read as
   TYPE2), widening both sides to Py_UCS4.  Returns from the enclosing
   function on the first mismatch and falls through when all are equal. */
#define COMPARE(TYPE1, TYPE2) \
    do { \
        TYPE1* p1 = (TYPE1 *)data1; \
        TYPE2* p2 = (TYPE2 *)data2; \
        TYPE1* end = p1 + len; \
        Py_UCS4 c1, c2; \
        for (; p1 != end; p1++, p2++) { \
            c1 = *p1; \
            c2 = *p2; \
            if (c1 != c2) \
                return (c1 < c2) ? -1 : 1; \
        } \
    } \
    while (0)

    int kind1, kind2;
    void *data1, *data2;
    Py_ssize_t len1, len2, len;

    kind1 = PyUnicode_KIND(str1);
    kind2 = PyUnicode_KIND(str2);
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
    len1 = PyUnicode_GET_LENGTH(str1);
    len2 = PyUnicode_GET_LENGTH(str2);
    /* Only the common prefix can be compared element by element. */
    len = Py_MIN(len1, len2);

    /* Dispatch on both kinds so each pairing reads its buffers with the
       right element width. */
    switch(kind1) {
    case PyUnicode_1BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
        {
            /* Elements are single bytes, so memcmp() over len bytes
               compares exactly the common prefix. */
            int cmp = memcmp(data1, data2, len);
            /* normalize result of memcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
            break;
        }
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS4);
            break;
        default:
            assert(0);
        }
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
        {
            COMPARE(Py_UCS2, Py_UCS2);
            break;
        }
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS4);
            break;
        default:
            assert(0);
        }
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
        {
            /* wmemcmp() is only used when wchar_t has the same size as
               Py_UCS4; otherwise fall back to the generic loop. */
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
            /* normalize result of wmemcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
#else
            COMPARE(Py_UCS4, Py_UCS4);
#endif
            break;
        }
        default:
            assert(0);
        }
        break;
    }
    default:
        assert(0);
    }

    /* The common prefix is identical: the lengths decide. */
    if (len1 == len2)
        return 0;
    if (len1 < len2)
        return -1;
    else
        return 1;

#undef COMPARE
}
10966
10967 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10968 unicode_compare_eq(PyObject *str1, PyObject *str2)
10969 {
10970 int kind;
10971 void *data1, *data2;
10972 Py_ssize_t len;
10973 int cmp;
10974
10975 len = PyUnicode_GET_LENGTH(str1);
10976 if (PyUnicode_GET_LENGTH(str2) != len)
10977 return 0;
10978 kind = PyUnicode_KIND(str1);
10979 if (PyUnicode_KIND(str2) != kind)
10980 return 0;
10981 data1 = PyUnicode_DATA(str1);
10982 data2 = PyUnicode_DATA(str2);
10983
10984 cmp = memcmp(data1, data2, len * kind);
10985 return (cmp == 0);
10986 }
10987
10988
10989 int
PyUnicode_Compare(PyObject * left,PyObject * right)10990 PyUnicode_Compare(PyObject *left, PyObject *right)
10991 {
10992 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10993 if (PyUnicode_READY(left) == -1 ||
10994 PyUnicode_READY(right) == -1)
10995 return -1;
10996
10997 /* a string is equal to itself */
10998 if (left == right)
10999 return 0;
11000
11001 return unicode_compare(left, right);
11002 }
11003 PyErr_Format(PyExc_TypeError,
11004 "Can't compare %.100s and %.100s",
11005 left->ob_type->tp_name,
11006 right->ob_type->tp_name);
11007 return -1;
11008 }
11009
11010 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11011 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11012 {
11013 Py_ssize_t i;
11014 int kind;
11015 Py_UCS4 chr;
11016 const unsigned char *ustr = (const unsigned char *)str;
11017
11018 assert(_PyUnicode_CHECK(uni));
11019 if (!PyUnicode_IS_READY(uni)) {
11020 const wchar_t *ws = _PyUnicode_WSTR(uni);
11021 /* Compare Unicode string and source character set string */
11022 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11023 if (chr != ustr[i])
11024 return (chr < ustr[i]) ? -1 : 1;
11025 }
11026 /* This check keeps Python strings that end in '\0' from comparing equal
11027 to C strings identical up to that point. */
11028 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11029 return 1; /* uni is longer */
11030 if (ustr[i])
11031 return -1; /* str is longer */
11032 return 0;
11033 }
11034 kind = PyUnicode_KIND(uni);
11035 if (kind == PyUnicode_1BYTE_KIND) {
11036 const void *data = PyUnicode_1BYTE_DATA(uni);
11037 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11038 size_t len, len2 = strlen(str);
11039 int cmp;
11040
11041 len = Py_MIN(len1, len2);
11042 cmp = memcmp(data, str, len);
11043 if (cmp != 0) {
11044 if (cmp < 0)
11045 return -1;
11046 else
11047 return 1;
11048 }
11049 if (len1 > len2)
11050 return 1; /* uni is longer */
11051 if (len1 < len2)
11052 return -1; /* str is longer */
11053 return 0;
11054 }
11055 else {
11056 void *data = PyUnicode_DATA(uni);
11057 /* Compare Unicode string and source character set string */
11058 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11059 if (chr != (unsigned char)str[i])
11060 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11061 /* This check keeps Python strings that end in '\0' from comparing equal
11062 to C strings identical up to that point. */
11063 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11064 return 1; /* uni is longer */
11065 if (str[i])
11066 return -1; /* str is longer */
11067 return 0;
11068 }
11069 }
11070
11071 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11072 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11073 {
11074 size_t i, len;
11075 const wchar_t *p;
11076 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11077 if (strlen(str) != len)
11078 return 0;
11079 p = _PyUnicode_WSTR(unicode);
11080 assert(p);
11081 for (i = 0; i < len; i++) {
11082 unsigned char c = (unsigned char)str[i];
11083 if (c >= 128 || p[i] != (wchar_t)c)
11084 return 0;
11085 }
11086 return 1;
11087 }
11088
11089 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11090 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11091 {
11092 size_t len;
11093 assert(_PyUnicode_CHECK(unicode));
11094 assert(str);
11095 #ifndef NDEBUG
11096 for (const char *p = str; *p; p++) {
11097 assert((unsigned char)*p < 128);
11098 }
11099 #endif
11100 if (PyUnicode_READY(unicode) == -1) {
11101 /* Memory error or bad data */
11102 PyErr_Clear();
11103 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11104 }
11105 if (!PyUnicode_IS_ASCII(unicode))
11106 return 0;
11107 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11108 return strlen(str) == len &&
11109 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11110 }
11111
11112 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11113 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11114 {
11115 PyObject *right_uni;
11116 Py_hash_t hash;
11117
11118 assert(_PyUnicode_CHECK(left));
11119 assert(right->string);
11120 #ifndef NDEBUG
11121 for (const char *p = right->string; *p; p++) {
11122 assert((unsigned char)*p < 128);
11123 }
11124 #endif
11125
11126 if (PyUnicode_READY(left) == -1) {
11127 /* memory error or bad data */
11128 PyErr_Clear();
11129 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11130 }
11131
11132 if (!PyUnicode_IS_ASCII(left))
11133 return 0;
11134
11135 right_uni = _PyUnicode_FromId(right); /* borrowed */
11136 if (right_uni == NULL) {
11137 /* memory error or bad data */
11138 PyErr_Clear();
11139 return _PyUnicode_EqualToASCIIString(left, right->string);
11140 }
11141
11142 if (left == right_uni)
11143 return 1;
11144
11145 if (PyUnicode_CHECK_INTERNED(left))
11146 return 0;
11147
11148 assert(_PyUnicode_HASH(right_uni) != 1);
11149 hash = _PyUnicode_HASH(left);
11150 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11151 return 0;
11152
11153 return unicode_compare_eq(left, right_uni);
11154 }
11155
11156 #define TEST_COND(cond) \
11157 ((cond) ? Py_True : Py_False)
11158
11159 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11161 {
11162 int result;
11163 PyObject *v;
11164
11165 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11166 Py_RETURN_NOTIMPLEMENTED;
11167
11168 if (PyUnicode_READY(left) == -1 ||
11169 PyUnicode_READY(right) == -1)
11170 return NULL;
11171
11172 if (left == right) {
11173 switch (op) {
11174 case Py_EQ:
11175 case Py_LE:
11176 case Py_GE:
11177 /* a string is equal to itself */
11178 v = Py_True;
11179 break;
11180 case Py_NE:
11181 case Py_LT:
11182 case Py_GT:
11183 v = Py_False;
11184 break;
11185 default:
11186 PyErr_BadArgument();
11187 return NULL;
11188 }
11189 }
11190 else if (op == Py_EQ || op == Py_NE) {
11191 result = unicode_compare_eq(left, right);
11192 result ^= (op == Py_NE);
11193 v = TEST_COND(result);
11194 }
11195 else {
11196 result = unicode_compare(left, right);
11197
11198 /* Convert the return value to a Boolean */
11199 switch (op) {
11200 case Py_LE:
11201 v = TEST_COND(result <= 0);
11202 break;
11203 case Py_GE:
11204 v = TEST_COND(result >= 0);
11205 break;
11206 case Py_LT:
11207 v = TEST_COND(result == -1);
11208 break;
11209 case Py_GT:
11210 v = TEST_COND(result == 1);
11211 break;
11212 default:
11213 PyErr_BadArgument();
11214 return NULL;
11215 }
11216 }
11217 Py_INCREF(v);
11218 return v;
11219 }
11220
11221 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11222 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11223 {
11224 return unicode_eq(aa, bb);
11225 }
11226
11227 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11228 PyUnicode_Contains(PyObject *str, PyObject *substr)
11229 {
11230 int kind1, kind2;
11231 void *buf1, *buf2;
11232 Py_ssize_t len1, len2;
11233 int result;
11234
11235 if (!PyUnicode_Check(substr)) {
11236 PyErr_Format(PyExc_TypeError,
11237 "'in <string>' requires string as left operand, not %.100s",
11238 Py_TYPE(substr)->tp_name);
11239 return -1;
11240 }
11241 if (PyUnicode_READY(substr) == -1)
11242 return -1;
11243 if (ensure_unicode(str) < 0)
11244 return -1;
11245
11246 kind1 = PyUnicode_KIND(str);
11247 kind2 = PyUnicode_KIND(substr);
11248 if (kind1 < kind2)
11249 return 0;
11250 len1 = PyUnicode_GET_LENGTH(str);
11251 len2 = PyUnicode_GET_LENGTH(substr);
11252 if (len1 < len2)
11253 return 0;
11254 buf1 = PyUnicode_DATA(str);
11255 buf2 = PyUnicode_DATA(substr);
11256 if (len2 == 1) {
11257 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259 return result;
11260 }
11261 if (kind2 != kind1) {
11262 buf2 = _PyUnicode_AsKind(substr, kind1);
11263 if (!buf2)
11264 return -1;
11265 }
11266
11267 switch (kind1) {
11268 case PyUnicode_1BYTE_KIND:
11269 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270 break;
11271 case PyUnicode_2BYTE_KIND:
11272 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273 break;
11274 case PyUnicode_4BYTE_KIND:
11275 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276 break;
11277 default:
11278 result = -1;
11279 assert(0);
11280 }
11281
11282 if (kind2 != kind1)
11283 PyMem_Free(buf2);
11284
11285 return result;
11286 }
11287
11288 /* Concat to string or Unicode object giving a new Unicode object. */
11289
11290 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11291 PyUnicode_Concat(PyObject *left, PyObject *right)
11292 {
11293 PyObject *result;
11294 Py_UCS4 maxchar, maxchar2;
11295 Py_ssize_t left_len, right_len, new_len;
11296
11297 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11298 return NULL;
11299
11300 /* Shortcuts */
11301 if (left == unicode_empty)
11302 return PyUnicode_FromObject(right);
11303 if (right == unicode_empty)
11304 return PyUnicode_FromObject(left);
11305
11306 left_len = PyUnicode_GET_LENGTH(left);
11307 right_len = PyUnicode_GET_LENGTH(right);
11308 if (left_len > PY_SSIZE_T_MAX - right_len) {
11309 PyErr_SetString(PyExc_OverflowError,
11310 "strings are too large to concat");
11311 return NULL;
11312 }
11313 new_len = left_len + right_len;
11314
11315 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11316 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11317 maxchar = Py_MAX(maxchar, maxchar2);
11318
11319 /* Concat the two Unicode strings */
11320 result = PyUnicode_New(new_len, maxchar);
11321 if (result == NULL)
11322 return NULL;
11323 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11324 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11325 assert(_PyUnicode_CheckConsistency(result, 1));
11326 return result;
11327 }
11328
11329 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11330 PyUnicode_Append(PyObject **p_left, PyObject *right)
11331 {
11332 PyObject *left, *res;
11333 Py_UCS4 maxchar, maxchar2;
11334 Py_ssize_t left_len, right_len, new_len;
11335
11336 if (p_left == NULL) {
11337 if (!PyErr_Occurred())
11338 PyErr_BadInternalCall();
11339 return;
11340 }
11341 left = *p_left;
11342 if (right == NULL || left == NULL
11343 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11344 if (!PyErr_Occurred())
11345 PyErr_BadInternalCall();
11346 goto error;
11347 }
11348
11349 if (PyUnicode_READY(left) == -1)
11350 goto error;
11351 if (PyUnicode_READY(right) == -1)
11352 goto error;
11353
11354 /* Shortcuts */
11355 if (left == unicode_empty) {
11356 Py_DECREF(left);
11357 Py_INCREF(right);
11358 *p_left = right;
11359 return;
11360 }
11361 if (right == unicode_empty)
11362 return;
11363
11364 left_len = PyUnicode_GET_LENGTH(left);
11365 right_len = PyUnicode_GET_LENGTH(right);
11366 if (left_len > PY_SSIZE_T_MAX - right_len) {
11367 PyErr_SetString(PyExc_OverflowError,
11368 "strings are too large to concat");
11369 goto error;
11370 }
11371 new_len = left_len + right_len;
11372
11373 if (unicode_modifiable(left)
11374 && PyUnicode_CheckExact(right)
11375 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11376 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11377 to change the structure size, but characters are stored just after
11378 the structure, and so it requires to move all characters which is
11379 not so different than duplicating the string. */
11380 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11381 {
11382 /* append inplace */
11383 if (unicode_resize(p_left, new_len) != 0)
11384 goto error;
11385
11386 /* copy 'right' into the newly allocated area of 'left' */
11387 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11388 }
11389 else {
11390 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11391 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11392 maxchar = Py_MAX(maxchar, maxchar2);
11393
11394 /* Concat the two Unicode strings */
11395 res = PyUnicode_New(new_len, maxchar);
11396 if (res == NULL)
11397 goto error;
11398 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11399 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11400 Py_DECREF(left);
11401 *p_left = res;
11402 }
11403 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11404 return;
11405
11406 error:
11407 Py_CLEAR(*p_left);
11408 }
11409
11410 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11411 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11412 {
11413 PyUnicode_Append(pleft, right);
11414 Py_XDECREF(right);
11415 }
11416
11417 /*
11418 Wraps stringlib_parse_args_finds() and additionally ensures that the
11419 first argument is a unicode object.
11420 */
11421
11422 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11423 parse_args_finds_unicode(const char * function_name, PyObject *args,
11424 PyObject **substring,
11425 Py_ssize_t *start, Py_ssize_t *end)
11426 {
11427 if(stringlib_parse_args_finds(function_name, args, substring,
11428 start, end)) {
11429 if (ensure_unicode(*substring) < 0)
11430 return 0;
11431 return 1;
11432 }
11433 return 0;
11434 }
11435
11436 PyDoc_STRVAR(count__doc__,
11437 "S.count(sub[, start[, end]]) -> int\n\
11438 \n\
11439 Return the number of non-overlapping occurrences of substring sub in\n\
11440 string S[start:end]. Optional arguments start and end are\n\
11441 interpreted as in slice notation.");
11442
11443 static PyObject *
unicode_count(PyObject * self,PyObject * args)11444 unicode_count(PyObject *self, PyObject *args)
11445 {
11446 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11447 Py_ssize_t start = 0;
11448 Py_ssize_t end = PY_SSIZE_T_MAX;
11449 PyObject *result;
11450 int kind1, kind2;
11451 void *buf1, *buf2;
11452 Py_ssize_t len1, len2, iresult;
11453
11454 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11455 return NULL;
11456
11457 kind1 = PyUnicode_KIND(self);
11458 kind2 = PyUnicode_KIND(substring);
11459 if (kind1 < kind2)
11460 return PyLong_FromLong(0);
11461
11462 len1 = PyUnicode_GET_LENGTH(self);
11463 len2 = PyUnicode_GET_LENGTH(substring);
11464 ADJUST_INDICES(start, end, len1);
11465 if (end - start < len2)
11466 return PyLong_FromLong(0);
11467
11468 buf1 = PyUnicode_DATA(self);
11469 buf2 = PyUnicode_DATA(substring);
11470 if (kind2 != kind1) {
11471 buf2 = _PyUnicode_AsKind(substring, kind1);
11472 if (!buf2)
11473 return NULL;
11474 }
11475 switch (kind1) {
11476 case PyUnicode_1BYTE_KIND:
11477 iresult = ucs1lib_count(
11478 ((Py_UCS1*)buf1) + start, end - start,
11479 buf2, len2, PY_SSIZE_T_MAX
11480 );
11481 break;
11482 case PyUnicode_2BYTE_KIND:
11483 iresult = ucs2lib_count(
11484 ((Py_UCS2*)buf1) + start, end - start,
11485 buf2, len2, PY_SSIZE_T_MAX
11486 );
11487 break;
11488 case PyUnicode_4BYTE_KIND:
11489 iresult = ucs4lib_count(
11490 ((Py_UCS4*)buf1) + start, end - start,
11491 buf2, len2, PY_SSIZE_T_MAX
11492 );
11493 break;
11494 default:
11495 assert(0); iresult = 0;
11496 }
11497
11498 result = PyLong_FromSsize_t(iresult);
11499
11500 if (kind2 != kind1)
11501 PyMem_Free(buf2);
11502
11503 return result;
11504 }
11505
11506 PyDoc_STRVAR(encode__doc__,
11507 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11508 \n\
11509 Encode S using the codec registered for encoding. Default encoding\n\
11510 is 'utf-8'. errors may be given to set a different error\n\
11511 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11512 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11513 'xmlcharrefreplace' as well as any other name registered with\n\
11514 codecs.register_error that can handle UnicodeEncodeErrors.");
11515
11516 static PyObject *
unicode_encode(PyObject * self,PyObject * args,PyObject * kwargs)11517 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11518 {
11519 static char *kwlist[] = {"encoding", "errors", 0};
11520 char *encoding = NULL;
11521 char *errors = NULL;
11522
11523 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11524 kwlist, &encoding, &errors))
11525 return NULL;
11526 return PyUnicode_AsEncodedString(self, encoding, errors);
11527 }
11528
11529 PyDoc_STRVAR(expandtabs__doc__,
11530 "S.expandtabs(tabsize=8) -> str\n\
11531 \n\
11532 Return a copy of S where all tab characters are expanded using spaces.\n\
11533 If tabsize is not given, a tab size of 8 characters is assumed.");
11534
11535 static PyObject*
unicode_expandtabs(PyObject * self,PyObject * args,PyObject * kwds)11536 unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11537 {
11538 Py_ssize_t i, j, line_pos, src_len, incr;
11539 Py_UCS4 ch;
11540 PyObject *u;
11541 void *src_data, *dest_data;
11542 static char *kwlist[] = {"tabsize", 0};
11543 int tabsize = 8;
11544 int kind;
11545 int found;
11546
11547 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11548 kwlist, &tabsize))
11549 return NULL;
11550
11551 if (PyUnicode_READY(self) == -1)
11552 return NULL;
11553
11554 /* First pass: determine size of output string */
11555 src_len = PyUnicode_GET_LENGTH(self);
11556 i = j = line_pos = 0;
11557 kind = PyUnicode_KIND(self);
11558 src_data = PyUnicode_DATA(self);
11559 found = 0;
11560 for (; i < src_len; i++) {
11561 ch = PyUnicode_READ(kind, src_data, i);
11562 if (ch == '\t') {
11563 found = 1;
11564 if (tabsize > 0) {
11565 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11566 if (j > PY_SSIZE_T_MAX - incr)
11567 goto overflow;
11568 line_pos += incr;
11569 j += incr;
11570 }
11571 }
11572 else {
11573 if (j > PY_SSIZE_T_MAX - 1)
11574 goto overflow;
11575 line_pos++;
11576 j++;
11577 if (ch == '\n' || ch == '\r')
11578 line_pos = 0;
11579 }
11580 }
11581 if (!found)
11582 return unicode_result_unchanged(self);
11583
11584 /* Second pass: create output string and fill it */
11585 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11586 if (!u)
11587 return NULL;
11588 dest_data = PyUnicode_DATA(u);
11589
11590 i = j = line_pos = 0;
11591
11592 for (; i < src_len; i++) {
11593 ch = PyUnicode_READ(kind, src_data, i);
11594 if (ch == '\t') {
11595 if (tabsize > 0) {
11596 incr = tabsize - (line_pos % tabsize);
11597 line_pos += incr;
11598 FILL(kind, dest_data, ' ', j, incr);
11599 j += incr;
11600 }
11601 }
11602 else {
11603 line_pos++;
11604 PyUnicode_WRITE(kind, dest_data, j, ch);
11605 j++;
11606 if (ch == '\n' || ch == '\r')
11607 line_pos = 0;
11608 }
11609 }
11610 assert (j == PyUnicode_GET_LENGTH(u));
11611 return unicode_result(u);
11612
11613 overflow:
11614 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11615 return NULL;
11616 }
11617
11618 PyDoc_STRVAR(find__doc__,
11619 "S.find(sub[, start[, end]]) -> int\n\
11620 \n\
11621 Return the lowest index in S where substring sub is found,\n\
11622 such that sub is contained within S[start:end]. Optional\n\
11623 arguments start and end are interpreted as in slice notation.\n\
11624 \n\
11625 Return -1 on failure.");
11626
11627 static PyObject *
unicode_find(PyObject * self,PyObject * args)11628 unicode_find(PyObject *self, PyObject *args)
11629 {
11630 /* initialize variables to prevent gcc warning */
11631 PyObject *substring = NULL;
11632 Py_ssize_t start = 0;
11633 Py_ssize_t end = 0;
11634 Py_ssize_t result;
11635
11636 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11637 return NULL;
11638
11639 if (PyUnicode_READY(self) == -1)
11640 return NULL;
11641
11642 result = any_find_slice(self, substring, start, end, 1);
11643
11644 if (result == -2)
11645 return NULL;
11646
11647 return PyLong_FromSsize_t(result);
11648 }
11649
11650 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11651 unicode_getitem(PyObject *self, Py_ssize_t index)
11652 {
11653 void *data;
11654 enum PyUnicode_Kind kind;
11655 Py_UCS4 ch;
11656
11657 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11658 PyErr_BadArgument();
11659 return NULL;
11660 }
11661 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11662 PyErr_SetString(PyExc_IndexError, "string index out of range");
11663 return NULL;
11664 }
11665 kind = PyUnicode_KIND(self);
11666 data = PyUnicode_DATA(self);
11667 ch = PyUnicode_READ(kind, data, index);
11668 return unicode_char(ch);
11669 }
11670
11671 /* Believe it or not, this produces the same value for ASCII strings
11672 as bytes_hash(). */
11673 static Py_hash_t
unicode_hash(PyObject * self)11674 unicode_hash(PyObject *self)
11675 {
11676 Py_ssize_t len;
11677 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11678
11679 #ifdef Py_DEBUG
11680 assert(_Py_HashSecret_Initialized);
11681 #endif
11682 if (_PyUnicode_HASH(self) != -1)
11683 return _PyUnicode_HASH(self);
11684 if (PyUnicode_READY(self) == -1)
11685 return -1;
11686 len = PyUnicode_GET_LENGTH(self);
11687 /*
11688 We make the hash of the empty string be 0, rather than using
11689 (prefix ^ suffix), since this slightly obfuscates the hash secret
11690 */
11691 if (len == 0) {
11692 _PyUnicode_HASH(self) = 0;
11693 return 0;
11694 }
11695 x = _Py_HashBytes(PyUnicode_DATA(self),
11696 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11697 _PyUnicode_HASH(self) = x;
11698 return x;
11699 }
11700
11701 PyDoc_STRVAR(index__doc__,
11702 "S.index(sub[, start[, end]]) -> int\n\
11703 \n\
11704 Like S.find() but raise ValueError when the substring is not found.");
11705
11706 static PyObject *
unicode_index(PyObject * self,PyObject * args)11707 unicode_index(PyObject *self, PyObject *args)
11708 {
11709 /* initialize variables to prevent gcc warning */
11710 Py_ssize_t result;
11711 PyObject *substring = NULL;
11712 Py_ssize_t start = 0;
11713 Py_ssize_t end = 0;
11714
11715 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11716 return NULL;
11717
11718 if (PyUnicode_READY(self) == -1)
11719 return NULL;
11720
11721 result = any_find_slice(self, substring, start, end, 1);
11722
11723 if (result == -2)
11724 return NULL;
11725
11726 if (result < 0) {
11727 PyErr_SetString(PyExc_ValueError, "substring not found");
11728 return NULL;
11729 }
11730
11731 return PyLong_FromSsize_t(result);
11732 }
11733
11734 PyDoc_STRVAR(islower__doc__,
11735 "S.islower() -> bool\n\
11736 \n\
11737 Return True if all cased characters in S are lowercase and there is\n\
11738 at least one cased character in S, False otherwise.");
11739
11740 static PyObject*
unicode_islower(PyObject * self)11741 unicode_islower(PyObject *self)
11742 {
11743 Py_ssize_t i, length;
11744 int kind;
11745 void *data;
11746 int cased;
11747
11748 if (PyUnicode_READY(self) == -1)
11749 return NULL;
11750 length = PyUnicode_GET_LENGTH(self);
11751 kind = PyUnicode_KIND(self);
11752 data = PyUnicode_DATA(self);
11753
11754 /* Shortcut for single character strings */
11755 if (length == 1)
11756 return PyBool_FromLong(
11757 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11758
11759 /* Special case for empty strings */
11760 if (length == 0)
11761 return PyBool_FromLong(0);
11762
11763 cased = 0;
11764 for (i = 0; i < length; i++) {
11765 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11766
11767 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11768 return PyBool_FromLong(0);
11769 else if (!cased && Py_UNICODE_ISLOWER(ch))
11770 cased = 1;
11771 }
11772 return PyBool_FromLong(cased);
11773 }
11774
11775 PyDoc_STRVAR(isupper__doc__,
11776 "S.isupper() -> bool\n\
11777 \n\
11778 Return True if all cased characters in S are uppercase and there is\n\
11779 at least one cased character in S, False otherwise.");
11780
11781 static PyObject*
unicode_isupper(PyObject * self)11782 unicode_isupper(PyObject *self)
11783 {
11784 Py_ssize_t i, length;
11785 int kind;
11786 void *data;
11787 int cased;
11788
11789 if (PyUnicode_READY(self) == -1)
11790 return NULL;
11791 length = PyUnicode_GET_LENGTH(self);
11792 kind = PyUnicode_KIND(self);
11793 data = PyUnicode_DATA(self);
11794
11795 /* Shortcut for single character strings */
11796 if (length == 1)
11797 return PyBool_FromLong(
11798 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11799
11800 /* Special case for empty strings */
11801 if (length == 0)
11802 return PyBool_FromLong(0);
11803
11804 cased = 0;
11805 for (i = 0; i < length; i++) {
11806 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11807
11808 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11809 return PyBool_FromLong(0);
11810 else if (!cased && Py_UNICODE_ISUPPER(ch))
11811 cased = 1;
11812 }
11813 return PyBool_FromLong(cased);
11814 }
11815
11816 PyDoc_STRVAR(istitle__doc__,
11817 "S.istitle() -> bool\n\
11818 \n\
11819 Return True if S is a titlecased string and there is at least one\n\
11820 character in S, i.e. upper- and titlecase characters may only\n\
11821 follow uncased characters and lowercase characters only cased ones.\n\
11822 Return False otherwise.");
11823
11824 static PyObject*
unicode_istitle(PyObject * self)11825 unicode_istitle(PyObject *self)
11826 {
11827 Py_ssize_t i, length;
11828 int kind;
11829 void *data;
11830 int cased, previous_is_cased;
11831
11832 if (PyUnicode_READY(self) == -1)
11833 return NULL;
11834 length = PyUnicode_GET_LENGTH(self);
11835 kind = PyUnicode_KIND(self);
11836 data = PyUnicode_DATA(self);
11837
11838 /* Shortcut for single character strings */
11839 if (length == 1) {
11840 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11841 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11842 (Py_UNICODE_ISUPPER(ch) != 0));
11843 }
11844
11845 /* Special case for empty strings */
11846 if (length == 0)
11847 return PyBool_FromLong(0);
11848
11849 cased = 0;
11850 previous_is_cased = 0;
11851 for (i = 0; i < length; i++) {
11852 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11853
11854 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11855 if (previous_is_cased)
11856 return PyBool_FromLong(0);
11857 previous_is_cased = 1;
11858 cased = 1;
11859 }
11860 else if (Py_UNICODE_ISLOWER(ch)) {
11861 if (!previous_is_cased)
11862 return PyBool_FromLong(0);
11863 previous_is_cased = 1;
11864 cased = 1;
11865 }
11866 else
11867 previous_is_cased = 0;
11868 }
11869 return PyBool_FromLong(cased);
11870 }
11871
11872 PyDoc_STRVAR(isspace__doc__,
11873 "S.isspace() -> bool\n\
11874 \n\
11875 Return True if all characters in S are whitespace\n\
11876 and there is at least one character in S, False otherwise.");
11877
11878 static PyObject*
unicode_isspace(PyObject * self)11879 unicode_isspace(PyObject *self)
11880 {
11881 Py_ssize_t i, length;
11882 int kind;
11883 void *data;
11884
11885 if (PyUnicode_READY(self) == -1)
11886 return NULL;
11887 length = PyUnicode_GET_LENGTH(self);
11888 kind = PyUnicode_KIND(self);
11889 data = PyUnicode_DATA(self);
11890
11891 /* Shortcut for single character strings */
11892 if (length == 1)
11893 return PyBool_FromLong(
11894 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11895
11896 /* Special case for empty strings */
11897 if (length == 0)
11898 return PyBool_FromLong(0);
11899
11900 for (i = 0; i < length; i++) {
11901 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11902 if (!Py_UNICODE_ISSPACE(ch))
11903 return PyBool_FromLong(0);
11904 }
11905 return PyBool_FromLong(1);
11906 }
11907
11908 PyDoc_STRVAR(isalpha__doc__,
11909 "S.isalpha() -> bool\n\
11910 \n\
11911 Return True if all characters in S are alphabetic\n\
11912 and there is at least one character in S, False otherwise.");
11913
11914 static PyObject*
unicode_isalpha(PyObject * self)11915 unicode_isalpha(PyObject *self)
11916 {
11917 Py_ssize_t i, length;
11918 int kind;
11919 void *data;
11920
11921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923 length = PyUnicode_GET_LENGTH(self);
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
11926
11927 /* Shortcut for single character strings */
11928 if (length == 1)
11929 return PyBool_FromLong(
11930 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11931
11932 /* Special case for empty strings */
11933 if (length == 0)
11934 return PyBool_FromLong(0);
11935
11936 for (i = 0; i < length; i++) {
11937 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11938 return PyBool_FromLong(0);
11939 }
11940 return PyBool_FromLong(1);
11941 }
11942
11943 PyDoc_STRVAR(isalnum__doc__,
11944 "S.isalnum() -> bool\n\
11945 \n\
11946 Return True if all characters in S are alphanumeric\n\
11947 and there is at least one character in S, False otherwise.");
11948
11949 static PyObject*
unicode_isalnum(PyObject * self)11950 unicode_isalnum(PyObject *self)
11951 {
11952 int kind;
11953 void *data;
11954 Py_ssize_t len, i;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
11961 len = PyUnicode_GET_LENGTH(self);
11962
11963 /* Shortcut for single character strings */
11964 if (len == 1) {
11965 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11966 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11967 }
11968
11969 /* Special case for empty strings */
11970 if (len == 0)
11971 return PyBool_FromLong(0);
11972
11973 for (i = 0; i < len; i++) {
11974 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11975 if (!Py_UNICODE_ISALNUM(ch))
11976 return PyBool_FromLong(0);
11977 }
11978 return PyBool_FromLong(1);
11979 }
11980
11981 PyDoc_STRVAR(isdecimal__doc__,
11982 "S.isdecimal() -> bool\n\
11983 \n\
11984 Return True if there are only decimal characters in S,\n\
11985 False otherwise.");
11986
11987 static PyObject*
unicode_isdecimal(PyObject * self)11988 unicode_isdecimal(PyObject *self)
11989 {
11990 Py_ssize_t i, length;
11991 int kind;
11992 void *data;
11993
11994 if (PyUnicode_READY(self) == -1)
11995 return NULL;
11996 length = PyUnicode_GET_LENGTH(self);
11997 kind = PyUnicode_KIND(self);
11998 data = PyUnicode_DATA(self);
11999
12000 /* Shortcut for single character strings */
12001 if (length == 1)
12002 return PyBool_FromLong(
12003 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12004
12005 /* Special case for empty strings */
12006 if (length == 0)
12007 return PyBool_FromLong(0);
12008
12009 for (i = 0; i < length; i++) {
12010 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12011 return PyBool_FromLong(0);
12012 }
12013 return PyBool_FromLong(1);
12014 }
12015
12016 PyDoc_STRVAR(isdigit__doc__,
12017 "S.isdigit() -> bool\n\
12018 \n\
12019 Return True if all characters in S are digits\n\
12020 and there is at least one character in S, False otherwise.");
12021
12022 static PyObject*
unicode_isdigit(PyObject * self)12023 unicode_isdigit(PyObject *self)
12024 {
12025 Py_ssize_t i, length;
12026 int kind;
12027 void *data;
12028
12029 if (PyUnicode_READY(self) == -1)
12030 return NULL;
12031 length = PyUnicode_GET_LENGTH(self);
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_DATA(self);
12034
12035 /* Shortcut for single character strings */
12036 if (length == 1) {
12037 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12038 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12039 }
12040
12041 /* Special case for empty strings */
12042 if (length == 0)
12043 return PyBool_FromLong(0);
12044
12045 for (i = 0; i < length; i++) {
12046 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12047 return PyBool_FromLong(0);
12048 }
12049 return PyBool_FromLong(1);
12050 }
12051
12052 PyDoc_STRVAR(isnumeric__doc__,
12053 "S.isnumeric() -> bool\n\
12054 \n\
12055 Return True if there are only numeric characters in S,\n\
12056 False otherwise.");
12057
12058 static PyObject*
unicode_isnumeric(PyObject * self)12059 unicode_isnumeric(PyObject *self)
12060 {
12061 Py_ssize_t i, length;
12062 int kind;
12063 void *data;
12064
12065 if (PyUnicode_READY(self) == -1)
12066 return NULL;
12067 length = PyUnicode_GET_LENGTH(self);
12068 kind = PyUnicode_KIND(self);
12069 data = PyUnicode_DATA(self);
12070
12071 /* Shortcut for single character strings */
12072 if (length == 1)
12073 return PyBool_FromLong(
12074 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12075
12076 /* Special case for empty strings */
12077 if (length == 0)
12078 return PyBool_FromLong(0);
12079
12080 for (i = 0; i < length; i++) {
12081 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12082 return PyBool_FromLong(0);
12083 }
12084 return PyBool_FromLong(1);
12085 }
12086
12087 int
PyUnicode_IsIdentifier(PyObject * self)12088 PyUnicode_IsIdentifier(PyObject *self)
12089 {
12090 int kind;
12091 void *data;
12092 Py_ssize_t i;
12093 Py_UCS4 first;
12094
12095 if (PyUnicode_READY(self) == -1) {
12096 Py_FatalError("identifier not ready");
12097 return 0;
12098 }
12099
12100 /* Special case for empty strings */
12101 if (PyUnicode_GET_LENGTH(self) == 0)
12102 return 0;
12103 kind = PyUnicode_KIND(self);
12104 data = PyUnicode_DATA(self);
12105
12106 /* PEP 3131 says that the first character must be in
12107 XID_Start and subsequent characters in XID_Continue,
12108 and for the ASCII range, the 2.x rules apply (i.e
12109 start with letters and underscore, continue with
12110 letters, digits, underscore). However, given the current
12111 definition of XID_Start and XID_Continue, it is sufficient
12112 to check just for these, except that _ must be allowed
12113 as starting an identifier. */
12114 first = PyUnicode_READ(kind, data, 0);
12115 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12116 return 0;
12117
12118 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12119 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12120 return 0;
12121 return 1;
12122 }
12123
12124 PyDoc_STRVAR(isidentifier__doc__,
12125 "S.isidentifier() -> bool\n\
12126 \n\
12127 Return True if S is a valid identifier according\n\
12128 to the language definition.\n\
12129 \n\
12130 Use keyword.iskeyword() to test for reserved identifiers\n\
12131 such as \"def\" and \"class\".\n");
12132
12133 static PyObject*
unicode_isidentifier(PyObject * self)12134 unicode_isidentifier(PyObject *self)
12135 {
12136 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12137 }
12138
12139 PyDoc_STRVAR(isprintable__doc__,
12140 "S.isprintable() -> bool\n\
12141 \n\
12142 Return True if all characters in S are considered\n\
12143 printable in repr() or S is empty, False otherwise.");
12144
12145 static PyObject*
unicode_isprintable(PyObject * self)12146 unicode_isprintable(PyObject *self)
12147 {
12148 Py_ssize_t i, length;
12149 int kind;
12150 void *data;
12151
12152 if (PyUnicode_READY(self) == -1)
12153 return NULL;
12154 length = PyUnicode_GET_LENGTH(self);
12155 kind = PyUnicode_KIND(self);
12156 data = PyUnicode_DATA(self);
12157
12158 /* Shortcut for single character strings */
12159 if (length == 1)
12160 return PyBool_FromLong(
12161 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12162
12163 for (i = 0; i < length; i++) {
12164 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12165 Py_RETURN_FALSE;
12166 }
12167 }
12168 Py_RETURN_TRUE;
12169 }
12170
12171 PyDoc_STRVAR(join__doc__,
12172 "S.join(iterable) -> str\n\
12173 \n\
12174 Return a string which is the concatenation of the strings in the\n\
12175 iterable. The separator between elements is S.");
12176
12177 static PyObject*
unicode_join(PyObject * self,PyObject * data)12178 unicode_join(PyObject *self, PyObject *data)
12179 {
12180 return PyUnicode_Join(self, data);
12181 }
12182
12183 static Py_ssize_t
unicode_length(PyObject * self)12184 unicode_length(PyObject *self)
12185 {
12186 if (PyUnicode_READY(self) == -1)
12187 return -1;
12188 return PyUnicode_GET_LENGTH(self);
12189 }
12190
12191 PyDoc_STRVAR(ljust__doc__,
12192 "S.ljust(width[, fillchar]) -> str\n\
12193 \n\
12194 Return S left-justified in a Unicode string of length width. Padding is\n\
12195 done using the specified fill character (default is a space).");
12196
12197 static PyObject *
unicode_ljust(PyObject * self,PyObject * args)12198 unicode_ljust(PyObject *self, PyObject *args)
12199 {
12200 Py_ssize_t width;
12201 Py_UCS4 fillchar = ' ';
12202
12203 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
12204 return NULL;
12205
12206 if (PyUnicode_READY(self) == -1)
12207 return NULL;
12208
12209 if (PyUnicode_GET_LENGTH(self) >= width)
12210 return unicode_result_unchanged(self);
12211
12212 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12213 }
12214
12215 PyDoc_STRVAR(lower__doc__,
12216 "S.lower() -> str\n\
12217 \n\
12218 Return a copy of the string S converted to lowercase.");
12219
12220 static PyObject*
unicode_lower(PyObject * self)12221 unicode_lower(PyObject *self)
12222 {
12223 if (PyUnicode_READY(self) == -1)
12224 return NULL;
12225 if (PyUnicode_IS_ASCII(self))
12226 return ascii_upper_or_lower(self, 1);
12227 return case_operation(self, do_lower);
12228 }
12229
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument-parsing format string for each strip mode, indexed by the
   mode constants above. */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
12238
12239 /* externally visible for str.strip(unicode) */
12240 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12241 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12242 {
12243 void *data;
12244 int kind;
12245 Py_ssize_t i, j, len;
12246 BLOOM_MASK sepmask;
12247 Py_ssize_t seplen;
12248
12249 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12250 return NULL;
12251
12252 kind = PyUnicode_KIND(self);
12253 data = PyUnicode_DATA(self);
12254 len = PyUnicode_GET_LENGTH(self);
12255 seplen = PyUnicode_GET_LENGTH(sepobj);
12256 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12257 PyUnicode_DATA(sepobj),
12258 seplen);
12259
12260 i = 0;
12261 if (striptype != RIGHTSTRIP) {
12262 while (i < len) {
12263 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12264 if (!BLOOM(sepmask, ch))
12265 break;
12266 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12267 break;
12268 i++;
12269 }
12270 }
12271
12272 j = len;
12273 if (striptype != LEFTSTRIP) {
12274 j--;
12275 while (j >= i) {
12276 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12277 if (!BLOOM(sepmask, ch))
12278 break;
12279 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12280 break;
12281 j--;
12282 }
12283
12284 j++;
12285 }
12286
12287 return PyUnicode_Substring(self, i, j);
12288 }
12289
12290 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12291 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12292 {
12293 unsigned char *data;
12294 int kind;
12295 Py_ssize_t length;
12296
12297 if (PyUnicode_READY(self) == -1)
12298 return NULL;
12299
12300 length = PyUnicode_GET_LENGTH(self);
12301 end = Py_MIN(end, length);
12302
12303 if (start == 0 && end == length)
12304 return unicode_result_unchanged(self);
12305
12306 if (start < 0 || end < 0) {
12307 PyErr_SetString(PyExc_IndexError, "string index out of range");
12308 return NULL;
12309 }
12310 if (start >= length || end < start)
12311 _Py_RETURN_UNICODE_EMPTY();
12312
12313 length = end - start;
12314 if (PyUnicode_IS_ASCII(self)) {
12315 data = PyUnicode_1BYTE_DATA(self);
12316 return _PyUnicode_FromASCII((char*)(data + start), length);
12317 }
12318 else {
12319 kind = PyUnicode_KIND(self);
12320 data = PyUnicode_1BYTE_DATA(self);
12321 return PyUnicode_FromKindAndData(kind,
12322 data + kind * start,
12323 length);
12324 }
12325 }
12326
12327 static PyObject *
do_strip(PyObject * self,int striptype)12328 do_strip(PyObject *self, int striptype)
12329 {
12330 Py_ssize_t len, i, j;
12331
12332 if (PyUnicode_READY(self) == -1)
12333 return NULL;
12334
12335 len = PyUnicode_GET_LENGTH(self);
12336
12337 if (PyUnicode_IS_ASCII(self)) {
12338 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12339
12340 i = 0;
12341 if (striptype != RIGHTSTRIP) {
12342 while (i < len) {
12343 Py_UCS1 ch = data[i];
12344 if (!_Py_ascii_whitespace[ch])
12345 break;
12346 i++;
12347 }
12348 }
12349
12350 j = len;
12351 if (striptype != LEFTSTRIP) {
12352 j--;
12353 while (j >= i) {
12354 Py_UCS1 ch = data[j];
12355 if (!_Py_ascii_whitespace[ch])
12356 break;
12357 j--;
12358 }
12359 j++;
12360 }
12361 }
12362 else {
12363 int kind = PyUnicode_KIND(self);
12364 void *data = PyUnicode_DATA(self);
12365
12366 i = 0;
12367 if (striptype != RIGHTSTRIP) {
12368 while (i < len) {
12369 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12370 if (!Py_UNICODE_ISSPACE(ch))
12371 break;
12372 i++;
12373 }
12374 }
12375
12376 j = len;
12377 if (striptype != LEFTSTRIP) {
12378 j--;
12379 while (j >= i) {
12380 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12381 if (!Py_UNICODE_ISSPACE(ch))
12382 break;
12383 j--;
12384 }
12385 j++;
12386 }
12387 }
12388
12389 return PyUnicode_Substring(self, i, j);
12390 }
12391
12392
12393 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * args)12394 do_argstrip(PyObject *self, int striptype, PyObject *args)
12395 {
12396 PyObject *sep = NULL;
12397
12398 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12399 return NULL;
12400
12401 if (sep != NULL && sep != Py_None) {
12402 if (PyUnicode_Check(sep))
12403 return _PyUnicode_XStrip(self, striptype, sep);
12404 else {
12405 PyErr_Format(PyExc_TypeError,
12406 "%s arg must be None or str",
12407 STRIPNAME(striptype));
12408 return NULL;
12409 }
12410 }
12411
12412 return do_strip(self, striptype);
12413 }
12414
12415
12416 PyDoc_STRVAR(strip__doc__,
12417 "S.strip([chars]) -> str\n\
12418 \n\
12419 Return a copy of the string S with leading and trailing\n\
12420 whitespace removed.\n\
12421 If chars is given and not None, remove characters in chars instead.");
12422
12423 static PyObject *
unicode_strip(PyObject * self,PyObject * args)12424 unicode_strip(PyObject *self, PyObject *args)
12425 {
12426 if (PyTuple_GET_SIZE(args) == 0)
12427 return do_strip(self, BOTHSTRIP); /* Common case */
12428 else
12429 return do_argstrip(self, BOTHSTRIP, args);
12430 }
12431
12432
12433 PyDoc_STRVAR(lstrip__doc__,
12434 "S.lstrip([chars]) -> str\n\
12435 \n\
12436 Return a copy of the string S with leading whitespace removed.\n\
12437 If chars is given and not None, remove characters in chars instead.");
12438
12439 static PyObject *
unicode_lstrip(PyObject * self,PyObject * args)12440 unicode_lstrip(PyObject *self, PyObject *args)
12441 {
12442 if (PyTuple_GET_SIZE(args) == 0)
12443 return do_strip(self, LEFTSTRIP); /* Common case */
12444 else
12445 return do_argstrip(self, LEFTSTRIP, args);
12446 }
12447
12448
12449 PyDoc_STRVAR(rstrip__doc__,
12450 "S.rstrip([chars]) -> str\n\
12451 \n\
12452 Return a copy of the string S with trailing whitespace removed.\n\
12453 If chars is given and not None, remove characters in chars instead.");
12454
12455 static PyObject *
unicode_rstrip(PyObject * self,PyObject * args)12456 unicode_rstrip(PyObject *self, PyObject *args)
12457 {
12458 if (PyTuple_GET_SIZE(args) == 0)
12459 return do_strip(self, RIGHTSTRIP); /* Common case */
12460 else
12461 return do_argstrip(self, RIGHTSTRIP, args);
12462 }
12463
12464
12465 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12466 unicode_repeat(PyObject *str, Py_ssize_t len)
12467 {
12468 PyObject *u;
12469 Py_ssize_t nchars, n;
12470
12471 if (len < 1)
12472 _Py_RETURN_UNICODE_EMPTY();
12473
12474 /* no repeat, return original string */
12475 if (len == 1)
12476 return unicode_result_unchanged(str);
12477
12478 if (PyUnicode_READY(str) == -1)
12479 return NULL;
12480
12481 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12482 PyErr_SetString(PyExc_OverflowError,
12483 "repeated string is too long");
12484 return NULL;
12485 }
12486 nchars = len * PyUnicode_GET_LENGTH(str);
12487
12488 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12489 if (!u)
12490 return NULL;
12491 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12492
12493 if (PyUnicode_GET_LENGTH(str) == 1) {
12494 const int kind = PyUnicode_KIND(str);
12495 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12496 if (kind == PyUnicode_1BYTE_KIND) {
12497 void *to = PyUnicode_DATA(u);
12498 memset(to, (unsigned char)fill_char, len);
12499 }
12500 else if (kind == PyUnicode_2BYTE_KIND) {
12501 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12502 for (n = 0; n < len; ++n)
12503 ucs2[n] = fill_char;
12504 } else {
12505 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12506 assert(kind == PyUnicode_4BYTE_KIND);
12507 for (n = 0; n < len; ++n)
12508 ucs4[n] = fill_char;
12509 }
12510 }
12511 else {
12512 /* number of characters copied this far */
12513 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12514 const Py_ssize_t char_size = PyUnicode_KIND(str);
12515 char *to = (char *) PyUnicode_DATA(u);
12516 memcpy(to, PyUnicode_DATA(str),
12517 PyUnicode_GET_LENGTH(str) * char_size);
12518 while (done < nchars) {
12519 n = (done <= nchars-done) ? done : nchars-done;
12520 memcpy(to + (done * char_size), to, n * char_size);
12521 done += n;
12522 }
12523 }
12524
12525 assert(_PyUnicode_CheckConsistency(u, 1));
12526 return u;
12527 }
12528
12529 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12530 PyUnicode_Replace(PyObject *str,
12531 PyObject *substr,
12532 PyObject *replstr,
12533 Py_ssize_t maxcount)
12534 {
12535 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12536 ensure_unicode(replstr) < 0)
12537 return NULL;
12538 return replace(str, substr, replstr, maxcount);
12539 }
12540
12541 PyDoc_STRVAR(replace__doc__,
12542 "S.replace(old, new[, count]) -> str\n\
12543 \n\
12544 Return a copy of S with all occurrences of substring\n\
12545 old replaced by new. If the optional argument count is\n\
12546 given, only the first count occurrences are replaced.");
12547
12548 static PyObject*
unicode_replace(PyObject * self,PyObject * args)12549 unicode_replace(PyObject *self, PyObject *args)
12550 {
12551 PyObject *str1;
12552 PyObject *str2;
12553 Py_ssize_t maxcount = -1;
12554
12555 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
12556 return NULL;
12557 if (PyUnicode_READY(self) == -1)
12558 return NULL;
12559 return replace(self, str1, str2, maxcount);
12560 }
12561
12562 static PyObject *
unicode_repr(PyObject * unicode)12563 unicode_repr(PyObject *unicode)
12564 {
12565 PyObject *repr;
12566 Py_ssize_t isize;
12567 Py_ssize_t osize, squote, dquote, i, o;
12568 Py_UCS4 max, quote;
12569 int ikind, okind, unchanged;
12570 void *idata, *odata;
12571
12572 if (PyUnicode_READY(unicode) == -1)
12573 return NULL;
12574
12575 isize = PyUnicode_GET_LENGTH(unicode);
12576 idata = PyUnicode_DATA(unicode);
12577
12578 /* Compute length of output, quote characters, and
12579 maximum character */
12580 osize = 0;
12581 max = 127;
12582 squote = dquote = 0;
12583 ikind = PyUnicode_KIND(unicode);
12584 for (i = 0; i < isize; i++) {
12585 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12586 Py_ssize_t incr = 1;
12587 switch (ch) {
12588 case '\'': squote++; break;
12589 case '"': dquote++; break;
12590 case '\\': case '\t': case '\r': case '\n':
12591 incr = 2;
12592 break;
12593 default:
12594 /* Fast-path ASCII */
12595 if (ch < ' ' || ch == 0x7f)
12596 incr = 4; /* \xHH */
12597 else if (ch < 0x7f)
12598 ;
12599 else if (Py_UNICODE_ISPRINTABLE(ch))
12600 max = ch > max ? ch : max;
12601 else if (ch < 0x100)
12602 incr = 4; /* \xHH */
12603 else if (ch < 0x10000)
12604 incr = 6; /* \uHHHH */
12605 else
12606 incr = 10; /* \uHHHHHHHH */
12607 }
12608 if (osize > PY_SSIZE_T_MAX - incr) {
12609 PyErr_SetString(PyExc_OverflowError,
12610 "string is too long to generate repr");
12611 return NULL;
12612 }
12613 osize += incr;
12614 }
12615
12616 quote = '\'';
12617 unchanged = (osize == isize);
12618 if (squote) {
12619 unchanged = 0;
12620 if (dquote)
12621 /* Both squote and dquote present. Use squote,
12622 and escape them */
12623 osize += squote;
12624 else
12625 quote = '"';
12626 }
12627 osize += 2; /* quotes */
12628
12629 repr = PyUnicode_New(osize, max);
12630 if (repr == NULL)
12631 return NULL;
12632 okind = PyUnicode_KIND(repr);
12633 odata = PyUnicode_DATA(repr);
12634
12635 PyUnicode_WRITE(okind, odata, 0, quote);
12636 PyUnicode_WRITE(okind, odata, osize-1, quote);
12637 if (unchanged) {
12638 _PyUnicode_FastCopyCharacters(repr, 1,
12639 unicode, 0,
12640 isize);
12641 }
12642 else {
12643 for (i = 0, o = 1; i < isize; i++) {
12644 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12645
12646 /* Escape quotes and backslashes */
12647 if ((ch == quote) || (ch == '\\')) {
12648 PyUnicode_WRITE(okind, odata, o++, '\\');
12649 PyUnicode_WRITE(okind, odata, o++, ch);
12650 continue;
12651 }
12652
12653 /* Map special whitespace to '\t', \n', '\r' */
12654 if (ch == '\t') {
12655 PyUnicode_WRITE(okind, odata, o++, '\\');
12656 PyUnicode_WRITE(okind, odata, o++, 't');
12657 }
12658 else if (ch == '\n') {
12659 PyUnicode_WRITE(okind, odata, o++, '\\');
12660 PyUnicode_WRITE(okind, odata, o++, 'n');
12661 }
12662 else if (ch == '\r') {
12663 PyUnicode_WRITE(okind, odata, o++, '\\');
12664 PyUnicode_WRITE(okind, odata, o++, 'r');
12665 }
12666
12667 /* Map non-printable US ASCII to '\xhh' */
12668 else if (ch < ' ' || ch == 0x7F) {
12669 PyUnicode_WRITE(okind, odata, o++, '\\');
12670 PyUnicode_WRITE(okind, odata, o++, 'x');
12671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12672 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12673 }
12674
12675 /* Copy ASCII characters as-is */
12676 else if (ch < 0x7F) {
12677 PyUnicode_WRITE(okind, odata, o++, ch);
12678 }
12679
12680 /* Non-ASCII characters */
12681 else {
12682 /* Map Unicode whitespace and control characters
12683 (categories Z* and C* except ASCII space)
12684 */
12685 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12686 PyUnicode_WRITE(okind, odata, o++, '\\');
12687 /* Map 8-bit characters to '\xhh' */
12688 if (ch <= 0xff) {
12689 PyUnicode_WRITE(okind, odata, o++, 'x');
12690 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12691 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12692 }
12693 /* Map 16-bit characters to '\uxxxx' */
12694 else if (ch <= 0xffff) {
12695 PyUnicode_WRITE(okind, odata, o++, 'u');
12696 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12697 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12698 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12699 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12700 }
12701 /* Map 21-bit characters to '\U00xxxxxx' */
12702 else {
12703 PyUnicode_WRITE(okind, odata, o++, 'U');
12704 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12705 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12706 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12707 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12708 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12709 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12710 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12711 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12712 }
12713 }
12714 /* Copy characters as-is */
12715 else {
12716 PyUnicode_WRITE(okind, odata, o++, ch);
12717 }
12718 }
12719 }
12720 }
12721 /* Closing quote already added at the beginning */
12722 assert(_PyUnicode_CheckConsistency(repr, 1));
12723 return repr;
12724 }
12725
12726 PyDoc_STRVAR(rfind__doc__,
12727 "S.rfind(sub[, start[, end]]) -> int\n\
12728 \n\
12729 Return the highest index in S where substring sub is found,\n\
12730 such that sub is contained within S[start:end]. Optional\n\
12731 arguments start and end are interpreted as in slice notation.\n\
12732 \n\
12733 Return -1 on failure.");
12734
12735 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12736 unicode_rfind(PyObject *self, PyObject *args)
12737 {
12738 /* initialize variables to prevent gcc warning */
12739 PyObject *substring = NULL;
12740 Py_ssize_t start = 0;
12741 Py_ssize_t end = 0;
12742 Py_ssize_t result;
12743
12744 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12745 return NULL;
12746
12747 if (PyUnicode_READY(self) == -1)
12748 return NULL;
12749
12750 result = any_find_slice(self, substring, start, end, -1);
12751
12752 if (result == -2)
12753 return NULL;
12754
12755 return PyLong_FromSsize_t(result);
12756 }
12757
12758 PyDoc_STRVAR(rindex__doc__,
12759 "S.rindex(sub[, start[, end]]) -> int\n\
12760 \n\
12761 Like S.rfind() but raise ValueError when the substring is not found.");
12762
12763 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12764 unicode_rindex(PyObject *self, PyObject *args)
12765 {
12766 /* initialize variables to prevent gcc warning */
12767 PyObject *substring = NULL;
12768 Py_ssize_t start = 0;
12769 Py_ssize_t end = 0;
12770 Py_ssize_t result;
12771
12772 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12773 return NULL;
12774
12775 if (PyUnicode_READY(self) == -1)
12776 return NULL;
12777
12778 result = any_find_slice(self, substring, start, end, -1);
12779
12780 if (result == -2)
12781 return NULL;
12782
12783 if (result < 0) {
12784 PyErr_SetString(PyExc_ValueError, "substring not found");
12785 return NULL;
12786 }
12787
12788 return PyLong_FromSsize_t(result);
12789 }
12790
12791 PyDoc_STRVAR(rjust__doc__,
12792 "S.rjust(width[, fillchar]) -> str\n\
12793 \n\
12794 Return S right-justified in a string of length width. Padding is\n\
12795 done using the specified fill character (default is a space).");
12796
12797 static PyObject *
unicode_rjust(PyObject * self,PyObject * args)12798 unicode_rjust(PyObject *self, PyObject *args)
12799 {
12800 Py_ssize_t width;
12801 Py_UCS4 fillchar = ' ';
12802
12803 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12804 return NULL;
12805
12806 if (PyUnicode_READY(self) == -1)
12807 return NULL;
12808
12809 if (PyUnicode_GET_LENGTH(self) >= width)
12810 return unicode_result_unchanged(self);
12811
12812 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12813 }
12814
12815 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12816 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12817 {
12818 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12819 return NULL;
12820
12821 return split(s, sep, maxsplit);
12822 }
12823
12824 PyDoc_STRVAR(split__doc__,
12825 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12826 \n\
12827 Return a list of the words in S, using sep as the\n\
12828 delimiter string. If maxsplit is given, at most maxsplit\n\
12829 splits are done. If sep is not specified or is None, any\n\
12830 whitespace string is a separator and empty strings are\n\
12831 removed from the result.");
12832
12833 static PyObject*
unicode_split(PyObject * self,PyObject * args,PyObject * kwds)12834 unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12835 {
12836 static char *kwlist[] = {"sep", "maxsplit", 0};
12837 PyObject *substring = Py_None;
12838 Py_ssize_t maxcount = -1;
12839
12840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12841 kwlist, &substring, &maxcount))
12842 return NULL;
12843
12844 if (substring == Py_None)
12845 return split(self, NULL, maxcount);
12846
12847 if (PyUnicode_Check(substring))
12848 return split(self, substring, maxcount);
12849
12850 PyErr_Format(PyExc_TypeError,
12851 "must be str or None, not %.100s",
12852 Py_TYPE(substring)->tp_name);
12853 return NULL;
12854 }
12855
12856 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12857 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12858 {
12859 PyObject* out;
12860 int kind1, kind2;
12861 void *buf1, *buf2;
12862 Py_ssize_t len1, len2;
12863
12864 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12865 return NULL;
12866
12867 kind1 = PyUnicode_KIND(str_obj);
12868 kind2 = PyUnicode_KIND(sep_obj);
12869 len1 = PyUnicode_GET_LENGTH(str_obj);
12870 len2 = PyUnicode_GET_LENGTH(sep_obj);
12871 if (kind1 < kind2 || len1 < len2) {
12872 _Py_INCREF_UNICODE_EMPTY();
12873 if (!unicode_empty)
12874 out = NULL;
12875 else {
12876 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12877 Py_DECREF(unicode_empty);
12878 }
12879 return out;
12880 }
12881 buf1 = PyUnicode_DATA(str_obj);
12882 buf2 = PyUnicode_DATA(sep_obj);
12883 if (kind2 != kind1) {
12884 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12885 if (!buf2)
12886 return NULL;
12887 }
12888
12889 switch (kind1) {
12890 case PyUnicode_1BYTE_KIND:
12891 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12892 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12893 else
12894 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12895 break;
12896 case PyUnicode_2BYTE_KIND:
12897 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12898 break;
12899 case PyUnicode_4BYTE_KIND:
12900 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12901 break;
12902 default:
12903 assert(0);
12904 out = 0;
12905 }
12906
12907 if (kind2 != kind1)
12908 PyMem_Free(buf2);
12909
12910 return out;
12911 }
12912
12913
12914 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12915 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12916 {
12917 PyObject* out;
12918 int kind1, kind2;
12919 void *buf1, *buf2;
12920 Py_ssize_t len1, len2;
12921
12922 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12923 return NULL;
12924
12925 kind1 = PyUnicode_KIND(str_obj);
12926 kind2 = PyUnicode_KIND(sep_obj);
12927 len1 = PyUnicode_GET_LENGTH(str_obj);
12928 len2 = PyUnicode_GET_LENGTH(sep_obj);
12929 if (kind1 < kind2 || len1 < len2) {
12930 _Py_INCREF_UNICODE_EMPTY();
12931 if (!unicode_empty)
12932 out = NULL;
12933 else {
12934 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12935 Py_DECREF(unicode_empty);
12936 }
12937 return out;
12938 }
12939 buf1 = PyUnicode_DATA(str_obj);
12940 buf2 = PyUnicode_DATA(sep_obj);
12941 if (kind2 != kind1) {
12942 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12943 if (!buf2)
12944 return NULL;
12945 }
12946
12947 switch (kind1) {
12948 case PyUnicode_1BYTE_KIND:
12949 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12950 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951 else
12952 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12953 break;
12954 case PyUnicode_2BYTE_KIND:
12955 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12956 break;
12957 case PyUnicode_4BYTE_KIND:
12958 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 default:
12961 assert(0);
12962 out = 0;
12963 }
12964
12965 if (kind2 != kind1)
12966 PyMem_Free(buf2);
12967
12968 return out;
12969 }
12970
12971 PyDoc_STRVAR(partition__doc__,
12972 "S.partition(sep) -> (head, sep, tail)\n\
12973 \n\
12974 Search for the separator sep in S, and return the part before it,\n\
12975 the separator itself, and the part after it. If the separator is not\n\
12976 found, return S and two empty strings.");
12977
12978 static PyObject*
unicode_partition(PyObject * self,PyObject * separator)12979 unicode_partition(PyObject *self, PyObject *separator)
12980 {
12981 return PyUnicode_Partition(self, separator);
12982 }
12983
12984 PyDoc_STRVAR(rpartition__doc__,
12985 "S.rpartition(sep) -> (head, sep, tail)\n\
12986 \n\
12987 Search for the separator sep in S, starting at the end of S, and return\n\
12988 the part before it, the separator itself, and the part after it. If the\n\
12989 separator is not found, return two empty strings and S.");
12990
12991 static PyObject*
unicode_rpartition(PyObject * self,PyObject * separator)12992 unicode_rpartition(PyObject *self, PyObject *separator)
12993 {
12994 return PyUnicode_RPartition(self, separator);
12995 }
12996
12997 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12998 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12999 {
13000 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13001 return NULL;
13002
13003 return rsplit(s, sep, maxsplit);
13004 }
13005
13006 PyDoc_STRVAR(rsplit__doc__,
13007 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
13008 \n\
13009 Return a list of the words in S, using sep as the\n\
13010 delimiter string, starting at the end of the string and\n\
13011 working to the front. If maxsplit is given, at most maxsplit\n\
13012 splits are done. If sep is not specified, any whitespace string\n\
13013 is a separator.");
13014
13015 static PyObject*
unicode_rsplit(PyObject * self,PyObject * args,PyObject * kwds)13016 unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
13017 {
13018 static char *kwlist[] = {"sep", "maxsplit", 0};
13019 PyObject *substring = Py_None;
13020 Py_ssize_t maxcount = -1;
13021
13022 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13023 kwlist, &substring, &maxcount))
13024 return NULL;
13025
13026 if (substring == Py_None)
13027 return rsplit(self, NULL, maxcount);
13028
13029 if (PyUnicode_Check(substring))
13030 return rsplit(self, substring, maxcount);
13031
13032 PyErr_Format(PyExc_TypeError,
13033 "must be str or None, not %.100s",
13034 Py_TYPE(substring)->tp_name);
13035 return NULL;
13036 }
13037
13038 PyDoc_STRVAR(splitlines__doc__,
13039 "S.splitlines([keepends]) -> list of strings\n\
13040 \n\
13041 Return a list of the lines in S, breaking at line boundaries.\n\
13042 Line breaks are not included in the resulting list unless keepends\n\
13043 is given and true.");
13044
13045 static PyObject*
unicode_splitlines(PyObject * self,PyObject * args,PyObject * kwds)13046 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
13047 {
13048 static char *kwlist[] = {"keepends", 0};
13049 int keepends = 0;
13050
13051 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13052 kwlist, &keepends))
13053 return NULL;
13054
13055 return PyUnicode_Splitlines(self, keepends);
13056 }
13057
13058 static
unicode_str(PyObject * self)13059 PyObject *unicode_str(PyObject *self)
13060 {
13061 return unicode_result_unchanged(self);
13062 }
13063
13064 PyDoc_STRVAR(swapcase__doc__,
13065 "S.swapcase() -> str\n\
13066 \n\
13067 Return a copy of S with uppercase characters converted to lowercase\n\
13068 and vice versa.");
13069
13070 static PyObject*
unicode_swapcase(PyObject * self)13071 unicode_swapcase(PyObject *self)
13072 {
13073 if (PyUnicode_READY(self) == -1)
13074 return NULL;
13075 return case_operation(self, do_swapcase);
13076 }
13077
13078 /*[clinic input]
13079
13080 @staticmethod
13081 str.maketrans as unicode_maketrans
13082
13083 x: object
13084
13085 y: unicode=NULL
13086
13087 z: unicode=NULL
13088
13089 /
13090
13091 Return a translation table usable for str.translate().
13092
13093 If there is only one argument, it must be a dictionary mapping Unicode
13094 ordinals (integers) or characters to Unicode ordinals, strings or None.
13095 Character keys will be then converted to ordinals.
13096 If there are two arguments, they must be strings of equal length, and
13097 in the resulting dictionary, each character in x will be mapped to the
13098 character at the same position in y. If there is a third argument, it
13099 must be a string, whose characters will be mapped to None in the result.
13100 [clinic start generated code]*/
13101
13102 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13103 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13104 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13105 {
13106 PyObject *new = NULL, *key, *value;
13107 Py_ssize_t i = 0;
13108 int res;
13109
13110 new = PyDict_New();
13111 if (!new)
13112 return NULL;
13113 if (y != NULL) {
13114 int x_kind, y_kind, z_kind;
13115 void *x_data, *y_data, *z_data;
13116
13117 /* x must be a string too, of equal length */
13118 if (!PyUnicode_Check(x)) {
13119 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13120 "be a string if there is a second argument");
13121 goto err;
13122 }
13123 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13124 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13125 "arguments must have equal length");
13126 goto err;
13127 }
13128 /* create entries for translating chars in x to those in y */
13129 x_kind = PyUnicode_KIND(x);
13130 y_kind = PyUnicode_KIND(y);
13131 x_data = PyUnicode_DATA(x);
13132 y_data = PyUnicode_DATA(y);
13133 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13134 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13135 if (!key)
13136 goto err;
13137 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13138 if (!value) {
13139 Py_DECREF(key);
13140 goto err;
13141 }
13142 res = PyDict_SetItem(new, key, value);
13143 Py_DECREF(key);
13144 Py_DECREF(value);
13145 if (res < 0)
13146 goto err;
13147 }
13148 /* create entries for deleting chars in z */
13149 if (z != NULL) {
13150 z_kind = PyUnicode_KIND(z);
13151 z_data = PyUnicode_DATA(z);
13152 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13153 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13154 if (!key)
13155 goto err;
13156 res = PyDict_SetItem(new, key, Py_None);
13157 Py_DECREF(key);
13158 if (res < 0)
13159 goto err;
13160 }
13161 }
13162 } else {
13163 int kind;
13164 void *data;
13165
13166 /* x must be a dict */
13167 if (!PyDict_CheckExact(x)) {
13168 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13169 "to maketrans it must be a dict");
13170 goto err;
13171 }
13172 /* copy entries into the new dict, converting string keys to int keys */
13173 while (PyDict_Next(x, &i, &key, &value)) {
13174 if (PyUnicode_Check(key)) {
13175 /* convert string keys to integer keys */
13176 PyObject *newkey;
13177 if (PyUnicode_GET_LENGTH(key) != 1) {
13178 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13179 "table must be of length 1");
13180 goto err;
13181 }
13182 kind = PyUnicode_KIND(key);
13183 data = PyUnicode_DATA(key);
13184 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13185 if (!newkey)
13186 goto err;
13187 res = PyDict_SetItem(new, newkey, value);
13188 Py_DECREF(newkey);
13189 if (res < 0)
13190 goto err;
13191 } else if (PyLong_Check(key)) {
13192 /* just keep integer keys */
13193 if (PyDict_SetItem(new, key, value) < 0)
13194 goto err;
13195 } else {
13196 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13197 "be strings or integers");
13198 goto err;
13199 }
13200 }
13201 }
13202 return new;
13203 err:
13204 Py_DECREF(new);
13205 return NULL;
13206 }
13207
13208 PyDoc_STRVAR(translate__doc__,
13209 "S.translate(table) -> str\n\
13210 \n\
13211 Return a copy of the string S in which each character has been mapped\n\
13212 through the given translation table. The table must implement\n\
13213 lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13214 mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13215 this operation raises LookupError, the character is left untouched.\n\
13216 Characters mapped to None are deleted.");
13217
13218 static PyObject*
unicode_translate(PyObject * self,PyObject * table)13219 unicode_translate(PyObject *self, PyObject *table)
13220 {
13221 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13222 }
13223
13224 PyDoc_STRVAR(upper__doc__,
13225 "S.upper() -> str\n\
13226 \n\
13227 Return a copy of S converted to uppercase.");
13228
13229 static PyObject*
unicode_upper(PyObject * self)13230 unicode_upper(PyObject *self)
13231 {
13232 if (PyUnicode_READY(self) == -1)
13233 return NULL;
13234 if (PyUnicode_IS_ASCII(self))
13235 return ascii_upper_or_lower(self, 0);
13236 return case_operation(self, do_upper);
13237 }
13238
13239 PyDoc_STRVAR(zfill__doc__,
13240 "S.zfill(width) -> str\n\
13241 \n\
13242 Pad a numeric string S with zeros on the left, to fill a field\n\
13243 of the specified width. The string S is never truncated.");
13244
13245 static PyObject *
unicode_zfill(PyObject * self,PyObject * args)13246 unicode_zfill(PyObject *self, PyObject *args)
13247 {
13248 Py_ssize_t fill;
13249 PyObject *u;
13250 Py_ssize_t width;
13251 int kind;
13252 void *data;
13253 Py_UCS4 chr;
13254
13255 if (!PyArg_ParseTuple(args, "n:zfill", &width))
13256 return NULL;
13257
13258 if (PyUnicode_READY(self) == -1)
13259 return NULL;
13260
13261 if (PyUnicode_GET_LENGTH(self) >= width)
13262 return unicode_result_unchanged(self);
13263
13264 fill = width - PyUnicode_GET_LENGTH(self);
13265
13266 u = pad(self, fill, 0, '0');
13267
13268 if (u == NULL)
13269 return NULL;
13270
13271 kind = PyUnicode_KIND(u);
13272 data = PyUnicode_DATA(u);
13273 chr = PyUnicode_READ(kind, data, fill);
13274
13275 if (chr == '+' || chr == '-') {
13276 /* move sign to beginning of string */
13277 PyUnicode_WRITE(kind, data, 0, chr);
13278 PyUnicode_WRITE(kind, data, fill, '0');
13279 }
13280
13281 assert(_PyUnicode_CheckConsistency(u, 1));
13282 return u;
13283 }
13284
#if 0
/* Disabled helper: the surrounding "#if 0" keeps it out of the build.
   It only forwards to PyUnicode_TransformDecimalAndSpaceToASCII() and
   returns that call's result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13292
13293 PyDoc_STRVAR(startswith__doc__,
13294 "S.startswith(prefix[, start[, end]]) -> bool\n\
13295 \n\
13296 Return True if S starts with the specified prefix, False otherwise.\n\
13297 With optional start, test S beginning at that position.\n\
13298 With optional end, stop comparing S at that position.\n\
13299 prefix can also be a tuple of strings to try.");
13300
13301 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13302 unicode_startswith(PyObject *self,
13303 PyObject *args)
13304 {
13305 PyObject *subobj;
13306 PyObject *substring;
13307 Py_ssize_t start = 0;
13308 Py_ssize_t end = PY_SSIZE_T_MAX;
13309 int result;
13310
13311 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13312 return NULL;
13313 if (PyTuple_Check(subobj)) {
13314 Py_ssize_t i;
13315 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13316 substring = PyTuple_GET_ITEM(subobj, i);
13317 if (!PyUnicode_Check(substring)) {
13318 PyErr_Format(PyExc_TypeError,
13319 "tuple for startswith must only contain str, "
13320 "not %.100s",
13321 Py_TYPE(substring)->tp_name);
13322 return NULL;
13323 }
13324 result = tailmatch(self, substring, start, end, -1);
13325 if (result == -1)
13326 return NULL;
13327 if (result) {
13328 Py_RETURN_TRUE;
13329 }
13330 }
13331 /* nothing matched */
13332 Py_RETURN_FALSE;
13333 }
13334 if (!PyUnicode_Check(subobj)) {
13335 PyErr_Format(PyExc_TypeError,
13336 "startswith first arg must be str or "
13337 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13338 return NULL;
13339 }
13340 result = tailmatch(self, subobj, start, end, -1);
13341 if (result == -1)
13342 return NULL;
13343 return PyBool_FromLong(result);
13344 }
13345
13346
13347 PyDoc_STRVAR(endswith__doc__,
13348 "S.endswith(suffix[, start[, end]]) -> bool\n\
13349 \n\
13350 Return True if S ends with the specified suffix, False otherwise.\n\
13351 With optional start, test S beginning at that position.\n\
13352 With optional end, stop comparing S at that position.\n\
13353 suffix can also be a tuple of strings to try.");
13354
13355 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13356 unicode_endswith(PyObject *self,
13357 PyObject *args)
13358 {
13359 PyObject *subobj;
13360 PyObject *substring;
13361 Py_ssize_t start = 0;
13362 Py_ssize_t end = PY_SSIZE_T_MAX;
13363 int result;
13364
13365 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13366 return NULL;
13367 if (PyTuple_Check(subobj)) {
13368 Py_ssize_t i;
13369 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13370 substring = PyTuple_GET_ITEM(subobj, i);
13371 if (!PyUnicode_Check(substring)) {
13372 PyErr_Format(PyExc_TypeError,
13373 "tuple for endswith must only contain str, "
13374 "not %.100s",
13375 Py_TYPE(substring)->tp_name);
13376 return NULL;
13377 }
13378 result = tailmatch(self, substring, start, end, +1);
13379 if (result == -1)
13380 return NULL;
13381 if (result) {
13382 Py_RETURN_TRUE;
13383 }
13384 }
13385 Py_RETURN_FALSE;
13386 }
13387 if (!PyUnicode_Check(subobj)) {
13388 PyErr_Format(PyExc_TypeError,
13389 "endswith first arg must be str or "
13390 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13391 return NULL;
13392 }
13393 result = tailmatch(self, subobj, start, end, +1);
13394 if (result == -1)
13395 return NULL;
13396 return PyBool_FromLong(result);
13397 }
13398
13399 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13400 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13401 {
13402 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13403 writer->data = PyUnicode_DATA(writer->buffer);
13404
13405 if (!writer->readonly) {
13406 writer->kind = PyUnicode_KIND(writer->buffer);
13407 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13408 }
13409 else {
13410 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13411 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13412 writer->kind = PyUnicode_WCHAR_KIND;
13413 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13414
13415 /* Copy-on-write mode: set buffer size to 0 so
13416 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13417 * next write. */
13418 writer->size = 0;
13419 }
13420 }
13421
13422 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13423 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13424 {
13425 memset(writer, 0, sizeof(*writer));
13426
13427 /* ASCII is the bare minimum */
13428 writer->min_char = 127;
13429
13430 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13431 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13432 writer->kind = PyUnicode_WCHAR_KIND;
13433 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13434 }
13435
13436 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13437 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13438 Py_ssize_t length, Py_UCS4 maxchar)
13439 {
13440 Py_ssize_t newlen;
13441 PyObject *newbuffer;
13442
13443 assert(maxchar <= MAX_UNICODE);
13444
13445 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13446 assert((maxchar > writer->maxchar && length >= 0)
13447 || length > 0);
13448
13449 if (length > PY_SSIZE_T_MAX - writer->pos) {
13450 PyErr_NoMemory();
13451 return -1;
13452 }
13453 newlen = writer->pos + length;
13454
13455 maxchar = Py_MAX(maxchar, writer->min_char);
13456
13457 if (writer->buffer == NULL) {
13458 assert(!writer->readonly);
13459 if (writer->overallocate
13460 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13461 /* overallocate to limit the number of realloc() */
13462 newlen += newlen / OVERALLOCATE_FACTOR;
13463 }
13464 if (newlen < writer->min_length)
13465 newlen = writer->min_length;
13466
13467 writer->buffer = PyUnicode_New(newlen, maxchar);
13468 if (writer->buffer == NULL)
13469 return -1;
13470 }
13471 else if (newlen > writer->size) {
13472 if (writer->overallocate
13473 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13474 /* overallocate to limit the number of realloc() */
13475 newlen += newlen / OVERALLOCATE_FACTOR;
13476 }
13477 if (newlen < writer->min_length)
13478 newlen = writer->min_length;
13479
13480 if (maxchar > writer->maxchar || writer->readonly) {
13481 /* resize + widen */
13482 maxchar = Py_MAX(maxchar, writer->maxchar);
13483 newbuffer = PyUnicode_New(newlen, maxchar);
13484 if (newbuffer == NULL)
13485 return -1;
13486 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13487 writer->buffer, 0, writer->pos);
13488 Py_DECREF(writer->buffer);
13489 writer->readonly = 0;
13490 }
13491 else {
13492 newbuffer = resize_compact(writer->buffer, newlen);
13493 if (newbuffer == NULL)
13494 return -1;
13495 }
13496 writer->buffer = newbuffer;
13497 }
13498 else if (maxchar > writer->maxchar) {
13499 assert(!writer->readonly);
13500 newbuffer = PyUnicode_New(writer->size, maxchar);
13501 if (newbuffer == NULL)
13502 return -1;
13503 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13504 writer->buffer, 0, writer->pos);
13505 Py_SETREF(writer->buffer, newbuffer);
13506 }
13507 _PyUnicodeWriter_Update(writer);
13508 return 0;
13509
13510 #undef OVERALLOCATE_FACTOR
13511 }
13512
13513 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13514 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13515 enum PyUnicode_Kind kind)
13516 {
13517 Py_UCS4 maxchar;
13518
13519 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13520 assert(writer->kind < kind);
13521
13522 switch (kind)
13523 {
13524 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13525 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13526 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13527 default:
13528 assert(0 && "invalid kind");
13529 return -1;
13530 }
13531
13532 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13533 }
13534
13535 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13536 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13537 {
13538 assert(ch <= MAX_UNICODE);
13539 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13540 return -1;
13541 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13542 writer->pos++;
13543 return 0;
13544 }
13545
13546 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13547 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13548 {
13549 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13550 }
13551
13552 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13553 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13554 {
13555 Py_UCS4 maxchar;
13556 Py_ssize_t len;
13557
13558 if (PyUnicode_READY(str) == -1)
13559 return -1;
13560 len = PyUnicode_GET_LENGTH(str);
13561 if (len == 0)
13562 return 0;
13563 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13564 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13565 if (writer->buffer == NULL && !writer->overallocate) {
13566 assert(_PyUnicode_CheckConsistency(str, 1));
13567 writer->readonly = 1;
13568 Py_INCREF(str);
13569 writer->buffer = str;
13570 _PyUnicodeWriter_Update(writer);
13571 writer->pos += len;
13572 return 0;
13573 }
13574 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13575 return -1;
13576 }
13577 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13578 str, 0, len);
13579 writer->pos += len;
13580 return 0;
13581 }
13582
13583 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13584 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13585 Py_ssize_t start, Py_ssize_t end)
13586 {
13587 Py_UCS4 maxchar;
13588 Py_ssize_t len;
13589
13590 if (PyUnicode_READY(str) == -1)
13591 return -1;
13592
13593 assert(0 <= start);
13594 assert(end <= PyUnicode_GET_LENGTH(str));
13595 assert(start <= end);
13596
13597 if (end == 0)
13598 return 0;
13599
13600 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13601 return _PyUnicodeWriter_WriteStr(writer, str);
13602
13603 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13604 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13605 else
13606 maxchar = writer->maxchar;
13607 len = end - start;
13608
13609 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13610 return -1;
13611
13612 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13613 str, start, len);
13614 writer->pos += len;
13615 return 0;
13616 }
13617
13618 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13619 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13620 const char *ascii, Py_ssize_t len)
13621 {
13622 if (len == -1)
13623 len = strlen(ascii);
13624
13625 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13626
13627 if (writer->buffer == NULL && !writer->overallocate) {
13628 PyObject *str;
13629
13630 str = _PyUnicode_FromASCII(ascii, len);
13631 if (str == NULL)
13632 return -1;
13633
13634 writer->readonly = 1;
13635 writer->buffer = str;
13636 _PyUnicodeWriter_Update(writer);
13637 writer->pos += len;
13638 return 0;
13639 }
13640
13641 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13642 return -1;
13643
13644 switch (writer->kind)
13645 {
13646 case PyUnicode_1BYTE_KIND:
13647 {
13648 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13649 Py_UCS1 *data = writer->data;
13650
13651 memcpy(data + writer->pos, str, len);
13652 break;
13653 }
13654 case PyUnicode_2BYTE_KIND:
13655 {
13656 _PyUnicode_CONVERT_BYTES(
13657 Py_UCS1, Py_UCS2,
13658 ascii, ascii + len,
13659 (Py_UCS2 *)writer->data + writer->pos);
13660 break;
13661 }
13662 case PyUnicode_4BYTE_KIND:
13663 {
13664 _PyUnicode_CONVERT_BYTES(
13665 Py_UCS1, Py_UCS4,
13666 ascii, ascii + len,
13667 (Py_UCS4 *)writer->data + writer->pos);
13668 break;
13669 }
13670 default:
13671 assert(0);
13672 }
13673
13674 writer->pos += len;
13675 return 0;
13676 }
13677
13678 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13679 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13680 const char *str, Py_ssize_t len)
13681 {
13682 Py_UCS4 maxchar;
13683
13684 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13685 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13686 return -1;
13687 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13688 writer->pos += len;
13689 return 0;
13690 }
13691
13692 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13693 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13694 {
13695 PyObject *str;
13696
13697 if (writer->pos == 0) {
13698 Py_CLEAR(writer->buffer);
13699 _Py_RETURN_UNICODE_EMPTY();
13700 }
13701
13702 str = writer->buffer;
13703 writer->buffer = NULL;
13704
13705 if (writer->readonly) {
13706 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13707 return str;
13708 }
13709
13710 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13711 PyObject *str2;
13712 str2 = resize_compact(str, writer->pos);
13713 if (str2 == NULL) {
13714 Py_DECREF(str);
13715 return NULL;
13716 }
13717 str = str2;
13718 }
13719
13720 assert(_PyUnicode_CheckConsistency(str, 1));
13721 return unicode_result_ready(str);
13722 }
13723
/* Release the writer's buffer, if any, and reset writer->buffer to NULL.
   Used on error paths in place of _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13729
13730 #include "stringlib/unicode_format.h"
13731
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13743
13744 static PyObject *
unicode__format__(PyObject * self,PyObject * args)13745 unicode__format__(PyObject* self, PyObject* args)
13746 {
13747 PyObject *format_spec;
13748 _PyUnicodeWriter writer;
13749 int ret;
13750
13751 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13752 return NULL;
13753
13754 if (PyUnicode_READY(self) == -1)
13755 return NULL;
13756 _PyUnicodeWriter_Init(&writer);
13757 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13758 self, format_spec, 0,
13759 PyUnicode_GET_LENGTH(format_spec));
13760 if (ret == -1) {
13761 _PyUnicodeWriter_Dealloc(&writer);
13762 return NULL;
13763 }
13764 return _PyUnicodeWriter_Finish(&writer);
13765 }
13766
13767 PyDoc_STRVAR(p_format__doc__,
13768 "S.__format__(format_spec) -> str\n\
13769 \n\
13770 Return a formatted version of S as described by format_spec.");
13771
13772 static PyObject *
unicode__sizeof__(PyObject * v)13773 unicode__sizeof__(PyObject *v)
13774 {
13775 Py_ssize_t size;
13776
13777 /* If it's a compact object, account for base structure +
13778 character data. */
13779 if (PyUnicode_IS_COMPACT_ASCII(v))
13780 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13781 else if (PyUnicode_IS_COMPACT(v))
13782 size = sizeof(PyCompactUnicodeObject) +
13783 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13784 else {
13785 /* If it is a two-block object, account for base object, and
13786 for character block if present. */
13787 size = sizeof(PyUnicodeObject);
13788 if (_PyUnicode_DATA_ANY(v))
13789 size += (PyUnicode_GET_LENGTH(v) + 1) *
13790 PyUnicode_KIND(v);
13791 }
13792 /* If the wstr pointer is present, account for it unless it is shared
13793 with the data pointer. Check if the data is not shared. */
13794 if (_PyUnicode_HAS_WSTR_MEMORY(v))
13795 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13796 if (_PyUnicode_HAS_UTF8_MEMORY(v))
13797 size += PyUnicode_UTF8_LENGTH(v) + 1;
13798
13799 return PyLong_FromSsize_t(size);
13800 }
13801
13802 PyDoc_STRVAR(sizeof__doc__,
13803 "S.__sizeof__() -> size of S in memory, in bytes");
13804
13805 static PyObject *
unicode_getnewargs(PyObject * v)13806 unicode_getnewargs(PyObject *v)
13807 {
13808 PyObject *copy = _PyUnicode_Copy(v);
13809 if (!copy)
13810 return NULL;
13811 return Py_BuildValue("(N)", copy);
13812 }
13813
/* Method table of the str type.  Each entry gives the Python-level name,
   the C function implementing it, its calling convention flags and its
   docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format and format_map are implemented by functions that are not
       defined in this part of the file. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The maketrans entry is supplied by a macro. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13871
13872 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13873 unicode_mod(PyObject *v, PyObject *w)
13874 {
13875 if (!PyUnicode_Check(v))
13876 Py_RETURN_NOTIMPLEMENTED;
13877 return PyUnicode_Format(v, w);
13878 }
13879
/* Number protocol slots of str.  Only the remainder slot is filled in
   (with unicode_mod); the listed slots before it are 0 and the ones after
   it are left to zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13886
/* Sequence protocol slots of str: length, concatenation, repetition,
   item access and containment.  The slice and item-assignment slots
   are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13897
13898 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13899 unicode_subscript(PyObject* self, PyObject* item)
13900 {
13901 if (PyUnicode_READY(self) == -1)
13902 return NULL;
13903
13904 if (PyIndex_Check(item)) {
13905 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13906 if (i == -1 && PyErr_Occurred())
13907 return NULL;
13908 if (i < 0)
13909 i += PyUnicode_GET_LENGTH(self);
13910 return unicode_getitem(self, i);
13911 } else if (PySlice_Check(item)) {
13912 Py_ssize_t start, stop, step, slicelength, cur, i;
13913 PyObject *result;
13914 void *src_data, *dest_data;
13915 int src_kind, dest_kind;
13916 Py_UCS4 ch, max_char, kind_limit;
13917
13918 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13919 &start, &stop, &step, &slicelength) < 0) {
13920 return NULL;
13921 }
13922
13923 if (slicelength <= 0) {
13924 _Py_RETURN_UNICODE_EMPTY();
13925 } else if (start == 0 && step == 1 &&
13926 slicelength == PyUnicode_GET_LENGTH(self)) {
13927 return unicode_result_unchanged(self);
13928 } else if (step == 1) {
13929 return PyUnicode_Substring(self,
13930 start, start + slicelength);
13931 }
13932 /* General case */
13933 src_kind = PyUnicode_KIND(self);
13934 src_data = PyUnicode_DATA(self);
13935 if (!PyUnicode_IS_ASCII(self)) {
13936 kind_limit = kind_maxchar_limit(src_kind);
13937 max_char = 0;
13938 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13939 ch = PyUnicode_READ(src_kind, src_data, cur);
13940 if (ch > max_char) {
13941 max_char = ch;
13942 if (max_char >= kind_limit)
13943 break;
13944 }
13945 }
13946 }
13947 else
13948 max_char = 127;
13949 result = PyUnicode_New(slicelength, max_char);
13950 if (result == NULL)
13951 return NULL;
13952 dest_kind = PyUnicode_KIND(result);
13953 dest_data = PyUnicode_DATA(result);
13954
13955 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13956 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13957 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13958 }
13959 assert(_PyUnicode_CheckConsistency(result, 1));
13960 return result;
13961 } else {
13962 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13963 return NULL;
13964 }
13965 }
13966
/* Mapping protocol slots of str: length and subscripting (indices and
   slices).  Subscript assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13972
13973
13974 /* Helpers for PyUnicode_Format() */
13975
/* State shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source read by unicode_format_getnextarg(): a tuple
       indexed by argidx, or, when arglen is negative, a single object
       returned as is. */
    PyObject *args;
    int args_owned;     /* NOTE(review): presumably set when args holds a
                           reference to release; confirm in PyUnicode_Format */
    /* arglen: number of available arguments (negative: see args);
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* NOTE(review): presumably the mapping used for
                           "%(name)s" lookups; confirm in PyUnicode_Format */

    /* Format string state.  NOTE(review): presumably the kind and data
       pointer of fmtstr, plus the remaining count and current position
       while scanning; confirm in PyUnicode_Format. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* writer state embedded by value */
};
13989
/* One parsed conversion specification of a %-format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it on
                           as the format code */
    int flags;          /* bitmask of F_* flags (formatfloat() tests F_ALT) */
    Py_ssize_t width;   /* NOTE(review): presumably the field width;
                           confirm where the specification is parsed */
    int prec;           /* precision; negative means "not given"
                           (formatfloat() then uses 6) */
    int sign;           /* NOTE(review): meaning not visible here; confirm
                           where the specification is parsed */
};
13997
13998 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)13999 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14000 {
14001 Py_ssize_t argidx = ctx->argidx;
14002
14003 if (argidx < ctx->arglen) {
14004 ctx->argidx++;
14005 if (ctx->arglen < 0)
14006 return ctx->args;
14007 else
14008 return PyTuple_GetItem(ctx->args, argidx);
14009 }
14010 PyErr_SetString(PyExc_TypeError,
14011 "not enough arguments for format string");
14012 return NULL;
14013 }
14014
14015 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14016
14017 /* Format a float into the writer if the writer is not NULL, or into *p_output
14018 otherwise.
14019
14020 Return 0 on success, raise an exception and return -1 on error. */
14021 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14022 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14023 PyObject **p_output,
14024 _PyUnicodeWriter *writer)
14025 {
14026 char *p;
14027 double x;
14028 Py_ssize_t len;
14029 int prec;
14030 int dtoa_flags;
14031
14032 x = PyFloat_AsDouble(v);
14033 if (x == -1.0 && PyErr_Occurred())
14034 return -1;
14035
14036 prec = arg->prec;
14037 if (prec < 0)
14038 prec = 6;
14039
14040 if (arg->flags & F_ALT)
14041 dtoa_flags = Py_DTSF_ALT;
14042 else
14043 dtoa_flags = 0;
14044 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14045 if (p == NULL)
14046 return -1;
14047 len = strlen(p);
14048 if (writer) {
14049 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14050 PyMem_Free(p);
14051 return -1;
14052 }
14053 }
14054 else
14055 *p_output = _PyUnicode_FromASCII(p, len);
14056 PyMem_Free(p);
14057 return 0;
14058 }
14059
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new reference to a str object, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *     when alt is nonzero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *     necessary to get that many.
 * val          object to be converted (must be an int; asserted)
 * alt          nonzero to keep the base prefix for o, x and X
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 *           values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Reject precisions so large that numnondigits + prec (at most 3
       extra characters) could overflow an int below. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* With assertions disabled an unknown type falls through and is
           formatted as decimal. */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* The two non-digit characters are the base prefix. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The rest of the function does its arithmetic in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the two prefix characters; buf no longer points at the
           start of result's data, which the final conversion below
           relies on. */
        buf += 2;
        len -= 2;
        /* Re-create the sign directly in front of the digits. */
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* The padded text is assembled in a temporary bytes object:
           non-digit characters, then the zero padding, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        /* From here on result is the bytes object, not a str. */
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Build the final str: a new object is needed when result is the
       temporary bytes object or when buf was advanced past the prefix;
       otherwise the existing str is only resized if its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14205
14206 /* Format an integer or a float as an integer.
14207 * Return 1 if the number has been formatted into the writer,
14208 * 0 if the number has been formatted into *p_output
14209 * -1 and raise an exception on error */
14210 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14211 mainformatlong(PyObject *v,
14212 struct unicode_format_arg_t *arg,
14213 PyObject **p_output,
14214 _PyUnicodeWriter *writer)
14215 {
14216 PyObject *iobj, *res;
14217 char type = (char)arg->ch;
14218
14219 if (!PyNumber_Check(v))
14220 goto wrongtype;
14221
14222 /* make sure number is a type of integer for o, x, and X */
14223 if (!PyLong_Check(v)) {
14224 if (type == 'o' || type == 'x' || type == 'X') {
14225 iobj = PyNumber_Index(v);
14226 if (iobj == NULL) {
14227 if (PyErr_ExceptionMatches(PyExc_TypeError))
14228 goto wrongtype;
14229 return -1;
14230 }
14231 }
14232 else {
14233 iobj = PyNumber_Long(v);
14234 if (iobj == NULL ) {
14235 if (PyErr_ExceptionMatches(PyExc_TypeError))
14236 goto wrongtype;
14237 return -1;
14238 }
14239 }
14240 assert(PyLong_Check(iobj));
14241 }
14242 else {
14243 iobj = v;
14244 Py_INCREF(iobj);
14245 }
14246
14247 if (PyLong_CheckExact(v)
14248 && arg->width == -1 && arg->prec == -1
14249 && !(arg->flags & (F_SIGN | F_BLANK))
14250 && type != 'X')
14251 {
14252 /* Fast path */
14253 int alternate = arg->flags & F_ALT;
14254 int base;
14255
14256 switch(type)
14257 {
14258 default:
14259 assert(0 && "'type' not in [diuoxX]");
14260 case 'd':
14261 case 'i':
14262 case 'u':
14263 base = 10;
14264 break;
14265 case 'o':
14266 base = 8;
14267 break;
14268 case 'x':
14269 case 'X':
14270 base = 16;
14271 break;
14272 }
14273
14274 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14275 Py_DECREF(iobj);
14276 return -1;
14277 }
14278 Py_DECREF(iobj);
14279 return 1;
14280 }
14281
14282 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14283 Py_DECREF(iobj);
14284 if (res == NULL)
14285 return -1;
14286 *p_output = res;
14287 return 0;
14288
14289 wrongtype:
14290 switch(type)
14291 {
14292 case 'o':
14293 case 'x':
14294 case 'X':
14295 PyErr_Format(PyExc_TypeError,
14296 "%%%c format: an integer is required, "
14297 "not %.200s",
14298 type, Py_TYPE(v)->tp_name);
14299 break;
14300 default:
14301 PyErr_Format(PyExc_TypeError,
14302 "%%%c format: a number is required, "
14303 "not %.200s",
14304 type, Py_TYPE(v)->tp_name);
14305 break;
14306 }
14307 return -1;
14308 }
14309
14310 static Py_UCS4
formatchar(PyObject * v)14311 formatchar(PyObject *v)
14312 {
14313 /* presume that the buffer is at least 3 characters long */
14314 if (PyUnicode_Check(v)) {
14315 if (PyUnicode_GET_LENGTH(v) == 1) {
14316 return PyUnicode_READ_CHAR(v, 0);
14317 }
14318 goto onError;
14319 }
14320 else {
14321 PyObject *iobj;
14322 long x;
14323 /* make sure number is a type of integer */
14324 if (!PyLong_Check(v)) {
14325 iobj = PyNumber_Index(v);
14326 if (iobj == NULL) {
14327 goto onError;
14328 }
14329 x = PyLong_AsLong(iobj);
14330 Py_DECREF(iobj);
14331 }
14332 else {
14333 x = PyLong_AsLong(v);
14334 }
14335 if (x == -1 && PyErr_Occurred())
14336 goto onError;
14337
14338 if (x < 0 || x > MAX_UNICODE) {
14339 PyErr_SetString(PyExc_OverflowError,
14340 "%c arg not in range(0x110000)");
14341 return (Py_UCS4) -1;
14342 }
14343
14344 return (Py_UCS4) x;
14345 }
14346
14347 onError:
14348 PyErr_SetString(PyExc_TypeError,
14349 "%c requires int or char");
14350 return (Py_UCS4) -1;
14351 }
14352
14353 /* Parse options of an argument: flags, width, precision.
14354 Handle also "%(name)" syntax.
14355
14356 Return 0 if the argument has been formatted into arg->str.
14357 Return 1 if the argument has been written into ctx->writer,
14358 Raise an exception and return -1 on error. */
14359 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14360 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14361 struct unicode_format_arg_t *arg)
14362 {
14363 #define FORMAT_READ(ctx) \
14364 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14365
14366 PyObject *v;
14367
14368 if (arg->ch == '(') {
14369 /* Get argument value from a dictionary. Example: "%(name)s". */
14370 Py_ssize_t keystart;
14371 Py_ssize_t keylen;
14372 PyObject *key;
14373 int pcount = 1;
14374
14375 if (ctx->dict == NULL) {
14376 PyErr_SetString(PyExc_TypeError,
14377 "format requires a mapping");
14378 return -1;
14379 }
14380 ++ctx->fmtpos;
14381 --ctx->fmtcnt;
14382 keystart = ctx->fmtpos;
14383 /* Skip over balanced parentheses */
14384 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14385 arg->ch = FORMAT_READ(ctx);
14386 if (arg->ch == ')')
14387 --pcount;
14388 else if (arg->ch == '(')
14389 ++pcount;
14390 ctx->fmtpos++;
14391 }
14392 keylen = ctx->fmtpos - keystart - 1;
14393 if (ctx->fmtcnt < 0 || pcount > 0) {
14394 PyErr_SetString(PyExc_ValueError,
14395 "incomplete format key");
14396 return -1;
14397 }
14398 key = PyUnicode_Substring(ctx->fmtstr,
14399 keystart, keystart + keylen);
14400 if (key == NULL)
14401 return -1;
14402 if (ctx->args_owned) {
14403 ctx->args_owned = 0;
14404 Py_DECREF(ctx->args);
14405 }
14406 ctx->args = PyObject_GetItem(ctx->dict, key);
14407 Py_DECREF(key);
14408 if (ctx->args == NULL)
14409 return -1;
14410 ctx->args_owned = 1;
14411 ctx->arglen = -1;
14412 ctx->argidx = -2;
14413 }
14414
14415 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14416 while (--ctx->fmtcnt >= 0) {
14417 arg->ch = FORMAT_READ(ctx);
14418 ctx->fmtpos++;
14419 switch (arg->ch) {
14420 case '-': arg->flags |= F_LJUST; continue;
14421 case '+': arg->flags |= F_SIGN; continue;
14422 case ' ': arg->flags |= F_BLANK; continue;
14423 case '#': arg->flags |= F_ALT; continue;
14424 case '0': arg->flags |= F_ZERO; continue;
14425 }
14426 break;
14427 }
14428
14429 /* Parse width. Example: "%10s" => width=10 */
14430 if (arg->ch == '*') {
14431 v = unicode_format_getnextarg(ctx);
14432 if (v == NULL)
14433 return -1;
14434 if (!PyLong_Check(v)) {
14435 PyErr_SetString(PyExc_TypeError,
14436 "* wants int");
14437 return -1;
14438 }
14439 arg->width = PyLong_AsSsize_t(v);
14440 if (arg->width == -1 && PyErr_Occurred())
14441 return -1;
14442 if (arg->width < 0) {
14443 arg->flags |= F_LJUST;
14444 arg->width = -arg->width;
14445 }
14446 if (--ctx->fmtcnt >= 0) {
14447 arg->ch = FORMAT_READ(ctx);
14448 ctx->fmtpos++;
14449 }
14450 }
14451 else if (arg->ch >= '0' && arg->ch <= '9') {
14452 arg->width = arg->ch - '0';
14453 while (--ctx->fmtcnt >= 0) {
14454 arg->ch = FORMAT_READ(ctx);
14455 ctx->fmtpos++;
14456 if (arg->ch < '0' || arg->ch > '9')
14457 break;
14458 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14459 mixing signed and unsigned comparison. Since arg->ch is between
14460 '0' and '9', casting to int is safe. */
14461 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14462 PyErr_SetString(PyExc_ValueError,
14463 "width too big");
14464 return -1;
14465 }
14466 arg->width = arg->width*10 + (arg->ch - '0');
14467 }
14468 }
14469
14470 /* Parse precision. Example: "%.3f" => prec=3 */
14471 if (arg->ch == '.') {
14472 arg->prec = 0;
14473 if (--ctx->fmtcnt >= 0) {
14474 arg->ch = FORMAT_READ(ctx);
14475 ctx->fmtpos++;
14476 }
14477 if (arg->ch == '*') {
14478 v = unicode_format_getnextarg(ctx);
14479 if (v == NULL)
14480 return -1;
14481 if (!PyLong_Check(v)) {
14482 PyErr_SetString(PyExc_TypeError,
14483 "* wants int");
14484 return -1;
14485 }
14486 arg->prec = _PyLong_AsInt(v);
14487 if (arg->prec == -1 && PyErr_Occurred())
14488 return -1;
14489 if (arg->prec < 0)
14490 arg->prec = 0;
14491 if (--ctx->fmtcnt >= 0) {
14492 arg->ch = FORMAT_READ(ctx);
14493 ctx->fmtpos++;
14494 }
14495 }
14496 else if (arg->ch >= '0' && arg->ch <= '9') {
14497 arg->prec = arg->ch - '0';
14498 while (--ctx->fmtcnt >= 0) {
14499 arg->ch = FORMAT_READ(ctx);
14500 ctx->fmtpos++;
14501 if (arg->ch < '0' || arg->ch > '9')
14502 break;
14503 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14504 PyErr_SetString(PyExc_ValueError,
14505 "precision too big");
14506 return -1;
14507 }
14508 arg->prec = arg->prec*10 + (arg->ch - '0');
14509 }
14510 }
14511 }
14512
14513 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14514 if (ctx->fmtcnt >= 0) {
14515 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14516 if (--ctx->fmtcnt >= 0) {
14517 arg->ch = FORMAT_READ(ctx);
14518 ctx->fmtpos++;
14519 }
14520 }
14521 }
14522 if (ctx->fmtcnt < 0) {
14523 PyErr_SetString(PyExc_ValueError,
14524 "incomplete format");
14525 return -1;
14526 }
14527 return 0;
14528
14529 #undef FORMAT_READ
14530 }
14531
14532 /* Format one argument. Supported conversion specifiers:
14533
14534 - "s", "r", "a": any type
14535 - "i", "d", "u": int or float
14536 - "o", "x", "X": int
14537 - "e", "E", "f", "F", "g", "G": float
14538 - "c": int or str (1 character)
14539
14540 When possible, the output is written directly into the Unicode writer
14541 (ctx->writer). A string is created when padding is required.
14542
14543 Return 0 if the argument has been formatted into *p_str,
14544 1 if the argument has been written into ctx->writer,
14545 -1 on error. */
14546 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14547 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14548 struct unicode_format_arg_t *arg,
14549 PyObject **p_str)
14550 {
14551 PyObject *v;
14552 _PyUnicodeWriter *writer = &ctx->writer;
14553
14554 if (ctx->fmtcnt == 0)
14555 ctx->writer.overallocate = 0;
14556
14557 if (arg->ch == '%') {
14558 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14559 return -1;
14560 return 1;
14561 }
14562
14563 v = unicode_format_getnextarg(ctx);
14564 if (v == NULL)
14565 return -1;
14566
14567
14568 switch (arg->ch) {
14569 case 's':
14570 case 'r':
14571 case 'a':
14572 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14573 /* Fast path */
14574 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14575 return -1;
14576 return 1;
14577 }
14578
14579 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14580 *p_str = v;
14581 Py_INCREF(*p_str);
14582 }
14583 else {
14584 if (arg->ch == 's')
14585 *p_str = PyObject_Str(v);
14586 else if (arg->ch == 'r')
14587 *p_str = PyObject_Repr(v);
14588 else
14589 *p_str = PyObject_ASCII(v);
14590 }
14591 break;
14592
14593 case 'i':
14594 case 'd':
14595 case 'u':
14596 case 'o':
14597 case 'x':
14598 case 'X':
14599 {
14600 int ret = mainformatlong(v, arg, p_str, writer);
14601 if (ret != 0)
14602 return ret;
14603 arg->sign = 1;
14604 break;
14605 }
14606
14607 case 'e':
14608 case 'E':
14609 case 'f':
14610 case 'F':
14611 case 'g':
14612 case 'G':
14613 if (arg->width == -1 && arg->prec == -1
14614 && !(arg->flags & (F_SIGN | F_BLANK)))
14615 {
14616 /* Fast path */
14617 if (formatfloat(v, arg, NULL, writer) == -1)
14618 return -1;
14619 return 1;
14620 }
14621
14622 arg->sign = 1;
14623 if (formatfloat(v, arg, p_str, NULL) == -1)
14624 return -1;
14625 break;
14626
14627 case 'c':
14628 {
14629 Py_UCS4 ch = formatchar(v);
14630 if (ch == (Py_UCS4) -1)
14631 return -1;
14632 if (arg->width == -1 && arg->prec == -1) {
14633 /* Fast path */
14634 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14635 return -1;
14636 return 1;
14637 }
14638 *p_str = PyUnicode_FromOrdinal(ch);
14639 break;
14640 }
14641
14642 default:
14643 PyErr_Format(PyExc_ValueError,
14644 "unsupported format character '%c' (0x%x) "
14645 "at index %zd",
14646 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14647 (int)arg->ch,
14648 ctx->fmtpos - 1);
14649 return -1;
14650 }
14651 if (*p_str == NULL)
14652 return -1;
14653 assert (PyUnicode_Check(*p_str));
14654 return 0;
14655 }
14656
/* Write the already formatted string `str` into ctx->writer, applying the
   precision (truncation, for "s", "r" and "a" only), the sign flags, the
   placement of the "0x"/"0X"/"0o" prefix and the width padding.

   Note that arg->width is modified while the output is laid out, and
   arg->sign is cleared when no sign character has to be written.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding applies only when arg->sign is set, which
       unicode_format_arg_format() does for the numeric conversions. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no forced sign: copy str unchanged. */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately; from here on len and pindex
               describe the text that follows it. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;      /* no sign character to write */
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    /* The fill character only matters for maxchar if left padding will
       actually be written. */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* len no longer counts a sign character, so when the text alone fills
       the width one extra slot is needed for the sign. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With '0' fill the sign goes before the padding; with ' ' fill it
           is written after the padding, further below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column of the requested width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* In both fill modes the prefix is excluded from the width and
           length used for the padding computation. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left over for right padding. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* Right padding always uses spaces, whatever the fill character is. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14811
14812 /* Helper of PyUnicode_Format(): format one arg.
14813 Return 0 on success, raise an exception and return -1 on error. */
14814 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14815 unicode_format_arg(struct unicode_formatter_t *ctx)
14816 {
14817 struct unicode_format_arg_t arg;
14818 PyObject *str;
14819 int ret;
14820
14821 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14822 arg.flags = 0;
14823 arg.width = -1;
14824 arg.prec = -1;
14825 arg.sign = 0;
14826 str = NULL;
14827
14828 ret = unicode_format_arg_parse(ctx, &arg);
14829 if (ret == -1)
14830 return -1;
14831
14832 ret = unicode_format_arg_format(ctx, &arg, &str);
14833 if (ret == -1)
14834 return -1;
14835
14836 if (ret != 1) {
14837 ret = unicode_format_arg_output(ctx, &arg, str);
14838 Py_DECREF(str);
14839 if (ret == -1)
14840 return -1;
14841 }
14842
14843 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14844 PyErr_SetString(PyExc_TypeError,
14845 "not all arguments converted during string formatting");
14846 return -1;
14847 }
14848 return 0;
14849 }
14850
14851 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14852 PyUnicode_Format(PyObject *format, PyObject *args)
14853 {
14854 struct unicode_formatter_t ctx;
14855
14856 if (format == NULL || args == NULL) {
14857 PyErr_BadInternalCall();
14858 return NULL;
14859 }
14860
14861 if (ensure_unicode(format) < 0)
14862 return NULL;
14863
14864 ctx.fmtstr = format;
14865 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14866 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14867 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14868 ctx.fmtpos = 0;
14869
14870 _PyUnicodeWriter_Init(&ctx.writer);
14871 ctx.writer.min_length = ctx.fmtcnt + 100;
14872 ctx.writer.overallocate = 1;
14873
14874 if (PyTuple_Check(args)) {
14875 ctx.arglen = PyTuple_Size(args);
14876 ctx.argidx = 0;
14877 }
14878 else {
14879 ctx.arglen = -1;
14880 ctx.argidx = -2;
14881 }
14882 ctx.args_owned = 0;
14883 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14884 ctx.dict = args;
14885 else
14886 ctx.dict = NULL;
14887 ctx.args = args;
14888
14889 while (--ctx.fmtcnt >= 0) {
14890 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14891 Py_ssize_t nonfmtpos;
14892
14893 nonfmtpos = ctx.fmtpos++;
14894 while (ctx.fmtcnt >= 0 &&
14895 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14896 ctx.fmtpos++;
14897 ctx.fmtcnt--;
14898 }
14899 if (ctx.fmtcnt < 0) {
14900 ctx.fmtpos--;
14901 ctx.writer.overallocate = 0;
14902 }
14903
14904 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14905 nonfmtpos, ctx.fmtpos) < 0)
14906 goto onError;
14907 }
14908 else {
14909 ctx.fmtpos++;
14910 if (unicode_format_arg(&ctx) == -1)
14911 goto onError;
14912 }
14913 }
14914
14915 if (ctx.argidx < ctx.arglen && !ctx.dict) {
14916 PyErr_SetString(PyExc_TypeError,
14917 "not all arguments converted during string formatting");
14918 goto onError;
14919 }
14920
14921 if (ctx.args_owned) {
14922 Py_DECREF(ctx.args);
14923 }
14924 return _PyUnicodeWriter_Finish(&ctx.writer);
14925
14926 onError:
14927 _PyUnicodeWriter_Dealloc(&ctx.writer);
14928 if (ctx.args_owned) {
14929 Py_DECREF(ctx.args);
14930 }
14931 return NULL;
14932 }
14933
14934 static PyObject *
14935 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14936
14937 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)14938 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14939 {
14940 PyObject *x = NULL;
14941 static char *kwlist[] = {"object", "encoding", "errors", 0};
14942 char *encoding = NULL;
14943 char *errors = NULL;
14944
14945 if (type != &PyUnicode_Type)
14946 return unicode_subtype_new(type, args, kwds);
14947 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14948 kwlist, &x, &encoding, &errors))
14949 return NULL;
14950 if (x == NULL)
14951 _Py_RETURN_UNICODE_EMPTY();
14952 if (encoding == NULL && errors == NULL)
14953 return PyObject_Str(x);
14954 else
14955 return PyUnicode_FromEncodedObject(x, encoding, errors);
14956 }
14957
14958 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)14959 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14960 {
14961 PyObject *unicode, *self;
14962 Py_ssize_t length, char_size;
14963 int share_wstr, share_utf8;
14964 unsigned int kind;
14965 void *data;
14966
14967 assert(PyType_IsSubtype(type, &PyUnicode_Type));
14968
14969 unicode = unicode_new(&PyUnicode_Type, args, kwds);
14970 if (unicode == NULL)
14971 return NULL;
14972 assert(_PyUnicode_CHECK(unicode));
14973 if (PyUnicode_READY(unicode) == -1) {
14974 Py_DECREF(unicode);
14975 return NULL;
14976 }
14977
14978 self = type->tp_alloc(type, 0);
14979 if (self == NULL) {
14980 Py_DECREF(unicode);
14981 return NULL;
14982 }
14983 kind = PyUnicode_KIND(unicode);
14984 length = PyUnicode_GET_LENGTH(unicode);
14985
14986 _PyUnicode_LENGTH(self) = length;
14987 #ifdef Py_DEBUG
14988 _PyUnicode_HASH(self) = -1;
14989 #else
14990 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14991 #endif
14992 _PyUnicode_STATE(self).interned = 0;
14993 _PyUnicode_STATE(self).kind = kind;
14994 _PyUnicode_STATE(self).compact = 0;
14995 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14996 _PyUnicode_STATE(self).ready = 1;
14997 _PyUnicode_WSTR(self) = NULL;
14998 _PyUnicode_UTF8_LENGTH(self) = 0;
14999 _PyUnicode_UTF8(self) = NULL;
15000 _PyUnicode_WSTR_LENGTH(self) = 0;
15001 _PyUnicode_DATA_ANY(self) = NULL;
15002
15003 share_utf8 = 0;
15004 share_wstr = 0;
15005 if (kind == PyUnicode_1BYTE_KIND) {
15006 char_size = 1;
15007 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15008 share_utf8 = 1;
15009 }
15010 else if (kind == PyUnicode_2BYTE_KIND) {
15011 char_size = 2;
15012 if (sizeof(wchar_t) == 2)
15013 share_wstr = 1;
15014 }
15015 else {
15016 assert(kind == PyUnicode_4BYTE_KIND);
15017 char_size = 4;
15018 if (sizeof(wchar_t) == 4)
15019 share_wstr = 1;
15020 }
15021
15022 /* Ensure we won't overflow the length. */
15023 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15024 PyErr_NoMemory();
15025 goto onError;
15026 }
15027 data = PyObject_MALLOC((length + 1) * char_size);
15028 if (data == NULL) {
15029 PyErr_NoMemory();
15030 goto onError;
15031 }
15032
15033 _PyUnicode_DATA_ANY(self) = data;
15034 if (share_utf8) {
15035 _PyUnicode_UTF8_LENGTH(self) = length;
15036 _PyUnicode_UTF8(self) = data;
15037 }
15038 if (share_wstr) {
15039 _PyUnicode_WSTR_LENGTH(self) = length;
15040 _PyUnicode_WSTR(self) = (wchar_t *)data;
15041 }
15042
15043 memcpy(data, PyUnicode_DATA(unicode),
15044 kind * (length + 1));
15045 assert(_PyUnicode_CheckConsistency(self, 1));
15046 #ifdef Py_DEBUG
15047 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15048 #endif
15049 Py_DECREF(unicode);
15050 return self;
15051
15052 onError:
15053 Py_DECREF(unicode);
15054 Py_DECREF(self);
15055 return NULL;
15056 }
15057
/* Docstring of the str type; used as the tp_doc slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15069
/* Forward declaration: needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* The type object of str.  Instances are created through unicode_new();
   the type can be subclassed (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15115
15116 /* Initialize the Unicode implementation */
15117
_PyUnicode_Init(void)15118 int _PyUnicode_Init(void)
15119 {
15120 /* XXX - move this array to unicodectype.c ? */
15121 Py_UCS2 linebreak[] = {
15122 0x000A, /* LINE FEED */
15123 0x000D, /* CARRIAGE RETURN */
15124 0x001C, /* FILE SEPARATOR */
15125 0x001D, /* GROUP SEPARATOR */
15126 0x001E, /* RECORD SEPARATOR */
15127 0x0085, /* NEXT LINE */
15128 0x2028, /* LINE SEPARATOR */
15129 0x2029, /* PARAGRAPH SEPARATOR */
15130 };
15131
15132 /* Init the implementation */
15133 _Py_INCREF_UNICODE_EMPTY();
15134 if (!unicode_empty)
15135 Py_FatalError("Can't create empty string");
15136 Py_DECREF(unicode_empty);
15137
15138 if (PyType_Ready(&PyUnicode_Type) < 0)
15139 Py_FatalError("Can't initialize 'unicode'");
15140
15141 /* initialize the linebreak bloom filter */
15142 bloom_linebreak = make_bloom_mask(
15143 PyUnicode_2BYTE_KIND, linebreak,
15144 Py_ARRAY_LENGTH(linebreak));
15145
15146 if (PyType_Ready(&EncodingMapType) < 0)
15147 Py_FatalError("Can't initialize encoding map type");
15148
15149 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15150 Py_FatalError("Can't initialize field name iterator type");
15151
15152 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15153 Py_FatalError("Can't initialize formatter iter type");
15154
15155 return 0;
15156 }
15157
15158 /* Finalize the Unicode implementation */
15159
/* This implementation has nothing to release here and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    enum { nothing_released = 0 };

    return nothing_released;
}
15165
15166 void
_PyUnicode_Fini(void)15167 _PyUnicode_Fini(void)
15168 {
15169 int i;
15170
15171 Py_CLEAR(unicode_empty);
15172
15173 for (i = 0; i < 256; i++)
15174 Py_CLEAR(unicode_latin1[i]);
15175 _PyUnicode_ClearStaticStrings();
15176 (void)PyUnicode_ClearFreeList();
15177 }
15178
15179 void
PyUnicode_InternInPlace(PyObject ** p)15180 PyUnicode_InternInPlace(PyObject **p)
15181 {
15182 PyObject *s = *p;
15183 PyObject *t;
15184 #ifdef Py_DEBUG
15185 assert(s != NULL);
15186 assert(_PyUnicode_CHECK(s));
15187 #else
15188 if (s == NULL || !PyUnicode_Check(s))
15189 return;
15190 #endif
15191 /* If it's a subclass, we don't really know what putting
15192 it in the interned dict might do. */
15193 if (!PyUnicode_CheckExact(s))
15194 return;
15195 if (PyUnicode_CHECK_INTERNED(s))
15196 return;
15197 if (interned == NULL) {
15198 interned = PyDict_New();
15199 if (interned == NULL) {
15200 PyErr_Clear(); /* Don't leave an exception */
15201 return;
15202 }
15203 }
15204 Py_ALLOW_RECURSION
15205 t = PyDict_SetDefault(interned, s, s);
15206 Py_END_ALLOW_RECURSION
15207 if (t == NULL) {
15208 PyErr_Clear();
15209 return;
15210 }
15211 if (t != s) {
15212 Py_INCREF(t);
15213 Py_SETREF(*p, t);
15214 return;
15215 }
15216 /* The two references in interned are not counted by refcnt.
15217 The deallocator will take care of this */
15218 Py_REFCNT(s) -= 2;
15219 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15220 }
15221
15222 void
PyUnicode_InternImmortal(PyObject ** p)15223 PyUnicode_InternImmortal(PyObject **p)
15224 {
15225 PyUnicode_InternInPlace(p);
15226 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15227 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15228 Py_INCREF(*p);
15229 }
15230 }
15231
15232 PyObject *
PyUnicode_InternFromString(const char * cp)15233 PyUnicode_InternFromString(const char *cp)
15234 {
15235 PyObject *s = PyUnicode_FromString(cp);
15236 if (s == NULL)
15237 return NULL;
15238 PyUnicode_InternInPlace(&s);
15239 return s;
15240 }
15241
15242 void
_Py_ReleaseInternedUnicodeStrings(void)15243 _Py_ReleaseInternedUnicodeStrings(void)
15244 {
15245 PyObject *keys;
15246 PyObject *s;
15247 Py_ssize_t i, n;
15248 Py_ssize_t immortal_size = 0, mortal_size = 0;
15249
15250 if (interned == NULL || !PyDict_Check(interned))
15251 return;
15252 keys = PyDict_Keys(interned);
15253 if (keys == NULL || !PyList_Check(keys)) {
15254 PyErr_Clear();
15255 return;
15256 }
15257
15258 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15259 detector, interned unicode strings are not forcibly deallocated;
15260 rather, we give them their stolen references back, and then clear
15261 and DECREF the interned dict. */
15262
15263 n = PyList_GET_SIZE(keys);
15264 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15265 n);
15266 for (i = 0; i < n; i++) {
15267 s = PyList_GET_ITEM(keys, i);
15268 if (PyUnicode_READY(s) == -1) {
15269 assert(0 && "could not ready string");
15270 fprintf(stderr, "could not ready string\n");
15271 }
15272 switch (PyUnicode_CHECK_INTERNED(s)) {
15273 case SSTATE_NOT_INTERNED:
15274 /* XXX Shouldn't happen */
15275 break;
15276 case SSTATE_INTERNED_IMMORTAL:
15277 Py_REFCNT(s) += 1;
15278 immortal_size += PyUnicode_GET_LENGTH(s);
15279 break;
15280 case SSTATE_INTERNED_MORTAL:
15281 Py_REFCNT(s) += 2;
15282 mortal_size += PyUnicode_GET_LENGTH(s);
15283 break;
15284 default:
15285 Py_FatalError("Inconsistent interned string state.");
15286 }
15287 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15288 }
15289 fprintf(stderr, "total size of all interned strings: "
15290 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15291 "mortal/immortal\n", mortal_size, immortal_size);
15292 Py_DECREF(keys);
15293 PyDict_Clear(interned);
15294 Py_CLEAR(interned);
15295 }
15296
15297
15298 /********************* Unicode Iterator **************************/
15299
15300 typedef struct {
15301 PyObject_HEAD
15302 Py_ssize_t it_index;
15303 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
15304 } unicodeiterobject;
15305
15306 static void
unicodeiter_dealloc(unicodeiterobject * it)15307 unicodeiter_dealloc(unicodeiterobject *it)
15308 {
15309 _PyObject_GC_UNTRACK(it);
15310 Py_XDECREF(it->it_seq);
15311 PyObject_GC_Del(it);
15312 }
15313
15314 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15315 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15316 {
15317 Py_VISIT(it->it_seq);
15318 return 0;
15319 }
15320
15321 static PyObject *
unicodeiter_next(unicodeiterobject * it)15322 unicodeiter_next(unicodeiterobject *it)
15323 {
15324 PyObject *seq, *item;
15325
15326 assert(it != NULL);
15327 seq = it->it_seq;
15328 if (seq == NULL)
15329 return NULL;
15330 assert(_PyUnicode_CHECK(seq));
15331
15332 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15333 int kind = PyUnicode_KIND(seq);
15334 void *data = PyUnicode_DATA(seq);
15335 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15336 item = PyUnicode_FromOrdinal(chr);
15337 if (item != NULL)
15338 ++it->it_index;
15339 return item;
15340 }
15341
15342 it->it_seq = NULL;
15343 Py_DECREF(seq);
15344 return NULL;
15345 }
15346
15347 static PyObject *
unicodeiter_len(unicodeiterobject * it)15348 unicodeiter_len(unicodeiterobject *it)
15349 {
15350 Py_ssize_t len = 0;
15351 if (it->it_seq)
15352 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15353 return PyLong_FromSsize_t(len);
15354 }
15355
15356 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15357
15358 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15359 unicodeiter_reduce(unicodeiterobject *it)
15360 {
15361 if (it->it_seq != NULL) {
15362 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15363 it->it_seq, it->it_index);
15364 } else {
15365 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15366 if (u == NULL)
15367 return NULL;
15368 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15369 }
15370 }
15371
15372 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15373
15374 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15375 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15376 {
15377 Py_ssize_t index = PyLong_AsSsize_t(state);
15378 if (index == -1 && PyErr_Occurred())
15379 return NULL;
15380 if (it->it_seq != NULL) {
15381 if (index < 0)
15382 index = 0;
15383 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15384 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15385 it->it_index = index;
15386 }
15387 Py_RETURN_NONE;
15388 }
15389
15390 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15391
15392 static PyMethodDef unicodeiter_methods[] = {
15393 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15394 length_hint_doc},
15395 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15396 reduce_doc},
15397 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15398 setstate_doc},
15399 {NULL, NULL} /* sentinel */
15400 };
15401
15402 PyTypeObject PyUnicodeIter_Type = {
15403 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15404 "str_iterator", /* tp_name */
15405 sizeof(unicodeiterobject), /* tp_basicsize */
15406 0, /* tp_itemsize */
15407 /* methods */
15408 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15409 0, /* tp_print */
15410 0, /* tp_getattr */
15411 0, /* tp_setattr */
15412 0, /* tp_reserved */
15413 0, /* tp_repr */
15414 0, /* tp_as_number */
15415 0, /* tp_as_sequence */
15416 0, /* tp_as_mapping */
15417 0, /* tp_hash */
15418 0, /* tp_call */
15419 0, /* tp_str */
15420 PyObject_GenericGetAttr, /* tp_getattro */
15421 0, /* tp_setattro */
15422 0, /* tp_as_buffer */
15423 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15424 0, /* tp_doc */
15425 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15426 0, /* tp_clear */
15427 0, /* tp_richcompare */
15428 0, /* tp_weaklistoffset */
15429 PyObject_SelfIter, /* tp_iter */
15430 (iternextfunc)unicodeiter_next, /* tp_iternext */
15431 unicodeiter_methods, /* tp_methods */
15432 0,
15433 };
15434
15435 static PyObject *
unicode_iter(PyObject * seq)15436 unicode_iter(PyObject *seq)
15437 {
15438 unicodeiterobject *it;
15439
15440 if (!PyUnicode_Check(seq)) {
15441 PyErr_BadInternalCall();
15442 return NULL;
15443 }
15444 if (PyUnicode_READY(seq) == -1)
15445 return NULL;
15446 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15447 if (it == NULL)
15448 return NULL;
15449 it->it_index = 0;
15450 Py_INCREF(seq);
15451 it->it_seq = seq;
15452 _PyObject_GC_TRACK(it);
15453 return (PyObject *)it;
15454 }
15455
15456
15457 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15458 Py_UNICODE_strlen(const Py_UNICODE *u)
15459 {
15460 int res = 0;
15461 while(*u++)
15462 res++;
15463 return res;
15464 }
15465
15466 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15467 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15468 {
15469 Py_UNICODE *u = s1;
15470 while ((*u++ = *s2++));
15471 return s1;
15472 }
15473
15474 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15475 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15476 {
15477 Py_UNICODE *u = s1;
15478 while ((*u++ = *s2++))
15479 if (n-- == 0)
15480 break;
15481 return s1;
15482 }
15483
15484 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15485 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15486 {
15487 Py_UNICODE *u1 = s1;
15488 u1 += Py_UNICODE_strlen(u1);
15489 Py_UNICODE_strcpy(u1, s2);
15490 return s1;
15491 }
15492
15493 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15494 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15495 {
15496 while (*s1 && *s2 && *s1 == *s2)
15497 s1++, s2++;
15498 if (*s1 && *s2)
15499 return (*s1 < *s2) ? -1 : +1;
15500 if (*s1)
15501 return 1;
15502 if (*s2)
15503 return -1;
15504 return 0;
15505 }
15506
15507 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15508 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15509 {
15510 Py_UNICODE u1, u2;
15511 for (; n != 0; n--) {
15512 u1 = *s1;
15513 u2 = *s2;
15514 if (u1 != u2)
15515 return (u1 < u2) ? -1 : +1;
15516 if (u1 == '\0')
15517 return 0;
15518 s1++;
15519 s2++;
15520 }
15521 return 0;
15522 }
15523
15524 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15525 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15526 {
15527 const Py_UNICODE *p;
15528 for (p = s; *p; p++)
15529 if (*p == c)
15530 return (Py_UNICODE*)p;
15531 return NULL;
15532 }
15533
15534 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15535 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15536 {
15537 const Py_UNICODE *p;
15538 p = s + Py_UNICODE_strlen(s);
15539 while (p != s) {
15540 p--;
15541 if (*p == c)
15542 return (Py_UNICODE*)p;
15543 }
15544 return NULL;
15545 }
15546
15547 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15548 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15549 {
15550 Py_UNICODE *u, *copy;
15551 Py_ssize_t len, size;
15552
15553 if (!PyUnicode_Check(unicode)) {
15554 PyErr_BadArgument();
15555 return NULL;
15556 }
15557 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15558 if (u == NULL)
15559 return NULL;
15560 /* Ensure we won't overflow the size. */
15561 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15562 PyErr_NoMemory();
15563 return NULL;
15564 }
15565 size = len + 1; /* copy the null character */
15566 size *= sizeof(Py_UNICODE);
15567 copy = PyMem_Malloc(size);
15568 if (copy == NULL) {
15569 PyErr_NoMemory();
15570 return NULL;
15571 }
15572 memcpy(copy, u, size);
15573 return copy;
15574 }
15575
15576 /* A _string module, to export formatter_parser and formatter_field_name_split
15577 to the string.Formatter class implemented in Python. */
15578
15579 static PyMethodDef _string_methods[] = {
15580 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15581 METH_O, PyDoc_STR("split the argument as a field name")},
15582 {"formatter_parser", (PyCFunction) formatter_parser,
15583 METH_O, PyDoc_STR("parse the argument as a format string")},
15584 {NULL, NULL}
15585 };
15586
15587 static struct PyModuleDef _string_module = {
15588 PyModuleDef_HEAD_INIT,
15589 "_string",
15590 PyDoc_STR("string helper module"),
15591 0,
15592 _string_methods,
15593 NULL,
15594 NULL,
15595 NULL,
15596 NULL
15597 };
15598
15599 PyMODINIT_FUNC
PyInit__string(void)15600 PyInit__string(void)
15601 {
15602 return PyModule_Create(&_string_module);
15603 }
15604
15605
15606 #ifdef __cplusplus
15607 }
15608 #endif
15609