• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "ucnhash.h"
44 #include "bytes_methods.h"
45 #include "stringlib/eq.h"
46 
47 #ifdef MS_WINDOWS
48 #include <windows.h>
49 #endif
50 
51 /*[clinic input]
52 class str "PyUnicodeObject *" "&PyUnicode_Type"
53 [clinic start generated code]*/
54 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
55 
56 /* --- Globals ------------------------------------------------------------
57 
58 NOTE: In the interpreter's initialization phase, some globals are currently
59       initialized dynamically as needed. In the process Unicode objects may
60       be created before the Unicode type is ready.
61 
62 */
63 
64 
65 #ifdef __cplusplus
66 extern "C" {
67 #endif
68 
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Check used inside assert(): the full consistency check (without content
   verification) in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
77 
/* Field accessors.

   The macros whose names start with an underscore read the struct field
   directly (only _PyUnicode_KIND and _PyUnicode_GET_LENGTH assert the
   object type).  PyUnicode_UTF8 and PyUnicode_UTF8_LENGTH additionally
   assert that the object is ready and, for compact ASCII strings, use the
   character data stored right after the PyASCIIObject header and the
   string length instead of the utf8/utf8_length fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
112 
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
119 
/* True if the utf8 pointer refers to the character data itself.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer refers to the character data itself.
   Uses the macro argument throughout, so it no longer depends on the
   caller having a variable named "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127 
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data); a non-ready string with a wstr pointer
   always counts as owning it */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141 
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The main loop copies four characters per iteration; the trailing loop
   handles the remaining 0-3 characters.  All locals are underscore-prefixed
   so they cannot collide with names used in the macro arguments. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
165 
/* Divisor used to compute the overallocation amount (size / factor). */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
173 
174 /* This dictionary holds all interned unicode strings.  Note that references
175    to strings in this dictionary are *not* counted in the string's ob_refcnt.
176    When the interned string reaches a refcnt of 0 the string deallocation
177    function will delete the reference from this dictionary.
178 
179    Another way to look at this is that to say that the actual reference
180    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
181 */
182 static PyObject *interned = NULL;
183 
184 /* The empty Unicode object is shared to improve performance. */
185 static PyObject *unicode_empty = NULL;
186 
/* Take a new reference to the shared empty string, creating it on first
   use.  If the creation fails, unicode_empty stays NULL.  On first creation
   the object ends up with two references: the one returned by
   PyUnicode_New(), kept by the unicode_empty global, and the extra one
   taken here for the caller. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (NULL if it could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
205 
206 /* Forward declaration */
207 static inline int
208 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209 
210 /* List of static strings. */
211 static _Py_Identifier *static_strings = NULL;
212 
213 /* Single character Unicode strings in the Latin-1 range are being
214    shared as well. */
215 static PyObject *unicode_latin1[256] = {NULL};
216 
/* Fast detection of the most frequent whitespace characters.
   Indexed by an ASCII code (0..127); an entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247 
248 /* forward */
249 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
250 static PyObject* get_latin1_char(unsigned char ch);
251 static int unicode_modifiable(PyObject *unicode);
252 
253 
254 static PyObject *
255 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
256 static PyObject *
257 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258 static PyObject *
259 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260 
261 static PyObject *
262 unicode_encode_call_errorhandler(const char *errors,
263        PyObject **errorHandler,const char *encoding, const char *reason,
264        PyObject *unicode, PyObject **exceptionObject,
265        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266 
267 static void
268 raise_encode_exception(PyObject **exceptionObject,
269                        const char *encoding,
270                        PyObject *unicode,
271                        Py_ssize_t startpos, Py_ssize_t endpos,
272                        const char *reason);
273 
/* Fast detection of line break characters.
   Indexed by an ASCII code (0..127); an entry is 1 for a line break. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301 
302 #include "clinic/unicodeobject.c.h"
303 
/* Codec error handlers recognized by name.  _Py_ERROR_UNKNOWN (0) is never
   returned by get_error_handler(); _Py_ERROR_OTHER stands for any name
   that is not one of the handlers listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL (treated like "strict").
   Returns the matching enum value, or _Py_ERROR_OTHER when the name is not
   one of the known handlers.  The comparison is exact and case-sensitive. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t i;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0) {
            return known_handlers[i].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342 
343 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344    This function is kept for backward compatibility with the old API. */
345 Py_UNICODE
PyUnicode_GetMax(void)346 PyUnicode_GetMax(void)
347 {
348 #ifdef Py_UNICODE_WIDE
349     return 0x10FFFF;
350 #else
351     /* This is actually an illegal character, so it should
352        not be passed to unichr. */
353     return 0xFFFF;
354 #endif
355 }
356 
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object's internal state.

   op:            object to verify; must be a Unicode object.
   check_content: if non-zero (and the string is not in the legacy wchar
                  representation), also scan the characters to verify that
                  the narrowest possible kind is used and that the data is
                  null-terminated.

   Always returns 1 so that it can be wrapped in assert(); every violated
   invariant trips one of the assertions below. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data only when the kind matches the size of
               wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
472 
473 static PyObject*
unicode_result_wchar(PyObject * unicode)474 unicode_result_wchar(PyObject *unicode)
475 {
476 #ifndef Py_DEBUG
477     Py_ssize_t len;
478 
479     len = _PyUnicode_WSTR_LENGTH(unicode);
480     if (len == 0) {
481         Py_DECREF(unicode);
482         _Py_RETURN_UNICODE_EMPTY();
483     }
484 
485     if (len == 1) {
486         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
487         if ((Py_UCS4)ch < 256) {
488             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489             Py_DECREF(unicode);
490             return latin1_char;
491         }
492     }
493 
494     if (_PyUnicode_Ready(unicode) < 0) {
495         Py_DECREF(unicode);
496         return NULL;
497     }
498 #else
499     assert(Py_REFCNT(unicode) == 1);
500 
501     /* don't make the result ready in debug mode to ensure that the caller
502        makes the string ready before using it */
503     assert(_PyUnicode_CheckConsistency(unicode, 1));
504 #endif
505     return unicode;
506 }
507 
508 static PyObject*
unicode_result_ready(PyObject * unicode)509 unicode_result_ready(PyObject *unicode)
510 {
511     Py_ssize_t length;
512 
513     length = PyUnicode_GET_LENGTH(unicode);
514     if (length == 0) {
515         if (unicode != unicode_empty) {
516             Py_DECREF(unicode);
517             _Py_RETURN_UNICODE_EMPTY();
518         }
519         return unicode_empty;
520     }
521 
522     if (length == 1) {
523         void *data = PyUnicode_DATA(unicode);
524         int kind = PyUnicode_KIND(unicode);
525         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
526         if (ch < 256) {
527             PyObject *latin1_char = unicode_latin1[ch];
528             if (latin1_char != NULL) {
529                 if (unicode != latin1_char) {
530                     Py_INCREF(latin1_char);
531                     Py_DECREF(unicode);
532                 }
533                 return latin1_char;
534             }
535             else {
536                 assert(_PyUnicode_CheckConsistency(unicode, 1));
537                 Py_INCREF(unicode);
538                 unicode_latin1[ch] = unicode;
539                 return unicode;
540             }
541         }
542     }
543 
544     assert(_PyUnicode_CheckConsistency(unicode, 1));
545     return unicode;
546 }
547 
548 static PyObject*
unicode_result(PyObject * unicode)549 unicode_result(PyObject *unicode)
550 {
551     assert(_PyUnicode_CHECK(unicode));
552     if (PyUnicode_IS_READY(unicode))
553         return unicode_result_ready(unicode);
554     else
555         return unicode_result_wchar(unicode);
556 }
557 
558 static PyObject*
unicode_result_unchanged(PyObject * unicode)559 unicode_result_unchanged(PyObject *unicode)
560 {
561     if (PyUnicode_CheckExact(unicode)) {
562         if (PyUnicode_READY(unicode) == -1)
563             return NULL;
564         Py_INCREF(unicode);
565         return unicode;
566     }
567     else
568         /* Subtype -- return genuine unicode string with the same value. */
569         return _PyUnicode_Copy(unicode);
570 }
571 
572 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573    ASCII, Latin1, UTF-8, etc. */
574 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)575 backslashreplace(_PyBytesWriter *writer, char *str,
576                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577 {
578     Py_ssize_t size, i;
579     Py_UCS4 ch;
580     enum PyUnicode_Kind kind;
581     void *data;
582 
583     assert(PyUnicode_IS_READY(unicode));
584     kind = PyUnicode_KIND(unicode);
585     data = PyUnicode_DATA(unicode);
586 
587     size = 0;
588     /* determine replacement size */
589     for (i = collstart; i < collend; ++i) {
590         Py_ssize_t incr;
591 
592         ch = PyUnicode_READ(kind, data, i);
593         if (ch < 0x100)
594             incr = 2+2;
595         else if (ch < 0x10000)
596             incr = 2+4;
597         else {
598             assert(ch <= MAX_UNICODE);
599             incr = 2+8;
600         }
601         if (size > PY_SSIZE_T_MAX - incr) {
602             PyErr_SetString(PyExc_OverflowError,
603                             "encoded result is too long for a Python string");
604             return NULL;
605         }
606         size += incr;
607     }
608 
609     str = _PyBytesWriter_Prepare(writer, str, size);
610     if (str == NULL)
611         return NULL;
612 
613     /* generate replacement */
614     for (i = collstart; i < collend; ++i) {
615         ch = PyUnicode_READ(kind, data, i);
616         *str++ = '\\';
617         if (ch >= 0x00010000) {
618             *str++ = 'U';
619             *str++ = Py_hexdigits[(ch>>28)&0xf];
620             *str++ = Py_hexdigits[(ch>>24)&0xf];
621             *str++ = Py_hexdigits[(ch>>20)&0xf];
622             *str++ = Py_hexdigits[(ch>>16)&0xf];
623             *str++ = Py_hexdigits[(ch>>12)&0xf];
624             *str++ = Py_hexdigits[(ch>>8)&0xf];
625         }
626         else if (ch >= 0x100) {
627             *str++ = 'u';
628             *str++ = Py_hexdigits[(ch>>12)&0xf];
629             *str++ = Py_hexdigits[(ch>>8)&0xf];
630         }
631         else
632             *str++ = 'x';
633         *str++ = Py_hexdigits[(ch>>4)&0xf];
634         *str++ = Py_hexdigits[ch&0xf];
635     }
636     return str;
637 }
638 
639 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640    ASCII, Latin1, UTF-8, etc. */
641 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)642 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
643                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644 {
645     Py_ssize_t size, i;
646     Py_UCS4 ch;
647     enum PyUnicode_Kind kind;
648     void *data;
649 
650     assert(PyUnicode_IS_READY(unicode));
651     kind = PyUnicode_KIND(unicode);
652     data = PyUnicode_DATA(unicode);
653 
654     size = 0;
655     /* determine replacement size */
656     for (i = collstart; i < collend; ++i) {
657         Py_ssize_t incr;
658 
659         ch = PyUnicode_READ(kind, data, i);
660         if (ch < 10)
661             incr = 2+1+1;
662         else if (ch < 100)
663             incr = 2+2+1;
664         else if (ch < 1000)
665             incr = 2+3+1;
666         else if (ch < 10000)
667             incr = 2+4+1;
668         else if (ch < 100000)
669             incr = 2+5+1;
670         else if (ch < 1000000)
671             incr = 2+6+1;
672         else {
673             assert(ch <= MAX_UNICODE);
674             incr = 2+7+1;
675         }
676         if (size > PY_SSIZE_T_MAX - incr) {
677             PyErr_SetString(PyExc_OverflowError,
678                             "encoded result is too long for a Python string");
679             return NULL;
680         }
681         size += incr;
682     }
683 
684     str = _PyBytesWriter_Prepare(writer, str, size);
685     if (str == NULL)
686         return NULL;
687 
688     /* generate replacement */
689     for (i = collstart; i < collend; ++i) {
690         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691     }
692     return str;
693 }
694 
695 /* --- Bloom Filters ----------------------------------------------------- */
696 
697 /* stuff to implement simple "bloom filters" for Unicode characters.
698    to keep things simple, we use a single bitmask, using the least 5
699    bits from each unicode characters as the bit index. */
700 
701 /* the linebreak mask is set up by Unicode_Init below */
702 
703 #if LONG_BIT >= 128
704 #define BLOOM_WIDTH 128
705 #elif LONG_BIT >= 64
706 #define BLOOM_WIDTH 64
707 #elif LONG_BIT >= 32
708 #define BLOOM_WIDTH 32
709 #else
710 #error "LONG_BIT is smaller than 32"
711 #endif
712 
713 #define BLOOM_MASK unsigned long
714 
715 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
716 
717 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
718 
719 #define BLOOM_LINEBREAK(ch)                                             \
720     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
721      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
722 
723 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)724 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
725 {
726 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
727     do {                                               \
728         TYPE *data = (TYPE *)PTR;                      \
729         TYPE *end = data + LEN;                        \
730         Py_UCS4 ch;                                    \
731         for (; data != end; data++) {                  \
732             ch = *data;                                \
733             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734         }                                              \
735         break;                                         \
736     } while (0)
737 
738     /* calculate simple bloom-style bitmask for a given unicode string */
739 
740     BLOOM_MASK mask;
741 
742     mask = 0;
743     switch (kind) {
744     case PyUnicode_1BYTE_KIND:
745         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746         break;
747     case PyUnicode_2BYTE_KIND:
748         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749         break;
750     case PyUnicode_4BYTE_KIND:
751         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752         break;
753     default:
754         assert(0);
755     }
756     return mask;
757 
758 #undef BLOOM_UPDATE
759 }
760 
761 static int
ensure_unicode(PyObject * obj)762 ensure_unicode(PyObject *obj)
763 {
764     if (!PyUnicode_Check(obj)) {
765         PyErr_Format(PyExc_TypeError,
766                      "must be str, not %.100s",
767                      Py_TYPE(obj)->tp_name);
768         return -1;
769     }
770     return PyUnicode_READY(obj);
771 }
772 
773 /* Compilation of templated routines */
774 
775 #include "stringlib/asciilib.h"
776 #include "stringlib/fastsearch.h"
777 #include "stringlib/partition.h"
778 #include "stringlib/split.h"
779 #include "stringlib/count.h"
780 #include "stringlib/find.h"
781 #include "stringlib/find_max_char.h"
782 #include "stringlib/localeutil.h"
783 #include "stringlib/undef.h"
784 
785 #include "stringlib/ucs1lib.h"
786 #include "stringlib/fastsearch.h"
787 #include "stringlib/partition.h"
788 #include "stringlib/split.h"
789 #include "stringlib/count.h"
790 #include "stringlib/find.h"
791 #include "stringlib/replace.h"
792 #include "stringlib/find_max_char.h"
793 #include "stringlib/localeutil.h"
794 #include "stringlib/undef.h"
795 
796 #include "stringlib/ucs2lib.h"
797 #include "stringlib/fastsearch.h"
798 #include "stringlib/partition.h"
799 #include "stringlib/split.h"
800 #include "stringlib/count.h"
801 #include "stringlib/find.h"
802 #include "stringlib/replace.h"
803 #include "stringlib/find_max_char.h"
804 #include "stringlib/localeutil.h"
805 #include "stringlib/undef.h"
806 
807 #include "stringlib/ucs4lib.h"
808 #include "stringlib/fastsearch.h"
809 #include "stringlib/partition.h"
810 #include "stringlib/split.h"
811 #include "stringlib/count.h"
812 #include "stringlib/find.h"
813 #include "stringlib/replace.h"
814 #include "stringlib/find_max_char.h"
815 #include "stringlib/localeutil.h"
816 #include "stringlib/undef.h"
817 
818 #include "stringlib/unicodedefs.h"
819 #include "stringlib/fastsearch.h"
820 #include "stringlib/count.h"
821 #include "stringlib/find.h"
822 #include "stringlib/undef.h"
823 
824 /* --- Unicode Object ----------------------------------------------------- */
825 
826 static PyObject *
827 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
828 
829 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)830 findchar(const void *s, int kind,
831          Py_ssize_t size, Py_UCS4 ch,
832          int direction)
833 {
834     switch (kind) {
835     case PyUnicode_1BYTE_KIND:
836         if ((Py_UCS1) ch != ch)
837             return -1;
838         if (direction > 0)
839             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
840         else
841             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
842     case PyUnicode_2BYTE_KIND:
843         if ((Py_UCS2) ch != ch)
844             return -1;
845         if (direction > 0)
846             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
847         else
848             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
849     case PyUnicode_4BYTE_KIND:
850         if (direction > 0)
851             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
852         else
853             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
854     default:
855         assert(0);
856         return -1;
857     }
858 }
859 
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing happens if the string did not
   grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    /* kind is used as the character size in bytes */
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif
878 
879 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)880 resize_compact(PyObject *unicode, Py_ssize_t length)
881 {
882     Py_ssize_t char_size;
883     Py_ssize_t struct_size;
884     Py_ssize_t new_size;
885     int share_wstr;
886     PyObject *new_unicode;
887 #ifdef Py_DEBUG
888     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
889 #endif
890 
891     assert(unicode_modifiable(unicode));
892     assert(PyUnicode_IS_READY(unicode));
893     assert(PyUnicode_IS_COMPACT(unicode));
894 
895     char_size = PyUnicode_KIND(unicode);
896     if (PyUnicode_IS_ASCII(unicode))
897         struct_size = sizeof(PyASCIIObject);
898     else
899         struct_size = sizeof(PyCompactUnicodeObject);
900     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
901 
902     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
903         PyErr_NoMemory();
904         return NULL;
905     }
906     new_size = (struct_size + (length + 1) * char_size);
907 
908     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
909         PyObject_DEL(_PyUnicode_UTF8(unicode));
910         _PyUnicode_UTF8(unicode) = NULL;
911         _PyUnicode_UTF8_LENGTH(unicode) = 0;
912     }
913     _Py_DEC_REFTOTAL;
914     _Py_ForgetReference(unicode);
915 
916     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
917     if (new_unicode == NULL) {
918         _Py_NewReference(unicode);
919         PyErr_NoMemory();
920         return NULL;
921     }
922     unicode = new_unicode;
923     _Py_NewReference(unicode);
924 
925     _PyUnicode_LENGTH(unicode) = length;
926     if (share_wstr) {
927         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
928         if (!PyUnicode_IS_ASCII(unicode))
929             _PyUnicode_WSTR_LENGTH(unicode) = length;
930     }
931     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
932         PyObject_DEL(_PyUnicode_WSTR(unicode));
933         _PyUnicode_WSTR(unicode) = NULL;
934         if (!PyUnicode_IS_ASCII(unicode))
935             _PyUnicode_WSTR_LENGTH(unicode) = 0;
936     }
937 #ifdef Py_DEBUG
938     unicode_fill_invalid(unicode, old_length);
939 #endif
940     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
941                     length, 0);
942     assert(_PyUnicode_CheckConsistency(unicode, 0));
943     return unicode;
944 }
945 
946 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)947 resize_inplace(PyObject *unicode, Py_ssize_t length)
948 {
949     wchar_t *wstr;
950     Py_ssize_t new_size;
951     assert(!PyUnicode_IS_COMPACT(unicode));
952     assert(Py_REFCNT(unicode) == 1);
953 
954     if (PyUnicode_IS_READY(unicode)) {
955         Py_ssize_t char_size;
956         int share_wstr, share_utf8;
957         void *data;
958 #ifdef Py_DEBUG
959         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
960 #endif
961 
962         data = _PyUnicode_DATA_ANY(unicode);
963         char_size = PyUnicode_KIND(unicode);
964         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
965         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
966 
967         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
968             PyErr_NoMemory();
969             return -1;
970         }
971         new_size = (length + 1) * char_size;
972 
973         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
974         {
975             PyObject_DEL(_PyUnicode_UTF8(unicode));
976             _PyUnicode_UTF8(unicode) = NULL;
977             _PyUnicode_UTF8_LENGTH(unicode) = 0;
978         }
979 
980         data = (PyObject *)PyObject_REALLOC(data, new_size);
981         if (data == NULL) {
982             PyErr_NoMemory();
983             return -1;
984         }
985         _PyUnicode_DATA_ANY(unicode) = data;
986         if (share_wstr) {
987             _PyUnicode_WSTR(unicode) = data;
988             _PyUnicode_WSTR_LENGTH(unicode) = length;
989         }
990         if (share_utf8) {
991             _PyUnicode_UTF8(unicode) = data;
992             _PyUnicode_UTF8_LENGTH(unicode) = length;
993         }
994         _PyUnicode_LENGTH(unicode) = length;
995         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
996 #ifdef Py_DEBUG
997         unicode_fill_invalid(unicode, old_length);
998 #endif
999         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1000             assert(_PyUnicode_CheckConsistency(unicode, 0));
1001             return 0;
1002         }
1003     }
1004     assert(_PyUnicode_WSTR(unicode) != NULL);
1005 
1006     /* check for integer overflow */
1007     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1008         PyErr_NoMemory();
1009         return -1;
1010     }
1011     new_size = sizeof(wchar_t) * (length + 1);
1012     wstr =  _PyUnicode_WSTR(unicode);
1013     wstr = PyObject_REALLOC(wstr, new_size);
1014     if (!wstr) {
1015         PyErr_NoMemory();
1016         return -1;
1017     }
1018     _PyUnicode_WSTR(unicode) = wstr;
1019     _PyUnicode_WSTR(unicode)[length] = 0;
1020     _PyUnicode_WSTR_LENGTH(unicode) = length;
1021     assert(_PyUnicode_CheckConsistency(unicode, 0));
1022     return 0;
1023 }
1024 
1025 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1026 resize_copy(PyObject *unicode, Py_ssize_t length)
1027 {
1028     Py_ssize_t copy_length;
1029     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1030         PyObject *copy;
1031 
1032         if (PyUnicode_READY(unicode) == -1)
1033             return NULL;
1034 
1035         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1036         if (copy == NULL)
1037             return NULL;
1038 
1039         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1040         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1041         return copy;
1042     }
1043     else {
1044         PyObject *w;
1045 
1046         w = (PyObject*)_PyUnicode_New(length);
1047         if (w == NULL)
1048             return NULL;
1049         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1050         copy_length = Py_MIN(copy_length, length);
1051         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1052                   copy_length * sizeof(wchar_t));
1053         return w;
1054     }
1055 }
1056 
1057 /* We allocate one more byte to make sure the string is
1058    Ux0000 terminated; some code (e.g. new_identifier)
1059    relies on that.
1060 
1061    XXX This allocator could further be enhanced by assuring that the
1062    free list never reduces its size below 1.
1063 
1064 */
1065 
1066 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1067 _PyUnicode_New(Py_ssize_t length)
1068 {
1069     PyUnicodeObject *unicode;
1070     size_t new_size;
1071 
1072     /* Optimization for empty strings */
1073     if (length == 0 && unicode_empty != NULL) {
1074         Py_INCREF(unicode_empty);
1075         return (PyUnicodeObject*)unicode_empty;
1076     }
1077 
1078     /* Ensure we won't overflow the size. */
1079     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1080         return (PyUnicodeObject *)PyErr_NoMemory();
1081     }
1082     if (length < 0) {
1083         PyErr_SetString(PyExc_SystemError,
1084                         "Negative size passed to _PyUnicode_New");
1085         return NULL;
1086     }
1087 
1088     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1089     if (unicode == NULL)
1090         return NULL;
1091     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1092 
1093     _PyUnicode_WSTR_LENGTH(unicode) = length;
1094     _PyUnicode_HASH(unicode) = -1;
1095     _PyUnicode_STATE(unicode).interned = 0;
1096     _PyUnicode_STATE(unicode).kind = 0;
1097     _PyUnicode_STATE(unicode).compact = 0;
1098     _PyUnicode_STATE(unicode).ready = 0;
1099     _PyUnicode_STATE(unicode).ascii = 0;
1100     _PyUnicode_DATA_ANY(unicode) = NULL;
1101     _PyUnicode_LENGTH(unicode) = 0;
1102     _PyUnicode_UTF8(unicode) = NULL;
1103     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104 
1105     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1106     if (!_PyUnicode_WSTR(unicode)) {
1107         Py_DECREF(unicode);
1108         PyErr_NoMemory();
1109         return NULL;
1110     }
1111 
1112     /* Initialize the first element to guard against cases where
1113      * the caller fails before initializing str -- unicode_resize()
1114      * reads str[0], and the Keep-Alive optimization can keep memory
1115      * allocated for str alive across a call to unicode_dealloc(unicode).
1116      * We don't want unicode_resize to read uninitialized memory in
1117      * that case.
1118      */
1119     _PyUnicode_WSTR(unicode)[0] = 0;
1120     _PyUnicode_WSTR(unicode)[length] = 0;
1121 
1122     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1123     return unicode;
1124 }
1125 
1126 static const char*
unicode_kind_name(PyObject * unicode)1127 unicode_kind_name(PyObject *unicode)
1128 {
1129     /* don't check consistency: unicode_kind_name() is called from
1130        _PyUnicode_Dump() */
1131     if (!PyUnicode_IS_COMPACT(unicode))
1132     {
1133         if (!PyUnicode_IS_READY(unicode))
1134             return "wstr";
1135         switch (PyUnicode_KIND(unicode))
1136         {
1137         case PyUnicode_1BYTE_KIND:
1138             if (PyUnicode_IS_ASCII(unicode))
1139                 return "legacy ascii";
1140             else
1141                 return "legacy latin1";
1142         case PyUnicode_2BYTE_KIND:
1143             return "legacy UCS2";
1144         case PyUnicode_4BYTE_KIND:
1145             return "legacy UCS4";
1146         default:
1147             return "<legacy invalid kind>";
1148         }
1149     }
1150     assert(PyUnicode_IS_READY(unicode));
1151     switch (PyUnicode_KIND(unicode)) {
1152     case PyUnicode_1BYTE_KIND:
1153         if (PyUnicode_IS_ASCII(unicode))
1154             return "ascii";
1155         else
1156             return "latin1";
1157     case PyUnicode_2BYTE_KIND:
1158         return "UCS2";
1159     case PyUnicode_4BYTE_KIND:
1160         return "UCS4";
1161     default:
1162         return "<invalid compact kind>";
1163     }
1164 }
1165 
1166 #ifdef Py_DEBUG
1167 /* Functions wrapping macros for use in debugger */
/* Debugger aid: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1171 
/* Debugger aid: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
_PyUnicode_data(void * unicode)1175 void *_PyUnicode_data(void *unicode){
1176     printf("obj %p\n", unicode);
1177     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1178     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1179     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1180     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1181     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1182     return PyUnicode_DATA(unicode);
1183 }
1184 
1185 void
_PyUnicode_Dump(PyObject * op)1186 _PyUnicode_Dump(PyObject *op)
1187 {
1188     PyASCIIObject *ascii = (PyASCIIObject *)op;
1189     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1190     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1191     void *data;
1192 
1193     if (ascii->state.compact)
1194     {
1195         if (ascii->state.ascii)
1196             data = (ascii + 1);
1197         else
1198             data = (compact + 1);
1199     }
1200     else
1201         data = unicode->data.any;
1202     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1203            unicode_kind_name(op), ascii->length);
1204 
1205     if (ascii->wstr == data)
1206         printf("shared ");
1207     printf("wstr=%p", ascii->wstr);
1208 
1209     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1210         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1211         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1212             printf("shared ");
1213         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1214                compact->utf8, compact->utf8_length);
1215     }
1216     printf(", data=%p\n", data);
1217 }
1218 #endif
1219 
1220 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1221 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1222 {
1223     PyObject *obj;
1224     PyCompactUnicodeObject *unicode;
1225     void *data;
1226     enum PyUnicode_Kind kind;
1227     int is_sharing, is_ascii;
1228     Py_ssize_t char_size;
1229     Py_ssize_t struct_size;
1230 
1231     /* Optimization for empty strings */
1232     if (size == 0 && unicode_empty != NULL) {
1233         Py_INCREF(unicode_empty);
1234         return unicode_empty;
1235     }
1236 
1237     is_ascii = 0;
1238     is_sharing = 0;
1239     struct_size = sizeof(PyCompactUnicodeObject);
1240     if (maxchar < 128) {
1241         kind = PyUnicode_1BYTE_KIND;
1242         char_size = 1;
1243         is_ascii = 1;
1244         struct_size = sizeof(PyASCIIObject);
1245     }
1246     else if (maxchar < 256) {
1247         kind = PyUnicode_1BYTE_KIND;
1248         char_size = 1;
1249     }
1250     else if (maxchar < 65536) {
1251         kind = PyUnicode_2BYTE_KIND;
1252         char_size = 2;
1253         if (sizeof(wchar_t) == 2)
1254             is_sharing = 1;
1255     }
1256     else {
1257         if (maxchar > MAX_UNICODE) {
1258             PyErr_SetString(PyExc_SystemError,
1259                             "invalid maximum character passed to PyUnicode_New");
1260             return NULL;
1261         }
1262         kind = PyUnicode_4BYTE_KIND;
1263         char_size = 4;
1264         if (sizeof(wchar_t) == 4)
1265             is_sharing = 1;
1266     }
1267 
1268     /* Ensure we won't overflow the size. */
1269     if (size < 0) {
1270         PyErr_SetString(PyExc_SystemError,
1271                         "Negative size passed to PyUnicode_New");
1272         return NULL;
1273     }
1274     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1275         return PyErr_NoMemory();
1276 
1277     /* Duplicated allocation code from _PyObject_New() instead of a call to
1278      * PyObject_New() so we are able to allocate space for the object and
1279      * it's data buffer.
1280      */
1281     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1282     if (obj == NULL)
1283         return PyErr_NoMemory();
1284     obj = PyObject_INIT(obj, &PyUnicode_Type);
1285     if (obj == NULL)
1286         return NULL;
1287 
1288     unicode = (PyCompactUnicodeObject *)obj;
1289     if (is_ascii)
1290         data = ((PyASCIIObject*)obj) + 1;
1291     else
1292         data = unicode + 1;
1293     _PyUnicode_LENGTH(unicode) = size;
1294     _PyUnicode_HASH(unicode) = -1;
1295     _PyUnicode_STATE(unicode).interned = 0;
1296     _PyUnicode_STATE(unicode).kind = kind;
1297     _PyUnicode_STATE(unicode).compact = 1;
1298     _PyUnicode_STATE(unicode).ready = 1;
1299     _PyUnicode_STATE(unicode).ascii = is_ascii;
1300     if (is_ascii) {
1301         ((char*)data)[size] = 0;
1302         _PyUnicode_WSTR(unicode) = NULL;
1303     }
1304     else if (kind == PyUnicode_1BYTE_KIND) {
1305         ((char*)data)[size] = 0;
1306         _PyUnicode_WSTR(unicode) = NULL;
1307         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1308         unicode->utf8 = NULL;
1309         unicode->utf8_length = 0;
1310     }
1311     else {
1312         unicode->utf8 = NULL;
1313         unicode->utf8_length = 0;
1314         if (kind == PyUnicode_2BYTE_KIND)
1315             ((Py_UCS2*)data)[size] = 0;
1316         else /* kind == PyUnicode_4BYTE_KIND */
1317             ((Py_UCS4*)data)[size] = 0;
1318         if (is_sharing) {
1319             _PyUnicode_WSTR_LENGTH(unicode) = size;
1320             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1321         }
1322         else {
1323             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1324             _PyUnicode_WSTR(unicode) = NULL;
1325         }
1326     }
1327 #ifdef Py_DEBUG
1328     unicode_fill_invalid((PyObject*)unicode, 0);
1329 #endif
1330     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1331     return obj;
1332 }
1333 
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store it in
   the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* a well-formed pair collapses into a single code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1373 
1374 static int
unicode_check_modifiable(PyObject * unicode)1375 unicode_check_modifiable(PyObject *unicode)
1376 {
1377     if (!unicode_modifiable(unicode)) {
1378         PyErr_SetString(PyExc_SystemError,
1379                         "Cannot modify a string currently used");
1380         return -1;
1381     }
1382     return 0;
1383 }
1384 
1385 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1386 _copy_characters(PyObject *to, Py_ssize_t to_start,
1387                  PyObject *from, Py_ssize_t from_start,
1388                  Py_ssize_t how_many, int check_maxchar)
1389 {
1390     unsigned int from_kind, to_kind;
1391     void *from_data, *to_data;
1392 
1393     assert(0 <= how_many);
1394     assert(0 <= from_start);
1395     assert(0 <= to_start);
1396     assert(PyUnicode_Check(from));
1397     assert(PyUnicode_IS_READY(from));
1398     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1399 
1400     assert(PyUnicode_Check(to));
1401     assert(PyUnicode_IS_READY(to));
1402     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1403 
1404     if (how_many == 0)
1405         return 0;
1406 
1407     from_kind = PyUnicode_KIND(from);
1408     from_data = PyUnicode_DATA(from);
1409     to_kind = PyUnicode_KIND(to);
1410     to_data = PyUnicode_DATA(to);
1411 
1412 #ifdef Py_DEBUG
1413     if (!check_maxchar
1414         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1415     {
1416         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1417         Py_UCS4 ch;
1418         Py_ssize_t i;
1419         for (i=0; i < how_many; i++) {
1420             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1421             assert(ch <= to_maxchar);
1422         }
1423     }
1424 #endif
1425 
1426     if (from_kind == to_kind) {
1427         if (check_maxchar
1428             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1429         {
1430             /* Writing Latin-1 characters into an ASCII string requires to
1431                check that all written characters are pure ASCII */
1432             Py_UCS4 max_char;
1433             max_char = ucs1lib_find_max_char(from_data,
1434                                              (Py_UCS1*)from_data + how_many);
1435             if (max_char >= 128)
1436                 return -1;
1437         }
1438         memcpy((char*)to_data + to_kind * to_start,
1439                   (char*)from_data + from_kind * from_start,
1440                   to_kind * how_many);
1441     }
1442     else if (from_kind == PyUnicode_1BYTE_KIND
1443              && to_kind == PyUnicode_2BYTE_KIND)
1444     {
1445         _PyUnicode_CONVERT_BYTES(
1446             Py_UCS1, Py_UCS2,
1447             PyUnicode_1BYTE_DATA(from) + from_start,
1448             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1449             PyUnicode_2BYTE_DATA(to) + to_start
1450             );
1451     }
1452     else if (from_kind == PyUnicode_1BYTE_KIND
1453              && to_kind == PyUnicode_4BYTE_KIND)
1454     {
1455         _PyUnicode_CONVERT_BYTES(
1456             Py_UCS1, Py_UCS4,
1457             PyUnicode_1BYTE_DATA(from) + from_start,
1458             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1459             PyUnicode_4BYTE_DATA(to) + to_start
1460             );
1461     }
1462     else if (from_kind == PyUnicode_2BYTE_KIND
1463              && to_kind == PyUnicode_4BYTE_KIND)
1464     {
1465         _PyUnicode_CONVERT_BYTES(
1466             Py_UCS2, Py_UCS4,
1467             PyUnicode_2BYTE_DATA(from) + from_start,
1468             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1469             PyUnicode_4BYTE_DATA(to) + to_start
1470             );
1471     }
1472     else {
1473         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1474 
1475         if (!check_maxchar) {
1476             if (from_kind == PyUnicode_2BYTE_KIND
1477                 && to_kind == PyUnicode_1BYTE_KIND)
1478             {
1479                 _PyUnicode_CONVERT_BYTES(
1480                     Py_UCS2, Py_UCS1,
1481                     PyUnicode_2BYTE_DATA(from) + from_start,
1482                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1483                     PyUnicode_1BYTE_DATA(to) + to_start
1484                     );
1485             }
1486             else if (from_kind == PyUnicode_4BYTE_KIND
1487                      && to_kind == PyUnicode_1BYTE_KIND)
1488             {
1489                 _PyUnicode_CONVERT_BYTES(
1490                     Py_UCS4, Py_UCS1,
1491                     PyUnicode_4BYTE_DATA(from) + from_start,
1492                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1493                     PyUnicode_1BYTE_DATA(to) + to_start
1494                     );
1495             }
1496             else if (from_kind == PyUnicode_4BYTE_KIND
1497                      && to_kind == PyUnicode_2BYTE_KIND)
1498             {
1499                 _PyUnicode_CONVERT_BYTES(
1500                     Py_UCS4, Py_UCS2,
1501                     PyUnicode_4BYTE_DATA(from) + from_start,
1502                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1503                     PyUnicode_2BYTE_DATA(to) + to_start
1504                     );
1505             }
1506             else {
1507                 assert(0);
1508                 return -1;
1509             }
1510         }
1511         else {
1512             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1513             Py_UCS4 ch;
1514             Py_ssize_t i;
1515 
1516             for (i=0; i < how_many; i++) {
1517                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1518                 if (ch > to_maxchar)
1519                     return -1;
1520                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1521             }
1522         }
1523     }
1524     return 0;
1525 }
1526 
1527 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1528 _PyUnicode_FastCopyCharacters(
1529     PyObject *to, Py_ssize_t to_start,
1530     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1531 {
1532     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1533 }
1534 
1535 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1536 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1537                          PyObject *from, Py_ssize_t from_start,
1538                          Py_ssize_t how_many)
1539 {
1540     int err;
1541 
1542     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1543         PyErr_BadInternalCall();
1544         return -1;
1545     }
1546 
1547     if (PyUnicode_READY(from) == -1)
1548         return -1;
1549     if (PyUnicode_READY(to) == -1)
1550         return -1;
1551 
1552     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1553         PyErr_SetString(PyExc_IndexError, "string index out of range");
1554         return -1;
1555     }
1556     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1557         PyErr_SetString(PyExc_IndexError, "string index out of range");
1558         return -1;
1559     }
1560     if (how_many < 0) {
1561         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1562         return -1;
1563     }
1564     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1565     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1566         PyErr_Format(PyExc_SystemError,
1567                      "Cannot write %zi characters at %zi "
1568                      "in a string of %zi characters",
1569                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1570         return -1;
1571     }
1572 
1573     if (how_many == 0)
1574         return 0;
1575 
1576     if (unicode_check_modifiable(to))
1577         return -1;
1578 
1579     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1580     if (err) {
1581         PyErr_Format(PyExc_SystemError,
1582                      "Cannot copy %s characters "
1583                      "into a string of %s characters",
1584                      unicode_kind_name(from),
1585                      unicode_kind_name(to));
1586         return -1;
1587     }
1588     return how_many;
1589 }
1590 
1591 /* Find the maximum code point and count the number of surrogate pairs so a
1592    correct string length can be computed before converting a string to UCS4.
1593    This function counts single surrogates as a character and not as a pair.
1594 
1595    Return 0 on success, or -1 on error. */
1596 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1597 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1598                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1599 {
1600     const wchar_t *iter;
1601     Py_UCS4 ch;
1602 
1603     assert(num_surrogates != NULL && maxchar != NULL);
1604     *num_surrogates = 0;
1605     *maxchar = 0;
1606 
1607     for (iter = begin; iter < end; ) {
1608 #if SIZEOF_WCHAR_T == 2
1609         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1610             && (iter+1) < end
1611             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1612         {
1613             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1614             ++(*num_surrogates);
1615             iter += 2;
1616         }
1617         else
1618 #endif
1619         {
1620             ch = *iter;
1621             iter++;
1622         }
1623         if (ch > *maxchar) {
1624             *maxchar = ch;
1625             if (*maxchar > MAX_UNICODE) {
1626                 PyErr_Format(PyExc_ValueError,
1627                              "character U+%x is not in range [U+0000; U+10ffff]",
1628                              ch);
1629                 return -1;
1630             }
1631         }
1632     }
1633     return 0;
1634 }
1635 
1636 int
_PyUnicode_Ready(PyObject * unicode)1637 _PyUnicode_Ready(PyObject *unicode)
1638 {
1639     wchar_t *end;
1640     Py_UCS4 maxchar = 0;
1641     Py_ssize_t num_surrogates;
1642 #if SIZEOF_WCHAR_T == 2
1643     Py_ssize_t length_wo_surrogates;
1644 #endif
1645 
1646     /* _PyUnicode_Ready() is only intended for old-style API usage where
1647        strings were created using _PyObject_New() and where no canonical
1648        representation (the str field) has been set yet aka strings
1649        which are not yet ready. */
1650     assert(_PyUnicode_CHECK(unicode));
1651     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1652     assert(_PyUnicode_WSTR(unicode) != NULL);
1653     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1654     assert(_PyUnicode_UTF8(unicode) == NULL);
1655     /* Actually, it should neither be interned nor be anything else: */
1656     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1657 
1658     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1659     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1660                                 &maxchar, &num_surrogates) == -1)
1661         return -1;
1662 
1663     if (maxchar < 256) {
1664         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1665         if (!_PyUnicode_DATA_ANY(unicode)) {
1666             PyErr_NoMemory();
1667             return -1;
1668         }
1669         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1670                                 _PyUnicode_WSTR(unicode), end,
1671                                 PyUnicode_1BYTE_DATA(unicode));
1672         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1673         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1674         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1675         if (maxchar < 128) {
1676             _PyUnicode_STATE(unicode).ascii = 1;
1677             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1678             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1679         }
1680         else {
1681             _PyUnicode_STATE(unicode).ascii = 0;
1682             _PyUnicode_UTF8(unicode) = NULL;
1683             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1684         }
1685         PyObject_FREE(_PyUnicode_WSTR(unicode));
1686         _PyUnicode_WSTR(unicode) = NULL;
1687         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1688     }
1689     /* In this case we might have to convert down from 4-byte native
1690        wchar_t to 2-byte unicode. */
1691     else if (maxchar < 65536) {
1692         assert(num_surrogates == 0 &&
1693                "FindMaxCharAndNumSurrogatePairs() messed up");
1694 
1695 #if SIZEOF_WCHAR_T == 2
1696         /* We can share representations and are done. */
1697         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1698         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1699         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1700         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1701         _PyUnicode_UTF8(unicode) = NULL;
1702         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1703 #else
1704         /* sizeof(wchar_t) == 4 */
1705         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1706             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1707         if (!_PyUnicode_DATA_ANY(unicode)) {
1708             PyErr_NoMemory();
1709             return -1;
1710         }
1711         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1712                                 _PyUnicode_WSTR(unicode), end,
1713                                 PyUnicode_2BYTE_DATA(unicode));
1714         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1715         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1716         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1717         _PyUnicode_UTF8(unicode) = NULL;
1718         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1719         PyObject_FREE(_PyUnicode_WSTR(unicode));
1720         _PyUnicode_WSTR(unicode) = NULL;
1721         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1722 #endif
1723     }
1724     /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1725     else {
1726 #if SIZEOF_WCHAR_T == 2
1727         /* in case the native representation is 2-bytes, we need to allocate a
1728            new normalized 4-byte version. */
1729         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1730         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1731             PyErr_NoMemory();
1732             return -1;
1733         }
1734         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1735         if (!_PyUnicode_DATA_ANY(unicode)) {
1736             PyErr_NoMemory();
1737             return -1;
1738         }
1739         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1740         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1741         _PyUnicode_UTF8(unicode) = NULL;
1742         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1743         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1744         _PyUnicode_STATE(unicode).ready = 1;
1745         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1746         PyObject_FREE(_PyUnicode_WSTR(unicode));
1747         _PyUnicode_WSTR(unicode) = NULL;
1748         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1749 #else
1750         assert(num_surrogates == 0);
1751 
1752         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1753         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1754         _PyUnicode_UTF8(unicode) = NULL;
1755         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1756         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1757 #endif
1758         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1759     }
1760     _PyUnicode_STATE(unicode).ready = 1;
1761     assert(_PyUnicode_CheckConsistency(unicode, 1));
1762     return 0;
1763 }
1764 
1765 static void
unicode_dealloc(PyObject * unicode)1766 unicode_dealloc(PyObject *unicode)
1767 {
1768     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1769     case SSTATE_NOT_INTERNED:
1770         break;
1771 
1772     case SSTATE_INTERNED_MORTAL:
1773         /* revive dead object temporarily for DelItem */
1774         Py_REFCNT(unicode) = 3;
1775         if (PyDict_DelItem(interned, unicode) != 0)
1776             Py_FatalError(
1777                 "deletion of interned string failed");
1778         break;
1779 
1780     case SSTATE_INTERNED_IMMORTAL:
1781         Py_FatalError("Immortal interned string died.");
1782 
1783     default:
1784         Py_FatalError("Inconsistent interned string state.");
1785     }
1786 
1787     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1788         PyObject_DEL(_PyUnicode_WSTR(unicode));
1789     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1790         PyObject_DEL(_PyUnicode_UTF8(unicode));
1791     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1792         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1793 
1794     Py_TYPE(unicode)->tp_free(unicode);
1795 }
1796 
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached single character below 256 stored in
   unicode_latin1), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only ready (non-wchar-kind) strings of length 1 can be cached. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1813 
1814 static int
unicode_modifiable(PyObject * unicode)1815 unicode_modifiable(PyObject *unicode)
1816 {
1817     assert(_PyUnicode_CHECK(unicode));
1818     if (Py_REFCNT(unicode) != 1)
1819         return 0;
1820     if (_PyUnicode_HASH(unicode) != -1)
1821         return 0;
1822     if (PyUnicode_CHECK_INTERNED(unicode))
1823         return 0;
1824     if (!PyUnicode_CheckExact(unicode))
1825         return 0;
1826 #ifdef Py_DEBUG
1827     /* singleton refcount is greater than 1 */
1828     assert(!unicode_is_singleton(unicode));
1829 #endif
1830     return 1;
1831 }
1832 
1833 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1834 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1835 {
1836     PyObject *unicode;
1837     Py_ssize_t old_length;
1838 
1839     assert(p_unicode != NULL);
1840     unicode = *p_unicode;
1841 
1842     assert(unicode != NULL);
1843     assert(PyUnicode_Check(unicode));
1844     assert(0 <= length);
1845 
1846     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1847         old_length = PyUnicode_WSTR_LENGTH(unicode);
1848     else
1849         old_length = PyUnicode_GET_LENGTH(unicode);
1850     if (old_length == length)
1851         return 0;
1852 
1853     if (length == 0) {
1854         _Py_INCREF_UNICODE_EMPTY();
1855         if (!unicode_empty)
1856             return -1;
1857         Py_SETREF(*p_unicode, unicode_empty);
1858         return 0;
1859     }
1860 
1861     if (!unicode_modifiable(unicode)) {
1862         PyObject *copy = resize_copy(unicode, length);
1863         if (copy == NULL)
1864             return -1;
1865         Py_SETREF(*p_unicode, copy);
1866         return 0;
1867     }
1868 
1869     if (PyUnicode_IS_COMPACT(unicode)) {
1870         PyObject *new_unicode = resize_compact(unicode, length);
1871         if (new_unicode == NULL)
1872             return -1;
1873         *p_unicode = new_unicode;
1874         return 0;
1875     }
1876     return resize_inplace(unicode, length);
1877 }
1878 
1879 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1880 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1881 {
1882     PyObject *unicode;
1883     if (p_unicode == NULL) {
1884         PyErr_BadInternalCall();
1885         return -1;
1886     }
1887     unicode = *p_unicode;
1888     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1889     {
1890         PyErr_BadInternalCall();
1891         return -1;
1892     }
1893     return unicode_resize(p_unicode, length);
1894 }
1895 
1896 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1897 
1898    WARNING: The function doesn't copy the terminating null character and
1899    doesn't check the maximum character (may write a latin1 character in an
1900    ASCII string). */
1901 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1902 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903                    const char *str, Py_ssize_t len)
1904 {
1905     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1906     void *data = PyUnicode_DATA(unicode);
1907     const char *end = str + len;
1908 
1909     switch (kind) {
1910     case PyUnicode_1BYTE_KIND: {
1911         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1912 #ifdef Py_DEBUG
1913         if (PyUnicode_IS_ASCII(unicode)) {
1914             Py_UCS4 maxchar = ucs1lib_find_max_char(
1915                 (const Py_UCS1*)str,
1916                 (const Py_UCS1*)str + len);
1917             assert(maxchar < 128);
1918         }
1919 #endif
1920         memcpy((char *) data + index, str, len);
1921         break;
1922     }
1923     case PyUnicode_2BYTE_KIND: {
1924         Py_UCS2 *start = (Py_UCS2 *)data + index;
1925         Py_UCS2 *ucs2 = start;
1926         assert(index <= PyUnicode_GET_LENGTH(unicode));
1927 
1928         for (; str < end; ++ucs2, ++str)
1929             *ucs2 = (Py_UCS2)*str;
1930 
1931         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1932         break;
1933     }
1934     default: {
1935         Py_UCS4 *start = (Py_UCS4 *)data + index;
1936         Py_UCS4 *ucs4 = start;
1937         assert(kind == PyUnicode_4BYTE_KIND);
1938         assert(index <= PyUnicode_GET_LENGTH(unicode));
1939 
1940         for (; str < end; ++ucs4, ++str)
1941             *ucs4 = (Py_UCS4)*str;
1942 
1943         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1944     }
1945     }
1946 }
1947 
1948 static PyObject*
get_latin1_char(unsigned char ch)1949 get_latin1_char(unsigned char ch)
1950 {
1951     PyObject *unicode = unicode_latin1[ch];
1952     if (!unicode) {
1953         unicode = PyUnicode_New(1, ch);
1954         if (!unicode)
1955             return NULL;
1956         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1957         assert(_PyUnicode_CheckConsistency(unicode, 1));
1958         unicode_latin1[ch] = unicode;
1959     }
1960     Py_INCREF(unicode);
1961     return unicode;
1962 }
1963 
1964 static PyObject*
unicode_char(Py_UCS4 ch)1965 unicode_char(Py_UCS4 ch)
1966 {
1967     PyObject *unicode;
1968 
1969     assert(ch <= MAX_UNICODE);
1970 
1971     if (ch < 256)
1972         return get_latin1_char(ch);
1973 
1974     unicode = PyUnicode_New(1, ch);
1975     if (unicode == NULL)
1976         return NULL;
1977     switch (PyUnicode_KIND(unicode)) {
1978     case PyUnicode_1BYTE_KIND:
1979         PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1980         break;
1981     case PyUnicode_2BYTE_KIND:
1982         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1983         break;
1984     default:
1985         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1986         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1987     }
1988     assert(_PyUnicode_CheckConsistency(unicode, 1));
1989     return unicode;
1990 }
1991 
1992 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)1993 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1994 {
1995     PyObject *unicode;
1996     Py_UCS4 maxchar = 0;
1997     Py_ssize_t num_surrogates;
1998 
1999     if (u == NULL)
2000         return (PyObject*)_PyUnicode_New(size);
2001 
2002     /* If the Unicode data is known at construction time, we can apply
2003        some optimizations which share commonly used objects. */
2004 
2005     /* Optimization for empty strings */
2006     if (size == 0)
2007         _Py_RETURN_UNICODE_EMPTY();
2008 
2009     /* Single character Unicode objects in the Latin-1 range are
2010        shared when using this constructor */
2011     if (size == 1 && (Py_UCS4)*u < 256)
2012         return get_latin1_char((unsigned char)*u);
2013 
2014     /* If not empty and not single character, copy the Unicode data
2015        into the new object */
2016     if (find_maxchar_surrogates(u, u + size,
2017                                 &maxchar, &num_surrogates) == -1)
2018         return NULL;
2019 
2020     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2021     if (!unicode)
2022         return NULL;
2023 
2024     switch (PyUnicode_KIND(unicode)) {
2025     case PyUnicode_1BYTE_KIND:
2026         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2027                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2028         break;
2029     case PyUnicode_2BYTE_KIND:
2030 #if Py_UNICODE_SIZE == 2
2031         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2032 #else
2033         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2034                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2035 #endif
2036         break;
2037     case PyUnicode_4BYTE_KIND:
2038 #if SIZEOF_WCHAR_T == 2
2039         /* This is the only case which has to process surrogates, thus
2040            a simple copy loop is not enough and we need a function. */
2041         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2042 #else
2043         assert(num_surrogates == 0);
2044         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2045 #endif
2046         break;
2047     default:
2048         assert(0 && "Impossible state");
2049     }
2050 
2051     return unicode_result(unicode);
2052 }
2053 
2054 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2055 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2056 {
2057     if (size < 0) {
2058         PyErr_SetString(PyExc_SystemError,
2059                         "Negative size passed to PyUnicode_FromStringAndSize");
2060         return NULL;
2061     }
2062     if (u != NULL)
2063         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2064     else
2065         return (PyObject *)_PyUnicode_New(size);
2066 }
2067 
2068 PyObject *
PyUnicode_FromString(const char * u)2069 PyUnicode_FromString(const char *u)
2070 {
2071     size_t size = strlen(u);
2072     if (size > PY_SSIZE_T_MAX) {
2073         PyErr_SetString(PyExc_OverflowError, "input too long");
2074         return NULL;
2075     }
2076     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2077 }
2078 
2079 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2080 _PyUnicode_FromId(_Py_Identifier *id)
2081 {
2082     if (!id->object) {
2083         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2084                                                   strlen(id->string),
2085                                                   NULL, NULL);
2086         if (!id->object)
2087             return NULL;
2088         PyUnicode_InternInPlace(&id->object);
2089         assert(!id->next);
2090         id->next = static_strings;
2091         static_strings = id;
2092     }
2093     return id->object;
2094 }
2095 
2096 void
_PyUnicode_ClearStaticStrings()2097 _PyUnicode_ClearStaticStrings()
2098 {
2099     _Py_Identifier *tmp, *s = static_strings;
2100     while (s) {
2101         Py_CLEAR(s->object);
2102         tmp = s->next;
2103         s->next = NULL;
2104         s = tmp;
2105     }
2106     static_strings = NULL;
2107 }
2108 
2109 /* Internal function, doesn't check maximum character */
2110 
2111 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2112 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2113 {
2114     const unsigned char *s = (const unsigned char *)buffer;
2115     PyObject *unicode;
2116     if (size == 1) {
2117 #ifdef Py_DEBUG
2118         assert((unsigned char)s[0] < 128);
2119 #endif
2120         return get_latin1_char(s[0]);
2121     }
2122     unicode = PyUnicode_New(size, 127);
2123     if (!unicode)
2124         return NULL;
2125     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2126     assert(_PyUnicode_CheckConsistency(unicode, 1));
2127     return unicode;
2128 }
2129 
2130 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2131 kind_maxchar_limit(unsigned int kind)
2132 {
2133     switch (kind) {
2134     case PyUnicode_1BYTE_KIND:
2135         return 0x80;
2136     case PyUnicode_2BYTE_KIND:
2137         return 0x100;
2138     case PyUnicode_4BYTE_KIND:
2139         return 0x10000;
2140     default:
2141         assert(0 && "invalid kind");
2142         return MAX_UNICODE;
2143     }
2144 }
2145 
2146 static inline Py_UCS4
align_maxchar(Py_UCS4 maxchar)2147 align_maxchar(Py_UCS4 maxchar)
2148 {
2149     if (maxchar <= 127)
2150         return 127;
2151     else if (maxchar <= 255)
2152         return 255;
2153     else if (maxchar <= 65535)
2154         return 65535;
2155     else
2156         return MAX_UNICODE;
2157 }
2158 
2159 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2160 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2161 {
2162     PyObject *res;
2163     unsigned char max_char;
2164 
2165     if (size == 0)
2166         _Py_RETURN_UNICODE_EMPTY();
2167     assert(size > 0);
2168     if (size == 1)
2169         return get_latin1_char(u[0]);
2170 
2171     max_char = ucs1lib_find_max_char(u, u + size);
2172     res = PyUnicode_New(size, max_char);
2173     if (!res)
2174         return NULL;
2175     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2176     assert(_PyUnicode_CheckConsistency(res, 1));
2177     return res;
2178 }
2179 
2180 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2181 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2182 {
2183     PyObject *res;
2184     Py_UCS2 max_char;
2185 
2186     if (size == 0)
2187         _Py_RETURN_UNICODE_EMPTY();
2188     assert(size > 0);
2189     if (size == 1)
2190         return unicode_char(u[0]);
2191 
2192     max_char = ucs2lib_find_max_char(u, u + size);
2193     res = PyUnicode_New(size, max_char);
2194     if (!res)
2195         return NULL;
2196     if (max_char >= 256)
2197         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2198     else {
2199         _PyUnicode_CONVERT_BYTES(
2200             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2201     }
2202     assert(_PyUnicode_CheckConsistency(res, 1));
2203     return res;
2204 }
2205 
2206 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2207 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2208 {
2209     PyObject *res;
2210     Py_UCS4 max_char;
2211 
2212     if (size == 0)
2213         _Py_RETURN_UNICODE_EMPTY();
2214     assert(size > 0);
2215     if (size == 1)
2216         return unicode_char(u[0]);
2217 
2218     max_char = ucs4lib_find_max_char(u, u + size);
2219     res = PyUnicode_New(size, max_char);
2220     if (!res)
2221         return NULL;
2222     if (max_char < 256)
2223         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2224                                  PyUnicode_1BYTE_DATA(res));
2225     else if (max_char < 0x10000)
2226         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2227                                  PyUnicode_2BYTE_DATA(res));
2228     else
2229         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2230     assert(_PyUnicode_CheckConsistency(res, 1));
2231     return res;
2232 }
2233 
2234 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2235 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2236 {
2237     if (size < 0) {
2238         PyErr_SetString(PyExc_ValueError, "size must be positive");
2239         return NULL;
2240     }
2241     switch (kind) {
2242     case PyUnicode_1BYTE_KIND:
2243         return _PyUnicode_FromUCS1(buffer, size);
2244     case PyUnicode_2BYTE_KIND:
2245         return _PyUnicode_FromUCS2(buffer, size);
2246     case PyUnicode_4BYTE_KIND:
2247         return _PyUnicode_FromUCS4(buffer, size);
2248     default:
2249         PyErr_SetString(PyExc_SystemError, "invalid kind");
2250         return NULL;
2251     }
2252 }
2253 
2254 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2255 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2256 {
2257     enum PyUnicode_Kind kind;
2258     void *startptr, *endptr;
2259 
2260     assert(PyUnicode_IS_READY(unicode));
2261     assert(0 <= start);
2262     assert(end <= PyUnicode_GET_LENGTH(unicode));
2263     assert(start <= end);
2264 
2265     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2266         return PyUnicode_MAX_CHAR_VALUE(unicode);
2267 
2268     if (start == end)
2269         return 127;
2270 
2271     if (PyUnicode_IS_ASCII(unicode))
2272         return 127;
2273 
2274     kind = PyUnicode_KIND(unicode);
2275     startptr = PyUnicode_DATA(unicode);
2276     endptr = (char *)startptr + end * kind;
2277     startptr = (char *)startptr + start * kind;
2278     switch(kind) {
2279     case PyUnicode_1BYTE_KIND:
2280         return ucs1lib_find_max_char(startptr, endptr);
2281     case PyUnicode_2BYTE_KIND:
2282         return ucs2lib_find_max_char(startptr, endptr);
2283     case PyUnicode_4BYTE_KIND:
2284         return ucs4lib_find_max_char(startptr, endptr);
2285     default:
2286         assert(0);
2287         return 0;
2288     }
2289 }
2290 
2291 /* Ensure that a string uses the most efficient storage, if it is not the
2292    case: create a new string with of the right kind. Write NULL into *p_unicode
2293    on error. */
2294 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2295 unicode_adjust_maxchar(PyObject **p_unicode)
2296 {
2297     PyObject *unicode, *copy;
2298     Py_UCS4 max_char;
2299     Py_ssize_t len;
2300     unsigned int kind;
2301 
2302     assert(p_unicode != NULL);
2303     unicode = *p_unicode;
2304     assert(PyUnicode_IS_READY(unicode));
2305     if (PyUnicode_IS_ASCII(unicode))
2306         return;
2307 
2308     len = PyUnicode_GET_LENGTH(unicode);
2309     kind = PyUnicode_KIND(unicode);
2310     if (kind == PyUnicode_1BYTE_KIND) {
2311         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2312         max_char = ucs1lib_find_max_char(u, u + len);
2313         if (max_char >= 128)
2314             return;
2315     }
2316     else if (kind == PyUnicode_2BYTE_KIND) {
2317         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2318         max_char = ucs2lib_find_max_char(u, u + len);
2319         if (max_char >= 256)
2320             return;
2321     }
2322     else {
2323         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2324         assert(kind == PyUnicode_4BYTE_KIND);
2325         max_char = ucs4lib_find_max_char(u, u + len);
2326         if (max_char >= 0x10000)
2327             return;
2328     }
2329     copy = PyUnicode_New(len, max_char);
2330     if (copy != NULL)
2331         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2332     Py_DECREF(unicode);
2333     *p_unicode = copy;
2334 }
2335 
2336 PyObject*
_PyUnicode_Copy(PyObject * unicode)2337 _PyUnicode_Copy(PyObject *unicode)
2338 {
2339     Py_ssize_t length;
2340     PyObject *copy;
2341 
2342     if (!PyUnicode_Check(unicode)) {
2343         PyErr_BadInternalCall();
2344         return NULL;
2345     }
2346     if (PyUnicode_READY(unicode) == -1)
2347         return NULL;
2348 
2349     length = PyUnicode_GET_LENGTH(unicode);
2350     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2351     if (!copy)
2352         return NULL;
2353     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2354 
2355     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2356               length * PyUnicode_KIND(unicode));
2357     assert(_PyUnicode_CheckConsistency(copy, 1));
2358     return copy;
2359 }
2360 
2361 
2362 /* Widen Unicode objects to larger buffers. Don't write terminating null
2363    character. Return NULL on error. */
2364 
2365 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2366 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2367 {
2368     Py_ssize_t len;
2369     void *result;
2370     unsigned int skind;
2371 
2372     if (PyUnicode_READY(s) == -1)
2373         return NULL;
2374 
2375     len = PyUnicode_GET_LENGTH(s);
2376     skind = PyUnicode_KIND(s);
2377     if (skind >= kind) {
2378         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2379         return NULL;
2380     }
2381     switch (kind) {
2382     case PyUnicode_2BYTE_KIND:
2383         result = PyMem_New(Py_UCS2, len);
2384         if (!result)
2385             return PyErr_NoMemory();
2386         assert(skind == PyUnicode_1BYTE_KIND);
2387         _PyUnicode_CONVERT_BYTES(
2388             Py_UCS1, Py_UCS2,
2389             PyUnicode_1BYTE_DATA(s),
2390             PyUnicode_1BYTE_DATA(s) + len,
2391             result);
2392         return result;
2393     case PyUnicode_4BYTE_KIND:
2394         result = PyMem_New(Py_UCS4, len);
2395         if (!result)
2396             return PyErr_NoMemory();
2397         if (skind == PyUnicode_2BYTE_KIND) {
2398             _PyUnicode_CONVERT_BYTES(
2399                 Py_UCS2, Py_UCS4,
2400                 PyUnicode_2BYTE_DATA(s),
2401                 PyUnicode_2BYTE_DATA(s) + len,
2402                 result);
2403         }
2404         else {
2405             assert(skind == PyUnicode_1BYTE_KIND);
2406             _PyUnicode_CONVERT_BYTES(
2407                 Py_UCS1, Py_UCS4,
2408                 PyUnicode_1BYTE_DATA(s),
2409                 PyUnicode_1BYTE_DATA(s) + len,
2410                 result);
2411         }
2412         return result;
2413     default:
2414         break;
2415     }
2416     PyErr_SetString(PyExc_SystemError, "invalid kind");
2417     return NULL;
2418 }
2419 
2420 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2421 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2422         int copy_null)
2423 {
2424     int kind;
2425     void *data;
2426     Py_ssize_t len, targetlen;
2427     if (PyUnicode_READY(string) == -1)
2428         return NULL;
2429     kind = PyUnicode_KIND(string);
2430     data = PyUnicode_DATA(string);
2431     len = PyUnicode_GET_LENGTH(string);
2432     targetlen = len;
2433     if (copy_null)
2434         targetlen++;
2435     if (!target) {
2436         target = PyMem_New(Py_UCS4, targetlen);
2437         if (!target) {
2438             PyErr_NoMemory();
2439             return NULL;
2440         }
2441     }
2442     else {
2443         if (targetsize < targetlen) {
2444             PyErr_Format(PyExc_SystemError,
2445                          "string is longer than the buffer");
2446             if (copy_null && 0 < targetsize)
2447                 target[0] = 0;
2448             return NULL;
2449         }
2450     }
2451     if (kind == PyUnicode_1BYTE_KIND) {
2452         Py_UCS1 *start = (Py_UCS1 *) data;
2453         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2454     }
2455     else if (kind == PyUnicode_2BYTE_KIND) {
2456         Py_UCS2 *start = (Py_UCS2 *) data;
2457         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2458     }
2459     else {
2460         assert(kind == PyUnicode_4BYTE_KIND);
2461         memcpy(target, data, len * sizeof(Py_UCS4));
2462     }
2463     if (copy_null)
2464         target[len] = 0;
2465     return target;
2466 }
2467 
2468 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2469 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2470                  int copy_null)
2471 {
2472     if (target == NULL || targetsize < 0) {
2473         PyErr_BadInternalCall();
2474         return NULL;
2475     }
2476     return as_ucs4(string, target, targetsize, copy_null);
2477 }
2478 
2479 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2480 PyUnicode_AsUCS4Copy(PyObject *string)
2481 {
2482     return as_ucs4(string, NULL, 0, 1);
2483 }
2484 
2485 #ifdef HAVE_WCHAR_H
2486 
2487 PyObject *
PyUnicode_FromWideChar(const wchar_t * w,Py_ssize_t size)2488 PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2489 {
2490     if (w == NULL) {
2491         if (size == 0)
2492             _Py_RETURN_UNICODE_EMPTY();
2493         PyErr_BadInternalCall();
2494         return NULL;
2495     }
2496 
2497     if (size == -1) {
2498         size = wcslen(w);
2499     }
2500 
2501     return PyUnicode_FromUnicode(w, size);
2502 }
2503 
2504 #endif /* HAVE_WCHAR_H */
2505 
/* Maximum number of characters required for output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign.  53/22 is an upper bound for log10(256), and
   (x - 1) / 22 + 1 is ceil(x / 22) in integer arithmetic, hence the
   constant 2 (one for rounding up, one for the sign). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
2510 
2511 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2512 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2513                              Py_ssize_t width, Py_ssize_t precision)
2514 {
2515     Py_ssize_t length, fill, arglen;
2516     Py_UCS4 maxchar;
2517 
2518     if (PyUnicode_READY(str) == -1)
2519         return -1;
2520 
2521     length = PyUnicode_GET_LENGTH(str);
2522     if ((precision == -1 || precision >= length)
2523         && width <= length)
2524         return _PyUnicodeWriter_WriteStr(writer, str);
2525 
2526     if (precision != -1)
2527         length = Py_MIN(precision, length);
2528 
2529     arglen = Py_MAX(length, width);
2530     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532     else
2533         maxchar = writer->maxchar;
2534 
2535     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536         return -1;
2537 
2538     if (width > length) {
2539         fill = width - length;
2540         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541             return -1;
2542         writer->pos += fill;
2543     }
2544 
2545     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546                                   str, 0, length);
2547     writer->pos += length;
2548     return 0;
2549 }
2550 
2551 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2552 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2553                               Py_ssize_t width, Py_ssize_t precision)
2554 {
2555     /* UTF-8 */
2556     Py_ssize_t length;
2557     PyObject *unicode;
2558     int res;
2559 
2560     length = strlen(str);
2561     if (precision != -1)
2562         length = Py_MIN(length, precision);
2563     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2564     if (unicode == NULL)
2565         return -1;
2566 
2567     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2568     Py_DECREF(unicode);
2569     return res;
2570 }
2571 
2572 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2573 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2574                        const char *f, va_list *vargs)
2575 {
2576     const char *p;
2577     Py_ssize_t len;
2578     int zeropad;
2579     Py_ssize_t width;
2580     Py_ssize_t precision;
2581     int longflag;
2582     int longlongflag;
2583     int size_tflag;
2584     Py_ssize_t fill;
2585 
2586     p = f;
2587     f++;
2588     zeropad = 0;
2589     if (*f == '0') {
2590         zeropad = 1;
2591         f++;
2592     }
2593 
2594     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2595     width = -1;
2596     if (Py_ISDIGIT((unsigned)*f)) {
2597         width = *f - '0';
2598         f++;
2599         while (Py_ISDIGIT((unsigned)*f)) {
2600             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2601                 PyErr_SetString(PyExc_ValueError,
2602                                 "width too big");
2603                 return NULL;
2604             }
2605             width = (width * 10) + (*f - '0');
2606             f++;
2607         }
2608     }
2609     precision = -1;
2610     if (*f == '.') {
2611         f++;
2612         if (Py_ISDIGIT((unsigned)*f)) {
2613             precision = (*f - '0');
2614             f++;
2615             while (Py_ISDIGIT((unsigned)*f)) {
2616                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2617                     PyErr_SetString(PyExc_ValueError,
2618                                     "precision too big");
2619                     return NULL;
2620                 }
2621                 precision = (precision * 10) + (*f - '0');
2622                 f++;
2623             }
2624         }
2625         if (*f == '%') {
2626             /* "%.3%s" => f points to "3" */
2627             f--;
2628         }
2629     }
2630     if (*f == '\0') {
2631         /* bogus format "%.123" => go backward, f points to "3" */
2632         f--;
2633     }
2634 
2635     /* Handle %ld, %lu, %lld and %llu. */
2636     longflag = 0;
2637     longlongflag = 0;
2638     size_tflag = 0;
2639     if (*f == 'l') {
2640         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2641             longflag = 1;
2642             ++f;
2643         }
2644         else if (f[1] == 'l' &&
2645                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2646             longlongflag = 1;
2647             f += 2;
2648         }
2649     }
2650     /* handle the size_t flag. */
2651     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2652         size_tflag = 1;
2653         ++f;
2654     }
2655 
2656     if (f[1] == '\0')
2657         writer->overallocate = 0;
2658 
2659     switch (*f) {
2660     case 'c':
2661     {
2662         int ordinal = va_arg(*vargs, int);
2663         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2664             PyErr_SetString(PyExc_OverflowError,
2665                             "character argument not in range(0x110000)");
2666             return NULL;
2667         }
2668         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2669             return NULL;
2670         break;
2671     }
2672 
2673     case 'i':
2674     case 'd':
2675     case 'u':
2676     case 'x':
2677     {
2678         /* used by sprintf */
2679         char buffer[MAX_LONG_LONG_CHARS];
2680         Py_ssize_t arglen;
2681 
2682         if (*f == 'u') {
2683             if (longflag)
2684                 len = sprintf(buffer, "%lu",
2685                         va_arg(*vargs, unsigned long));
2686             else if (longlongflag)
2687                 len = sprintf(buffer, "%llu",
2688                         va_arg(*vargs, unsigned long long));
2689             else if (size_tflag)
2690                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2691                         va_arg(*vargs, size_t));
2692             else
2693                 len = sprintf(buffer, "%u",
2694                         va_arg(*vargs, unsigned int));
2695         }
2696         else if (*f == 'x') {
2697             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2698         }
2699         else {
2700             if (longflag)
2701                 len = sprintf(buffer, "%li",
2702                         va_arg(*vargs, long));
2703             else if (longlongflag)
2704                 len = sprintf(buffer, "%lli",
2705                         va_arg(*vargs, long long));
2706             else if (size_tflag)
2707                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2708                         va_arg(*vargs, Py_ssize_t));
2709             else
2710                 len = sprintf(buffer, "%i",
2711                         va_arg(*vargs, int));
2712         }
2713         assert(len >= 0);
2714 
2715         if (precision < len)
2716             precision = len;
2717 
2718         arglen = Py_MAX(precision, width);
2719         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2720             return NULL;
2721 
2722         if (width > precision) {
2723             Py_UCS4 fillchar;
2724             fill = width - precision;
2725             fillchar = zeropad?'0':' ';
2726             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2727                 return NULL;
2728             writer->pos += fill;
2729         }
2730         if (precision > len) {
2731             fill = precision - len;
2732             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2733                 return NULL;
2734             writer->pos += fill;
2735         }
2736 
2737         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2738             return NULL;
2739         break;
2740     }
2741 
2742     case 'p':
2743     {
2744         char number[MAX_LONG_LONG_CHARS];
2745 
2746         len = sprintf(number, "%p", va_arg(*vargs, void*));
2747         assert(len >= 0);
2748 
2749         /* %p is ill-defined:  ensure leading 0x. */
2750         if (number[1] == 'X')
2751             number[1] = 'x';
2752         else if (number[1] != 'x') {
2753             memmove(number + 2, number,
2754                     strlen(number) + 1);
2755             number[0] = '0';
2756             number[1] = 'x';
2757             len += 2;
2758         }
2759 
2760         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2761             return NULL;
2762         break;
2763     }
2764 
2765     case 's':
2766     {
2767         /* UTF-8 */
2768         const char *s = va_arg(*vargs, const char*);
2769         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2770             return NULL;
2771         break;
2772     }
2773 
2774     case 'U':
2775     {
2776         PyObject *obj = va_arg(*vargs, PyObject *);
2777         assert(obj && _PyUnicode_CHECK(obj));
2778 
2779         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2780             return NULL;
2781         break;
2782     }
2783 
2784     case 'V':
2785     {
2786         PyObject *obj = va_arg(*vargs, PyObject *);
2787         const char *str = va_arg(*vargs, const char *);
2788         if (obj) {
2789             assert(_PyUnicode_CHECK(obj));
2790             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2791                 return NULL;
2792         }
2793         else {
2794             assert(str != NULL);
2795             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2796                 return NULL;
2797         }
2798         break;
2799     }
2800 
2801     case 'S':
2802     {
2803         PyObject *obj = va_arg(*vargs, PyObject *);
2804         PyObject *str;
2805         assert(obj);
2806         str = PyObject_Str(obj);
2807         if (!str)
2808             return NULL;
2809         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2810             Py_DECREF(str);
2811             return NULL;
2812         }
2813         Py_DECREF(str);
2814         break;
2815     }
2816 
2817     case 'R':
2818     {
2819         PyObject *obj = va_arg(*vargs, PyObject *);
2820         PyObject *repr;
2821         assert(obj);
2822         repr = PyObject_Repr(obj);
2823         if (!repr)
2824             return NULL;
2825         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2826             Py_DECREF(repr);
2827             return NULL;
2828         }
2829         Py_DECREF(repr);
2830         break;
2831     }
2832 
2833     case 'A':
2834     {
2835         PyObject *obj = va_arg(*vargs, PyObject *);
2836         PyObject *ascii;
2837         assert(obj);
2838         ascii = PyObject_ASCII(obj);
2839         if (!ascii)
2840             return NULL;
2841         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2842             Py_DECREF(ascii);
2843             return NULL;
2844         }
2845         Py_DECREF(ascii);
2846         break;
2847     }
2848 
2849     case '%':
2850         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2851             return NULL;
2852         break;
2853 
2854     default:
2855         /* if we stumble upon an unknown formatting code, copy the rest
2856            of the format string to the output string. (we cannot just
2857            skip the code, since there's no way to know what's in the
2858            argument list) */
2859         len = strlen(p);
2860         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2861             return NULL;
2862         f = p+len;
2863         return f;
2864     }
2865 
2866     f++;
2867     return f;
2868 }
2869 
2870 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2871 PyUnicode_FromFormatV(const char *format, va_list vargs)
2872 {
2873     va_list vargs2;
2874     const char *f;
2875     _PyUnicodeWriter writer;
2876 
2877     _PyUnicodeWriter_Init(&writer);
2878     writer.min_length = strlen(format) + 100;
2879     writer.overallocate = 1;
2880 
2881     // Copy varags to be able to pass a reference to a subfunction.
2882     va_copy(vargs2, vargs);
2883 
2884     for (f = format; *f; ) {
2885         if (*f == '%') {
2886             f = unicode_fromformat_arg(&writer, f, &vargs2);
2887             if (f == NULL)
2888                 goto fail;
2889         }
2890         else {
2891             const char *p;
2892             Py_ssize_t len;
2893 
2894             p = f;
2895             do
2896             {
2897                 if ((unsigned char)*p > 127) {
2898                     PyErr_Format(PyExc_ValueError,
2899                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2900                         "string, got a non-ASCII byte: 0x%02x",
2901                         (unsigned char)*p);
2902                     goto fail;
2903                 }
2904                 p++;
2905             }
2906             while (*p != '\0' && *p != '%');
2907             len = p - f;
2908 
2909             if (*p == '\0')
2910                 writer.overallocate = 0;
2911 
2912             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2913                 goto fail;
2914 
2915             f = p;
2916         }
2917     }
2918     va_end(vargs2);
2919     return _PyUnicodeWriter_Finish(&writer);
2920 
2921   fail:
2922     va_end(vargs2);
2923     _PyUnicodeWriter_Dealloc(&writer);
2924     return NULL;
2925 }
2926 
2927 PyObject *
PyUnicode_FromFormat(const char * format,...)2928 PyUnicode_FromFormat(const char *format, ...)
2929 {
2930     PyObject* ret;
2931     va_list vargs;
2932 
2933 #ifdef HAVE_STDARG_PROTOTYPES
2934     va_start(vargs, format);
2935 #else
2936     va_start(vargs);
2937 #endif
2938     ret = PyUnicode_FromFormatV(format, vargs);
2939     va_end(vargs);
2940     return ret;
2941 }
2942 
2943 #ifdef HAVE_WCHAR_H
2944 
2945 /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946    convert a Unicode object to a wide character string.
2947 
2948    - If w is NULL: return the number of wide characters (including the null
2949      character) required to convert the unicode object. Ignore size argument.
2950 
2951    - Otherwise: return the number of wide characters (excluding the null
2952      character) written into w. Write at most size wide characters (including
2953      the null character). */
2954 static Py_ssize_t
unicode_aswidechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2955 unicode_aswidechar(PyObject *unicode,
2956                    wchar_t *w,
2957                    Py_ssize_t size)
2958 {
2959     Py_ssize_t res;
2960     const wchar_t *wstr;
2961 
2962     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2963     if (wstr == NULL)
2964         return -1;
2965 
2966     if (w != NULL) {
2967         if (size > res)
2968             size = res + 1;
2969         else
2970             res = size;
2971         memcpy(w, wstr, size * sizeof(wchar_t));
2972         return res;
2973     }
2974     else
2975         return res + 1;
2976 }
2977 
2978 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2979 PyUnicode_AsWideChar(PyObject *unicode,
2980                      wchar_t *w,
2981                      Py_ssize_t size)
2982 {
2983     if (unicode == NULL) {
2984         PyErr_BadInternalCall();
2985         return -1;
2986     }
2987     return unicode_aswidechar(unicode, w, size);
2988 }
2989 
2990 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)2991 PyUnicode_AsWideCharString(PyObject *unicode,
2992                            Py_ssize_t *size)
2993 {
2994     wchar_t* buffer;
2995     Py_ssize_t buflen;
2996 
2997     if (unicode == NULL) {
2998         PyErr_BadInternalCall();
2999         return NULL;
3000     }
3001 
3002     buflen = unicode_aswidechar(unicode, NULL, 0);
3003     if (buflen == -1)
3004         return NULL;
3005     buffer = PyMem_NEW(wchar_t, buflen);
3006     if (buffer == NULL) {
3007         PyErr_NoMemory();
3008         return NULL;
3009     }
3010     buflen = unicode_aswidechar(unicode, buffer, buflen);
3011     if (buflen == -1) {
3012         PyMem_FREE(buffer);
3013         return NULL;
3014     }
3015     if (size != NULL)
3016         *size = buflen;
3017     return buffer;
3018 }
3019 
3020 #endif /* HAVE_WCHAR_H */
3021 
3022 PyObject *
PyUnicode_FromOrdinal(int ordinal)3023 PyUnicode_FromOrdinal(int ordinal)
3024 {
3025     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3026         PyErr_SetString(PyExc_ValueError,
3027                         "chr() arg not in range(0x110000)");
3028         return NULL;
3029     }
3030 
3031     return unicode_char((Py_UCS4)ordinal);
3032 }
3033 
3034 PyObject *
PyUnicode_FromObject(PyObject * obj)3035 PyUnicode_FromObject(PyObject *obj)
3036 {
3037     /* XXX Perhaps we should make this API an alias of
3038        PyObject_Str() instead ?! */
3039     if (PyUnicode_CheckExact(obj)) {
3040         if (PyUnicode_READY(obj) == -1)
3041             return NULL;
3042         Py_INCREF(obj);
3043         return obj;
3044     }
3045     if (PyUnicode_Check(obj)) {
3046         /* For a Unicode subtype that's not a Unicode object,
3047            return a true Unicode object with the same data. */
3048         return _PyUnicode_Copy(obj);
3049     }
3050     PyErr_Format(PyExc_TypeError,
3051                  "Can't convert '%.100s' object to str implicitly",
3052                  Py_TYPE(obj)->tp_name);
3053     return NULL;
3054 }
3055 
3056 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3057 PyUnicode_FromEncodedObject(PyObject *obj,
3058                             const char *encoding,
3059                             const char *errors)
3060 {
3061     Py_buffer buffer;
3062     PyObject *v;
3063 
3064     if (obj == NULL) {
3065         PyErr_BadInternalCall();
3066         return NULL;
3067     }
3068 
3069     /* Decoding bytes objects is the most common case and should be fast */
3070     if (PyBytes_Check(obj)) {
3071         if (PyBytes_GET_SIZE(obj) == 0)
3072             _Py_RETURN_UNICODE_EMPTY();
3073         v = PyUnicode_Decode(
3074                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075                 encoding, errors);
3076         return v;
3077     }
3078 
3079     if (PyUnicode_Check(obj)) {
3080         PyErr_SetString(PyExc_TypeError,
3081                         "decoding str is not supported");
3082         return NULL;
3083     }
3084 
3085     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087         PyErr_Format(PyExc_TypeError,
3088                      "decoding to str: need a bytes-like object, %.80s found",
3089                      Py_TYPE(obj)->tp_name);
3090         return NULL;
3091     }
3092 
3093     if (buffer.len == 0) {
3094         PyBuffer_Release(&buffer);
3095         _Py_RETURN_UNICODE_EMPTY();
3096     }
3097 
3098     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3099     PyBuffer_Release(&buffer);
3100     return v;
3101 }
3102 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Each
   run of other characters between two copied characters is replaced with a
   single '_'; such runs at the start or at the end of the name are dropped.

   `lower` is a buffer of `lower_len` characters, null character included.
   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even room for the null character: lower_len - 1 below would
           wrap around and the function would write outside the buffer. */
        return 0;
    }

    l = lower;
    /* Last slot of the buffer, reserved for the null character. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Only remember that a separator was seen; it is emitted
               lazily so that trailing separators are never written. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3151 
3152 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3153 PyUnicode_Decode(const char *s,
3154                  Py_ssize_t size,
3155                  const char *encoding,
3156                  const char *errors)
3157 {
3158     PyObject *buffer = NULL, *unicode;
3159     Py_buffer info;
3160     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161 
3162     if (encoding == NULL) {
3163         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164     }
3165 
3166     /* Shortcuts for common default encodings */
3167     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168         char *lower = buflower;
3169 
3170         /* Fast paths */
3171         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172             lower += 3;
3173             if (*lower == '_') {
3174                 /* Match "utf8" and "utf_8" */
3175                 lower++;
3176             }
3177 
3178             if (lower[0] == '8' && lower[1] == 0) {
3179                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180             }
3181             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183             }
3184             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186             }
3187         }
3188         else {
3189             if (strcmp(lower, "ascii") == 0
3190                 || strcmp(lower, "us_ascii") == 0) {
3191                 return PyUnicode_DecodeASCII(s, size, errors);
3192             }
3193     #ifdef MS_WINDOWS
3194             else if (strcmp(lower, "mbcs") == 0) {
3195                 return PyUnicode_DecodeMBCS(s, size, errors);
3196             }
3197     #endif
3198             else if (strcmp(lower, "latin1") == 0
3199                      || strcmp(lower, "latin_1") == 0
3200                      || strcmp(lower, "iso_8859_1") == 0
3201                      || strcmp(lower, "iso8859_1") == 0) {
3202                 return PyUnicode_DecodeLatin1(s, size, errors);
3203             }
3204         }
3205     }
3206 
3207     /* Decode via the codec registry */
3208     buffer = NULL;
3209     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3210         goto onError;
3211     buffer = PyMemoryView_FromBuffer(&info);
3212     if (buffer == NULL)
3213         goto onError;
3214     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3215     if (unicode == NULL)
3216         goto onError;
3217     if (!PyUnicode_Check(unicode)) {
3218         PyErr_Format(PyExc_TypeError,
3219                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3220                      "use codecs.decode() to decode to arbitrary types",
3221                      encoding,
3222                      Py_TYPE(unicode)->tp_name);
3223         Py_DECREF(unicode);
3224         goto onError;
3225     }
3226     Py_DECREF(buffer);
3227     return unicode_result(unicode);
3228 
3229   onError:
3230     Py_XDECREF(buffer);
3231     return NULL;
3232 }
3233 
3234 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3235 PyUnicode_AsDecodedObject(PyObject *unicode,
3236                           const char *encoding,
3237                           const char *errors)
3238 {
3239     if (!PyUnicode_Check(unicode)) {
3240         PyErr_BadArgument();
3241         return NULL;
3242     }
3243 
3244     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3245                      "PyUnicode_AsDecodedObject() is deprecated; "
3246                      "use PyCodec_Decode() to decode from str", 1) < 0)
3247         return NULL;
3248 
3249     if (encoding == NULL)
3250         encoding = PyUnicode_GetDefaultEncoding();
3251 
3252     /* Decode via the codec registry */
3253     return PyCodec_Decode(unicode, encoding, errors);
3254 }
3255 
3256 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3257 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3258                            const char *encoding,
3259                            const char *errors)
3260 {
3261     PyObject *v;
3262 
3263     if (!PyUnicode_Check(unicode)) {
3264         PyErr_BadArgument();
3265         goto onError;
3266     }
3267 
3268     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3269                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3270                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3271         return NULL;
3272 
3273     if (encoding == NULL)
3274         encoding = PyUnicode_GetDefaultEncoding();
3275 
3276     /* Decode via the codec registry */
3277     v = PyCodec_Decode(unicode, encoding, errors);
3278     if (v == NULL)
3279         goto onError;
3280     if (!PyUnicode_Check(v)) {
3281         PyErr_Format(PyExc_TypeError,
3282                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3283                      "use codecs.decode() to decode to arbitrary types",
3284                      encoding,
3285                      Py_TYPE(unicode)->tp_name);
3286         Py_DECREF(v);
3287         goto onError;
3288     }
3289     return unicode_result(v);
3290 
3291   onError:
3292     return NULL;
3293 }
3294 
3295 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3296 PyUnicode_Encode(const Py_UNICODE *s,
3297                  Py_ssize_t size,
3298                  const char *encoding,
3299                  const char *errors)
3300 {
3301     PyObject *v, *unicode;
3302 
3303     unicode = PyUnicode_FromUnicode(s, size);
3304     if (unicode == NULL)
3305         return NULL;
3306     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3307     Py_DECREF(unicode);
3308     return v;
3309 }
3310 
3311 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3312 PyUnicode_AsEncodedObject(PyObject *unicode,
3313                           const char *encoding,
3314                           const char *errors)
3315 {
3316     PyObject *v;
3317 
3318     if (!PyUnicode_Check(unicode)) {
3319         PyErr_BadArgument();
3320         goto onError;
3321     }
3322 
3323     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3324                      "PyUnicode_AsEncodedObject() is deprecated; "
3325                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3326                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3327         return NULL;
3328 
3329     if (encoding == NULL)
3330         encoding = PyUnicode_GetDefaultEncoding();
3331 
3332     /* Encode via the codec registry */
3333     v = PyCodec_Encode(unicode, encoding, errors);
3334     if (v == NULL)
3335         goto onError;
3336     return v;
3337 
3338   onError:
3339     return NULL;
3340 }
3341 
/* Find the first wide character of `wstr` that wcstombs() fails to convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 2 bytes wide).  Return the index of the first unit
   for which wcstombs() fails, or 0 if every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One unit to convert, followed by a null character. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char encoded[MB_LEN_MAX];
    size_t pos = 0;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (wstr[pos] != L'\0') {
        size_t unit_start = pos;
        size_t converted;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[pos])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[pos + 1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = wstr[pos];
            unit[1] = wstr[pos + 1];
            pos += 2;
        }
        else {
            unit[0] = wstr[pos];
            unit[1] = 0;
            pos += 1;
        }
#else
        unit[0] = wstr[pos];
        pos += 1;
#endif

        converted = wcstombs(encoded, unit, sizeof(encoded));
        if (converted == (size_t)-1) {
            return unit_start;
        }
    }

    /* failed to find the unencodable character */
    return 0;
}
3388 
3389 static int
locale_error_handler(const char * errors,int * surrogateescape)3390 locale_error_handler(const char *errors, int *surrogateescape)
3391 {
3392     _Py_error_handler error_handler = get_error_handler(errors);
3393     switch (error_handler)
3394     {
3395     case _Py_ERROR_STRICT:
3396         *surrogateescape = 0;
3397         return 0;
3398     case _Py_ERROR_SURROGATEESCAPE:
3399         *surrogateescape = 1;
3400         return 0;
3401     default:
3402         PyErr_Format(PyExc_ValueError,
3403                      "only 'strict' and 'surrogateescape' error handlers "
3404                      "are supported, not '%s'",
3405                      errors);
3406         return -1;
3407     }
3408 }
3409 
3410 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3411 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3412 {
3413     Py_ssize_t wlen, wlen2;
3414     wchar_t *wstr;
3415     PyObject *bytes = NULL;
3416     char *errmsg;
3417     PyObject *reason = NULL;
3418     PyObject *exc;
3419     size_t error_pos;
3420     int surrogateescape;
3421 
3422     if (locale_error_handler(errors, &surrogateescape) < 0)
3423         return NULL;
3424 
3425     wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3426     if (wstr == NULL)
3427         return NULL;
3428 
3429     wlen2 = wcslen(wstr);
3430     if (wlen2 != wlen) {
3431         PyMem_Free(wstr);
3432         PyErr_SetString(PyExc_ValueError, "embedded null character");
3433         return NULL;
3434     }
3435 
3436     if (surrogateescape) {
3437         /* "surrogateescape" error handler */
3438         char *str;
3439 
3440         str = Py_EncodeLocale(wstr, &error_pos);
3441         if (str == NULL) {
3442             if (error_pos == (size_t)-1) {
3443                 PyErr_NoMemory();
3444                 PyMem_Free(wstr);
3445                 return NULL;
3446             }
3447             else {
3448                 goto encode_error;
3449             }
3450         }
3451         PyMem_Free(wstr);
3452 
3453         bytes = PyBytes_FromString(str);
3454         PyMem_Free(str);
3455     }
3456     else {
3457         /* strict mode */
3458         size_t len, len2;
3459 
3460         len = wcstombs(NULL, wstr, 0);
3461         if (len == (size_t)-1) {
3462             error_pos = (size_t)-1;
3463             goto encode_error;
3464         }
3465 
3466         bytes = PyBytes_FromStringAndSize(NULL, len);
3467         if (bytes == NULL) {
3468             PyMem_Free(wstr);
3469             return NULL;
3470         }
3471 
3472         len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3473         if (len2 == (size_t)-1 || len2 > len) {
3474             error_pos = (size_t)-1;
3475             goto encode_error;
3476         }
3477         PyMem_Free(wstr);
3478     }
3479     return bytes;
3480 
3481 encode_error:
3482     errmsg = strerror(errno);
3483     assert(errmsg != NULL);
3484 
3485     if (error_pos == (size_t)-1)
3486         error_pos = wcstombs_errorpos(wstr);
3487 
3488     PyMem_Free(wstr);
3489     Py_XDECREF(bytes);
3490 
3491     if (errmsg != NULL) {
3492         size_t errlen;
3493         wstr = Py_DecodeLocale(errmsg, &errlen);
3494         if (wstr != NULL) {
3495             reason = PyUnicode_FromWideChar(wstr, errlen);
3496             PyMem_RawFree(wstr);
3497         } else
3498             errmsg = NULL;
3499     }
3500     if (errmsg == NULL)
3501         reason = PyUnicode_FromString(
3502             "wcstombs() encountered an unencodable "
3503             "wide character");
3504     if (reason == NULL)
3505         return NULL;
3506 
3507     exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3508                                 "locale", unicode,
3509                                 (Py_ssize_t)error_pos,
3510                                 (Py_ssize_t)(error_pos+1),
3511                                 reason);
3512     Py_DECREF(reason);
3513     if (exc != NULL) {
3514         PyCodec_StrictErrors(exc);
3515         Py_XDECREF(exc);
3516     }
3517     return NULL;
3518 }
3519 
3520 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3521 PyUnicode_EncodeFSDefault(PyObject *unicode)
3522 {
3523 #if defined(__APPLE__)
3524     return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3525 #else
3526     PyInterpreterState *interp = PyThreadState_GET()->interp;
3527     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3528        cannot use it to encode and decode filenames before it is loaded. Load
3529        the Python codec requires to encode at least its own filename. Use the C
3530        version of the locale codec until the codec registry is initialized and
3531        the Python codec is loaded.
3532 
3533        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3534        cannot only rely on it: check also interp->fscodec_initialized for
3535        subinterpreters. */
3536     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3537         return PyUnicode_AsEncodedString(unicode,
3538                                          Py_FileSystemDefaultEncoding,
3539                                          Py_FileSystemDefaultEncodeErrors);
3540     }
3541     else {
3542         return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
3543     }
3544 #endif
3545 }
3546 
3547 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3548 PyUnicode_AsEncodedString(PyObject *unicode,
3549                           const char *encoding,
3550                           const char *errors)
3551 {
3552     PyObject *v;
3553     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3554 
3555     if (!PyUnicode_Check(unicode)) {
3556         PyErr_BadArgument();
3557         return NULL;
3558     }
3559 
3560     if (encoding == NULL) {
3561         return _PyUnicode_AsUTF8String(unicode, errors);
3562     }
3563 
3564     /* Shortcuts for common default encodings */
3565     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566         char *lower = buflower;
3567 
3568         /* Fast paths */
3569         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570             lower += 3;
3571             if (*lower == '_') {
3572                 /* Match "utf8" and "utf_8" */
3573                 lower++;
3574             }
3575 
3576             if (lower[0] == '8' && lower[1] == 0) {
3577                 return _PyUnicode_AsUTF8String(unicode, errors);
3578             }
3579             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3581             }
3582             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3584             }
3585         }
3586         else {
3587             if (strcmp(lower, "ascii") == 0
3588                 || strcmp(lower, "us_ascii") == 0) {
3589                 return _PyUnicode_AsASCIIString(unicode, errors);
3590             }
3591 #ifdef MS_WINDOWS
3592             else if (strcmp(lower, "mbcs") == 0) {
3593                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3594             }
3595 #endif
3596             else if (strcmp(lower, "latin1") == 0 ||
3597                      strcmp(lower, "latin_1") == 0 ||
3598                      strcmp(lower, "iso_8859_1") == 0 ||
3599                      strcmp(lower, "iso8859_1") == 0) {
3600                 return _PyUnicode_AsLatin1String(unicode, errors);
3601             }
3602         }
3603     }
3604 
3605     /* Encode via the codec registry */
3606     v = _PyCodec_EncodeText(unicode, encoding, errors);
3607     if (v == NULL)
3608         return NULL;
3609 
3610     /* The normal path */
3611     if (PyBytes_Check(v))
3612         return v;
3613 
3614     /* If the codec returns a buffer, raise a warning and convert to bytes */
3615     if (PyByteArray_Check(v)) {
3616         int error;
3617         PyObject *b;
3618 
3619         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3620             "encoder %s returned bytearray instead of bytes; "
3621             "use codecs.encode() to encode to arbitrary types",
3622             encoding);
3623         if (error) {
3624             Py_DECREF(v);
3625             return NULL;
3626         }
3627 
3628         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3629         Py_DECREF(v);
3630         return b;
3631     }
3632 
3633     PyErr_Format(PyExc_TypeError,
3634                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3635                  "use codecs.encode() to encode to arbitrary types",
3636                  encoding,
3637                  Py_TYPE(v)->tp_name);
3638     Py_DECREF(v);
3639     return NULL;
3640 }
3641 
3642 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3643 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3644                            const char *encoding,
3645                            const char *errors)
3646 {
3647     PyObject *v;
3648 
3649     if (!PyUnicode_Check(unicode)) {
3650         PyErr_BadArgument();
3651         goto onError;
3652     }
3653 
3654     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3655                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3656                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3657         return NULL;
3658 
3659     if (encoding == NULL)
3660         encoding = PyUnicode_GetDefaultEncoding();
3661 
3662     /* Encode via the codec registry */
3663     v = PyCodec_Encode(unicode, encoding, errors);
3664     if (v == NULL)
3665         goto onError;
3666     if (!PyUnicode_Check(v)) {
3667         PyErr_Format(PyExc_TypeError,
3668                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3669                      "use codecs.encode() to encode to arbitrary types",
3670                      encoding,
3671                      Py_TYPE(v)->tp_name);
3672         Py_DECREF(v);
3673         goto onError;
3674     }
3675     return v;
3676 
3677   onError:
3678     return NULL;
3679 }
3680 
/* Locate the first byte sequence in str[0..len) that mbrtowc() cannot
   convert.

   Returns the byte offset of the offending sequence.  Returns 0 when no
   such sequence is found, when a NUL is reached first, or when mbrtowc()
   is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Hit the terminating NUL. */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Invalid or truncated multibyte character. */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* Nothing undecodable was found. */
    return 0;
}
3711 
3712 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3713 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3714                               const char *errors)
3715 {
3716     wchar_t smallbuf[256];
3717     size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3718     wchar_t *wstr;
3719     size_t wlen, wlen2;
3720     PyObject *unicode;
3721     int surrogateescape;
3722     size_t error_pos;
3723     char *errmsg;
3724     PyObject *reason = NULL;   /* initialize to prevent gcc warning */
3725     PyObject *exc;
3726 
3727     if (locale_error_handler(errors, &surrogateescape) < 0)
3728         return NULL;
3729 
3730     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3731         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3732         return NULL;
3733     }
3734 
3735     if (surrogateescape) {
3736         /* "surrogateescape" error handler */
3737         wstr = Py_DecodeLocale(str, &wlen);
3738         if (wstr == NULL) {
3739             if (wlen == (size_t)-1)
3740                 PyErr_NoMemory();
3741             else
3742                 PyErr_SetFromErrno(PyExc_OSError);
3743             return NULL;
3744         }
3745 
3746         unicode = PyUnicode_FromWideChar(wstr, wlen);
3747         PyMem_RawFree(wstr);
3748     }
3749     else {
3750         /* strict mode */
3751 #ifndef HAVE_BROKEN_MBSTOWCS
3752         wlen = mbstowcs(NULL, str, 0);
3753 #else
3754         wlen = len;
3755 #endif
3756         if (wlen == (size_t)-1)
3757             goto decode_error;
3758         if (wlen+1 <= smallbuf_len) {
3759             wstr = smallbuf;
3760         }
3761         else {
3762             wstr = PyMem_New(wchar_t, wlen+1);
3763             if (!wstr)
3764                 return PyErr_NoMemory();
3765         }
3766 
3767         wlen2 = mbstowcs(wstr, str, wlen+1);
3768         if (wlen2 == (size_t)-1) {
3769             if (wstr != smallbuf)
3770                 PyMem_Free(wstr);
3771             goto decode_error;
3772         }
3773 #ifdef HAVE_BROKEN_MBSTOWCS
3774         assert(wlen2 == wlen);
3775 #endif
3776         unicode = PyUnicode_FromWideChar(wstr, wlen2);
3777         if (wstr != smallbuf)
3778             PyMem_Free(wstr);
3779     }
3780     return unicode;
3781 
3782 decode_error:
3783     reason = NULL;
3784     errmsg = strerror(errno);
3785     assert(errmsg != NULL);
3786 
3787     error_pos = mbstowcs_errorpos(str, len);
3788     if (errmsg != NULL) {
3789         size_t errlen;
3790         wstr = Py_DecodeLocale(errmsg, &errlen);
3791         if (wstr != NULL) {
3792             reason = PyUnicode_FromWideChar(wstr, errlen);
3793             PyMem_RawFree(wstr);
3794         }
3795     }
3796     if (reason == NULL)
3797         reason = PyUnicode_FromString(
3798             "mbstowcs() encountered an invalid multibyte sequence");
3799     if (reason == NULL)
3800         return NULL;
3801 
3802     exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3803                                 "locale", str, len,
3804                                 (Py_ssize_t)error_pos,
3805                                 (Py_ssize_t)(error_pos+1),
3806                                 reason);
3807     Py_DECREF(reason);
3808     if (exc != NULL) {
3809         PyCodec_StrictErrors(exc);
3810         Py_XDECREF(exc);
3811     }
3812     return NULL;
3813 }
3814 
3815 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3816 PyUnicode_DecodeLocale(const char *str, const char *errors)
3817 {
3818     Py_ssize_t size = (Py_ssize_t)strlen(str);
3819     return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3820 }
3821 
3822 
3823 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3824 PyUnicode_DecodeFSDefault(const char *s) {
3825     Py_ssize_t size = (Py_ssize_t)strlen(s);
3826     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3827 }
3828 
3829 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3830 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3831 {
3832 #if defined(__APPLE__)
3833     return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3834 #else
3835     PyInterpreterState *interp = PyThreadState_GET()->interp;
3836     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3837        cannot use it to encode and decode filenames before it is loaded. Load
3838        the Python codec requires to encode at least its own filename. Use the C
3839        version of the locale codec until the codec registry is initialized and
3840        the Python codec is loaded.
3841 
3842        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3843        cannot only rely on it: check also interp->fscodec_initialized for
3844        subinterpreters. */
3845     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3846         return PyUnicode_Decode(s, size,
3847                                 Py_FileSystemDefaultEncoding,
3848                                 Py_FileSystemDefaultEncodeErrors);
3849     }
3850     else {
3851         return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
3852     }
3853 #endif
3854 }
3855 
3856 
3857 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3858 PyUnicode_FSConverter(PyObject* arg, void* addr)
3859 {
3860     PyObject *path = NULL;
3861     PyObject *output = NULL;
3862     Py_ssize_t size;
3863     void *data;
3864     if (arg == NULL) {
3865         Py_DECREF(*(PyObject**)addr);
3866         *(PyObject**)addr = NULL;
3867         return 1;
3868     }
3869     path = PyOS_FSPath(arg);
3870     if (path == NULL) {
3871         return 0;
3872     }
3873     if (PyBytes_Check(path)) {
3874         output = path;
3875     }
3876     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3877         output = PyUnicode_EncodeFSDefault(path);
3878         Py_DECREF(path);
3879         if (!output) {
3880             return 0;
3881         }
3882         assert(PyBytes_Check(output));
3883     }
3884 
3885     size = PyBytes_GET_SIZE(output);
3886     data = PyBytes_AS_STRING(output);
3887     if ((size_t)size != strlen(data)) {
3888         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3889         Py_DECREF(output);
3890         return 0;
3891     }
3892     *(PyObject**)addr = output;
3893     return Py_CLEANUP_SUPPORTED;
3894 }
3895 
3896 
3897 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3898 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3899 {
3900     int is_buffer = 0;
3901     PyObject *path = NULL;
3902     PyObject *output = NULL;
3903     if (arg == NULL) {
3904         Py_DECREF(*(PyObject**)addr);
3905         return 1;
3906     }
3907 
3908     is_buffer = PyObject_CheckBuffer(arg);
3909     if (!is_buffer) {
3910         path = PyOS_FSPath(arg);
3911         if (path == NULL) {
3912             return 0;
3913         }
3914     }
3915     else {
3916         path = arg;
3917         Py_INCREF(arg);
3918     }
3919 
3920     if (PyUnicode_Check(path)) {
3921         if (PyUnicode_READY(path) == -1) {
3922             Py_DECREF(path);
3923             return 0;
3924         }
3925         output = path;
3926     }
3927     else if (PyBytes_Check(path) || is_buffer) {
3928         PyObject *path_bytes = NULL;
3929 
3930         if (!PyBytes_Check(path) &&
3931             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3932             "path should be string, bytes, or os.PathLike, not %.200s",
3933             Py_TYPE(arg)->tp_name)) {
3934                 Py_DECREF(path);
3935             return 0;
3936         }
3937         path_bytes = PyBytes_FromObject(path);
3938         Py_DECREF(path);
3939         if (!path_bytes) {
3940             return 0;
3941         }
3942         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3943                                                   PyBytes_GET_SIZE(path_bytes));
3944         Py_DECREF(path_bytes);
3945         if (!output) {
3946             return 0;
3947         }
3948     }
3949     else {
3950         PyErr_Format(PyExc_TypeError,
3951                      "path should be string, bytes, or os.PathLike, not %.200s",
3952                      Py_TYPE(arg)->tp_name);
3953         Py_DECREF(path);
3954         return 0;
3955     }
3956     if (PyUnicode_READY(output) == -1) {
3957         Py_DECREF(output);
3958         return 0;
3959     }
3960     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3961                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3962         PyErr_SetString(PyExc_ValueError, "embedded null character");
3963         Py_DECREF(output);
3964         return 0;
3965     }
3966     *(PyObject**)addr = output;
3967     return Py_CLEANUP_SUPPORTED;
3968 }
3969 
3970 
3971 char*
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3972 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3973 {
3974     PyObject *bytes;
3975 
3976     if (!PyUnicode_Check(unicode)) {
3977         PyErr_BadArgument();
3978         return NULL;
3979     }
3980     if (PyUnicode_READY(unicode) == -1)
3981         return NULL;
3982 
3983     if (PyUnicode_UTF8(unicode) == NULL) {
3984         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3985         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3986         if (bytes == NULL)
3987             return NULL;
3988         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3989         if (_PyUnicode_UTF8(unicode) == NULL) {
3990             PyErr_NoMemory();
3991             Py_DECREF(bytes);
3992             return NULL;
3993         }
3994         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3995         memcpy(_PyUnicode_UTF8(unicode),
3996                   PyBytes_AS_STRING(bytes),
3997                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
3998         Py_DECREF(bytes);
3999     }
4000 
4001     if (psize)
4002         *psize = PyUnicode_UTF8_LENGTH(unicode);
4003     return PyUnicode_UTF8(unicode);
4004 }
4005 
4006 char*
PyUnicode_AsUTF8(PyObject * unicode)4007 PyUnicode_AsUTF8(PyObject *unicode)
4008 {
4009     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4010 }
4011 
4012 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4013 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4014 {
4015     const unsigned char *one_byte;
4016 #if SIZEOF_WCHAR_T == 4
4017     const Py_UCS2 *two_bytes;
4018 #else
4019     const Py_UCS4 *four_bytes;
4020     const Py_UCS4 *ucs4_end;
4021     Py_ssize_t num_surrogates;
4022 #endif
4023     wchar_t *w;
4024     wchar_t *wchar_end;
4025 
4026     if (!PyUnicode_Check(unicode)) {
4027         PyErr_BadArgument();
4028         return NULL;
4029     }
4030     if (_PyUnicode_WSTR(unicode) == NULL) {
4031         /* Non-ASCII compact unicode object */
4032         assert(_PyUnicode_KIND(unicode) != 0);
4033         assert(PyUnicode_IS_READY(unicode));
4034 
4035         if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
4036 #if SIZEOF_WCHAR_T == 2
4037             four_bytes = PyUnicode_4BYTE_DATA(unicode);
4038             ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
4039             num_surrogates = 0;
4040 
4041             for (; four_bytes < ucs4_end; ++four_bytes) {
4042                 if (*four_bytes > 0xFFFF)
4043                     ++num_surrogates;
4044             }
4045 
4046             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4047                     sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4048             if (!_PyUnicode_WSTR(unicode)) {
4049                 PyErr_NoMemory();
4050                 return NULL;
4051             }
4052             _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
4053 
4054             w = _PyUnicode_WSTR(unicode);
4055             wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4056             four_bytes = PyUnicode_4BYTE_DATA(unicode);
4057             for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4058                 if (*four_bytes > 0xFFFF) {
4059                     assert(*four_bytes <= MAX_UNICODE);
4060                     /* encode surrogate pair in this case */
4061                     *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4062                     *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
4063                 }
4064                 else
4065                     *w = *four_bytes;
4066 
4067                 if (w > wchar_end) {
4068                     assert(0 && "Miscalculated string end");
4069                 }
4070             }
4071             *w = 0;
4072 #else
4073             /* sizeof(wchar_t) == 4 */
4074             Py_FatalError("Impossible unicode object state, wstr and str "
4075                           "should share memory already.");
4076             return NULL;
4077 #endif
4078         }
4079         else {
4080             if ((size_t)_PyUnicode_LENGTH(unicode) >
4081                     PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4082                 PyErr_NoMemory();
4083                 return NULL;
4084             }
4085             _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4086                                                   (_PyUnicode_LENGTH(unicode) + 1));
4087             if (!_PyUnicode_WSTR(unicode)) {
4088                 PyErr_NoMemory();
4089                 return NULL;
4090             }
4091             if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4092                 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4093             w = _PyUnicode_WSTR(unicode);
4094             wchar_end = w + _PyUnicode_LENGTH(unicode);
4095 
4096             if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4097                 one_byte = PyUnicode_1BYTE_DATA(unicode);
4098                 for (; w < wchar_end; ++one_byte, ++w)
4099                     *w = *one_byte;
4100                 /* null-terminate the wstr */
4101                 *w = 0;
4102             }
4103             else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
4104 #if SIZEOF_WCHAR_T == 4
4105                 two_bytes = PyUnicode_2BYTE_DATA(unicode);
4106                 for (; w < wchar_end; ++two_bytes, ++w)
4107                     *w = *two_bytes;
4108                 /* null-terminate the wstr */
4109                 *w = 0;
4110 #else
4111                 /* sizeof(wchar_t) == 2 */
4112                 PyObject_FREE(_PyUnicode_WSTR(unicode));
4113                 _PyUnicode_WSTR(unicode) = NULL;
4114                 Py_FatalError("Impossible unicode object state, wstr "
4115                               "and str should share memory already.");
4116                 return NULL;
4117 #endif
4118             }
4119             else {
4120                 assert(0 && "This should never happen.");
4121             }
4122         }
4123     }
4124     if (size != NULL)
4125         *size = PyUnicode_WSTR_LENGTH(unicode);
4126     return _PyUnicode_WSTR(unicode);
4127 }
4128 
4129 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4130 PyUnicode_AsUnicode(PyObject *unicode)
4131 {
4132     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4133 }
4134 
4135 
4136 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4137 PyUnicode_GetSize(PyObject *unicode)
4138 {
4139     if (!PyUnicode_Check(unicode)) {
4140         PyErr_BadArgument();
4141         goto onError;
4142     }
4143     return PyUnicode_GET_SIZE(unicode);
4144 
4145   onError:
4146     return -1;
4147 }
4148 
4149 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4150 PyUnicode_GetLength(PyObject *unicode)
4151 {
4152     if (!PyUnicode_Check(unicode)) {
4153         PyErr_BadArgument();
4154         return -1;
4155     }
4156     if (PyUnicode_READY(unicode) == -1)
4157         return -1;
4158     return PyUnicode_GET_LENGTH(unicode);
4159 }
4160 
4161 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163 {
4164     void *data;
4165     int kind;
4166 
4167     if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4168         PyErr_BadArgument();
4169         return (Py_UCS4)-1;
4170     }
4171     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4172         PyErr_SetString(PyExc_IndexError, "string index out of range");
4173         return (Py_UCS4)-1;
4174     }
4175     data = PyUnicode_DATA(unicode);
4176     kind = PyUnicode_KIND(unicode);
4177     return PyUnicode_READ(kind, data, index);
4178 }
4179 
4180 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4181 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4182 {
4183     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4184         PyErr_BadArgument();
4185         return -1;
4186     }
4187     assert(PyUnicode_IS_READY(unicode));
4188     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4189         PyErr_SetString(PyExc_IndexError, "string index out of range");
4190         return -1;
4191     }
4192     if (unicode_check_modifiable(unicode))
4193         return -1;
4194     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4195         PyErr_SetString(PyExc_ValueError, "character out of range");
4196         return -1;
4197     }
4198     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4199                     index, ch);
4200     return 0;
4201 }
4202 
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4208 
4209 /* create or adjust a UnicodeDecodeError */
4210 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4211 make_decode_exception(PyObject **exceptionObject,
4212                       const char *encoding,
4213                       const char *input, Py_ssize_t length,
4214                       Py_ssize_t startpos, Py_ssize_t endpos,
4215                       const char *reason)
4216 {
4217     if (*exceptionObject == NULL) {
4218         *exceptionObject = PyUnicodeDecodeError_Create(
4219             encoding, input, length, startpos, endpos, reason);
4220     }
4221     else {
4222         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4223             goto onError;
4224         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4225             goto onError;
4226         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4227             goto onError;
4228     }
4229     return;
4230 
4231 onError:
4232     Py_CLEAR(*exceptionObject);
4233 }
4234 
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into a wchar_t
   based output object.

   Looks up the error handler if *errorHandler is NULL, creates or updates
   the exception object, calls the handler and validates its
   (str, int) result.  The input pointers are then refreshed from the
   exception's object attribute, the replacement is copied to the output
   (growing it if needed), and *outpos, *endinpos and *inptr are updated.

   Returns 0 on success, -1 with an exception set on error. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix, leaving only the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Do not fall through and treat a non-bytes object as bytes. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is enough and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen units.  wcsncpy() would stop at an embedded
       null character and zero-fill the remainder of the replacement. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4350 
4351 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4352 unicode_decode_call_errorhandler_writer(
4353     const char *errors, PyObject **errorHandler,
4354     const char *encoding, const char *reason,
4355     const char **input, const char **inend, Py_ssize_t *startinpos,
4356     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4357     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4358 {
4359     static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4360 
4361     PyObject *restuple = NULL;
4362     PyObject *repunicode = NULL;
4363     Py_ssize_t insize;
4364     Py_ssize_t newpos;
4365     Py_ssize_t replen;
4366     PyObject *inputobj = NULL;
4367 
4368     if (*errorHandler == NULL) {
4369         *errorHandler = PyCodec_LookupError(errors);
4370         if (*errorHandler == NULL)
4371             goto onError;
4372     }
4373 
4374     make_decode_exception(exceptionObject,
4375         encoding,
4376         *input, *inend - *input,
4377         *startinpos, *endinpos,
4378         reason);
4379     if (*exceptionObject == NULL)
4380         goto onError;
4381 
4382     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4383     if (restuple == NULL)
4384         goto onError;
4385     if (!PyTuple_Check(restuple)) {
4386         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4387         goto onError;
4388     }
4389     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4390         goto onError;
4391 
4392     /* Copy back the bytes variables, which might have been modified by the
4393        callback */
4394     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4395     if (!inputobj)
4396         goto onError;
4397     if (!PyBytes_Check(inputobj)) {
4398         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4399     }
4400     *input = PyBytes_AS_STRING(inputobj);
4401     insize = PyBytes_GET_SIZE(inputobj);
4402     *inend = *input + insize;
4403     /* we can DECREF safely, as the exception has another reference,
4404        so the object won't go away. */
4405     Py_DECREF(inputobj);
4406 
4407     if (newpos<0)
4408         newpos = insize+newpos;
4409     if (newpos<0 || newpos>insize) {
4410         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4411         goto onError;
4412     }
4413 
4414     if (PyUnicode_READY(repunicode) < 0)
4415         goto onError;
4416     replen = PyUnicode_GET_LENGTH(repunicode);
4417     if (replen > 1) {
4418         writer->min_length += replen - 1;
4419         writer->overallocate = 1;
4420         if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4421                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4422             goto onError;
4423     }
4424     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4425         goto onError;
4426 
4427     *endinpos = newpos;
4428     *inptr = *input + newpos;
4429 
4430     /* we made it! */
4431     Py_XDECREF(restuple);
4432     return 0;
4433 
4434   onError:
4435     Py_XDECREF(restuple);
4436     return -1;
4437 }
4438 
4439 /* --- UTF-7 Codec -------------------------------------------------------- */
4440 
4441 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4442 
4443 /* Three simple macros defining base-64. */
4444 
/* Nonzero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument may be evaluated more than once. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))
4452 
/* Map a base-64 character c to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds: any character that is not a letter,
   digit or '+' is treated as '/'.  The argument may be evaluated more than
   once. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('+' == (c)) ? 62 : 63)
4460 
4461 /* What is the base-64 character of the bottom 6 bits of n? */
4462 
4463 #define TO_BASE64(n)  \
4464     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4465 
4466 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4467  * decoded as itself.  We are permissive on decoding; the only ASCII
4468  * byte not decoding to itself is the + which begins a base64
4469  * string. */
4470 
4471 #define DECODE_DIRECT(c)                                \
4472     ((c) <= 127 && (c) != '+')
4473 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array maps each ASCII code (0..127)
 * to its set:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * NUL and anything >= 128 are never direct; the range test comes first
 * so the table is only indexed with 1..127.  The argument c is
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4519 
4520 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4521 PyUnicode_DecodeUTF7(const char *s,
4522                      Py_ssize_t size,
4523                      const char *errors)
4524 {
4525     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4526 }
4527 
4528 /* The decoder.  The only state we preserve is our read position,
4529  * i.e. how many characters we have consumed.  So if we end in the
4530  * middle of a shift sequence we have to back off the read position
4531  * and the output to the beginning of the sequence, otherwise we lose
4532  * all the shift state (seen bits, number of bits seen, high
4533  * surrogate). */
4534 
4535 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4536 PyUnicode_DecodeUTF7Stateful(const char *s,
4537                              Py_ssize_t size,
4538                              const char *errors,
4539                              Py_ssize_t *consumed)
4540 {
4541     const char *starts = s;
4542     Py_ssize_t startinpos;
4543     Py_ssize_t endinpos;
4544     const char *e;
4545     _PyUnicodeWriter writer;
4546     const char *errmsg = "";
4547     int inShift = 0;
4548     Py_ssize_t shiftOutStart;
4549     unsigned int base64bits = 0;
4550     unsigned long base64buffer = 0;
4551     Py_UCS4 surrogate = 0;
4552     PyObject *errorHandler = NULL;
4553     PyObject *exc = NULL;
4554 
4555     if (size == 0) {
4556         if (consumed)
4557             *consumed = 0;
4558         _Py_RETURN_UNICODE_EMPTY();
4559     }
4560 
4561     /* Start off assuming it's all ASCII. Widen later as necessary. */
4562     _PyUnicodeWriter_Init(&writer);
4563     writer.min_length = size;
4564 
4565     shiftOutStart = 0;
4566     e = s + size;
4567 
4568     while (s < e) {
4569         Py_UCS4 ch;
4570       restart:
4571         ch = (unsigned char) *s;
4572 
4573         if (inShift) { /* in a base-64 section */
4574             if (IS_BASE64(ch)) { /* consume a base-64 character */
4575                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4576                 base64bits += 6;
4577                 s++;
4578                 if (base64bits >= 16) {
4579                     /* we have enough bits for a UTF-16 value */
4580                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4581                     base64bits -= 16;
4582                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4583                     assert(outCh <= 0xffff);
4584                     if (surrogate) {
4585                         /* expecting a second surrogate */
4586                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4587                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4588                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4589                                 goto onError;
4590                             surrogate = 0;
4591                             continue;
4592                         }
4593                         else {
4594                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4595                                 goto onError;
4596                             surrogate = 0;
4597                         }
4598                     }
4599                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4600                         /* first surrogate */
4601                         surrogate = outCh;
4602                     }
4603                     else {
4604                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4605                             goto onError;
4606                     }
4607                 }
4608             }
4609             else { /* now leaving a base-64 section */
4610                 inShift = 0;
4611                 if (base64bits > 0) { /* left-over bits */
4612                     if (base64bits >= 6) {
4613                         /* We've seen at least one base-64 character */
4614                         s++;
4615                         errmsg = "partial character in shift sequence";
4616                         goto utf7Error;
4617                     }
4618                     else {
4619                         /* Some bits remain; they should be zero */
4620                         if (base64buffer != 0) {
4621                             s++;
4622                             errmsg = "non-zero padding bits in shift sequence";
4623                             goto utf7Error;
4624                         }
4625                     }
4626                 }
4627                 if (surrogate && DECODE_DIRECT(ch)) {
4628                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4629                         goto onError;
4630                 }
4631                 surrogate = 0;
4632                 if (ch == '-') {
4633                     /* '-' is absorbed; other terminating
4634                        characters are preserved */
4635                     s++;
4636                 }
4637             }
4638         }
4639         else if ( ch == '+' ) {
4640             startinpos = s-starts;
4641             s++; /* consume '+' */
4642             if (s < e && *s == '-') { /* '+-' encodes '+' */
4643                 s++;
4644                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4645                     goto onError;
4646             }
4647             else { /* begin base64-encoded section */
4648                 inShift = 1;
4649                 surrogate = 0;
4650                 shiftOutStart = writer.pos;
4651                 base64bits = 0;
4652                 base64buffer = 0;
4653             }
4654         }
4655         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4656             s++;
4657             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4658                 goto onError;
4659         }
4660         else {
4661             startinpos = s-starts;
4662             s++;
4663             errmsg = "unexpected special character";
4664             goto utf7Error;
4665         }
4666         continue;
4667 utf7Error:
4668         endinpos = s-starts;
4669         if (unicode_decode_call_errorhandler_writer(
4670                 errors, &errorHandler,
4671                 "utf7", errmsg,
4672                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4673                 &writer))
4674             goto onError;
4675     }
4676 
4677     /* end of string */
4678 
4679     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4680         /* if we're in an inconsistent state, that's an error */
4681         inShift = 0;
4682         if (surrogate ||
4683                 (base64bits >= 6) ||
4684                 (base64bits > 0 && base64buffer != 0)) {
4685             endinpos = size;
4686             if (unicode_decode_call_errorhandler_writer(
4687                     errors, &errorHandler,
4688                     "utf7", "unterminated shift sequence",
4689                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4690                     &writer))
4691                 goto onError;
4692             if (s < e)
4693                 goto restart;
4694         }
4695     }
4696 
4697     /* return state */
4698     if (consumed) {
4699         if (inShift) {
4700             *consumed = startinpos;
4701             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4702                 PyObject *result = PyUnicode_FromKindAndData(
4703                         writer.kind, writer.data, shiftOutStart);
4704                 Py_XDECREF(errorHandler);
4705                 Py_XDECREF(exc);
4706                 _PyUnicodeWriter_Dealloc(&writer);
4707                 return result;
4708             }
4709             writer.pos = shiftOutStart; /* back off output */
4710         }
4711         else {
4712             *consumed = s-starts;
4713         }
4714     }
4715 
4716     Py_XDECREF(errorHandler);
4717     Py_XDECREF(exc);
4718     return _PyUnicodeWriter_Finish(&writer);
4719 
4720   onError:
4721     Py_XDECREF(errorHandler);
4722     Py_XDECREF(exc);
4723     _PyUnicodeWriter_Dealloc(&writer);
4724     return NULL;
4725 }
4726 
4727 
4728 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4729 _PyUnicode_EncodeUTF7(PyObject *str,
4730                       int base64SetO,
4731                       int base64WhiteSpace,
4732                       const char *errors)
4733 {
4734     int kind;
4735     void *data;
4736     Py_ssize_t len;
4737     PyObject *v;
4738     int inShift = 0;
4739     Py_ssize_t i;
4740     unsigned int base64bits = 0;
4741     unsigned long base64buffer = 0;
4742     char * out;
4743     char * start;
4744 
4745     if (PyUnicode_READY(str) == -1)
4746         return NULL;
4747     kind = PyUnicode_KIND(str);
4748     data = PyUnicode_DATA(str);
4749     len = PyUnicode_GET_LENGTH(str);
4750 
4751     if (len == 0)
4752         return PyBytes_FromStringAndSize(NULL, 0);
4753 
4754     /* It might be possible to tighten this worst case */
4755     if (len > PY_SSIZE_T_MAX / 8)
4756         return PyErr_NoMemory();
4757     v = PyBytes_FromStringAndSize(NULL, len * 8);
4758     if (v == NULL)
4759         return NULL;
4760 
4761     start = out = PyBytes_AS_STRING(v);
4762     for (i = 0; i < len; ++i) {
4763         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4764 
4765         if (inShift) {
4766             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4767                 /* shifting out */
4768                 if (base64bits) { /* output remaining bits */
4769                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4770                     base64buffer = 0;
4771                     base64bits = 0;
4772                 }
4773                 inShift = 0;
4774                 /* Characters not in the BASE64 set implicitly unshift the sequence
4775                    so no '-' is required, except if the character is itself a '-' */
4776                 if (IS_BASE64(ch) || ch == '-') {
4777                     *out++ = '-';
4778                 }
4779                 *out++ = (char) ch;
4780             }
4781             else {
4782                 goto encode_char;
4783             }
4784         }
4785         else { /* not in a shift sequence */
4786             if (ch == '+') {
4787                 *out++ = '+';
4788                         *out++ = '-';
4789             }
4790             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4791                 *out++ = (char) ch;
4792             }
4793             else {
4794                 *out++ = '+';
4795                 inShift = 1;
4796                 goto encode_char;
4797             }
4798         }
4799         continue;
4800 encode_char:
4801         if (ch >= 0x10000) {
4802             assert(ch <= MAX_UNICODE);
4803 
4804             /* code first surrogate */
4805             base64bits += 16;
4806             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4807             while (base64bits >= 6) {
4808                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4809                 base64bits -= 6;
4810             }
4811             /* prepare second surrogate */
4812             ch = Py_UNICODE_LOW_SURROGATE(ch);
4813         }
4814         base64bits += 16;
4815         base64buffer = (base64buffer << 16) | ch;
4816         while (base64bits >= 6) {
4817             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4818             base64bits -= 6;
4819         }
4820     }
4821     if (base64bits)
4822         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4823     if (inShift)
4824         *out++ = '-';
4825     if (_PyBytes_Resize(&v, out - start) < 0)
4826         return NULL;
4827     return v;
4828 }
4829 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4830 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4831                      Py_ssize_t size,
4832                      int base64SetO,
4833                      int base64WhiteSpace,
4834                      const char *errors)
4835 {
4836     PyObject *result;
4837     PyObject *tmp = PyUnicode_FromUnicode(s, size);
4838     if (tmp == NULL)
4839         return NULL;
4840     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4841                                    base64WhiteSpace, errors);
4842     Py_DECREF(tmp);
4843     return result;
4844 }
4845 
4846 #undef IS_BASE64
4847 #undef FROM_BASE64
4848 #undef TO_BASE64
4849 #undef DECODE_DIRECT
4850 #undef ENCODE_DIRECT
4851 
4852 /* --- UTF-8 Codec -------------------------------------------------------- */
4853 
4854 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4855 PyUnicode_DecodeUTF8(const char *s,
4856                      Py_ssize_t size,
4857                      const char *errors)
4858 {
4859     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860 }
4861 
4862 #include "stringlib/asciilib.h"
4863 #include "stringlib/codecs.h"
4864 #include "stringlib/undef.h"
4865 
4866 #include "stringlib/ucs1lib.h"
4867 #include "stringlib/codecs.h"
4868 #include "stringlib/undef.h"
4869 
4870 #include "stringlib/ucs2lib.h"
4871 #include "stringlib/codecs.h"
4872 #include "stringlib/undef.h"
4873 
4874 #include "stringlib/ucs4lib.h"
4875 #include "stringlib/codecs.h"
4876 #include "stringlib/undef.h"
4877 
4878 /* Mask to quickly check whether a C 'long' contains a
4879    non-ASCII, UTF8-encoded char. */
4880 #if (SIZEOF_LONG == 8)
4881 # define ASCII_CHAR_MASK 0x8080808080808080UL
4882 #elif (SIZEOF_LONG == 4)
4883 # define ASCII_CHAR_MASK 0x80808080UL
4884 #else
4885 # error C 'long' size should be either 4 or 8!
4886 #endif
4887 
4888 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4889 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4890 {
4891     const char *p = start;
4892     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4893 
4894     /*
4895      * Issue #17237: m68k is a bit different from most architectures in
4896      * that objects do not use "natural alignment" - for example, int and
4897      * long are only aligned at 2-byte boundaries.  Therefore the assert()
4898      * won't work; also, tests have shown that skipping the "optimised
4899      * version" will even speed up m68k.
4900      */
4901 #if !defined(__m68k__)
4902 #if SIZEOF_LONG <= SIZEOF_VOID_P
4903     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4904     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4905         /* Fast path, see in STRINGLIB(utf8_decode) for
4906            an explanation. */
4907         /* Help allocation */
4908         const char *_p = p;
4909         Py_UCS1 * q = dest;
4910         while (_p < aligned_end) {
4911             unsigned long value = *(const unsigned long *) _p;
4912             if (value & ASCII_CHAR_MASK)
4913                 break;
4914             *((unsigned long *)q) = value;
4915             _p += SIZEOF_LONG;
4916             q += SIZEOF_LONG;
4917         }
4918         p = _p;
4919         while (p < end) {
4920             if ((unsigned char)*p & 0x80)
4921                 break;
4922             *q++ = *p++;
4923         }
4924         return p - start;
4925     }
4926 #endif
4927 #endif
4928     while (p < end) {
4929         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4930            for an explanation. */
4931         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4932             /* Help allocation */
4933             const char *_p = p;
4934             while (_p < aligned_end) {
4935                 unsigned long value = *(unsigned long *) _p;
4936                 if (value & ASCII_CHAR_MASK)
4937                     break;
4938                 _p += SIZEOF_LONG;
4939             }
4940             p = _p;
4941             if (_p == end)
4942                 break;
4943         }
4944         if ((unsigned char)*p & 0x80)
4945             break;
4946         ++p;
4947     }
4948     memcpy(dest, start, p - start);
4949     return p - start;
4950 }
4951 
4952 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4953 PyUnicode_DecodeUTF8Stateful(const char *s,
4954                              Py_ssize_t size,
4955                              const char *errors,
4956                              Py_ssize_t *consumed)
4957 {
4958     _PyUnicodeWriter writer;
4959     const char *starts = s;
4960     const char *end = s + size;
4961 
4962     Py_ssize_t startinpos;
4963     Py_ssize_t endinpos;
4964     const char *errmsg = "";
4965     PyObject *error_handler_obj = NULL;
4966     PyObject *exc = NULL;
4967     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
4968 
4969     if (size == 0) {
4970         if (consumed)
4971             *consumed = 0;
4972         _Py_RETURN_UNICODE_EMPTY();
4973     }
4974 
4975     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4976     if (size == 1 && (unsigned char)s[0] < 128) {
4977         if (consumed)
4978             *consumed = 1;
4979         return get_latin1_char((unsigned char)s[0]);
4980     }
4981 
4982     _PyUnicodeWriter_Init(&writer);
4983     writer.min_length = size;
4984     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4985         goto onError;
4986 
4987     writer.pos = ascii_decode(s, end, writer.data);
4988     s += writer.pos;
4989     while (s < end) {
4990         Py_UCS4 ch;
4991         int kind = writer.kind;
4992 
4993         if (kind == PyUnicode_1BYTE_KIND) {
4994             if (PyUnicode_IS_ASCII(writer.buffer))
4995                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4996             else
4997                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4998         } else if (kind == PyUnicode_2BYTE_KIND) {
4999             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5000         } else {
5001             assert(kind == PyUnicode_4BYTE_KIND);
5002             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5003         }
5004 
5005         switch (ch) {
5006         case 0:
5007             if (s == end || consumed)
5008                 goto End;
5009             errmsg = "unexpected end of data";
5010             startinpos = s - starts;
5011             endinpos = end - starts;
5012             break;
5013         case 1:
5014             errmsg = "invalid start byte";
5015             startinpos = s - starts;
5016             endinpos = startinpos + 1;
5017             break;
5018         case 2:
5019         case 3:
5020         case 4:
5021             errmsg = "invalid continuation byte";
5022             startinpos = s - starts;
5023             endinpos = startinpos + ch - 1;
5024             break;
5025         default:
5026             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5027                 goto onError;
5028             continue;
5029         }
5030 
5031         if (error_handler == _Py_ERROR_UNKNOWN)
5032             error_handler = get_error_handler(errors);
5033 
5034         switch (error_handler) {
5035         case _Py_ERROR_IGNORE:
5036             s += (endinpos - startinpos);
5037             break;
5038 
5039         case _Py_ERROR_REPLACE:
5040             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041                 goto onError;
5042             s += (endinpos - startinpos);
5043             break;
5044 
5045         case _Py_ERROR_SURROGATEESCAPE:
5046         {
5047             Py_ssize_t i;
5048 
5049             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050                 goto onError;
5051             for (i=startinpos; i<endinpos; i++) {
5052                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054                                 ch + 0xdc00);
5055                 writer.pos++;
5056             }
5057             s += (endinpos - startinpos);
5058             break;
5059         }
5060 
5061         default:
5062             if (unicode_decode_call_errorhandler_writer(
5063                     errors, &error_handler_obj,
5064                     "utf-8", errmsg,
5065                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5066                     &writer))
5067                 goto onError;
5068         }
5069     }
5070 
5071 End:
5072     if (consumed)
5073         *consumed = s - starts;
5074 
5075     Py_XDECREF(error_handler_obj);
5076     Py_XDECREF(exc);
5077     return _PyUnicodeWriter_Finish(&writer);
5078 
5079 onError:
5080     Py_XDECREF(error_handler_obj);
5081     Py_XDECREF(exc);
5082     _PyUnicodeWriter_Dealloc(&writer);
5083     return NULL;
5084 }
5085 
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Bytes that cannot be decoded are stored as 0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error
   or if the buffer size would overflow. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: the resulting character count never exceeds size, so
       size + 1 elements (including the terminator) are allocated.
       The bound is written without computing size + 1, so that the
       overflow check cannot itself overflow for a huge size. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected from the UCS4 decoder */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* a zero result with all input used up means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
5141 
5142 /* Primary internal function which creates utf8 encoded bytes objects.
5143 
5144    Allocation strategy:  if the string is short, convert into a stack buffer
5145    and allocate exactly as much space needed at the end.  Else allocate the
5146    maximum possible needed (4 result bytes per Unicode character), and return
5147    the excess memory at the end.
5148 */
5149 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5150 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5151 {
5152     enum PyUnicode_Kind kind;
5153     void *data;
5154     Py_ssize_t size;
5155 
5156     if (!PyUnicode_Check(unicode)) {
5157         PyErr_BadArgument();
5158         return NULL;
5159     }
5160 
5161     if (PyUnicode_READY(unicode) == -1)
5162         return NULL;
5163 
5164     if (PyUnicode_UTF8(unicode))
5165         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5166                                          PyUnicode_UTF8_LENGTH(unicode));
5167 
5168     kind = PyUnicode_KIND(unicode);
5169     data = PyUnicode_DATA(unicode);
5170     size = PyUnicode_GET_LENGTH(unicode);
5171 
5172     switch (kind) {
5173     default:
5174         assert(0);
5175     case PyUnicode_1BYTE_KIND:
5176         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5177         assert(!PyUnicode_IS_ASCII(unicode));
5178         return ucs1lib_utf8_encoder(unicode, data, size, errors);
5179     case PyUnicode_2BYTE_KIND:
5180         return ucs2lib_utf8_encoder(unicode, data, size, errors);
5181     case PyUnicode_4BYTE_KIND:
5182         return ucs4lib_utf8_encoder(unicode, data, size, errors);
5183     }
5184 }
5185 
5186 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5187 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5188                      Py_ssize_t size,
5189                      const char *errors)
5190 {
5191     PyObject *v, *unicode;
5192 
5193     unicode = PyUnicode_FromUnicode(s, size);
5194     if (unicode == NULL)
5195         return NULL;
5196     v = _PyUnicode_AsUTF8String(unicode, errors);
5197     Py_DECREF(unicode);
5198     return v;
5199 }
5200 
5201 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5202 PyUnicode_AsUTF8String(PyObject *unicode)
5203 {
5204     return _PyUnicode_AsUTF8String(unicode, NULL);
5205 }
5206 
5207 /* --- UTF-32 Codec ------------------------------------------------------- */
5208 
5209 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5210 PyUnicode_DecodeUTF32(const char *s,
5211                       Py_ssize_t size,
5212                       const char *errors,
5213                       int *byteorder)
5214 {
5215     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5216 }
5217 
5218 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5219 PyUnicode_DecodeUTF32Stateful(const char *s,
5220                               Py_ssize_t size,
5221                               const char *errors,
5222                               int *byteorder,
5223                               Py_ssize_t *consumed)
5224 {
5225     const char *starts = s;
5226     Py_ssize_t startinpos;
5227     Py_ssize_t endinpos;
5228     _PyUnicodeWriter writer;
5229     const unsigned char *q, *e;
5230     int le, bo = 0;       /* assume native ordering by default */
5231     const char *encoding;
5232     const char *errmsg = "";
5233     PyObject *errorHandler = NULL;
5234     PyObject *exc = NULL;
5235 
5236     q = (unsigned char *)s;
5237     e = q + size;
5238 
5239     if (byteorder)
5240         bo = *byteorder;
5241 
5242     /* Check for BOM marks (U+FEFF) in the input and adjust current
5243        byte order setting accordingly. In native mode, the leading BOM
5244        mark is skipped, in all other modes, it is copied to the output
5245        stream as-is (giving a ZWNBSP character). */
5246     if (bo == 0 && size >= 4) {
5247         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5248         if (bom == 0x0000FEFF) {
5249             bo = -1;
5250             q += 4;
5251         }
5252         else if (bom == 0xFFFE0000) {
5253             bo = 1;
5254             q += 4;
5255         }
5256         if (byteorder)
5257             *byteorder = bo;
5258     }
5259 
5260     if (q == e) {
5261         if (consumed)
5262             *consumed = size;
5263         _Py_RETURN_UNICODE_EMPTY();
5264     }
5265 
5266 #ifdef WORDS_BIGENDIAN
5267     le = bo < 0;
5268 #else
5269     le = bo <= 0;
5270 #endif
5271     encoding = le ? "utf-32-le" : "utf-32-be";
5272 
5273     _PyUnicodeWriter_Init(&writer);
5274     writer.min_length = (e - q + 3) / 4;
5275     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5276         goto onError;
5277 
5278     while (1) {
5279         Py_UCS4 ch = 0;
5280         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5281 
5282         if (e - q >= 4) {
5283             enum PyUnicode_Kind kind = writer.kind;
5284             void *data = writer.data;
5285             const unsigned char *last = e - 4;
5286             Py_ssize_t pos = writer.pos;
5287             if (le) {
5288                 do {
5289                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5290                     if (ch > maxch)
5291                         break;
5292                     if (kind != PyUnicode_1BYTE_KIND &&
5293                         Py_UNICODE_IS_SURROGATE(ch))
5294                         break;
5295                     PyUnicode_WRITE(kind, data, pos++, ch);
5296                     q += 4;
5297                 } while (q <= last);
5298             }
5299             else {
5300                 do {
5301                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5302                     if (ch > maxch)
5303                         break;
5304                     if (kind != PyUnicode_1BYTE_KIND &&
5305                         Py_UNICODE_IS_SURROGATE(ch))
5306                         break;
5307                     PyUnicode_WRITE(kind, data, pos++, ch);
5308                     q += 4;
5309                 } while (q <= last);
5310             }
5311             writer.pos = pos;
5312         }
5313 
5314         if (Py_UNICODE_IS_SURROGATE(ch)) {
5315             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5316             startinpos = ((const char *)q) - starts;
5317             endinpos = startinpos + 4;
5318         }
5319         else if (ch <= maxch) {
5320             if (q == e || consumed)
5321                 break;
5322             /* remaining bytes at the end? (size should be divisible by 4) */
5323             errmsg = "truncated data";
5324             startinpos = ((const char *)q) - starts;
5325             endinpos = ((const char *)e) - starts;
5326         }
5327         else {
5328             if (ch < 0x110000) {
5329                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5330                     goto onError;
5331                 q += 4;
5332                 continue;
5333             }
5334             errmsg = "code point not in range(0x110000)";
5335             startinpos = ((const char *)q) - starts;
5336             endinpos = startinpos + 4;
5337         }
5338 
5339         /* The remaining input chars are ignored if the callback
5340            chooses to skip the input */
5341         if (unicode_decode_call_errorhandler_writer(
5342                 errors, &errorHandler,
5343                 encoding, errmsg,
5344                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5345                 &writer))
5346             goto onError;
5347     }
5348 
5349     if (consumed)
5350         *consumed = (const char *)q-starts;
5351 
5352     Py_XDECREF(errorHandler);
5353     Py_XDECREF(exc);
5354     return _PyUnicodeWriter_Finish(&writer);
5355 
5356   onError:
5357     _PyUnicodeWriter_Dealloc(&writer);
5358     Py_XDECREF(errorHandler);
5359     Py_XDECREF(exc);
5360     return NULL;
5361 }
5362 
5363 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5364 _PyUnicode_EncodeUTF32(PyObject *str,
5365                        const char *errors,
5366                        int byteorder)
5367 {
5368     enum PyUnicode_Kind kind;
5369     const void *data;
5370     Py_ssize_t len;
5371     PyObject *v;
5372     uint32_t *out;
5373 #if PY_LITTLE_ENDIAN
5374     int native_ordering = byteorder <= 0;
5375 #else
5376     int native_ordering = byteorder >= 0;
5377 #endif
5378     const char *encoding;
5379     Py_ssize_t nsize, pos;
5380     PyObject *errorHandler = NULL;
5381     PyObject *exc = NULL;
5382     PyObject *rep = NULL;
5383 
5384     if (!PyUnicode_Check(str)) {
5385         PyErr_BadArgument();
5386         return NULL;
5387     }
5388     if (PyUnicode_READY(str) == -1)
5389         return NULL;
5390     kind = PyUnicode_KIND(str);
5391     data = PyUnicode_DATA(str);
5392     len = PyUnicode_GET_LENGTH(str);
5393 
5394     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5395         return PyErr_NoMemory();
5396     nsize = len + (byteorder == 0);
5397     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5398     if (v == NULL)
5399         return NULL;
5400 
5401     /* output buffer is 4-bytes aligned */
5402     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5403     out = (uint32_t *)PyBytes_AS_STRING(v);
5404     if (byteorder == 0)
5405         *out++ = 0xFEFF;
5406     if (len == 0)
5407         goto done;
5408 
5409     if (byteorder == -1)
5410         encoding = "utf-32-le";
5411     else if (byteorder == 1)
5412         encoding = "utf-32-be";
5413     else
5414         encoding = "utf-32";
5415 
5416     if (kind == PyUnicode_1BYTE_KIND) {
5417         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5418         goto done;
5419     }
5420 
5421     pos = 0;
5422     while (pos < len) {
5423         Py_ssize_t repsize, moreunits;
5424 
5425         if (kind == PyUnicode_2BYTE_KIND) {
5426             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5427                                         &out, native_ordering);
5428         }
5429         else {
5430             assert(kind == PyUnicode_4BYTE_KIND);
5431             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5432                                         &out, native_ordering);
5433         }
5434         if (pos == len)
5435             break;
5436 
5437         rep = unicode_encode_call_errorhandler(
5438                 errors, &errorHandler,
5439                 encoding, "surrogates not allowed",
5440                 str, &exc, pos, pos + 1, &pos);
5441         if (!rep)
5442             goto error;
5443 
5444         if (PyBytes_Check(rep)) {
5445             repsize = PyBytes_GET_SIZE(rep);
5446             if (repsize & 3) {
5447                 raise_encode_exception(&exc, encoding,
5448                                        str, pos - 1, pos,
5449                                        "surrogates not allowed");
5450                 goto error;
5451             }
5452             moreunits = repsize / 4;
5453         }
5454         else {
5455             assert(PyUnicode_Check(rep));
5456             if (PyUnicode_READY(rep) < 0)
5457                 goto error;
5458             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5459             if (!PyUnicode_IS_ASCII(rep)) {
5460                 raise_encode_exception(&exc, encoding,
5461                                        str, pos - 1, pos,
5462                                        "surrogates not allowed");
5463                 goto error;
5464             }
5465         }
5466 
5467         /* four bytes are reserved for each surrogate */
5468         if (moreunits > 1) {
5469             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5470             Py_ssize_t morebytes = 4 * (moreunits - 1);
5471             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5472                 /* integer overflow */
5473                 PyErr_NoMemory();
5474                 goto error;
5475             }
5476             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5477                 goto error;
5478             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5479         }
5480 
5481         if (PyBytes_Check(rep)) {
5482             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5483             out += moreunits;
5484         } else /* rep is unicode */ {
5485             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5486             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5487                                  &out, native_ordering);
5488         }
5489 
5490         Py_CLEAR(rep);
5491     }
5492 
5493     /* Cut back to size actually needed. This is necessary for, for example,
5494        encoding of a string containing isolated surrogates and the 'ignore'
5495        handler is used. */
5496     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5497     if (nsize != PyBytes_GET_SIZE(v))
5498       _PyBytes_Resize(&v, nsize);
5499     Py_XDECREF(errorHandler);
5500     Py_XDECREF(exc);
5501   done:
5502     return v;
5503   error:
5504     Py_XDECREF(rep);
5505     Py_XDECREF(errorHandler);
5506     Py_XDECREF(exc);
5507     Py_XDECREF(v);
5508     return NULL;
5509 }
5510 
5511 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5512 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5513                       Py_ssize_t size,
5514                       const char *errors,
5515                       int byteorder)
5516 {
5517     PyObject *result;
5518     PyObject *tmp = PyUnicode_FromUnicode(s, size);
5519     if (tmp == NULL)
5520         return NULL;
5521     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5522     Py_DECREF(tmp);
5523     return result;
5524 }
5525 
5526 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5527 PyUnicode_AsUTF32String(PyObject *unicode)
5528 {
5529     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5530 }
5531 
5532 /* --- UTF-16 Codec ------------------------------------------------------- */
5533 
5534 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5535 PyUnicode_DecodeUTF16(const char *s,
5536                       Py_ssize_t size,
5537                       const char *errors,
5538                       int *byteorder)
5539 {
5540     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5541 }
5542 
5543 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5544 PyUnicode_DecodeUTF16Stateful(const char *s,
5545                               Py_ssize_t size,
5546                               const char *errors,
5547                               int *byteorder,
5548                               Py_ssize_t *consumed)
5549 {
5550     const char *starts = s;
5551     Py_ssize_t startinpos;
5552     Py_ssize_t endinpos;
5553     _PyUnicodeWriter writer;
5554     const unsigned char *q, *e;
5555     int bo = 0;       /* assume native ordering by default */
5556     int native_ordering;
5557     const char *errmsg = "";
5558     PyObject *errorHandler = NULL;
5559     PyObject *exc = NULL;
5560     const char *encoding;
5561 
5562     q = (unsigned char *)s;
5563     e = q + size;
5564 
5565     if (byteorder)
5566         bo = *byteorder;
5567 
5568     /* Check for BOM marks (U+FEFF) in the input and adjust current
5569        byte order setting accordingly. In native mode, the leading BOM
5570        mark is skipped, in all other modes, it is copied to the output
5571        stream as-is (giving a ZWNBSP character). */
5572     if (bo == 0 && size >= 2) {
5573         const Py_UCS4 bom = (q[1] << 8) | q[0];
5574         if (bom == 0xFEFF) {
5575             q += 2;
5576             bo = -1;
5577         }
5578         else if (bom == 0xFFFE) {
5579             q += 2;
5580             bo = 1;
5581         }
5582         if (byteorder)
5583             *byteorder = bo;
5584     }
5585 
5586     if (q == e) {
5587         if (consumed)
5588             *consumed = size;
5589         _Py_RETURN_UNICODE_EMPTY();
5590     }
5591 
5592 #if PY_LITTLE_ENDIAN
5593     native_ordering = bo <= 0;
5594     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5595 #else
5596     native_ordering = bo >= 0;
5597     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5598 #endif
5599 
5600     /* Note: size will always be longer than the resulting Unicode
5601        character count */
5602     _PyUnicodeWriter_Init(&writer);
5603     writer.min_length = (e - q + 1) / 2;
5604     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5605         goto onError;
5606 
5607     while (1) {
5608         Py_UCS4 ch = 0;
5609         if (e - q >= 2) {
5610             int kind = writer.kind;
5611             if (kind == PyUnicode_1BYTE_KIND) {
5612                 if (PyUnicode_IS_ASCII(writer.buffer))
5613                     ch = asciilib_utf16_decode(&q, e,
5614                             (Py_UCS1*)writer.data, &writer.pos,
5615                             native_ordering);
5616                 else
5617                     ch = ucs1lib_utf16_decode(&q, e,
5618                             (Py_UCS1*)writer.data, &writer.pos,
5619                             native_ordering);
5620             } else if (kind == PyUnicode_2BYTE_KIND) {
5621                 ch = ucs2lib_utf16_decode(&q, e,
5622                         (Py_UCS2*)writer.data, &writer.pos,
5623                         native_ordering);
5624             } else {
5625                 assert(kind == PyUnicode_4BYTE_KIND);
5626                 ch = ucs4lib_utf16_decode(&q, e,
5627                         (Py_UCS4*)writer.data, &writer.pos,
5628                         native_ordering);
5629             }
5630         }
5631 
5632         switch (ch)
5633         {
5634         case 0:
5635             /* remaining byte at the end? (size should be even) */
5636             if (q == e || consumed)
5637                 goto End;
5638             errmsg = "truncated data";
5639             startinpos = ((const char *)q) - starts;
5640             endinpos = ((const char *)e) - starts;
5641             break;
5642             /* The remaining input chars are ignored if the callback
5643                chooses to skip the input */
5644         case 1:
5645             q -= 2;
5646             if (consumed)
5647                 goto End;
5648             errmsg = "unexpected end of data";
5649             startinpos = ((const char *)q) - starts;
5650             endinpos = ((const char *)e) - starts;
5651             break;
5652         case 2:
5653             errmsg = "illegal encoding";
5654             startinpos = ((const char *)q) - 2 - starts;
5655             endinpos = startinpos + 2;
5656             break;
5657         case 3:
5658             errmsg = "illegal UTF-16 surrogate";
5659             startinpos = ((const char *)q) - 4 - starts;
5660             endinpos = startinpos + 2;
5661             break;
5662         default:
5663             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5664                 goto onError;
5665             continue;
5666         }
5667 
5668         if (unicode_decode_call_errorhandler_writer(
5669                 errors,
5670                 &errorHandler,
5671                 encoding, errmsg,
5672                 &starts,
5673                 (const char **)&e,
5674                 &startinpos,
5675                 &endinpos,
5676                 &exc,
5677                 (const char **)&q,
5678                 &writer))
5679             goto onError;
5680     }
5681 
5682 End:
5683     if (consumed)
5684         *consumed = (const char *)q-starts;
5685 
5686     Py_XDECREF(errorHandler);
5687     Py_XDECREF(exc);
5688     return _PyUnicodeWriter_Finish(&writer);
5689 
5690   onError:
5691     _PyUnicodeWriter_Dealloc(&writer);
5692     Py_XDECREF(errorHandler);
5693     Py_XDECREF(exc);
5694     return NULL;
5695 }
5696 
5697 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5698 _PyUnicode_EncodeUTF16(PyObject *str,
5699                        const char *errors,
5700                        int byteorder)
5701 {
5702     enum PyUnicode_Kind kind;
5703     const void *data;
5704     Py_ssize_t len;
5705     PyObject *v;
5706     unsigned short *out;
5707     Py_ssize_t pairs;
5708 #if PY_BIG_ENDIAN
5709     int native_ordering = byteorder >= 0;
5710 #else
5711     int native_ordering = byteorder <= 0;
5712 #endif
5713     const char *encoding;
5714     Py_ssize_t nsize, pos;
5715     PyObject *errorHandler = NULL;
5716     PyObject *exc = NULL;
5717     PyObject *rep = NULL;
5718 
5719     if (!PyUnicode_Check(str)) {
5720         PyErr_BadArgument();
5721         return NULL;
5722     }
5723     if (PyUnicode_READY(str) == -1)
5724         return NULL;
5725     kind = PyUnicode_KIND(str);
5726     data = PyUnicode_DATA(str);
5727     len = PyUnicode_GET_LENGTH(str);
5728 
5729     pairs = 0;
5730     if (kind == PyUnicode_4BYTE_KIND) {
5731         const Py_UCS4 *in = (const Py_UCS4 *)data;
5732         const Py_UCS4 *end = in + len;
5733         while (in < end) {
5734             if (*in++ >= 0x10000) {
5735                 pairs++;
5736             }
5737         }
5738     }
5739     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5740         return PyErr_NoMemory();
5741     }
5742     nsize = len + pairs + (byteorder == 0);
5743     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5744     if (v == NULL) {
5745         return NULL;
5746     }
5747 
5748     /* output buffer is 2-bytes aligned */
5749     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5750     out = (unsigned short *)PyBytes_AS_STRING(v);
5751     if (byteorder == 0) {
5752         *out++ = 0xFEFF;
5753     }
5754     if (len == 0) {
5755         goto done;
5756     }
5757 
5758     if (kind == PyUnicode_1BYTE_KIND) {
5759         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5760         goto done;
5761     }
5762 
5763     if (byteorder < 0) {
5764         encoding = "utf-16-le";
5765     }
5766     else if (byteorder > 0) {
5767         encoding = "utf-16-be";
5768     }
5769     else {
5770         encoding = "utf-16";
5771     }
5772 
5773     pos = 0;
5774     while (pos < len) {
5775         Py_ssize_t repsize, moreunits;
5776 
5777         if (kind == PyUnicode_2BYTE_KIND) {
5778             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5779                                         &out, native_ordering);
5780         }
5781         else {
5782             assert(kind == PyUnicode_4BYTE_KIND);
5783             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5784                                         &out, native_ordering);
5785         }
5786         if (pos == len)
5787             break;
5788 
5789         rep = unicode_encode_call_errorhandler(
5790                 errors, &errorHandler,
5791                 encoding, "surrogates not allowed",
5792                 str, &exc, pos, pos + 1, &pos);
5793         if (!rep)
5794             goto error;
5795 
5796         if (PyBytes_Check(rep)) {
5797             repsize = PyBytes_GET_SIZE(rep);
5798             if (repsize & 1) {
5799                 raise_encode_exception(&exc, encoding,
5800                                        str, pos - 1, pos,
5801                                        "surrogates not allowed");
5802                 goto error;
5803             }
5804             moreunits = repsize / 2;
5805         }
5806         else {
5807             assert(PyUnicode_Check(rep));
5808             if (PyUnicode_READY(rep) < 0)
5809                 goto error;
5810             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5811             if (!PyUnicode_IS_ASCII(rep)) {
5812                 raise_encode_exception(&exc, encoding,
5813                                        str, pos - 1, pos,
5814                                        "surrogates not allowed");
5815                 goto error;
5816             }
5817         }
5818 
5819         /* two bytes are reserved for each surrogate */
5820         if (moreunits > 1) {
5821             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5822             Py_ssize_t morebytes = 2 * (moreunits - 1);
5823             if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5824                 /* integer overflow */
5825                 PyErr_NoMemory();
5826                 goto error;
5827             }
5828             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5829                 goto error;
5830             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5831         }
5832 
5833         if (PyBytes_Check(rep)) {
5834             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5835             out += moreunits;
5836         } else /* rep is unicode */ {
5837             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5838             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839                                  &out, native_ordering);
5840         }
5841 
5842         Py_CLEAR(rep);
5843     }
5844 
5845     /* Cut back to size actually needed. This is necessary for, for example,
5846     encoding of a string containing isolated surrogates and the 'ignore' handler
5847     is used. */
5848     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5849     if (nsize != PyBytes_GET_SIZE(v))
5850       _PyBytes_Resize(&v, nsize);
5851     Py_XDECREF(errorHandler);
5852     Py_XDECREF(exc);
5853   done:
5854     return v;
5855   error:
5856     Py_XDECREF(rep);
5857     Py_XDECREF(errorHandler);
5858     Py_XDECREF(exc);
5859     Py_XDECREF(v);
5860     return NULL;
5861 #undef STORECHAR
5862 }
5863 
5864 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5865 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5866                       Py_ssize_t size,
5867                       const char *errors,
5868                       int byteorder)
5869 {
5870     PyObject *result;
5871     PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872     if (tmp == NULL)
5873         return NULL;
5874     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5875     Py_DECREF(tmp);
5876     return result;
5877 }
5878 
5879 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5880 PyUnicode_AsUTF16String(PyObject *unicode)
5881 {
5882     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5883 }
5884 
5885 /* --- Unicode Escape Codec ----------------------------------------------- */
5886 
5887 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5888 
5889 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)5890 _PyUnicode_DecodeUnicodeEscape(const char *s,
5891                                Py_ssize_t size,
5892                                const char *errors,
5893                                const char **first_invalid_escape)
5894 {
5895     const char *starts = s;
5896     _PyUnicodeWriter writer;
5897     const char *end;
5898     PyObject *errorHandler = NULL;
5899     PyObject *exc = NULL;
5900 
5901     // so we can remember if we've seen an invalid escape char or not
5902     *first_invalid_escape = NULL;
5903 
5904     if (size == 0) {
5905         _Py_RETURN_UNICODE_EMPTY();
5906     }
5907     /* Escaped strings will always be longer than the resulting
5908        Unicode string, so we start with size here and then reduce the
5909        length after conversion to the true value.
5910        (but if the error callback returns a long replacement string
5911        we'll have to allocate more space) */
5912     _PyUnicodeWriter_Init(&writer);
5913     writer.min_length = size;
5914     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5915         goto onError;
5916     }
5917 
5918     end = s + size;
5919     while (s < end) {
5920         unsigned char c = (unsigned char) *s++;
5921         Py_UCS4 ch;
5922         int count;
5923         Py_ssize_t startinpos;
5924         Py_ssize_t endinpos;
5925         const char *message;
5926 
5927 #define WRITE_ASCII_CHAR(ch)                                                  \
5928             do {                                                              \
5929                 assert(ch <= 127);                                            \
5930                 assert(writer.pos < writer.size);                             \
5931                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
5932             } while(0)
5933 
5934 #define WRITE_CHAR(ch)                                                        \
5935             do {                                                              \
5936                 if (ch <= writer.maxchar) {                                   \
5937                     assert(writer.pos < writer.size);                         \
5938                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5939                 }                                                             \
5940                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5941                     goto onError;                                             \
5942                 }                                                             \
5943             } while(0)
5944 
5945         /* Non-escape characters are interpreted as Unicode ordinals */
5946         if (c != '\\') {
5947             WRITE_CHAR(c);
5948             continue;
5949         }
5950 
5951         startinpos = s - starts - 1;
5952         /* \ - Escapes */
5953         if (s >= end) {
5954             message = "\\ at end of string";
5955             goto error;
5956         }
5957         c = (unsigned char) *s++;
5958 
5959         assert(writer.pos < writer.size);
5960         switch (c) {
5961 
5962             /* \x escapes */
5963         case '\n': continue;
5964         case '\\': WRITE_ASCII_CHAR('\\'); continue;
5965         case '\'': WRITE_ASCII_CHAR('\''); continue;
5966         case '\"': WRITE_ASCII_CHAR('\"'); continue;
5967         case 'b': WRITE_ASCII_CHAR('\b'); continue;
5968         /* FF */
5969         case 'f': WRITE_ASCII_CHAR('\014'); continue;
5970         case 't': WRITE_ASCII_CHAR('\t'); continue;
5971         case 'n': WRITE_ASCII_CHAR('\n'); continue;
5972         case 'r': WRITE_ASCII_CHAR('\r'); continue;
5973         /* VT */
5974         case 'v': WRITE_ASCII_CHAR('\013'); continue;
5975         /* BEL, not classic C */
5976         case 'a': WRITE_ASCII_CHAR('\007'); continue;
5977 
5978             /* \OOO (octal) escapes */
5979         case '0': case '1': case '2': case '3':
5980         case '4': case '5': case '6': case '7':
5981             ch = c - '0';
5982             if (s < end && '0' <= *s && *s <= '7') {
5983                 ch = (ch<<3) + *s++ - '0';
5984                 if (s < end && '0' <= *s && *s <= '7') {
5985                     ch = (ch<<3) + *s++ - '0';
5986                 }
5987             }
5988             WRITE_CHAR(ch);
5989             continue;
5990 
5991             /* hex escapes */
5992             /* \xXX */
5993         case 'x':
5994             count = 2;
5995             message = "truncated \\xXX escape";
5996             goto hexescape;
5997 
5998             /* \uXXXX */
5999         case 'u':
6000             count = 4;
6001             message = "truncated \\uXXXX escape";
6002             goto hexescape;
6003 
6004             /* \UXXXXXXXX */
6005         case 'U':
6006             count = 8;
6007             message = "truncated \\UXXXXXXXX escape";
6008         hexescape:
6009             for (ch = 0; count && s < end; ++s, --count) {
6010                 c = (unsigned char)*s;
6011                 ch <<= 4;
6012                 if (c >= '0' && c <= '9') {
6013                     ch += c - '0';
6014                 }
6015                 else if (c >= 'a' && c <= 'f') {
6016                     ch += c - ('a' - 10);
6017                 }
6018                 else if (c >= 'A' && c <= 'F') {
6019                     ch += c - ('A' - 10);
6020                 }
6021                 else {
6022                     break;
6023                 }
6024             }
6025             if (count) {
6026                 goto error;
6027             }
6028 
6029             /* when we get here, ch is a 32-bit unicode character */
6030             if (ch > MAX_UNICODE) {
6031                 message = "illegal Unicode character";
6032                 goto error;
6033             }
6034 
6035             WRITE_CHAR(ch);
6036             continue;
6037 
6038             /* \N{name} */
6039         case 'N':
6040             if (ucnhash_CAPI == NULL) {
6041                 /* load the unicode data module */
6042                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6043                                                 PyUnicodeData_CAPSULE_NAME, 1);
6044                 if (ucnhash_CAPI == NULL) {
6045                     PyErr_SetString(
6046                         PyExc_UnicodeError,
6047                         "\\N escapes not supported (can't load unicodedata module)"
6048                         );
6049                     goto onError;
6050                 }
6051             }
6052 
6053             message = "malformed \\N character escape";
6054             if (*s == '{') {
6055                 const char *start = ++s;
6056                 size_t namelen;
6057                 /* look for the closing brace */
6058                 while (s < end && *s != '}')
6059                     s++;
6060                 namelen = s - start;
6061                 if (namelen && s < end) {
6062                     /* found a name.  look it up in the unicode database */
6063                     s++;
6064                     ch = 0xffffffff; /* in case 'getcode' messes up */
6065                     if (namelen <= INT_MAX &&
6066                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6067                                               &ch, 0)) {
6068                         assert(ch <= MAX_UNICODE);
6069                         WRITE_CHAR(ch);
6070                         continue;
6071                     }
6072                     message = "unknown Unicode character name";
6073                 }
6074             }
6075             goto error;
6076 
6077         default:
6078             if (*first_invalid_escape == NULL) {
6079                 *first_invalid_escape = s-1; /* Back up one char, since we've
6080                                                 already incremented s. */
6081             }
6082             WRITE_ASCII_CHAR('\\');
6083             WRITE_CHAR(c);
6084             continue;
6085         }
6086 
6087       error:
6088         endinpos = s-starts;
6089         writer.min_length = end - s + writer.pos;
6090         if (unicode_decode_call_errorhandler_writer(
6091                 errors, &errorHandler,
6092                 "unicodeescape", message,
6093                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6094                 &writer)) {
6095             goto onError;
6096         }
6097         if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6098             goto onError;
6099         }
6100 
6101 #undef WRITE_ASCII_CHAR
6102 #undef WRITE_CHAR
6103     }
6104 
6105     Py_XDECREF(errorHandler);
6106     Py_XDECREF(exc);
6107     return _PyUnicodeWriter_Finish(&writer);
6108 
6109   onError:
6110     _PyUnicodeWriter_Dealloc(&writer);
6111     Py_XDECREF(errorHandler);
6112     Py_XDECREF(exc);
6113     return NULL;
6114 }
6115 
6116 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6117 PyUnicode_DecodeUnicodeEscape(const char *s,
6118                               Py_ssize_t size,
6119                               const char *errors)
6120 {
6121     const char *first_invalid_escape;
6122     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6123                                                       &first_invalid_escape);
6124     if (result == NULL)
6125         return NULL;
6126     if (first_invalid_escape != NULL) {
6127         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6128                              "invalid escape sequence '\\%c'",
6129                              *first_invalid_escape) < 0) {
6130             Py_DECREF(result);
6131             return NULL;
6132         }
6133     }
6134     return result;
6135 }
6136 
6137 /* Return a Unicode-Escape string version of the Unicode object. */
6138 
6139 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6140 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6141 {
6142     Py_ssize_t i, len;
6143     PyObject *repr;
6144     char *p;
6145     enum PyUnicode_Kind kind;
6146     void *data;
6147     Py_ssize_t expandsize;
6148 
6149     /* Initial allocation is based on the longest-possible character
6150        escape.
6151 
6152        For UCS1 strings it's '\xxx', 4 bytes per source character.
6153        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6154        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6155     */
6156 
6157     if (!PyUnicode_Check(unicode)) {
6158         PyErr_BadArgument();
6159         return NULL;
6160     }
6161     if (PyUnicode_READY(unicode) == -1) {
6162         return NULL;
6163     }
6164 
6165     len = PyUnicode_GET_LENGTH(unicode);
6166     if (len == 0) {
6167         return PyBytes_FromStringAndSize(NULL, 0);
6168     }
6169 
6170     kind = PyUnicode_KIND(unicode);
6171     data = PyUnicode_DATA(unicode);
6172     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6173        bytes, and 1 byte characters 4. */
6174     expandsize = kind * 2 + 2;
6175     if (len > PY_SSIZE_T_MAX / expandsize) {
6176         return PyErr_NoMemory();
6177     }
6178     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6179     if (repr == NULL) {
6180         return NULL;
6181     }
6182 
6183     p = PyBytes_AS_STRING(repr);
6184     for (i = 0; i < len; i++) {
6185         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6186 
6187         /* U+0000-U+00ff range */
6188         if (ch < 0x100) {
6189             if (ch >= ' ' && ch < 127) {
6190                 if (ch != '\\') {
6191                     /* Copy printable US ASCII as-is */
6192                     *p++ = (char) ch;
6193                 }
6194                 /* Escape backslashes */
6195                 else {
6196                     *p++ = '\\';
6197                     *p++ = '\\';
6198                 }
6199             }
6200 
6201             /* Map special whitespace to '\t', \n', '\r' */
6202             else if (ch == '\t') {
6203                 *p++ = '\\';
6204                 *p++ = 't';
6205             }
6206             else if (ch == '\n') {
6207                 *p++ = '\\';
6208                 *p++ = 'n';
6209             }
6210             else if (ch == '\r') {
6211                 *p++ = '\\';
6212                 *p++ = 'r';
6213             }
6214 
6215             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6216             else {
6217                 *p++ = '\\';
6218                 *p++ = 'x';
6219                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6220                 *p++ = Py_hexdigits[ch & 0x000F];
6221             }
6222         }
6223         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6224         else if (ch < 0x10000) {
6225             *p++ = '\\';
6226             *p++ = 'u';
6227             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6228             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6229             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230             *p++ = Py_hexdigits[ch & 0x000F];
6231         }
6232         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6233         else {
6234 
6235             /* Make sure that the first two digits are zero */
6236             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6237             *p++ = '\\';
6238             *p++ = 'U';
6239             *p++ = '0';
6240             *p++ = '0';
6241             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6242             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6243             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6244             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6245             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6246             *p++ = Py_hexdigits[ch & 0x0000000F];
6247         }
6248     }
6249 
6250     assert(p - PyBytes_AS_STRING(repr) > 0);
6251     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6252         return NULL;
6253     }
6254     return repr;
6255 }
6256 
6257 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6258 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6259                               Py_ssize_t size)
6260 {
6261     PyObject *result;
6262     PyObject *tmp = PyUnicode_FromUnicode(s, size);
6263     if (tmp == NULL) {
6264         return NULL;
6265     }
6266 
6267     result = PyUnicode_AsUnicodeEscapeString(tmp);
6268     Py_DECREF(tmp);
6269     return result;
6270 }
6271 
6272 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6273 
6274 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6275 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6276                                  Py_ssize_t size,
6277                                  const char *errors)
6278 {
6279     const char *starts = s;
6280     _PyUnicodeWriter writer;
6281     const char *end;
6282     PyObject *errorHandler = NULL;
6283     PyObject *exc = NULL;
6284 
6285     if (size == 0) {
6286         _Py_RETURN_UNICODE_EMPTY();
6287     }
6288 
6289     /* Escaped strings will always be longer than the resulting
6290        Unicode string, so we start with size here and then reduce the
6291        length after conversion to the true value. (But decoding error
6292        handler might have to resize the string) */
6293     _PyUnicodeWriter_Init(&writer);
6294      writer.min_length = size;
6295     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6296         goto onError;
6297     }
6298 
6299     end = s + size;
6300     while (s < end) {
6301         unsigned char c = (unsigned char) *s++;
6302         Py_UCS4 ch;
6303         int count;
6304         Py_ssize_t startinpos;
6305         Py_ssize_t endinpos;
6306         const char *message;
6307 
6308 #define WRITE_CHAR(ch)                                                        \
6309             do {                                                              \
6310                 if (ch <= writer.maxchar) {                                   \
6311                     assert(writer.pos < writer.size);                         \
6312                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6313                 }                                                             \
6314                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6315                     goto onError;                                             \
6316                 }                                                             \
6317             } while(0)
6318 
6319         /* Non-escape characters are interpreted as Unicode ordinals */
6320         if (c != '\\' || s >= end) {
6321             WRITE_CHAR(c);
6322             continue;
6323         }
6324 
6325         c = (unsigned char) *s++;
6326         if (c == 'u') {
6327             count = 4;
6328             message = "truncated \\uXXXX escape";
6329         }
6330         else if (c == 'U') {
6331             count = 8;
6332             message = "truncated \\UXXXXXXXX escape";
6333         }
6334         else {
6335             assert(writer.pos < writer.size);
6336             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6337             WRITE_CHAR(c);
6338             continue;
6339         }
6340         startinpos = s - starts - 2;
6341 
6342         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6343         for (ch = 0; count && s < end; ++s, --count) {
6344             c = (unsigned char)*s;
6345             ch <<= 4;
6346             if (c >= '0' && c <= '9') {
6347                 ch += c - '0';
6348             }
6349             else if (c >= 'a' && c <= 'f') {
6350                 ch += c - ('a' - 10);
6351             }
6352             else if (c >= 'A' && c <= 'F') {
6353                 ch += c - ('A' - 10);
6354             }
6355             else {
6356                 break;
6357             }
6358         }
6359         if (!count) {
6360             if (ch <= MAX_UNICODE) {
6361                 WRITE_CHAR(ch);
6362                 continue;
6363             }
6364             message = "\\Uxxxxxxxx out of range";
6365         }
6366 
6367         endinpos = s-starts;
6368         writer.min_length = end - s + writer.pos;
6369         if (unicode_decode_call_errorhandler_writer(
6370                 errors, &errorHandler,
6371                 "rawunicodeescape", message,
6372                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6373                 &writer)) {
6374             goto onError;
6375         }
6376         if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6377             goto onError;
6378         }
6379 
6380 #undef WRITE_CHAR
6381     }
6382     Py_XDECREF(errorHandler);
6383     Py_XDECREF(exc);
6384     return _PyUnicodeWriter_Finish(&writer);
6385 
6386   onError:
6387     _PyUnicodeWriter_Dealloc(&writer);
6388     Py_XDECREF(errorHandler);
6389     Py_XDECREF(exc);
6390     return NULL;
6391 
6392 }
6393 
6394 
6395 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6396 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6397 {
6398     PyObject *repr;
6399     char *p;
6400     Py_ssize_t expandsize, pos;
6401     int kind;
6402     void *data;
6403     Py_ssize_t len;
6404 
6405     if (!PyUnicode_Check(unicode)) {
6406         PyErr_BadArgument();
6407         return NULL;
6408     }
6409     if (PyUnicode_READY(unicode) == -1) {
6410         return NULL;
6411     }
6412     kind = PyUnicode_KIND(unicode);
6413     data = PyUnicode_DATA(unicode);
6414     len = PyUnicode_GET_LENGTH(unicode);
6415     if (kind == PyUnicode_1BYTE_KIND) {
6416         return PyBytes_FromStringAndSize(data, len);
6417     }
6418 
6419     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6420        bytes, and 1 byte characters 4. */
6421     expandsize = kind * 2 + 2;
6422 
6423     if (len > PY_SSIZE_T_MAX / expandsize) {
6424         return PyErr_NoMemory();
6425     }
6426     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6427     if (repr == NULL) {
6428         return NULL;
6429     }
6430     if (len == 0) {
6431         return repr;
6432     }
6433 
6434     p = PyBytes_AS_STRING(repr);
6435     for (pos = 0; pos < len; pos++) {
6436         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6437 
6438         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6439         if (ch < 0x100) {
6440             *p++ = (char) ch;
6441         }
6442         /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6443         else if (ch < 0x10000) {
6444             *p++ = '\\';
6445             *p++ = 'u';
6446             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6447             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6448             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6449             *p++ = Py_hexdigits[ch & 15];
6450         }
6451         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6452         else {
6453             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6454             *p++ = '\\';
6455             *p++ = 'U';
6456             *p++ = '0';
6457             *p++ = '0';
6458             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6459             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6460             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6461             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6462             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6463             *p++ = Py_hexdigits[ch & 15];
6464         }
6465     }
6466 
6467     assert(p > PyBytes_AS_STRING(repr));
6468     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6469         return NULL;
6470     }
6471     return repr;
6472 }
6473 
6474 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6475 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6476                                  Py_ssize_t size)
6477 {
6478     PyObject *result;
6479     PyObject *tmp = PyUnicode_FromUnicode(s, size);
6480     if (tmp == NULL)
6481         return NULL;
6482     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6483     Py_DECREF(tmp);
6484     return result;
6485 }
6486 
6487 /* --- Unicode Internal Codec ------------------------------------------- */
6488 
6489 PyObject *
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)6490 _PyUnicode_DecodeUnicodeInternal(const char *s,
6491                                  Py_ssize_t size,
6492                                  const char *errors)
6493 {
6494     const char *starts = s;
6495     Py_ssize_t startinpos;
6496     Py_ssize_t endinpos;
6497     _PyUnicodeWriter writer;
6498     const char *end;
6499     const char *reason;
6500     PyObject *errorHandler = NULL;
6501     PyObject *exc = NULL;
6502 
6503     if (PyErr_WarnEx(PyExc_DeprecationWarning,
6504                      "unicode_internal codec has been deprecated",
6505                      1))
6506         return NULL;
6507 
6508     if (size == 0)
6509         _Py_RETURN_UNICODE_EMPTY();
6510 
6511     _PyUnicodeWriter_Init(&writer);
6512     if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6513         PyErr_NoMemory();
6514         goto onError;
6515     }
6516     writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6517 
6518     end = s + size;
6519     while (s < end) {
6520         Py_UNICODE uch;
6521         Py_UCS4 ch;
6522         if (end - s < Py_UNICODE_SIZE) {
6523             endinpos = end-starts;
6524             reason = "truncated input";
6525             goto error;
6526         }
6527         /* We copy the raw representation one byte at a time because the
6528            pointer may be unaligned (see test_codeccallbacks). */
6529         ((char *) &uch)[0] = s[0];
6530         ((char *) &uch)[1] = s[1];
6531 #ifdef Py_UNICODE_WIDE
6532         ((char *) &uch)[2] = s[2];
6533         ((char *) &uch)[3] = s[3];
6534 #endif
6535         ch = uch;
6536 #ifdef Py_UNICODE_WIDE
6537         /* We have to sanity check the raw data, otherwise doom looms for
6538            some malformed UCS-4 data. */
6539         if (ch > 0x10ffff) {
6540             endinpos = s - starts + Py_UNICODE_SIZE;
6541             reason = "illegal code point (> 0x10FFFF)";
6542             goto error;
6543         }
6544 #endif
6545         s += Py_UNICODE_SIZE;
6546 #ifndef Py_UNICODE_WIDE
6547         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6548         {
6549             Py_UNICODE uch2;
6550             ((char *) &uch2)[0] = s[0];
6551             ((char *) &uch2)[1] = s[1];
6552             if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6553             {
6554                 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6555                 s += Py_UNICODE_SIZE;
6556             }
6557         }
6558 #endif
6559 
6560         if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6561             goto onError;
6562         continue;
6563 
6564   error:
6565         startinpos = s - starts;
6566         if (unicode_decode_call_errorhandler_writer(
6567                 errors, &errorHandler,
6568                 "unicode_internal", reason,
6569                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6570                 &writer))
6571             goto onError;
6572     }
6573 
6574     Py_XDECREF(errorHandler);
6575     Py_XDECREF(exc);
6576     return _PyUnicodeWriter_Finish(&writer);
6577 
6578   onError:
6579     _PyUnicodeWriter_Dealloc(&writer);
6580     Py_XDECREF(errorHandler);
6581     Py_XDECREF(exc);
6582     return NULL;
6583 }
6584 
6585 /* --- Latin-1 Codec ------------------------------------------------------ */
6586 
6587 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6588 PyUnicode_DecodeLatin1(const char *s,
6589                        Py_ssize_t size,
6590                        const char *errors)
6591 {
6592     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6593     return _PyUnicode_FromUCS1((unsigned char*)s, size);
6594 }
6595 
6596 /* create or adjust a UnicodeEncodeError */
6597 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6598 make_encode_exception(PyObject **exceptionObject,
6599                       const char *encoding,
6600                       PyObject *unicode,
6601                       Py_ssize_t startpos, Py_ssize_t endpos,
6602                       const char *reason)
6603 {
6604     if (*exceptionObject == NULL) {
6605         *exceptionObject = PyObject_CallFunction(
6606             PyExc_UnicodeEncodeError, "sOnns",
6607             encoding, unicode, startpos, endpos, reason);
6608     }
6609     else {
6610         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6611             goto onError;
6612         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6613             goto onError;
6614         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6615             goto onError;
6616         return;
6617       onError:
6618         Py_CLEAR(*exceptionObject);
6619     }
6620 }
6621 
6622 /* raises a UnicodeEncodeError */
6623 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6624 raise_encode_exception(PyObject **exceptionObject,
6625                        const char *encoding,
6626                        PyObject *unicode,
6627                        Py_ssize_t startpos, Py_ssize_t endpos,
6628                        const char *reason)
6629 {
6630     make_encode_exception(exceptionObject,
6631                           encoding, unicode, startpos, endpos, reason);
6632     if (*exceptionObject != NULL)
6633         PyCodec_StrictErrors(*exceptionObject);
6634 }
6635 
6636 /* error handling callback helper:
6637    build arguments, call the callback and check the arguments,
6638    put the result into newpos and return the replacement string, which
6639    has to be freed by the caller */
6640 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6641 unicode_encode_call_errorhandler(const char *errors,
6642                                  PyObject **errorHandler,
6643                                  const char *encoding, const char *reason,
6644                                  PyObject *unicode, PyObject **exceptionObject,
6645                                  Py_ssize_t startpos, Py_ssize_t endpos,
6646                                  Py_ssize_t *newpos)
6647 {
6648     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6649     Py_ssize_t len;
6650     PyObject *restuple;
6651     PyObject *resunicode;
6652 
6653     if (*errorHandler == NULL) {
6654         *errorHandler = PyCodec_LookupError(errors);
6655         if (*errorHandler == NULL)
6656             return NULL;
6657     }
6658 
6659     if (PyUnicode_READY(unicode) == -1)
6660         return NULL;
6661     len = PyUnicode_GET_LENGTH(unicode);
6662 
6663     make_encode_exception(exceptionObject,
6664                           encoding, unicode, startpos, endpos, reason);
6665     if (*exceptionObject == NULL)
6666         return NULL;
6667 
6668     restuple = PyObject_CallFunctionObjArgs(
6669         *errorHandler, *exceptionObject, NULL);
6670     if (restuple == NULL)
6671         return NULL;
6672     if (!PyTuple_Check(restuple)) {
6673         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6674         Py_DECREF(restuple);
6675         return NULL;
6676     }
6677     if (!PyArg_ParseTuple(restuple, argparse,
6678                           &resunicode, newpos)) {
6679         Py_DECREF(restuple);
6680         return NULL;
6681     }
6682     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6683         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6684         Py_DECREF(restuple);
6685         return NULL;
6686     }
6687     if (*newpos<0)
6688         *newpos = len + *newpos;
6689     if (*newpos<0 || *newpos>len) {
6690         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6691         Py_DECREF(restuple);
6692         return NULL;
6693     }
6694     Py_INCREF(resunicode);
6695     Py_DECREF(restuple);
6696     return resunicode;
6697 }
6698 
6699 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6700 unicode_encode_ucs1(PyObject *unicode,
6701                     const char *errors,
6702                     const Py_UCS4 limit)
6703 {
6704     /* input state */
6705     Py_ssize_t pos=0, size;
6706     int kind;
6707     void *data;
6708     /* pointer into the output */
6709     char *str;
6710     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6711     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6712     PyObject *error_handler_obj = NULL;
6713     PyObject *exc = NULL;
6714     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6715     PyObject *rep = NULL;
6716     /* output object */
6717     _PyBytesWriter writer;
6718 
6719     if (PyUnicode_READY(unicode) == -1)
6720         return NULL;
6721     size = PyUnicode_GET_LENGTH(unicode);
6722     kind = PyUnicode_KIND(unicode);
6723     data = PyUnicode_DATA(unicode);
6724     /* allocate enough for a simple encoding without
6725        replacements, if we need more, we'll resize */
6726     if (size == 0)
6727         return PyBytes_FromStringAndSize(NULL, 0);
6728 
6729     _PyBytesWriter_Init(&writer);
6730     str = _PyBytesWriter_Alloc(&writer, size);
6731     if (str == NULL)
6732         return NULL;
6733 
6734     while (pos < size) {
6735         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6736 
6737         /* can we encode this? */
6738         if (ch < limit) {
6739             /* no overflow check, because we know that the space is enough */
6740             *str++ = (char)ch;
6741             ++pos;
6742         }
6743         else {
6744             Py_ssize_t newpos, i;
6745             /* startpos for collecting unencodable chars */
6746             Py_ssize_t collstart = pos;
6747             Py_ssize_t collend = collstart + 1;
6748             /* find all unecodable characters */
6749 
6750             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6751                 ++collend;
6752 
6753             /* Only overallocate the buffer if it's not the last write */
6754             writer.overallocate = (collend < size);
6755 
6756             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6757             if (error_handler == _Py_ERROR_UNKNOWN)
6758                 error_handler = get_error_handler(errors);
6759 
6760             switch (error_handler) {
6761             case _Py_ERROR_STRICT:
6762                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6763                 goto onError;
6764 
6765             case _Py_ERROR_REPLACE:
6766                 memset(str, '?', collend - collstart);
6767                 str += (collend - collstart);
6768                 /* fall through ignore error handler */
6769             case _Py_ERROR_IGNORE:
6770                 pos = collend;
6771                 break;
6772 
6773             case _Py_ERROR_BACKSLASHREPLACE:
6774                 /* subtract preallocated bytes */
6775                 writer.min_size -= (collend - collstart);
6776                 str = backslashreplace(&writer, str,
6777                                        unicode, collstart, collend);
6778                 if (str == NULL)
6779                     goto onError;
6780                 pos = collend;
6781                 break;
6782 
6783             case _Py_ERROR_XMLCHARREFREPLACE:
6784                 /* subtract preallocated bytes */
6785                 writer.min_size -= (collend - collstart);
6786                 str = xmlcharrefreplace(&writer, str,
6787                                         unicode, collstart, collend);
6788                 if (str == NULL)
6789                     goto onError;
6790                 pos = collend;
6791                 break;
6792 
6793             case _Py_ERROR_SURROGATEESCAPE:
6794                 for (i = collstart; i < collend; ++i) {
6795                     ch = PyUnicode_READ(kind, data, i);
6796                     if (ch < 0xdc80 || 0xdcff < ch) {
6797                         /* Not a UTF-8b surrogate */
6798                         break;
6799                     }
6800                     *str++ = (char)(ch - 0xdc00);
6801                     ++pos;
6802                 }
6803                 if (i >= collend)
6804                     break;
6805                 collstart = pos;
6806                 assert(collstart != collend);
6807                 /* fallback to general error handling */
6808 
6809             default:
6810                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6811                                                        encoding, reason, unicode, &exc,
6812                                                        collstart, collend, &newpos);
6813                 if (rep == NULL)
6814                     goto onError;
6815 
6816                 /* subtract preallocated bytes */
6817                 writer.min_size -= 1;
6818 
6819                 if (PyBytes_Check(rep)) {
6820                     /* Directly copy bytes result to output. */
6821                     str = _PyBytesWriter_WriteBytes(&writer, str,
6822                                                     PyBytes_AS_STRING(rep),
6823                                                     PyBytes_GET_SIZE(rep));
6824                     if (str == NULL)
6825                         goto onError;
6826                 }
6827                 else {
6828                     assert(PyUnicode_Check(rep));
6829 
6830                     if (PyUnicode_READY(rep) < 0)
6831                         goto onError;
6832 
6833                     if (PyUnicode_IS_ASCII(rep)) {
6834                         /* Fast path: all characters are smaller than limit */
6835                         assert(limit >= 128);
6836                         assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6837                         str = _PyBytesWriter_WriteBytes(&writer, str,
6838                                                         PyUnicode_DATA(rep),
6839                                                         PyUnicode_GET_LENGTH(rep));
6840                     }
6841                     else {
6842                         Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6843 
6844                         str = _PyBytesWriter_Prepare(&writer, str, repsize);
6845                         if (str == NULL)
6846                             goto onError;
6847 
6848                         /* check if there is anything unencodable in the
6849                            replacement and copy it to the output */
6850                         for (i = 0; repsize-->0; ++i, ++str) {
6851                             ch = PyUnicode_READ_CHAR(rep, i);
6852                             if (ch >= limit) {
6853                                 raise_encode_exception(&exc, encoding, unicode,
6854                                                        pos, pos+1, reason);
6855                                 goto onError;
6856                             }
6857                             *str = (char)ch;
6858                         }
6859                     }
6860                 }
6861                 pos = newpos;
6862                 Py_CLEAR(rep);
6863             }
6864 
6865             /* If overallocation was disabled, ensure that it was the last
6866                write. Otherwise, we missed an optimization */
6867             assert(writer.overallocate || pos == size);
6868         }
6869     }
6870 
6871     Py_XDECREF(error_handler_obj);
6872     Py_XDECREF(exc);
6873     return _PyBytesWriter_Finish(&writer, str);
6874 
6875   onError:
6876     Py_XDECREF(rep);
6877     _PyBytesWriter_Dealloc(&writer);
6878     Py_XDECREF(error_handler_obj);
6879     Py_XDECREF(exc);
6880     return NULL;
6881 }
6882 
6883 /* Deprecated */
6884 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6885 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6886                        Py_ssize_t size,
6887                        const char *errors)
6888 {
6889     PyObject *result;
6890     PyObject *unicode = PyUnicode_FromUnicode(p, size);
6891     if (unicode == NULL)
6892         return NULL;
6893     result = unicode_encode_ucs1(unicode, errors, 256);
6894     Py_DECREF(unicode);
6895     return result;
6896 }
6897 
6898 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6899 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6900 {
6901     if (!PyUnicode_Check(unicode)) {
6902         PyErr_BadArgument();
6903         return NULL;
6904     }
6905     if (PyUnicode_READY(unicode) == -1)
6906         return NULL;
6907     /* Fast path: if it is a one-byte string, construct
6908        bytes object directly. */
6909     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911                                          PyUnicode_GET_LENGTH(unicode));
6912     /* Non-Latin-1 characters present. Defer to above function to
6913        raise the exception. */
6914     return unicode_encode_ucs1(unicode, errors, 256);
6915 }
6916 
6917 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6918 PyUnicode_AsLatin1String(PyObject *unicode)
6919 {
6920     return _PyUnicode_AsLatin1String(unicode, NULL);
6921 }
6922 
6923 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6924 
6925 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6926 PyUnicode_DecodeASCII(const char *s,
6927                       Py_ssize_t size,
6928                       const char *errors)
6929 {
6930     const char *starts = s;
6931     _PyUnicodeWriter writer;
6932     int kind;
6933     void *data;
6934     Py_ssize_t startinpos;
6935     Py_ssize_t endinpos;
6936     Py_ssize_t outpos;
6937     const char *e;
6938     PyObject *error_handler_obj = NULL;
6939     PyObject *exc = NULL;
6940     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6941 
6942     if (size == 0)
6943         _Py_RETURN_UNICODE_EMPTY();
6944 
6945     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6946     if (size == 1 && (unsigned char)s[0] < 128)
6947         return get_latin1_char((unsigned char)s[0]);
6948 
6949     _PyUnicodeWriter_Init(&writer);
6950     writer.min_length = size;
6951     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6952         return NULL;
6953 
6954     e = s + size;
6955     data = writer.data;
6956     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6957     writer.pos = outpos;
6958     if (writer.pos == size)
6959         return _PyUnicodeWriter_Finish(&writer);
6960 
6961     s += writer.pos;
6962     kind = writer.kind;
6963     while (s < e) {
6964         unsigned char c = (unsigned char)*s;
6965         if (c < 128) {
6966             PyUnicode_WRITE(kind, data, writer.pos, c);
6967             writer.pos++;
6968             ++s;
6969             continue;
6970         }
6971 
6972         /* byte outsize range 0x00..0x7f: call the error handler */
6973 
6974         if (error_handler == _Py_ERROR_UNKNOWN)
6975             error_handler = get_error_handler(errors);
6976 
6977         switch (error_handler)
6978         {
6979         case _Py_ERROR_REPLACE:
6980         case _Py_ERROR_SURROGATEESCAPE:
6981             /* Fast-path: the error handler only writes one character,
6982                but we may switch to UCS2 at the first write */
6983             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984                 goto onError;
6985             kind = writer.kind;
6986             data = writer.data;
6987 
6988             if (error_handler == _Py_ERROR_REPLACE)
6989                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990             else
6991                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992             writer.pos++;
6993             ++s;
6994             break;
6995 
6996         case _Py_ERROR_IGNORE:
6997             ++s;
6998             break;
6999 
7000         default:
7001             startinpos = s-starts;
7002             endinpos = startinpos + 1;
7003             if (unicode_decode_call_errorhandler_writer(
7004                     errors, &error_handler_obj,
7005                     "ascii", "ordinal not in range(128)",
7006                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7007                     &writer))
7008                 goto onError;
7009             kind = writer.kind;
7010             data = writer.data;
7011         }
7012     }
7013     Py_XDECREF(error_handler_obj);
7014     Py_XDECREF(exc);
7015     return _PyUnicodeWriter_Finish(&writer);
7016 
7017   onError:
7018     _PyUnicodeWriter_Dealloc(&writer);
7019     Py_XDECREF(error_handler_obj);
7020     Py_XDECREF(exc);
7021     return NULL;
7022 }
7023 
7024 /* Deprecated */
7025 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7026 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027                       Py_ssize_t size,
7028                       const char *errors)
7029 {
7030     PyObject *result;
7031     PyObject *unicode = PyUnicode_FromUnicode(p, size);
7032     if (unicode == NULL)
7033         return NULL;
7034     result = unicode_encode_ucs1(unicode, errors, 128);
7035     Py_DECREF(unicode);
7036     return result;
7037 }
7038 
7039 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7040 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7041 {
7042     if (!PyUnicode_Check(unicode)) {
7043         PyErr_BadArgument();
7044         return NULL;
7045     }
7046     if (PyUnicode_READY(unicode) == -1)
7047         return NULL;
7048     /* Fast path: if it is an ASCII-only string, construct bytes object
7049        directly. Else defer to above function to raise the exception. */
7050     if (PyUnicode_IS_ASCII(unicode))
7051         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052                                          PyUnicode_GET_LENGTH(unicode));
7053     return unicode_encode_ucs1(unicode, errors, 128);
7054 }
7055 
7056 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7057 PyUnicode_AsASCIIString(PyObject *unicode)
7058 {
7059     return _PyUnicode_AsASCIIString(unicode, NULL);
7060 }
7061 
7062 #ifdef MS_WINDOWS
7063 
7064 /* --- MBCS codecs for Windows -------------------------------------------- */
7065 
7066 #if SIZEOF_INT < SIZEOF_SIZE_T
7067 #define NEED_RETRY
7068 #endif
7069 
7070 #ifndef WC_ERR_INVALID_CHARS
7071 #  define WC_ERR_INVALID_CHARS 0x0080
7072 #endif
7073 
7074 static const char*
code_page_name(UINT code_page,PyObject ** obj)7075 code_page_name(UINT code_page, PyObject **obj)
7076 {
7077     *obj = NULL;
7078     if (code_page == CP_ACP)
7079         return "mbcs";
7080     if (code_page == CP_UTF7)
7081         return "CP_UTF7";
7082     if (code_page == CP_UTF8)
7083         return "CP_UTF8";
7084 
7085     *obj = PyBytes_FromFormat("cp%u", code_page);
7086     if (*obj == NULL)
7087         return NULL;
7088     return PyBytes_AS_STRING(*obj);
7089 }
7090 
7091 static DWORD
decode_code_page_flags(UINT code_page)7092 decode_code_page_flags(UINT code_page)
7093 {
7094     if (code_page == CP_UTF7) {
7095         /* The CP_UTF7 decoder only supports flags=0 */
7096         return 0;
7097     }
7098     else
7099         return MB_ERR_INVALID_CHARS;
7100 }
7101 
7102 /*
7103  * Decode a byte string from a Windows code page into unicode object in strict
7104  * mode.
7105  *
7106  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107  * OSError and returns -1 on other error.
7108  */
7109 static int
decode_code_page_strict(UINT code_page,PyObject ** v,const char * in,int insize)7110 decode_code_page_strict(UINT code_page,
7111                         PyObject **v,
7112                         const char *in,
7113                         int insize)
7114 {
7115     const DWORD flags = decode_code_page_flags(code_page);
7116     wchar_t *out;
7117     DWORD outsize;
7118 
7119     /* First get the size of the result */
7120     assert(insize > 0);
7121     outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122     if (outsize <= 0)
7123         goto error;
7124 
7125     if (*v == NULL) {
7126         /* Create unicode object */
7127         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7128         *v = (PyObject*)_PyUnicode_New(outsize);
7129         if (*v == NULL)
7130             return -1;
7131         out = PyUnicode_AS_UNICODE(*v);
7132     }
7133     else {
7134         /* Extend unicode object */
7135         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7136         if (unicode_resize(v, n + outsize) < 0)
7137             return -1;
7138         out = PyUnicode_AS_UNICODE(*v) + n;
7139     }
7140 
7141     /* Do the conversion */
7142     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143     if (outsize <= 0)
7144         goto error;
7145     return insize;
7146 
7147 error:
7148     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149         return -2;
7150     PyErr_SetFromWindowsErr(0);
7151     return -1;
7152 }
7153 
7154 /*
7155  * Decode a byte string from a code page into unicode object with an error
7156  * handler.
7157  *
7158  * Returns consumed size if succeed, or raise an OSError or
7159  * UnicodeDecodeError exception and returns -1 on error.
7160  */
7161 static int
decode_code_page_errors(UINT code_page,PyObject ** v,const char * in,const int size,const char * errors,int final)7162 decode_code_page_errors(UINT code_page,
7163                         PyObject **v,
7164                         const char *in, const int size,
7165                         const char *errors, int final)
7166 {
7167     const char *startin = in;
7168     const char *endin = in + size;
7169     const DWORD flags = decode_code_page_flags(code_page);
7170     /* Ideally, we should get reason from FormatMessage. This is the Windows
7171        2000 English version of the message. */
7172     const char *reason = "No mapping for the Unicode character exists "
7173                          "in the target code page.";
7174     /* each step cannot decode more than 1 character, but a character can be
7175        represented as a surrogate pair */
7176     wchar_t buffer[2], *startout, *out;
7177     int insize;
7178     Py_ssize_t outsize;
7179     PyObject *errorHandler = NULL;
7180     PyObject *exc = NULL;
7181     PyObject *encoding_obj = NULL;
7182     const char *encoding;
7183     DWORD err;
7184     int ret = -1;
7185 
7186     assert(size > 0);
7187 
7188     encoding = code_page_name(code_page, &encoding_obj);
7189     if (encoding == NULL)
7190         return -1;
7191 
7192     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7193         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194            UnicodeDecodeError. */
7195         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196         if (exc != NULL) {
7197             PyCodec_StrictErrors(exc);
7198             Py_CLEAR(exc);
7199         }
7200         goto error;
7201     }
7202 
7203     if (*v == NULL) {
7204         /* Create unicode object */
7205         if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206             PyErr_NoMemory();
7207             goto error;
7208         }
7209         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7210         *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7211         if (*v == NULL)
7212             goto error;
7213         startout = PyUnicode_AS_UNICODE(*v);
7214     }
7215     else {
7216         /* Extend unicode object */
7217         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218         if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219             PyErr_NoMemory();
7220             goto error;
7221         }
7222         if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7223             goto error;
7224         startout = PyUnicode_AS_UNICODE(*v) + n;
7225     }
7226 
7227     /* Decode the byte string character per character */
7228     out = startout;
7229     while (in < endin)
7230     {
7231         /* Decode a character */
7232         insize = 1;
7233         do
7234         {
7235             outsize = MultiByteToWideChar(code_page, flags,
7236                                           in, insize,
7237                                           buffer, Py_ARRAY_LENGTH(buffer));
7238             if (outsize > 0)
7239                 break;
7240             err = GetLastError();
7241             if (err != ERROR_NO_UNICODE_TRANSLATION
7242                 && err != ERROR_INSUFFICIENT_BUFFER)
7243             {
7244                 PyErr_SetFromWindowsErr(0);
7245                 goto error;
7246             }
7247             insize++;
7248         }
7249         /* 4=maximum length of a UTF-8 sequence */
7250         while (insize <= 4 && (in + insize) <= endin);
7251 
7252         if (outsize <= 0) {
7253             Py_ssize_t startinpos, endinpos, outpos;
7254 
7255             /* last character in partial decode? */
7256             if (in + insize >= endin && !final)
7257                 break;
7258 
7259             startinpos = in - startin;
7260             endinpos = startinpos + 1;
7261             outpos = out - PyUnicode_AS_UNICODE(*v);
7262             if (unicode_decode_call_errorhandler_wchar(
7263                     errors, &errorHandler,
7264                     encoding, reason,
7265                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
7266                     v, &outpos))
7267             {
7268                 goto error;
7269             }
7270             out = PyUnicode_AS_UNICODE(*v) + outpos;
7271         }
7272         else {
7273             in += insize;
7274             memcpy(out, buffer, outsize * sizeof(wchar_t));
7275             out += outsize;
7276         }
7277     }
7278 
7279     /* write a NUL character at the end */
7280     *out = 0;
7281 
7282     /* Extend unicode object */
7283     outsize = out - startout;
7284     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7285     if (unicode_resize(v, outsize) < 0)
7286         goto error;
7287     /* (in - startin) <= size and size is an int */
7288     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7289 
7290 error:
7291     Py_XDECREF(encoding_obj);
7292     Py_XDECREF(errorHandler);
7293     Py_XDECREF(exc);
7294     return ret;
7295 }
7296 
7297 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7298 decode_code_page_stateful(int code_page,
7299                           const char *s, Py_ssize_t size,
7300                           const char *errors, Py_ssize_t *consumed)
7301 {
7302     PyObject *v = NULL;
7303     int chunk_size, final, converted, done;
7304 
7305     if (code_page < 0) {
7306         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307         return NULL;
7308     }
7309 
7310     if (consumed)
7311         *consumed = 0;
7312 
7313     do
7314     {
7315 #ifdef NEED_RETRY
7316         if (size > INT_MAX) {
7317             chunk_size = INT_MAX;
7318             final = 0;
7319             done = 0;
7320         }
7321         else
7322 #endif
7323         {
7324             chunk_size = (int)size;
7325             final = (consumed == NULL);
7326             done = 1;
7327         }
7328 
7329         if (chunk_size == 0 && done) {
7330             if (v != NULL)
7331                 break;
7332             _Py_RETURN_UNICODE_EMPTY();
7333         }
7334 
7335         converted = decode_code_page_strict(code_page, &v,
7336                                             s, chunk_size);
7337         if (converted == -2)
7338             converted = decode_code_page_errors(code_page, &v,
7339                                                 s, chunk_size,
7340                                                 errors, final);
7341         assert(converted != 0 || done);
7342 
7343         if (converted < 0) {
7344             Py_XDECREF(v);
7345             return NULL;
7346         }
7347 
7348         if (consumed)
7349             *consumed += converted;
7350 
7351         s += converted;
7352         size -= converted;
7353     } while (!done);
7354 
7355     return unicode_result(v);
7356 }
7357 
7358 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7359 PyUnicode_DecodeCodePageStateful(int code_page,
7360                                  const char *s,
7361                                  Py_ssize_t size,
7362                                  const char *errors,
7363                                  Py_ssize_t *consumed)
7364 {
7365     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7366 }
7367 
7368 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7369 PyUnicode_DecodeMBCSStateful(const char *s,
7370                              Py_ssize_t size,
7371                              const char *errors,
7372                              Py_ssize_t *consumed)
7373 {
7374     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7375 }
7376 
7377 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7378 PyUnicode_DecodeMBCS(const char *s,
7379                      Py_ssize_t size,
7380                      const char *errors)
7381 {
7382     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7383 }
7384 
7385 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7386 encode_code_page_flags(UINT code_page, const char *errors)
7387 {
7388     if (code_page == CP_UTF8) {
7389         return WC_ERR_INVALID_CHARS;
7390     }
7391     else if (code_page == CP_UTF7) {
7392         /* CP_UTF7 only supports flags=0 */
7393         return 0;
7394     }
7395     else {
7396         if (errors != NULL && strcmp(errors, "replace") == 0)
7397             return 0;
7398         else
7399             return WC_NO_BEST_FIT_CHARS;
7400     }
7401 }
7402 
7403 /*
7404  * Encode a Unicode string to a Windows code page into a byte string in strict
7405  * mode.
7406  *
7407  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7408  * an OSError and returns -1 on other error.
7409  */
7410 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7411 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7412                         PyObject *unicode, Py_ssize_t offset, int len,
7413                         const char* errors)
7414 {
7415     BOOL usedDefaultChar = FALSE;
7416     BOOL *pusedDefaultChar = &usedDefaultChar;
7417     int outsize;
7418     wchar_t *p;
7419     Py_ssize_t size;
7420     const DWORD flags = encode_code_page_flags(code_page, NULL);
7421     char *out;
7422     /* Create a substring so that we can get the UTF-16 representation
7423        of just the slice under consideration. */
7424     PyObject *substring;
7425 
7426     assert(len > 0);
7427 
7428     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7429         pusedDefaultChar = &usedDefaultChar;
7430     else
7431         pusedDefaultChar = NULL;
7432 
7433     substring = PyUnicode_Substring(unicode, offset, offset+len);
7434     if (substring == NULL)
7435         return -1;
7436     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7437     if (p == NULL) {
7438         Py_DECREF(substring);
7439         return -1;
7440     }
7441     assert(size <= INT_MAX);
7442 
7443     /* First get the size of the result */
7444     outsize = WideCharToMultiByte(code_page, flags,
7445                                   p, (int)size,
7446                                   NULL, 0,
7447                                   NULL, pusedDefaultChar);
7448     if (outsize <= 0)
7449         goto error;
7450     /* If we used a default char, then we failed! */
7451     if (pusedDefaultChar && *pusedDefaultChar) {
7452         Py_DECREF(substring);
7453         return -2;
7454     }
7455 
7456     if (*outbytes == NULL) {
7457         /* Create string object */
7458         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7459         if (*outbytes == NULL) {
7460             Py_DECREF(substring);
7461             return -1;
7462         }
7463         out = PyBytes_AS_STRING(*outbytes);
7464     }
7465     else {
7466         /* Extend string object */
7467         const Py_ssize_t n = PyBytes_Size(*outbytes);
7468         if (outsize > PY_SSIZE_T_MAX - n) {
7469             PyErr_NoMemory();
7470             Py_DECREF(substring);
7471             return -1;
7472         }
7473         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7474             Py_DECREF(substring);
7475             return -1;
7476         }
7477         out = PyBytes_AS_STRING(*outbytes) + n;
7478     }
7479 
7480     /* Do the conversion */
7481     outsize = WideCharToMultiByte(code_page, flags,
7482                                   p, (int)size,
7483                                   out, outsize,
7484                                   NULL, pusedDefaultChar);
7485     Py_CLEAR(substring);
7486     if (outsize <= 0)
7487         goto error;
7488     if (pusedDefaultChar && *pusedDefaultChar)
7489         return -2;
7490     return 0;
7491 
7492 error:
7493     Py_XDECREF(substring);
7494     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7495         return -2;
7496     PyErr_SetFromWindowsErr(0);
7497     return -1;
7498 }
7499 
7500 /*
7501  * Encode a Unicode string to a Windows code page into a byte string using an
7502  * error handler.
7503  *
7504  * Returns consumed characters if succeed, or raise an OSError and returns
7505  * -1 on other error.
7506  */
7507 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7508 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7509                         PyObject *unicode, Py_ssize_t unicode_offset,
7510                         Py_ssize_t insize, const char* errors)
7511 {
7512     const DWORD flags = encode_code_page_flags(code_page, errors);
7513     Py_ssize_t pos = unicode_offset;
7514     Py_ssize_t endin = unicode_offset + insize;
7515     /* Ideally, we should get reason from FormatMessage. This is the Windows
7516        2000 English version of the message. */
7517     const char *reason = "invalid character";
7518     /* 4=maximum length of a UTF-8 sequence */
7519     char buffer[4];
7520     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7521     Py_ssize_t outsize;
7522     char *out;
7523     PyObject *errorHandler = NULL;
7524     PyObject *exc = NULL;
7525     PyObject *encoding_obj = NULL;
7526     const char *encoding;
7527     Py_ssize_t newpos, newoutsize;
7528     PyObject *rep;
7529     int ret = -1;
7530 
7531     assert(insize > 0);
7532 
7533     encoding = code_page_name(code_page, &encoding_obj);
7534     if (encoding == NULL)
7535         return -1;
7536 
7537     if (errors == NULL || strcmp(errors, "strict") == 0) {
7538         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7539            then we raise a UnicodeEncodeError. */
7540         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7541         if (exc != NULL) {
7542             PyCodec_StrictErrors(exc);
7543             Py_DECREF(exc);
7544         }
7545         Py_XDECREF(encoding_obj);
7546         return -1;
7547     }
7548 
7549     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7550         pusedDefaultChar = &usedDefaultChar;
7551     else
7552         pusedDefaultChar = NULL;
7553 
7554     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7555         PyErr_NoMemory();
7556         goto error;
7557     }
7558     outsize = insize * Py_ARRAY_LENGTH(buffer);
7559 
7560     if (*outbytes == NULL) {
7561         /* Create string object */
7562         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7563         if (*outbytes == NULL)
7564             goto error;
7565         out = PyBytes_AS_STRING(*outbytes);
7566     }
7567     else {
7568         /* Extend string object */
7569         Py_ssize_t n = PyBytes_Size(*outbytes);
7570         if (n > PY_SSIZE_T_MAX - outsize) {
7571             PyErr_NoMemory();
7572             goto error;
7573         }
7574         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7575             goto error;
7576         out = PyBytes_AS_STRING(*outbytes) + n;
7577     }
7578 
7579     /* Encode the string character per character */
7580     while (pos < endin)
7581     {
7582         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7583         wchar_t chars[2];
7584         int charsize;
7585         if (ch < 0x10000) {
7586             chars[0] = (wchar_t)ch;
7587             charsize = 1;
7588         }
7589         else {
7590             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7591             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7592             charsize = 2;
7593         }
7594 
7595         outsize = WideCharToMultiByte(code_page, flags,
7596                                       chars, charsize,
7597                                       buffer, Py_ARRAY_LENGTH(buffer),
7598                                       NULL, pusedDefaultChar);
7599         if (outsize > 0) {
7600             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7601             {
7602                 pos++;
7603                 memcpy(out, buffer, outsize);
7604                 out += outsize;
7605                 continue;
7606             }
7607         }
7608         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7609             PyErr_SetFromWindowsErr(0);
7610             goto error;
7611         }
7612 
7613         rep = unicode_encode_call_errorhandler(
7614                   errors, &errorHandler, encoding, reason,
7615                   unicode, &exc,
7616                   pos, pos + 1, &newpos);
7617         if (rep == NULL)
7618             goto error;
7619         pos = newpos;
7620 
7621         if (PyBytes_Check(rep)) {
7622             outsize = PyBytes_GET_SIZE(rep);
7623             if (outsize != 1) {
7624                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7625                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7626                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7627                     Py_DECREF(rep);
7628                     goto error;
7629                 }
7630                 out = PyBytes_AS_STRING(*outbytes) + offset;
7631             }
7632             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7633             out += outsize;
7634         }
7635         else {
7636             Py_ssize_t i;
7637             enum PyUnicode_Kind kind;
7638             void *data;
7639 
7640             if (PyUnicode_READY(rep) == -1) {
7641                 Py_DECREF(rep);
7642                 goto error;
7643             }
7644 
7645             outsize = PyUnicode_GET_LENGTH(rep);
7646             if (outsize != 1) {
7647                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7648                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7649                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7650                     Py_DECREF(rep);
7651                     goto error;
7652                 }
7653                 out = PyBytes_AS_STRING(*outbytes) + offset;
7654             }
7655             kind = PyUnicode_KIND(rep);
7656             data = PyUnicode_DATA(rep);
7657             for (i=0; i < outsize; i++) {
7658                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7659                 if (ch > 127) {
7660                     raise_encode_exception(&exc,
7661                         encoding, unicode,
7662                         pos, pos + 1,
7663                         "unable to encode error handler result to ASCII");
7664                     Py_DECREF(rep);
7665                     goto error;
7666                 }
7667                 *out = (unsigned char)ch;
7668                 out++;
7669             }
7670         }
7671         Py_DECREF(rep);
7672     }
7673     /* write a NUL byte */
7674     *out = 0;
7675     outsize = out - PyBytes_AS_STRING(*outbytes);
7676     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7677     if (_PyBytes_Resize(outbytes, outsize) < 0)
7678         goto error;
7679     ret = 0;
7680 
7681 error:
7682     Py_XDECREF(encoding_obj);
7683     Py_XDECREF(errorHandler);
7684     Py_XDECREF(exc);
7685     return ret;
7686 }
7687 
7688 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7689 encode_code_page(int code_page,
7690                  PyObject *unicode,
7691                  const char *errors)
7692 {
7693     Py_ssize_t len;
7694     PyObject *outbytes = NULL;
7695     Py_ssize_t offset;
7696     int chunk_len, ret, done;
7697 
7698     if (!PyUnicode_Check(unicode)) {
7699         PyErr_BadArgument();
7700         return NULL;
7701     }
7702 
7703     if (PyUnicode_READY(unicode) == -1)
7704         return NULL;
7705     len = PyUnicode_GET_LENGTH(unicode);
7706 
7707     if (code_page < 0) {
7708         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7709         return NULL;
7710     }
7711 
7712     if (len == 0)
7713         return PyBytes_FromStringAndSize(NULL, 0);
7714 
7715     offset = 0;
7716     do
7717     {
7718 #ifdef NEED_RETRY
7719         /* UTF-16 encoding may double the size, so use only INT_MAX/2
7720            chunks. */
7721         if (len > INT_MAX/2) {
7722             chunk_len = INT_MAX/2;
7723             done = 0;
7724         }
7725         else
7726 #endif
7727         {
7728             chunk_len = (int)len;
7729             done = 1;
7730         }
7731 
7732         ret = encode_code_page_strict(code_page, &outbytes,
7733                                       unicode, offset, chunk_len,
7734                                       errors);
7735         if (ret == -2)
7736             ret = encode_code_page_errors(code_page, &outbytes,
7737                                           unicode, offset,
7738                                           chunk_len, errors);
7739         if (ret < 0) {
7740             Py_XDECREF(outbytes);
7741             return NULL;
7742         }
7743 
7744         offset += chunk_len;
7745         len -= chunk_len;
7746     } while (!done);
7747 
7748     return outbytes;
7749 }
7750 
7751 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7752 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7753                      Py_ssize_t size,
7754                      const char *errors)
7755 {
7756     PyObject *unicode, *res;
7757     unicode = PyUnicode_FromUnicode(p, size);
7758     if (unicode == NULL)
7759         return NULL;
7760     res = encode_code_page(CP_ACP, unicode, errors);
7761     Py_DECREF(unicode);
7762     return res;
7763 }
7764 
7765 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7766 PyUnicode_EncodeCodePage(int code_page,
7767                          PyObject *unicode,
7768                          const char *errors)
7769 {
7770     return encode_code_page(code_page, unicode, errors);
7771 }
7772 
7773 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7774 PyUnicode_AsMBCSString(PyObject *unicode)
7775 {
7776     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7777 }
7778 
7779 #undef NEED_RETRY
7780 
7781 #endif /* MS_WINDOWS */
7782 
7783 /* --- Character Mapping Codec -------------------------------------------- */
7784 
7785 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7786 charmap_decode_string(const char *s,
7787                       Py_ssize_t size,
7788                       PyObject *mapping,
7789                       const char *errors,
7790                       _PyUnicodeWriter *writer)
7791 {
7792     const char *starts = s;
7793     const char *e;
7794     Py_ssize_t startinpos, endinpos;
7795     PyObject *errorHandler = NULL, *exc = NULL;
7796     Py_ssize_t maplen;
7797     enum PyUnicode_Kind mapkind;
7798     void *mapdata;
7799     Py_UCS4 x;
7800     unsigned char ch;
7801 
7802     if (PyUnicode_READY(mapping) == -1)
7803         return -1;
7804 
7805     maplen = PyUnicode_GET_LENGTH(mapping);
7806     mapdata = PyUnicode_DATA(mapping);
7807     mapkind = PyUnicode_KIND(mapping);
7808 
7809     e = s + size;
7810 
7811     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7812         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7813          * is disabled in encoding aliases, latin1 is preferred because
7814          * its implementation is faster. */
7815         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7816         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7817         Py_UCS4 maxchar = writer->maxchar;
7818 
7819         assert (writer->kind == PyUnicode_1BYTE_KIND);
7820         while (s < e) {
7821             ch = *s;
7822             x = mapdata_ucs1[ch];
7823             if (x > maxchar) {
7824                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7825                     goto onError;
7826                 maxchar = writer->maxchar;
7827                 outdata = (Py_UCS1 *)writer->data;
7828             }
7829             outdata[writer->pos] = x;
7830             writer->pos++;
7831             ++s;
7832         }
7833         return 0;
7834     }
7835 
7836     while (s < e) {
7837         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7838             enum PyUnicode_Kind outkind = writer->kind;
7839             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7840             if (outkind == PyUnicode_1BYTE_KIND) {
7841                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7842                 Py_UCS4 maxchar = writer->maxchar;
7843                 while (s < e) {
7844                     ch = *s;
7845                     x = mapdata_ucs2[ch];
7846                     if (x > maxchar)
7847                         goto Error;
7848                     outdata[writer->pos] = x;
7849                     writer->pos++;
7850                     ++s;
7851                 }
7852                 break;
7853             }
7854             else if (outkind == PyUnicode_2BYTE_KIND) {
7855                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7856                 while (s < e) {
7857                     ch = *s;
7858                     x = mapdata_ucs2[ch];
7859                     if (x == 0xFFFE)
7860                         goto Error;
7861                     outdata[writer->pos] = x;
7862                     writer->pos++;
7863                     ++s;
7864                 }
7865                 break;
7866             }
7867         }
7868         ch = *s;
7869 
7870         if (ch < maplen)
7871             x = PyUnicode_READ(mapkind, mapdata, ch);
7872         else
7873             x = 0xfffe; /* invalid value */
7874 Error:
7875         if (x == 0xfffe)
7876         {
7877             /* undefined mapping */
7878             startinpos = s-starts;
7879             endinpos = startinpos+1;
7880             if (unicode_decode_call_errorhandler_writer(
7881                     errors, &errorHandler,
7882                     "charmap", "character maps to <undefined>",
7883                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7884                     writer)) {
7885                 goto onError;
7886             }
7887             continue;
7888         }
7889 
7890         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7891             goto onError;
7892         ++s;
7893     }
7894     Py_XDECREF(errorHandler);
7895     Py_XDECREF(exc);
7896     return 0;
7897 
7898 onError:
7899     Py_XDECREF(errorHandler);
7900     Py_XDECREF(exc);
7901     return -1;
7902 }
7903 
7904 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7905 charmap_decode_mapping(const char *s,
7906                        Py_ssize_t size,
7907                        PyObject *mapping,
7908                        const char *errors,
7909                        _PyUnicodeWriter *writer)
7910 {
7911     const char *starts = s;
7912     const char *e;
7913     Py_ssize_t startinpos, endinpos;
7914     PyObject *errorHandler = NULL, *exc = NULL;
7915     unsigned char ch;
7916     PyObject *key, *item = NULL;
7917 
7918     e = s + size;
7919 
7920     while (s < e) {
7921         ch = *s;
7922 
7923         /* Get mapping (char ordinal -> integer, Unicode char or None) */
7924         key = PyLong_FromLong((long)ch);
7925         if (key == NULL)
7926             goto onError;
7927 
7928         item = PyObject_GetItem(mapping, key);
7929         Py_DECREF(key);
7930         if (item == NULL) {
7931             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7932                 /* No mapping found means: mapping is undefined. */
7933                 PyErr_Clear();
7934                 goto Undefined;
7935             } else
7936                 goto onError;
7937         }
7938 
7939         /* Apply mapping */
7940         if (item == Py_None)
7941             goto Undefined;
7942         if (PyLong_Check(item)) {
7943             long value = PyLong_AS_LONG(item);
7944             if (value == 0xFFFE)
7945                 goto Undefined;
7946             if (value < 0 || value > MAX_UNICODE) {
7947                 PyErr_Format(PyExc_TypeError,
7948                              "character mapping must be in range(0x%lx)",
7949                              (unsigned long)MAX_UNICODE + 1);
7950                 goto onError;
7951             }
7952 
7953             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7954                 goto onError;
7955         }
7956         else if (PyUnicode_Check(item)) {
7957             if (PyUnicode_READY(item) == -1)
7958                 goto onError;
7959             if (PyUnicode_GET_LENGTH(item) == 1) {
7960                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7961                 if (value == 0xFFFE)
7962                     goto Undefined;
7963                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7964                     goto onError;
7965             }
7966             else {
7967                 writer->overallocate = 1;
7968                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7969                     goto onError;
7970             }
7971         }
7972         else {
7973             /* wrong return value */
7974             PyErr_SetString(PyExc_TypeError,
7975                             "character mapping must return integer, None or str");
7976             goto onError;
7977         }
7978         Py_CLEAR(item);
7979         ++s;
7980         continue;
7981 
7982 Undefined:
7983         /* undefined mapping */
7984         Py_CLEAR(item);
7985         startinpos = s-starts;
7986         endinpos = startinpos+1;
7987         if (unicode_decode_call_errorhandler_writer(
7988                 errors, &errorHandler,
7989                 "charmap", "character maps to <undefined>",
7990                 &starts, &e, &startinpos, &endinpos, &exc, &s,
7991                 writer)) {
7992             goto onError;
7993         }
7994     }
7995     Py_XDECREF(errorHandler);
7996     Py_XDECREF(exc);
7997     return 0;
7998 
7999 onError:
8000     Py_XDECREF(item);
8001     Py_XDECREF(errorHandler);
8002     Py_XDECREF(exc);
8003     return -1;
8004 }
8005 
8006 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8007 PyUnicode_DecodeCharmap(const char *s,
8008                         Py_ssize_t size,
8009                         PyObject *mapping,
8010                         const char *errors)
8011 {
8012     _PyUnicodeWriter writer;
8013 
8014     /* Default to Latin-1 */
8015     if (mapping == NULL)
8016         return PyUnicode_DecodeLatin1(s, size, errors);
8017 
8018     if (size == 0)
8019         _Py_RETURN_UNICODE_EMPTY();
8020     _PyUnicodeWriter_Init(&writer);
8021     writer.min_length = size;
8022     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8023         goto onError;
8024 
8025     if (PyUnicode_CheckExact(mapping)) {
8026         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8027             goto onError;
8028     }
8029     else {
8030         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8031             goto onError;
8032     }
8033     return _PyUnicodeWriter_Finish(&writer);
8034 
8035   onError:
8036     _PyUnicodeWriter_Dealloc(&writer);
8037     return NULL;
8038 }
8039 
8040 /* Charmap encoding: the lookup table */
8041 
8042 struct encoding_map {
8043     PyObject_HEAD
8044     unsigned char level1[32];
8045     int count2, count3;
8046     unsigned char level23[1];
8047 };
8048 
8049 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8050 encoding_map_size(PyObject *obj, PyObject* args)
8051 {
8052     struct encoding_map *map = (struct encoding_map*)obj;
8053     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8054                            128*map->count3);
8055 }
8056 
8057 static PyMethodDef encoding_map_methods[] = {
8058     {"size", encoding_map_size, METH_NOARGS,
8059      PyDoc_STR("Return the size (in bytes) of this object") },
8060     { 0 }
8061 };
8062 
8063 static void
encoding_map_dealloc(PyObject * o)8064 encoding_map_dealloc(PyObject* o)
8065 {
8066     PyObject_FREE(o);
8067 }
8068 
8069 static PyTypeObject EncodingMapType = {
8070     PyVarObject_HEAD_INIT(NULL, 0)
8071     "EncodingMap",          /*tp_name*/
8072     sizeof(struct encoding_map),   /*tp_basicsize*/
8073     0,                      /*tp_itemsize*/
8074     /* methods */
8075     encoding_map_dealloc,   /*tp_dealloc*/
8076     0,                      /*tp_print*/
8077     0,                      /*tp_getattr*/
8078     0,                      /*tp_setattr*/
8079     0,                      /*tp_reserved*/
8080     0,                      /*tp_repr*/
8081     0,                      /*tp_as_number*/
8082     0,                      /*tp_as_sequence*/
8083     0,                      /*tp_as_mapping*/
8084     0,                      /*tp_hash*/
8085     0,                      /*tp_call*/
8086     0,                      /*tp_str*/
8087     0,                      /*tp_getattro*/
8088     0,                      /*tp_setattro*/
8089     0,                      /*tp_as_buffer*/
8090     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8091     0,                      /*tp_doc*/
8092     0,                      /*tp_traverse*/
8093     0,                      /*tp_clear*/
8094     0,                      /*tp_richcompare*/
8095     0,                      /*tp_weaklistoffset*/
8096     0,                      /*tp_iter*/
8097     0,                      /*tp_iternext*/
8098     encoding_map_methods,   /*tp_methods*/
8099     0,                      /*tp_members*/
8100     0,                      /*tp_getset*/
8101     0,                      /*tp_base*/
8102     0,                      /*tp_dict*/
8103     0,                      /*tp_descr_get*/
8104     0,                      /*tp_descr_set*/
8105     0,                      /*tp_dictoffset*/
8106     0,                      /*tp_init*/
8107     0,                      /*tp_alloc*/
8108     0,                      /*tp_new*/
8109     0,                      /*tp_free*/
8110     0,                      /*tp_is_gc*/
8111 };
8112 
8113 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8114 PyUnicode_BuildEncodingMap(PyObject* string)
8115 {
8116     PyObject *result;
8117     struct encoding_map *mresult;
8118     int i;
8119     int need_dict = 0;
8120     unsigned char level1[32];
8121     unsigned char level2[512];
8122     unsigned char *mlevel1, *mlevel2, *mlevel3;
8123     int count2 = 0, count3 = 0;
8124     int kind;
8125     void *data;
8126     Py_ssize_t length;
8127     Py_UCS4 ch;
8128 
8129     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8130         PyErr_BadArgument();
8131         return NULL;
8132     }
8133     kind = PyUnicode_KIND(string);
8134     data = PyUnicode_DATA(string);
8135     length = PyUnicode_GET_LENGTH(string);
8136     length = Py_MIN(length, 256);
8137     memset(level1, 0xFF, sizeof level1);
8138     memset(level2, 0xFF, sizeof level2);
8139 
8140     /* If there isn't a one-to-one mapping of NULL to \0,
8141        or if there are non-BMP characters, we need to use
8142        a mapping dictionary. */
8143     if (PyUnicode_READ(kind, data, 0) != 0)
8144         need_dict = 1;
8145     for (i = 1; i < length; i++) {
8146         int l1, l2;
8147         ch = PyUnicode_READ(kind, data, i);
8148         if (ch == 0 || ch > 0xFFFF) {
8149             need_dict = 1;
8150             break;
8151         }
8152         if (ch == 0xFFFE)
8153             /* unmapped character */
8154             continue;
8155         l1 = ch >> 11;
8156         l2 = ch >> 7;
8157         if (level1[l1] == 0xFF)
8158             level1[l1] = count2++;
8159         if (level2[l2] == 0xFF)
8160             level2[l2] = count3++;
8161     }
8162 
8163     if (count2 >= 0xFF || count3 >= 0xFF)
8164         need_dict = 1;
8165 
8166     if (need_dict) {
8167         PyObject *result = PyDict_New();
8168         PyObject *key, *value;
8169         if (!result)
8170             return NULL;
8171         for (i = 0; i < length; i++) {
8172             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8173             value = PyLong_FromLong(i);
8174             if (!key || !value)
8175                 goto failed1;
8176             if (PyDict_SetItem(result, key, value) == -1)
8177                 goto failed1;
8178             Py_DECREF(key);
8179             Py_DECREF(value);
8180         }
8181         return result;
8182       failed1:
8183         Py_XDECREF(key);
8184         Py_XDECREF(value);
8185         Py_DECREF(result);
8186         return NULL;
8187     }
8188 
8189     /* Create a three-level trie */
8190     result = PyObject_MALLOC(sizeof(struct encoding_map) +
8191                              16*count2 + 128*count3 - 1);
8192     if (!result)
8193         return PyErr_NoMemory();
8194     PyObject_Init(result, &EncodingMapType);
8195     mresult = (struct encoding_map*)result;
8196     mresult->count2 = count2;
8197     mresult->count3 = count3;
8198     mlevel1 = mresult->level1;
8199     mlevel2 = mresult->level23;
8200     mlevel3 = mresult->level23 + 16*count2;
8201     memcpy(mlevel1, level1, 32);
8202     memset(mlevel2, 0xFF, 16*count2);
8203     memset(mlevel3, 0, 128*count3);
8204     count3 = 0;
8205     for (i = 1; i < length; i++) {
8206         int o1, o2, o3, i2, i3;
8207         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8208         if (ch == 0xFFFE)
8209             /* unmapped character */
8210             continue;
8211         o1 = ch>>11;
8212         o2 = (ch>>7) & 0xF;
8213         i2 = 16*mlevel1[o1] + o2;
8214         if (mlevel2[i2] == 0xFF)
8215             mlevel2[i2] = count3++;
8216         o3 = ch & 0x7F;
8217         i3 = 128*mlevel2[i2] + o3;
8218         mlevel3[i3] = i;
8219     }
8220     return result;
8221 }
8222 
8223 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8224 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8225 {
8226     struct encoding_map *map = (struct encoding_map*)mapping;
8227     int l1 = c>>11;
8228     int l2 = (c>>7) & 0xF;
8229     int l3 = c & 0x7F;
8230     int i;
8231 
8232     if (c > 0xFFFF)
8233         return -1;
8234     if (c == 0)
8235         return 0;
8236     /* level 1*/
8237     i = map->level1[l1];
8238     if (i == 0xFF) {
8239         return -1;
8240     }
8241     /* level 2*/
8242     i = map->level23[16*i+l2];
8243     if (i == 0xFF) {
8244         return -1;
8245     }
8246     /* level 3 */
8247     i = map->level23[16*map->count2 + 128*i + l3];
8248     if (i == 0) {
8249         return -1;
8250     }
8251     return i;
8252 }
8253 
8254 /* Lookup the character ch in the mapping. If the character
8255    can't be found, Py_None is returned (or NULL, if another
8256    error occurred). */
8257 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8258 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8259 {
8260     PyObject *w = PyLong_FromLong((long)c);
8261     PyObject *x;
8262 
8263     if (w == NULL)
8264         return NULL;
8265     x = PyObject_GetItem(mapping, w);
8266     Py_DECREF(w);
8267     if (x == NULL) {
8268         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8269             /* No mapping found means: mapping is undefined. */
8270             PyErr_Clear();
8271             x = Py_None;
8272             Py_INCREF(x);
8273             return x;
8274         } else
8275             return NULL;
8276     }
8277     else if (x == Py_None)
8278         return x;
8279     else if (PyLong_Check(x)) {
8280         long value = PyLong_AS_LONG(x);
8281         if (value < 0 || value > 255) {
8282             PyErr_SetString(PyExc_TypeError,
8283                             "character mapping must be in range(256)");
8284             Py_DECREF(x);
8285             return NULL;
8286         }
8287         return x;
8288     }
8289     else if (PyBytes_Check(x))
8290         return x;
8291     else {
8292         /* wrong return value */
8293         PyErr_Format(PyExc_TypeError,
8294                      "character mapping must return integer, bytes or None, not %.400s",
8295                      x->ob_type->tp_name);
8296         Py_DECREF(x);
8297         return NULL;
8298     }
8299 }
8300 
8301 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8302 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8303 {
8304     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8305     /* exponentially overallocate to minimize reallocations */
8306     if (requiredsize < 2*outsize)
8307         requiredsize = 2*outsize;
8308     if (_PyBytes_Resize(outobj, requiredsize))
8309         return -1;
8310     return 0;
8311 }
8312 
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8316 /* lookup the character, put the result in the output string and adjust
8317    various state variables. Resize the output bytes object if not enough
8318    space is available. Return a new reference to the object that
8319    was put in the output buffer, or Py_None, if the mapping was undefined
8320    (in which case no character was written) or NULL, if a
8321    reallocation error occurred. The caller must decref the result */
8322 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8323 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8324                      PyObject **outobj, Py_ssize_t *outpos)
8325 {
8326     PyObject *rep;
8327     char *outstart;
8328     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8329 
8330     if (Py_TYPE(mapping) == &EncodingMapType) {
8331         int res = encoding_map_lookup(c, mapping);
8332         Py_ssize_t requiredsize = *outpos+1;
8333         if (res == -1)
8334             return enc_FAILED;
8335         if (outsize<requiredsize)
8336             if (charmapencode_resize(outobj, outpos, requiredsize))
8337                 return enc_EXCEPTION;
8338         outstart = PyBytes_AS_STRING(*outobj);
8339         outstart[(*outpos)++] = (char)res;
8340         return enc_SUCCESS;
8341     }
8342 
8343     rep = charmapencode_lookup(c, mapping);
8344     if (rep==NULL)
8345         return enc_EXCEPTION;
8346     else if (rep==Py_None) {
8347         Py_DECREF(rep);
8348         return enc_FAILED;
8349     } else {
8350         if (PyLong_Check(rep)) {
8351             Py_ssize_t requiredsize = *outpos+1;
8352             if (outsize<requiredsize)
8353                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8354                     Py_DECREF(rep);
8355                     return enc_EXCEPTION;
8356                 }
8357             outstart = PyBytes_AS_STRING(*outobj);
8358             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8359         }
8360         else {
8361             const char *repchars = PyBytes_AS_STRING(rep);
8362             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8363             Py_ssize_t requiredsize = *outpos+repsize;
8364             if (outsize<requiredsize)
8365                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8366                     Py_DECREF(rep);
8367                     return enc_EXCEPTION;
8368                 }
8369             outstart = PyBytes_AS_STRING(*outobj);
8370             memcpy(outstart + *outpos, repchars, repsize);
8371             *outpos += repsize;
8372         }
8373     }
8374     Py_DECREF(rep);
8375     return enc_SUCCESS;
8376 }
8377 
8378 /* handle an error in PyUnicode_EncodeCharmap
8379    Return 0 on success, -1 on error */
8380 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8381 charmap_encoding_error(
8382     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8383     PyObject **exceptionObject,
8384     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8385     PyObject **res, Py_ssize_t *respos)
8386 {
8387     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8388     Py_ssize_t size, repsize;
8389     Py_ssize_t newpos;
8390     enum PyUnicode_Kind kind;
8391     void *data;
8392     Py_ssize_t index;
8393     /* startpos for collecting unencodable chars */
8394     Py_ssize_t collstartpos = *inpos;
8395     Py_ssize_t collendpos = *inpos+1;
8396     Py_ssize_t collpos;
8397     char *encoding = "charmap";
8398     char *reason = "character maps to <undefined>";
8399     charmapencode_result x;
8400     Py_UCS4 ch;
8401     int val;
8402 
8403     if (PyUnicode_READY(unicode) == -1)
8404         return -1;
8405     size = PyUnicode_GET_LENGTH(unicode);
8406     /* find all unencodable characters */
8407     while (collendpos < size) {
8408         PyObject *rep;
8409         if (Py_TYPE(mapping) == &EncodingMapType) {
8410             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8411             val = encoding_map_lookup(ch, mapping);
8412             if (val != -1)
8413                 break;
8414             ++collendpos;
8415             continue;
8416         }
8417 
8418         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8419         rep = charmapencode_lookup(ch, mapping);
8420         if (rep==NULL)
8421             return -1;
8422         else if (rep!=Py_None) {
8423             Py_DECREF(rep);
8424             break;
8425         }
8426         Py_DECREF(rep);
8427         ++collendpos;
8428     }
8429     /* cache callback name lookup
8430      * (if not done yet, i.e. it's the first error) */
8431     if (*error_handler == _Py_ERROR_UNKNOWN)
8432         *error_handler = get_error_handler(errors);
8433 
8434     switch (*error_handler) {
8435     case _Py_ERROR_STRICT:
8436         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8437         return -1;
8438 
8439     case _Py_ERROR_REPLACE:
8440         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8441             x = charmapencode_output('?', mapping, res, respos);
8442             if (x==enc_EXCEPTION) {
8443                 return -1;
8444             }
8445             else if (x==enc_FAILED) {
8446                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8447                 return -1;
8448             }
8449         }
8450         /* fall through */
8451     case _Py_ERROR_IGNORE:
8452         *inpos = collendpos;
8453         break;
8454 
8455     case _Py_ERROR_XMLCHARREFREPLACE:
8456         /* generate replacement (temporarily (mis)uses p) */
8457         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8458             char buffer[2+29+1+1];
8459             char *cp;
8460             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8461             for (cp = buffer; *cp; ++cp) {
8462                 x = charmapencode_output(*cp, mapping, res, respos);
8463                 if (x==enc_EXCEPTION)
8464                     return -1;
8465                 else if (x==enc_FAILED) {
8466                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8467                     return -1;
8468                 }
8469             }
8470         }
8471         *inpos = collendpos;
8472         break;
8473 
8474     default:
8475         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8476                                                       encoding, reason, unicode, exceptionObject,
8477                                                       collstartpos, collendpos, &newpos);
8478         if (repunicode == NULL)
8479             return -1;
8480         if (PyBytes_Check(repunicode)) {
8481             /* Directly copy bytes result to output. */
8482             Py_ssize_t outsize = PyBytes_Size(*res);
8483             Py_ssize_t requiredsize;
8484             repsize = PyBytes_Size(repunicode);
8485             requiredsize = *respos + repsize;
8486             if (requiredsize > outsize)
8487                 /* Make room for all additional bytes. */
8488                 if (charmapencode_resize(res, respos, requiredsize)) {
8489                     Py_DECREF(repunicode);
8490                     return -1;
8491                 }
8492             memcpy(PyBytes_AsString(*res) + *respos,
8493                    PyBytes_AsString(repunicode),  repsize);
8494             *respos += repsize;
8495             *inpos = newpos;
8496             Py_DECREF(repunicode);
8497             break;
8498         }
8499         /* generate replacement  */
8500         if (PyUnicode_READY(repunicode) == -1) {
8501             Py_DECREF(repunicode);
8502             return -1;
8503         }
8504         repsize = PyUnicode_GET_LENGTH(repunicode);
8505         data = PyUnicode_DATA(repunicode);
8506         kind = PyUnicode_KIND(repunicode);
8507         for (index = 0; index < repsize; index++) {
8508             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8509             x = charmapencode_output(repch, mapping, res, respos);
8510             if (x==enc_EXCEPTION) {
8511                 Py_DECREF(repunicode);
8512                 return -1;
8513             }
8514             else if (x==enc_FAILED) {
8515                 Py_DECREF(repunicode);
8516                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8517                 return -1;
8518             }
8519         }
8520         *inpos = newpos;
8521         Py_DECREF(repunicode);
8522     }
8523     return 0;
8524 }
8525 
8526 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8527 _PyUnicode_EncodeCharmap(PyObject *unicode,
8528                          PyObject *mapping,
8529                          const char *errors)
8530 {
8531     /* output object */
8532     PyObject *res = NULL;
8533     /* current input position */
8534     Py_ssize_t inpos = 0;
8535     Py_ssize_t size;
8536     /* current output position */
8537     Py_ssize_t respos = 0;
8538     PyObject *error_handler_obj = NULL;
8539     PyObject *exc = NULL;
8540     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8541     void *data;
8542     int kind;
8543 
8544     if (PyUnicode_READY(unicode) == -1)
8545         return NULL;
8546     size = PyUnicode_GET_LENGTH(unicode);
8547     data = PyUnicode_DATA(unicode);
8548     kind = PyUnicode_KIND(unicode);
8549 
8550     /* Default to Latin-1 */
8551     if (mapping == NULL)
8552         return unicode_encode_ucs1(unicode, errors, 256);
8553 
8554     /* allocate enough for a simple encoding without
8555        replacements, if we need more, we'll resize */
8556     res = PyBytes_FromStringAndSize(NULL, size);
8557     if (res == NULL)
8558         goto onError;
8559     if (size == 0)
8560         return res;
8561 
8562     while (inpos<size) {
8563         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8564         /* try to encode it */
8565         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8566         if (x==enc_EXCEPTION) /* error */
8567             goto onError;
8568         if (x==enc_FAILED) { /* unencodable character */
8569             if (charmap_encoding_error(unicode, &inpos, mapping,
8570                                        &exc,
8571                                        &error_handler, &error_handler_obj, errors,
8572                                        &res, &respos)) {
8573                 goto onError;
8574             }
8575         }
8576         else
8577             /* done with this character => adjust input position */
8578             ++inpos;
8579     }
8580 
8581     /* Resize if we allocated to much */
8582     if (respos<PyBytes_GET_SIZE(res))
8583         if (_PyBytes_Resize(&res, respos) < 0)
8584             goto onError;
8585 
8586     Py_XDECREF(exc);
8587     Py_XDECREF(error_handler_obj);
8588     return res;
8589 
8590   onError:
8591     Py_XDECREF(res);
8592     Py_XDECREF(exc);
8593     Py_XDECREF(error_handler_obj);
8594     return NULL;
8595 }
8596 
8597 /* Deprecated */
8598 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8599 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8600                         Py_ssize_t size,
8601                         PyObject *mapping,
8602                         const char *errors)
8603 {
8604     PyObject *result;
8605     PyObject *unicode = PyUnicode_FromUnicode(p, size);
8606     if (unicode == NULL)
8607         return NULL;
8608     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8609     Py_DECREF(unicode);
8610     return result;
8611 }
8612 
8613 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8614 PyUnicode_AsCharmapString(PyObject *unicode,
8615                           PyObject *mapping)
8616 {
8617     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8618         PyErr_BadArgument();
8619         return NULL;
8620     }
8621     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8622 }
8623 
8624 /* create or adjust a UnicodeTranslateError */
8625 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8626 make_translate_exception(PyObject **exceptionObject,
8627                          PyObject *unicode,
8628                          Py_ssize_t startpos, Py_ssize_t endpos,
8629                          const char *reason)
8630 {
8631     if (*exceptionObject == NULL) {
8632         *exceptionObject = _PyUnicodeTranslateError_Create(
8633             unicode, startpos, endpos, reason);
8634     }
8635     else {
8636         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8637             goto onError;
8638         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8639             goto onError;
8640         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8641             goto onError;
8642         return;
8643       onError:
8644         Py_CLEAR(*exceptionObject);
8645     }
8646 }
8647 
8648 /* error handling callback helper:
8649    build arguments, call the callback and check the arguments,
8650    put the result into newpos and return the replacement string, which
8651    has to be freed by the caller */
8652 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8653 unicode_translate_call_errorhandler(const char *errors,
8654                                     PyObject **errorHandler,
8655                                     const char *reason,
8656                                     PyObject *unicode, PyObject **exceptionObject,
8657                                     Py_ssize_t startpos, Py_ssize_t endpos,
8658                                     Py_ssize_t *newpos)
8659 {
8660     static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
8661 
8662     Py_ssize_t i_newpos;
8663     PyObject *restuple;
8664     PyObject *resunicode;
8665 
8666     if (*errorHandler == NULL) {
8667         *errorHandler = PyCodec_LookupError(errors);
8668         if (*errorHandler == NULL)
8669             return NULL;
8670     }
8671 
8672     make_translate_exception(exceptionObject,
8673                              unicode, startpos, endpos, reason);
8674     if (*exceptionObject == NULL)
8675         return NULL;
8676 
8677     restuple = PyObject_CallFunctionObjArgs(
8678         *errorHandler, *exceptionObject, NULL);
8679     if (restuple == NULL)
8680         return NULL;
8681     if (!PyTuple_Check(restuple)) {
8682         PyErr_SetString(PyExc_TypeError, &argparse[4]);
8683         Py_DECREF(restuple);
8684         return NULL;
8685     }
8686     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8687                           &resunicode, &i_newpos)) {
8688         Py_DECREF(restuple);
8689         return NULL;
8690     }
8691     if (i_newpos<0)
8692         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8693     else
8694         *newpos = i_newpos;
8695     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8696         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8697         Py_DECREF(restuple);
8698         return NULL;
8699     }
8700     Py_INCREF(resunicode);
8701     Py_DECREF(restuple);
8702     return resunicode;
8703 }
8704 
8705 /* Lookup the character ch in the mapping and put the result in result,
8706    which must be decrefed by the caller.
8707    Return 0 on success, -1 on error */
8708 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8709 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8710 {
8711     PyObject *w = PyLong_FromLong((long)c);
8712     PyObject *x;
8713 
8714     if (w == NULL)
8715         return -1;
8716     x = PyObject_GetItem(mapping, w);
8717     Py_DECREF(w);
8718     if (x == NULL) {
8719         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8720             /* No mapping found means: use 1:1 mapping. */
8721             PyErr_Clear();
8722             *result = NULL;
8723             return 0;
8724         } else
8725             return -1;
8726     }
8727     else if (x == Py_None) {
8728         *result = x;
8729         return 0;
8730     }
8731     else if (PyLong_Check(x)) {
8732         long value = PyLong_AS_LONG(x);
8733         if (value < 0 || value > MAX_UNICODE) {
8734             PyErr_Format(PyExc_ValueError,
8735                          "character mapping must be in range(0x%x)",
8736                          MAX_UNICODE+1);
8737             Py_DECREF(x);
8738             return -1;
8739         }
8740         *result = x;
8741         return 0;
8742     }
8743     else if (PyUnicode_Check(x)) {
8744         *result = x;
8745         return 0;
8746     }
8747     else {
8748         /* wrong return value */
8749         PyErr_SetString(PyExc_TypeError,
8750                         "character mapping must return integer, None or str");
8751         Py_DECREF(x);
8752         return -1;
8753     }
8754 }
8755 
8756 /* lookup the character, write the result into the writer.
8757    Return 1 if the result was written into the writer, return 0 if the mapping
8758    was undefined, raise an exception return -1 on error. */
8759 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8760 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8761                         _PyUnicodeWriter *writer)
8762 {
8763     PyObject *item;
8764 
8765     if (charmaptranslate_lookup(ch, mapping, &item))
8766         return -1;
8767 
8768     if (item == NULL) {
8769         /* not found => default to 1:1 mapping */
8770         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8771             return -1;
8772         }
8773         return 1;
8774     }
8775 
8776     if (item == Py_None) {
8777         Py_DECREF(item);
8778         return 0;
8779     }
8780 
8781     if (PyLong_Check(item)) {
8782         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8783         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8784            used it */
8785         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8786             Py_DECREF(item);
8787             return -1;
8788         }
8789         Py_DECREF(item);
8790         return 1;
8791     }
8792 
8793     if (!PyUnicode_Check(item)) {
8794         Py_DECREF(item);
8795         return -1;
8796     }
8797 
8798     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8799         Py_DECREF(item);
8800         return -1;
8801     }
8802 
8803     Py_DECREF(item);
8804     return 1;
8805 }
8806 
8807 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8808 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8809                               Py_UCS1 *translate)
8810 {
8811     PyObject *item = NULL;
8812     int ret = 0;
8813 
8814     if (charmaptranslate_lookup(ch, mapping, &item)) {
8815         return -1;
8816     }
8817 
8818     if (item == Py_None) {
8819         /* deletion */
8820         translate[ch] = 0xfe;
8821     }
8822     else if (item == NULL) {
8823         /* not found => default to 1:1 mapping */
8824         translate[ch] = ch;
8825         return 1;
8826     }
8827     else if (PyLong_Check(item)) {
8828         long replace = PyLong_AS_LONG(item);
8829         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8830            used it */
8831         if (127 < replace) {
8832             /* invalid character or character outside ASCII:
8833                skip the fast translate */
8834             goto exit;
8835         }
8836         translate[ch] = (Py_UCS1)replace;
8837     }
8838     else if (PyUnicode_Check(item)) {
8839         Py_UCS4 replace;
8840 
8841         if (PyUnicode_READY(item) == -1) {
8842             Py_DECREF(item);
8843             return -1;
8844         }
8845         if (PyUnicode_GET_LENGTH(item) != 1)
8846             goto exit;
8847 
8848         replace = PyUnicode_READ_CHAR(item, 0);
8849         if (replace > 127)
8850             goto exit;
8851         translate[ch] = (Py_UCS1)replace;
8852     }
8853     else {
8854         /* not None, NULL, long or unicode */
8855         goto exit;
8856     }
8857     ret = 1;
8858 
8859   exit:
8860     Py_DECREF(item);
8861     return ret;
8862 }
8863 
8864 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8865    was translated into writer, return 0 if the input string was partially
8866    translated into writer, raise an exception and return -1 on error. */
8867 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8868 unicode_fast_translate(PyObject *input, PyObject *mapping,
8869                        _PyUnicodeWriter *writer, int ignore,
8870                        Py_ssize_t *input_pos)
8871 {
8872     Py_UCS1 ascii_table[128], ch, ch2;
8873     Py_ssize_t len;
8874     Py_UCS1 *in, *end, *out;
8875     int res = 0;
8876 
8877     len = PyUnicode_GET_LENGTH(input);
8878 
8879     memset(ascii_table, 0xff, 128);
8880 
8881     in = PyUnicode_1BYTE_DATA(input);
8882     end = in + len;
8883 
8884     assert(PyUnicode_IS_ASCII(writer->buffer));
8885     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8886     out = PyUnicode_1BYTE_DATA(writer->buffer);
8887 
8888     for (; in < end; in++) {
8889         ch = *in;
8890         ch2 = ascii_table[ch];
8891         if (ch2 == 0xff) {
8892             int translate = unicode_fast_translate_lookup(mapping, ch,
8893                                                           ascii_table);
8894             if (translate < 0)
8895                 return -1;
8896             if (translate == 0)
8897                 goto exit;
8898             ch2 = ascii_table[ch];
8899         }
8900         if (ch2 == 0xfe) {
8901             if (ignore)
8902                 continue;
8903             goto exit;
8904         }
8905         assert(ch2 < 128);
8906         *out = ch2;
8907         out++;
8908     }
8909     res = 1;
8910 
8911 exit:
8912     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8913     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8914     return res;
8915 }
8916 
8917 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8918 _PyUnicode_TranslateCharmap(PyObject *input,
8919                             PyObject *mapping,
8920                             const char *errors)
8921 {
8922     /* input object */
8923     char *data;
8924     Py_ssize_t size, i;
8925     int kind;
8926     /* output buffer */
8927     _PyUnicodeWriter writer;
8928     /* error handler */
8929     char *reason = "character maps to <undefined>";
8930     PyObject *errorHandler = NULL;
8931     PyObject *exc = NULL;
8932     int ignore;
8933     int res;
8934 
8935     if (mapping == NULL) {
8936         PyErr_BadArgument();
8937         return NULL;
8938     }
8939 
8940     if (PyUnicode_READY(input) == -1)
8941         return NULL;
8942     data = (char*)PyUnicode_DATA(input);
8943     kind = PyUnicode_KIND(input);
8944     size = PyUnicode_GET_LENGTH(input);
8945 
8946     if (size == 0)
8947         return PyUnicode_FromObject(input);
8948 
8949     /* allocate enough for a simple 1:1 translation without
8950        replacements, if we need more, we'll resize */
8951     _PyUnicodeWriter_Init(&writer);
8952     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8953         goto onError;
8954 
8955     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8956 
8957     if (PyUnicode_READY(input) == -1)
8958         return NULL;
8959     if (PyUnicode_IS_ASCII(input)) {
8960         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8961         if (res < 0) {
8962             _PyUnicodeWriter_Dealloc(&writer);
8963             return NULL;
8964         }
8965         if (res == 1)
8966             return _PyUnicodeWriter_Finish(&writer);
8967     }
8968     else {
8969         i = 0;
8970     }
8971 
8972     while (i<size) {
8973         /* try to encode it */
8974         int translate;
8975         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8976         Py_ssize_t newpos;
8977         /* startpos for collecting untranslatable chars */
8978         Py_ssize_t collstart;
8979         Py_ssize_t collend;
8980         Py_UCS4 ch;
8981 
8982         ch = PyUnicode_READ(kind, data, i);
8983         translate = charmaptranslate_output(ch, mapping, &writer);
8984         if (translate < 0)
8985             goto onError;
8986 
8987         if (translate != 0) {
8988             /* it worked => adjust input pointer */
8989             ++i;
8990             continue;
8991         }
8992 
8993         /* untranslatable character */
8994         collstart = i;
8995         collend = i+1;
8996 
8997         /* find all untranslatable characters */
8998         while (collend < size) {
8999             PyObject *x;
9000             ch = PyUnicode_READ(kind, data, collend);
9001             if (charmaptranslate_lookup(ch, mapping, &x))
9002                 goto onError;
9003             Py_XDECREF(x);
9004             if (x != Py_None)
9005                 break;
9006             ++collend;
9007         }
9008 
9009         if (ignore) {
9010             i = collend;
9011         }
9012         else {
9013             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9014                                                              reason, input, &exc,
9015                                                              collstart, collend, &newpos);
9016             if (repunicode == NULL)
9017                 goto onError;
9018             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9019                 Py_DECREF(repunicode);
9020                 goto onError;
9021             }
9022             Py_DECREF(repunicode);
9023             i = newpos;
9024         }
9025     }
9026     Py_XDECREF(exc);
9027     Py_XDECREF(errorHandler);
9028     return _PyUnicodeWriter_Finish(&writer);
9029 
9030   onError:
9031     _PyUnicodeWriter_Dealloc(&writer);
9032     Py_XDECREF(exc);
9033     Py_XDECREF(errorHandler);
9034     return NULL;
9035 }
9036 
9037 /* Deprecated. Use PyUnicode_Translate instead. */
9038 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9039 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9040                            Py_ssize_t size,
9041                            PyObject *mapping,
9042                            const char *errors)
9043 {
9044     PyObject *result;
9045     PyObject *unicode = PyUnicode_FromUnicode(p, size);
9046     if (!unicode)
9047         return NULL;
9048     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9049     Py_DECREF(unicode);
9050     return result;
9051 }
9052 
9053 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9054 PyUnicode_Translate(PyObject *str,
9055                     PyObject *mapping,
9056                     const char *errors)
9057 {
9058     if (ensure_unicode(str) < 0)
9059         return NULL;
9060     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9061 }
9062 
9063 static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject * self)9064 fix_decimal_and_space_to_ascii(PyObject *self)
9065 {
9066     /* No need to call PyUnicode_READY(self) because this function is only
9067        called as a callback from fixup() which does it already. */
9068     const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9069     const int kind = PyUnicode_KIND(self);
9070     void *data = PyUnicode_DATA(self);
9071     Py_UCS4 maxchar = 127, ch, fixed;
9072     int modified = 0;
9073     Py_ssize_t i;
9074 
9075     for (i = 0; i < len; ++i) {
9076         ch = PyUnicode_READ(kind, data, i);
9077         fixed = 0;
9078         if (ch > 127) {
9079             if (Py_UNICODE_ISSPACE(ch))
9080                 fixed = ' ';
9081             else {
9082                 const int decimal = Py_UNICODE_TODECIMAL(ch);
9083                 if (decimal >= 0)
9084                     fixed = '0' + decimal;
9085             }
9086             if (fixed != 0) {
9087                 modified = 1;
9088                 maxchar = Py_MAX(maxchar, fixed);
9089                 PyUnicode_WRITE(kind, data, i, fixed);
9090             }
9091             else
9092                 maxchar = Py_MAX(maxchar, ch);
9093         }
9094     }
9095 
9096     return (modified) ? maxchar : 0;
9097 }
9098 
9099 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9100 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9101 {
9102     if (!PyUnicode_Check(unicode)) {
9103         PyErr_BadInternalCall();
9104         return NULL;
9105     }
9106     if (PyUnicode_READY(unicode) == -1)
9107         return NULL;
9108     if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9109         /* If the string is already ASCII, just return the same string */
9110         Py_INCREF(unicode);
9111         return unicode;
9112     }
9113     return fixup(unicode, fix_decimal_and_space_to_ascii);
9114 }
9115 
9116 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9117 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9118                                   Py_ssize_t length)
9119 {
9120     PyObject *decimal;
9121     Py_ssize_t i;
9122     Py_UCS4 maxchar;
9123     enum PyUnicode_Kind kind;
9124     void *data;
9125 
9126     maxchar = 127;
9127     for (i = 0; i < length; i++) {
9128         Py_UCS4 ch = s[i];
9129         if (ch > 127) {
9130             int decimal = Py_UNICODE_TODECIMAL(ch);
9131             if (decimal >= 0)
9132                 ch = '0' + decimal;
9133             maxchar = Py_MAX(maxchar, ch);
9134         }
9135     }
9136 
9137     /* Copy to a new string */
9138     decimal = PyUnicode_New(length, maxchar);
9139     if (decimal == NULL)
9140         return decimal;
9141     kind = PyUnicode_KIND(decimal);
9142     data = PyUnicode_DATA(decimal);
9143     /* Iterate over code points */
9144     for (i = 0; i < length; i++) {
9145         Py_UCS4 ch = s[i];
9146         if (ch > 127) {
9147             int decimal = Py_UNICODE_TODECIMAL(ch);
9148             if (decimal >= 0)
9149                 ch = '0' + decimal;
9150         }
9151         PyUnicode_WRITE(kind, data, i, ch);
9152     }
9153     return unicode_result(decimal);
9154 }
9155 /* --- Decimal Encoder ---------------------------------------------------- */
9156 
9157 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9158 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9159                         Py_ssize_t length,
9160                         char *output,
9161                         const char *errors)
9162 {
9163     PyObject *unicode;
9164     Py_ssize_t i;
9165     enum PyUnicode_Kind kind;
9166     void *data;
9167 
9168     if (output == NULL) {
9169         PyErr_BadArgument();
9170         return -1;
9171     }
9172 
9173     unicode = PyUnicode_FromUnicode(s, length);
9174     if (unicode == NULL)
9175         return -1;
9176 
9177     if (PyUnicode_READY(unicode) == -1) {
9178         Py_DECREF(unicode);
9179         return -1;
9180     }
9181     kind = PyUnicode_KIND(unicode);
9182     data = PyUnicode_DATA(unicode);
9183 
9184     for (i=0; i < length; ) {
9185         PyObject *exc;
9186         Py_UCS4 ch;
9187         int decimal;
9188         Py_ssize_t startpos;
9189 
9190         ch = PyUnicode_READ(kind, data, i);
9191 
9192         if (Py_UNICODE_ISSPACE(ch)) {
9193             *output++ = ' ';
9194             i++;
9195             continue;
9196         }
9197         decimal = Py_UNICODE_TODECIMAL(ch);
9198         if (decimal >= 0) {
9199             *output++ = '0' + decimal;
9200             i++;
9201             continue;
9202         }
9203         if (0 < ch && ch < 256) {
9204             *output++ = (char)ch;
9205             i++;
9206             continue;
9207         }
9208 
9209         startpos = i;
9210         exc = NULL;
9211         raise_encode_exception(&exc, "decimal", unicode,
9212                                startpos, startpos+1,
9213                                "invalid decimal Unicode string");
9214         Py_XDECREF(exc);
9215         Py_DECREF(unicode);
9216         return -1;
9217     }
9218     /* 0-terminate the output string */
9219     *output++ = '\0';
9220     Py_DECREF(unicode);
9221     return 0;
9222 }
9223 
9224 /* --- Helpers ------------------------------------------------------------ */
9225 
/* Helper macro to fix up start/end slice values against a sequence of
   length len, in place:

     - end greater than len is clamped to len;
     - a negative end or start is taken relative to len and clamped to 0.

   start is not clamped to len; callers compare start and end afterwards.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none of them may have side effects.  The expansion is
   a single statement: write a terminating semicolon after the invocation. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9240 
9241 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9242 any_find_slice(PyObject* s1, PyObject* s2,
9243                Py_ssize_t start,
9244                Py_ssize_t end,
9245                int direction)
9246 {
9247     int kind1, kind2;
9248     void *buf1, *buf2;
9249     Py_ssize_t len1, len2, result;
9250 
9251     kind1 = PyUnicode_KIND(s1);
9252     kind2 = PyUnicode_KIND(s2);
9253     if (kind1 < kind2)
9254         return -1;
9255 
9256     len1 = PyUnicode_GET_LENGTH(s1);
9257     len2 = PyUnicode_GET_LENGTH(s2);
9258     ADJUST_INDICES(start, end, len1);
9259     if (end - start < len2)
9260         return -1;
9261 
9262     buf1 = PyUnicode_DATA(s1);
9263     buf2 = PyUnicode_DATA(s2);
9264     if (len2 == 1) {
9265         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9266         result = findchar((const char *)buf1 + kind1*start,
9267                           kind1, end - start, ch, direction);
9268         if (result == -1)
9269             return -1;
9270         else
9271             return start + result;
9272     }
9273 
9274     if (kind2 != kind1) {
9275         buf2 = _PyUnicode_AsKind(s2, kind1);
9276         if (!buf2)
9277             return -2;
9278     }
9279 
9280     if (direction > 0) {
9281         switch (kind1) {
9282         case PyUnicode_1BYTE_KIND:
9283             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9284                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9285             else
9286                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9287             break;
9288         case PyUnicode_2BYTE_KIND:
9289             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9290             break;
9291         case PyUnicode_4BYTE_KIND:
9292             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9293             break;
9294         default:
9295             assert(0); result = -2;
9296         }
9297     }
9298     else {
9299         switch (kind1) {
9300         case PyUnicode_1BYTE_KIND:
9301             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9302                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303             else
9304                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9305             break;
9306         case PyUnicode_2BYTE_KIND:
9307             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9308             break;
9309         case PyUnicode_4BYTE_KIND:
9310             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9311             break;
9312         default:
9313             assert(0); result = -2;
9314         }
9315     }
9316 
9317     if (kind2 != kind1)
9318         PyMem_Free(buf2);
9319 
9320     return result;
9321 }
9322 
9323 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(PyObject * unicode,Py_ssize_t index,Py_ssize_t n_buffer,void * digits,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9324 _PyUnicode_InsertThousandsGrouping(
9325     PyObject *unicode, Py_ssize_t index,
9326     Py_ssize_t n_buffer,
9327     void *digits, Py_ssize_t n_digits,
9328     Py_ssize_t min_width,
9329     const char *grouping, PyObject *thousands_sep,
9330     Py_UCS4 *maxchar)
9331 {
9332     unsigned int kind, thousands_sep_kind;
9333     char *data, *thousands_sep_data;
9334     Py_ssize_t thousands_sep_len;
9335     Py_ssize_t len;
9336 
9337     if (unicode != NULL) {
9338         kind = PyUnicode_KIND(unicode);
9339         data = (char *) PyUnicode_DATA(unicode) + index * kind;
9340     }
9341     else {
9342         kind = PyUnicode_1BYTE_KIND;
9343         data = NULL;
9344     }
9345     thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9346     thousands_sep_data = PyUnicode_DATA(thousands_sep);
9347     thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9348     if (unicode != NULL && thousands_sep_kind != kind) {
9349         if (thousands_sep_kind < kind) {
9350             thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9351             if (!thousands_sep_data)
9352                 return -1;
9353         }
9354         else {
9355             data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9356             if (!data)
9357                 return -1;
9358         }
9359     }
9360 
9361     switch (kind) {
9362     case PyUnicode_1BYTE_KIND:
9363         if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9364             len = asciilib_InsertThousandsGrouping(
9365                 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9366                 min_width, grouping,
9367                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9368         else
9369             len = ucs1lib_InsertThousandsGrouping(
9370                 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9371                 min_width, grouping,
9372                 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9373         break;
9374     case PyUnicode_2BYTE_KIND:
9375         len = ucs2lib_InsertThousandsGrouping(
9376             (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9377             min_width, grouping,
9378             (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9379         break;
9380     case PyUnicode_4BYTE_KIND:
9381         len = ucs4lib_InsertThousandsGrouping(
9382             (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9383             min_width, grouping,
9384             (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9385         break;
9386     default:
9387         assert(0);
9388         return -1;
9389     }
9390     if (unicode != NULL && thousands_sep_kind != kind) {
9391         if (thousands_sep_kind < kind)
9392             PyMem_Free(thousands_sep_data);
9393         else
9394             PyMem_Free(data);
9395     }
9396     if (unicode == NULL) {
9397         *maxchar = 127;
9398         if (len != n_digits) {
9399             *maxchar = Py_MAX(*maxchar,
9400                                    PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9401         }
9402     }
9403     return len;
9404 }
9405 
9406 
9407 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9408 PyUnicode_Count(PyObject *str,
9409                 PyObject *substr,
9410                 Py_ssize_t start,
9411                 Py_ssize_t end)
9412 {
9413     Py_ssize_t result;
9414     int kind1, kind2;
9415     void *buf1 = NULL, *buf2 = NULL;
9416     Py_ssize_t len1, len2;
9417 
9418     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9419         return -1;
9420 
9421     kind1 = PyUnicode_KIND(str);
9422     kind2 = PyUnicode_KIND(substr);
9423     if (kind1 < kind2)
9424         return 0;
9425 
9426     len1 = PyUnicode_GET_LENGTH(str);
9427     len2 = PyUnicode_GET_LENGTH(substr);
9428     ADJUST_INDICES(start, end, len1);
9429     if (end - start < len2)
9430         return 0;
9431 
9432     buf1 = PyUnicode_DATA(str);
9433     buf2 = PyUnicode_DATA(substr);
9434     if (kind2 != kind1) {
9435         buf2 = _PyUnicode_AsKind(substr, kind1);
9436         if (!buf2)
9437             goto onError;
9438     }
9439 
9440     switch (kind1) {
9441     case PyUnicode_1BYTE_KIND:
9442         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9443             result = asciilib_count(
9444                 ((Py_UCS1*)buf1) + start, end - start,
9445                 buf2, len2, PY_SSIZE_T_MAX
9446                 );
9447         else
9448             result = ucs1lib_count(
9449                 ((Py_UCS1*)buf1) + start, end - start,
9450                 buf2, len2, PY_SSIZE_T_MAX
9451                 );
9452         break;
9453     case PyUnicode_2BYTE_KIND:
9454         result = ucs2lib_count(
9455             ((Py_UCS2*)buf1) + start, end - start,
9456             buf2, len2, PY_SSIZE_T_MAX
9457             );
9458         break;
9459     case PyUnicode_4BYTE_KIND:
9460         result = ucs4lib_count(
9461             ((Py_UCS4*)buf1) + start, end - start,
9462             buf2, len2, PY_SSIZE_T_MAX
9463             );
9464         break;
9465     default:
9466         assert(0); result = 0;
9467     }
9468 
9469     if (kind2 != kind1)
9470         PyMem_Free(buf2);
9471 
9472     return result;
9473   onError:
9474     if (kind2 != kind1 && buf2)
9475         PyMem_Free(buf2);
9476     return -1;
9477 }
9478 
9479 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9480 PyUnicode_Find(PyObject *str,
9481                PyObject *substr,
9482                Py_ssize_t start,
9483                Py_ssize_t end,
9484                int direction)
9485 {
9486     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9487         return -2;
9488 
9489     return any_find_slice(str, substr, start, end, direction);
9490 }
9491 
9492 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9493 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9494                    Py_ssize_t start, Py_ssize_t end,
9495                    int direction)
9496 {
9497     int kind;
9498     Py_ssize_t result;
9499     if (PyUnicode_READY(str) == -1)
9500         return -2;
9501     if (start < 0 || end < 0) {
9502         PyErr_SetString(PyExc_IndexError, "string index out of range");
9503         return -2;
9504     }
9505     if (end > PyUnicode_GET_LENGTH(str))
9506         end = PyUnicode_GET_LENGTH(str);
9507     if (start >= end)
9508         return -1;
9509     kind = PyUnicode_KIND(str);
9510     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9511                       kind, end-start, ch, direction);
9512     if (result == -1)
9513         return -1;
9514     else
9515         return start + result;
9516 }
9517 
9518 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9519 tailmatch(PyObject *self,
9520           PyObject *substring,
9521           Py_ssize_t start,
9522           Py_ssize_t end,
9523           int direction)
9524 {
9525     int kind_self;
9526     int kind_sub;
9527     void *data_self;
9528     void *data_sub;
9529     Py_ssize_t offset;
9530     Py_ssize_t i;
9531     Py_ssize_t end_sub;
9532 
9533     if (PyUnicode_READY(self) == -1 ||
9534         PyUnicode_READY(substring) == -1)
9535         return -1;
9536 
9537     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9538     end -= PyUnicode_GET_LENGTH(substring);
9539     if (end < start)
9540         return 0;
9541 
9542     if (PyUnicode_GET_LENGTH(substring) == 0)
9543         return 1;
9544 
9545     kind_self = PyUnicode_KIND(self);
9546     data_self = PyUnicode_DATA(self);
9547     kind_sub = PyUnicode_KIND(substring);
9548     data_sub = PyUnicode_DATA(substring);
9549     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9550 
9551     if (direction > 0)
9552         offset = end;
9553     else
9554         offset = start;
9555 
9556     if (PyUnicode_READ(kind_self, data_self, offset) ==
9557         PyUnicode_READ(kind_sub, data_sub, 0) &&
9558         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9559         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9560         /* If both are of the same kind, memcmp is sufficient */
9561         if (kind_self == kind_sub) {
9562             return ! memcmp((char *)data_self +
9563                                 (offset * PyUnicode_KIND(substring)),
9564                             data_sub,
9565                             PyUnicode_GET_LENGTH(substring) *
9566                                 PyUnicode_KIND(substring));
9567         }
9568         /* otherwise we have to compare each character by first accessing it */
9569         else {
9570             /* We do not need to compare 0 and len(substring)-1 because
9571                the if statement above ensured already that they are equal
9572                when we end up here. */
9573             for (i = 1; i < end_sub; ++i) {
9574                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9575                     PyUnicode_READ(kind_sub, data_sub, i))
9576                     return 0;
9577             }
9578             return 1;
9579         }
9580     }
9581 
9582     return 0;
9583 }
9584 
9585 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9586 PyUnicode_Tailmatch(PyObject *str,
9587                     PyObject *substr,
9588                     Py_ssize_t start,
9589                     Py_ssize_t end,
9590                     int direction)
9591 {
9592     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9593         return -1;
9594 
9595     return tailmatch(str, substr, start, end, direction);
9596 }
9597 
9598 /* Apply fixfct filter to the Unicode object self and return a
9599    reference to the modified object */
9600 
9601 static PyObject *
fixup(PyObject * self,Py_UCS4 (* fixfct)(PyObject * s))9602 fixup(PyObject *self,
9603       Py_UCS4 (*fixfct)(PyObject *s))
9604 {
9605     PyObject *u;
9606     Py_UCS4 maxchar_old, maxchar_new = 0;
9607     PyObject *v;
9608 
9609     u = _PyUnicode_Copy(self);
9610     if (u == NULL)
9611         return NULL;
9612     maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9613 
9614     /* fix functions return the new maximum character in a string,
9615        if the kind of the resulting unicode object does not change,
9616        everything is fine.  Otherwise we need to change the string kind
9617        and re-run the fix function. */
9618     maxchar_new = fixfct(u);
9619 
9620     if (maxchar_new == 0) {
9621         /* no changes */;
9622         if (PyUnicode_CheckExact(self)) {
9623             Py_DECREF(u);
9624             Py_INCREF(self);
9625             return self;
9626         }
9627         else
9628             return u;
9629     }
9630 
9631     maxchar_new = align_maxchar(maxchar_new);
9632 
9633     if (maxchar_new == maxchar_old)
9634         return u;
9635 
9636     /* In case the maximum character changed, we need to
9637        convert the string to the new category. */
9638     v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9639     if (v == NULL) {
9640         Py_DECREF(u);
9641         return NULL;
9642     }
9643     if (maxchar_new > maxchar_old) {
9644         /* If the maxchar increased so that the kind changed, not all
9645            characters are representable anymore and we need to fix the
9646            string again. This only happens in very few cases. */
9647         _PyUnicode_FastCopyCharacters(v, 0,
9648                                       self, 0, PyUnicode_GET_LENGTH(self));
9649         maxchar_old = fixfct(v);
9650         assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9651     }
9652     else {
9653         _PyUnicode_FastCopyCharacters(v, 0,
9654                                       u, 0, PyUnicode_GET_LENGTH(self));
9655     }
9656     Py_DECREF(u);
9657     assert(_PyUnicode_CheckConsistency(v, 1));
9658     return v;
9659 }
9660 
9661 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9662 ascii_upper_or_lower(PyObject *self, int lower)
9663 {
9664     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9665     char *resdata, *data = PyUnicode_DATA(self);
9666     PyObject *res;
9667 
9668     res = PyUnicode_New(len, 127);
9669     if (res == NULL)
9670         return NULL;
9671     resdata = PyUnicode_DATA(res);
9672     if (lower)
9673         _Py_bytes_lower(resdata, data, len);
9674     else
9675         _Py_bytes_upper(resdata, data, len);
9676     return res;
9677 }
9678 
9679 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9680 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9681 {
9682     Py_ssize_t j;
9683     int final_sigma;
9684     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9685     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9686 
9687      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9688 
9689     where ! is a negation and \p{xxx} is a character with property xxx.
9690     */
9691     for (j = i - 1; j >= 0; j--) {
9692         c = PyUnicode_READ(kind, data, j);
9693         if (!_PyUnicode_IsCaseIgnorable(c))
9694             break;
9695     }
9696     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9697     if (final_sigma) {
9698         for (j = i + 1; j < length; j++) {
9699             c = PyUnicode_READ(kind, data, j);
9700             if (!_PyUnicode_IsCaseIgnorable(c))
9701                 break;
9702         }
9703         final_sigma = j == length || !_PyUnicode_IsCased(c);
9704     }
9705     return (final_sigma) ? 0x3C2 : 0x3C3;
9706 }
9707 
9708 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9709 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9710            Py_UCS4 c, Py_UCS4 *mapped)
9711 {
9712     /* Obscure special case. */
9713     if (c == 0x3A3) {
9714         mapped[0] = handle_capital_sigma(kind, data, length, i);
9715         return 1;
9716     }
9717     return _PyUnicode_ToLowerFull(c, mapped);
9718 }
9719 
9720 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9721 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9722 {
9723     Py_ssize_t i, k = 0;
9724     int n_res, j;
9725     Py_UCS4 c, mapped[3];
9726 
9727     c = PyUnicode_READ(kind, data, 0);
9728     n_res = _PyUnicode_ToUpperFull(c, mapped);
9729     for (j = 0; j < n_res; j++) {
9730         *maxchar = Py_MAX(*maxchar, mapped[j]);
9731         res[k++] = mapped[j];
9732     }
9733     for (i = 1; i < length; i++) {
9734         c = PyUnicode_READ(kind, data, i);
9735         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9736         for (j = 0; j < n_res; j++) {
9737             *maxchar = Py_MAX(*maxchar, mapped[j]);
9738             res[k++] = mapped[j];
9739         }
9740     }
9741     return k;
9742 }
9743 
9744 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9745 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9746     Py_ssize_t i, k = 0;
9747 
9748     for (i = 0; i < length; i++) {
9749         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750         int n_res, j;
9751         if (Py_UNICODE_ISUPPER(c)) {
9752             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753         }
9754         else if (Py_UNICODE_ISLOWER(c)) {
9755             n_res = _PyUnicode_ToUpperFull(c, mapped);
9756         }
9757         else {
9758             n_res = 1;
9759             mapped[0] = c;
9760         }
9761         for (j = 0; j < n_res; j++) {
9762             *maxchar = Py_MAX(*maxchar, mapped[j]);
9763             res[k++] = mapped[j];
9764         }
9765     }
9766     return k;
9767 }
9768 
9769 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9770 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9771                   Py_UCS4 *maxchar, int lower)
9772 {
9773     Py_ssize_t i, k = 0;
9774 
9775     for (i = 0; i < length; i++) {
9776         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9777         int n_res, j;
9778         if (lower)
9779             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9780         else
9781             n_res = _PyUnicode_ToUpperFull(c, mapped);
9782         for (j = 0; j < n_res; j++) {
9783             *maxchar = Py_MAX(*maxchar, mapped[j]);
9784             res[k++] = mapped[j];
9785         }
9786     }
9787     return k;
9788 }
9789 
9790 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9791 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792 {
9793     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9794 }
9795 
9796 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9797 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798 {
9799     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9800 }
9801 
9802 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9803 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9804 {
9805     Py_ssize_t i, k = 0;
9806 
9807     for (i = 0; i < length; i++) {
9808         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9809         Py_UCS4 mapped[3];
9810         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9811         for (j = 0; j < n_res; j++) {
9812             *maxchar = Py_MAX(*maxchar, mapped[j]);
9813             res[k++] = mapped[j];
9814         }
9815     }
9816     return k;
9817 }
9818 
9819 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9820 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821 {
9822     Py_ssize_t i, k = 0;
9823     int previous_is_cased;
9824 
9825     previous_is_cased = 0;
9826     for (i = 0; i < length; i++) {
9827         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9828         Py_UCS4 mapped[3];
9829         int n_res, j;
9830 
9831         if (previous_is_cased)
9832             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9833         else
9834             n_res = _PyUnicode_ToTitleFull(c, mapped);
9835 
9836         for (j = 0; j < n_res; j++) {
9837             *maxchar = Py_MAX(*maxchar, mapped[j]);
9838             res[k++] = mapped[j];
9839         }
9840 
9841         previous_is_cased = _PyUnicode_IsCased(c);
9842     }
9843     return k;
9844 }
9845 
9846 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9847 case_operation(PyObject *self,
9848                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9849 {
9850     PyObject *res = NULL;
9851     Py_ssize_t length, newlength = 0;
9852     int kind, outkind;
9853     void *data, *outdata;
9854     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9855 
9856     assert(PyUnicode_IS_READY(self));
9857 
9858     kind = PyUnicode_KIND(self);
9859     data = PyUnicode_DATA(self);
9860     length = PyUnicode_GET_LENGTH(self);
9861     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9862         PyErr_SetString(PyExc_OverflowError, "string is too long");
9863         return NULL;
9864     }
9865     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9866     if (tmp == NULL)
9867         return PyErr_NoMemory();
9868     newlength = perform(kind, data, length, tmp, &maxchar);
9869     res = PyUnicode_New(newlength, maxchar);
9870     if (res == NULL)
9871         goto leave;
9872     tmpend = tmp + newlength;
9873     outdata = PyUnicode_DATA(res);
9874     outkind = PyUnicode_KIND(res);
9875     switch (outkind) {
9876     case PyUnicode_1BYTE_KIND:
9877         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9878         break;
9879     case PyUnicode_2BYTE_KIND:
9880         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9881         break;
9882     case PyUnicode_4BYTE_KIND:
9883         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9884         break;
9885     default:
9886         assert(0);
9887         break;
9888     }
9889   leave:
9890     PyMem_FREE(tmp);
9891     return res;
9892 }
9893 
9894 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9895 PyUnicode_Join(PyObject *separator, PyObject *seq)
9896 {
9897     PyObject *res;
9898     PyObject *fseq;
9899     Py_ssize_t seqlen;
9900     PyObject **items;
9901 
9902     fseq = PySequence_Fast(seq, "can only join an iterable");
9903     if (fseq == NULL) {
9904         return NULL;
9905     }
9906 
9907     /* NOTE: the following code can't call back into Python code,
9908      * so we are sure that fseq won't be mutated.
9909      */
9910 
9911     items = PySequence_Fast_ITEMS(fseq);
9912     seqlen = PySequence_Fast_GET_SIZE(fseq);
9913     res = _PyUnicode_JoinArray(separator, items, seqlen);
9914     Py_DECREF(fseq);
9915     return res;
9916 }
9917 
/* Join the seqlen objects in items, inserting separator between
   consecutive items, and return the result as a new reference.

   separator may be NULL, in which case a single blank space is used.
   An empty array yields the empty string, and a single exact str item is
   returned itself (with a new reference).

   On failure NULL is returned with an exception set: TypeError when the
   separator or an item is not a str, OverflowError when the total length
   does not fit in Py_ssize_t, or whatever error the readiness checks and
   allocations report. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A lone non-exact item never needs a separator; sep stays NULL. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* Seed the kind comparison below with the separator, so that an
           item whose kind differs from it disables the memcpy path. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by one separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Check before adding so that sz itself can never overflow. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* memcpy stays enabled only while each string has the same kind
           as the one before it. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: raw byte copies; kind is used as the number of bytes
           per character. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: copy through _PyUnicode_FastCopyCharacters(),
           tracking the write position in characters. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* sep is NULL in the single-item case, hence the X variant. */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10089 
/* Store value into length consecutive characters of the buffer data,
   beginning at character index start.  kind selects the character width
   and must be one of the 1, 2 or 4 byte kinds (never
   PyUnicode_WCHAR_KIND).

   Every argument is parenthesized where it is used, so expression
   arguments behave as a single operand.  Note that value and length are
   expanded in more than one place: avoid arguments with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10113 
10114 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10115 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10116                     Py_UCS4 fill_char)
10117 {
10118     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10119     const void *data = PyUnicode_DATA(unicode);
10120     assert(PyUnicode_IS_READY(unicode));
10121     assert(unicode_modifiable(unicode));
10122     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10123     assert(start >= 0);
10124     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10125     FILL(kind, data, fill_char, start, length);
10126 }
10127 
10128 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10129 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10130                Py_UCS4 fill_char)
10131 {
10132     Py_ssize_t maxlen;
10133 
10134     if (!PyUnicode_Check(unicode)) {
10135         PyErr_BadInternalCall();
10136         return -1;
10137     }
10138     if (PyUnicode_READY(unicode) == -1)
10139         return -1;
10140     if (unicode_check_modifiable(unicode))
10141         return -1;
10142 
10143     if (start < 0) {
10144         PyErr_SetString(PyExc_IndexError, "string index out of range");
10145         return -1;
10146     }
10147     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10148         PyErr_SetString(PyExc_ValueError,
10149                          "fill character is bigger than "
10150                          "the string maximum character");
10151         return -1;
10152     }
10153 
10154     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10155     length = Py_MIN(maxlen, length);
10156     if (length <= 0)
10157         return 0;
10158 
10159     _PyUnicode_FastFill(unicode, start, length, fill_char);
10160     return length;
10161 }
10162 
10163 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10164 pad(PyObject *self,
10165     Py_ssize_t left,
10166     Py_ssize_t right,
10167     Py_UCS4 fill)
10168 {
10169     PyObject *u;
10170     Py_UCS4 maxchar;
10171     int kind;
10172     void *data;
10173 
10174     if (left < 0)
10175         left = 0;
10176     if (right < 0)
10177         right = 0;
10178 
10179     if (left == 0 && right == 0)
10180         return unicode_result_unchanged(self);
10181 
10182     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10183         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10184         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10185         return NULL;
10186     }
10187     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10188     maxchar = Py_MAX(maxchar, fill);
10189     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10190     if (!u)
10191         return NULL;
10192 
10193     kind = PyUnicode_KIND(u);
10194     data = PyUnicode_DATA(u);
10195     if (left)
10196         FILL(kind, data, fill, 0, left);
10197     if (right)
10198         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10199     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10200     assert(_PyUnicode_CheckConsistency(u, 1));
10201     return u;
10202 }
10203 
10204 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10205 PyUnicode_Splitlines(PyObject *string, int keepends)
10206 {
10207     PyObject *list;
10208 
10209     if (ensure_unicode(string) < 0)
10210         return NULL;
10211 
10212     switch (PyUnicode_KIND(string)) {
10213     case PyUnicode_1BYTE_KIND:
10214         if (PyUnicode_IS_ASCII(string))
10215             list = asciilib_splitlines(
10216                 string, PyUnicode_1BYTE_DATA(string),
10217                 PyUnicode_GET_LENGTH(string), keepends);
10218         else
10219             list = ucs1lib_splitlines(
10220                 string, PyUnicode_1BYTE_DATA(string),
10221                 PyUnicode_GET_LENGTH(string), keepends);
10222         break;
10223     case PyUnicode_2BYTE_KIND:
10224         list = ucs2lib_splitlines(
10225             string, PyUnicode_2BYTE_DATA(string),
10226             PyUnicode_GET_LENGTH(string), keepends);
10227         break;
10228     case PyUnicode_4BYTE_KIND:
10229         list = ucs4lib_splitlines(
10230             string, PyUnicode_4BYTE_DATA(string),
10231             PyUnicode_GET_LENGTH(string), keepends);
10232         break;
10233     default:
10234         assert(0);
10235         list = 0;
10236     }
10237     return list;
10238 }
10239 
10240 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10241 split(PyObject *self,
10242       PyObject *substring,
10243       Py_ssize_t maxcount)
10244 {
10245     int kind1, kind2;
10246     void *buf1, *buf2;
10247     Py_ssize_t len1, len2;
10248     PyObject* out;
10249 
10250     if (maxcount < 0)
10251         maxcount = PY_SSIZE_T_MAX;
10252 
10253     if (PyUnicode_READY(self) == -1)
10254         return NULL;
10255 
10256     if (substring == NULL)
10257         switch (PyUnicode_KIND(self)) {
10258         case PyUnicode_1BYTE_KIND:
10259             if (PyUnicode_IS_ASCII(self))
10260                 return asciilib_split_whitespace(
10261                     self,  PyUnicode_1BYTE_DATA(self),
10262                     PyUnicode_GET_LENGTH(self), maxcount
10263                     );
10264             else
10265                 return ucs1lib_split_whitespace(
10266                     self,  PyUnicode_1BYTE_DATA(self),
10267                     PyUnicode_GET_LENGTH(self), maxcount
10268                     );
10269         case PyUnicode_2BYTE_KIND:
10270             return ucs2lib_split_whitespace(
10271                 self,  PyUnicode_2BYTE_DATA(self),
10272                 PyUnicode_GET_LENGTH(self), maxcount
10273                 );
10274         case PyUnicode_4BYTE_KIND:
10275             return ucs4lib_split_whitespace(
10276                 self,  PyUnicode_4BYTE_DATA(self),
10277                 PyUnicode_GET_LENGTH(self), maxcount
10278                 );
10279         default:
10280             assert(0);
10281             return NULL;
10282         }
10283 
10284     if (PyUnicode_READY(substring) == -1)
10285         return NULL;
10286 
10287     kind1 = PyUnicode_KIND(self);
10288     kind2 = PyUnicode_KIND(substring);
10289     len1 = PyUnicode_GET_LENGTH(self);
10290     len2 = PyUnicode_GET_LENGTH(substring);
10291     if (kind1 < kind2 || len1 < len2) {
10292         out = PyList_New(1);
10293         if (out == NULL)
10294             return NULL;
10295         Py_INCREF(self);
10296         PyList_SET_ITEM(out, 0, self);
10297         return out;
10298     }
10299     buf1 = PyUnicode_DATA(self);
10300     buf2 = PyUnicode_DATA(substring);
10301     if (kind2 != kind1) {
10302         buf2 = _PyUnicode_AsKind(substring, kind1);
10303         if (!buf2)
10304             return NULL;
10305     }
10306 
10307     switch (kind1) {
10308     case PyUnicode_1BYTE_KIND:
10309         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10310             out = asciilib_split(
10311                 self,  buf1, len1, buf2, len2, maxcount);
10312         else
10313             out = ucs1lib_split(
10314                 self,  buf1, len1, buf2, len2, maxcount);
10315         break;
10316     case PyUnicode_2BYTE_KIND:
10317         out = ucs2lib_split(
10318             self,  buf1, len1, buf2, len2, maxcount);
10319         break;
10320     case PyUnicode_4BYTE_KIND:
10321         out = ucs4lib_split(
10322             self,  buf1, len1, buf2, len2, maxcount);
10323         break;
10324     default:
10325         out = NULL;
10326     }
10327     if (kind2 != kind1)
10328         PyMem_Free(buf2);
10329     return out;
10330 }
10331 
10332 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10333 rsplit(PyObject *self,
10334        PyObject *substring,
10335        Py_ssize_t maxcount)
10336 {
10337     int kind1, kind2;
10338     void *buf1, *buf2;
10339     Py_ssize_t len1, len2;
10340     PyObject* out;
10341 
10342     if (maxcount < 0)
10343         maxcount = PY_SSIZE_T_MAX;
10344 
10345     if (PyUnicode_READY(self) == -1)
10346         return NULL;
10347 
10348     if (substring == NULL)
10349         switch (PyUnicode_KIND(self)) {
10350         case PyUnicode_1BYTE_KIND:
10351             if (PyUnicode_IS_ASCII(self))
10352                 return asciilib_rsplit_whitespace(
10353                     self,  PyUnicode_1BYTE_DATA(self),
10354                     PyUnicode_GET_LENGTH(self), maxcount
10355                     );
10356             else
10357                 return ucs1lib_rsplit_whitespace(
10358                     self,  PyUnicode_1BYTE_DATA(self),
10359                     PyUnicode_GET_LENGTH(self), maxcount
10360                     );
10361         case PyUnicode_2BYTE_KIND:
10362             return ucs2lib_rsplit_whitespace(
10363                 self,  PyUnicode_2BYTE_DATA(self),
10364                 PyUnicode_GET_LENGTH(self), maxcount
10365                 );
10366         case PyUnicode_4BYTE_KIND:
10367             return ucs4lib_rsplit_whitespace(
10368                 self,  PyUnicode_4BYTE_DATA(self),
10369                 PyUnicode_GET_LENGTH(self), maxcount
10370                 );
10371         default:
10372             assert(0);
10373             return NULL;
10374         }
10375 
10376     if (PyUnicode_READY(substring) == -1)
10377         return NULL;
10378 
10379     kind1 = PyUnicode_KIND(self);
10380     kind2 = PyUnicode_KIND(substring);
10381     len1 = PyUnicode_GET_LENGTH(self);
10382     len2 = PyUnicode_GET_LENGTH(substring);
10383     if (kind1 < kind2 || len1 < len2) {
10384         out = PyList_New(1);
10385         if (out == NULL)
10386             return NULL;
10387         Py_INCREF(self);
10388         PyList_SET_ITEM(out, 0, self);
10389         return out;
10390     }
10391     buf1 = PyUnicode_DATA(self);
10392     buf2 = PyUnicode_DATA(substring);
10393     if (kind2 != kind1) {
10394         buf2 = _PyUnicode_AsKind(substring, kind1);
10395         if (!buf2)
10396             return NULL;
10397     }
10398 
10399     switch (kind1) {
10400     case PyUnicode_1BYTE_KIND:
10401         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10402             out = asciilib_rsplit(
10403                 self,  buf1, len1, buf2, len2, maxcount);
10404         else
10405             out = ucs1lib_rsplit(
10406                 self,  buf1, len1, buf2, len2, maxcount);
10407         break;
10408     case PyUnicode_2BYTE_KIND:
10409         out = ucs2lib_rsplit(
10410             self,  buf1, len1, buf2, len2, maxcount);
10411         break;
10412     case PyUnicode_4BYTE_KIND:
10413         out = ucs4lib_rsplit(
10414             self,  buf1, len1, buf2, len2, maxcount);
10415         break;
10416     default:
10417         out = NULL;
10418     }
10419     if (kind2 != kind1)
10420         PyMem_Free(buf2);
10421     return out;
10422 }
10423 
10424 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10425 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10426             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10427 {
10428     switch (kind) {
10429     case PyUnicode_1BYTE_KIND:
10430         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10431             return asciilib_find(buf1, len1, buf2, len2, offset);
10432         else
10433             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10434     case PyUnicode_2BYTE_KIND:
10435         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10436     case PyUnicode_4BYTE_KIND:
10437         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10438     }
10439     assert(0);
10440     return -1;
10441 }
10442 
10443 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10444 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10445              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10446 {
10447     switch (kind) {
10448     case PyUnicode_1BYTE_KIND:
10449         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10450             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10451         else
10452             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10453     case PyUnicode_2BYTE_KIND:
10454         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10455     case PyUnicode_4BYTE_KIND:
10456         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10457     }
10458     assert(0);
10459     return 0;
10460 }
10461 
10462 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10463 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10464                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10465 {
10466     int kind = PyUnicode_KIND(u);
10467     void *data = PyUnicode_DATA(u);
10468     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10469     if (kind == PyUnicode_1BYTE_KIND) {
10470         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10471                                       (Py_UCS1 *)data + len,
10472                                       u1, u2, maxcount);
10473     }
10474     else if (kind == PyUnicode_2BYTE_KIND) {
10475         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10476                                       (Py_UCS2 *)data + len,
10477                                       u1, u2, maxcount);
10478     }
10479     else {
10480         assert(kind == PyUnicode_4BYTE_KIND);
10481         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10482                                       (Py_UCS4 *)data + len,
10483                                       u1, u2, maxcount);
10484     }
10485 }
10486 
10487 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10488 replace(PyObject *self, PyObject *str1,
10489         PyObject *str2, Py_ssize_t maxcount)
10490 {
10491     PyObject *u;
10492     char *sbuf = PyUnicode_DATA(self);
10493     char *buf1 = PyUnicode_DATA(str1);
10494     char *buf2 = PyUnicode_DATA(str2);
10495     int srelease = 0, release1 = 0, release2 = 0;
10496     int skind = PyUnicode_KIND(self);
10497     int kind1 = PyUnicode_KIND(str1);
10498     int kind2 = PyUnicode_KIND(str2);
10499     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10500     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10501     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10502     int mayshrink;
10503     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10504 
10505     if (maxcount < 0)
10506         maxcount = PY_SSIZE_T_MAX;
10507     else if (maxcount == 0 || slen == 0)
10508         goto nothing;
10509 
10510     if (str1 == str2)
10511         goto nothing;
10512 
10513     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10514     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10515     if (maxchar < maxchar_str1)
10516         /* substring too wide to be present */
10517         goto nothing;
10518     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10519     /* Replacing str1 with str2 may cause a maxchar reduction in the
10520        result string. */
10521     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10522     maxchar = Py_MAX(maxchar, maxchar_str2);
10523 
10524     if (len1 == len2) {
10525         /* same length */
10526         if (len1 == 0)
10527             goto nothing;
10528         if (len1 == 1) {
10529             /* replace characters */
10530             Py_UCS4 u1, u2;
10531             Py_ssize_t pos;
10532 
10533             u1 = PyUnicode_READ(kind1, buf1, 0);
10534             pos = findchar(sbuf, skind, slen, u1, 1);
10535             if (pos < 0)
10536                 goto nothing;
10537             u2 = PyUnicode_READ(kind2, buf2, 0);
10538             u = PyUnicode_New(slen, maxchar);
10539             if (!u)
10540                 goto error;
10541 
10542             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10543             replace_1char_inplace(u, pos, u1, u2, maxcount);
10544         }
10545         else {
10546             int rkind = skind;
10547             char *res;
10548             Py_ssize_t i;
10549 
10550             if (kind1 < rkind) {
10551                 /* widen substring */
10552                 buf1 = _PyUnicode_AsKind(str1, rkind);
10553                 if (!buf1) goto error;
10554                 release1 = 1;
10555             }
10556             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10557             if (i < 0)
10558                 goto nothing;
10559             if (rkind > kind2) {
10560                 /* widen replacement */
10561                 buf2 = _PyUnicode_AsKind(str2, rkind);
10562                 if (!buf2) goto error;
10563                 release2 = 1;
10564             }
10565             else if (rkind < kind2) {
10566                 /* widen self and buf1 */
10567                 rkind = kind2;
10568                 if (release1) PyMem_Free(buf1);
10569                 release1 = 0;
10570                 sbuf = _PyUnicode_AsKind(self, rkind);
10571                 if (!sbuf) goto error;
10572                 srelease = 1;
10573                 buf1 = _PyUnicode_AsKind(str1, rkind);
10574                 if (!buf1) goto error;
10575                 release1 = 1;
10576             }
10577             u = PyUnicode_New(slen, maxchar);
10578             if (!u)
10579                 goto error;
10580             assert(PyUnicode_KIND(u) == rkind);
10581             res = PyUnicode_DATA(u);
10582 
10583             memcpy(res, sbuf, rkind * slen);
10584             /* change everything in-place, starting with this one */
10585             memcpy(res + rkind * i,
10586                    buf2,
10587                    rkind * len2);
10588             i += len1;
10589 
10590             while ( --maxcount > 0) {
10591                 i = anylib_find(rkind, self,
10592                                 sbuf+rkind*i, slen-i,
10593                                 str1, buf1, len1, i);
10594                 if (i == -1)
10595                     break;
10596                 memcpy(res + rkind * i,
10597                        buf2,
10598                        rkind * len2);
10599                 i += len1;
10600             }
10601         }
10602     }
10603     else {
10604         Py_ssize_t n, i, j, ires;
10605         Py_ssize_t new_size;
10606         int rkind = skind;
10607         char *res;
10608 
10609         if (kind1 < rkind) {
10610             /* widen substring */
10611             buf1 = _PyUnicode_AsKind(str1, rkind);
10612             if (!buf1) goto error;
10613             release1 = 1;
10614         }
10615         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10616         if (n == 0)
10617             goto nothing;
10618         if (kind2 < rkind) {
10619             /* widen replacement */
10620             buf2 = _PyUnicode_AsKind(str2, rkind);
10621             if (!buf2) goto error;
10622             release2 = 1;
10623         }
10624         else if (kind2 > rkind) {
10625             /* widen self and buf1 */
10626             rkind = kind2;
10627             sbuf = _PyUnicode_AsKind(self, rkind);
10628             if (!sbuf) goto error;
10629             srelease = 1;
10630             if (release1) PyMem_Free(buf1);
10631             release1 = 0;
10632             buf1 = _PyUnicode_AsKind(str1, rkind);
10633             if (!buf1) goto error;
10634             release1 = 1;
10635         }
10636         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10637            PyUnicode_GET_LENGTH(str1))); */
10638         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10639                 PyErr_SetString(PyExc_OverflowError,
10640                                 "replace string is too long");
10641                 goto error;
10642         }
10643         new_size = slen + n * (len2 - len1);
10644         if (new_size == 0) {
10645             _Py_INCREF_UNICODE_EMPTY();
10646             if (!unicode_empty)
10647                 goto error;
10648             u = unicode_empty;
10649             goto done;
10650         }
10651         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10652             PyErr_SetString(PyExc_OverflowError,
10653                             "replace string is too long");
10654             goto error;
10655         }
10656         u = PyUnicode_New(new_size, maxchar);
10657         if (!u)
10658             goto error;
10659         assert(PyUnicode_KIND(u) == rkind);
10660         res = PyUnicode_DATA(u);
10661         ires = i = 0;
10662         if (len1 > 0) {
10663             while (n-- > 0) {
10664                 /* look for next match */
10665                 j = anylib_find(rkind, self,
10666                                 sbuf + rkind * i, slen-i,
10667                                 str1, buf1, len1, i);
10668                 if (j == -1)
10669                     break;
10670                 else if (j > i) {
10671                     /* copy unchanged part [i:j] */
10672                     memcpy(res + rkind * ires,
10673                            sbuf + rkind * i,
10674                            rkind * (j-i));
10675                     ires += j - i;
10676                 }
10677                 /* copy substitution string */
10678                 if (len2 > 0) {
10679                     memcpy(res + rkind * ires,
10680                            buf2,
10681                            rkind * len2);
10682                     ires += len2;
10683                 }
10684                 i = j + len1;
10685             }
10686             if (i < slen)
10687                 /* copy tail [i:] */
10688                 memcpy(res + rkind * ires,
10689                        sbuf + rkind * i,
10690                        rkind * (slen-i));
10691         }
10692         else {
10693             /* interleave */
10694             while (n > 0) {
10695                 memcpy(res + rkind * ires,
10696                        buf2,
10697                        rkind * len2);
10698                 ires += len2;
10699                 if (--n <= 0)
10700                     break;
10701                 memcpy(res + rkind * ires,
10702                        sbuf + rkind * i,
10703                        rkind);
10704                 ires++;
10705                 i++;
10706             }
10707             memcpy(res + rkind * ires,
10708                    sbuf + rkind * i,
10709                    rkind * (slen-i));
10710         }
10711     }
10712 
10713     if (mayshrink) {
10714         unicode_adjust_maxchar(&u);
10715         if (u == NULL)
10716             goto error;
10717     }
10718 
10719   done:
10720     if (srelease)
10721         PyMem_FREE(sbuf);
10722     if (release1)
10723         PyMem_FREE(buf1);
10724     if (release2)
10725         PyMem_FREE(buf2);
10726     assert(_PyUnicode_CheckConsistency(u, 1));
10727     return u;
10728 
10729   nothing:
10730     /* nothing to replace; return original string (when possible) */
10731     if (srelease)
10732         PyMem_FREE(sbuf);
10733     if (release1)
10734         PyMem_FREE(buf1);
10735     if (release2)
10736         PyMem_FREE(buf2);
10737     return unicode_result_unchanged(self);
10738 
10739   error:
10740     if (srelease && sbuf)
10741         PyMem_FREE(sbuf);
10742     if (release1 && buf1)
10743         PyMem_FREE(buf1);
10744     if (release2 && buf2)
10745         PyMem_FREE(buf2);
10746     return NULL;
10747 }
10748 
10749 /* --- Unicode Object Methods --------------------------------------------- */
10750 
10751 PyDoc_STRVAR(title__doc__,
10752              "S.title() -> str\n\
10753 \n\
10754 Return a titlecased version of S, i.e. words start with title case\n\
10755 characters, all remaining cased characters have lower case.");
10756 
10757 static PyObject*
unicode_title(PyObject * self)10758 unicode_title(PyObject *self)
10759 {
10760     if (PyUnicode_READY(self) == -1)
10761         return NULL;
10762     return case_operation(self, do_title);
10763 }
10764 
10765 PyDoc_STRVAR(capitalize__doc__,
10766              "S.capitalize() -> str\n\
10767 \n\
10768 Return a capitalized version of S, i.e. make the first character\n\
10769 have upper case and the rest lower case.");
10770 
10771 static PyObject*
unicode_capitalize(PyObject * self)10772 unicode_capitalize(PyObject *self)
10773 {
10774     if (PyUnicode_READY(self) == -1)
10775         return NULL;
10776     if (PyUnicode_GET_LENGTH(self) == 0)
10777         return unicode_result_unchanged(self);
10778     return case_operation(self, do_capitalize);
10779 }
10780 
10781 PyDoc_STRVAR(casefold__doc__,
10782              "S.casefold() -> str\n\
10783 \n\
10784 Return a version of S suitable for caseless comparisons.");
10785 
10786 static PyObject *
unicode_casefold(PyObject * self)10787 unicode_casefold(PyObject *self)
10788 {
10789     if (PyUnicode_READY(self) == -1)
10790         return NULL;
10791     if (PyUnicode_IS_ASCII(self))
10792         return ascii_upper_or_lower(self, 1);
10793     return case_operation(self, do_casefold);
10794 }
10795 
10796 
10797 /* Argument converter. Accepts a single Unicode character. */
10798 
10799 static int
convert_uc(PyObject * obj,void * addr)10800 convert_uc(PyObject *obj, void *addr)
10801 {
10802     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10803 
10804     if (!PyUnicode_Check(obj)) {
10805         PyErr_Format(PyExc_TypeError,
10806                      "The fill character must be a unicode character, "
10807                      "not %.100s", Py_TYPE(obj)->tp_name);
10808         return 0;
10809     }
10810     if (PyUnicode_READY(obj) < 0)
10811         return 0;
10812     if (PyUnicode_GET_LENGTH(obj) != 1) {
10813         PyErr_SetString(PyExc_TypeError,
10814                         "The fill character must be exactly one character long");
10815         return 0;
10816     }
10817     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10818     return 1;
10819 }
10820 
10821 PyDoc_STRVAR(center__doc__,
10822              "S.center(width[, fillchar]) -> str\n\
10823 \n\
10824 Return S centered in a string of length width. Padding is\n\
10825 done using the specified fill character (default is a space)");
10826 
10827 static PyObject *
unicode_center(PyObject * self,PyObject * args)10828 unicode_center(PyObject *self, PyObject *args)
10829 {
10830     Py_ssize_t marg, left;
10831     Py_ssize_t width;
10832     Py_UCS4 fillchar = ' ';
10833 
10834     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10835         return NULL;
10836 
10837     if (PyUnicode_READY(self) == -1)
10838         return NULL;
10839 
10840     if (PyUnicode_GET_LENGTH(self) >= width)
10841         return unicode_result_unchanged(self);
10842 
10843     marg = width - PyUnicode_GET_LENGTH(self);
10844     left = marg / 2 + (marg & width & 1);
10845 
10846     return pad(self, left, marg - left, fillchar);
10847 }
10848 
10849 /* This function assumes that str1 and str2 are readied by the caller. */
10850 
10851 static int
unicode_compare(PyObject * str1,PyObject * str2)10852 unicode_compare(PyObject *str1, PyObject *str2)
10853 {
10854 #define COMPARE(TYPE1, TYPE2) \
10855     do { \
10856         TYPE1* p1 = (TYPE1 *)data1; \
10857         TYPE2* p2 = (TYPE2 *)data2; \
10858         TYPE1* end = p1 + len; \
10859         Py_UCS4 c1, c2; \
10860         for (; p1 != end; p1++, p2++) { \
10861             c1 = *p1; \
10862             c2 = *p2; \
10863             if (c1 != c2) \
10864                 return (c1 < c2) ? -1 : 1; \
10865         } \
10866     } \
10867     while (0)
10868 
10869     int kind1, kind2;
10870     void *data1, *data2;
10871     Py_ssize_t len1, len2, len;
10872 
10873     kind1 = PyUnicode_KIND(str1);
10874     kind2 = PyUnicode_KIND(str2);
10875     data1 = PyUnicode_DATA(str1);
10876     data2 = PyUnicode_DATA(str2);
10877     len1 = PyUnicode_GET_LENGTH(str1);
10878     len2 = PyUnicode_GET_LENGTH(str2);
10879     len = Py_MIN(len1, len2);
10880 
10881     switch(kind1) {
10882     case PyUnicode_1BYTE_KIND:
10883     {
10884         switch(kind2) {
10885         case PyUnicode_1BYTE_KIND:
10886         {
10887             int cmp = memcmp(data1, data2, len);
10888             /* normalize result of memcmp() into the range [-1; 1] */
10889             if (cmp < 0)
10890                 return -1;
10891             if (cmp > 0)
10892                 return 1;
10893             break;
10894         }
10895         case PyUnicode_2BYTE_KIND:
10896             COMPARE(Py_UCS1, Py_UCS2);
10897             break;
10898         case PyUnicode_4BYTE_KIND:
10899             COMPARE(Py_UCS1, Py_UCS4);
10900             break;
10901         default:
10902             assert(0);
10903         }
10904         break;
10905     }
10906     case PyUnicode_2BYTE_KIND:
10907     {
10908         switch(kind2) {
10909         case PyUnicode_1BYTE_KIND:
10910             COMPARE(Py_UCS2, Py_UCS1);
10911             break;
10912         case PyUnicode_2BYTE_KIND:
10913         {
10914             COMPARE(Py_UCS2, Py_UCS2);
10915             break;
10916         }
10917         case PyUnicode_4BYTE_KIND:
10918             COMPARE(Py_UCS2, Py_UCS4);
10919             break;
10920         default:
10921             assert(0);
10922         }
10923         break;
10924     }
10925     case PyUnicode_4BYTE_KIND:
10926     {
10927         switch(kind2) {
10928         case PyUnicode_1BYTE_KIND:
10929             COMPARE(Py_UCS4, Py_UCS1);
10930             break;
10931         case PyUnicode_2BYTE_KIND:
10932             COMPARE(Py_UCS4, Py_UCS2);
10933             break;
10934         case PyUnicode_4BYTE_KIND:
10935         {
10936 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10937             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10938             /* normalize result of wmemcmp() into the range [-1; 1] */
10939             if (cmp < 0)
10940                 return -1;
10941             if (cmp > 0)
10942                 return 1;
10943 #else
10944             COMPARE(Py_UCS4, Py_UCS4);
10945 #endif
10946             break;
10947         }
10948         default:
10949             assert(0);
10950         }
10951         break;
10952     }
10953     default:
10954         assert(0);
10955     }
10956 
10957     if (len1 == len2)
10958         return 0;
10959     if (len1 < len2)
10960         return -1;
10961     else
10962         return 1;
10963 
10964 #undef COMPARE
10965 }
10966 
10967 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10968 unicode_compare_eq(PyObject *str1, PyObject *str2)
10969 {
10970     int kind;
10971     void *data1, *data2;
10972     Py_ssize_t len;
10973     int cmp;
10974 
10975     len = PyUnicode_GET_LENGTH(str1);
10976     if (PyUnicode_GET_LENGTH(str2) != len)
10977         return 0;
10978     kind = PyUnicode_KIND(str1);
10979     if (PyUnicode_KIND(str2) != kind)
10980         return 0;
10981     data1 = PyUnicode_DATA(str1);
10982     data2 = PyUnicode_DATA(str2);
10983 
10984     cmp = memcmp(data1, data2, len * kind);
10985     return (cmp == 0);
10986 }
10987 
10988 
10989 int
PyUnicode_Compare(PyObject * left,PyObject * right)10990 PyUnicode_Compare(PyObject *left, PyObject *right)
10991 {
10992     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10993         if (PyUnicode_READY(left) == -1 ||
10994             PyUnicode_READY(right) == -1)
10995             return -1;
10996 
10997         /* a string is equal to itself */
10998         if (left == right)
10999             return 0;
11000 
11001         return unicode_compare(left, right);
11002     }
11003     PyErr_Format(PyExc_TypeError,
11004                  "Can't compare %.100s and %.100s",
11005                  left->ob_type->tp_name,
11006                  right->ob_type->tp_name);
11007     return -1;
11008 }
11009 
11010 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11011 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11012 {
11013     Py_ssize_t i;
11014     int kind;
11015     Py_UCS4 chr;
11016     const unsigned char *ustr = (const unsigned char *)str;
11017 
11018     assert(_PyUnicode_CHECK(uni));
11019     if (!PyUnicode_IS_READY(uni)) {
11020         const wchar_t *ws = _PyUnicode_WSTR(uni);
11021         /* Compare Unicode string and source character set string */
11022         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11023             if (chr != ustr[i])
11024                 return (chr < ustr[i]) ? -1 : 1;
11025         }
11026         /* This check keeps Python strings that end in '\0' from comparing equal
11027          to C strings identical up to that point. */
11028         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11029             return 1; /* uni is longer */
11030         if (ustr[i])
11031             return -1; /* str is longer */
11032         return 0;
11033     }
11034     kind = PyUnicode_KIND(uni);
11035     if (kind == PyUnicode_1BYTE_KIND) {
11036         const void *data = PyUnicode_1BYTE_DATA(uni);
11037         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11038         size_t len, len2 = strlen(str);
11039         int cmp;
11040 
11041         len = Py_MIN(len1, len2);
11042         cmp = memcmp(data, str, len);
11043         if (cmp != 0) {
11044             if (cmp < 0)
11045                 return -1;
11046             else
11047                 return 1;
11048         }
11049         if (len1 > len2)
11050             return 1; /* uni is longer */
11051         if (len1 < len2)
11052             return -1; /* str is longer */
11053         return 0;
11054     }
11055     else {
11056         void *data = PyUnicode_DATA(uni);
11057         /* Compare Unicode string and source character set string */
11058         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11059             if (chr != (unsigned char)str[i])
11060                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11061         /* This check keeps Python strings that end in '\0' from comparing equal
11062          to C strings identical up to that point. */
11063         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11064             return 1; /* uni is longer */
11065         if (str[i])
11066             return -1; /* str is longer */
11067         return 0;
11068     }
11069 }
11070 
11071 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11072 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11073 {
11074     size_t i, len;
11075     const wchar_t *p;
11076     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11077     if (strlen(str) != len)
11078         return 0;
11079     p = _PyUnicode_WSTR(unicode);
11080     assert(p);
11081     for (i = 0; i < len; i++) {
11082         unsigned char c = (unsigned char)str[i];
11083         if (c >= 128 || p[i] != (wchar_t)c)
11084             return 0;
11085     }
11086     return 1;
11087 }
11088 
11089 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11090 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11091 {
11092     size_t len;
11093     assert(_PyUnicode_CHECK(unicode));
11094     assert(str);
11095 #ifndef NDEBUG
11096     for (const char *p = str; *p; p++) {
11097         assert((unsigned char)*p < 128);
11098     }
11099 #endif
11100     if (PyUnicode_READY(unicode) == -1) {
11101         /* Memory error or bad data */
11102         PyErr_Clear();
11103         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11104     }
11105     if (!PyUnicode_IS_ASCII(unicode))
11106         return 0;
11107     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11108     return strlen(str) == len &&
11109            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11110 }
11111 
11112 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11113 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11114 {
11115     PyObject *right_uni;
11116     Py_hash_t hash;
11117 
11118     assert(_PyUnicode_CHECK(left));
11119     assert(right->string);
11120 #ifndef NDEBUG
11121     for (const char *p = right->string; *p; p++) {
11122         assert((unsigned char)*p < 128);
11123     }
11124 #endif
11125 
11126     if (PyUnicode_READY(left) == -1) {
11127         /* memory error or bad data */
11128         PyErr_Clear();
11129         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11130     }
11131 
11132     if (!PyUnicode_IS_ASCII(left))
11133         return 0;
11134 
11135     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11136     if (right_uni == NULL) {
11137         /* memory error or bad data */
11138         PyErr_Clear();
11139         return _PyUnicode_EqualToASCIIString(left, right->string);
11140     }
11141 
11142     if (left == right_uni)
11143         return 1;
11144 
11145     if (PyUnicode_CHECK_INTERNED(left))
11146         return 0;
11147 
11148     assert(_PyUnicode_HASH(right_uni) != 1);
11149     hash = _PyUnicode_HASH(left);
11150     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11151         return 0;
11152 
11153     return unicode_compare_eq(left, right_uni);
11154 }
11155 
11156 #define TEST_COND(cond)                         \
11157     ((cond) ? Py_True : Py_False)
11158 
11159 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11161 {
11162     int result;
11163     PyObject *v;
11164 
11165     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11166         Py_RETURN_NOTIMPLEMENTED;
11167 
11168     if (PyUnicode_READY(left) == -1 ||
11169         PyUnicode_READY(right) == -1)
11170         return NULL;
11171 
11172     if (left == right) {
11173         switch (op) {
11174         case Py_EQ:
11175         case Py_LE:
11176         case Py_GE:
11177             /* a string is equal to itself */
11178             v = Py_True;
11179             break;
11180         case Py_NE:
11181         case Py_LT:
11182         case Py_GT:
11183             v = Py_False;
11184             break;
11185         default:
11186             PyErr_BadArgument();
11187             return NULL;
11188         }
11189     }
11190     else if (op == Py_EQ || op == Py_NE) {
11191         result = unicode_compare_eq(left, right);
11192         result ^= (op == Py_NE);
11193         v = TEST_COND(result);
11194     }
11195     else {
11196         result = unicode_compare(left, right);
11197 
11198         /* Convert the return value to a Boolean */
11199         switch (op) {
11200         case Py_LE:
11201             v = TEST_COND(result <= 0);
11202             break;
11203         case Py_GE:
11204             v = TEST_COND(result >= 0);
11205             break;
11206         case Py_LT:
11207             v = TEST_COND(result == -1);
11208             break;
11209         case Py_GT:
11210             v = TEST_COND(result == 1);
11211             break;
11212         default:
11213             PyErr_BadArgument();
11214             return NULL;
11215         }
11216     }
11217     Py_INCREF(v);
11218     return v;
11219 }
11220 
11221 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11222 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11223 {
11224     return unicode_eq(aa, bb);
11225 }
11226 
11227 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11228 PyUnicode_Contains(PyObject *str, PyObject *substr)
11229 {
11230     int kind1, kind2;
11231     void *buf1, *buf2;
11232     Py_ssize_t len1, len2;
11233     int result;
11234 
11235     if (!PyUnicode_Check(substr)) {
11236         PyErr_Format(PyExc_TypeError,
11237                      "'in <string>' requires string as left operand, not %.100s",
11238                      Py_TYPE(substr)->tp_name);
11239         return -1;
11240     }
11241     if (PyUnicode_READY(substr) == -1)
11242         return -1;
11243     if (ensure_unicode(str) < 0)
11244         return -1;
11245 
11246     kind1 = PyUnicode_KIND(str);
11247     kind2 = PyUnicode_KIND(substr);
11248     if (kind1 < kind2)
11249         return 0;
11250     len1 = PyUnicode_GET_LENGTH(str);
11251     len2 = PyUnicode_GET_LENGTH(substr);
11252     if (len1 < len2)
11253         return 0;
11254     buf1 = PyUnicode_DATA(str);
11255     buf2 = PyUnicode_DATA(substr);
11256     if (len2 == 1) {
11257         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259         return result;
11260     }
11261     if (kind2 != kind1) {
11262         buf2 = _PyUnicode_AsKind(substr, kind1);
11263         if (!buf2)
11264             return -1;
11265     }
11266 
11267     switch (kind1) {
11268     case PyUnicode_1BYTE_KIND:
11269         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270         break;
11271     case PyUnicode_2BYTE_KIND:
11272         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273         break;
11274     case PyUnicode_4BYTE_KIND:
11275         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276         break;
11277     default:
11278         result = -1;
11279         assert(0);
11280     }
11281 
11282     if (kind2 != kind1)
11283         PyMem_Free(buf2);
11284 
11285     return result;
11286 }
11287 
11288 /* Concat to string or Unicode object giving a new Unicode object. */
11289 
11290 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11291 PyUnicode_Concat(PyObject *left, PyObject *right)
11292 {
11293     PyObject *result;
11294     Py_UCS4 maxchar, maxchar2;
11295     Py_ssize_t left_len, right_len, new_len;
11296 
11297     if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11298         return NULL;
11299 
11300     /* Shortcuts */
11301     if (left == unicode_empty)
11302         return PyUnicode_FromObject(right);
11303     if (right == unicode_empty)
11304         return PyUnicode_FromObject(left);
11305 
11306     left_len = PyUnicode_GET_LENGTH(left);
11307     right_len = PyUnicode_GET_LENGTH(right);
11308     if (left_len > PY_SSIZE_T_MAX - right_len) {
11309         PyErr_SetString(PyExc_OverflowError,
11310                         "strings are too large to concat");
11311         return NULL;
11312     }
11313     new_len = left_len + right_len;
11314 
11315     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11316     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11317     maxchar = Py_MAX(maxchar, maxchar2);
11318 
11319     /* Concat the two Unicode strings */
11320     result = PyUnicode_New(new_len, maxchar);
11321     if (result == NULL)
11322         return NULL;
11323     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11324     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11325     assert(_PyUnicode_CheckConsistency(result, 1));
11326     return result;
11327 }
11328 
11329 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11330 PyUnicode_Append(PyObject **p_left, PyObject *right)
11331 {
11332     PyObject *left, *res;
11333     Py_UCS4 maxchar, maxchar2;
11334     Py_ssize_t left_len, right_len, new_len;
11335 
11336     if (p_left == NULL) {
11337         if (!PyErr_Occurred())
11338             PyErr_BadInternalCall();
11339         return;
11340     }
11341     left = *p_left;
11342     if (right == NULL || left == NULL
11343         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11344         if (!PyErr_Occurred())
11345             PyErr_BadInternalCall();
11346         goto error;
11347     }
11348 
11349     if (PyUnicode_READY(left) == -1)
11350         goto error;
11351     if (PyUnicode_READY(right) == -1)
11352         goto error;
11353 
11354     /* Shortcuts */
11355     if (left == unicode_empty) {
11356         Py_DECREF(left);
11357         Py_INCREF(right);
11358         *p_left = right;
11359         return;
11360     }
11361     if (right == unicode_empty)
11362         return;
11363 
11364     left_len = PyUnicode_GET_LENGTH(left);
11365     right_len = PyUnicode_GET_LENGTH(right);
11366     if (left_len > PY_SSIZE_T_MAX - right_len) {
11367         PyErr_SetString(PyExc_OverflowError,
11368                         "strings are too large to concat");
11369         goto error;
11370     }
11371     new_len = left_len + right_len;
11372 
11373     if (unicode_modifiable(left)
11374         && PyUnicode_CheckExact(right)
11375         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11376         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11377            to change the structure size, but characters are stored just after
11378            the structure, and so it requires to move all characters which is
11379            not so different than duplicating the string. */
11380         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11381     {
11382         /* append inplace */
11383         if (unicode_resize(p_left, new_len) != 0)
11384             goto error;
11385 
11386         /* copy 'right' into the newly allocated area of 'left' */
11387         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11388     }
11389     else {
11390         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11391         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11392         maxchar = Py_MAX(maxchar, maxchar2);
11393 
11394         /* Concat the two Unicode strings */
11395         res = PyUnicode_New(new_len, maxchar);
11396         if (res == NULL)
11397             goto error;
11398         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11399         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11400         Py_DECREF(left);
11401         *p_left = res;
11402     }
11403     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11404     return;
11405 
11406 error:
11407     Py_CLEAR(*p_left);
11408 }
11409 
11410 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11411 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11412 {
11413     PyUnicode_Append(pleft, right);
11414     Py_XDECREF(right);
11415 }
11416 
11417 /*
11418 Wraps stringlib_parse_args_finds() and additionally ensures that the
11419 first argument is a unicode object.
11420 */
11421 
11422 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11423 parse_args_finds_unicode(const char * function_name, PyObject *args,
11424                          PyObject **substring,
11425                          Py_ssize_t *start, Py_ssize_t *end)
11426 {
11427     if(stringlib_parse_args_finds(function_name, args, substring,
11428                                   start, end)) {
11429         if (ensure_unicode(*substring) < 0)
11430             return 0;
11431         return 1;
11432     }
11433     return 0;
11434 }
11435 
11436 PyDoc_STRVAR(count__doc__,
11437              "S.count(sub[, start[, end]]) -> int\n\
11438 \n\
11439 Return the number of non-overlapping occurrences of substring sub in\n\
11440 string S[start:end].  Optional arguments start and end are\n\
11441 interpreted as in slice notation.");
11442 
11443 static PyObject *
unicode_count(PyObject * self,PyObject * args)11444 unicode_count(PyObject *self, PyObject *args)
11445 {
11446     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11447     Py_ssize_t start = 0;
11448     Py_ssize_t end = PY_SSIZE_T_MAX;
11449     PyObject *result;
11450     int kind1, kind2;
11451     void *buf1, *buf2;
11452     Py_ssize_t len1, len2, iresult;
11453 
11454     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11455         return NULL;
11456 
11457     kind1 = PyUnicode_KIND(self);
11458     kind2 = PyUnicode_KIND(substring);
11459     if (kind1 < kind2)
11460         return PyLong_FromLong(0);
11461 
11462     len1 = PyUnicode_GET_LENGTH(self);
11463     len2 = PyUnicode_GET_LENGTH(substring);
11464     ADJUST_INDICES(start, end, len1);
11465     if (end - start < len2)
11466         return PyLong_FromLong(0);
11467 
11468     buf1 = PyUnicode_DATA(self);
11469     buf2 = PyUnicode_DATA(substring);
11470     if (kind2 != kind1) {
11471         buf2 = _PyUnicode_AsKind(substring, kind1);
11472         if (!buf2)
11473             return NULL;
11474     }
11475     switch (kind1) {
11476     case PyUnicode_1BYTE_KIND:
11477         iresult = ucs1lib_count(
11478             ((Py_UCS1*)buf1) + start, end - start,
11479             buf2, len2, PY_SSIZE_T_MAX
11480             );
11481         break;
11482     case PyUnicode_2BYTE_KIND:
11483         iresult = ucs2lib_count(
11484             ((Py_UCS2*)buf1) + start, end - start,
11485             buf2, len2, PY_SSIZE_T_MAX
11486             );
11487         break;
11488     case PyUnicode_4BYTE_KIND:
11489         iresult = ucs4lib_count(
11490             ((Py_UCS4*)buf1) + start, end - start,
11491             buf2, len2, PY_SSIZE_T_MAX
11492             );
11493         break;
11494     default:
11495         assert(0); iresult = 0;
11496     }
11497 
11498     result = PyLong_FromSsize_t(iresult);
11499 
11500     if (kind2 != kind1)
11501         PyMem_Free(buf2);
11502 
11503     return result;
11504 }
11505 
11506 PyDoc_STRVAR(encode__doc__,
11507              "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11508 \n\
11509 Encode S using the codec registered for encoding. Default encoding\n\
11510 is 'utf-8'. errors may be given to set a different error\n\
11511 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11512 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11513 'xmlcharrefreplace' as well as any other name registered with\n\
11514 codecs.register_error that can handle UnicodeEncodeErrors.");
11515 
11516 static PyObject *
unicode_encode(PyObject * self,PyObject * args,PyObject * kwargs)11517 unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11518 {
11519     static char *kwlist[] = {"encoding", "errors", 0};
11520     char *encoding = NULL;
11521     char *errors = NULL;
11522 
11523     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11524                                      kwlist, &encoding, &errors))
11525         return NULL;
11526     return PyUnicode_AsEncodedString(self, encoding, errors);
11527 }
11528 
11529 PyDoc_STRVAR(expandtabs__doc__,
11530              "S.expandtabs(tabsize=8) -> str\n\
11531 \n\
11532 Return a copy of S where all tab characters are expanded using spaces.\n\
11533 If tabsize is not given, a tab size of 8 characters is assumed.");
11534 
11535 static PyObject*
unicode_expandtabs(PyObject * self,PyObject * args,PyObject * kwds)11536 unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11537 {
11538     Py_ssize_t i, j, line_pos, src_len, incr;
11539     Py_UCS4 ch;
11540     PyObject *u;
11541     void *src_data, *dest_data;
11542     static char *kwlist[] = {"tabsize", 0};
11543     int tabsize = 8;
11544     int kind;
11545     int found;
11546 
11547     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11548                                      kwlist, &tabsize))
11549         return NULL;
11550 
11551     if (PyUnicode_READY(self) == -1)
11552         return NULL;
11553 
11554     /* First pass: determine size of output string */
11555     src_len = PyUnicode_GET_LENGTH(self);
11556     i = j = line_pos = 0;
11557     kind = PyUnicode_KIND(self);
11558     src_data = PyUnicode_DATA(self);
11559     found = 0;
11560     for (; i < src_len; i++) {
11561         ch = PyUnicode_READ(kind, src_data, i);
11562         if (ch == '\t') {
11563             found = 1;
11564             if (tabsize > 0) {
11565                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11566                 if (j > PY_SSIZE_T_MAX - incr)
11567                     goto overflow;
11568                 line_pos += incr;
11569                 j += incr;
11570             }
11571         }
11572         else {
11573             if (j > PY_SSIZE_T_MAX - 1)
11574                 goto overflow;
11575             line_pos++;
11576             j++;
11577             if (ch == '\n' || ch == '\r')
11578                 line_pos = 0;
11579         }
11580     }
11581     if (!found)
11582         return unicode_result_unchanged(self);
11583 
11584     /* Second pass: create output string and fill it */
11585     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11586     if (!u)
11587         return NULL;
11588     dest_data = PyUnicode_DATA(u);
11589 
11590     i = j = line_pos = 0;
11591 
11592     for (; i < src_len; i++) {
11593         ch = PyUnicode_READ(kind, src_data, i);
11594         if (ch == '\t') {
11595             if (tabsize > 0) {
11596                 incr = tabsize - (line_pos % tabsize);
11597                 line_pos += incr;
11598                 FILL(kind, dest_data, ' ', j, incr);
11599                 j += incr;
11600             }
11601         }
11602         else {
11603             line_pos++;
11604             PyUnicode_WRITE(kind, dest_data, j, ch);
11605             j++;
11606             if (ch == '\n' || ch == '\r')
11607                 line_pos = 0;
11608         }
11609     }
11610     assert (j == PyUnicode_GET_LENGTH(u));
11611     return unicode_result(u);
11612 
11613   overflow:
11614     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11615     return NULL;
11616 }
11617 
11618 PyDoc_STRVAR(find__doc__,
11619              "S.find(sub[, start[, end]]) -> int\n\
11620 \n\
11621 Return the lowest index in S where substring sub is found,\n\
11622 such that sub is contained within S[start:end].  Optional\n\
11623 arguments start and end are interpreted as in slice notation.\n\
11624 \n\
11625 Return -1 on failure.");
11626 
11627 static PyObject *
unicode_find(PyObject * self,PyObject * args)11628 unicode_find(PyObject *self, PyObject *args)
11629 {
11630     /* initialize variables to prevent gcc warning */
11631     PyObject *substring = NULL;
11632     Py_ssize_t start = 0;
11633     Py_ssize_t end = 0;
11634     Py_ssize_t result;
11635 
11636     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11637         return NULL;
11638 
11639     if (PyUnicode_READY(self) == -1)
11640         return NULL;
11641 
11642     result = any_find_slice(self, substring, start, end, 1);
11643 
11644     if (result == -2)
11645         return NULL;
11646 
11647     return PyLong_FromSsize_t(result);
11648 }
11649 
11650 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11651 unicode_getitem(PyObject *self, Py_ssize_t index)
11652 {
11653     void *data;
11654     enum PyUnicode_Kind kind;
11655     Py_UCS4 ch;
11656 
11657     if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11658         PyErr_BadArgument();
11659         return NULL;
11660     }
11661     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11662         PyErr_SetString(PyExc_IndexError, "string index out of range");
11663         return NULL;
11664     }
11665     kind = PyUnicode_KIND(self);
11666     data = PyUnicode_DATA(self);
11667     ch = PyUnicode_READ(kind, data, index);
11668     return unicode_char(ch);
11669 }
11670 
11671 /* Believe it or not, this produces the same value for ASCII strings
11672    as bytes_hash(). */
11673 static Py_hash_t
unicode_hash(PyObject * self)11674 unicode_hash(PyObject *self)
11675 {
11676     Py_ssize_t len;
11677     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11678 
11679 #ifdef Py_DEBUG
11680     assert(_Py_HashSecret_Initialized);
11681 #endif
11682     if (_PyUnicode_HASH(self) != -1)
11683         return _PyUnicode_HASH(self);
11684     if (PyUnicode_READY(self) == -1)
11685         return -1;
11686     len = PyUnicode_GET_LENGTH(self);
11687     /*
11688       We make the hash of the empty string be 0, rather than using
11689       (prefix ^ suffix), since this slightly obfuscates the hash secret
11690     */
11691     if (len == 0) {
11692         _PyUnicode_HASH(self) = 0;
11693         return 0;
11694     }
11695     x = _Py_HashBytes(PyUnicode_DATA(self),
11696                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11697     _PyUnicode_HASH(self) = x;
11698     return x;
11699 }
11700 
11701 PyDoc_STRVAR(index__doc__,
11702              "S.index(sub[, start[, end]]) -> int\n\
11703 \n\
11704 Like S.find() but raise ValueError when the substring is not found.");
11705 
11706 static PyObject *
unicode_index(PyObject * self,PyObject * args)11707 unicode_index(PyObject *self, PyObject *args)
11708 {
11709     /* initialize variables to prevent gcc warning */
11710     Py_ssize_t result;
11711     PyObject *substring = NULL;
11712     Py_ssize_t start = 0;
11713     Py_ssize_t end = 0;
11714 
11715     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11716         return NULL;
11717 
11718     if (PyUnicode_READY(self) == -1)
11719         return NULL;
11720 
11721     result = any_find_slice(self, substring, start, end, 1);
11722 
11723     if (result == -2)
11724         return NULL;
11725 
11726     if (result < 0) {
11727         PyErr_SetString(PyExc_ValueError, "substring not found");
11728         return NULL;
11729     }
11730 
11731     return PyLong_FromSsize_t(result);
11732 }
11733 
11734 PyDoc_STRVAR(islower__doc__,
11735              "S.islower() -> bool\n\
11736 \n\
11737 Return True if all cased characters in S are lowercase and there is\n\
11738 at least one cased character in S, False otherwise.");
11739 
11740 static PyObject*
unicode_islower(PyObject * self)11741 unicode_islower(PyObject *self)
11742 {
11743     Py_ssize_t i, length;
11744     int kind;
11745     void *data;
11746     int cased;
11747 
11748     if (PyUnicode_READY(self) == -1)
11749         return NULL;
11750     length = PyUnicode_GET_LENGTH(self);
11751     kind = PyUnicode_KIND(self);
11752     data = PyUnicode_DATA(self);
11753 
11754     /* Shortcut for single character strings */
11755     if (length == 1)
11756         return PyBool_FromLong(
11757             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11758 
11759     /* Special case for empty strings */
11760     if (length == 0)
11761         return PyBool_FromLong(0);
11762 
11763     cased = 0;
11764     for (i = 0; i < length; i++) {
11765         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11766 
11767         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11768             return PyBool_FromLong(0);
11769         else if (!cased && Py_UNICODE_ISLOWER(ch))
11770             cased = 1;
11771     }
11772     return PyBool_FromLong(cased);
11773 }
11774 
11775 PyDoc_STRVAR(isupper__doc__,
11776              "S.isupper() -> bool\n\
11777 \n\
11778 Return True if all cased characters in S are uppercase and there is\n\
11779 at least one cased character in S, False otherwise.");
11780 
11781 static PyObject*
unicode_isupper(PyObject * self)11782 unicode_isupper(PyObject *self)
11783 {
11784     Py_ssize_t i, length;
11785     int kind;
11786     void *data;
11787     int cased;
11788 
11789     if (PyUnicode_READY(self) == -1)
11790         return NULL;
11791     length = PyUnicode_GET_LENGTH(self);
11792     kind = PyUnicode_KIND(self);
11793     data = PyUnicode_DATA(self);
11794 
11795     /* Shortcut for single character strings */
11796     if (length == 1)
11797         return PyBool_FromLong(
11798             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11799 
11800     /* Special case for empty strings */
11801     if (length == 0)
11802         return PyBool_FromLong(0);
11803 
11804     cased = 0;
11805     for (i = 0; i < length; i++) {
11806         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11807 
11808         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11809             return PyBool_FromLong(0);
11810         else if (!cased && Py_UNICODE_ISUPPER(ch))
11811             cased = 1;
11812     }
11813     return PyBool_FromLong(cased);
11814 }
11815 
11816 PyDoc_STRVAR(istitle__doc__,
11817              "S.istitle() -> bool\n\
11818 \n\
11819 Return True if S is a titlecased string and there is at least one\n\
11820 character in S, i.e. upper- and titlecase characters may only\n\
11821 follow uncased characters and lowercase characters only cased ones.\n\
11822 Return False otherwise.");
11823 
11824 static PyObject*
unicode_istitle(PyObject * self)11825 unicode_istitle(PyObject *self)
11826 {
11827     Py_ssize_t i, length;
11828     int kind;
11829     void *data;
11830     int cased, previous_is_cased;
11831 
11832     if (PyUnicode_READY(self) == -1)
11833         return NULL;
11834     length = PyUnicode_GET_LENGTH(self);
11835     kind = PyUnicode_KIND(self);
11836     data = PyUnicode_DATA(self);
11837 
11838     /* Shortcut for single character strings */
11839     if (length == 1) {
11840         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11841         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11842                                (Py_UNICODE_ISUPPER(ch) != 0));
11843     }
11844 
11845     /* Special case for empty strings */
11846     if (length == 0)
11847         return PyBool_FromLong(0);
11848 
11849     cased = 0;
11850     previous_is_cased = 0;
11851     for (i = 0; i < length; i++) {
11852         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11853 
11854         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11855             if (previous_is_cased)
11856                 return PyBool_FromLong(0);
11857             previous_is_cased = 1;
11858             cased = 1;
11859         }
11860         else if (Py_UNICODE_ISLOWER(ch)) {
11861             if (!previous_is_cased)
11862                 return PyBool_FromLong(0);
11863             previous_is_cased = 1;
11864             cased = 1;
11865         }
11866         else
11867             previous_is_cased = 0;
11868     }
11869     return PyBool_FromLong(cased);
11870 }
11871 
11872 PyDoc_STRVAR(isspace__doc__,
11873              "S.isspace() -> bool\n\
11874 \n\
11875 Return True if all characters in S are whitespace\n\
11876 and there is at least one character in S, False otherwise.");
11877 
11878 static PyObject*
unicode_isspace(PyObject * self)11879 unicode_isspace(PyObject *self)
11880 {
11881     Py_ssize_t i, length;
11882     int kind;
11883     void *data;
11884 
11885     if (PyUnicode_READY(self) == -1)
11886         return NULL;
11887     length = PyUnicode_GET_LENGTH(self);
11888     kind = PyUnicode_KIND(self);
11889     data = PyUnicode_DATA(self);
11890 
11891     /* Shortcut for single character strings */
11892     if (length == 1)
11893         return PyBool_FromLong(
11894             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11895 
11896     /* Special case for empty strings */
11897     if (length == 0)
11898         return PyBool_FromLong(0);
11899 
11900     for (i = 0; i < length; i++) {
11901         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11902         if (!Py_UNICODE_ISSPACE(ch))
11903             return PyBool_FromLong(0);
11904     }
11905     return PyBool_FromLong(1);
11906 }
11907 
11908 PyDoc_STRVAR(isalpha__doc__,
11909              "S.isalpha() -> bool\n\
11910 \n\
11911 Return True if all characters in S are alphabetic\n\
11912 and there is at least one character in S, False otherwise.");
11913 
11914 static PyObject*
unicode_isalpha(PyObject * self)11915 unicode_isalpha(PyObject *self)
11916 {
11917     Py_ssize_t i, length;
11918     int kind;
11919     void *data;
11920 
11921     if (PyUnicode_READY(self) == -1)
11922         return NULL;
11923     length = PyUnicode_GET_LENGTH(self);
11924     kind = PyUnicode_KIND(self);
11925     data = PyUnicode_DATA(self);
11926 
11927     /* Shortcut for single character strings */
11928     if (length == 1)
11929         return PyBool_FromLong(
11930             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11931 
11932     /* Special case for empty strings */
11933     if (length == 0)
11934         return PyBool_FromLong(0);
11935 
11936     for (i = 0; i < length; i++) {
11937         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11938             return PyBool_FromLong(0);
11939     }
11940     return PyBool_FromLong(1);
11941 }
11942 
11943 PyDoc_STRVAR(isalnum__doc__,
11944              "S.isalnum() -> bool\n\
11945 \n\
11946 Return True if all characters in S are alphanumeric\n\
11947 and there is at least one character in S, False otherwise.");
11948 
11949 static PyObject*
unicode_isalnum(PyObject * self)11950 unicode_isalnum(PyObject *self)
11951 {
11952     int kind;
11953     void *data;
11954     Py_ssize_t len, i;
11955 
11956     if (PyUnicode_READY(self) == -1)
11957         return NULL;
11958 
11959     kind = PyUnicode_KIND(self);
11960     data = PyUnicode_DATA(self);
11961     len = PyUnicode_GET_LENGTH(self);
11962 
11963     /* Shortcut for single character strings */
11964     if (len == 1) {
11965         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11966         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11967     }
11968 
11969     /* Special case for empty strings */
11970     if (len == 0)
11971         return PyBool_FromLong(0);
11972 
11973     for (i = 0; i < len; i++) {
11974         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11975         if (!Py_UNICODE_ISALNUM(ch))
11976             return PyBool_FromLong(0);
11977     }
11978     return PyBool_FromLong(1);
11979 }
11980 
11981 PyDoc_STRVAR(isdecimal__doc__,
11982              "S.isdecimal() -> bool\n\
11983 \n\
11984 Return True if there are only decimal characters in S,\n\
11985 False otherwise.");
11986 
11987 static PyObject*
unicode_isdecimal(PyObject * self)11988 unicode_isdecimal(PyObject *self)
11989 {
11990     Py_ssize_t i, length;
11991     int kind;
11992     void *data;
11993 
11994     if (PyUnicode_READY(self) == -1)
11995         return NULL;
11996     length = PyUnicode_GET_LENGTH(self);
11997     kind = PyUnicode_KIND(self);
11998     data = PyUnicode_DATA(self);
11999 
12000     /* Shortcut for single character strings */
12001     if (length == 1)
12002         return PyBool_FromLong(
12003             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12004 
12005     /* Special case for empty strings */
12006     if (length == 0)
12007         return PyBool_FromLong(0);
12008 
12009     for (i = 0; i < length; i++) {
12010         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12011             return PyBool_FromLong(0);
12012     }
12013     return PyBool_FromLong(1);
12014 }
12015 
12016 PyDoc_STRVAR(isdigit__doc__,
12017              "S.isdigit() -> bool\n\
12018 \n\
12019 Return True if all characters in S are digits\n\
12020 and there is at least one character in S, False otherwise.");
12021 
12022 static PyObject*
unicode_isdigit(PyObject * self)12023 unicode_isdigit(PyObject *self)
12024 {
12025     Py_ssize_t i, length;
12026     int kind;
12027     void *data;
12028 
12029     if (PyUnicode_READY(self) == -1)
12030         return NULL;
12031     length = PyUnicode_GET_LENGTH(self);
12032     kind = PyUnicode_KIND(self);
12033     data = PyUnicode_DATA(self);
12034 
12035     /* Shortcut for single character strings */
12036     if (length == 1) {
12037         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12038         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12039     }
12040 
12041     /* Special case for empty strings */
12042     if (length == 0)
12043         return PyBool_FromLong(0);
12044 
12045     for (i = 0; i < length; i++) {
12046         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12047             return PyBool_FromLong(0);
12048     }
12049     return PyBool_FromLong(1);
12050 }
12051 
12052 PyDoc_STRVAR(isnumeric__doc__,
12053              "S.isnumeric() -> bool\n\
12054 \n\
12055 Return True if there are only numeric characters in S,\n\
12056 False otherwise.");
12057 
12058 static PyObject*
unicode_isnumeric(PyObject * self)12059 unicode_isnumeric(PyObject *self)
12060 {
12061     Py_ssize_t i, length;
12062     int kind;
12063     void *data;
12064 
12065     if (PyUnicode_READY(self) == -1)
12066         return NULL;
12067     length = PyUnicode_GET_LENGTH(self);
12068     kind = PyUnicode_KIND(self);
12069     data = PyUnicode_DATA(self);
12070 
12071     /* Shortcut for single character strings */
12072     if (length == 1)
12073         return PyBool_FromLong(
12074             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12075 
12076     /* Special case for empty strings */
12077     if (length == 0)
12078         return PyBool_FromLong(0);
12079 
12080     for (i = 0; i < length; i++) {
12081         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12082             return PyBool_FromLong(0);
12083     }
12084     return PyBool_FromLong(1);
12085 }
12086 
12087 int
PyUnicode_IsIdentifier(PyObject * self)12088 PyUnicode_IsIdentifier(PyObject *self)
12089 {
12090     int kind;
12091     void *data;
12092     Py_ssize_t i;
12093     Py_UCS4 first;
12094 
12095     if (PyUnicode_READY(self) == -1) {
12096         Py_FatalError("identifier not ready");
12097         return 0;
12098     }
12099 
12100     /* Special case for empty strings */
12101     if (PyUnicode_GET_LENGTH(self) == 0)
12102         return 0;
12103     kind = PyUnicode_KIND(self);
12104     data = PyUnicode_DATA(self);
12105 
12106     /* PEP 3131 says that the first character must be in
12107        XID_Start and subsequent characters in XID_Continue,
12108        and for the ASCII range, the 2.x rules apply (i.e
12109        start with letters and underscore, continue with
12110        letters, digits, underscore). However, given the current
12111        definition of XID_Start and XID_Continue, it is sufficient
12112        to check just for these, except that _ must be allowed
12113        as starting an identifier.  */
12114     first = PyUnicode_READ(kind, data, 0);
12115     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12116         return 0;
12117 
12118     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12119         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12120             return 0;
12121     return 1;
12122 }
12123 
12124 PyDoc_STRVAR(isidentifier__doc__,
12125              "S.isidentifier() -> bool\n\
12126 \n\
12127 Return True if S is a valid identifier according\n\
12128 to the language definition.\n\
12129 \n\
12130 Use keyword.iskeyword() to test for reserved identifiers\n\
12131 such as \"def\" and \"class\".\n");
12132 
12133 static PyObject*
unicode_isidentifier(PyObject * self)12134 unicode_isidentifier(PyObject *self)
12135 {
12136     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12137 }
12138 
12139 PyDoc_STRVAR(isprintable__doc__,
12140              "S.isprintable() -> bool\n\
12141 \n\
12142 Return True if all characters in S are considered\n\
12143 printable in repr() or S is empty, False otherwise.");
12144 
12145 static PyObject*
unicode_isprintable(PyObject * self)12146 unicode_isprintable(PyObject *self)
12147 {
12148     Py_ssize_t i, length;
12149     int kind;
12150     void *data;
12151 
12152     if (PyUnicode_READY(self) == -1)
12153         return NULL;
12154     length = PyUnicode_GET_LENGTH(self);
12155     kind = PyUnicode_KIND(self);
12156     data = PyUnicode_DATA(self);
12157 
12158     /* Shortcut for single character strings */
12159     if (length == 1)
12160         return PyBool_FromLong(
12161             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12162 
12163     for (i = 0; i < length; i++) {
12164         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12165             Py_RETURN_FALSE;
12166         }
12167     }
12168     Py_RETURN_TRUE;
12169 }
12170 
12171 PyDoc_STRVAR(join__doc__,
12172              "S.join(iterable) -> str\n\
12173 \n\
12174 Return a string which is the concatenation of the strings in the\n\
12175 iterable.  The separator between elements is S.");
12176 
12177 static PyObject*
unicode_join(PyObject * self,PyObject * data)12178 unicode_join(PyObject *self, PyObject *data)
12179 {
12180     return PyUnicode_Join(self, data);
12181 }
12182 
12183 static Py_ssize_t
unicode_length(PyObject * self)12184 unicode_length(PyObject *self)
12185 {
12186     if (PyUnicode_READY(self) == -1)
12187         return -1;
12188     return PyUnicode_GET_LENGTH(self);
12189 }
12190 
12191 PyDoc_STRVAR(ljust__doc__,
12192              "S.ljust(width[, fillchar]) -> str\n\
12193 \n\
12194 Return S left-justified in a Unicode string of length width. Padding is\n\
12195 done using the specified fill character (default is a space).");
12196 
12197 static PyObject *
unicode_ljust(PyObject * self,PyObject * args)12198 unicode_ljust(PyObject *self, PyObject *args)
12199 {
12200     Py_ssize_t width;
12201     Py_UCS4 fillchar = ' ';
12202 
12203     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
12204         return NULL;
12205 
12206     if (PyUnicode_READY(self) == -1)
12207         return NULL;
12208 
12209     if (PyUnicode_GET_LENGTH(self) >= width)
12210         return unicode_result_unchanged(self);
12211 
12212     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12213 }
12214 
12215 PyDoc_STRVAR(lower__doc__,
12216              "S.lower() -> str\n\
12217 \n\
12218 Return a copy of the string S converted to lowercase.");
12219 
12220 static PyObject*
unicode_lower(PyObject * self)12221 unicode_lower(PyObject *self)
12222 {
12223     if (PyUnicode_READY(self) == -1)
12224         return NULL;
12225     if (PyUnicode_IS_ASCII(self))
12226         return ascii_upper_or_lower(self, 1);
12227     return case_operation(self, do_lower);
12228 }
12229 
/* Strip modes: which end(s) of the string are stripped.  The values also
   serve as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the matching stripformat[] entry with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12238 
12239 /* externally visible for str.strip(unicode) */
12240 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12241 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12242 {
12243     void *data;
12244     int kind;
12245     Py_ssize_t i, j, len;
12246     BLOOM_MASK sepmask;
12247     Py_ssize_t seplen;
12248 
12249     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12250         return NULL;
12251 
12252     kind = PyUnicode_KIND(self);
12253     data = PyUnicode_DATA(self);
12254     len = PyUnicode_GET_LENGTH(self);
12255     seplen = PyUnicode_GET_LENGTH(sepobj);
12256     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12257                               PyUnicode_DATA(sepobj),
12258                               seplen);
12259 
12260     i = 0;
12261     if (striptype != RIGHTSTRIP) {
12262         while (i < len) {
12263             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12264             if (!BLOOM(sepmask, ch))
12265                 break;
12266             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12267                 break;
12268             i++;
12269         }
12270     }
12271 
12272     j = len;
12273     if (striptype != LEFTSTRIP) {
12274         j--;
12275         while (j >= i) {
12276             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12277             if (!BLOOM(sepmask, ch))
12278                 break;
12279             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12280                 break;
12281             j--;
12282         }
12283 
12284         j++;
12285     }
12286 
12287     return PyUnicode_Substring(self, i, j);
12288 }
12289 
12290 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12291 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12292 {
12293     unsigned char *data;
12294     int kind;
12295     Py_ssize_t length;
12296 
12297     if (PyUnicode_READY(self) == -1)
12298         return NULL;
12299 
12300     length = PyUnicode_GET_LENGTH(self);
12301     end = Py_MIN(end, length);
12302 
12303     if (start == 0 && end == length)
12304         return unicode_result_unchanged(self);
12305 
12306     if (start < 0 || end < 0) {
12307         PyErr_SetString(PyExc_IndexError, "string index out of range");
12308         return NULL;
12309     }
12310     if (start >= length || end < start)
12311         _Py_RETURN_UNICODE_EMPTY();
12312 
12313     length = end - start;
12314     if (PyUnicode_IS_ASCII(self)) {
12315         data = PyUnicode_1BYTE_DATA(self);
12316         return _PyUnicode_FromASCII((char*)(data + start), length);
12317     }
12318     else {
12319         kind = PyUnicode_KIND(self);
12320         data = PyUnicode_1BYTE_DATA(self);
12321         return PyUnicode_FromKindAndData(kind,
12322                                          data + kind * start,
12323                                          length);
12324     }
12325 }
12326 
12327 static PyObject *
do_strip(PyObject * self,int striptype)12328 do_strip(PyObject *self, int striptype)
12329 {
12330     Py_ssize_t len, i, j;
12331 
12332     if (PyUnicode_READY(self) == -1)
12333         return NULL;
12334 
12335     len = PyUnicode_GET_LENGTH(self);
12336 
12337     if (PyUnicode_IS_ASCII(self)) {
12338         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12339 
12340         i = 0;
12341         if (striptype != RIGHTSTRIP) {
12342             while (i < len) {
12343                 Py_UCS1 ch = data[i];
12344                 if (!_Py_ascii_whitespace[ch])
12345                     break;
12346                 i++;
12347             }
12348         }
12349 
12350         j = len;
12351         if (striptype != LEFTSTRIP) {
12352             j--;
12353             while (j >= i) {
12354                 Py_UCS1 ch = data[j];
12355                 if (!_Py_ascii_whitespace[ch])
12356                     break;
12357                 j--;
12358             }
12359             j++;
12360         }
12361     }
12362     else {
12363         int kind = PyUnicode_KIND(self);
12364         void *data = PyUnicode_DATA(self);
12365 
12366         i = 0;
12367         if (striptype != RIGHTSTRIP) {
12368             while (i < len) {
12369                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12370                 if (!Py_UNICODE_ISSPACE(ch))
12371                     break;
12372                 i++;
12373             }
12374         }
12375 
12376         j = len;
12377         if (striptype != LEFTSTRIP) {
12378             j--;
12379             while (j >= i) {
12380                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12381                 if (!Py_UNICODE_ISSPACE(ch))
12382                     break;
12383                 j--;
12384             }
12385             j++;
12386         }
12387     }
12388 
12389     return PyUnicode_Substring(self, i, j);
12390 }
12391 
12392 
12393 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * args)12394 do_argstrip(PyObject *self, int striptype, PyObject *args)
12395 {
12396     PyObject *sep = NULL;
12397 
12398     if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12399         return NULL;
12400 
12401     if (sep != NULL && sep != Py_None) {
12402         if (PyUnicode_Check(sep))
12403             return _PyUnicode_XStrip(self, striptype, sep);
12404         else {
12405             PyErr_Format(PyExc_TypeError,
12406                          "%s arg must be None or str",
12407                          STRIPNAME(striptype));
12408             return NULL;
12409         }
12410     }
12411 
12412     return do_strip(self, striptype);
12413 }
12414 
12415 
12416 PyDoc_STRVAR(strip__doc__,
12417              "S.strip([chars]) -> str\n\
12418 \n\
12419 Return a copy of the string S with leading and trailing\n\
12420 whitespace removed.\n\
12421 If chars is given and not None, remove characters in chars instead.");
12422 
12423 static PyObject *
unicode_strip(PyObject * self,PyObject * args)12424 unicode_strip(PyObject *self, PyObject *args)
12425 {
12426     if (PyTuple_GET_SIZE(args) == 0)
12427         return do_strip(self, BOTHSTRIP); /* Common case */
12428     else
12429         return do_argstrip(self, BOTHSTRIP, args);
12430 }
12431 
12432 
12433 PyDoc_STRVAR(lstrip__doc__,
12434              "S.lstrip([chars]) -> str\n\
12435 \n\
12436 Return a copy of the string S with leading whitespace removed.\n\
12437 If chars is given and not None, remove characters in chars instead.");
12438 
12439 static PyObject *
unicode_lstrip(PyObject * self,PyObject * args)12440 unicode_lstrip(PyObject *self, PyObject *args)
12441 {
12442     if (PyTuple_GET_SIZE(args) == 0)
12443         return do_strip(self, LEFTSTRIP); /* Common case */
12444     else
12445         return do_argstrip(self, LEFTSTRIP, args);
12446 }
12447 
12448 
12449 PyDoc_STRVAR(rstrip__doc__,
12450              "S.rstrip([chars]) -> str\n\
12451 \n\
12452 Return a copy of the string S with trailing whitespace removed.\n\
12453 If chars is given and not None, remove characters in chars instead.");
12454 
12455 static PyObject *
unicode_rstrip(PyObject * self,PyObject * args)12456 unicode_rstrip(PyObject *self, PyObject *args)
12457 {
12458     if (PyTuple_GET_SIZE(args) == 0)
12459         return do_strip(self, RIGHTSTRIP); /* Common case */
12460     else
12461         return do_argstrip(self, RIGHTSTRIP, args);
12462 }
12463 
12464 
12465 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12466 unicode_repeat(PyObject *str, Py_ssize_t len)
12467 {
12468     PyObject *u;
12469     Py_ssize_t nchars, n;
12470 
12471     if (len < 1)
12472         _Py_RETURN_UNICODE_EMPTY();
12473 
12474     /* no repeat, return original string */
12475     if (len == 1)
12476         return unicode_result_unchanged(str);
12477 
12478     if (PyUnicode_READY(str) == -1)
12479         return NULL;
12480 
12481     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12482         PyErr_SetString(PyExc_OverflowError,
12483                         "repeated string is too long");
12484         return NULL;
12485     }
12486     nchars = len * PyUnicode_GET_LENGTH(str);
12487 
12488     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12489     if (!u)
12490         return NULL;
12491     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12492 
12493     if (PyUnicode_GET_LENGTH(str) == 1) {
12494         const int kind = PyUnicode_KIND(str);
12495         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12496         if (kind == PyUnicode_1BYTE_KIND) {
12497             void *to = PyUnicode_DATA(u);
12498             memset(to, (unsigned char)fill_char, len);
12499         }
12500         else if (kind == PyUnicode_2BYTE_KIND) {
12501             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12502             for (n = 0; n < len; ++n)
12503                 ucs2[n] = fill_char;
12504         } else {
12505             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12506             assert(kind == PyUnicode_4BYTE_KIND);
12507             for (n = 0; n < len; ++n)
12508                 ucs4[n] = fill_char;
12509         }
12510     }
12511     else {
12512         /* number of characters copied this far */
12513         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12514         const Py_ssize_t char_size = PyUnicode_KIND(str);
12515         char *to = (char *) PyUnicode_DATA(u);
12516         memcpy(to, PyUnicode_DATA(str),
12517                   PyUnicode_GET_LENGTH(str) * char_size);
12518         while (done < nchars) {
12519             n = (done <= nchars-done) ? done : nchars-done;
12520             memcpy(to + (done * char_size), to, n * char_size);
12521             done += n;
12522         }
12523     }
12524 
12525     assert(_PyUnicode_CheckConsistency(u, 1));
12526     return u;
12527 }
12528 
12529 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12530 PyUnicode_Replace(PyObject *str,
12531                   PyObject *substr,
12532                   PyObject *replstr,
12533                   Py_ssize_t maxcount)
12534 {
12535     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12536             ensure_unicode(replstr) < 0)
12537         return NULL;
12538     return replace(str, substr, replstr, maxcount);
12539 }
12540 
12541 PyDoc_STRVAR(replace__doc__,
12542              "S.replace(old, new[, count]) -> str\n\
12543 \n\
12544 Return a copy of S with all occurrences of substring\n\
12545 old replaced by new.  If the optional argument count is\n\
12546 given, only the first count occurrences are replaced.");
12547 
12548 static PyObject*
unicode_replace(PyObject * self,PyObject * args)12549 unicode_replace(PyObject *self, PyObject *args)
12550 {
12551     PyObject *str1;
12552     PyObject *str2;
12553     Py_ssize_t maxcount = -1;
12554 
12555     if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
12556         return NULL;
12557     if (PyUnicode_READY(self) == -1)
12558         return NULL;
12559     return replace(self, str1, str2, maxcount);
12560 }
12561 
12562 static PyObject *
unicode_repr(PyObject * unicode)12563 unicode_repr(PyObject *unicode)
12564 {
12565     PyObject *repr;
12566     Py_ssize_t isize;
12567     Py_ssize_t osize, squote, dquote, i, o;
12568     Py_UCS4 max, quote;
12569     int ikind, okind, unchanged;
12570     void *idata, *odata;
12571 
12572     if (PyUnicode_READY(unicode) == -1)
12573         return NULL;
12574 
12575     isize = PyUnicode_GET_LENGTH(unicode);
12576     idata = PyUnicode_DATA(unicode);
12577 
12578     /* Compute length of output, quote characters, and
12579        maximum character */
12580     osize = 0;
12581     max = 127;
12582     squote = dquote = 0;
12583     ikind = PyUnicode_KIND(unicode);
12584     for (i = 0; i < isize; i++) {
12585         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12586         Py_ssize_t incr = 1;
12587         switch (ch) {
12588         case '\'': squote++; break;
12589         case '"':  dquote++; break;
12590         case '\\': case '\t': case '\r': case '\n':
12591             incr = 2;
12592             break;
12593         default:
12594             /* Fast-path ASCII */
12595             if (ch < ' ' || ch == 0x7f)
12596                 incr = 4; /* \xHH */
12597             else if (ch < 0x7f)
12598                 ;
12599             else if (Py_UNICODE_ISPRINTABLE(ch))
12600                 max = ch > max ? ch : max;
12601             else if (ch < 0x100)
12602                 incr = 4; /* \xHH */
12603             else if (ch < 0x10000)
12604                 incr = 6; /* \uHHHH */
12605             else
12606                 incr = 10; /* \uHHHHHHHH */
12607         }
12608         if (osize > PY_SSIZE_T_MAX - incr) {
12609             PyErr_SetString(PyExc_OverflowError,
12610                             "string is too long to generate repr");
12611             return NULL;
12612         }
12613         osize += incr;
12614     }
12615 
12616     quote = '\'';
12617     unchanged = (osize == isize);
12618     if (squote) {
12619         unchanged = 0;
12620         if (dquote)
12621             /* Both squote and dquote present. Use squote,
12622                and escape them */
12623             osize += squote;
12624         else
12625             quote = '"';
12626     }
12627     osize += 2;   /* quotes */
12628 
12629     repr = PyUnicode_New(osize, max);
12630     if (repr == NULL)
12631         return NULL;
12632     okind = PyUnicode_KIND(repr);
12633     odata = PyUnicode_DATA(repr);
12634 
12635     PyUnicode_WRITE(okind, odata, 0, quote);
12636     PyUnicode_WRITE(okind, odata, osize-1, quote);
12637     if (unchanged) {
12638         _PyUnicode_FastCopyCharacters(repr, 1,
12639                                       unicode, 0,
12640                                       isize);
12641     }
12642     else {
12643         for (i = 0, o = 1; i < isize; i++) {
12644             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12645 
12646             /* Escape quotes and backslashes */
12647             if ((ch == quote) || (ch == '\\')) {
12648                 PyUnicode_WRITE(okind, odata, o++, '\\');
12649                 PyUnicode_WRITE(okind, odata, o++, ch);
12650                 continue;
12651             }
12652 
12653             /* Map special whitespace to '\t', \n', '\r' */
12654             if (ch == '\t') {
12655                 PyUnicode_WRITE(okind, odata, o++, '\\');
12656                 PyUnicode_WRITE(okind, odata, o++, 't');
12657             }
12658             else if (ch == '\n') {
12659                 PyUnicode_WRITE(okind, odata, o++, '\\');
12660                 PyUnicode_WRITE(okind, odata, o++, 'n');
12661             }
12662             else if (ch == '\r') {
12663                 PyUnicode_WRITE(okind, odata, o++, '\\');
12664                 PyUnicode_WRITE(okind, odata, o++, 'r');
12665             }
12666 
12667             /* Map non-printable US ASCII to '\xhh' */
12668             else if (ch < ' ' || ch == 0x7F) {
12669                 PyUnicode_WRITE(okind, odata, o++, '\\');
12670                 PyUnicode_WRITE(okind, odata, o++, 'x');
12671                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12672                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12673             }
12674 
12675             /* Copy ASCII characters as-is */
12676             else if (ch < 0x7F) {
12677                 PyUnicode_WRITE(okind, odata, o++, ch);
12678             }
12679 
12680             /* Non-ASCII characters */
12681             else {
12682                 /* Map Unicode whitespace and control characters
12683                    (categories Z* and C* except ASCII space)
12684                 */
12685                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12686                     PyUnicode_WRITE(okind, odata, o++, '\\');
12687                     /* Map 8-bit characters to '\xhh' */
12688                     if (ch <= 0xff) {
12689                         PyUnicode_WRITE(okind, odata, o++, 'x');
12690                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12691                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12692                     }
12693                     /* Map 16-bit characters to '\uxxxx' */
12694                     else if (ch <= 0xffff) {
12695                         PyUnicode_WRITE(okind, odata, o++, 'u');
12696                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12697                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12698                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12699                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12700                     }
12701                     /* Map 21-bit characters to '\U00xxxxxx' */
12702                     else {
12703                         PyUnicode_WRITE(okind, odata, o++, 'U');
12704                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12705                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12706                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12707                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12708                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12709                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12710                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12711                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12712                     }
12713                 }
12714                 /* Copy characters as-is */
12715                 else {
12716                     PyUnicode_WRITE(okind, odata, o++, ch);
12717                 }
12718             }
12719         }
12720     }
12721     /* Closing quote already added at the beginning */
12722     assert(_PyUnicode_CheckConsistency(repr, 1));
12723     return repr;
12724 }
12725 
12726 PyDoc_STRVAR(rfind__doc__,
12727              "S.rfind(sub[, start[, end]]) -> int\n\
12728 \n\
12729 Return the highest index in S where substring sub is found,\n\
12730 such that sub is contained within S[start:end].  Optional\n\
12731 arguments start and end are interpreted as in slice notation.\n\
12732 \n\
12733 Return -1 on failure.");
12734 
12735 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12736 unicode_rfind(PyObject *self, PyObject *args)
12737 {
12738     /* initialize variables to prevent gcc warning */
12739     PyObject *substring = NULL;
12740     Py_ssize_t start = 0;
12741     Py_ssize_t end = 0;
12742     Py_ssize_t result;
12743 
12744     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12745         return NULL;
12746 
12747     if (PyUnicode_READY(self) == -1)
12748         return NULL;
12749 
12750     result = any_find_slice(self, substring, start, end, -1);
12751 
12752     if (result == -2)
12753         return NULL;
12754 
12755     return PyLong_FromSsize_t(result);
12756 }
12757 
12758 PyDoc_STRVAR(rindex__doc__,
12759              "S.rindex(sub[, start[, end]]) -> int\n\
12760 \n\
12761 Like S.rfind() but raise ValueError when the substring is not found.");
12762 
12763 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12764 unicode_rindex(PyObject *self, PyObject *args)
12765 {
12766     /* initialize variables to prevent gcc warning */
12767     PyObject *substring = NULL;
12768     Py_ssize_t start = 0;
12769     Py_ssize_t end = 0;
12770     Py_ssize_t result;
12771 
12772     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12773         return NULL;
12774 
12775     if (PyUnicode_READY(self) == -1)
12776         return NULL;
12777 
12778     result = any_find_slice(self, substring, start, end, -1);
12779 
12780     if (result == -2)
12781         return NULL;
12782 
12783     if (result < 0) {
12784         PyErr_SetString(PyExc_ValueError, "substring not found");
12785         return NULL;
12786     }
12787 
12788     return PyLong_FromSsize_t(result);
12789 }
12790 
12791 PyDoc_STRVAR(rjust__doc__,
12792              "S.rjust(width[, fillchar]) -> str\n\
12793 \n\
12794 Return S right-justified in a string of length width. Padding is\n\
12795 done using the specified fill character (default is a space).");
12796 
12797 static PyObject *
unicode_rjust(PyObject * self,PyObject * args)12798 unicode_rjust(PyObject *self, PyObject *args)
12799 {
12800     Py_ssize_t width;
12801     Py_UCS4 fillchar = ' ';
12802 
12803     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12804         return NULL;
12805 
12806     if (PyUnicode_READY(self) == -1)
12807         return NULL;
12808 
12809     if (PyUnicode_GET_LENGTH(self) >= width)
12810         return unicode_result_unchanged(self);
12811 
12812     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12813 }
12814 
12815 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12816 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12817 {
12818     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12819         return NULL;
12820 
12821     return split(s, sep, maxsplit);
12822 }
12823 
12824 PyDoc_STRVAR(split__doc__,
12825              "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12826 \n\
12827 Return a list of the words in S, using sep as the\n\
12828 delimiter string.  If maxsplit is given, at most maxsplit\n\
12829 splits are done. If sep is not specified or is None, any\n\
12830 whitespace string is a separator and empty strings are\n\
12831 removed from the result.");
12832 
12833 static PyObject*
unicode_split(PyObject * self,PyObject * args,PyObject * kwds)12834 unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12835 {
12836     static char *kwlist[] = {"sep", "maxsplit", 0};
12837     PyObject *substring = Py_None;
12838     Py_ssize_t maxcount = -1;
12839 
12840     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12841                                      kwlist, &substring, &maxcount))
12842         return NULL;
12843 
12844     if (substring == Py_None)
12845         return split(self, NULL, maxcount);
12846 
12847     if (PyUnicode_Check(substring))
12848         return split(self, substring, maxcount);
12849 
12850     PyErr_Format(PyExc_TypeError,
12851                  "must be str or None, not %.100s",
12852                  Py_TYPE(substring)->tp_name);
12853     return NULL;
12854 }
12855 
12856 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12857 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12858 {
12859     PyObject* out;
12860     int kind1, kind2;
12861     void *buf1, *buf2;
12862     Py_ssize_t len1, len2;
12863 
12864     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12865         return NULL;
12866 
12867     kind1 = PyUnicode_KIND(str_obj);
12868     kind2 = PyUnicode_KIND(sep_obj);
12869     len1 = PyUnicode_GET_LENGTH(str_obj);
12870     len2 = PyUnicode_GET_LENGTH(sep_obj);
12871     if (kind1 < kind2 || len1 < len2) {
12872         _Py_INCREF_UNICODE_EMPTY();
12873         if (!unicode_empty)
12874             out = NULL;
12875         else {
12876             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12877             Py_DECREF(unicode_empty);
12878         }
12879         return out;
12880     }
12881     buf1 = PyUnicode_DATA(str_obj);
12882     buf2 = PyUnicode_DATA(sep_obj);
12883     if (kind2 != kind1) {
12884         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12885         if (!buf2)
12886             return NULL;
12887     }
12888 
12889     switch (kind1) {
12890     case PyUnicode_1BYTE_KIND:
12891         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12892             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12893         else
12894             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12895         break;
12896     case PyUnicode_2BYTE_KIND:
12897         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12898         break;
12899     case PyUnicode_4BYTE_KIND:
12900         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12901         break;
12902     default:
12903         assert(0);
12904         out = 0;
12905     }
12906 
12907     if (kind2 != kind1)
12908         PyMem_Free(buf2);
12909 
12910     return out;
12911 }
12912 
12913 
12914 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12915 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12916 {
12917     PyObject* out;
12918     int kind1, kind2;
12919     void *buf1, *buf2;
12920     Py_ssize_t len1, len2;
12921 
12922     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12923         return NULL;
12924 
12925     kind1 = PyUnicode_KIND(str_obj);
12926     kind2 = PyUnicode_KIND(sep_obj);
12927     len1 = PyUnicode_GET_LENGTH(str_obj);
12928     len2 = PyUnicode_GET_LENGTH(sep_obj);
12929     if (kind1 < kind2 || len1 < len2) {
12930         _Py_INCREF_UNICODE_EMPTY();
12931         if (!unicode_empty)
12932             out = NULL;
12933         else {
12934             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12935             Py_DECREF(unicode_empty);
12936         }
12937         return out;
12938     }
12939     buf1 = PyUnicode_DATA(str_obj);
12940     buf2 = PyUnicode_DATA(sep_obj);
12941     if (kind2 != kind1) {
12942         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12943         if (!buf2)
12944             return NULL;
12945     }
12946 
12947     switch (kind1) {
12948     case PyUnicode_1BYTE_KIND:
12949         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12950             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951         else
12952             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12953         break;
12954     case PyUnicode_2BYTE_KIND:
12955         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12956         break;
12957     case PyUnicode_4BYTE_KIND:
12958         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959         break;
12960     default:
12961         assert(0);
12962         out = 0;
12963     }
12964 
12965     if (kind2 != kind1)
12966         PyMem_Free(buf2);
12967 
12968     return out;
12969 }
12970 
12971 PyDoc_STRVAR(partition__doc__,
12972              "S.partition(sep) -> (head, sep, tail)\n\
12973 \n\
12974 Search for the separator sep in S, and return the part before it,\n\
12975 the separator itself, and the part after it.  If the separator is not\n\
12976 found, return S and two empty strings.");
12977 
12978 static PyObject*
unicode_partition(PyObject * self,PyObject * separator)12979 unicode_partition(PyObject *self, PyObject *separator)
12980 {
12981     return PyUnicode_Partition(self, separator);
12982 }
12983 
12984 PyDoc_STRVAR(rpartition__doc__,
12985              "S.rpartition(sep) -> (head, sep, tail)\n\
12986 \n\
12987 Search for the separator sep in S, starting at the end of S, and return\n\
12988 the part before it, the separator itself, and the part after it.  If the\n\
12989 separator is not found, return two empty strings and S.");
12990 
12991 static PyObject*
unicode_rpartition(PyObject * self,PyObject * separator)12992 unicode_rpartition(PyObject *self, PyObject *separator)
12993 {
12994     return PyUnicode_RPartition(self, separator);
12995 }
12996 
12997 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12998 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12999 {
13000     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13001         return NULL;
13002 
13003     return rsplit(s, sep, maxsplit);
13004 }
13005 
13006 PyDoc_STRVAR(rsplit__doc__,
13007              "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
13008 \n\
13009 Return a list of the words in S, using sep as the\n\
13010 delimiter string, starting at the end of the string and\n\
13011 working to the front.  If maxsplit is given, at most maxsplit\n\
13012 splits are done. If sep is not specified, any whitespace string\n\
13013 is a separator.");
13014 
13015 static PyObject*
unicode_rsplit(PyObject * self,PyObject * args,PyObject * kwds)13016 unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
13017 {
13018     static char *kwlist[] = {"sep", "maxsplit", 0};
13019     PyObject *substring = Py_None;
13020     Py_ssize_t maxcount = -1;
13021 
13022     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13023                                      kwlist, &substring, &maxcount))
13024         return NULL;
13025 
13026     if (substring == Py_None)
13027         return rsplit(self, NULL, maxcount);
13028 
13029     if (PyUnicode_Check(substring))
13030         return rsplit(self, substring, maxcount);
13031 
13032     PyErr_Format(PyExc_TypeError,
13033                  "must be str or None, not %.100s",
13034                  Py_TYPE(substring)->tp_name);
13035     return NULL;
13036 }
13037 
13038 PyDoc_STRVAR(splitlines__doc__,
13039              "S.splitlines([keepends]) -> list of strings\n\
13040 \n\
13041 Return a list of the lines in S, breaking at line boundaries.\n\
13042 Line breaks are not included in the resulting list unless keepends\n\
13043 is given and true.");
13044 
13045 static PyObject*
unicode_splitlines(PyObject * self,PyObject * args,PyObject * kwds)13046 unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
13047 {
13048     static char *kwlist[] = {"keepends", 0};
13049     int keepends = 0;
13050 
13051     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13052                                      kwlist, &keepends))
13053         return NULL;
13054 
13055     return PyUnicode_Splitlines(self, keepends);
13056 }
13057 
13058 static
unicode_str(PyObject * self)13059 PyObject *unicode_str(PyObject *self)
13060 {
13061     return unicode_result_unchanged(self);
13062 }
13063 
13064 PyDoc_STRVAR(swapcase__doc__,
13065              "S.swapcase() -> str\n\
13066 \n\
13067 Return a copy of S with uppercase characters converted to lowercase\n\
13068 and vice versa.");
13069 
13070 static PyObject*
unicode_swapcase(PyObject * self)13071 unicode_swapcase(PyObject *self)
13072 {
13073     if (PyUnicode_READY(self) == -1)
13074         return NULL;
13075     return case_operation(self, do_swapcase);
13076 }
13077 
13078 /*[clinic input]
13079 
13080 @staticmethod
13081 str.maketrans as unicode_maketrans
13082 
13083   x: object
13084 
13085   y: unicode=NULL
13086 
13087   z: unicode=NULL
13088 
13089   /
13090 
13091 Return a translation table usable for str.translate().
13092 
13093 If there is only one argument, it must be a dictionary mapping Unicode
13094 ordinals (integers) or characters to Unicode ordinals, strings or None.
13095 Character keys will be then converted to ordinals.
13096 If there are two arguments, they must be strings of equal length, and
13097 in the resulting dictionary, each character in x will be mapped to the
13098 character at the same position in y. If there is a third argument, it
13099 must be a string, whose characters will be mapped to None in the result.
13100 [clinic start generated code]*/
13101 
13102 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13103 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13104 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13105 {
13106     PyObject *new = NULL, *key, *value;
13107     Py_ssize_t i = 0;
13108     int res;
13109 
13110     new = PyDict_New();
13111     if (!new)
13112         return NULL;
13113     if (y != NULL) {
13114         int x_kind, y_kind, z_kind;
13115         void *x_data, *y_data, *z_data;
13116 
13117         /* x must be a string too, of equal length */
13118         if (!PyUnicode_Check(x)) {
13119             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13120                             "be a string if there is a second argument");
13121             goto err;
13122         }
13123         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13124             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13125                             "arguments must have equal length");
13126             goto err;
13127         }
13128         /* create entries for translating chars in x to those in y */
13129         x_kind = PyUnicode_KIND(x);
13130         y_kind = PyUnicode_KIND(y);
13131         x_data = PyUnicode_DATA(x);
13132         y_data = PyUnicode_DATA(y);
13133         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13134             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13135             if (!key)
13136                 goto err;
13137             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13138             if (!value) {
13139                 Py_DECREF(key);
13140                 goto err;
13141             }
13142             res = PyDict_SetItem(new, key, value);
13143             Py_DECREF(key);
13144             Py_DECREF(value);
13145             if (res < 0)
13146                 goto err;
13147         }
13148         /* create entries for deleting chars in z */
13149         if (z != NULL) {
13150             z_kind = PyUnicode_KIND(z);
13151             z_data = PyUnicode_DATA(z);
13152             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13153                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13154                 if (!key)
13155                     goto err;
13156                 res = PyDict_SetItem(new, key, Py_None);
13157                 Py_DECREF(key);
13158                 if (res < 0)
13159                     goto err;
13160             }
13161         }
13162     } else {
13163         int kind;
13164         void *data;
13165 
13166         /* x must be a dict */
13167         if (!PyDict_CheckExact(x)) {
13168             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13169                             "to maketrans it must be a dict");
13170             goto err;
13171         }
13172         /* copy entries into the new dict, converting string keys to int keys */
13173         while (PyDict_Next(x, &i, &key, &value)) {
13174             if (PyUnicode_Check(key)) {
13175                 /* convert string keys to integer keys */
13176                 PyObject *newkey;
13177                 if (PyUnicode_GET_LENGTH(key) != 1) {
13178                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13179                                     "table must be of length 1");
13180                     goto err;
13181                 }
13182                 kind = PyUnicode_KIND(key);
13183                 data = PyUnicode_DATA(key);
13184                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13185                 if (!newkey)
13186                     goto err;
13187                 res = PyDict_SetItem(new, newkey, value);
13188                 Py_DECREF(newkey);
13189                 if (res < 0)
13190                     goto err;
13191             } else if (PyLong_Check(key)) {
13192                 /* just keep integer keys */
13193                 if (PyDict_SetItem(new, key, value) < 0)
13194                     goto err;
13195             } else {
13196                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13197                                 "be strings or integers");
13198                 goto err;
13199             }
13200         }
13201     }
13202     return new;
13203   err:
13204     Py_DECREF(new);
13205     return NULL;
13206 }
13207 
13208 PyDoc_STRVAR(translate__doc__,
13209              "S.translate(table) -> str\n\
13210 \n\
13211 Return a copy of the string S in which each character has been mapped\n\
13212 through the given translation table. The table must implement\n\
13213 lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13214 mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13215 this operation raises LookupError, the character is left untouched.\n\
13216 Characters mapped to None are deleted.");
13217 
13218 static PyObject*
unicode_translate(PyObject * self,PyObject * table)13219 unicode_translate(PyObject *self, PyObject *table)
13220 {
13221     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13222 }
13223 
13224 PyDoc_STRVAR(upper__doc__,
13225              "S.upper() -> str\n\
13226 \n\
13227 Return a copy of S converted to uppercase.");
13228 
13229 static PyObject*
unicode_upper(PyObject * self)13230 unicode_upper(PyObject *self)
13231 {
13232     if (PyUnicode_READY(self) == -1)
13233         return NULL;
13234     if (PyUnicode_IS_ASCII(self))
13235         return ascii_upper_or_lower(self, 0);
13236     return case_operation(self, do_upper);
13237 }
13238 
13239 PyDoc_STRVAR(zfill__doc__,
13240              "S.zfill(width) -> str\n\
13241 \n\
13242 Pad a numeric string S with zeros on the left, to fill a field\n\
13243 of the specified width. The string S is never truncated.");
13244 
13245 static PyObject *
unicode_zfill(PyObject * self,PyObject * args)13246 unicode_zfill(PyObject *self, PyObject *args)
13247 {
13248     Py_ssize_t fill;
13249     PyObject *u;
13250     Py_ssize_t width;
13251     int kind;
13252     void *data;
13253     Py_UCS4 chr;
13254 
13255     if (!PyArg_ParseTuple(args, "n:zfill", &width))
13256         return NULL;
13257 
13258     if (PyUnicode_READY(self) == -1)
13259         return NULL;
13260 
13261     if (PyUnicode_GET_LENGTH(self) >= width)
13262         return unicode_result_unchanged(self);
13263 
13264     fill = width - PyUnicode_GET_LENGTH(self);
13265 
13266     u = pad(self, fill, 0, '0');
13267 
13268     if (u == NULL)
13269         return NULL;
13270 
13271     kind = PyUnicode_KIND(u);
13272     data = PyUnicode_DATA(u);
13273     chr = PyUnicode_READ(kind, data, fill);
13274 
13275     if (chr == '+' || chr == '-') {
13276         /* move sign to beginning of string */
13277         PyUnicode_WRITE(kind, data, 0, chr);
13278         PyUnicode_WRITE(kind, data, fill, '0');
13279     }
13280 
13281     assert(_PyUnicode_CheckConsistency(u, 1));
13282     return u;
13283 }
13284 
#if 0
/* Compiled out.  The matching "_decimal2ascii" entry in unicode_methods is
   guarded by its own #if 0 and described there as a debugging aid. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13292 
13293 PyDoc_STRVAR(startswith__doc__,
13294              "S.startswith(prefix[, start[, end]]) -> bool\n\
13295 \n\
13296 Return True if S starts with the specified prefix, False otherwise.\n\
13297 With optional start, test S beginning at that position.\n\
13298 With optional end, stop comparing S at that position.\n\
13299 prefix can also be a tuple of strings to try.");
13300 
13301 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13302 unicode_startswith(PyObject *self,
13303                    PyObject *args)
13304 {
13305     PyObject *subobj;
13306     PyObject *substring;
13307     Py_ssize_t start = 0;
13308     Py_ssize_t end = PY_SSIZE_T_MAX;
13309     int result;
13310 
13311     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13312         return NULL;
13313     if (PyTuple_Check(subobj)) {
13314         Py_ssize_t i;
13315         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13316             substring = PyTuple_GET_ITEM(subobj, i);
13317             if (!PyUnicode_Check(substring)) {
13318                 PyErr_Format(PyExc_TypeError,
13319                              "tuple for startswith must only contain str, "
13320                              "not %.100s",
13321                              Py_TYPE(substring)->tp_name);
13322                 return NULL;
13323             }
13324             result = tailmatch(self, substring, start, end, -1);
13325             if (result == -1)
13326                 return NULL;
13327             if (result) {
13328                 Py_RETURN_TRUE;
13329             }
13330         }
13331         /* nothing matched */
13332         Py_RETURN_FALSE;
13333     }
13334     if (!PyUnicode_Check(subobj)) {
13335         PyErr_Format(PyExc_TypeError,
13336                      "startswith first arg must be str or "
13337                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13338         return NULL;
13339     }
13340     result = tailmatch(self, subobj, start, end, -1);
13341     if (result == -1)
13342         return NULL;
13343     return PyBool_FromLong(result);
13344 }
13345 
13346 
13347 PyDoc_STRVAR(endswith__doc__,
13348              "S.endswith(suffix[, start[, end]]) -> bool\n\
13349 \n\
13350 Return True if S ends with the specified suffix, False otherwise.\n\
13351 With optional start, test S beginning at that position.\n\
13352 With optional end, stop comparing S at that position.\n\
13353 suffix can also be a tuple of strings to try.");
13354 
13355 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13356 unicode_endswith(PyObject *self,
13357                  PyObject *args)
13358 {
13359     PyObject *subobj;
13360     PyObject *substring;
13361     Py_ssize_t start = 0;
13362     Py_ssize_t end = PY_SSIZE_T_MAX;
13363     int result;
13364 
13365     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13366         return NULL;
13367     if (PyTuple_Check(subobj)) {
13368         Py_ssize_t i;
13369         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13370             substring = PyTuple_GET_ITEM(subobj, i);
13371             if (!PyUnicode_Check(substring)) {
13372                 PyErr_Format(PyExc_TypeError,
13373                              "tuple for endswith must only contain str, "
13374                              "not %.100s",
13375                              Py_TYPE(substring)->tp_name);
13376                 return NULL;
13377             }
13378             result = tailmatch(self, substring, start, end, +1);
13379             if (result == -1)
13380                 return NULL;
13381             if (result) {
13382                 Py_RETURN_TRUE;
13383             }
13384         }
13385         Py_RETURN_FALSE;
13386     }
13387     if (!PyUnicode_Check(subobj)) {
13388         PyErr_Format(PyExc_TypeError,
13389                      "endswith first arg must be str or "
13390                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13391         return NULL;
13392     }
13393     result = tailmatch(self, subobj, start, end, +1);
13394     if (result == -1)
13395         return NULL;
13396     return PyBool_FromLong(result);
13397 }
13398 
13399 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13400 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13401 {
13402     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13403     writer->data = PyUnicode_DATA(writer->buffer);
13404 
13405     if (!writer->readonly) {
13406         writer->kind = PyUnicode_KIND(writer->buffer);
13407         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13408     }
13409     else {
13410         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13411            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13412         writer->kind = PyUnicode_WCHAR_KIND;
13413         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13414 
13415         /* Copy-on-write mode: set buffer size to 0 so
13416          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13417          * next write. */
13418         writer->size = 0;
13419     }
13420 }
13421 
13422 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13423 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13424 {
13425     memset(writer, 0, sizeof(*writer));
13426 
13427     /* ASCII is the bare minimum */
13428     writer->min_char = 127;
13429 
13430     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13431        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13432     writer->kind = PyUnicode_WCHAR_KIND;
13433     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13434 }
13435 
13436 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13437 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13438                                  Py_ssize_t length, Py_UCS4 maxchar)
13439 {
13440     Py_ssize_t newlen;
13441     PyObject *newbuffer;
13442 
13443     assert(maxchar <= MAX_UNICODE);
13444 
13445     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13446     assert((maxchar > writer->maxchar && length >= 0)
13447            || length > 0);
13448 
13449     if (length > PY_SSIZE_T_MAX - writer->pos) {
13450         PyErr_NoMemory();
13451         return -1;
13452     }
13453     newlen = writer->pos + length;
13454 
13455     maxchar = Py_MAX(maxchar, writer->min_char);
13456 
13457     if (writer->buffer == NULL) {
13458         assert(!writer->readonly);
13459         if (writer->overallocate
13460             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13461             /* overallocate to limit the number of realloc() */
13462             newlen += newlen / OVERALLOCATE_FACTOR;
13463         }
13464         if (newlen < writer->min_length)
13465             newlen = writer->min_length;
13466 
13467         writer->buffer = PyUnicode_New(newlen, maxchar);
13468         if (writer->buffer == NULL)
13469             return -1;
13470     }
13471     else if (newlen > writer->size) {
13472         if (writer->overallocate
13473             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13474             /* overallocate to limit the number of realloc() */
13475             newlen += newlen / OVERALLOCATE_FACTOR;
13476         }
13477         if (newlen < writer->min_length)
13478             newlen = writer->min_length;
13479 
13480         if (maxchar > writer->maxchar || writer->readonly) {
13481             /* resize + widen */
13482             maxchar = Py_MAX(maxchar, writer->maxchar);
13483             newbuffer = PyUnicode_New(newlen, maxchar);
13484             if (newbuffer == NULL)
13485                 return -1;
13486             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13487                                           writer->buffer, 0, writer->pos);
13488             Py_DECREF(writer->buffer);
13489             writer->readonly = 0;
13490         }
13491         else {
13492             newbuffer = resize_compact(writer->buffer, newlen);
13493             if (newbuffer == NULL)
13494                 return -1;
13495         }
13496         writer->buffer = newbuffer;
13497     }
13498     else if (maxchar > writer->maxchar) {
13499         assert(!writer->readonly);
13500         newbuffer = PyUnicode_New(writer->size, maxchar);
13501         if (newbuffer == NULL)
13502             return -1;
13503         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13504                                       writer->buffer, 0, writer->pos);
13505         Py_SETREF(writer->buffer, newbuffer);
13506     }
13507     _PyUnicodeWriter_Update(writer);
13508     return 0;
13509 
13510 #undef OVERALLOCATE_FACTOR
13511 }
13512 
13513 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13514 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13515                                      enum PyUnicode_Kind kind)
13516 {
13517     Py_UCS4 maxchar;
13518 
13519     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13520     assert(writer->kind < kind);
13521 
13522     switch (kind)
13523     {
13524     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13525     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13526     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13527     default:
13528         assert(0 && "invalid kind");
13529         return -1;
13530     }
13531 
13532     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13533 }
13534 
13535 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13536 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13537 {
13538     assert(ch <= MAX_UNICODE);
13539     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13540         return -1;
13541     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13542     writer->pos++;
13543     return 0;
13544 }
13545 
13546 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13547 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13548 {
13549     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13550 }
13551 
13552 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13553 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13554 {
13555     Py_UCS4 maxchar;
13556     Py_ssize_t len;
13557 
13558     if (PyUnicode_READY(str) == -1)
13559         return -1;
13560     len = PyUnicode_GET_LENGTH(str);
13561     if (len == 0)
13562         return 0;
13563     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13564     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13565         if (writer->buffer == NULL && !writer->overallocate) {
13566             assert(_PyUnicode_CheckConsistency(str, 1));
13567             writer->readonly = 1;
13568             Py_INCREF(str);
13569             writer->buffer = str;
13570             _PyUnicodeWriter_Update(writer);
13571             writer->pos += len;
13572             return 0;
13573         }
13574         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13575             return -1;
13576     }
13577     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13578                                   str, 0, len);
13579     writer->pos += len;
13580     return 0;
13581 }
13582 
13583 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13584 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13585                                 Py_ssize_t start, Py_ssize_t end)
13586 {
13587     Py_UCS4 maxchar;
13588     Py_ssize_t len;
13589 
13590     if (PyUnicode_READY(str) == -1)
13591         return -1;
13592 
13593     assert(0 <= start);
13594     assert(end <= PyUnicode_GET_LENGTH(str));
13595     assert(start <= end);
13596 
13597     if (end == 0)
13598         return 0;
13599 
13600     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13601         return _PyUnicodeWriter_WriteStr(writer, str);
13602 
13603     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13604         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13605     else
13606         maxchar = writer->maxchar;
13607     len = end - start;
13608 
13609     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13610         return -1;
13611 
13612     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13613                                   str, start, len);
13614     writer->pos += len;
13615     return 0;
13616 }
13617 
13618 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13619 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13620                                   const char *ascii, Py_ssize_t len)
13621 {
13622     if (len == -1)
13623         len = strlen(ascii);
13624 
13625     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13626 
13627     if (writer->buffer == NULL && !writer->overallocate) {
13628         PyObject *str;
13629 
13630         str = _PyUnicode_FromASCII(ascii, len);
13631         if (str == NULL)
13632             return -1;
13633 
13634         writer->readonly = 1;
13635         writer->buffer = str;
13636         _PyUnicodeWriter_Update(writer);
13637         writer->pos += len;
13638         return 0;
13639     }
13640 
13641     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13642         return -1;
13643 
13644     switch (writer->kind)
13645     {
13646     case PyUnicode_1BYTE_KIND:
13647     {
13648         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13649         Py_UCS1 *data = writer->data;
13650 
13651         memcpy(data + writer->pos, str, len);
13652         break;
13653     }
13654     case PyUnicode_2BYTE_KIND:
13655     {
13656         _PyUnicode_CONVERT_BYTES(
13657             Py_UCS1, Py_UCS2,
13658             ascii, ascii + len,
13659             (Py_UCS2 *)writer->data + writer->pos);
13660         break;
13661     }
13662     case PyUnicode_4BYTE_KIND:
13663     {
13664         _PyUnicode_CONVERT_BYTES(
13665             Py_UCS1, Py_UCS4,
13666             ascii, ascii + len,
13667             (Py_UCS4 *)writer->data + writer->pos);
13668         break;
13669     }
13670     default:
13671         assert(0);
13672     }
13673 
13674     writer->pos += len;
13675     return 0;
13676 }
13677 
13678 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13679 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13680                                    const char *str, Py_ssize_t len)
13681 {
13682     Py_UCS4 maxchar;
13683 
13684     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13685     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13686         return -1;
13687     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13688     writer->pos += len;
13689     return 0;
13690 }
13691 
13692 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13693 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13694 {
13695     PyObject *str;
13696 
13697     if (writer->pos == 0) {
13698         Py_CLEAR(writer->buffer);
13699         _Py_RETURN_UNICODE_EMPTY();
13700     }
13701 
13702     str = writer->buffer;
13703     writer->buffer = NULL;
13704 
13705     if (writer->readonly) {
13706         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13707         return str;
13708     }
13709 
13710     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13711         PyObject *str2;
13712         str2 = resize_compact(str, writer->pos);
13713         if (str2 == NULL) {
13714             Py_DECREF(str);
13715             return NULL;
13716         }
13717         str = str2;
13718     }
13719 
13720     assert(_PyUnicode_CheckConsistency(str, 1));
13721     return unicode_result_ready(str);
13722 }
13723 
13724 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13725 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13726 {
13727     Py_CLEAR(writer->buffer);
13728 }
13729 
13730 #include "stringlib/unicode_format.h"
13731 
13732 PyDoc_STRVAR(format__doc__,
13733              "S.format(*args, **kwargs) -> str\n\
13734 \n\
13735 Return a formatted version of S, using substitutions from args and kwargs.\n\
13736 The substitutions are identified by braces ('{' and '}').");
13737 
13738 PyDoc_STRVAR(format_map__doc__,
13739              "S.format_map(mapping) -> str\n\
13740 \n\
13741 Return a formatted version of S, using substitutions from mapping.\n\
13742 The substitutions are identified by braces ('{' and '}').");
13743 
13744 static PyObject *
unicode__format__(PyObject * self,PyObject * args)13745 unicode__format__(PyObject* self, PyObject* args)
13746 {
13747     PyObject *format_spec;
13748     _PyUnicodeWriter writer;
13749     int ret;
13750 
13751     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13752         return NULL;
13753 
13754     if (PyUnicode_READY(self) == -1)
13755         return NULL;
13756     _PyUnicodeWriter_Init(&writer);
13757     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13758                                           self, format_spec, 0,
13759                                           PyUnicode_GET_LENGTH(format_spec));
13760     if (ret == -1) {
13761         _PyUnicodeWriter_Dealloc(&writer);
13762         return NULL;
13763     }
13764     return _PyUnicodeWriter_Finish(&writer);
13765 }
13766 
13767 PyDoc_STRVAR(p_format__doc__,
13768              "S.__format__(format_spec) -> str\n\
13769 \n\
13770 Return a formatted version of S as described by format_spec.");
13771 
13772 static PyObject *
unicode__sizeof__(PyObject * v)13773 unicode__sizeof__(PyObject *v)
13774 {
13775     Py_ssize_t size;
13776 
13777     /* If it's a compact object, account for base structure +
13778        character data. */
13779     if (PyUnicode_IS_COMPACT_ASCII(v))
13780         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13781     else if (PyUnicode_IS_COMPACT(v))
13782         size = sizeof(PyCompactUnicodeObject) +
13783             (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13784     else {
13785         /* If it is a two-block object, account for base object, and
13786            for character block if present. */
13787         size = sizeof(PyUnicodeObject);
13788         if (_PyUnicode_DATA_ANY(v))
13789             size += (PyUnicode_GET_LENGTH(v) + 1) *
13790                 PyUnicode_KIND(v);
13791     }
13792     /* If the wstr pointer is present, account for it unless it is shared
13793        with the data pointer. Check if the data is not shared. */
13794     if (_PyUnicode_HAS_WSTR_MEMORY(v))
13795         size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13796     if (_PyUnicode_HAS_UTF8_MEMORY(v))
13797         size += PyUnicode_UTF8_LENGTH(v) + 1;
13798 
13799     return PyLong_FromSsize_t(size);
13800 }
13801 
13802 PyDoc_STRVAR(sizeof__doc__,
13803              "S.__sizeof__() -> size of S in memory, in bytes");
13804 
13805 static PyObject *
unicode_getnewargs(PyObject * v)13806 unicode_getnewargs(PyObject *v)
13807 {
13808     PyObject *copy = _PyUnicode_Copy(v);
13809     if (!copy)
13810         return NULL;
13811     return Py_BuildValue("(N)", copy);
13812 }
13813 
13814 static PyMethodDef unicode_methods[] = {
13815     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13816     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13817     {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13818     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13819     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13820     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13821     {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13822     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13823     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13824     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13825     {"expandtabs", (PyCFunction) unicode_expandtabs,
13826      METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13827     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13828     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13829     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13830     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13831     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13832     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13833     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13834     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13835     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13836     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13837     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13838     {"splitlines", (PyCFunction) unicode_splitlines,
13839      METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13840     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13841     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13842     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13843     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13844     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13845     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13846     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13847     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13848     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13849     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13850     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13851     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13852     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13853     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13854     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13855     {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13856     {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13857     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13858     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13859     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13860     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13861     UNICODE_MAKETRANS_METHODDEF
13862     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13863 #if 0
13864     /* These methods are just used for debugging the implementation. */
13865     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13866 #endif
13867 
13868     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13869     {NULL, NULL}
13870 };
13871 
13872 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13873 unicode_mod(PyObject *v, PyObject *w)
13874 {
13875     if (!PyUnicode_Check(v))
13876         Py_RETURN_NOTIMPLEMENTED;
13877     return PyUnicode_Format(v, w);
13878 }
13879 
13880 static PyNumberMethods unicode_as_number = {
13881     0,              /*nb_add*/
13882     0,              /*nb_subtract*/
13883     0,              /*nb_multiply*/
13884     unicode_mod,            /*nb_remainder*/
13885 };
13886 
13887 static PySequenceMethods unicode_as_sequence = {
13888     (lenfunc) unicode_length,       /* sq_length */
13889     PyUnicode_Concat,           /* sq_concat */
13890     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13891     (ssizeargfunc) unicode_getitem,     /* sq_item */
13892     0,                  /* sq_slice */
13893     0,                  /* sq_ass_item */
13894     0,                  /* sq_ass_slice */
13895     PyUnicode_Contains,         /* sq_contains */
13896 };
13897 
13898 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13899 unicode_subscript(PyObject* self, PyObject* item)
13900 {
13901     if (PyUnicode_READY(self) == -1)
13902         return NULL;
13903 
13904     if (PyIndex_Check(item)) {
13905         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13906         if (i == -1 && PyErr_Occurred())
13907             return NULL;
13908         if (i < 0)
13909             i += PyUnicode_GET_LENGTH(self);
13910         return unicode_getitem(self, i);
13911     } else if (PySlice_Check(item)) {
13912         Py_ssize_t start, stop, step, slicelength, cur, i;
13913         PyObject *result;
13914         void *src_data, *dest_data;
13915         int src_kind, dest_kind;
13916         Py_UCS4 ch, max_char, kind_limit;
13917 
13918         if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13919                                  &start, &stop, &step, &slicelength) < 0) {
13920             return NULL;
13921         }
13922 
13923         if (slicelength <= 0) {
13924             _Py_RETURN_UNICODE_EMPTY();
13925         } else if (start == 0 && step == 1 &&
13926                    slicelength == PyUnicode_GET_LENGTH(self)) {
13927             return unicode_result_unchanged(self);
13928         } else if (step == 1) {
13929             return PyUnicode_Substring(self,
13930                                        start, start + slicelength);
13931         }
13932         /* General case */
13933         src_kind = PyUnicode_KIND(self);
13934         src_data = PyUnicode_DATA(self);
13935         if (!PyUnicode_IS_ASCII(self)) {
13936             kind_limit = kind_maxchar_limit(src_kind);
13937             max_char = 0;
13938             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13939                 ch = PyUnicode_READ(src_kind, src_data, cur);
13940                 if (ch > max_char) {
13941                     max_char = ch;
13942                     if (max_char >= kind_limit)
13943                         break;
13944                 }
13945             }
13946         }
13947         else
13948             max_char = 127;
13949         result = PyUnicode_New(slicelength, max_char);
13950         if (result == NULL)
13951             return NULL;
13952         dest_kind = PyUnicode_KIND(result);
13953         dest_data = PyUnicode_DATA(result);
13954 
13955         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13956             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13957             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13958         }
13959         assert(_PyUnicode_CheckConsistency(result, 1));
13960         return result;
13961     } else {
13962         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13963         return NULL;
13964     }
13965 }
13966 
13967 static PyMappingMethods unicode_as_mapping = {
13968     (lenfunc)unicode_length,        /* mp_length */
13969     (binaryfunc)unicode_subscript,  /* mp_subscript */
13970     (objobjargproc)0,           /* mp_ass_subscript */
13971 };
13972 
13973 
13974 /* Helpers for PyUnicode_Format() */
13975 
13976 struct unicode_formatter_t {
13977     PyObject *args;
13978     int args_owned;
13979     Py_ssize_t arglen, argidx;
13980     PyObject *dict;
13981 
13982     enum PyUnicode_Kind fmtkind;
13983     Py_ssize_t fmtcnt, fmtpos;
13984     void *fmtdata;
13985     PyObject *fmtstr;
13986 
13987     _PyUnicodeWriter writer;
13988 };
13989 
13990 struct unicode_format_arg_t {
13991     Py_UCS4 ch;
13992     int flags;
13993     Py_ssize_t width;
13994     int prec;
13995     int sign;
13996 };
13997 
13998 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)13999 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14000 {
14001     Py_ssize_t argidx = ctx->argidx;
14002 
14003     if (argidx < ctx->arglen) {
14004         ctx->argidx++;
14005         if (ctx->arglen < 0)
14006             return ctx->args;
14007         else
14008             return PyTuple_GetItem(ctx->args, argidx);
14009     }
14010     PyErr_SetString(PyExc_TypeError,
14011                     "not enough arguments for format string");
14012     return NULL;
14013 }
14014 
14015 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14016 
14017 /* Format a float into the writer if the writer is not NULL, or into *p_output
14018    otherwise.
14019 
14020    Return 0 on success, raise an exception and return -1 on error. */
14021 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14022 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14023             PyObject **p_output,
14024             _PyUnicodeWriter *writer)
14025 {
14026     char *p;
14027     double x;
14028     Py_ssize_t len;
14029     int prec;
14030     int dtoa_flags;
14031 
14032     x = PyFloat_AsDouble(v);
14033     if (x == -1.0 && PyErr_Occurred())
14034         return -1;
14035 
14036     prec = arg->prec;
14037     if (prec < 0)
14038         prec = 6;
14039 
14040     if (arg->flags & F_ALT)
14041         dtoa_flags = Py_DTSF_ALT;
14042     else
14043         dtoa_flags = 0;
14044     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14045     if (p == NULL)
14046         return -1;
14047     len = strlen(p);
14048     if (writer) {
14049         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14050             PyMem_Free(p);
14051             return -1;
14052         }
14053     }
14054     else
14055         *p_output = _PyUnicode_FromASCII(p, len);
14056     PyMem_Free(p);
14057     return 0;
14058 }
14059 
14060 /* formatlong() emulates the format codes d, u, o, x and X, and
14061  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14062  * Python's regular ints.
14063  * Return value:  a new PyUnicodeObject*, or NULL if error.
14064  *     The output string is of the form
14065  *         "-"? ("0x" | "0X")? digit+
14066  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14067  *         set in flags.  The case of hex digits will be correct,
14068  *     There will be at least prec digits, zero-filled on the left if
14069  *         necessary to get that many.
14070  * val          object to be converted
14071  * flags        bitmask of format flags; only F_ALT is looked at
14072  * prec         minimum number of digits; 0-fill on left if needed
14073  * type         a character in [duoxX]; u acts the same as d
14074  *
14075  * CAUTION:  o, x and X conversions on regular ints can never
14076  * produce a '-' sign, but can for Python's unbounded ints.
14077  */
14078 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14079 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14080 {
14081     PyObject *result = NULL;
14082     char *buf;
14083     Py_ssize_t i;
14084     int sign;           /* 1 if '-', else 0 */
14085     int len;            /* number of characters */
14086     Py_ssize_t llen;
14087     int numdigits;      /* len == numnondigits + numdigits */
14088     int numnondigits = 0;
14089 
14090     /* Avoid exceeding SSIZE_T_MAX */
14091     if (prec > INT_MAX-3) {
14092         PyErr_SetString(PyExc_OverflowError,
14093                         "precision too large");
14094         return NULL;
14095     }
14096 
14097     assert(PyLong_Check(val));
14098 
14099     switch (type) {
14100     default:
14101         assert(!"'type' not in [diuoxX]");
14102     case 'd':
14103     case 'i':
14104     case 'u':
14105         /* int and int subclasses should print numerically when a numeric */
14106         /* format code is used (see issue18780) */
14107         result = PyNumber_ToBase(val, 10);
14108         break;
14109     case 'o':
14110         numnondigits = 2;
14111         result = PyNumber_ToBase(val, 8);
14112         break;
14113     case 'x':
14114     case 'X':
14115         numnondigits = 2;
14116         result = PyNumber_ToBase(val, 16);
14117         break;
14118     }
14119     if (!result)
14120         return NULL;
14121 
14122     assert(unicode_modifiable(result));
14123     assert(PyUnicode_IS_READY(result));
14124     assert(PyUnicode_IS_ASCII(result));
14125 
14126     /* To modify the string in-place, there can only be one reference. */
14127     if (Py_REFCNT(result) != 1) {
14128         Py_DECREF(result);
14129         PyErr_BadInternalCall();
14130         return NULL;
14131     }
14132     buf = PyUnicode_DATA(result);
14133     llen = PyUnicode_GET_LENGTH(result);
14134     if (llen > INT_MAX) {
14135         Py_DECREF(result);
14136         PyErr_SetString(PyExc_ValueError,
14137                         "string too large in _PyUnicode_FormatLong");
14138         return NULL;
14139     }
14140     len = (int)llen;
14141     sign = buf[0] == '-';
14142     numnondigits += sign;
14143     numdigits = len - numnondigits;
14144     assert(numdigits > 0);
14145 
14146     /* Get rid of base marker unless F_ALT */
14147     if (((alt) == 0 &&
14148         (type == 'o' || type == 'x' || type == 'X'))) {
14149         assert(buf[sign] == '0');
14150         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14151                buf[sign+1] == 'o');
14152         numnondigits -= 2;
14153         buf += 2;
14154         len -= 2;
14155         if (sign)
14156             buf[0] = '-';
14157         assert(len == numnondigits + numdigits);
14158         assert(numdigits > 0);
14159     }
14160 
14161     /* Fill with leading zeroes to meet minimum width. */
14162     if (prec > numdigits) {
14163         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14164                                 numnondigits + prec);
14165         char *b1;
14166         if (!r1) {
14167             Py_DECREF(result);
14168             return NULL;
14169         }
14170         b1 = PyBytes_AS_STRING(r1);
14171         for (i = 0; i < numnondigits; ++i)
14172             *b1++ = *buf++;
14173         for (i = 0; i < prec - numdigits; i++)
14174             *b1++ = '0';
14175         for (i = 0; i < numdigits; i++)
14176             *b1++ = *buf++;
14177         *b1 = '\0';
14178         Py_DECREF(result);
14179         result = r1;
14180         buf = PyBytes_AS_STRING(result);
14181         len = numnondigits + prec;
14182     }
14183 
14184     /* Fix up case for hex conversions. */
14185     if (type == 'X') {
14186         /* Need to convert all lower case letters to upper case.
14187            and need to convert 0x to 0X (and -0x to -0X). */
14188         for (i = 0; i < len; i++)
14189             if (buf[i] >= 'a' && buf[i] <= 'x')
14190                 buf[i] -= 'a'-'A';
14191     }
14192     if (!PyUnicode_Check(result)
14193         || buf != PyUnicode_DATA(result)) {
14194         PyObject *unicode;
14195         unicode = _PyUnicode_FromASCII(buf, len);
14196         Py_DECREF(result);
14197         result = unicode;
14198     }
14199     else if (len != PyUnicode_GET_LENGTH(result)) {
14200         if (PyUnicode_Resize(&result, len) < 0)
14201             Py_CLEAR(result);
14202     }
14203     return result;
14204 }
14205 
14206 /* Format an integer or a float as an integer.
14207  * Return 1 if the number has been formatted into the writer,
14208  *        0 if the number has been formatted into *p_output
14209  *       -1 and raise an exception on error */
14210 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14211 mainformatlong(PyObject *v,
14212                struct unicode_format_arg_t *arg,
14213                PyObject **p_output,
14214                _PyUnicodeWriter *writer)
14215 {
14216     PyObject *iobj, *res;
14217     char type = (char)arg->ch;
14218 
14219     if (!PyNumber_Check(v))
14220         goto wrongtype;
14221 
14222     /* make sure number is a type of integer for o, x, and X */
14223     if (!PyLong_Check(v)) {
14224         if (type == 'o' || type == 'x' || type == 'X') {
14225             iobj = PyNumber_Index(v);
14226             if (iobj == NULL) {
14227                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14228                     goto wrongtype;
14229                 return -1;
14230             }
14231         }
14232         else {
14233             iobj = PyNumber_Long(v);
14234             if (iobj == NULL ) {
14235                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14236                     goto wrongtype;
14237                 return -1;
14238             }
14239         }
14240         assert(PyLong_Check(iobj));
14241     }
14242     else {
14243         iobj = v;
14244         Py_INCREF(iobj);
14245     }
14246 
14247     if (PyLong_CheckExact(v)
14248         && arg->width == -1 && arg->prec == -1
14249         && !(arg->flags & (F_SIGN | F_BLANK))
14250         && type != 'X')
14251     {
14252         /* Fast path */
14253         int alternate = arg->flags & F_ALT;
14254         int base;
14255 
14256         switch(type)
14257         {
14258             default:
14259                 assert(0 && "'type' not in [diuoxX]");
14260             case 'd':
14261             case 'i':
14262             case 'u':
14263                 base = 10;
14264                 break;
14265             case 'o':
14266                 base = 8;
14267                 break;
14268             case 'x':
14269             case 'X':
14270                 base = 16;
14271                 break;
14272         }
14273 
14274         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14275             Py_DECREF(iobj);
14276             return -1;
14277         }
14278         Py_DECREF(iobj);
14279         return 1;
14280     }
14281 
14282     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14283     Py_DECREF(iobj);
14284     if (res == NULL)
14285         return -1;
14286     *p_output = res;
14287     return 0;
14288 
14289 wrongtype:
14290     switch(type)
14291     {
14292         case 'o':
14293         case 'x':
14294         case 'X':
14295             PyErr_Format(PyExc_TypeError,
14296                     "%%%c format: an integer is required, "
14297                     "not %.200s",
14298                     type, Py_TYPE(v)->tp_name);
14299             break;
14300         default:
14301             PyErr_Format(PyExc_TypeError,
14302                     "%%%c format: a number is required, "
14303                     "not %.200s",
14304                     type, Py_TYPE(v)->tp_name);
14305             break;
14306     }
14307     return -1;
14308 }
14309 
14310 static Py_UCS4
formatchar(PyObject * v)14311 formatchar(PyObject *v)
14312 {
14313     /* presume that the buffer is at least 3 characters long */
14314     if (PyUnicode_Check(v)) {
14315         if (PyUnicode_GET_LENGTH(v) == 1) {
14316             return PyUnicode_READ_CHAR(v, 0);
14317         }
14318         goto onError;
14319     }
14320     else {
14321         PyObject *iobj;
14322         long x;
14323         /* make sure number is a type of integer */
14324         if (!PyLong_Check(v)) {
14325             iobj = PyNumber_Index(v);
14326             if (iobj == NULL) {
14327                 goto onError;
14328             }
14329             x = PyLong_AsLong(iobj);
14330             Py_DECREF(iobj);
14331         }
14332         else {
14333             x = PyLong_AsLong(v);
14334         }
14335         if (x == -1 && PyErr_Occurred())
14336             goto onError;
14337 
14338         if (x < 0 || x > MAX_UNICODE) {
14339             PyErr_SetString(PyExc_OverflowError,
14340                             "%c arg not in range(0x110000)");
14341             return (Py_UCS4) -1;
14342         }
14343 
14344         return (Py_UCS4) x;
14345     }
14346 
14347   onError:
14348     PyErr_SetString(PyExc_TypeError,
14349                     "%c requires int or char");
14350     return (Py_UCS4) -1;
14351 }
14352 
14353 /* Parse options of an argument: flags, width, precision.
14354    Handle also "%(name)" syntax.
14355 
14356    Return 0 if the argument has been formatted into arg->str.
14357    Return 1 if the argument has been written into ctx->writer,
14358    Raise an exception and return -1 on error. */
14359 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14360 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14361                          struct unicode_format_arg_t *arg)
14362 {
14363 #define FORMAT_READ(ctx) \
14364         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14365 
14366     PyObject *v;
14367 
14368     if (arg->ch == '(') {
14369         /* Get argument value from a dictionary. Example: "%(name)s". */
14370         Py_ssize_t keystart;
14371         Py_ssize_t keylen;
14372         PyObject *key;
14373         int pcount = 1;
14374 
14375         if (ctx->dict == NULL) {
14376             PyErr_SetString(PyExc_TypeError,
14377                             "format requires a mapping");
14378             return -1;
14379         }
14380         ++ctx->fmtpos;
14381         --ctx->fmtcnt;
14382         keystart = ctx->fmtpos;
14383         /* Skip over balanced parentheses */
14384         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14385             arg->ch = FORMAT_READ(ctx);
14386             if (arg->ch == ')')
14387                 --pcount;
14388             else if (arg->ch == '(')
14389                 ++pcount;
14390             ctx->fmtpos++;
14391         }
14392         keylen = ctx->fmtpos - keystart - 1;
14393         if (ctx->fmtcnt < 0 || pcount > 0) {
14394             PyErr_SetString(PyExc_ValueError,
14395                             "incomplete format key");
14396             return -1;
14397         }
14398         key = PyUnicode_Substring(ctx->fmtstr,
14399                                   keystart, keystart + keylen);
14400         if (key == NULL)
14401             return -1;
14402         if (ctx->args_owned) {
14403             ctx->args_owned = 0;
14404             Py_DECREF(ctx->args);
14405         }
14406         ctx->args = PyObject_GetItem(ctx->dict, key);
14407         Py_DECREF(key);
14408         if (ctx->args == NULL)
14409             return -1;
14410         ctx->args_owned = 1;
14411         ctx->arglen = -1;
14412         ctx->argidx = -2;
14413     }
14414 
14415     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14416     while (--ctx->fmtcnt >= 0) {
14417         arg->ch = FORMAT_READ(ctx);
14418         ctx->fmtpos++;
14419         switch (arg->ch) {
14420         case '-': arg->flags |= F_LJUST; continue;
14421         case '+': arg->flags |= F_SIGN; continue;
14422         case ' ': arg->flags |= F_BLANK; continue;
14423         case '#': arg->flags |= F_ALT; continue;
14424         case '0': arg->flags |= F_ZERO; continue;
14425         }
14426         break;
14427     }
14428 
14429     /* Parse width. Example: "%10s" => width=10 */
14430     if (arg->ch == '*') {
14431         v = unicode_format_getnextarg(ctx);
14432         if (v == NULL)
14433             return -1;
14434         if (!PyLong_Check(v)) {
14435             PyErr_SetString(PyExc_TypeError,
14436                             "* wants int");
14437             return -1;
14438         }
14439         arg->width = PyLong_AsSsize_t(v);
14440         if (arg->width == -1 && PyErr_Occurred())
14441             return -1;
14442         if (arg->width < 0) {
14443             arg->flags |= F_LJUST;
14444             arg->width = -arg->width;
14445         }
14446         if (--ctx->fmtcnt >= 0) {
14447             arg->ch = FORMAT_READ(ctx);
14448             ctx->fmtpos++;
14449         }
14450     }
14451     else if (arg->ch >= '0' && arg->ch <= '9') {
14452         arg->width = arg->ch - '0';
14453         while (--ctx->fmtcnt >= 0) {
14454             arg->ch = FORMAT_READ(ctx);
14455             ctx->fmtpos++;
14456             if (arg->ch < '0' || arg->ch > '9')
14457                 break;
14458             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14459                mixing signed and unsigned comparison. Since arg->ch is between
14460                '0' and '9', casting to int is safe. */
14461             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14462                 PyErr_SetString(PyExc_ValueError,
14463                                 "width too big");
14464                 return -1;
14465             }
14466             arg->width = arg->width*10 + (arg->ch - '0');
14467         }
14468     }
14469 
14470     /* Parse precision. Example: "%.3f" => prec=3 */
14471     if (arg->ch == '.') {
14472         arg->prec = 0;
14473         if (--ctx->fmtcnt >= 0) {
14474             arg->ch = FORMAT_READ(ctx);
14475             ctx->fmtpos++;
14476         }
14477         if (arg->ch == '*') {
14478             v = unicode_format_getnextarg(ctx);
14479             if (v == NULL)
14480                 return -1;
14481             if (!PyLong_Check(v)) {
14482                 PyErr_SetString(PyExc_TypeError,
14483                                 "* wants int");
14484                 return -1;
14485             }
14486             arg->prec = _PyLong_AsInt(v);
14487             if (arg->prec == -1 && PyErr_Occurred())
14488                 return -1;
14489             if (arg->prec < 0)
14490                 arg->prec = 0;
14491             if (--ctx->fmtcnt >= 0) {
14492                 arg->ch = FORMAT_READ(ctx);
14493                 ctx->fmtpos++;
14494             }
14495         }
14496         else if (arg->ch >= '0' && arg->ch <= '9') {
14497             arg->prec = arg->ch - '0';
14498             while (--ctx->fmtcnt >= 0) {
14499                 arg->ch = FORMAT_READ(ctx);
14500                 ctx->fmtpos++;
14501                 if (arg->ch < '0' || arg->ch > '9')
14502                     break;
14503                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14504                     PyErr_SetString(PyExc_ValueError,
14505                                     "precision too big");
14506                     return -1;
14507                 }
14508                 arg->prec = arg->prec*10 + (arg->ch - '0');
14509             }
14510         }
14511     }
14512 
14513     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14514     if (ctx->fmtcnt >= 0) {
14515         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14516             if (--ctx->fmtcnt >= 0) {
14517                 arg->ch = FORMAT_READ(ctx);
14518                 ctx->fmtpos++;
14519             }
14520         }
14521     }
14522     if (ctx->fmtcnt < 0) {
14523         PyErr_SetString(PyExc_ValueError,
14524                         "incomplete format");
14525         return -1;
14526     }
14527     return 0;
14528 
14529 #undef FORMAT_READ
14530 }
14531 
14532 /* Format one argument. Supported conversion specifiers:
14533 
14534    - "s", "r", "a": any type
14535    - "i", "d", "u": int or float
14536    - "o", "x", "X": int
14537    - "e", "E", "f", "F", "g", "G": float
14538    - "c": int or str (1 character)
14539 
14540    When possible, the output is written directly into the Unicode writer
14541    (ctx->writer). A string is created when padding is required.
14542 
14543    Return 0 if the argument has been formatted into *p_str,
14544           1 if the argument has been written into ctx->writer,
14545          -1 on error. */
14546 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14547 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14548                           struct unicode_format_arg_t *arg,
14549                           PyObject **p_str)
14550 {
14551     PyObject *v;
14552     _PyUnicodeWriter *writer = &ctx->writer;
14553 
14554     if (ctx->fmtcnt == 0)
14555         ctx->writer.overallocate = 0;
14556 
14557     if (arg->ch == '%') {
14558         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14559             return -1;
14560         return 1;
14561     }
14562 
14563     v = unicode_format_getnextarg(ctx);
14564     if (v == NULL)
14565         return -1;
14566 
14567 
14568     switch (arg->ch) {
14569     case 's':
14570     case 'r':
14571     case 'a':
14572         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14573             /* Fast path */
14574             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14575                 return -1;
14576             return 1;
14577         }
14578 
14579         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14580             *p_str = v;
14581             Py_INCREF(*p_str);
14582         }
14583         else {
14584             if (arg->ch == 's')
14585                 *p_str = PyObject_Str(v);
14586             else if (arg->ch == 'r')
14587                 *p_str = PyObject_Repr(v);
14588             else
14589                 *p_str = PyObject_ASCII(v);
14590         }
14591         break;
14592 
14593     case 'i':
14594     case 'd':
14595     case 'u':
14596     case 'o':
14597     case 'x':
14598     case 'X':
14599     {
14600         int ret = mainformatlong(v, arg, p_str, writer);
14601         if (ret != 0)
14602             return ret;
14603         arg->sign = 1;
14604         break;
14605     }
14606 
14607     case 'e':
14608     case 'E':
14609     case 'f':
14610     case 'F':
14611     case 'g':
14612     case 'G':
14613         if (arg->width == -1 && arg->prec == -1
14614             && !(arg->flags & (F_SIGN | F_BLANK)))
14615         {
14616             /* Fast path */
14617             if (formatfloat(v, arg, NULL, writer) == -1)
14618                 return -1;
14619             return 1;
14620         }
14621 
14622         arg->sign = 1;
14623         if (formatfloat(v, arg, p_str, NULL) == -1)
14624             return -1;
14625         break;
14626 
14627     case 'c':
14628     {
14629         Py_UCS4 ch = formatchar(v);
14630         if (ch == (Py_UCS4) -1)
14631             return -1;
14632         if (arg->width == -1 && arg->prec == -1) {
14633             /* Fast path */
14634             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14635                 return -1;
14636             return 1;
14637         }
14638         *p_str = PyUnicode_FromOrdinal(ch);
14639         break;
14640     }
14641 
14642     default:
14643         PyErr_Format(PyExc_ValueError,
14644                      "unsupported format character '%c' (0x%x) "
14645                      "at index %zd",
14646                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14647                      (int)arg->ch,
14648                      ctx->fmtpos - 1);
14649         return -1;
14650     }
14651     if (*p_str == NULL)
14652         return -1;
14653     assert (PyUnicode_Check(*p_str));
14654     return 0;
14655 }
14656 
14657 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)14658 unicode_format_arg_output(struct unicode_formatter_t *ctx,
14659                           struct unicode_format_arg_t *arg,
14660                           PyObject *str)
14661 {
14662     Py_ssize_t len;
14663     enum PyUnicode_Kind kind;
14664     void *pbuf;
14665     Py_ssize_t pindex;
14666     Py_UCS4 signchar;
14667     Py_ssize_t buflen;
14668     Py_UCS4 maxchar;
14669     Py_ssize_t sublen;
14670     _PyUnicodeWriter *writer = &ctx->writer;
14671     Py_UCS4 fill;
14672 
14673     fill = ' ';
14674     if (arg->sign && arg->flags & F_ZERO)
14675         fill = '0';
14676 
14677     if (PyUnicode_READY(str) == -1)
14678         return -1;
14679 
14680     len = PyUnicode_GET_LENGTH(str);
14681     if ((arg->width == -1 || arg->width <= len)
14682         && (arg->prec == -1 || arg->prec >= len)
14683         && !(arg->flags & (F_SIGN | F_BLANK)))
14684     {
14685         /* Fast path */
14686         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14687             return -1;
14688         return 0;
14689     }
14690 
14691     /* Truncate the string for "s", "r" and "a" formats
14692        if the precision is set */
14693     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14694         if (arg->prec >= 0 && len > arg->prec)
14695             len = arg->prec;
14696     }
14697 
14698     /* Adjust sign and width */
14699     kind = PyUnicode_KIND(str);
14700     pbuf = PyUnicode_DATA(str);
14701     pindex = 0;
14702     signchar = '\0';
14703     if (arg->sign) {
14704         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14705         if (ch == '-' || ch == '+') {
14706             signchar = ch;
14707             len--;
14708             pindex++;
14709         }
14710         else if (arg->flags & F_SIGN)
14711             signchar = '+';
14712         else if (arg->flags & F_BLANK)
14713             signchar = ' ';
14714         else
14715             arg->sign = 0;
14716     }
14717     if (arg->width < len)
14718         arg->width = len;
14719 
14720     /* Prepare the writer */
14721     maxchar = writer->maxchar;
14722     if (!(arg->flags & F_LJUST)) {
14723         if (arg->sign) {
14724             if ((arg->width-1) > len)
14725                 maxchar = Py_MAX(maxchar, fill);
14726         }
14727         else {
14728             if (arg->width > len)
14729                 maxchar = Py_MAX(maxchar, fill);
14730         }
14731     }
14732     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14733         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14734         maxchar = Py_MAX(maxchar, strmaxchar);
14735     }
14736 
14737     buflen = arg->width;
14738     if (arg->sign && len == arg->width)
14739         buflen++;
14740     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14741         return -1;
14742 
14743     /* Write the sign if needed */
14744     if (arg->sign) {
14745         if (fill != ' ') {
14746             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14747             writer->pos += 1;
14748         }
14749         if (arg->width > len)
14750             arg->width--;
14751     }
14752 
14753     /* Write the numeric prefix for "x", "X" and "o" formats
14754        if the alternate form is used.
14755        For example, write "0x" for the "%#x" format. */
14756     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14757         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14758         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14759         if (fill != ' ') {
14760             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14761             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14762             writer->pos += 2;
14763             pindex += 2;
14764         }
14765         arg->width -= 2;
14766         if (arg->width < 0)
14767             arg->width = 0;
14768         len -= 2;
14769     }
14770 
14771     /* Pad left with the fill character if needed */
14772     if (arg->width > len && !(arg->flags & F_LJUST)) {
14773         sublen = arg->width - len;
14774         FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14775         writer->pos += sublen;
14776         arg->width = len;
14777     }
14778 
14779     /* If padding with spaces: write sign if needed and/or numeric prefix if
14780        the alternate form is used */
14781     if (fill == ' ') {
14782         if (arg->sign) {
14783             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14784             writer->pos += 1;
14785         }
14786         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14787             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14788             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14789             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14790             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14791             writer->pos += 2;
14792             pindex += 2;
14793         }
14794     }
14795 
14796     /* Write characters */
14797     if (len) {
14798         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14799                                       str, pindex, len);
14800         writer->pos += len;
14801     }
14802 
14803     /* Pad right with the fill character if needed */
14804     if (arg->width > len) {
14805         sublen = arg->width - len;
14806         FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14807         writer->pos += sublen;
14808     }
14809     return 0;
14810 }
14811 
14812 /* Helper of PyUnicode_Format(): format one arg.
14813    Return 0 on success, raise an exception and return -1 on error. */
14814 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14815 unicode_format_arg(struct unicode_formatter_t *ctx)
14816 {
14817     struct unicode_format_arg_t arg;
14818     PyObject *str;
14819     int ret;
14820 
14821     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14822     arg.flags = 0;
14823     arg.width = -1;
14824     arg.prec = -1;
14825     arg.sign = 0;
14826     str = NULL;
14827 
14828     ret = unicode_format_arg_parse(ctx, &arg);
14829     if (ret == -1)
14830         return -1;
14831 
14832     ret = unicode_format_arg_format(ctx, &arg, &str);
14833     if (ret == -1)
14834         return -1;
14835 
14836     if (ret != 1) {
14837         ret = unicode_format_arg_output(ctx, &arg, str);
14838         Py_DECREF(str);
14839         if (ret == -1)
14840             return -1;
14841     }
14842 
14843     if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14844         PyErr_SetString(PyExc_TypeError,
14845                         "not all arguments converted during string formatting");
14846         return -1;
14847     }
14848     return 0;
14849 }
14850 
14851 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14852 PyUnicode_Format(PyObject *format, PyObject *args)
14853 {
14854     struct unicode_formatter_t ctx;
14855 
14856     if (format == NULL || args == NULL) {
14857         PyErr_BadInternalCall();
14858         return NULL;
14859     }
14860 
14861     if (ensure_unicode(format) < 0)
14862         return NULL;
14863 
14864     ctx.fmtstr = format;
14865     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14866     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14867     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14868     ctx.fmtpos = 0;
14869 
14870     _PyUnicodeWriter_Init(&ctx.writer);
14871     ctx.writer.min_length = ctx.fmtcnt + 100;
14872     ctx.writer.overallocate = 1;
14873 
14874     if (PyTuple_Check(args)) {
14875         ctx.arglen = PyTuple_Size(args);
14876         ctx.argidx = 0;
14877     }
14878     else {
14879         ctx.arglen = -1;
14880         ctx.argidx = -2;
14881     }
14882     ctx.args_owned = 0;
14883     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14884         ctx.dict = args;
14885     else
14886         ctx.dict = NULL;
14887     ctx.args = args;
14888 
14889     while (--ctx.fmtcnt >= 0) {
14890         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14891             Py_ssize_t nonfmtpos;
14892 
14893             nonfmtpos = ctx.fmtpos++;
14894             while (ctx.fmtcnt >= 0 &&
14895                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14896                 ctx.fmtpos++;
14897                 ctx.fmtcnt--;
14898             }
14899             if (ctx.fmtcnt < 0) {
14900                 ctx.fmtpos--;
14901                 ctx.writer.overallocate = 0;
14902             }
14903 
14904             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14905                                                 nonfmtpos, ctx.fmtpos) < 0)
14906                 goto onError;
14907         }
14908         else {
14909             ctx.fmtpos++;
14910             if (unicode_format_arg(&ctx) == -1)
14911                 goto onError;
14912         }
14913     }
14914 
14915     if (ctx.argidx < ctx.arglen && !ctx.dict) {
14916         PyErr_SetString(PyExc_TypeError,
14917                         "not all arguments converted during string formatting");
14918         goto onError;
14919     }
14920 
14921     if (ctx.args_owned) {
14922         Py_DECREF(ctx.args);
14923     }
14924     return _PyUnicodeWriter_Finish(&ctx.writer);
14925 
14926   onError:
14927     _PyUnicodeWriter_Dealloc(&ctx.writer);
14928     if (ctx.args_owned) {
14929         Py_DECREF(ctx.args);
14930     }
14931     return NULL;
14932 }
14933 
14934 static PyObject *
14935 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14936 
14937 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)14938 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14939 {
14940     PyObject *x = NULL;
14941     static char *kwlist[] = {"object", "encoding", "errors", 0};
14942     char *encoding = NULL;
14943     char *errors = NULL;
14944 
14945     if (type != &PyUnicode_Type)
14946         return unicode_subtype_new(type, args, kwds);
14947     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14948                                      kwlist, &x, &encoding, &errors))
14949         return NULL;
14950     if (x == NULL)
14951         _Py_RETURN_UNICODE_EMPTY();
14952     if (encoding == NULL && errors == NULL)
14953         return PyObject_Str(x);
14954     else
14955         return PyUnicode_FromEncodedObject(x, encoding, errors);
14956 }
14957 
14958 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)14959 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14960 {
14961     PyObject *unicode, *self;
14962     Py_ssize_t length, char_size;
14963     int share_wstr, share_utf8;
14964     unsigned int kind;
14965     void *data;
14966 
14967     assert(PyType_IsSubtype(type, &PyUnicode_Type));
14968 
14969     unicode = unicode_new(&PyUnicode_Type, args, kwds);
14970     if (unicode == NULL)
14971         return NULL;
14972     assert(_PyUnicode_CHECK(unicode));
14973     if (PyUnicode_READY(unicode) == -1) {
14974         Py_DECREF(unicode);
14975         return NULL;
14976     }
14977 
14978     self = type->tp_alloc(type, 0);
14979     if (self == NULL) {
14980         Py_DECREF(unicode);
14981         return NULL;
14982     }
14983     kind = PyUnicode_KIND(unicode);
14984     length = PyUnicode_GET_LENGTH(unicode);
14985 
14986     _PyUnicode_LENGTH(self) = length;
14987 #ifdef Py_DEBUG
14988     _PyUnicode_HASH(self) = -1;
14989 #else
14990     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14991 #endif
14992     _PyUnicode_STATE(self).interned = 0;
14993     _PyUnicode_STATE(self).kind = kind;
14994     _PyUnicode_STATE(self).compact = 0;
14995     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14996     _PyUnicode_STATE(self).ready = 1;
14997     _PyUnicode_WSTR(self) = NULL;
14998     _PyUnicode_UTF8_LENGTH(self) = 0;
14999     _PyUnicode_UTF8(self) = NULL;
15000     _PyUnicode_WSTR_LENGTH(self) = 0;
15001     _PyUnicode_DATA_ANY(self) = NULL;
15002 
15003     share_utf8 = 0;
15004     share_wstr = 0;
15005     if (kind == PyUnicode_1BYTE_KIND) {
15006         char_size = 1;
15007         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15008             share_utf8 = 1;
15009     }
15010     else if (kind == PyUnicode_2BYTE_KIND) {
15011         char_size = 2;
15012         if (sizeof(wchar_t) == 2)
15013             share_wstr = 1;
15014     }
15015     else {
15016         assert(kind == PyUnicode_4BYTE_KIND);
15017         char_size = 4;
15018         if (sizeof(wchar_t) == 4)
15019             share_wstr = 1;
15020     }
15021 
15022     /* Ensure we won't overflow the length. */
15023     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15024         PyErr_NoMemory();
15025         goto onError;
15026     }
15027     data = PyObject_MALLOC((length + 1) * char_size);
15028     if (data == NULL) {
15029         PyErr_NoMemory();
15030         goto onError;
15031     }
15032 
15033     _PyUnicode_DATA_ANY(self) = data;
15034     if (share_utf8) {
15035         _PyUnicode_UTF8_LENGTH(self) = length;
15036         _PyUnicode_UTF8(self) = data;
15037     }
15038     if (share_wstr) {
15039         _PyUnicode_WSTR_LENGTH(self) = length;
15040         _PyUnicode_WSTR(self) = (wchar_t *)data;
15041     }
15042 
15043     memcpy(data, PyUnicode_DATA(unicode),
15044               kind * (length + 1));
15045     assert(_PyUnicode_CheckConsistency(self, 1));
15046 #ifdef Py_DEBUG
15047     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15048 #endif
15049     Py_DECREF(unicode);
15050     return self;
15051 
15052 onError:
15053     Py_DECREF(unicode);
15054     Py_DECREF(self);
15055     return NULL;
15056 }
15057 
15058 PyDoc_STRVAR(unicode_doc,
15059 "str(object='') -> str\n\
15060 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15061 \n\
15062 Create a new string object from the given object. If encoding or\n\
15063 errors is specified, then the object must expose a data buffer\n\
15064 that will be decoded using the given encoding and error handler.\n\
15065 Otherwise, returns the result of object.__str__() (if defined)\n\
15066 or repr(object).\n\
15067 encoding defaults to sys.getdefaultencoding().\n\
15068 errors defaults to 'strict'.");
15069 
15070 static PyObject *unicode_iter(PyObject *seq);
15071 
15072 PyTypeObject PyUnicode_Type = {
15073     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15074     "str",              /* tp_name */
15075     sizeof(PyUnicodeObject),        /* tp_size */
15076     0,                  /* tp_itemsize */
15077     /* Slots */
15078     (destructor)unicode_dealloc,    /* tp_dealloc */
15079     0,                  /* tp_print */
15080     0,                  /* tp_getattr */
15081     0,                  /* tp_setattr */
15082     0,                  /* tp_reserved */
15083     unicode_repr,           /* tp_repr */
15084     &unicode_as_number,         /* tp_as_number */
15085     &unicode_as_sequence,       /* tp_as_sequence */
15086     &unicode_as_mapping,        /* tp_as_mapping */
15087     (hashfunc) unicode_hash,        /* tp_hash*/
15088     0,                  /* tp_call*/
15089     (reprfunc) unicode_str,     /* tp_str */
15090     PyObject_GenericGetAttr,        /* tp_getattro */
15091     0,                  /* tp_setattro */
15092     0,                  /* tp_as_buffer */
15093     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15094     Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
15095     unicode_doc,            /* tp_doc */
15096     0,                  /* tp_traverse */
15097     0,                  /* tp_clear */
15098     PyUnicode_RichCompare,      /* tp_richcompare */
15099     0,                  /* tp_weaklistoffset */
15100     unicode_iter,           /* tp_iter */
15101     0,                  /* tp_iternext */
15102     unicode_methods,            /* tp_methods */
15103     0,                  /* tp_members */
15104     0,                  /* tp_getset */
15105     &PyBaseObject_Type,         /* tp_base */
15106     0,                  /* tp_dict */
15107     0,                  /* tp_descr_get */
15108     0,                  /* tp_descr_set */
15109     0,                  /* tp_dictoffset */
15110     0,                  /* tp_init */
15111     0,                  /* tp_alloc */
15112     unicode_new,            /* tp_new */
15113     PyObject_Del,           /* tp_free */
15114 };
15115 
15116 /* Initialize the Unicode implementation */
15117 
_PyUnicode_Init(void)15118 int _PyUnicode_Init(void)
15119 {
15120     /* XXX - move this array to unicodectype.c ? */
15121     Py_UCS2 linebreak[] = {
15122         0x000A, /* LINE FEED */
15123         0x000D, /* CARRIAGE RETURN */
15124         0x001C, /* FILE SEPARATOR */
15125         0x001D, /* GROUP SEPARATOR */
15126         0x001E, /* RECORD SEPARATOR */
15127         0x0085, /* NEXT LINE */
15128         0x2028, /* LINE SEPARATOR */
15129         0x2029, /* PARAGRAPH SEPARATOR */
15130     };
15131 
15132     /* Init the implementation */
15133     _Py_INCREF_UNICODE_EMPTY();
15134     if (!unicode_empty)
15135         Py_FatalError("Can't create empty string");
15136     Py_DECREF(unicode_empty);
15137 
15138     if (PyType_Ready(&PyUnicode_Type) < 0)
15139         Py_FatalError("Can't initialize 'unicode'");
15140 
15141     /* initialize the linebreak bloom filter */
15142     bloom_linebreak = make_bloom_mask(
15143         PyUnicode_2BYTE_KIND, linebreak,
15144         Py_ARRAY_LENGTH(linebreak));
15145 
15146     if (PyType_Ready(&EncodingMapType) < 0)
15147          Py_FatalError("Can't initialize encoding map type");
15148 
15149     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15150         Py_FatalError("Can't initialize field name iterator type");
15151 
15152     if (PyType_Ready(&PyFormatterIter_Type) < 0)
15153         Py_FatalError("Can't initialize formatter iter type");
15154 
15155     return 0;
15156 }
15157 
15158 /* Finalize the Unicode implementation */
15159 
/* This implementation does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15165 
15166 void
_PyUnicode_Fini(void)15167 _PyUnicode_Fini(void)
15168 {
15169     int i;
15170 
15171     Py_CLEAR(unicode_empty);
15172 
15173     for (i = 0; i < 256; i++)
15174         Py_CLEAR(unicode_latin1[i]);
15175     _PyUnicode_ClearStaticStrings();
15176     (void)PyUnicode_ClearFreeList();
15177 }
15178 
15179 void
PyUnicode_InternInPlace(PyObject ** p)15180 PyUnicode_InternInPlace(PyObject **p)
15181 {
15182     PyObject *s = *p;
15183     PyObject *t;
15184 #ifdef Py_DEBUG
15185     assert(s != NULL);
15186     assert(_PyUnicode_CHECK(s));
15187 #else
15188     if (s == NULL || !PyUnicode_Check(s))
15189         return;
15190 #endif
15191     /* If it's a subclass, we don't really know what putting
15192        it in the interned dict might do. */
15193     if (!PyUnicode_CheckExact(s))
15194         return;
15195     if (PyUnicode_CHECK_INTERNED(s))
15196         return;
15197     if (interned == NULL) {
15198         interned = PyDict_New();
15199         if (interned == NULL) {
15200             PyErr_Clear(); /* Don't leave an exception */
15201             return;
15202         }
15203     }
15204     Py_ALLOW_RECURSION
15205     t = PyDict_SetDefault(interned, s, s);
15206     Py_END_ALLOW_RECURSION
15207     if (t == NULL) {
15208         PyErr_Clear();
15209         return;
15210     }
15211     if (t != s) {
15212         Py_INCREF(t);
15213         Py_SETREF(*p, t);
15214         return;
15215     }
15216     /* The two references in interned are not counted by refcnt.
15217        The deallocator will take care of this */
15218     Py_REFCNT(s) -= 2;
15219     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15220 }
15221 
15222 void
PyUnicode_InternImmortal(PyObject ** p)15223 PyUnicode_InternImmortal(PyObject **p)
15224 {
15225     PyUnicode_InternInPlace(p);
15226     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15227         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15228         Py_INCREF(*p);
15229     }
15230 }
15231 
15232 PyObject *
PyUnicode_InternFromString(const char * cp)15233 PyUnicode_InternFromString(const char *cp)
15234 {
15235     PyObject *s = PyUnicode_FromString(cp);
15236     if (s == NULL)
15237         return NULL;
15238     PyUnicode_InternInPlace(&s);
15239     return s;
15240 }
15241 
15242 void
_Py_ReleaseInternedUnicodeStrings(void)15243 _Py_ReleaseInternedUnicodeStrings(void)
15244 {
15245     PyObject *keys;
15246     PyObject *s;
15247     Py_ssize_t i, n;
15248     Py_ssize_t immortal_size = 0, mortal_size = 0;
15249 
15250     if (interned == NULL || !PyDict_Check(interned))
15251         return;
15252     keys = PyDict_Keys(interned);
15253     if (keys == NULL || !PyList_Check(keys)) {
15254         PyErr_Clear();
15255         return;
15256     }
15257 
15258     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15259        detector, interned unicode strings are not forcibly deallocated;
15260        rather, we give them their stolen references back, and then clear
15261        and DECREF the interned dict. */
15262 
15263     n = PyList_GET_SIZE(keys);
15264     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15265             n);
15266     for (i = 0; i < n; i++) {
15267         s = PyList_GET_ITEM(keys, i);
15268         if (PyUnicode_READY(s) == -1) {
15269             assert(0 && "could not ready string");
15270             fprintf(stderr, "could not ready string\n");
15271         }
15272         switch (PyUnicode_CHECK_INTERNED(s)) {
15273         case SSTATE_NOT_INTERNED:
15274             /* XXX Shouldn't happen */
15275             break;
15276         case SSTATE_INTERNED_IMMORTAL:
15277             Py_REFCNT(s) += 1;
15278             immortal_size += PyUnicode_GET_LENGTH(s);
15279             break;
15280         case SSTATE_INTERNED_MORTAL:
15281             Py_REFCNT(s) += 2;
15282             mortal_size += PyUnicode_GET_LENGTH(s);
15283             break;
15284         default:
15285             Py_FatalError("Inconsistent interned string state.");
15286         }
15287         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15288     }
15289     fprintf(stderr, "total size of all interned strings: "
15290             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15291             "mortal/immortal\n", mortal_size, immortal_size);
15292     Py_DECREF(keys);
15293     PyDict_Clear(interned);
15294     Py_CLEAR(interned);
15295 }
15296 
15297 
15298 /********************* Unicode Iterator **************************/
15299 
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15305 
/* tp_dealloc of the str iterator: untrack the object from the GC, drop the
   reference to the iterated string (if any) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15313 
/* tp_traverse of the str iterator: the iterated string is the only object
   reference it holds. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15320 
15321 static PyObject *
unicodeiter_next(unicodeiterobject * it)15322 unicodeiter_next(unicodeiterobject *it)
15323 {
15324     PyObject *seq, *item;
15325 
15326     assert(it != NULL);
15327     seq = it->it_seq;
15328     if (seq == NULL)
15329         return NULL;
15330     assert(_PyUnicode_CHECK(seq));
15331 
15332     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15333         int kind = PyUnicode_KIND(seq);
15334         void *data = PyUnicode_DATA(seq);
15335         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15336         item = PyUnicode_FromOrdinal(chr);
15337         if (item != NULL)
15338             ++it->it_index;
15339         return item;
15340     }
15341 
15342     it->it_seq = NULL;
15343     Py_DECREF(seq);
15344     return NULL;
15345 }
15346 
15347 static PyObject *
unicodeiter_len(unicodeiterobject * it)15348 unicodeiter_len(unicodeiterobject *it)
15349 {
15350     Py_ssize_t len = 0;
15351     if (it->it_seq)
15352         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15353     return PyLong_FromSsize_t(len);
15354 }
15355 
15356 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15357 
15358 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15359 unicodeiter_reduce(unicodeiterobject *it)
15360 {
15361     if (it->it_seq != NULL) {
15362         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15363                              it->it_seq, it->it_index);
15364     } else {
15365         PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15366         if (u == NULL)
15367             return NULL;
15368         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15369     }
15370 }
15371 
15372 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15373 
15374 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15375 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15376 {
15377     Py_ssize_t index = PyLong_AsSsize_t(state);
15378     if (index == -1 && PyErr_Occurred())
15379         return NULL;
15380     if (it->it_seq != NULL) {
15381         if (index < 0)
15382             index = 0;
15383         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15384             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15385         it->it_index = index;
15386     }
15387     Py_RETURN_NONE;
15388 }
15389 
15390 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15391 
/* Method table of the str iterator: length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15401 
15402 PyTypeObject PyUnicodeIter_Type = {
15403     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15404     "str_iterator",         /* tp_name */
15405     sizeof(unicodeiterobject),      /* tp_basicsize */
15406     0,                  /* tp_itemsize */
15407     /* methods */
15408     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15409     0,                  /* tp_print */
15410     0,                  /* tp_getattr */
15411     0,                  /* tp_setattr */
15412     0,                  /* tp_reserved */
15413     0,                  /* tp_repr */
15414     0,                  /* tp_as_number */
15415     0,                  /* tp_as_sequence */
15416     0,                  /* tp_as_mapping */
15417     0,                  /* tp_hash */
15418     0,                  /* tp_call */
15419     0,                  /* tp_str */
15420     PyObject_GenericGetAttr,        /* tp_getattro */
15421     0,                  /* tp_setattro */
15422     0,                  /* tp_as_buffer */
15423     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15424     0,                  /* tp_doc */
15425     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15426     0,                  /* tp_clear */
15427     0,                  /* tp_richcompare */
15428     0,                  /* tp_weaklistoffset */
15429     PyObject_SelfIter,          /* tp_iter */
15430     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15431     unicodeiter_methods,            /* tp_methods */
15432     0,
15433 };
15434 
15435 static PyObject *
unicode_iter(PyObject * seq)15436 unicode_iter(PyObject *seq)
15437 {
15438     unicodeiterobject *it;
15439 
15440     if (!PyUnicode_Check(seq)) {
15441         PyErr_BadInternalCall();
15442         return NULL;
15443     }
15444     if (PyUnicode_READY(seq) == -1)
15445         return NULL;
15446     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15447     if (it == NULL)
15448         return NULL;
15449     it->it_index = 0;
15450     Py_INCREF(seq);
15451     it->it_seq = seq;
15452     _PyObject_GC_TRACK(it);
15453     return (PyObject *)it;
15454 }
15455 
15456 
15457 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15458 Py_UNICODE_strlen(const Py_UNICODE *u)
15459 {
15460     int res = 0;
15461     while(*u++)
15462         res++;
15463     return res;
15464 }
15465 
15466 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15467 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15468 {
15469     Py_UNICODE *u = s1;
15470     while ((*u++ = *s2++));
15471     return s1;
15472 }
15473 
15474 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15475 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15476 {
15477     Py_UNICODE *u = s1;
15478     while ((*u++ = *s2++))
15479         if (n-- == 0)
15480             break;
15481     return s1;
15482 }
15483 
15484 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15485 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15486 {
15487     Py_UNICODE *u1 = s1;
15488     u1 += Py_UNICODE_strlen(u1);
15489     Py_UNICODE_strcpy(u1, s2);
15490     return s1;
15491 }
15492 
15493 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15494 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15495 {
15496     while (*s1 && *s2 && *s1 == *s2)
15497         s1++, s2++;
15498     if (*s1 && *s2)
15499         return (*s1 < *s2) ? -1 : +1;
15500     if (*s1)
15501         return 1;
15502     if (*s2)
15503         return -1;
15504     return 0;
15505 }
15506 
15507 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15508 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15509 {
15510     Py_UNICODE u1, u2;
15511     for (; n != 0; n--) {
15512         u1 = *s1;
15513         u2 = *s2;
15514         if (u1 != u2)
15515             return (u1 < u2) ? -1 : +1;
15516         if (u1 == '\0')
15517             return 0;
15518         s1++;
15519         s2++;
15520     }
15521     return 0;
15522 }
15523 
15524 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15525 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15526 {
15527     const Py_UNICODE *p;
15528     for (p = s; *p; p++)
15529         if (*p == c)
15530             return (Py_UNICODE*)p;
15531     return NULL;
15532 }
15533 
15534 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15535 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15536 {
15537     const Py_UNICODE *p;
15538     p = s + Py_UNICODE_strlen(s);
15539     while (p != s) {
15540         p--;
15541         if (*p == c)
15542             return (Py_UNICODE*)p;
15543     }
15544     return NULL;
15545 }
15546 
15547 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15548 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15549 {
15550     Py_UNICODE *u, *copy;
15551     Py_ssize_t len, size;
15552 
15553     if (!PyUnicode_Check(unicode)) {
15554         PyErr_BadArgument();
15555         return NULL;
15556     }
15557     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15558     if (u == NULL)
15559         return NULL;
15560     /* Ensure we won't overflow the size. */
15561     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15562         PyErr_NoMemory();
15563         return NULL;
15564     }
15565     size = len + 1; /* copy the null character */
15566     size *= sizeof(Py_UNICODE);
15567     copy = PyMem_Malloc(size);
15568     if (copy == NULL) {
15569         PyErr_NoMemory();
15570         return NULL;
15571     }
15572     memcpy(copy, u, size);
15573     return copy;
15574 }
15575 
15576 /* A _string module, to export formatter_parser and formatter_field_name_split
15577    to the string.Formatter class implemented in Python. */
15578 
/* Functions exported by the _string module; each takes a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15586 
15587 static struct PyModuleDef _string_module = {
15588     PyModuleDef_HEAD_INIT,
15589     "_string",
15590     PyDoc_STR("string helper module"),
15591     0,
15592     _string_methods,
15593     NULL,
15594     NULL,
15595     NULL,
15596     NULL
15597 };
15598 
15599 PyMODINIT_FUNC
PyInit__string(void)15600 PyInit__string(void)
15601 {
15602     return PyModule_Create(&_string_module);
15603 }
15604 
15605 
15606 #ifdef __cplusplus
15607 }
15608 #endif
15609