• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_initconfig.h"
44 #include "pycore_fileutils.h"
45 #include "pycore_object.h"
46 #include "pycore_pylifecycle.h"
47 #include "pycore_pystate.h"
48 #include "ucnhash.h"
49 #include "bytes_methods.h"
50 #include "stringlib/eq.h"
51 
52 #ifdef MS_WINDOWS
53 #include <windows.h>
54 #endif
55 
56 /* Uncomment to display statistics on interned strings at exit when
57    using Valgrind or Insecure++. */
58 /* #define INTERNED_STATS 1 */
59 
60 
61 /*[clinic input]
62 class str "PyObject *" "&PyUnicode_Type"
63 [clinic start generated code]*/
64 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65 
66 /*[python input]
67 class Py_UCS4_converter(CConverter):
68     type = 'Py_UCS4'
69     converter = 'convert_uc'
70 
71     def converter_init(self):
72         if self.default is not unspecified:
73             self.c_default = ascii(self.default)
74             if len(self.c_default) > 4 or self.c_default[0] != "'":
75                 self.c_default = hex(ord(self.default))
76 
77 [python start generated code]*/
78 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
79 
80 /* --- Globals ------------------------------------------------------------
81 
82 NOTE: In the interpreter's initialization phase, some globals are currently
83       initialized dynamically as needed. In the process Unicode objects may
84       be created before the Unicode type is ready.
85 
86 */
87 
88 
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92 
93 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94 #define MAX_UNICODE 0x10ffff
95 
/* Sanity check used inside the accessor macros below.  Debug builds run the
   structural consistency check (without the O(n) content scan); other builds
   only check the object's type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field of a PyCompactUnicodeObject; no checks. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string.  For a compact ASCII object this
   is the memory located directly after the PyASCIIObject header; otherwise
   it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; no checks. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string.  For a compact ASCII object this
   is the length field; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw, unchecked access to individual fields of the object headers. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same fields as above, but asserting _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject; no checks. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the same address as the character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data, i.e. the wchar_t representation is not a separate buffer.
   Only the macro argument is referenced, so the macro works regardless of
   how the variable is named at the point of use. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151 
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165 
/* Element-wise widening/narrowing copy between character buffers.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer and should be of type "to_type *".  Every source
   element is cast to to_type and stored in order.  The bulk of the input
   is handled four elements at a time, the remaining 0-3 elements one by
   one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _length = (_end) - (_iter);          \
        const from_type *_quad_end =                    \
            _iter + _Py_SIZE_ROUND_DOWN(_length, 4);    \
        for (; _iter < (_quad_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
189 
/* Platform-dependent tuning constant for buffer overallocation.  Going by
   the notes below, a value of 2 corresponds to 50% extra space and 4 to 25%
   extra space; the code that applies it is outside this section. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
197 
198 /* This dictionary holds all interned unicode strings.  Note that references
199    to strings in this dictionary are *not* counted in the string's ob_refcnt.
200    When the interned string reaches a refcnt of 0 the string deallocation
201    function will delete the reference from this dictionary.
202 
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
205 */
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0) and stored in
   unicode_empty before the extra reference is taken.  If that creation
   fails, unicode_empty stays NULL and no reference is taken, so users of
   this macro must be prepared for unicode_empty == NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return (from the enclosing function) a new reference to the shared empty
   string; the returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
229 
230 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)231 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232              Py_ssize_t start, Py_ssize_t length)
233 {
234     assert(0 <= start);
235     assert(kind != PyUnicode_WCHAR_KIND);
236     switch (kind) {
237     case PyUnicode_1BYTE_KIND: {
238         assert(value <= 0xff);
239         Py_UCS1 ch = (unsigned char)value;
240         Py_UCS1 *to = (Py_UCS1 *)data + start;
241         memset(to, ch, length);
242         break;
243     }
244     case PyUnicode_2BYTE_KIND: {
245         assert(value <= 0xffff);
246         Py_UCS2 ch = (Py_UCS2)value;
247         Py_UCS2 *to = (Py_UCS2 *)data + start;
248         const Py_UCS2 *end = to + length;
249         for (; to < end; ++to) *to = ch;
250         break;
251     }
252     case PyUnicode_4BYTE_KIND: {
253         assert(value <= MAX_UNICODE);
254         Py_UCS4 ch = value;
255         Py_UCS4 * to = (Py_UCS4 *)data + start;
256         const Py_UCS4 *end = to + length;
257         for (; to < end; ++to) *to = ch;
258         break;
259     }
260     default: Py_UNREACHABLE();
261     }
262 }
263 
264 
265 /* Forward declaration */
266 static inline int
267 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
268 static PyObject *
269 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270                     const char *errors);
271 static PyObject *
272 unicode_decode_utf8(const char *s, Py_ssize_t size,
273                     _Py_error_handler error_handler, const char *errors,
274                     Py_ssize_t *consumed);
275 
276 /* List of static strings. */
277 static _Py_Identifier *static_strings = NULL;
278 
279 /* Single character Unicode strings in the Latin-1 range are being
280    shared as well. */
281 static PyObject *unicode_latin1[256] = {NULL};
282 
/* Fast detection of the most frequent whitespace characters: entry i is 1
   if ASCII code point i counts as whitespace, 0 otherwise.  One row per
   16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
313 
314 /* forward */
315 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
316 static PyObject* get_latin1_char(unsigned char ch);
317 static int unicode_modifiable(PyObject *unicode);
318 
319 
320 static PyObject *
321 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
322 static PyObject *
323 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324 static PyObject *
325 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326 
327 static PyObject *
328 unicode_encode_call_errorhandler(const char *errors,
329        PyObject **errorHandler,const char *encoding, const char *reason,
330        PyObject *unicode, PyObject **exceptionObject,
331        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332 
333 static void
334 raise_encode_exception(PyObject **exceptionObject,
335                        const char *encoding,
336                        PyObject *unicode,
337                        Py_ssize_t startpos, Py_ssize_t endpos,
338                        const char *reason);
339 
/* Same idea for line breaks: entry i is 1 if ASCII code point i is a line
   break character, 0 otherwise.  One row per 16 code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
367 
368 static int convert_uc(PyObject *obj, void *addr);
369 
370 #include "clinic/unicodeobject.c.h"
371 
372 _Py_error_handler
_Py_GetErrorHandler(const char * errors)373 _Py_GetErrorHandler(const char *errors)
374 {
375     if (errors == NULL || strcmp(errors, "strict") == 0) {
376         return _Py_ERROR_STRICT;
377     }
378     if (strcmp(errors, "surrogateescape") == 0) {
379         return _Py_ERROR_SURROGATEESCAPE;
380     }
381     if (strcmp(errors, "replace") == 0) {
382         return _Py_ERROR_REPLACE;
383     }
384     if (strcmp(errors, "ignore") == 0) {
385         return _Py_ERROR_IGNORE;
386     }
387     if (strcmp(errors, "backslashreplace") == 0) {
388         return _Py_ERROR_BACKSLASHREPLACE;
389     }
390     if (strcmp(errors, "surrogatepass") == 0) {
391         return _Py_ERROR_SURROGATEPASS;
392     }
393     if (strcmp(errors, "xmlcharrefreplace") == 0) {
394         return _Py_ERROR_XMLCHARREFREPLACE;
395     }
396     return _Py_ERROR_OTHER;
397 }
398 
399 
400 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)401 get_error_handler_wide(const wchar_t *errors)
402 {
403     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404         return _Py_ERROR_STRICT;
405     }
406     if (wcscmp(errors, L"surrogateescape") == 0) {
407         return _Py_ERROR_SURROGATEESCAPE;
408     }
409     if (wcscmp(errors, L"replace") == 0) {
410         return _Py_ERROR_REPLACE;
411     }
412     if (wcscmp(errors, L"ignore") == 0) {
413         return _Py_ERROR_IGNORE;
414     }
415     if (wcscmp(errors, L"backslashreplace") == 0) {
416         return _Py_ERROR_BACKSLASHREPLACE;
417     }
418     if (wcscmp(errors, L"surrogatepass") == 0) {
419         return _Py_ERROR_SURROGATEPASS;
420     }
421     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422         return _Py_ERROR_XMLCHARREFREPLACE;
423     }
424     return _Py_ERROR_OTHER;
425 }
426 
427 
428 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429    This function is kept for backward compatibility with the old API. */
430 Py_UNICODE
PyUnicode_GetMax(void)431 PyUnicode_GetMax(void)
432 {
433 #ifdef Py_UNICODE_WIDE
434     return 0x10FFFF;
435 #else
436     /* This is actually an illegal character, so it should
437        not be passed to unichr. */
438     return 0xFFFF;
439 #endif
440 }
441 
/* Verify the internal invariants of a Unicode object.

   op:            object to check; must not be NULL (asserted).
   check_content: if non-zero, additionally scan all characters (O(n)) to
                  verify that the narrowest possible kind is used and that
                  the data is followed by a zero terminator.

   Every violated invariant is reported through
   _PyObject_ASSERT_FAILED_MSG() with the failing expression as message
   (NOTE(review): presumably that call does not return -- confirm).
   The function itself always returns 1, which lets callers wrap it in
   assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII object -- must be 1-byte kind and ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII object -- the character data is the
           memory right after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            /* Case 3: non-compact object that is not ready yet -- only the
               wstr buffer exists, every other field has its initial value. */
            if (kind == PyUnicode_WCHAR_KIND) {
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            /* Case 4: non-compact, ready object -- data must be set, and
               for ASCII content the utf8 pointer aliases the data. */
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* For ready strings whose element width equals sizeof(wchar_t),
           wstr must alias the data; for any other kind it must not. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* A missing cache buffer must go together with a zero length. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall into the range that requires
           exactly this kind (and this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The element one past the last character must be zero. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
561 
562 
563 static PyObject*
unicode_result_wchar(PyObject * unicode)564 unicode_result_wchar(PyObject *unicode)
565 {
566 #ifndef Py_DEBUG
567     Py_ssize_t len;
568 
569     len = _PyUnicode_WSTR_LENGTH(unicode);
570     if (len == 0) {
571         Py_DECREF(unicode);
572         _Py_RETURN_UNICODE_EMPTY();
573     }
574 
575     if (len == 1) {
576         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
577         if ((Py_UCS4)ch < 256) {
578             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
579             Py_DECREF(unicode);
580             return latin1_char;
581         }
582     }
583 
584     if (_PyUnicode_Ready(unicode) < 0) {
585         Py_DECREF(unicode);
586         return NULL;
587     }
588 #else
589     assert(Py_REFCNT(unicode) == 1);
590 
591     /* don't make the result ready in debug mode to ensure that the caller
592        makes the string ready before using it */
593     assert(_PyUnicode_CheckConsistency(unicode, 1));
594 #endif
595     return unicode;
596 }
597 
598 static PyObject*
unicode_result_ready(PyObject * unicode)599 unicode_result_ready(PyObject *unicode)
600 {
601     Py_ssize_t length;
602 
603     length = PyUnicode_GET_LENGTH(unicode);
604     if (length == 0) {
605         if (unicode != unicode_empty) {
606             Py_DECREF(unicode);
607             _Py_RETURN_UNICODE_EMPTY();
608         }
609         return unicode_empty;
610     }
611 
612     if (length == 1) {
613         void *data = PyUnicode_DATA(unicode);
614         int kind = PyUnicode_KIND(unicode);
615         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
616         if (ch < 256) {
617             PyObject *latin1_char = unicode_latin1[ch];
618             if (latin1_char != NULL) {
619                 if (unicode != latin1_char) {
620                     Py_INCREF(latin1_char);
621                     Py_DECREF(unicode);
622                 }
623                 return latin1_char;
624             }
625             else {
626                 assert(_PyUnicode_CheckConsistency(unicode, 1));
627                 Py_INCREF(unicode);
628                 unicode_latin1[ch] = unicode;
629                 return unicode;
630             }
631         }
632     }
633 
634     assert(_PyUnicode_CheckConsistency(unicode, 1));
635     return unicode;
636 }
637 
638 static PyObject*
unicode_result(PyObject * unicode)639 unicode_result(PyObject *unicode)
640 {
641     assert(_PyUnicode_CHECK(unicode));
642     if (PyUnicode_IS_READY(unicode))
643         return unicode_result_ready(unicode);
644     else
645         return unicode_result_wchar(unicode);
646 }
647 
648 static PyObject*
unicode_result_unchanged(PyObject * unicode)649 unicode_result_unchanged(PyObject *unicode)
650 {
651     if (PyUnicode_CheckExact(unicode)) {
652         if (PyUnicode_READY(unicode) == -1)
653             return NULL;
654         Py_INCREF(unicode);
655         return unicode;
656     }
657     else
658         /* Subtype -- return genuine unicode string with the same value. */
659         return _PyUnicode_Copy(unicode);
660 }
661 
662 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
663    ASCII, Latin1, UTF-8, etc. */
664 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)665 backslashreplace(_PyBytesWriter *writer, char *str,
666                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
667 {
668     Py_ssize_t size, i;
669     Py_UCS4 ch;
670     enum PyUnicode_Kind kind;
671     void *data;
672 
673     assert(PyUnicode_IS_READY(unicode));
674     kind = PyUnicode_KIND(unicode);
675     data = PyUnicode_DATA(unicode);
676 
677     size = 0;
678     /* determine replacement size */
679     for (i = collstart; i < collend; ++i) {
680         Py_ssize_t incr;
681 
682         ch = PyUnicode_READ(kind, data, i);
683         if (ch < 0x100)
684             incr = 2+2;
685         else if (ch < 0x10000)
686             incr = 2+4;
687         else {
688             assert(ch <= MAX_UNICODE);
689             incr = 2+8;
690         }
691         if (size > PY_SSIZE_T_MAX - incr) {
692             PyErr_SetString(PyExc_OverflowError,
693                             "encoded result is too long for a Python string");
694             return NULL;
695         }
696         size += incr;
697     }
698 
699     str = _PyBytesWriter_Prepare(writer, str, size);
700     if (str == NULL)
701         return NULL;
702 
703     /* generate replacement */
704     for (i = collstart; i < collend; ++i) {
705         ch = PyUnicode_READ(kind, data, i);
706         *str++ = '\\';
707         if (ch >= 0x00010000) {
708             *str++ = 'U';
709             *str++ = Py_hexdigits[(ch>>28)&0xf];
710             *str++ = Py_hexdigits[(ch>>24)&0xf];
711             *str++ = Py_hexdigits[(ch>>20)&0xf];
712             *str++ = Py_hexdigits[(ch>>16)&0xf];
713             *str++ = Py_hexdigits[(ch>>12)&0xf];
714             *str++ = Py_hexdigits[(ch>>8)&0xf];
715         }
716         else if (ch >= 0x100) {
717             *str++ = 'u';
718             *str++ = Py_hexdigits[(ch>>12)&0xf];
719             *str++ = Py_hexdigits[(ch>>8)&0xf];
720         }
721         else
722             *str++ = 'x';
723         *str++ = Py_hexdigits[(ch>>4)&0xf];
724         *str++ = Py_hexdigits[ch&0xf];
725     }
726     return str;
727 }
728 
729 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
730    ASCII, Latin1, UTF-8, etc. */
731 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)732 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
733                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
734 {
735     Py_ssize_t size, i;
736     Py_UCS4 ch;
737     enum PyUnicode_Kind kind;
738     void *data;
739 
740     assert(PyUnicode_IS_READY(unicode));
741     kind = PyUnicode_KIND(unicode);
742     data = PyUnicode_DATA(unicode);
743 
744     size = 0;
745     /* determine replacement size */
746     for (i = collstart; i < collend; ++i) {
747         Py_ssize_t incr;
748 
749         ch = PyUnicode_READ(kind, data, i);
750         if (ch < 10)
751             incr = 2+1+1;
752         else if (ch < 100)
753             incr = 2+2+1;
754         else if (ch < 1000)
755             incr = 2+3+1;
756         else if (ch < 10000)
757             incr = 2+4+1;
758         else if (ch < 100000)
759             incr = 2+5+1;
760         else if (ch < 1000000)
761             incr = 2+6+1;
762         else {
763             assert(ch <= MAX_UNICODE);
764             incr = 2+7+1;
765         }
766         if (size > PY_SSIZE_T_MAX - incr) {
767             PyErr_SetString(PyExc_OverflowError,
768                             "encoded result is too long for a Python string");
769             return NULL;
770         }
771         size += incr;
772     }
773 
774     str = _PyBytesWriter_Prepare(writer, str, size);
775     if (str == NULL)
776         return NULL;
777 
778     /* generate replacement */
779     for (i = collstart; i < collend; ++i) {
780         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
781     }
782     return str;
783 }
784 
785 /* --- Bloom Filters ----------------------------------------------------- */
786 
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
790 
791 /* the linebreak mask is set up by Unicode_Init below */
792 
/* Number of distinct bit positions used by the filter: the largest of
   128, 64 or 32 that does not exceed LONG_BIT.  Builds where LONG_BIT is
   below 32 are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds a filter bitmask. */
#define BLOOM_MASK unsigned long

/* Filter for line break characters; starts out with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
806 
/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`,
   i.e. `ch` may be a member of the set the mask was built from; zero means
   it definitely is not.  Both arguments are parenthesized so that arbitrary
   expressions (e.g. `a | b`) can be passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
808 
/* Line break test: code points below 128 are looked up directly in
   ascii_linebreak[]; for all others bloom_linebreak is consulted first and
   Py_UNICODE_ISLINEBREAK() is only evaluated if the filter matches.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
812 
813 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)814 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
815 {
816 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
817     do {                                               \
818         TYPE *data = (TYPE *)PTR;                      \
819         TYPE *end = data + LEN;                        \
820         Py_UCS4 ch;                                    \
821         for (; data != end; data++) {                  \
822             ch = *data;                                \
823             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
824         }                                              \
825         break;                                         \
826     } while (0)
827 
828     /* calculate simple bloom-style bitmask for a given unicode string */
829 
830     BLOOM_MASK mask;
831 
832     mask = 0;
833     switch (kind) {
834     case PyUnicode_1BYTE_KIND:
835         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
836         break;
837     case PyUnicode_2BYTE_KIND:
838         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
839         break;
840     case PyUnicode_4BYTE_KIND:
841         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
842         break;
843     default:
844         Py_UNREACHABLE();
845     }
846     return mask;
847 
848 #undef BLOOM_UPDATE
849 }
850 
851 static int
ensure_unicode(PyObject * obj)852 ensure_unicode(PyObject *obj)
853 {
854     if (!PyUnicode_Check(obj)) {
855         PyErr_Format(PyExc_TypeError,
856                      "must be str, not %.100s",
857                      Py_TYPE(obj)->tp_name);
858         return -1;
859     }
860     return PyUnicode_READY(obj);
861 }
862 
863 /* Compilation of templated routines */
864 
865 #include "stringlib/asciilib.h"
866 #include "stringlib/fastsearch.h"
867 #include "stringlib/partition.h"
868 #include "stringlib/split.h"
869 #include "stringlib/count.h"
870 #include "stringlib/find.h"
871 #include "stringlib/find_max_char.h"
872 #include "stringlib/undef.h"
873 
874 #include "stringlib/ucs1lib.h"
875 #include "stringlib/fastsearch.h"
876 #include "stringlib/partition.h"
877 #include "stringlib/split.h"
878 #include "stringlib/count.h"
879 #include "stringlib/find.h"
880 #include "stringlib/replace.h"
881 #include "stringlib/find_max_char.h"
882 #include "stringlib/undef.h"
883 
884 #include "stringlib/ucs2lib.h"
885 #include "stringlib/fastsearch.h"
886 #include "stringlib/partition.h"
887 #include "stringlib/split.h"
888 #include "stringlib/count.h"
889 #include "stringlib/find.h"
890 #include "stringlib/replace.h"
891 #include "stringlib/find_max_char.h"
892 #include "stringlib/undef.h"
893 
894 #include "stringlib/ucs4lib.h"
895 #include "stringlib/fastsearch.h"
896 #include "stringlib/partition.h"
897 #include "stringlib/split.h"
898 #include "stringlib/count.h"
899 #include "stringlib/find.h"
900 #include "stringlib/replace.h"
901 #include "stringlib/find_max_char.h"
902 #include "stringlib/undef.h"
903 
904 #include "stringlib/unicodedefs.h"
905 #include "stringlib/fastsearch.h"
906 #include "stringlib/count.h"
907 #include "stringlib/find.h"
908 #include "stringlib/undef.h"
909 
910 /* --- Unicode Object ----------------------------------------------------- */
911 
912 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)913 findchar(const void *s, int kind,
914          Py_ssize_t size, Py_UCS4 ch,
915          int direction)
916 {
917     switch (kind) {
918     case PyUnicode_1BYTE_KIND:
919         if ((Py_UCS1) ch != ch)
920             return -1;
921         if (direction > 0)
922             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
923         else
924             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
925     case PyUnicode_2BYTE_KIND:
926         if ((Py_UCS2) ch != ch)
927             return -1;
928         if (direction > 0)
929             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
930         else
931             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
932     case PyUnicode_4BYTE_KIND:
933         if (direction > 0)
934             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
935         else
936             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
937     default:
938         Py_UNREACHABLE();
939     }
940 }
941 
942 #ifdef Py_DEBUG
943 /* Fill the data of a Unicode string with invalid characters to detect bugs
944    earlier.
945 
946    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
947    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
948    invalid character in Unicode 6.0. */
949 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)950 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
951 {
952     int kind = PyUnicode_KIND(unicode);
953     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
954     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
955     if (length <= old_length)
956         return;
957     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
958 }
959 #endif
960 
961 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)962 resize_compact(PyObject *unicode, Py_ssize_t length)
963 {
964     Py_ssize_t char_size;
965     Py_ssize_t struct_size;
966     Py_ssize_t new_size;
967     int share_wstr;
968     PyObject *new_unicode;
969 #ifdef Py_DEBUG
970     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
971 #endif
972 
973     assert(unicode_modifiable(unicode));
974     assert(PyUnicode_IS_READY(unicode));
975     assert(PyUnicode_IS_COMPACT(unicode));
976 
977     char_size = PyUnicode_KIND(unicode);
978     if (PyUnicode_IS_ASCII(unicode))
979         struct_size = sizeof(PyASCIIObject);
980     else
981         struct_size = sizeof(PyCompactUnicodeObject);
982     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
983 
984     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
985         PyErr_NoMemory();
986         return NULL;
987     }
988     new_size = (struct_size + (length + 1) * char_size);
989 
990     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
991         PyObject_DEL(_PyUnicode_UTF8(unicode));
992         _PyUnicode_UTF8(unicode) = NULL;
993         _PyUnicode_UTF8_LENGTH(unicode) = 0;
994     }
995     _Py_DEC_REFTOTAL;
996     _Py_ForgetReference(unicode);
997 
998     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
999     if (new_unicode == NULL) {
1000         _Py_NewReference(unicode);
1001         PyErr_NoMemory();
1002         return NULL;
1003     }
1004     unicode = new_unicode;
1005     _Py_NewReference(unicode);
1006 
1007     _PyUnicode_LENGTH(unicode) = length;
1008     if (share_wstr) {
1009         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1010         if (!PyUnicode_IS_ASCII(unicode))
1011             _PyUnicode_WSTR_LENGTH(unicode) = length;
1012     }
1013     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1014         PyObject_DEL(_PyUnicode_WSTR(unicode));
1015         _PyUnicode_WSTR(unicode) = NULL;
1016         if (!PyUnicode_IS_ASCII(unicode))
1017             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1018     }
1019 #ifdef Py_DEBUG
1020     unicode_fill_invalid(unicode, old_length);
1021 #endif
1022     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1023                     length, 0);
1024     assert(_PyUnicode_CheckConsistency(unicode, 0));
1025     return unicode;
1026 }
1027 
1028 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1029 resize_inplace(PyObject *unicode, Py_ssize_t length)
1030 {
1031     wchar_t *wstr;
1032     Py_ssize_t new_size;
1033     assert(!PyUnicode_IS_COMPACT(unicode));
1034     assert(Py_REFCNT(unicode) == 1);
1035 
1036     if (PyUnicode_IS_READY(unicode)) {
1037         Py_ssize_t char_size;
1038         int share_wstr, share_utf8;
1039         void *data;
1040 #ifdef Py_DEBUG
1041         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1042 #endif
1043 
1044         data = _PyUnicode_DATA_ANY(unicode);
1045         char_size = PyUnicode_KIND(unicode);
1046         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1047         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1048 
1049         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1050             PyErr_NoMemory();
1051             return -1;
1052         }
1053         new_size = (length + 1) * char_size;
1054 
1055         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1056         {
1057             PyObject_DEL(_PyUnicode_UTF8(unicode));
1058             _PyUnicode_UTF8(unicode) = NULL;
1059             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1060         }
1061 
1062         data = (PyObject *)PyObject_REALLOC(data, new_size);
1063         if (data == NULL) {
1064             PyErr_NoMemory();
1065             return -1;
1066         }
1067         _PyUnicode_DATA_ANY(unicode) = data;
1068         if (share_wstr) {
1069             _PyUnicode_WSTR(unicode) = data;
1070             _PyUnicode_WSTR_LENGTH(unicode) = length;
1071         }
1072         if (share_utf8) {
1073             _PyUnicode_UTF8(unicode) = data;
1074             _PyUnicode_UTF8_LENGTH(unicode) = length;
1075         }
1076         _PyUnicode_LENGTH(unicode) = length;
1077         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1078 #ifdef Py_DEBUG
1079         unicode_fill_invalid(unicode, old_length);
1080 #endif
1081         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1082             assert(_PyUnicode_CheckConsistency(unicode, 0));
1083             return 0;
1084         }
1085     }
1086     assert(_PyUnicode_WSTR(unicode) != NULL);
1087 
1088     /* check for integer overflow */
1089     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1090         PyErr_NoMemory();
1091         return -1;
1092     }
1093     new_size = sizeof(wchar_t) * (length + 1);
1094     wstr =  _PyUnicode_WSTR(unicode);
1095     wstr = PyObject_REALLOC(wstr, new_size);
1096     if (!wstr) {
1097         PyErr_NoMemory();
1098         return -1;
1099     }
1100     _PyUnicode_WSTR(unicode) = wstr;
1101     _PyUnicode_WSTR(unicode)[length] = 0;
1102     _PyUnicode_WSTR_LENGTH(unicode) = length;
1103     assert(_PyUnicode_CheckConsistency(unicode, 0));
1104     return 0;
1105 }
1106 
1107 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1108 resize_copy(PyObject *unicode, Py_ssize_t length)
1109 {
1110     Py_ssize_t copy_length;
1111     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1112         PyObject *copy;
1113 
1114         assert(PyUnicode_IS_READY(unicode));
1115 
1116         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1117         if (copy == NULL)
1118             return NULL;
1119 
1120         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1121         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1122         return copy;
1123     }
1124     else {
1125         PyObject *w;
1126 
1127         w = (PyObject*)_PyUnicode_New(length);
1128         if (w == NULL)
1129             return NULL;
1130         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1131         copy_length = Py_MIN(copy_length, length);
1132         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1133                   copy_length * sizeof(wchar_t));
1134         return w;
1135     }
1136 }
1137 
1138 /* We allocate one more byte to make sure the string is
1139    Ux0000 terminated; some code (e.g. new_identifier)
1140    relies on that.
1141 
1142    XXX This allocator could further be enhanced by assuring that the
1143    free list never reduces its size below 1.
1144 
1145 */
1146 
1147 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1148 _PyUnicode_New(Py_ssize_t length)
1149 {
1150     PyUnicodeObject *unicode;
1151     size_t new_size;
1152 
1153     /* Optimization for empty strings */
1154     if (length == 0 && unicode_empty != NULL) {
1155         Py_INCREF(unicode_empty);
1156         return (PyUnicodeObject*)unicode_empty;
1157     }
1158 
1159     /* Ensure we won't overflow the size. */
1160     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1161         return (PyUnicodeObject *)PyErr_NoMemory();
1162     }
1163     if (length < 0) {
1164         PyErr_SetString(PyExc_SystemError,
1165                         "Negative size passed to _PyUnicode_New");
1166         return NULL;
1167     }
1168 
1169     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1170     if (unicode == NULL)
1171         return NULL;
1172     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1173 
1174     _PyUnicode_WSTR_LENGTH(unicode) = length;
1175     _PyUnicode_HASH(unicode) = -1;
1176     _PyUnicode_STATE(unicode).interned = 0;
1177     _PyUnicode_STATE(unicode).kind = 0;
1178     _PyUnicode_STATE(unicode).compact = 0;
1179     _PyUnicode_STATE(unicode).ready = 0;
1180     _PyUnicode_STATE(unicode).ascii = 0;
1181     _PyUnicode_DATA_ANY(unicode) = NULL;
1182     _PyUnicode_LENGTH(unicode) = 0;
1183     _PyUnicode_UTF8(unicode) = NULL;
1184     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1185 
1186     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1187     if (!_PyUnicode_WSTR(unicode)) {
1188         Py_DECREF(unicode);
1189         PyErr_NoMemory();
1190         return NULL;
1191     }
1192 
1193     /* Initialize the first element to guard against cases where
1194      * the caller fails before initializing str -- unicode_resize()
1195      * reads str[0], and the Keep-Alive optimization can keep memory
1196      * allocated for str alive across a call to unicode_dealloc(unicode).
1197      * We don't want unicode_resize to read uninitialized memory in
1198      * that case.
1199      */
1200     _PyUnicode_WSTR(unicode)[0] = 0;
1201     _PyUnicode_WSTR(unicode)[length] = 0;
1202 
1203     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1204     return unicode;
1205 }
1206 
1207 static const char*
unicode_kind_name(PyObject * unicode)1208 unicode_kind_name(PyObject *unicode)
1209 {
1210     /* don't check consistency: unicode_kind_name() is called from
1211        _PyUnicode_Dump() */
1212     if (!PyUnicode_IS_COMPACT(unicode))
1213     {
1214         if (!PyUnicode_IS_READY(unicode))
1215             return "wstr";
1216         switch (PyUnicode_KIND(unicode))
1217         {
1218         case PyUnicode_1BYTE_KIND:
1219             if (PyUnicode_IS_ASCII(unicode))
1220                 return "legacy ascii";
1221             else
1222                 return "legacy latin1";
1223         case PyUnicode_2BYTE_KIND:
1224             return "legacy UCS2";
1225         case PyUnicode_4BYTE_KIND:
1226             return "legacy UCS4";
1227         default:
1228             return "<legacy invalid kind>";
1229         }
1230     }
1231     assert(PyUnicode_IS_READY(unicode));
1232     switch (PyUnicode_KIND(unicode)) {
1233     case PyUnicode_1BYTE_KIND:
1234         if (PyUnicode_IS_ASCII(unicode))
1235             return "ascii";
1236         else
1237             return "latin1";
1238     case PyUnicode_2BYTE_KIND:
1239         return "UCS2";
1240     case PyUnicode_4BYTE_KIND:
1241         return "UCS4";
1242     default:
1243         return "<invalid compact kind>";
1244     }
1245 }
1246 
1247 #ifdef Py_DEBUG
1248 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1249 char *_PyUnicode_utf8(void *unicode_raw){
1250     PyObject *unicode = _PyObject_CAST(unicode_raw);
1251     return PyUnicode_UTF8(unicode);
1252 }
1253 
_PyUnicode_compact_data(void * unicode_raw)1254 void *_PyUnicode_compact_data(void *unicode_raw) {
1255     PyObject *unicode = _PyObject_CAST(unicode_raw);
1256     return _PyUnicode_COMPACT_DATA(unicode);
1257 }
_PyUnicode_data(void * unicode_raw)1258 void *_PyUnicode_data(void *unicode_raw) {
1259     PyObject *unicode = _PyObject_CAST(unicode_raw);
1260     printf("obj %p\n", (void*)unicode);
1261     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1262     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1263     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1264     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1265     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1266     return PyUnicode_DATA(unicode);
1267 }
1268 
1269 void
_PyUnicode_Dump(PyObject * op)1270 _PyUnicode_Dump(PyObject *op)
1271 {
1272     PyASCIIObject *ascii = (PyASCIIObject *)op;
1273     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1274     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1275     void *data;
1276 
1277     if (ascii->state.compact)
1278     {
1279         if (ascii->state.ascii)
1280             data = (ascii + 1);
1281         else
1282             data = (compact + 1);
1283     }
1284     else
1285         data = unicode->data.any;
1286     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1287            unicode_kind_name(op), ascii->length);
1288 
1289     if (ascii->wstr == data)
1290         printf("shared ");
1291     printf("wstr=%p", (void *)ascii->wstr);
1292 
1293     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1294         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1295         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1296             printf("shared ");
1297         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1298                (void *)compact->utf8, compact->utf8_length);
1299     }
1300     printf(", data=%p\n", data);
1301 }
1302 #endif
1303 
1304 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1305 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1306 {
1307     PyObject *obj;
1308     PyCompactUnicodeObject *unicode;
1309     void *data;
1310     enum PyUnicode_Kind kind;
1311     int is_sharing, is_ascii;
1312     Py_ssize_t char_size;
1313     Py_ssize_t struct_size;
1314 
1315     /* Optimization for empty strings */
1316     if (size == 0 && unicode_empty != NULL) {
1317         Py_INCREF(unicode_empty);
1318         return unicode_empty;
1319     }
1320 
1321     is_ascii = 0;
1322     is_sharing = 0;
1323     struct_size = sizeof(PyCompactUnicodeObject);
1324     if (maxchar < 128) {
1325         kind = PyUnicode_1BYTE_KIND;
1326         char_size = 1;
1327         is_ascii = 1;
1328         struct_size = sizeof(PyASCIIObject);
1329     }
1330     else if (maxchar < 256) {
1331         kind = PyUnicode_1BYTE_KIND;
1332         char_size = 1;
1333     }
1334     else if (maxchar < 65536) {
1335         kind = PyUnicode_2BYTE_KIND;
1336         char_size = 2;
1337         if (sizeof(wchar_t) == 2)
1338             is_sharing = 1;
1339     }
1340     else {
1341         if (maxchar > MAX_UNICODE) {
1342             PyErr_SetString(PyExc_SystemError,
1343                             "invalid maximum character passed to PyUnicode_New");
1344             return NULL;
1345         }
1346         kind = PyUnicode_4BYTE_KIND;
1347         char_size = 4;
1348         if (sizeof(wchar_t) == 4)
1349             is_sharing = 1;
1350     }
1351 
1352     /* Ensure we won't overflow the size. */
1353     if (size < 0) {
1354         PyErr_SetString(PyExc_SystemError,
1355                         "Negative size passed to PyUnicode_New");
1356         return NULL;
1357     }
1358     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1359         return PyErr_NoMemory();
1360 
1361     /* Duplicated allocation code from _PyObject_New() instead of a call to
1362      * PyObject_New() so we are able to allocate space for the object and
1363      * it's data buffer.
1364      */
1365     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1366     if (obj == NULL)
1367         return PyErr_NoMemory();
1368     obj = PyObject_INIT(obj, &PyUnicode_Type);
1369     if (obj == NULL)
1370         return NULL;
1371 
1372     unicode = (PyCompactUnicodeObject *)obj;
1373     if (is_ascii)
1374         data = ((PyASCIIObject*)obj) + 1;
1375     else
1376         data = unicode + 1;
1377     _PyUnicode_LENGTH(unicode) = size;
1378     _PyUnicode_HASH(unicode) = -1;
1379     _PyUnicode_STATE(unicode).interned = 0;
1380     _PyUnicode_STATE(unicode).kind = kind;
1381     _PyUnicode_STATE(unicode).compact = 1;
1382     _PyUnicode_STATE(unicode).ready = 1;
1383     _PyUnicode_STATE(unicode).ascii = is_ascii;
1384     if (is_ascii) {
1385         ((char*)data)[size] = 0;
1386         _PyUnicode_WSTR(unicode) = NULL;
1387     }
1388     else if (kind == PyUnicode_1BYTE_KIND) {
1389         ((char*)data)[size] = 0;
1390         _PyUnicode_WSTR(unicode) = NULL;
1391         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1392         unicode->utf8 = NULL;
1393         unicode->utf8_length = 0;
1394     }
1395     else {
1396         unicode->utf8 = NULL;
1397         unicode->utf8_length = 0;
1398         if (kind == PyUnicode_2BYTE_KIND)
1399             ((Py_UCS2*)data)[size] = 0;
1400         else /* kind == PyUnicode_4BYTE_KIND */
1401             ((Py_UCS4*)data)[size] = 0;
1402         if (is_sharing) {
1403             _PyUnicode_WSTR_LENGTH(unicode) = size;
1404             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1405         }
1406         else {
1407             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408             _PyUnicode_WSTR(unicode) = NULL;
1409         }
1410     }
1411 #ifdef Py_DEBUG
1412     unicode_fill_invalid((PyObject*)unicode, 0);
1413 #endif
1414     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1415     return obj;
1416 }
1417 
1418 #if SIZEOF_WCHAR_T == 2
1419 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1420    will decode surrogate pairs, the other conversions are implemented as macros
1421    for efficiency.
1422 
1423    This function assumes that unicode can hold one more code point than wstr
1424    characters for a terminating null character. */
1425 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1426 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1427                               PyObject *unicode)
1428 {
1429     const wchar_t *iter;
1430     Py_UCS4 *ucs4_out;
1431 
1432     assert(unicode != NULL);
1433     assert(_PyUnicode_CHECK(unicode));
1434     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1435     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1436 
1437     for (iter = begin; iter < end; ) {
1438         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1439                            _PyUnicode_GET_LENGTH(unicode)));
1440         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1441             && (iter+1) < end
1442             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1443         {
1444             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1445             iter += 2;
1446         }
1447         else {
1448             *ucs4_out++ = *iter;
1449             iter++;
1450         }
1451     }
1452     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1453                         _PyUnicode_GET_LENGTH(unicode)));
1454 
1455 }
1456 #endif
1457 
1458 static int
unicode_check_modifiable(PyObject * unicode)1459 unicode_check_modifiable(PyObject *unicode)
1460 {
1461     if (!unicode_modifiable(unicode)) {
1462         PyErr_SetString(PyExc_SystemError,
1463                         "Cannot modify a string currently used");
1464         return -1;
1465     }
1466     return 0;
1467 }
1468 
1469 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1470 _copy_characters(PyObject *to, Py_ssize_t to_start,
1471                  PyObject *from, Py_ssize_t from_start,
1472                  Py_ssize_t how_many, int check_maxchar)
1473 {
1474     unsigned int from_kind, to_kind;
1475     void *from_data, *to_data;
1476 
1477     assert(0 <= how_many);
1478     assert(0 <= from_start);
1479     assert(0 <= to_start);
1480     assert(PyUnicode_Check(from));
1481     assert(PyUnicode_IS_READY(from));
1482     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1483 
1484     assert(PyUnicode_Check(to));
1485     assert(PyUnicode_IS_READY(to));
1486     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1487 
1488     if (how_many == 0)
1489         return 0;
1490 
1491     from_kind = PyUnicode_KIND(from);
1492     from_data = PyUnicode_DATA(from);
1493     to_kind = PyUnicode_KIND(to);
1494     to_data = PyUnicode_DATA(to);
1495 
1496 #ifdef Py_DEBUG
1497     if (!check_maxchar
1498         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1499     {
1500         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501         Py_UCS4 ch;
1502         Py_ssize_t i;
1503         for (i=0; i < how_many; i++) {
1504             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505             assert(ch <= to_maxchar);
1506         }
1507     }
1508 #endif
1509 
1510     if (from_kind == to_kind) {
1511         if (check_maxchar
1512             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1513         {
1514             /* Writing Latin-1 characters into an ASCII string requires to
1515                check that all written characters are pure ASCII */
1516             Py_UCS4 max_char;
1517             max_char = ucs1lib_find_max_char(from_data,
1518                                              (Py_UCS1*)from_data + how_many);
1519             if (max_char >= 128)
1520                 return -1;
1521         }
1522         memcpy((char*)to_data + to_kind * to_start,
1523                   (char*)from_data + from_kind * from_start,
1524                   to_kind * how_many);
1525     }
1526     else if (from_kind == PyUnicode_1BYTE_KIND
1527              && to_kind == PyUnicode_2BYTE_KIND)
1528     {
1529         _PyUnicode_CONVERT_BYTES(
1530             Py_UCS1, Py_UCS2,
1531             PyUnicode_1BYTE_DATA(from) + from_start,
1532             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1533             PyUnicode_2BYTE_DATA(to) + to_start
1534             );
1535     }
1536     else if (from_kind == PyUnicode_1BYTE_KIND
1537              && to_kind == PyUnicode_4BYTE_KIND)
1538     {
1539         _PyUnicode_CONVERT_BYTES(
1540             Py_UCS1, Py_UCS4,
1541             PyUnicode_1BYTE_DATA(from) + from_start,
1542             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1543             PyUnicode_4BYTE_DATA(to) + to_start
1544             );
1545     }
1546     else if (from_kind == PyUnicode_2BYTE_KIND
1547              && to_kind == PyUnicode_4BYTE_KIND)
1548     {
1549         _PyUnicode_CONVERT_BYTES(
1550             Py_UCS2, Py_UCS4,
1551             PyUnicode_2BYTE_DATA(from) + from_start,
1552             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1553             PyUnicode_4BYTE_DATA(to) + to_start
1554             );
1555     }
1556     else {
1557         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1558 
1559         if (!check_maxchar) {
1560             if (from_kind == PyUnicode_2BYTE_KIND
1561                 && to_kind == PyUnicode_1BYTE_KIND)
1562             {
1563                 _PyUnicode_CONVERT_BYTES(
1564                     Py_UCS2, Py_UCS1,
1565                     PyUnicode_2BYTE_DATA(from) + from_start,
1566                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1567                     PyUnicode_1BYTE_DATA(to) + to_start
1568                     );
1569             }
1570             else if (from_kind == PyUnicode_4BYTE_KIND
1571                      && to_kind == PyUnicode_1BYTE_KIND)
1572             {
1573                 _PyUnicode_CONVERT_BYTES(
1574                     Py_UCS4, Py_UCS1,
1575                     PyUnicode_4BYTE_DATA(from) + from_start,
1576                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1577                     PyUnicode_1BYTE_DATA(to) + to_start
1578                     );
1579             }
1580             else if (from_kind == PyUnicode_4BYTE_KIND
1581                      && to_kind == PyUnicode_2BYTE_KIND)
1582             {
1583                 _PyUnicode_CONVERT_BYTES(
1584                     Py_UCS4, Py_UCS2,
1585                     PyUnicode_4BYTE_DATA(from) + from_start,
1586                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1587                     PyUnicode_2BYTE_DATA(to) + to_start
1588                     );
1589             }
1590             else {
1591                 Py_UNREACHABLE();
1592             }
1593         }
1594         else {
1595             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1596             Py_UCS4 ch;
1597             Py_ssize_t i;
1598 
1599             for (i=0; i < how_many; i++) {
1600                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1601                 if (ch > to_maxchar)
1602                     return -1;
1603                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1604             }
1605         }
1606     }
1607     return 0;
1608 }
1609 
1610 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1611 _PyUnicode_FastCopyCharacters(
1612     PyObject *to, Py_ssize_t to_start,
1613     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1614 {
1615     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1616 }
1617 
1618 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1619 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1620                          PyObject *from, Py_ssize_t from_start,
1621                          Py_ssize_t how_many)
1622 {
1623     int err;
1624 
1625     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1626         PyErr_BadInternalCall();
1627         return -1;
1628     }
1629 
1630     if (PyUnicode_READY(from) == -1)
1631         return -1;
1632     if (PyUnicode_READY(to) == -1)
1633         return -1;
1634 
1635     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1636         PyErr_SetString(PyExc_IndexError, "string index out of range");
1637         return -1;
1638     }
1639     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1640         PyErr_SetString(PyExc_IndexError, "string index out of range");
1641         return -1;
1642     }
1643     if (how_many < 0) {
1644         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1645         return -1;
1646     }
1647     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1648     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1649         PyErr_Format(PyExc_SystemError,
1650                      "Cannot write %zi characters at %zi "
1651                      "in a string of %zi characters",
1652                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1653         return -1;
1654     }
1655 
1656     if (how_many == 0)
1657         return 0;
1658 
1659     if (unicode_check_modifiable(to))
1660         return -1;
1661 
1662     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1663     if (err) {
1664         PyErr_Format(PyExc_SystemError,
1665                      "Cannot copy %s characters "
1666                      "into a string of %s characters",
1667                      unicode_kind_name(from),
1668                      unicode_kind_name(to));
1669         return -1;
1670     }
1671     return how_many;
1672 }
1673 
1674 /* Find the maximum code point and count the number of surrogate pairs so a
1675    correct string length can be computed before converting a string to UCS4.
1676    This function counts single surrogates as a character and not as a pair.
1677 
1678    Return 0 on success, or -1 on error. */
1679 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1680 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1681                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1682 {
1683     const wchar_t *iter;
1684     Py_UCS4 ch;
1685 
1686     assert(num_surrogates != NULL && maxchar != NULL);
1687     *num_surrogates = 0;
1688     *maxchar = 0;
1689 
1690     for (iter = begin; iter < end; ) {
1691 #if SIZEOF_WCHAR_T == 2
1692         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1693             && (iter+1) < end
1694             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1695         {
1696             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1697             ++(*num_surrogates);
1698             iter += 2;
1699         }
1700         else
1701 #endif
1702         {
1703             ch = *iter;
1704             iter++;
1705         }
1706         if (ch > *maxchar) {
1707             *maxchar = ch;
1708             if (*maxchar > MAX_UNICODE) {
1709                 PyErr_Format(PyExc_ValueError,
1710                              "character U+%x is not in range [U+0000; U+10ffff]",
1711                              ch);
1712                 return -1;
1713             }
1714         }
1715     }
1716     return 0;
1717 }
1718 
/* Convert a string that only has a wchar_t (wstr) representation into its
   canonical 1-, 2- or 4-byte representation, chosen from the largest code
   point found in the wstr buffer.

   Return 0 on success.  Return -1 with an exception set if the buffer holds
   a code point above MAX_UNICODE (reported by find_maxchar_surrogates()) or
   if a new data buffer cannot be allocated. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wstr buffer decides which storage kind is needed. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: narrow into a fresh 1-byte buffer and drop the wstr. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the 1-byte data is also used as the UTF-8
               representation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) allocation size against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the wstr buffer becomes the
           data buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1847 
1848 static void
unicode_dealloc(PyObject * unicode)1849 unicode_dealloc(PyObject *unicode)
1850 {
1851     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1852     case SSTATE_NOT_INTERNED:
1853         break;
1854 
1855     case SSTATE_INTERNED_MORTAL:
1856         /* revive dead object temporarily for DelItem */
1857         Py_REFCNT(unicode) = 3;
1858         if (PyDict_DelItem(interned, unicode) != 0)
1859             Py_FatalError(
1860                 "deletion of interned string failed");
1861         break;
1862 
1863     case SSTATE_INTERNED_IMMORTAL:
1864         Py_FatalError("Immortal interned string died.");
1865         /* fall through */
1866 
1867     default:
1868         Py_FatalError("Inconsistent interned string state.");
1869     }
1870 
1871     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1872         PyObject_DEL(_PyUnicode_WSTR(unicode));
1873     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1874         PyObject_DEL(_PyUnicode_UTF8(unicode));
1875     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1876         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1877 
1878     Py_TYPE(unicode)->tp_free(unicode);
1879 }
1880 
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready, one-character string can be a Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1897 
1898 static int
unicode_modifiable(PyObject * unicode)1899 unicode_modifiable(PyObject *unicode)
1900 {
1901     assert(_PyUnicode_CHECK(unicode));
1902     if (Py_REFCNT(unicode) != 1)
1903         return 0;
1904     if (_PyUnicode_HASH(unicode) != -1)
1905         return 0;
1906     if (PyUnicode_CHECK_INTERNED(unicode))
1907         return 0;
1908     if (!PyUnicode_CheckExact(unicode))
1909         return 0;
1910 #ifdef Py_DEBUG
1911     /* singleton refcount is greater than 1 */
1912     assert(!unicode_is_singleton(unicode));
1913 #endif
1914     return 1;
1915 }
1916 
1917 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1918 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1919 {
1920     PyObject *unicode;
1921     Py_ssize_t old_length;
1922 
1923     assert(p_unicode != NULL);
1924     unicode = *p_unicode;
1925 
1926     assert(unicode != NULL);
1927     assert(PyUnicode_Check(unicode));
1928     assert(0 <= length);
1929 
1930     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1931         old_length = PyUnicode_WSTR_LENGTH(unicode);
1932     else
1933         old_length = PyUnicode_GET_LENGTH(unicode);
1934     if (old_length == length)
1935         return 0;
1936 
1937     if (length == 0) {
1938         _Py_INCREF_UNICODE_EMPTY();
1939         if (!unicode_empty)
1940             return -1;
1941         Py_SETREF(*p_unicode, unicode_empty);
1942         return 0;
1943     }
1944 
1945     if (!unicode_modifiable(unicode)) {
1946         PyObject *copy = resize_copy(unicode, length);
1947         if (copy == NULL)
1948             return -1;
1949         Py_SETREF(*p_unicode, copy);
1950         return 0;
1951     }
1952 
1953     if (PyUnicode_IS_COMPACT(unicode)) {
1954         PyObject *new_unicode = resize_compact(unicode, length);
1955         if (new_unicode == NULL)
1956             return -1;
1957         *p_unicode = new_unicode;
1958         return 0;
1959     }
1960     return resize_inplace(unicode, length);
1961 }
1962 
1963 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1964 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1965 {
1966     PyObject *unicode;
1967     if (p_unicode == NULL) {
1968         PyErr_BadInternalCall();
1969         return -1;
1970     }
1971     unicode = *p_unicode;
1972     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1973     {
1974         PyErr_BadInternalCall();
1975         return -1;
1976     }
1977     return unicode_resize(p_unicode, length);
1978 }
1979 
1980 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1981 
1982    WARNING: The function doesn't copy the terminating null character and
1983    doesn't check the maximum character (may write a latin1 character in an
1984    ASCII string). */
1985 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1986 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1987                    const char *str, Py_ssize_t len)
1988 {
1989     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1990     void *data = PyUnicode_DATA(unicode);
1991     const char *end = str + len;
1992 
1993     switch (kind) {
1994     case PyUnicode_1BYTE_KIND: {
1995         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1996 #ifdef Py_DEBUG
1997         if (PyUnicode_IS_ASCII(unicode)) {
1998             Py_UCS4 maxchar = ucs1lib_find_max_char(
1999                 (const Py_UCS1*)str,
2000                 (const Py_UCS1*)str + len);
2001             assert(maxchar < 128);
2002         }
2003 #endif
2004         memcpy((char *) data + index, str, len);
2005         break;
2006     }
2007     case PyUnicode_2BYTE_KIND: {
2008         Py_UCS2 *start = (Py_UCS2 *)data + index;
2009         Py_UCS2 *ucs2 = start;
2010         assert(index <= PyUnicode_GET_LENGTH(unicode));
2011 
2012         for (; str < end; ++ucs2, ++str)
2013             *ucs2 = (Py_UCS2)*str;
2014 
2015         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2016         break;
2017     }
2018     default: {
2019         Py_UCS4 *start = (Py_UCS4 *)data + index;
2020         Py_UCS4 *ucs4 = start;
2021         assert(kind == PyUnicode_4BYTE_KIND);
2022         assert(index <= PyUnicode_GET_LENGTH(unicode));
2023 
2024         for (; str < end; ++ucs4, ++str)
2025             *ucs4 = (Py_UCS4)*str;
2026 
2027         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2028     }
2029     }
2030 }
2031 
2032 static PyObject*
get_latin1_char(unsigned char ch)2033 get_latin1_char(unsigned char ch)
2034 {
2035     PyObject *unicode = unicode_latin1[ch];
2036     if (!unicode) {
2037         unicode = PyUnicode_New(1, ch);
2038         if (!unicode)
2039             return NULL;
2040         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2041         assert(_PyUnicode_CheckConsistency(unicode, 1));
2042         unicode_latin1[ch] = unicode;
2043     }
2044     Py_INCREF(unicode);
2045     return unicode;
2046 }
2047 
2048 static PyObject*
unicode_char(Py_UCS4 ch)2049 unicode_char(Py_UCS4 ch)
2050 {
2051     PyObject *unicode;
2052 
2053     assert(ch <= MAX_UNICODE);
2054 
2055     if (ch < 256)
2056         return get_latin1_char(ch);
2057 
2058     unicode = PyUnicode_New(1, ch);
2059     if (unicode == NULL)
2060         return NULL;
2061 
2062     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2063     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2064         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2065     } else {
2066         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2067         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2068     }
2069     assert(_PyUnicode_CheckConsistency(unicode, 1));
2070     return unicode;
2071 }
2072 
2073 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2074 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2075 {
2076     if (u == NULL)
2077         return (PyObject*)_PyUnicode_New(size);
2078 
2079     if (size < 0) {
2080         PyErr_BadInternalCall();
2081         return NULL;
2082     }
2083 
2084     return PyUnicode_FromWideChar(u, size);
2085 }
2086 
2087 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2088 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2089 {
2090     PyObject *unicode;
2091     Py_UCS4 maxchar = 0;
2092     Py_ssize_t num_surrogates;
2093 
2094     if (u == NULL && size != 0) {
2095         PyErr_BadInternalCall();
2096         return NULL;
2097     }
2098 
2099     if (size == -1) {
2100         size = wcslen(u);
2101     }
2102 
2103     /* If the Unicode data is known at construction time, we can apply
2104        some optimizations which share commonly used objects. */
2105 
2106     /* Optimization for empty strings */
2107     if (size == 0)
2108         _Py_RETURN_UNICODE_EMPTY();
2109 
2110     /* Single character Unicode objects in the Latin-1 range are
2111        shared when using this constructor */
2112     if (size == 1 && (Py_UCS4)*u < 256)
2113         return get_latin1_char((unsigned char)*u);
2114 
2115     /* If not empty and not single character, copy the Unicode data
2116        into the new object */
2117     if (find_maxchar_surrogates(u, u + size,
2118                                 &maxchar, &num_surrogates) == -1)
2119         return NULL;
2120 
2121     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2122     if (!unicode)
2123         return NULL;
2124 
2125     switch (PyUnicode_KIND(unicode)) {
2126     case PyUnicode_1BYTE_KIND:
2127         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2128                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2129         break;
2130     case PyUnicode_2BYTE_KIND:
2131 #if Py_UNICODE_SIZE == 2
2132         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2133 #else
2134         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2135                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2136 #endif
2137         break;
2138     case PyUnicode_4BYTE_KIND:
2139 #if SIZEOF_WCHAR_T == 2
2140         /* This is the only case which has to process surrogates, thus
2141            a simple copy loop is not enough and we need a function. */
2142         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2143 #else
2144         assert(num_surrogates == 0);
2145         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2146 #endif
2147         break;
2148     default:
2149         Py_UNREACHABLE();
2150     }
2151 
2152     return unicode_result(unicode);
2153 }
2154 
2155 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2156 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2157 {
2158     if (size < 0) {
2159         PyErr_SetString(PyExc_SystemError,
2160                         "Negative size passed to PyUnicode_FromStringAndSize");
2161         return NULL;
2162     }
2163     if (u != NULL)
2164         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2165     else
2166         return (PyObject *)_PyUnicode_New(size);
2167 }
2168 
2169 PyObject *
PyUnicode_FromString(const char * u)2170 PyUnicode_FromString(const char *u)
2171 {
2172     size_t size = strlen(u);
2173     if (size > PY_SSIZE_T_MAX) {
2174         PyErr_SetString(PyExc_OverflowError, "input too long");
2175         return NULL;
2176     }
2177     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2178 }
2179 
2180 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2181 _PyUnicode_FromId(_Py_Identifier *id)
2182 {
2183     if (!id->object) {
2184         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2185                                                   strlen(id->string),
2186                                                   NULL, NULL);
2187         if (!id->object)
2188             return NULL;
2189         PyUnicode_InternInPlace(&id->object);
2190         assert(!id->next);
2191         id->next = static_strings;
2192         static_strings = id;
2193     }
2194     return id->object;
2195 }
2196 
2197 void
_PyUnicode_ClearStaticStrings()2198 _PyUnicode_ClearStaticStrings()
2199 {
2200     _Py_Identifier *tmp, *s = static_strings;
2201     while (s) {
2202         Py_CLEAR(s->object);
2203         tmp = s->next;
2204         s->next = NULL;
2205         s = tmp;
2206     }
2207     static_strings = NULL;
2208 }
2209 
2210 /* Internal function, doesn't check maximum character */
2211 
2212 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2213 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2214 {
2215     const unsigned char *s = (const unsigned char *)buffer;
2216     PyObject *unicode;
2217     if (size == 1) {
2218 #ifdef Py_DEBUG
2219         assert((unsigned char)s[0] < 128);
2220 #endif
2221         return get_latin1_char(s[0]);
2222     }
2223     unicode = PyUnicode_New(size, 127);
2224     if (!unicode)
2225         return NULL;
2226     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2227     assert(_PyUnicode_CheckConsistency(unicode, 1));
2228     return unicode;
2229 }
2230 
2231 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2232 kind_maxchar_limit(unsigned int kind)
2233 {
2234     switch (kind) {
2235     case PyUnicode_1BYTE_KIND:
2236         return 0x80;
2237     case PyUnicode_2BYTE_KIND:
2238         return 0x100;
2239     case PyUnicode_4BYTE_KIND:
2240         return 0x10000;
2241     default:
2242         Py_UNREACHABLE();
2243     }
2244 }
2245 
2246 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2247 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2248 {
2249     PyObject *res;
2250     unsigned char max_char;
2251 
2252     if (size == 0)
2253         _Py_RETURN_UNICODE_EMPTY();
2254     assert(size > 0);
2255     if (size == 1)
2256         return get_latin1_char(u[0]);
2257 
2258     max_char = ucs1lib_find_max_char(u, u + size);
2259     res = PyUnicode_New(size, max_char);
2260     if (!res)
2261         return NULL;
2262     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2263     assert(_PyUnicode_CheckConsistency(res, 1));
2264     return res;
2265 }
2266 
2267 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2268 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2269 {
2270     PyObject *res;
2271     Py_UCS2 max_char;
2272 
2273     if (size == 0)
2274         _Py_RETURN_UNICODE_EMPTY();
2275     assert(size > 0);
2276     if (size == 1)
2277         return unicode_char(u[0]);
2278 
2279     max_char = ucs2lib_find_max_char(u, u + size);
2280     res = PyUnicode_New(size, max_char);
2281     if (!res)
2282         return NULL;
2283     if (max_char >= 256)
2284         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2285     else {
2286         _PyUnicode_CONVERT_BYTES(
2287             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2288     }
2289     assert(_PyUnicode_CheckConsistency(res, 1));
2290     return res;
2291 }
2292 
2293 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2294 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2295 {
2296     PyObject *res;
2297     Py_UCS4 max_char;
2298 
2299     if (size == 0)
2300         _Py_RETURN_UNICODE_EMPTY();
2301     assert(size > 0);
2302     if (size == 1)
2303         return unicode_char(u[0]);
2304 
2305     max_char = ucs4lib_find_max_char(u, u + size);
2306     res = PyUnicode_New(size, max_char);
2307     if (!res)
2308         return NULL;
2309     if (max_char < 256)
2310         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2311                                  PyUnicode_1BYTE_DATA(res));
2312     else if (max_char < 0x10000)
2313         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2314                                  PyUnicode_2BYTE_DATA(res));
2315     else
2316         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2317     assert(_PyUnicode_CheckConsistency(res, 1));
2318     return res;
2319 }
2320 
2321 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2322 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2323 {
2324     if (size < 0) {
2325         PyErr_SetString(PyExc_ValueError, "size must be positive");
2326         return NULL;
2327     }
2328     switch (kind) {
2329     case PyUnicode_1BYTE_KIND:
2330         return _PyUnicode_FromUCS1(buffer, size);
2331     case PyUnicode_2BYTE_KIND:
2332         return _PyUnicode_FromUCS2(buffer, size);
2333     case PyUnicode_4BYTE_KIND:
2334         return _PyUnicode_FromUCS4(buffer, size);
2335     default:
2336         PyErr_SetString(PyExc_SystemError, "invalid kind");
2337         return NULL;
2338     }
2339 }
2340 
2341 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2342 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2343 {
2344     enum PyUnicode_Kind kind;
2345     void *startptr, *endptr;
2346 
2347     assert(PyUnicode_IS_READY(unicode));
2348     assert(0 <= start);
2349     assert(end <= PyUnicode_GET_LENGTH(unicode));
2350     assert(start <= end);
2351 
2352     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2353         return PyUnicode_MAX_CHAR_VALUE(unicode);
2354 
2355     if (start == end)
2356         return 127;
2357 
2358     if (PyUnicode_IS_ASCII(unicode))
2359         return 127;
2360 
2361     kind = PyUnicode_KIND(unicode);
2362     startptr = PyUnicode_DATA(unicode);
2363     endptr = (char *)startptr + end * kind;
2364     startptr = (char *)startptr + start * kind;
2365     switch(kind) {
2366     case PyUnicode_1BYTE_KIND:
2367         return ucs1lib_find_max_char(startptr, endptr);
2368     case PyUnicode_2BYTE_KIND:
2369         return ucs2lib_find_max_char(startptr, endptr);
2370     case PyUnicode_4BYTE_KIND:
2371         return ucs4lib_find_max_char(startptr, endptr);
2372     default:
2373         Py_UNREACHABLE();
2374     }
2375 }
2376 
2377 /* Ensure that a string uses the most efficient storage, if it is not the
2378    case: create a new string with of the right kind. Write NULL into *p_unicode
2379    on error. */
2380 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2381 unicode_adjust_maxchar(PyObject **p_unicode)
2382 {
2383     PyObject *unicode, *copy;
2384     Py_UCS4 max_char;
2385     Py_ssize_t len;
2386     unsigned int kind;
2387 
2388     assert(p_unicode != NULL);
2389     unicode = *p_unicode;
2390     assert(PyUnicode_IS_READY(unicode));
2391     if (PyUnicode_IS_ASCII(unicode))
2392         return;
2393 
2394     len = PyUnicode_GET_LENGTH(unicode);
2395     kind = PyUnicode_KIND(unicode);
2396     if (kind == PyUnicode_1BYTE_KIND) {
2397         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2398         max_char = ucs1lib_find_max_char(u, u + len);
2399         if (max_char >= 128)
2400             return;
2401     }
2402     else if (kind == PyUnicode_2BYTE_KIND) {
2403         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2404         max_char = ucs2lib_find_max_char(u, u + len);
2405         if (max_char >= 256)
2406             return;
2407     }
2408     else {
2409         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2410         assert(kind == PyUnicode_4BYTE_KIND);
2411         max_char = ucs4lib_find_max_char(u, u + len);
2412         if (max_char >= 0x10000)
2413             return;
2414     }
2415     copy = PyUnicode_New(len, max_char);
2416     if (copy != NULL)
2417         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2418     Py_DECREF(unicode);
2419     *p_unicode = copy;
2420 }
2421 
2422 PyObject*
_PyUnicode_Copy(PyObject * unicode)2423 _PyUnicode_Copy(PyObject *unicode)
2424 {
2425     Py_ssize_t length;
2426     PyObject *copy;
2427 
2428     if (!PyUnicode_Check(unicode)) {
2429         PyErr_BadInternalCall();
2430         return NULL;
2431     }
2432     if (PyUnicode_READY(unicode) == -1)
2433         return NULL;
2434 
2435     length = PyUnicode_GET_LENGTH(unicode);
2436     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2437     if (!copy)
2438         return NULL;
2439     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2440 
2441     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2442               length * PyUnicode_KIND(unicode));
2443     assert(_PyUnicode_CheckConsistency(copy, 1));
2444     return copy;
2445 }
2446 
2447 
2448 /* Widen Unicode objects to larger buffers. Don't write terminating null
2449    character. Return NULL on error. */
2450 
2451 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2452 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2453 {
2454     Py_ssize_t len;
2455     void *result;
2456     unsigned int skind;
2457 
2458     if (PyUnicode_READY(s) == -1)
2459         return NULL;
2460 
2461     len = PyUnicode_GET_LENGTH(s);
2462     skind = PyUnicode_KIND(s);
2463     if (skind >= kind) {
2464         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2465         return NULL;
2466     }
2467     switch (kind) {
2468     case PyUnicode_2BYTE_KIND:
2469         result = PyMem_New(Py_UCS2, len);
2470         if (!result)
2471             return PyErr_NoMemory();
2472         assert(skind == PyUnicode_1BYTE_KIND);
2473         _PyUnicode_CONVERT_BYTES(
2474             Py_UCS1, Py_UCS2,
2475             PyUnicode_1BYTE_DATA(s),
2476             PyUnicode_1BYTE_DATA(s) + len,
2477             result);
2478         return result;
2479     case PyUnicode_4BYTE_KIND:
2480         result = PyMem_New(Py_UCS4, len);
2481         if (!result)
2482             return PyErr_NoMemory();
2483         if (skind == PyUnicode_2BYTE_KIND) {
2484             _PyUnicode_CONVERT_BYTES(
2485                 Py_UCS2, Py_UCS4,
2486                 PyUnicode_2BYTE_DATA(s),
2487                 PyUnicode_2BYTE_DATA(s) + len,
2488                 result);
2489         }
2490         else {
2491             assert(skind == PyUnicode_1BYTE_KIND);
2492             _PyUnicode_CONVERT_BYTES(
2493                 Py_UCS1, Py_UCS4,
2494                 PyUnicode_1BYTE_DATA(s),
2495                 PyUnicode_1BYTE_DATA(s) + len,
2496                 result);
2497         }
2498         return result;
2499     default:
2500         break;
2501     }
2502     PyErr_SetString(PyExc_SystemError, "invalid kind");
2503     return NULL;
2504 }
2505 
2506 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2507 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2508         int copy_null)
2509 {
2510     int kind;
2511     void *data;
2512     Py_ssize_t len, targetlen;
2513     if (PyUnicode_READY(string) == -1)
2514         return NULL;
2515     kind = PyUnicode_KIND(string);
2516     data = PyUnicode_DATA(string);
2517     len = PyUnicode_GET_LENGTH(string);
2518     targetlen = len;
2519     if (copy_null)
2520         targetlen++;
2521     if (!target) {
2522         target = PyMem_New(Py_UCS4, targetlen);
2523         if (!target) {
2524             PyErr_NoMemory();
2525             return NULL;
2526         }
2527     }
2528     else {
2529         if (targetsize < targetlen) {
2530             PyErr_Format(PyExc_SystemError,
2531                          "string is longer than the buffer");
2532             if (copy_null && 0 < targetsize)
2533                 target[0] = 0;
2534             return NULL;
2535         }
2536     }
2537     if (kind == PyUnicode_1BYTE_KIND) {
2538         Py_UCS1 *start = (Py_UCS1 *) data;
2539         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2540     }
2541     else if (kind == PyUnicode_2BYTE_KIND) {
2542         Py_UCS2 *start = (Py_UCS2 *) data;
2543         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2544     }
2545     else {
2546         assert(kind == PyUnicode_4BYTE_KIND);
2547         memcpy(target, data, len * sizeof(Py_UCS4));
2548     }
2549     if (copy_null)
2550         target[len] = 0;
2551     return target;
2552 }
2553 
2554 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2555 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2556                  int copy_null)
2557 {
2558     if (target == NULL || targetsize < 0) {
2559         PyErr_BadInternalCall();
2560         return NULL;
2561     }
2562     return as_ucs4(string, target, targetsize, copy_null);
2563 }
2564 
2565 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2566 PyUnicode_AsUCS4Copy(PyObject *string)
2567 {
2568     return as_ucs4(string, NULL, 0, 1);
2569 }
2570 
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2575 
2576 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2577 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2578                              Py_ssize_t width, Py_ssize_t precision)
2579 {
2580     Py_ssize_t length, fill, arglen;
2581     Py_UCS4 maxchar;
2582 
2583     if (PyUnicode_READY(str) == -1)
2584         return -1;
2585 
2586     length = PyUnicode_GET_LENGTH(str);
2587     if ((precision == -1 || precision >= length)
2588         && width <= length)
2589         return _PyUnicodeWriter_WriteStr(writer, str);
2590 
2591     if (precision != -1)
2592         length = Py_MIN(precision, length);
2593 
2594     arglen = Py_MAX(length, width);
2595     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2596         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2597     else
2598         maxchar = writer->maxchar;
2599 
2600     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2601         return -1;
2602 
2603     if (width > length) {
2604         fill = width - length;
2605         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2606             return -1;
2607         writer->pos += fill;
2608     }
2609 
2610     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2611                                   str, 0, length);
2612     writer->pos += length;
2613     return 0;
2614 }
2615 
2616 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2617 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2618                               Py_ssize_t width, Py_ssize_t precision)
2619 {
2620     /* UTF-8 */
2621     Py_ssize_t length;
2622     PyObject *unicode;
2623     int res;
2624 
2625     if (precision == -1) {
2626         length = strlen(str);
2627     }
2628     else {
2629         length = 0;
2630         while (length < precision && str[length]) {
2631             length++;
2632         }
2633     }
2634     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2635     if (unicode == NULL)
2636         return -1;
2637 
2638     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2639     Py_DECREF(unicode);
2640     return res;
2641 }
2642 
2643 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2644 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2645                        const char *f, va_list *vargs)
2646 {
2647     const char *p;
2648     Py_ssize_t len;
2649     int zeropad;
2650     Py_ssize_t width;
2651     Py_ssize_t precision;
2652     int longflag;
2653     int longlongflag;
2654     int size_tflag;
2655     Py_ssize_t fill;
2656 
2657     p = f;
2658     f++;
2659     zeropad = 0;
2660     if (*f == '0') {
2661         zeropad = 1;
2662         f++;
2663     }
2664 
2665     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2666     width = -1;
2667     if (Py_ISDIGIT((unsigned)*f)) {
2668         width = *f - '0';
2669         f++;
2670         while (Py_ISDIGIT((unsigned)*f)) {
2671             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2672                 PyErr_SetString(PyExc_ValueError,
2673                                 "width too big");
2674                 return NULL;
2675             }
2676             width = (width * 10) + (*f - '0');
2677             f++;
2678         }
2679     }
2680     precision = -1;
2681     if (*f == '.') {
2682         f++;
2683         if (Py_ISDIGIT((unsigned)*f)) {
2684             precision = (*f - '0');
2685             f++;
2686             while (Py_ISDIGIT((unsigned)*f)) {
2687                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2688                     PyErr_SetString(PyExc_ValueError,
2689                                     "precision too big");
2690                     return NULL;
2691                 }
2692                 precision = (precision * 10) + (*f - '0');
2693                 f++;
2694             }
2695         }
2696         if (*f == '%') {
2697             /* "%.3%s" => f points to "3" */
2698             f--;
2699         }
2700     }
2701     if (*f == '\0') {
2702         /* bogus format "%.123" => go backward, f points to "3" */
2703         f--;
2704     }
2705 
2706     /* Handle %ld, %lu, %lld and %llu. */
2707     longflag = 0;
2708     longlongflag = 0;
2709     size_tflag = 0;
2710     if (*f == 'l') {
2711         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2712             longflag = 1;
2713             ++f;
2714         }
2715         else if (f[1] == 'l' &&
2716                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2717             longlongflag = 1;
2718             f += 2;
2719         }
2720     }
2721     /* handle the size_t flag. */
2722     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2723         size_tflag = 1;
2724         ++f;
2725     }
2726 
2727     if (f[1] == '\0')
2728         writer->overallocate = 0;
2729 
2730     switch (*f) {
2731     case 'c':
2732     {
2733         int ordinal = va_arg(*vargs, int);
2734         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2735             PyErr_SetString(PyExc_OverflowError,
2736                             "character argument not in range(0x110000)");
2737             return NULL;
2738         }
2739         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2740             return NULL;
2741         break;
2742     }
2743 
2744     case 'i':
2745     case 'd':
2746     case 'u':
2747     case 'x':
2748     {
2749         /* used by sprintf */
2750         char buffer[MAX_LONG_LONG_CHARS];
2751         Py_ssize_t arglen;
2752 
2753         if (*f == 'u') {
2754             if (longflag)
2755                 len = sprintf(buffer, "%lu",
2756                         va_arg(*vargs, unsigned long));
2757             else if (longlongflag)
2758                 len = sprintf(buffer, "%llu",
2759                         va_arg(*vargs, unsigned long long));
2760             else if (size_tflag)
2761                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2762                         va_arg(*vargs, size_t));
2763             else
2764                 len = sprintf(buffer, "%u",
2765                         va_arg(*vargs, unsigned int));
2766         }
2767         else if (*f == 'x') {
2768             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2769         }
2770         else {
2771             if (longflag)
2772                 len = sprintf(buffer, "%li",
2773                         va_arg(*vargs, long));
2774             else if (longlongflag)
2775                 len = sprintf(buffer, "%lli",
2776                         va_arg(*vargs, long long));
2777             else if (size_tflag)
2778                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2779                         va_arg(*vargs, Py_ssize_t));
2780             else
2781                 len = sprintf(buffer, "%i",
2782                         va_arg(*vargs, int));
2783         }
2784         assert(len >= 0);
2785 
2786         if (precision < len)
2787             precision = len;
2788 
2789         arglen = Py_MAX(precision, width);
2790         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2791             return NULL;
2792 
2793         if (width > precision) {
2794             Py_UCS4 fillchar;
2795             fill = width - precision;
2796             fillchar = zeropad?'0':' ';
2797             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2798                 return NULL;
2799             writer->pos += fill;
2800         }
2801         if (precision > len) {
2802             fill = precision - len;
2803             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2804                 return NULL;
2805             writer->pos += fill;
2806         }
2807 
2808         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2809             return NULL;
2810         break;
2811     }
2812 
2813     case 'p':
2814     {
2815         char number[MAX_LONG_LONG_CHARS];
2816 
2817         len = sprintf(number, "%p", va_arg(*vargs, void*));
2818         assert(len >= 0);
2819 
2820         /* %p is ill-defined:  ensure leading 0x. */
2821         if (number[1] == 'X')
2822             number[1] = 'x';
2823         else if (number[1] != 'x') {
2824             memmove(number + 2, number,
2825                     strlen(number) + 1);
2826             number[0] = '0';
2827             number[1] = 'x';
2828             len += 2;
2829         }
2830 
2831         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2832             return NULL;
2833         break;
2834     }
2835 
2836     case 's':
2837     {
2838         /* UTF-8 */
2839         const char *s = va_arg(*vargs, const char*);
2840         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2841             return NULL;
2842         break;
2843     }
2844 
2845     case 'U':
2846     {
2847         PyObject *obj = va_arg(*vargs, PyObject *);
2848         assert(obj && _PyUnicode_CHECK(obj));
2849 
2850         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2851             return NULL;
2852         break;
2853     }
2854 
2855     case 'V':
2856     {
2857         PyObject *obj = va_arg(*vargs, PyObject *);
2858         const char *str = va_arg(*vargs, const char *);
2859         if (obj) {
2860             assert(_PyUnicode_CHECK(obj));
2861             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2862                 return NULL;
2863         }
2864         else {
2865             assert(str != NULL);
2866             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2867                 return NULL;
2868         }
2869         break;
2870     }
2871 
2872     case 'S':
2873     {
2874         PyObject *obj = va_arg(*vargs, PyObject *);
2875         PyObject *str;
2876         assert(obj);
2877         str = PyObject_Str(obj);
2878         if (!str)
2879             return NULL;
2880         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2881             Py_DECREF(str);
2882             return NULL;
2883         }
2884         Py_DECREF(str);
2885         break;
2886     }
2887 
2888     case 'R':
2889     {
2890         PyObject *obj = va_arg(*vargs, PyObject *);
2891         PyObject *repr;
2892         assert(obj);
2893         repr = PyObject_Repr(obj);
2894         if (!repr)
2895             return NULL;
2896         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2897             Py_DECREF(repr);
2898             return NULL;
2899         }
2900         Py_DECREF(repr);
2901         break;
2902     }
2903 
2904     case 'A':
2905     {
2906         PyObject *obj = va_arg(*vargs, PyObject *);
2907         PyObject *ascii;
2908         assert(obj);
2909         ascii = PyObject_ASCII(obj);
2910         if (!ascii)
2911             return NULL;
2912         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2913             Py_DECREF(ascii);
2914             return NULL;
2915         }
2916         Py_DECREF(ascii);
2917         break;
2918     }
2919 
2920     case '%':
2921         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2922             return NULL;
2923         break;
2924 
2925     default:
2926         /* if we stumble upon an unknown formatting code, copy the rest
2927            of the format string to the output string. (we cannot just
2928            skip the code, since there's no way to know what's in the
2929            argument list) */
2930         len = strlen(p);
2931         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2932             return NULL;
2933         f = p+len;
2934         return f;
2935     }
2936 
2937     f++;
2938     return f;
2939 }
2940 
2941 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2942 PyUnicode_FromFormatV(const char *format, va_list vargs)
2943 {
2944     va_list vargs2;
2945     const char *f;
2946     _PyUnicodeWriter writer;
2947 
2948     _PyUnicodeWriter_Init(&writer);
2949     writer.min_length = strlen(format) + 100;
2950     writer.overallocate = 1;
2951 
2952     // Copy varags to be able to pass a reference to a subfunction.
2953     va_copy(vargs2, vargs);
2954 
2955     for (f = format; *f; ) {
2956         if (*f == '%') {
2957             f = unicode_fromformat_arg(&writer, f, &vargs2);
2958             if (f == NULL)
2959                 goto fail;
2960         }
2961         else {
2962             const char *p;
2963             Py_ssize_t len;
2964 
2965             p = f;
2966             do
2967             {
2968                 if ((unsigned char)*p > 127) {
2969                     PyErr_Format(PyExc_ValueError,
2970                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2971                         "string, got a non-ASCII byte: 0x%02x",
2972                         (unsigned char)*p);
2973                     goto fail;
2974                 }
2975                 p++;
2976             }
2977             while (*p != '\0' && *p != '%');
2978             len = p - f;
2979 
2980             if (*p == '\0')
2981                 writer.overallocate = 0;
2982 
2983             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2984                 goto fail;
2985 
2986             f = p;
2987         }
2988     }
2989     va_end(vargs2);
2990     return _PyUnicodeWriter_Finish(&writer);
2991 
2992   fail:
2993     va_end(vargs2);
2994     _PyUnicodeWriter_Dealloc(&writer);
2995     return NULL;
2996 }
2997 
2998 PyObject *
PyUnicode_FromFormat(const char * format,...)2999 PyUnicode_FromFormat(const char *format, ...)
3000 {
3001     PyObject* ret;
3002     va_list vargs;
3003 
3004 #ifdef HAVE_STDARG_PROTOTYPES
3005     va_start(vargs, format);
3006 #else
3007     va_start(vargs);
3008 #endif
3009     ret = PyUnicode_FromFormatV(format, vargs);
3010     va_end(vargs);
3011     return ret;
3012 }
3013 
3014 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3015 unicode_get_widechar_size(PyObject *unicode)
3016 {
3017     Py_ssize_t res;
3018 
3019     assert(unicode != NULL);
3020     assert(_PyUnicode_CHECK(unicode));
3021 
3022     if (_PyUnicode_WSTR(unicode) != NULL) {
3023         return PyUnicode_WSTR_LENGTH(unicode);
3024     }
3025     assert(PyUnicode_IS_READY(unicode));
3026 
3027     res = _PyUnicode_LENGTH(unicode);
3028 #if SIZEOF_WCHAR_T == 2
3029     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3030         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3031         const Py_UCS4 *end = s + res;
3032         for (; s < end; ++s) {
3033             if (*s > 0xFFFF) {
3034                 ++res;
3035             }
3036         }
3037     }
3038 #endif
3039     return res;
3040 }
3041 
3042 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3043 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3044 {
3045     const wchar_t *wstr;
3046 
3047     assert(unicode != NULL);
3048     assert(_PyUnicode_CHECK(unicode));
3049 
3050     wstr = _PyUnicode_WSTR(unicode);
3051     if (wstr != NULL) {
3052         memcpy(w, wstr, size * sizeof(wchar_t));
3053         return;
3054     }
3055     assert(PyUnicode_IS_READY(unicode));
3056 
3057     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3058         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3059         for (; size--; ++s, ++w) {
3060             *w = *s;
3061         }
3062     }
3063     else {
3064 #if SIZEOF_WCHAR_T == 4
3065         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3066         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3067         for (; size--; ++s, ++w) {
3068             *w = *s;
3069         }
3070 #else
3071         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3072         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3073         for (; size--; ++s, ++w) {
3074             Py_UCS4 ch = *s;
3075             if (ch > 0xFFFF) {
3076                 assert(ch <= MAX_UNICODE);
3077                 /* encode surrogate pair in this case */
3078                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3079                 if (!size--)
3080                     break;
3081                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3082             }
3083             else {
3084                 *w = ch;
3085             }
3086         }
3087 #endif
3088     }
3089 }
3090 
3091 #ifdef HAVE_WCHAR_H
3092 
3093 /* Convert a Unicode object to a wide character string.
3094 
3095    - If w is NULL: return the number of wide characters (including the null
3096      character) required to convert the unicode object. Ignore size argument.
3097 
3098    - Otherwise: return the number of wide characters (excluding the null
3099      character) written into w. Write at most size wide characters (including
3100      the null character). */
3101 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3102 PyUnicode_AsWideChar(PyObject *unicode,
3103                      wchar_t *w,
3104                      Py_ssize_t size)
3105 {
3106     Py_ssize_t res;
3107 
3108     if (unicode == NULL) {
3109         PyErr_BadInternalCall();
3110         return -1;
3111     }
3112     if (!PyUnicode_Check(unicode)) {
3113         PyErr_BadArgument();
3114         return -1;
3115     }
3116 
3117     res = unicode_get_widechar_size(unicode);
3118     if (w == NULL) {
3119         return res + 1;
3120     }
3121 
3122     if (size > res) {
3123         size = res + 1;
3124     }
3125     else {
3126         res = size;
3127     }
3128     unicode_copy_as_widechar(unicode, w, size);
3129     return res;
3130 }
3131 
3132 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3133 PyUnicode_AsWideCharString(PyObject *unicode,
3134                            Py_ssize_t *size)
3135 {
3136     wchar_t *buffer;
3137     Py_ssize_t buflen;
3138 
3139     if (unicode == NULL) {
3140         PyErr_BadInternalCall();
3141         return NULL;
3142     }
3143     if (!PyUnicode_Check(unicode)) {
3144         PyErr_BadArgument();
3145         return NULL;
3146     }
3147 
3148     buflen = unicode_get_widechar_size(unicode);
3149     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3150     if (buffer == NULL) {
3151         PyErr_NoMemory();
3152         return NULL;
3153     }
3154     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3155     if (size != NULL) {
3156         *size = buflen;
3157     }
3158     else if (wcslen(buffer) != (size_t)buflen) {
3159         PyMem_FREE(buffer);
3160         PyErr_SetString(PyExc_ValueError,
3161                         "embedded null character");
3162         return NULL;
3163     }
3164     return buffer;
3165 }
3166 
3167 #endif /* HAVE_WCHAR_H */
3168 
3169 PyObject *
PyUnicode_FromOrdinal(int ordinal)3170 PyUnicode_FromOrdinal(int ordinal)
3171 {
3172     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3173         PyErr_SetString(PyExc_ValueError,
3174                         "chr() arg not in range(0x110000)");
3175         return NULL;
3176     }
3177 
3178     return unicode_char((Py_UCS4)ordinal);
3179 }
3180 
3181 PyObject *
PyUnicode_FromObject(PyObject * obj)3182 PyUnicode_FromObject(PyObject *obj)
3183 {
3184     /* XXX Perhaps we should make this API an alias of
3185        PyObject_Str() instead ?! */
3186     if (PyUnicode_CheckExact(obj)) {
3187         if (PyUnicode_READY(obj) == -1)
3188             return NULL;
3189         Py_INCREF(obj);
3190         return obj;
3191     }
3192     if (PyUnicode_Check(obj)) {
3193         /* For a Unicode subtype that's not a Unicode object,
3194            return a true Unicode object with the same data. */
3195         return _PyUnicode_Copy(obj);
3196     }
3197     PyErr_Format(PyExc_TypeError,
3198                  "Can't convert '%.100s' object to str implicitly",
3199                  Py_TYPE(obj)->tp_name);
3200     return NULL;
3201 }
3202 
3203 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3204 PyUnicode_FromEncodedObject(PyObject *obj,
3205                             const char *encoding,
3206                             const char *errors)
3207 {
3208     Py_buffer buffer;
3209     PyObject *v;
3210 
3211     if (obj == NULL) {
3212         PyErr_BadInternalCall();
3213         return NULL;
3214     }
3215 
3216     /* Decoding bytes objects is the most common case and should be fast */
3217     if (PyBytes_Check(obj)) {
3218         if (PyBytes_GET_SIZE(obj) == 0)
3219             _Py_RETURN_UNICODE_EMPTY();
3220         v = PyUnicode_Decode(
3221                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3222                 encoding, errors);
3223         return v;
3224     }
3225 
3226     if (PyUnicode_Check(obj)) {
3227         PyErr_SetString(PyExc_TypeError,
3228                         "decoding str is not supported");
3229         return NULL;
3230     }
3231 
3232     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3233     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3234         PyErr_Format(PyExc_TypeError,
3235                      "decoding to str: need a bytes-like object, %.80s found",
3236                      Py_TYPE(obj)->tp_name);
3237         return NULL;
3238     }
3239 
3240     if (buffer.len == 0) {
3241         PyBuffer_Release(&buffer);
3242         _Py_RETURN_UNICODE_EMPTY();
3243     }
3244 
3245     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3246     PyBuffer_Release(&buffer);
3247     return v;
3248 }
3249 
/* Normalize an encoding name: similar to encodings.normalize_encoding(),
   but also convert it to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`;
   every run of other characters is collapsed into a single '_', except
   that such runs are dropped at the start and at the end of the name.

   Return 1 on success, or 0 on error (the normalized name does not fit
   into lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    char *const out_end = &lower[lower_len - 1];  /* slot of the final NUL */
    int pending_sep = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted once a further
               name character shows up. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3298 
3299 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3300 PyUnicode_Decode(const char *s,
3301                  Py_ssize_t size,
3302                  const char *encoding,
3303                  const char *errors)
3304 {
3305     PyObject *buffer = NULL, *unicode;
3306     Py_buffer info;
3307     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3308 
3309     if (encoding == NULL) {
3310         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3311     }
3312 
3313     /* Shortcuts for common default encodings */
3314     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3315         char *lower = buflower;
3316 
3317         /* Fast paths */
3318         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3319             lower += 3;
3320             if (*lower == '_') {
3321                 /* Match "utf8" and "utf_8" */
3322                 lower++;
3323             }
3324 
3325             if (lower[0] == '8' && lower[1] == 0) {
3326                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3327             }
3328             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3329                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3330             }
3331             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3332                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3333             }
3334         }
3335         else {
3336             if (strcmp(lower, "ascii") == 0
3337                 || strcmp(lower, "us_ascii") == 0) {
3338                 return PyUnicode_DecodeASCII(s, size, errors);
3339             }
3340     #ifdef MS_WINDOWS
3341             else if (strcmp(lower, "mbcs") == 0) {
3342                 return PyUnicode_DecodeMBCS(s, size, errors);
3343             }
3344     #endif
3345             else if (strcmp(lower, "latin1") == 0
3346                      || strcmp(lower, "latin_1") == 0
3347                      || strcmp(lower, "iso_8859_1") == 0
3348                      || strcmp(lower, "iso8859_1") == 0) {
3349                 return PyUnicode_DecodeLatin1(s, size, errors);
3350             }
3351         }
3352     }
3353 
3354     /* Decode via the codec registry */
3355     buffer = NULL;
3356     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3357         goto onError;
3358     buffer = PyMemoryView_FromBuffer(&info);
3359     if (buffer == NULL)
3360         goto onError;
3361     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3362     if (unicode == NULL)
3363         goto onError;
3364     if (!PyUnicode_Check(unicode)) {
3365         PyErr_Format(PyExc_TypeError,
3366                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3367                      "use codecs.decode() to decode to arbitrary types",
3368                      encoding,
3369                      Py_TYPE(unicode)->tp_name);
3370         Py_DECREF(unicode);
3371         goto onError;
3372     }
3373     Py_DECREF(buffer);
3374     return unicode_result(unicode);
3375 
3376   onError:
3377     Py_XDECREF(buffer);
3378     return NULL;
3379 }
3380 
3381 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3382 PyUnicode_AsDecodedObject(PyObject *unicode,
3383                           const char *encoding,
3384                           const char *errors)
3385 {
3386     if (!PyUnicode_Check(unicode)) {
3387         PyErr_BadArgument();
3388         return NULL;
3389     }
3390 
3391     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3392                      "PyUnicode_AsDecodedObject() is deprecated; "
3393                      "use PyCodec_Decode() to decode from str", 1) < 0)
3394         return NULL;
3395 
3396     if (encoding == NULL)
3397         encoding = PyUnicode_GetDefaultEncoding();
3398 
3399     /* Decode via the codec registry */
3400     return PyCodec_Decode(unicode, encoding, errors);
3401 }
3402 
3403 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3404 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3405                            const char *encoding,
3406                            const char *errors)
3407 {
3408     PyObject *v;
3409 
3410     if (!PyUnicode_Check(unicode)) {
3411         PyErr_BadArgument();
3412         goto onError;
3413     }
3414 
3415     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3416                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3417                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3418         return NULL;
3419 
3420     if (encoding == NULL)
3421         encoding = PyUnicode_GetDefaultEncoding();
3422 
3423     /* Decode via the codec registry */
3424     v = PyCodec_Decode(unicode, encoding, errors);
3425     if (v == NULL)
3426         goto onError;
3427     if (!PyUnicode_Check(v)) {
3428         PyErr_Format(PyExc_TypeError,
3429                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3430                      "use codecs.decode() to decode to arbitrary types",
3431                      encoding,
3432                      Py_TYPE(unicode)->tp_name);
3433         Py_DECREF(v);
3434         goto onError;
3435     }
3436     return unicode_result(v);
3437 
3438   onError:
3439     return NULL;
3440 }
3441 
3442 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3443 PyUnicode_Encode(const Py_UNICODE *s,
3444                  Py_ssize_t size,
3445                  const char *encoding,
3446                  const char *errors)
3447 {
3448     PyObject *v, *unicode;
3449 
3450     unicode = PyUnicode_FromWideChar(s, size);
3451     if (unicode == NULL)
3452         return NULL;
3453     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3454     Py_DECREF(unicode);
3455     return v;
3456 }
3457 
3458 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3459 PyUnicode_AsEncodedObject(PyObject *unicode,
3460                           const char *encoding,
3461                           const char *errors)
3462 {
3463     PyObject *v;
3464 
3465     if (!PyUnicode_Check(unicode)) {
3466         PyErr_BadArgument();
3467         goto onError;
3468     }
3469 
3470     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3471                      "PyUnicode_AsEncodedObject() is deprecated; "
3472                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3473                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3474         return NULL;
3475 
3476     if (encoding == NULL)
3477         encoding = PyUnicode_GetDefaultEncoding();
3478 
3479     /* Encode via the codec registry */
3480     v = PyCodec_Encode(unicode, encoding, errors);
3481     if (v == NULL)
3482         goto onError;
3483     return v;
3484 
3485   onError:
3486     return NULL;
3487 }
3488 
3489 
3490 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3491 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3492                       int current_locale)
3493 {
3494     Py_ssize_t wlen;
3495     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3496     if (wstr == NULL) {
3497         return NULL;
3498     }
3499 
3500     if ((size_t)wlen != wcslen(wstr)) {
3501         PyErr_SetString(PyExc_ValueError, "embedded null character");
3502         PyMem_Free(wstr);
3503         return NULL;
3504     }
3505 
3506     char *str;
3507     size_t error_pos;
3508     const char *reason;
3509     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3510                                  current_locale, error_handler);
3511     PyMem_Free(wstr);
3512 
3513     if (res != 0) {
3514         if (res == -2) {
3515             PyObject *exc;
3516             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3517                     "locale", unicode,
3518                     (Py_ssize_t)error_pos,
3519                     (Py_ssize_t)(error_pos+1),
3520                     reason);
3521             if (exc != NULL) {
3522                 PyCodec_StrictErrors(exc);
3523                 Py_DECREF(exc);
3524             }
3525         }
3526         else if (res == -3) {
3527             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3528         }
3529         else {
3530             PyErr_NoMemory();
3531         }
3532         return NULL;
3533     }
3534 
3535     PyObject *bytes = PyBytes_FromString(str);
3536     PyMem_RawFree(str);
3537     return bytes;
3538 }
3539 
3540 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3541 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3542 {
3543     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3544     return unicode_encode_locale(unicode, error_handler, 1);
3545 }
3546 
3547 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3548 PyUnicode_EncodeFSDefault(PyObject *unicode)
3549 {
3550     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3551 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3552     if (interp->fs_codec.encoding) {
3553         return unicode_encode_utf8(unicode,
3554                                    interp->fs_codec.error_handler,
3555                                    interp->fs_codec.errors);
3556     }
3557     else {
3558         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3559         _Py_error_handler errors;
3560         errors = get_error_handler_wide(filesystem_errors);
3561         assert(errors != _Py_ERROR_UNKNOWN);
3562         return unicode_encode_utf8(unicode, errors, NULL);
3563     }
3564 #else
3565     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3566        cannot use it to encode and decode filenames before it is loaded. Load
3567        the Python codec requires to encode at least its own filename. Use the C
3568        implementation of the locale codec until the codec registry is
3569        initialized and the Python codec is loaded. See initfsencoding(). */
3570     if (interp->fs_codec.encoding) {
3571         return PyUnicode_AsEncodedString(unicode,
3572                                          interp->fs_codec.encoding,
3573                                          interp->fs_codec.errors);
3574     }
3575     else {
3576         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3577         _Py_error_handler errors;
3578         errors = get_error_handler_wide(filesystem_errors);
3579         assert(errors != _Py_ERROR_UNKNOWN);
3580         return unicode_encode_locale(unicode, errors, 0);
3581     }
3582 #endif
3583 }
3584 
3585 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3586 PyUnicode_AsEncodedString(PyObject *unicode,
3587                           const char *encoding,
3588                           const char *errors)
3589 {
3590     PyObject *v;
3591     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3592 
3593     if (!PyUnicode_Check(unicode)) {
3594         PyErr_BadArgument();
3595         return NULL;
3596     }
3597 
3598     if (encoding == NULL) {
3599         return _PyUnicode_AsUTF8String(unicode, errors);
3600     }
3601 
3602     /* Shortcuts for common default encodings */
3603     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3604         char *lower = buflower;
3605 
3606         /* Fast paths */
3607         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3608             lower += 3;
3609             if (*lower == '_') {
3610                 /* Match "utf8" and "utf_8" */
3611                 lower++;
3612             }
3613 
3614             if (lower[0] == '8' && lower[1] == 0) {
3615                 return _PyUnicode_AsUTF8String(unicode, errors);
3616             }
3617             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3618                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3619             }
3620             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3621                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3622             }
3623         }
3624         else {
3625             if (strcmp(lower, "ascii") == 0
3626                 || strcmp(lower, "us_ascii") == 0) {
3627                 return _PyUnicode_AsASCIIString(unicode, errors);
3628             }
3629 #ifdef MS_WINDOWS
3630             else if (strcmp(lower, "mbcs") == 0) {
3631                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3632             }
3633 #endif
3634             else if (strcmp(lower, "latin1") == 0 ||
3635                      strcmp(lower, "latin_1") == 0 ||
3636                      strcmp(lower, "iso_8859_1") == 0 ||
3637                      strcmp(lower, "iso8859_1") == 0) {
3638                 return _PyUnicode_AsLatin1String(unicode, errors);
3639             }
3640         }
3641     }
3642 
3643     /* Encode via the codec registry */
3644     v = _PyCodec_EncodeText(unicode, encoding, errors);
3645     if (v == NULL)
3646         return NULL;
3647 
3648     /* The normal path */
3649     if (PyBytes_Check(v))
3650         return v;
3651 
3652     /* If the codec returns a buffer, raise a warning and convert to bytes */
3653     if (PyByteArray_Check(v)) {
3654         int error;
3655         PyObject *b;
3656 
3657         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3658             "encoder %s returned bytearray instead of bytes; "
3659             "use codecs.encode() to encode to arbitrary types",
3660             encoding);
3661         if (error) {
3662             Py_DECREF(v);
3663             return NULL;
3664         }
3665 
3666         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3667                                       PyByteArray_GET_SIZE(v));
3668         Py_DECREF(v);
3669         return b;
3670     }
3671 
3672     PyErr_Format(PyExc_TypeError,
3673                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3674                  "use codecs.encode() to encode to arbitrary types",
3675                  encoding,
3676                  Py_TYPE(v)->tp_name);
3677     Py_DECREF(v);
3678     return NULL;
3679 }
3680 
3681 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3682 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3683                            const char *encoding,
3684                            const char *errors)
3685 {
3686     PyObject *v;
3687 
3688     if (!PyUnicode_Check(unicode)) {
3689         PyErr_BadArgument();
3690         goto onError;
3691     }
3692 
3693     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3694                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3695                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3696         return NULL;
3697 
3698     if (encoding == NULL)
3699         encoding = PyUnicode_GetDefaultEncoding();
3700 
3701     /* Encode via the codec registry */
3702     v = PyCodec_Encode(unicode, encoding, errors);
3703     if (v == NULL)
3704         goto onError;
3705     if (!PyUnicode_Check(v)) {
3706         PyErr_Format(PyExc_TypeError,
3707                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3708                      "use codecs.encode() to encode to arbitrary types",
3709                      encoding,
3710                      Py_TYPE(v)->tp_name);
3711         Py_DECREF(v);
3712         goto onError;
3713     }
3714     return v;
3715 
3716   onError:
3717     return NULL;
3718 }
3719 
3720 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3721 unicode_decode_locale(const char *str, Py_ssize_t len,
3722                       _Py_error_handler errors, int current_locale)
3723 {
3724     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3725         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3726         return NULL;
3727     }
3728 
3729     wchar_t *wstr;
3730     size_t wlen;
3731     const char *reason;
3732     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3733                                  current_locale, errors);
3734     if (res != 0) {
3735         if (res == -2) {
3736             PyObject *exc;
3737             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3738                                         "locale", str, len,
3739                                         (Py_ssize_t)wlen,
3740                                         (Py_ssize_t)(wlen + 1),
3741                                         reason);
3742             if (exc != NULL) {
3743                 PyCodec_StrictErrors(exc);
3744                 Py_DECREF(exc);
3745             }
3746         }
3747         else if (res == -3) {
3748             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3749         }
3750         else {
3751             PyErr_NoMemory();
3752         }
3753         return NULL;
3754     }
3755 
3756     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3757     PyMem_RawFree(wstr);
3758     return unicode;
3759 }
3760 
3761 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3762 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3763                               const char *errors)
3764 {
3765     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3766     return unicode_decode_locale(str, len, error_handler, 1);
3767 }
3768 
3769 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3770 PyUnicode_DecodeLocale(const char *str, const char *errors)
3771 {
3772     Py_ssize_t size = (Py_ssize_t)strlen(str);
3773     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3774     return unicode_decode_locale(str, size, error_handler, 1);
3775 }
3776 
3777 
3778 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3779 PyUnicode_DecodeFSDefault(const char *s) {
3780     Py_ssize_t size = (Py_ssize_t)strlen(s);
3781     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3782 }
3783 
3784 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3785 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3786 {
3787     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3788 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3789     if (interp->fs_codec.encoding) {
3790         return unicode_decode_utf8(s, size,
3791                                    interp->fs_codec.error_handler,
3792                                    interp->fs_codec.errors,
3793                                    NULL);
3794     }
3795     else {
3796         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3797         _Py_error_handler errors;
3798         errors = get_error_handler_wide(filesystem_errors);
3799         assert(errors != _Py_ERROR_UNKNOWN);
3800         return unicode_decode_utf8(s, size, errors, NULL, NULL);
3801     }
3802 #else
3803     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3804        cannot use it to encode and decode filenames before it is loaded. Load
3805        the Python codec requires to encode at least its own filename. Use the C
3806        implementation of the locale codec until the codec registry is
3807        initialized and the Python codec is loaded. See initfsencoding(). */
3808     if (interp->fs_codec.encoding) {
3809         return PyUnicode_Decode(s, size,
3810                                 interp->fs_codec.encoding,
3811                                 interp->fs_codec.errors);
3812     }
3813     else {
3814         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3815         _Py_error_handler errors;
3816         errors = get_error_handler_wide(filesystem_errors);
3817         return unicode_decode_locale(s, size, errors, 0);
3818     }
3819 #endif
3820 }
3821 
3822 
3823 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3824 PyUnicode_FSConverter(PyObject* arg, void* addr)
3825 {
3826     PyObject *path = NULL;
3827     PyObject *output = NULL;
3828     Py_ssize_t size;
3829     void *data;
3830     if (arg == NULL) {
3831         Py_DECREF(*(PyObject**)addr);
3832         *(PyObject**)addr = NULL;
3833         return 1;
3834     }
3835     path = PyOS_FSPath(arg);
3836     if (path == NULL) {
3837         return 0;
3838     }
3839     if (PyBytes_Check(path)) {
3840         output = path;
3841     }
3842     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3843         output = PyUnicode_EncodeFSDefault(path);
3844         Py_DECREF(path);
3845         if (!output) {
3846             return 0;
3847         }
3848         assert(PyBytes_Check(output));
3849     }
3850 
3851     size = PyBytes_GET_SIZE(output);
3852     data = PyBytes_AS_STRING(output);
3853     if ((size_t)size != strlen(data)) {
3854         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3855         Py_DECREF(output);
3856         return 0;
3857     }
3858     *(PyObject**)addr = output;
3859     return Py_CLEANUP_SUPPORTED;
3860 }
3861 
3862 
3863 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3864 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3865 {
3866     int is_buffer = 0;
3867     PyObject *path = NULL;
3868     PyObject *output = NULL;
3869     if (arg == NULL) {
3870         Py_DECREF(*(PyObject**)addr);
3871         *(PyObject**)addr = NULL;
3872         return 1;
3873     }
3874 
3875     is_buffer = PyObject_CheckBuffer(arg);
3876     if (!is_buffer) {
3877         path = PyOS_FSPath(arg);
3878         if (path == NULL) {
3879             return 0;
3880         }
3881     }
3882     else {
3883         path = arg;
3884         Py_INCREF(arg);
3885     }
3886 
3887     if (PyUnicode_Check(path)) {
3888         output = path;
3889     }
3890     else if (PyBytes_Check(path) || is_buffer) {
3891         PyObject *path_bytes = NULL;
3892 
3893         if (!PyBytes_Check(path) &&
3894             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3895             "path should be string, bytes, or os.PathLike, not %.200s",
3896             Py_TYPE(arg)->tp_name)) {
3897                 Py_DECREF(path);
3898             return 0;
3899         }
3900         path_bytes = PyBytes_FromObject(path);
3901         Py_DECREF(path);
3902         if (!path_bytes) {
3903             return 0;
3904         }
3905         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3906                                                   PyBytes_GET_SIZE(path_bytes));
3907         Py_DECREF(path_bytes);
3908         if (!output) {
3909             return 0;
3910         }
3911     }
3912     else {
3913         PyErr_Format(PyExc_TypeError,
3914                      "path should be string, bytes, or os.PathLike, not %.200s",
3915                      Py_TYPE(arg)->tp_name);
3916         Py_DECREF(path);
3917         return 0;
3918     }
3919     if (PyUnicode_READY(output) == -1) {
3920         Py_DECREF(output);
3921         return 0;
3922     }
3923     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3924                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3925         PyErr_SetString(PyExc_ValueError, "embedded null character");
3926         Py_DECREF(output);
3927         return 0;
3928     }
3929     *(PyObject**)addr = output;
3930     return Py_CLEANUP_SUPPORTED;
3931 }
3932 
3933 
3934 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3935 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3936 {
3937     PyObject *bytes;
3938 
3939     if (!PyUnicode_Check(unicode)) {
3940         PyErr_BadArgument();
3941         return NULL;
3942     }
3943     if (PyUnicode_READY(unicode) == -1)
3944         return NULL;
3945 
3946     if (PyUnicode_UTF8(unicode) == NULL) {
3947         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3948         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3949         if (bytes == NULL)
3950             return NULL;
3951         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3952         if (_PyUnicode_UTF8(unicode) == NULL) {
3953             PyErr_NoMemory();
3954             Py_DECREF(bytes);
3955             return NULL;
3956         }
3957         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3958         memcpy(_PyUnicode_UTF8(unicode),
3959                   PyBytes_AS_STRING(bytes),
3960                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
3961         Py_DECREF(bytes);
3962     }
3963 
3964     if (psize)
3965         *psize = PyUnicode_UTF8_LENGTH(unicode);
3966     return PyUnicode_UTF8(unicode);
3967 }
3968 
3969 const char *
PyUnicode_AsUTF8(PyObject * unicode)3970 PyUnicode_AsUTF8(PyObject *unicode)
3971 {
3972     return PyUnicode_AsUTF8AndSize(unicode, NULL);
3973 }
3974 
3975 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)3976 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3977 {
3978     if (!PyUnicode_Check(unicode)) {
3979         PyErr_BadArgument();
3980         return NULL;
3981     }
3982     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3983     if (w == NULL) {
3984         /* Non-ASCII compact unicode object */
3985         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
3986         assert(PyUnicode_IS_READY(unicode));
3987 
3988         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3989         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3990             PyErr_NoMemory();
3991             return NULL;
3992         }
3993         w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3994         if (w == NULL) {
3995             PyErr_NoMemory();
3996             return NULL;
3997         }
3998         unicode_copy_as_widechar(unicode, w, wlen + 1);
3999         _PyUnicode_WSTR(unicode) = w;
4000         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4001             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4002         }
4003     }
4004     if (size != NULL)
4005         *size = PyUnicode_WSTR_LENGTH(unicode);
4006     return w;
4007 }
4008 
4009 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4010 PyUnicode_AsUnicode(PyObject *unicode)
4011 {
4012     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4013 }
4014 
4015 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4016 _PyUnicode_AsUnicode(PyObject *unicode)
4017 {
4018     Py_ssize_t size;
4019     const Py_UNICODE *wstr;
4020 
4021     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4022     if (wstr && wcslen(wstr) != (size_t)size) {
4023         PyErr_SetString(PyExc_ValueError, "embedded null character");
4024         return NULL;
4025     }
4026     return wstr;
4027 }
4028 
4029 
4030 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4031 PyUnicode_GetSize(PyObject *unicode)
4032 {
4033     if (!PyUnicode_Check(unicode)) {
4034         PyErr_BadArgument();
4035         goto onError;
4036     }
4037     if (_PyUnicode_WSTR(unicode) == NULL) {
4038         if (PyUnicode_AsUnicode(unicode) == NULL)
4039             goto onError;
4040     }
4041     return PyUnicode_WSTR_LENGTH(unicode);
4042 
4043   onError:
4044     return -1;
4045 }
4046 
4047 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4048 PyUnicode_GetLength(PyObject *unicode)
4049 {
4050     if (!PyUnicode_Check(unicode)) {
4051         PyErr_BadArgument();
4052         return -1;
4053     }
4054     if (PyUnicode_READY(unicode) == -1)
4055         return -1;
4056     return PyUnicode_GET_LENGTH(unicode);
4057 }
4058 
4059 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4060 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4061 {
4062     void *data;
4063     int kind;
4064 
4065     if (!PyUnicode_Check(unicode)) {
4066         PyErr_BadArgument();
4067         return (Py_UCS4)-1;
4068     }
4069     if (PyUnicode_READY(unicode) == -1) {
4070         return (Py_UCS4)-1;
4071     }
4072     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4073         PyErr_SetString(PyExc_IndexError, "string index out of range");
4074         return (Py_UCS4)-1;
4075     }
4076     data = PyUnicode_DATA(unicode);
4077     kind = PyUnicode_KIND(unicode);
4078     return PyUnicode_READ(kind, data, index);
4079 }
4080 
4081 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4082 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4083 {
4084     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4085         PyErr_BadArgument();
4086         return -1;
4087     }
4088     assert(PyUnicode_IS_READY(unicode));
4089     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4090         PyErr_SetString(PyExc_IndexError, "string index out of range");
4091         return -1;
4092     }
4093     if (unicode_check_modifiable(unicode))
4094         return -1;
4095     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4096         PyErr_SetString(PyExc_ValueError, "character out of range");
4097         return -1;
4098     }
4099     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4100                     index, ch);
4101     return 0;
4102 }
4103 
/* Return the name of the default encoding.  The returned string has
   static storage duration and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4109 
4110 /* create or adjust a UnicodeDecodeError */
4111 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4112 make_decode_exception(PyObject **exceptionObject,
4113                       const char *encoding,
4114                       const char *input, Py_ssize_t length,
4115                       Py_ssize_t startpos, Py_ssize_t endpos,
4116                       const char *reason)
4117 {
4118     if (*exceptionObject == NULL) {
4119         *exceptionObject = PyUnicodeDecodeError_Create(
4120             encoding, input, length, startpos, endpos, reason);
4121     }
4122     else {
4123         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4124             goto onError;
4125         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4126             goto onError;
4127         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4128             goto onError;
4129     }
4130     return;
4131 
4132 onError:
4133     Py_CLEAR(*exceptionObject);
4134 }
4135 
4136 #ifdef MS_WINDOWS
4137 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4138 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4139 {
4140     if (newsize > *size) {
4141         wchar_t *newbuf = *buf;
4142         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4143             PyErr_NoMemory();
4144             return -1;
4145         }
4146         *buf = newbuf;
4147     }
4148     *size = newsize;
4149     return 0;
4150 }
4151 
4152 /* error handling callback helper:
4153    build arguments, call the callback and check the arguments,
4154    if no exception occurred, copy the replacement to the output
4155    and adjust various state variables.
4156    return 0 on success, -1 on error
4157 */
4158 
4159 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4160 unicode_decode_call_errorhandler_wchar(
4161     const char *errors, PyObject **errorHandler,
4162     const char *encoding, const char *reason,
4163     const char **input, const char **inend, Py_ssize_t *startinpos,
4164     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4165     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4166 {
4167     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4168 
4169     PyObject *restuple = NULL;
4170     PyObject *repunicode = NULL;
4171     Py_ssize_t outsize;
4172     Py_ssize_t insize;
4173     Py_ssize_t requiredsize;
4174     Py_ssize_t newpos;
4175     PyObject *inputobj = NULL;
4176     wchar_t *repwstr;
4177     Py_ssize_t repwlen;
4178 
4179     if (*errorHandler == NULL) {
4180         *errorHandler = PyCodec_LookupError(errors);
4181         if (*errorHandler == NULL)
4182             goto onError;
4183     }
4184 
4185     make_decode_exception(exceptionObject,
4186         encoding,
4187         *input, *inend - *input,
4188         *startinpos, *endinpos,
4189         reason);
4190     if (*exceptionObject == NULL)
4191         goto onError;
4192 
4193     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4194     if (restuple == NULL)
4195         goto onError;
4196     if (!PyTuple_Check(restuple)) {
4197         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4198         goto onError;
4199     }
4200     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4201         goto onError;
4202 
4203     /* Copy back the bytes variables, which might have been modified by the
4204        callback */
4205     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4206     if (!inputobj)
4207         goto onError;
4208     *input = PyBytes_AS_STRING(inputobj);
4209     insize = PyBytes_GET_SIZE(inputobj);
4210     *inend = *input + insize;
4211     /* we can DECREF safely, as the exception has another reference,
4212        so the object won't go away. */
4213     Py_DECREF(inputobj);
4214 
4215     if (newpos<0)
4216         newpos = insize+newpos;
4217     if (newpos<0 || newpos>insize) {
4218         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4219         goto onError;
4220     }
4221 
4222     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4223     if (repwstr == NULL)
4224         goto onError;
4225     /* need more space? (at least enough for what we
4226        have+the replacement+the rest of the string (starting
4227        at the new input position), so we won't have to check space
4228        when there are no errors in the rest of the string) */
4229     requiredsize = *outpos;
4230     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4231         goto overflow;
4232     requiredsize += repwlen;
4233     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4234         goto overflow;
4235     requiredsize += insize - newpos;
4236     outsize = *bufsize;
4237     if (requiredsize > outsize) {
4238         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4239             requiredsize = 2*outsize;
4240         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4241             goto onError;
4242         }
4243     }
4244     wcsncpy(*buf + *outpos, repwstr, repwlen);
4245     *outpos += repwlen;
4246     *endinpos = newpos;
4247     *inptr = *input + newpos;
4248 
4249     /* we made it! */
4250     Py_DECREF(restuple);
4251     return 0;
4252 
4253   overflow:
4254     PyErr_SetString(PyExc_OverflowError,
4255                     "decoded result is too long for a Python string");
4256 
4257   onError:
4258     Py_XDECREF(restuple);
4259     return -1;
4260 }
4261 #endif   /* MS_WINDOWS */
4262 
/* Error handling callback helper for decoders writing through a
   _PyUnicodeWriter: look up the error handler if necessary, create or
   update the UnicodeDecodeError, call the handler, validate its
   (str, int) result, write the replacement string to `writer` and update
   the input position variables.

   errors, errorHandler   name of the error handler and a slot for the
                          looked-up callable (filled on first use).
   encoding, reason       used to create or update the exception.
   input, inend           bounds of the input; rebound to the exception's
                          object after the call.
   startinpos, endinpos   offsets of the faulty input; *endinpos is set to
                          the position returned by the handler.
   exceptionObject        slot holding the exception object.
   inptr                  set to the input position where decoding resumes.
   writer                 output writer receiving the replacement.

   Returns 0 on success, -1 on error with an exception set. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format: "Un" parses (str, Py_ssize_t); the text
       after ';' is the error message.  &argparse[3] is that message alone. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once; later calls reuse the stored callable. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* Leaves *exceptionObject NULL on failure. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes after the faulty sequence, measured against
       the OLD input bounds; must be computed before they are rebound. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* A replacement longer than one character raises the minimum output
       length by the extra characters. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input left to decode than before the call (the handler moved
       the position backwards or substituted a longer input object). */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Publish the resume position only after the write succeeded. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    /* restuple is still NULL if the failure happened before the call. */
    Py_XDECREF(restuple);
    return -1;
}
4362 
4363 /* --- UTF-7 Codec -------------------------------------------------------- */
4364 
4365 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4366 
/* Three simple macros defining base-64.
   IS_BASE64 and FROM_BASE64 expand their argument several times, so the
   argument must be an expression without side effects. */

/* Is c a base-64 character?  Non-zero for 'A'-'Z', 'a'-'z', '0'-'9',
   '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
   anything else (meant to be '/') -> 63.  The argument is not validated. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off, so any integer value is acceptable. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.
 * NOTE(review): the test is only (c) <= 127, so a negative value of a
 * signed argument would also pass -- callers presumably pass an unsigned
 * value; confirm at the call sites.  The argument is expanded twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4397 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4431 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - code point to classify; only 1..127 can ever be direct
 *              (NUL and anything >= 128 always yield 0)
 *   directO  - non-zero: Set O characters (category 1) are direct
 *   directWS - non-zero: whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  The range test on c
 * comes first so that utf7_category is never indexed out of bounds.
 * c is evaluated several times and must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4443 
4444 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4445 PyUnicode_DecodeUTF7(const char *s,
4446                      Py_ssize_t size,
4447                      const char *errors)
4448 {
4449     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4450 }
4451 
4452 /* The decoder.  The only state we preserve is our read position,
4453  * i.e. how many characters we have consumed.  So if we end in the
4454  * middle of a shift sequence we have to back off the read position
4455  * and the output to the beginning of the sequence, otherwise we lose
4456  * all the shift state (seen bits, number of bits seen, high
4457  * surrogate). */
4458 
4459 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4460 PyUnicode_DecodeUTF7Stateful(const char *s,
4461                              Py_ssize_t size,
4462                              const char *errors,
4463                              Py_ssize_t *consumed)
4464 {
4465     const char *starts = s;
4466     Py_ssize_t startinpos;
4467     Py_ssize_t endinpos;
4468     const char *e;
4469     _PyUnicodeWriter writer;
4470     const char *errmsg = "";
4471     int inShift = 0;
4472     Py_ssize_t shiftOutStart;
4473     unsigned int base64bits = 0;
4474     unsigned long base64buffer = 0;
4475     Py_UCS4 surrogate = 0;
4476     PyObject *errorHandler = NULL;
4477     PyObject *exc = NULL;
4478 
4479     if (size == 0) {
4480         if (consumed)
4481             *consumed = 0;
4482         _Py_RETURN_UNICODE_EMPTY();
4483     }
4484 
4485     /* Start off assuming it's all ASCII. Widen later as necessary. */
4486     _PyUnicodeWriter_Init(&writer);
4487     writer.min_length = size;
4488 
4489     shiftOutStart = 0;
4490     e = s + size;
4491 
4492     while (s < e) {
4493         Py_UCS4 ch;
4494       restart:
4495         ch = (unsigned char) *s;
4496 
4497         if (inShift) { /* in a base-64 section */
4498             if (IS_BASE64(ch)) { /* consume a base-64 character */
4499                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4500                 base64bits += 6;
4501                 s++;
4502                 if (base64bits >= 16) {
4503                     /* we have enough bits for a UTF-16 value */
4504                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4505                     base64bits -= 16;
4506                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4507                     assert(outCh <= 0xffff);
4508                     if (surrogate) {
4509                         /* expecting a second surrogate */
4510                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4511                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4512                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4513                                 goto onError;
4514                             surrogate = 0;
4515                             continue;
4516                         }
4517                         else {
4518                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4519                                 goto onError;
4520                             surrogate = 0;
4521                         }
4522                     }
4523                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4524                         /* first surrogate */
4525                         surrogate = outCh;
4526                     }
4527                     else {
4528                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4529                             goto onError;
4530                     }
4531                 }
4532             }
4533             else { /* now leaving a base-64 section */
4534                 inShift = 0;
4535                 if (base64bits > 0) { /* left-over bits */
4536                     if (base64bits >= 6) {
4537                         /* We've seen at least one base-64 character */
4538                         s++;
4539                         errmsg = "partial character in shift sequence";
4540                         goto utf7Error;
4541                     }
4542                     else {
4543                         /* Some bits remain; they should be zero */
4544                         if (base64buffer != 0) {
4545                             s++;
4546                             errmsg = "non-zero padding bits in shift sequence";
4547                             goto utf7Error;
4548                         }
4549                     }
4550                 }
4551                 if (surrogate && DECODE_DIRECT(ch)) {
4552                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4553                         goto onError;
4554                 }
4555                 surrogate = 0;
4556                 if (ch == '-') {
4557                     /* '-' is absorbed; other terminating
4558                        characters are preserved */
4559                     s++;
4560                 }
4561             }
4562         }
4563         else if ( ch == '+' ) {
4564             startinpos = s-starts;
4565             s++; /* consume '+' */
4566             if (s < e && *s == '-') { /* '+-' encodes '+' */
4567                 s++;
4568                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4569                     goto onError;
4570             }
4571             else if (s < e && !IS_BASE64(*s)) {
4572                 s++;
4573                 errmsg = "ill-formed sequence";
4574                 goto utf7Error;
4575             }
4576             else { /* begin base64-encoded section */
4577                 inShift = 1;
4578                 surrogate = 0;
4579                 shiftOutStart = writer.pos;
4580                 base64bits = 0;
4581                 base64buffer = 0;
4582             }
4583         }
4584         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4585             s++;
4586             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4587                 goto onError;
4588         }
4589         else {
4590             startinpos = s-starts;
4591             s++;
4592             errmsg = "unexpected special character";
4593             goto utf7Error;
4594         }
4595         continue;
4596 utf7Error:
4597         endinpos = s-starts;
4598         if (unicode_decode_call_errorhandler_writer(
4599                 errors, &errorHandler,
4600                 "utf7", errmsg,
4601                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4602                 &writer))
4603             goto onError;
4604     }
4605 
4606     /* end of string */
4607 
4608     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4609         /* if we're in an inconsistent state, that's an error */
4610         inShift = 0;
4611         if (surrogate ||
4612                 (base64bits >= 6) ||
4613                 (base64bits > 0 && base64buffer != 0)) {
4614             endinpos = size;
4615             if (unicode_decode_call_errorhandler_writer(
4616                     errors, &errorHandler,
4617                     "utf7", "unterminated shift sequence",
4618                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4619                     &writer))
4620                 goto onError;
4621             if (s < e)
4622                 goto restart;
4623         }
4624     }
4625 
4626     /* return state */
4627     if (consumed) {
4628         if (inShift) {
4629             *consumed = startinpos;
4630             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4631                 PyObject *result = PyUnicode_FromKindAndData(
4632                         writer.kind, writer.data, shiftOutStart);
4633                 Py_XDECREF(errorHandler);
4634                 Py_XDECREF(exc);
4635                 _PyUnicodeWriter_Dealloc(&writer);
4636                 return result;
4637             }
4638             writer.pos = shiftOutStart; /* back off output */
4639         }
4640         else {
4641             *consumed = s-starts;
4642         }
4643     }
4644 
4645     Py_XDECREF(errorHandler);
4646     Py_XDECREF(exc);
4647     return _PyUnicodeWriter_Finish(&writer);
4648 
4649   onError:
4650     Py_XDECREF(errorHandler);
4651     Py_XDECREF(exc);
4652     _PyUnicodeWriter_Dealloc(&writer);
4653     return NULL;
4654 }
4655 
4656 
4657 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4658 _PyUnicode_EncodeUTF7(PyObject *str,
4659                       int base64SetO,
4660                       int base64WhiteSpace,
4661                       const char *errors)
4662 {
4663     int kind;
4664     void *data;
4665     Py_ssize_t len;
4666     PyObject *v;
4667     int inShift = 0;
4668     Py_ssize_t i;
4669     unsigned int base64bits = 0;
4670     unsigned long base64buffer = 0;
4671     char * out;
4672     char * start;
4673 
4674     if (PyUnicode_READY(str) == -1)
4675         return NULL;
4676     kind = PyUnicode_KIND(str);
4677     data = PyUnicode_DATA(str);
4678     len = PyUnicode_GET_LENGTH(str);
4679 
4680     if (len == 0)
4681         return PyBytes_FromStringAndSize(NULL, 0);
4682 
4683     /* It might be possible to tighten this worst case */
4684     if (len > PY_SSIZE_T_MAX / 8)
4685         return PyErr_NoMemory();
4686     v = PyBytes_FromStringAndSize(NULL, len * 8);
4687     if (v == NULL)
4688         return NULL;
4689 
4690     start = out = PyBytes_AS_STRING(v);
4691     for (i = 0; i < len; ++i) {
4692         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4693 
4694         if (inShift) {
4695             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4696                 /* shifting out */
4697                 if (base64bits) { /* output remaining bits */
4698                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4699                     base64buffer = 0;
4700                     base64bits = 0;
4701                 }
4702                 inShift = 0;
4703                 /* Characters not in the BASE64 set implicitly unshift the sequence
4704                    so no '-' is required, except if the character is itself a '-' */
4705                 if (IS_BASE64(ch) || ch == '-') {
4706                     *out++ = '-';
4707                 }
4708                 *out++ = (char) ch;
4709             }
4710             else {
4711                 goto encode_char;
4712             }
4713         }
4714         else { /* not in a shift sequence */
4715             if (ch == '+') {
4716                 *out++ = '+';
4717                         *out++ = '-';
4718             }
4719             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4720                 *out++ = (char) ch;
4721             }
4722             else {
4723                 *out++ = '+';
4724                 inShift = 1;
4725                 goto encode_char;
4726             }
4727         }
4728         continue;
4729 encode_char:
4730         if (ch >= 0x10000) {
4731             assert(ch <= MAX_UNICODE);
4732 
4733             /* code first surrogate */
4734             base64bits += 16;
4735             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4736             while (base64bits >= 6) {
4737                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4738                 base64bits -= 6;
4739             }
4740             /* prepare second surrogate */
4741             ch = Py_UNICODE_LOW_SURROGATE(ch);
4742         }
4743         base64bits += 16;
4744         base64buffer = (base64buffer << 16) | ch;
4745         while (base64bits >= 6) {
4746             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4747             base64bits -= 6;
4748         }
4749     }
4750     if (base64bits)
4751         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4752     if (inShift)
4753         *out++ = '-';
4754     if (_PyBytes_Resize(&v, out - start) < 0)
4755         return NULL;
4756     return v;
4757 }
4758 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4759 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4760                      Py_ssize_t size,
4761                      int base64SetO,
4762                      int base64WhiteSpace,
4763                      const char *errors)
4764 {
4765     PyObject *result;
4766     PyObject *tmp = PyUnicode_FromWideChar(s, size);
4767     if (tmp == NULL)
4768         return NULL;
4769     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4770                                    base64WhiteSpace, errors);
4771     Py_DECREF(tmp);
4772     return result;
4773 }
4774 
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
4780 
4781 /* --- UTF-8 Codec -------------------------------------------------------- */
4782 
4783 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4784 PyUnicode_DecodeUTF8(const char *s,
4785                      Py_ssize_t size,
4786                      const char *errors)
4787 {
4788     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4789 }
4790 
4791 #include "stringlib/asciilib.h"
4792 #include "stringlib/codecs.h"
4793 #include "stringlib/undef.h"
4794 
4795 #include "stringlib/ucs1lib.h"
4796 #include "stringlib/codecs.h"
4797 #include "stringlib/undef.h"
4798 
4799 #include "stringlib/ucs2lib.h"
4800 #include "stringlib/codecs.h"
4801 #include "stringlib/undef.h"
4802 
4803 #include "stringlib/ucs4lib.h"
4804 #include "stringlib/codecs.h"
4805 #include "stringlib/undef.h"
4806 
4807 /* Mask to quickly check whether a C 'long' contains a
4808    non-ASCII, UTF8-encoded char. */
4809 #if (SIZEOF_LONG == 8)
4810 # define ASCII_CHAR_MASK 0x8080808080808080UL
4811 #elif (SIZEOF_LONG == 4)
4812 # define ASCII_CHAR_MASK 0x80808080UL
4813 #else
4814 # error C 'long' size should be either 4 or 8!
4815 #endif
4816 
4817 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4818 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4819 {
4820     const char *p = start;
4821     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4822 
4823     /*
4824      * Issue #17237: m68k is a bit different from most architectures in
4825      * that objects do not use "natural alignment" - for example, int and
4826      * long are only aligned at 2-byte boundaries.  Therefore the assert()
4827      * won't work; also, tests have shown that skipping the "optimised
4828      * version" will even speed up m68k.
4829      */
4830 #if !defined(__m68k__)
4831 #if SIZEOF_LONG <= SIZEOF_VOID_P
4832     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4833     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4834         /* Fast path, see in STRINGLIB(utf8_decode) for
4835            an explanation. */
4836         /* Help allocation */
4837         const char *_p = p;
4838         Py_UCS1 * q = dest;
4839         while (_p < aligned_end) {
4840             unsigned long value = *(const unsigned long *) _p;
4841             if (value & ASCII_CHAR_MASK)
4842                 break;
4843             *((unsigned long *)q) = value;
4844             _p += SIZEOF_LONG;
4845             q += SIZEOF_LONG;
4846         }
4847         p = _p;
4848         while (p < end) {
4849             if ((unsigned char)*p & 0x80)
4850                 break;
4851             *q++ = *p++;
4852         }
4853         return p - start;
4854     }
4855 #endif
4856 #endif
4857     while (p < end) {
4858         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4859            for an explanation. */
4860         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4861             /* Help allocation */
4862             const char *_p = p;
4863             while (_p < aligned_end) {
4864                 unsigned long value = *(unsigned long *) _p;
4865                 if (value & ASCII_CHAR_MASK)
4866                     break;
4867                 _p += SIZEOF_LONG;
4868             }
4869             p = _p;
4870             if (_p == end)
4871                 break;
4872         }
4873         if ((unsigned char)*p & 0x80)
4874             break;
4875         ++p;
4876     }
4877     memcpy(dest, start, p - start);
4878     return p - start;
4879 }
4880 
4881 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)4882 unicode_decode_utf8(const char *s, Py_ssize_t size,
4883                     _Py_error_handler error_handler, const char *errors,
4884                     Py_ssize_t *consumed)
4885 {
4886     _PyUnicodeWriter writer;
4887     const char *starts = s;
4888     const char *end = s + size;
4889 
4890     Py_ssize_t startinpos;
4891     Py_ssize_t endinpos;
4892     const char *errmsg = "";
4893     PyObject *error_handler_obj = NULL;
4894     PyObject *exc = NULL;
4895 
4896     if (size == 0) {
4897         if (consumed)
4898             *consumed = 0;
4899         _Py_RETURN_UNICODE_EMPTY();
4900     }
4901 
4902     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4903     if (size == 1 && (unsigned char)s[0] < 128) {
4904         if (consumed)
4905             *consumed = 1;
4906         return get_latin1_char((unsigned char)s[0]);
4907     }
4908 
4909     _PyUnicodeWriter_Init(&writer);
4910     writer.min_length = size;
4911     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4912         goto onError;
4913 
4914     writer.pos = ascii_decode(s, end, writer.data);
4915     s += writer.pos;
4916     while (s < end) {
4917         Py_UCS4 ch;
4918         int kind = writer.kind;
4919 
4920         if (kind == PyUnicode_1BYTE_KIND) {
4921             if (PyUnicode_IS_ASCII(writer.buffer))
4922                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4923             else
4924                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4925         } else if (kind == PyUnicode_2BYTE_KIND) {
4926             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4927         } else {
4928             assert(kind == PyUnicode_4BYTE_KIND);
4929             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4930         }
4931 
4932         switch (ch) {
4933         case 0:
4934             if (s == end || consumed)
4935                 goto End;
4936             errmsg = "unexpected end of data";
4937             startinpos = s - starts;
4938             endinpos = end - starts;
4939             break;
4940         case 1:
4941             errmsg = "invalid start byte";
4942             startinpos = s - starts;
4943             endinpos = startinpos + 1;
4944             break;
4945         case 2:
4946             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4947                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4948             {
4949                 /* Truncated surrogate code in range D800-DFFF */
4950                 goto End;
4951             }
4952             /* fall through */
4953         case 3:
4954         case 4:
4955             errmsg = "invalid continuation byte";
4956             startinpos = s - starts;
4957             endinpos = startinpos + ch - 1;
4958             break;
4959         default:
4960             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4961                 goto onError;
4962             continue;
4963         }
4964 
4965         if (error_handler == _Py_ERROR_UNKNOWN)
4966             error_handler = _Py_GetErrorHandler(errors);
4967 
4968         switch (error_handler) {
4969         case _Py_ERROR_IGNORE:
4970             s += (endinpos - startinpos);
4971             break;
4972 
4973         case _Py_ERROR_REPLACE:
4974             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4975                 goto onError;
4976             s += (endinpos - startinpos);
4977             break;
4978 
4979         case _Py_ERROR_SURROGATEESCAPE:
4980         {
4981             Py_ssize_t i;
4982 
4983             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4984                 goto onError;
4985             for (i=startinpos; i<endinpos; i++) {
4986                 ch = (Py_UCS4)(unsigned char)(starts[i]);
4987                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4988                                 ch + 0xdc00);
4989                 writer.pos++;
4990             }
4991             s += (endinpos - startinpos);
4992             break;
4993         }
4994 
4995         default:
4996             if (unicode_decode_call_errorhandler_writer(
4997                     errors, &error_handler_obj,
4998                     "utf-8", errmsg,
4999                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5000                     &writer))
5001                 goto onError;
5002         }
5003     }
5004 
5005 End:
5006     if (consumed)
5007         *consumed = s - starts;
5008 
5009     Py_XDECREF(error_handler_obj);
5010     Py_XDECREF(exc);
5011     return _PyUnicodeWriter_Finish(&writer);
5012 
5013 onError:
5014     Py_XDECREF(error_handler_obj);
5015     Py_XDECREF(exc);
5016     _PyUnicodeWriter_Dealloc(&writer);
5017     return NULL;
5018 }
5019 
5020 
5021 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5022 PyUnicode_DecodeUTF8Stateful(const char *s,
5023                              Py_ssize_t size,
5024                              const char *errors,
5025                              Py_ssize_t *consumed)
5026 {
5027     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5028 }
5029 
5030 
5031 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5032    non-zero, use strict error handler otherwise.
5033 
5034    On success, write a pointer to a newly allocated wide character string into
5035    *wstr (use PyMem_RawFree() to free the memory) and write the output length
5036    (in number of wchar_t units) into *wlen (if wlen is set).
5037 
5038    On memory allocation failure, return -1.
5039 
5040    On decoding error (if surrogateescape is zero), return -2. If wlen is
5041    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5042    is not NULL, write the decoding error message into *reason. */
5043 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5044 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5045                  const char **reason, _Py_error_handler errors)
5046 {
5047     const char *orig_s = s;
5048     const char *e;
5049     wchar_t *unicode;
5050     Py_ssize_t outpos;
5051 
5052     int surrogateescape = 0;
5053     int surrogatepass = 0;
5054     switch (errors)
5055     {
5056     case _Py_ERROR_STRICT:
5057         break;
5058     case _Py_ERROR_SURROGATEESCAPE:
5059         surrogateescape = 1;
5060         break;
5061     case _Py_ERROR_SURROGATEPASS:
5062         surrogatepass = 1;
5063         break;
5064     default:
5065         return -3;
5066     }
5067 
5068     /* Note: size will always be longer than the resulting Unicode
5069        character count */
5070     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5071         return -1;
5072     }
5073 
5074     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5075     if (!unicode) {
5076         return -1;
5077     }
5078 
5079     /* Unpack UTF-8 encoded data */
5080     e = s + size;
5081     outpos = 0;
5082     while (s < e) {
5083         Py_UCS4 ch;
5084 #if SIZEOF_WCHAR_T == 4
5085         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5086 #else
5087         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5088 #endif
5089         if (ch > 0xFF) {
5090 #if SIZEOF_WCHAR_T == 4
5091             Py_UNREACHABLE();
5092 #else
5093             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5094             /* write a surrogate pair */
5095             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5096             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5097 #endif
5098         }
5099         else {
5100             if (!ch && s == e) {
5101                 break;
5102             }
5103 
5104             if (surrogateescape) {
5105                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5106             }
5107             else {
5108                 /* Is it a valid three-byte code? */
5109                 if (surrogatepass
5110                     && (e - s) >= 3
5111                     && (s[0] & 0xf0) == 0xe0
5112                     && (s[1] & 0xc0) == 0x80
5113                     && (s[2] & 0xc0) == 0x80)
5114                 {
5115                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5116                     s += 3;
5117                     unicode[outpos++] = ch;
5118                 }
5119                 else {
5120                     PyMem_RawFree(unicode );
5121                     if (reason != NULL) {
5122                         switch (ch) {
5123                         case 0:
5124                             *reason = "unexpected end of data";
5125                             break;
5126                         case 1:
5127                             *reason = "invalid start byte";
5128                             break;
5129                         /* 2, 3, 4 */
5130                         default:
5131                             *reason = "invalid continuation byte";
5132                             break;
5133                         }
5134                     }
5135                     if (wlen != NULL) {
5136                         *wlen = s - orig_s;
5137                     }
5138                     return -2;
5139                 }
5140             }
5141         }
5142     }
5143     unicode[outpos] = L'\0';
5144     if (wlen) {
5145         *wlen = outpos;
5146     }
5147     *wstr = unicode;
5148     return 0;
5149 }
5150 
5151 
5152 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5153 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5154                                size_t *wlen)
5155 {
5156     wchar_t *wstr;
5157     int res = _Py_DecodeUTF8Ex(arg, arglen,
5158                                &wstr, wlen,
5159                                NULL, _Py_ERROR_SURROGATEESCAPE);
5160     if (res != 0) {
5161         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5162         assert(res != -3);
5163         if (wlen) {
5164             *wlen = (size_t)res;
5165         }
5166         return NULL;
5167     }
5168     return wstr;
5169 }
5170 
5171 
5172 /* UTF-8 encoder using the surrogateescape error handler .
5173 
5174    On success, return 0 and write the newly allocated character string (use
5175    PyMem_Free() to free the memory) into *str.
5176 
5177    On encoding failure, return -2 and write the position of the invalid
5178    surrogate character into *error_pos (if error_pos is set) and the decoding
5179    error message into *reason (if reason is set).
5180 
5181    On memory allocation failure, return -1. */
5182 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5183 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5184                  const char **reason, int raw_malloc, _Py_error_handler errors)
5185 {
5186     const Py_ssize_t max_char_size = 4;
5187     Py_ssize_t len = wcslen(text);
5188 
5189     assert(len >= 0);
5190 
5191     int surrogateescape = 0;
5192     int surrogatepass = 0;
5193     switch (errors)
5194     {
5195     case _Py_ERROR_STRICT:
5196         break;
5197     case _Py_ERROR_SURROGATEESCAPE:
5198         surrogateescape = 1;
5199         break;
5200     case _Py_ERROR_SURROGATEPASS:
5201         surrogatepass = 1;
5202         break;
5203     default:
5204         return -3;
5205     }
5206 
5207     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5208         return -1;
5209     }
5210     char *bytes;
5211     if (raw_malloc) {
5212         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5213     }
5214     else {
5215         bytes = PyMem_Malloc((len + 1) * max_char_size);
5216     }
5217     if (bytes == NULL) {
5218         return -1;
5219     }
5220 
5221     char *p = bytes;
5222     Py_ssize_t i;
5223     for (i = 0; i < len; ) {
5224         Py_ssize_t ch_pos = i;
5225         Py_UCS4 ch = text[i];
5226         i++;
5227 #if Py_UNICODE_SIZE == 2
5228         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5229             && i < len
5230             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5231         {
5232             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5233             i++;
5234         }
5235 #endif
5236 
5237         if (ch < 0x80) {
5238             /* Encode ASCII */
5239             *p++ = (char) ch;
5240 
5241         }
5242         else if (ch < 0x0800) {
5243             /* Encode Latin-1 */
5244             *p++ = (char)(0xc0 | (ch >> 6));
5245             *p++ = (char)(0x80 | (ch & 0x3f));
5246         }
5247         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5248             /* surrogateescape error handler */
5249             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5250                 if (error_pos != NULL) {
5251                     *error_pos = (size_t)ch_pos;
5252                 }
5253                 if (reason != NULL) {
5254                     *reason = "encoding error";
5255                 }
5256                 if (raw_malloc) {
5257                     PyMem_RawFree(bytes);
5258                 }
5259                 else {
5260                     PyMem_Free(bytes);
5261                 }
5262                 return -2;
5263             }
5264             *p++ = (char)(ch & 0xff);
5265         }
5266         else if (ch < 0x10000) {
5267             *p++ = (char)(0xe0 | (ch >> 12));
5268             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5269             *p++ = (char)(0x80 | (ch & 0x3f));
5270         }
5271         else {  /* ch >= 0x10000 */
5272             assert(ch <= MAX_UNICODE);
5273             /* Encode UCS4 Unicode ordinals */
5274             *p++ = (char)(0xf0 | (ch >> 18));
5275             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5276             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5277             *p++ = (char)(0x80 | (ch & 0x3f));
5278         }
5279     }
5280     *p++ = '\0';
5281 
5282     size_t final_size = (p - bytes);
5283     char *bytes2;
5284     if (raw_malloc) {
5285         bytes2 = PyMem_RawRealloc(bytes, final_size);
5286     }
5287     else {
5288         bytes2 = PyMem_Realloc(bytes, final_size);
5289     }
5290     if (bytes2 == NULL) {
5291         if (error_pos != NULL) {
5292             *error_pos = (size_t)-1;
5293         }
5294         if (raw_malloc) {
5295             PyMem_RawFree(bytes);
5296         }
5297         else {
5298             PyMem_Free(bytes);
5299         }
5300         return -1;
5301     }
5302     *str = bytes2;
5303     return 0;
5304 }
5305 
5306 
5307 /* Primary internal function which creates utf8 encoded bytes objects.
5308 
5309    Allocation strategy:  if the string is short, convert into a stack buffer
5310    and allocate exactly as much space needed at the end.  Else allocate the
5311    maximum possible needed (4 result bytes per Unicode character), and return
5312    the excess memory at the end.
5313 */
5314 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5315 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5316                     const char *errors)
5317 {
5318     enum PyUnicode_Kind kind;
5319     void *data;
5320     Py_ssize_t size;
5321 
5322     if (!PyUnicode_Check(unicode)) {
5323         PyErr_BadArgument();
5324         return NULL;
5325     }
5326 
5327     if (PyUnicode_READY(unicode) == -1)
5328         return NULL;
5329 
5330     if (PyUnicode_UTF8(unicode))
5331         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5332                                          PyUnicode_UTF8_LENGTH(unicode));
5333 
5334     kind = PyUnicode_KIND(unicode);
5335     data = PyUnicode_DATA(unicode);
5336     size = PyUnicode_GET_LENGTH(unicode);
5337 
5338     switch (kind) {
5339     default:
5340         Py_UNREACHABLE();
5341     case PyUnicode_1BYTE_KIND:
5342         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5343         assert(!PyUnicode_IS_ASCII(unicode));
5344         return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5345     case PyUnicode_2BYTE_KIND:
5346         return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5347     case PyUnicode_4BYTE_KIND:
5348         return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5349     }
5350 }
5351 
5352 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5353 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5354 {
5355     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5356 }
5357 
5358 
5359 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5360 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5361                      Py_ssize_t size,
5362                      const char *errors)
5363 {
5364     PyObject *v, *unicode;
5365 
5366     unicode = PyUnicode_FromWideChar(s, size);
5367     if (unicode == NULL)
5368         return NULL;
5369     v = _PyUnicode_AsUTF8String(unicode, errors);
5370     Py_DECREF(unicode);
5371     return v;
5372 }
5373 
5374 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5375 PyUnicode_AsUTF8String(PyObject *unicode)
5376 {
5377     return _PyUnicode_AsUTF8String(unicode, NULL);
5378 }
5379 
5380 /* --- UTF-32 Codec ------------------------------------------------------- */
5381 
5382 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5383 PyUnicode_DecodeUTF32(const char *s,
5384                       Py_ssize_t size,
5385                       const char *errors,
5386                       int *byteorder)
5387 {
5388     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5389 }
5390 
5391 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5392 PyUnicode_DecodeUTF32Stateful(const char *s,
5393                               Py_ssize_t size,
5394                               const char *errors,
5395                               int *byteorder,
5396                               Py_ssize_t *consumed)
5397 {
5398     const char *starts = s;
5399     Py_ssize_t startinpos;
5400     Py_ssize_t endinpos;
5401     _PyUnicodeWriter writer;
5402     const unsigned char *q, *e;
5403     int le, bo = 0;       /* assume native ordering by default */
5404     const char *encoding;
5405     const char *errmsg = "";
5406     PyObject *errorHandler = NULL;
5407     PyObject *exc = NULL;
5408 
5409     q = (unsigned char *)s;
5410     e = q + size;
5411 
5412     if (byteorder)
5413         bo = *byteorder;
5414 
5415     /* Check for BOM marks (U+FEFF) in the input and adjust current
5416        byte order setting accordingly. In native mode, the leading BOM
5417        mark is skipped, in all other modes, it is copied to the output
5418        stream as-is (giving a ZWNBSP character). */
5419     if (bo == 0 && size >= 4) {
5420         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5421         if (bom == 0x0000FEFF) {
5422             bo = -1;
5423             q += 4;
5424         }
5425         else if (bom == 0xFFFE0000) {
5426             bo = 1;
5427             q += 4;
5428         }
5429         if (byteorder)
5430             *byteorder = bo;
5431     }
5432 
5433     if (q == e) {
5434         if (consumed)
5435             *consumed = size;
5436         _Py_RETURN_UNICODE_EMPTY();
5437     }
5438 
5439 #ifdef WORDS_BIGENDIAN
5440     le = bo < 0;
5441 #else
5442     le = bo <= 0;
5443 #endif
5444     encoding = le ? "utf-32-le" : "utf-32-be";
5445 
5446     _PyUnicodeWriter_Init(&writer);
5447     writer.min_length = (e - q + 3) / 4;
5448     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5449         goto onError;
5450 
5451     while (1) {
5452         Py_UCS4 ch = 0;
5453         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5454 
5455         if (e - q >= 4) {
5456             enum PyUnicode_Kind kind = writer.kind;
5457             void *data = writer.data;
5458             const unsigned char *last = e - 4;
5459             Py_ssize_t pos = writer.pos;
5460             if (le) {
5461                 do {
5462                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5463                     if (ch > maxch)
5464                         break;
5465                     if (kind != PyUnicode_1BYTE_KIND &&
5466                         Py_UNICODE_IS_SURROGATE(ch))
5467                         break;
5468                     PyUnicode_WRITE(kind, data, pos++, ch);
5469                     q += 4;
5470                 } while (q <= last);
5471             }
5472             else {
5473                 do {
5474                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5475                     if (ch > maxch)
5476                         break;
5477                     if (kind != PyUnicode_1BYTE_KIND &&
5478                         Py_UNICODE_IS_SURROGATE(ch))
5479                         break;
5480                     PyUnicode_WRITE(kind, data, pos++, ch);
5481                     q += 4;
5482                 } while (q <= last);
5483             }
5484             writer.pos = pos;
5485         }
5486 
5487         if (Py_UNICODE_IS_SURROGATE(ch)) {
5488             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5489             startinpos = ((const char *)q) - starts;
5490             endinpos = startinpos + 4;
5491         }
5492         else if (ch <= maxch) {
5493             if (q == e || consumed)
5494                 break;
5495             /* remaining bytes at the end? (size should be divisible by 4) */
5496             errmsg = "truncated data";
5497             startinpos = ((const char *)q) - starts;
5498             endinpos = ((const char *)e) - starts;
5499         }
5500         else {
5501             if (ch < 0x110000) {
5502                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5503                     goto onError;
5504                 q += 4;
5505                 continue;
5506             }
5507             errmsg = "code point not in range(0x110000)";
5508             startinpos = ((const char *)q) - starts;
5509             endinpos = startinpos + 4;
5510         }
5511 
5512         /* The remaining input chars are ignored if the callback
5513            chooses to skip the input */
5514         if (unicode_decode_call_errorhandler_writer(
5515                 errors, &errorHandler,
5516                 encoding, errmsg,
5517                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5518                 &writer))
5519             goto onError;
5520     }
5521 
5522     if (consumed)
5523         *consumed = (const char *)q-starts;
5524 
5525     Py_XDECREF(errorHandler);
5526     Py_XDECREF(exc);
5527     return _PyUnicodeWriter_Finish(&writer);
5528 
5529   onError:
5530     _PyUnicodeWriter_Dealloc(&writer);
5531     Py_XDECREF(errorHandler);
5532     Py_XDECREF(exc);
5533     return NULL;
5534 }
5535 
5536 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5537 _PyUnicode_EncodeUTF32(PyObject *str,
5538                        const char *errors,
5539                        int byteorder)
5540 {
5541     enum PyUnicode_Kind kind;
5542     const void *data;
5543     Py_ssize_t len;
5544     PyObject *v;
5545     uint32_t *out;
5546 #if PY_LITTLE_ENDIAN
5547     int native_ordering = byteorder <= 0;
5548 #else
5549     int native_ordering = byteorder >= 0;
5550 #endif
5551     const char *encoding;
5552     Py_ssize_t nsize, pos;
5553     PyObject *errorHandler = NULL;
5554     PyObject *exc = NULL;
5555     PyObject *rep = NULL;
5556 
5557     if (!PyUnicode_Check(str)) {
5558         PyErr_BadArgument();
5559         return NULL;
5560     }
5561     if (PyUnicode_READY(str) == -1)
5562         return NULL;
5563     kind = PyUnicode_KIND(str);
5564     data = PyUnicode_DATA(str);
5565     len = PyUnicode_GET_LENGTH(str);
5566 
5567     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5568         return PyErr_NoMemory();
5569     nsize = len + (byteorder == 0);
5570     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5571     if (v == NULL)
5572         return NULL;
5573 
5574     /* output buffer is 4-bytes aligned */
5575     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5576     out = (uint32_t *)PyBytes_AS_STRING(v);
5577     if (byteorder == 0)
5578         *out++ = 0xFEFF;
5579     if (len == 0)
5580         goto done;
5581 
5582     if (byteorder == -1)
5583         encoding = "utf-32-le";
5584     else if (byteorder == 1)
5585         encoding = "utf-32-be";
5586     else
5587         encoding = "utf-32";
5588 
5589     if (kind == PyUnicode_1BYTE_KIND) {
5590         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5591         goto done;
5592     }
5593 
5594     pos = 0;
5595     while (pos < len) {
5596         Py_ssize_t repsize, moreunits;
5597 
5598         if (kind == PyUnicode_2BYTE_KIND) {
5599             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5600                                         &out, native_ordering);
5601         }
5602         else {
5603             assert(kind == PyUnicode_4BYTE_KIND);
5604             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5605                                         &out, native_ordering);
5606         }
5607         if (pos == len)
5608             break;
5609 
5610         rep = unicode_encode_call_errorhandler(
5611                 errors, &errorHandler,
5612                 encoding, "surrogates not allowed",
5613                 str, &exc, pos, pos + 1, &pos);
5614         if (!rep)
5615             goto error;
5616 
5617         if (PyBytes_Check(rep)) {
5618             repsize = PyBytes_GET_SIZE(rep);
5619             if (repsize & 3) {
5620                 raise_encode_exception(&exc, encoding,
5621                                        str, pos - 1, pos,
5622                                        "surrogates not allowed");
5623                 goto error;
5624             }
5625             moreunits = repsize / 4;
5626         }
5627         else {
5628             assert(PyUnicode_Check(rep));
5629             if (PyUnicode_READY(rep) < 0)
5630                 goto error;
5631             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5632             if (!PyUnicode_IS_ASCII(rep)) {
5633                 raise_encode_exception(&exc, encoding,
5634                                        str, pos - 1, pos,
5635                                        "surrogates not allowed");
5636                 goto error;
5637             }
5638         }
5639 
5640         /* four bytes are reserved for each surrogate */
5641         if (moreunits > 1) {
5642             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5643             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5644                 /* integer overflow */
5645                 PyErr_NoMemory();
5646                 goto error;
5647             }
5648             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5649                 goto error;
5650             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5651         }
5652 
5653         if (PyBytes_Check(rep)) {
5654             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5655             out += moreunits;
5656         } else /* rep is unicode */ {
5657             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5658             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5659                                  &out, native_ordering);
5660         }
5661 
5662         Py_CLEAR(rep);
5663     }
5664 
5665     /* Cut back to size actually needed. This is necessary for, for example,
5666        encoding of a string containing isolated surrogates and the 'ignore'
5667        handler is used. */
5668     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5669     if (nsize != PyBytes_GET_SIZE(v))
5670       _PyBytes_Resize(&v, nsize);
5671     Py_XDECREF(errorHandler);
5672     Py_XDECREF(exc);
5673   done:
5674     return v;
5675   error:
5676     Py_XDECREF(rep);
5677     Py_XDECREF(errorHandler);
5678     Py_XDECREF(exc);
5679     Py_XDECREF(v);
5680     return NULL;
5681 }
5682 
5683 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5684 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5685                       Py_ssize_t size,
5686                       const char *errors,
5687                       int byteorder)
5688 {
5689     PyObject *result;
5690     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5691     if (tmp == NULL)
5692         return NULL;
5693     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5694     Py_DECREF(tmp);
5695     return result;
5696 }
5697 
5698 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5699 PyUnicode_AsUTF32String(PyObject *unicode)
5700 {
5701     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5702 }
5703 
5704 /* --- UTF-16 Codec ------------------------------------------------------- */
5705 
5706 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5707 PyUnicode_DecodeUTF16(const char *s,
5708                       Py_ssize_t size,
5709                       const char *errors,
5710                       int *byteorder)
5711 {
5712     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5713 }
5714 
5715 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5716 PyUnicode_DecodeUTF16Stateful(const char *s,
5717                               Py_ssize_t size,
5718                               const char *errors,
5719                               int *byteorder,
5720                               Py_ssize_t *consumed)
5721 {
5722     const char *starts = s;
5723     Py_ssize_t startinpos;
5724     Py_ssize_t endinpos;
5725     _PyUnicodeWriter writer;
5726     const unsigned char *q, *e;
5727     int bo = 0;       /* assume native ordering by default */
5728     int native_ordering;
5729     const char *errmsg = "";
5730     PyObject *errorHandler = NULL;
5731     PyObject *exc = NULL;
5732     const char *encoding;
5733 
5734     q = (unsigned char *)s;
5735     e = q + size;
5736 
5737     if (byteorder)
5738         bo = *byteorder;
5739 
5740     /* Check for BOM marks (U+FEFF) in the input and adjust current
5741        byte order setting accordingly. In native mode, the leading BOM
5742        mark is skipped, in all other modes, it is copied to the output
5743        stream as-is (giving a ZWNBSP character). */
5744     if (bo == 0 && size >= 2) {
5745         const Py_UCS4 bom = (q[1] << 8) | q[0];
5746         if (bom == 0xFEFF) {
5747             q += 2;
5748             bo = -1;
5749         }
5750         else if (bom == 0xFFFE) {
5751             q += 2;
5752             bo = 1;
5753         }
5754         if (byteorder)
5755             *byteorder = bo;
5756     }
5757 
5758     if (q == e) {
5759         if (consumed)
5760             *consumed = size;
5761         _Py_RETURN_UNICODE_EMPTY();
5762     }
5763 
5764 #if PY_LITTLE_ENDIAN
5765     native_ordering = bo <= 0;
5766     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5767 #else
5768     native_ordering = bo >= 0;
5769     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5770 #endif
5771 
5772     /* Note: size will always be longer than the resulting Unicode
5773        character count normally.  Error handler will take care of
5774        resizing when needed. */
5775     _PyUnicodeWriter_Init(&writer);
5776     writer.min_length = (e - q + 1) / 2;
5777     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5778         goto onError;
5779 
5780     while (1) {
5781         Py_UCS4 ch = 0;
5782         if (e - q >= 2) {
5783             int kind = writer.kind;
5784             if (kind == PyUnicode_1BYTE_KIND) {
5785                 if (PyUnicode_IS_ASCII(writer.buffer))
5786                     ch = asciilib_utf16_decode(&q, e,
5787                             (Py_UCS1*)writer.data, &writer.pos,
5788                             native_ordering);
5789                 else
5790                     ch = ucs1lib_utf16_decode(&q, e,
5791                             (Py_UCS1*)writer.data, &writer.pos,
5792                             native_ordering);
5793             } else if (kind == PyUnicode_2BYTE_KIND) {
5794                 ch = ucs2lib_utf16_decode(&q, e,
5795                         (Py_UCS2*)writer.data, &writer.pos,
5796                         native_ordering);
5797             } else {
5798                 assert(kind == PyUnicode_4BYTE_KIND);
5799                 ch = ucs4lib_utf16_decode(&q, e,
5800                         (Py_UCS4*)writer.data, &writer.pos,
5801                         native_ordering);
5802             }
5803         }
5804 
5805         switch (ch)
5806         {
5807         case 0:
5808             /* remaining byte at the end? (size should be even) */
5809             if (q == e || consumed)
5810                 goto End;
5811             errmsg = "truncated data";
5812             startinpos = ((const char *)q) - starts;
5813             endinpos = ((const char *)e) - starts;
5814             break;
5815             /* The remaining input chars are ignored if the callback
5816                chooses to skip the input */
5817         case 1:
5818             q -= 2;
5819             if (consumed)
5820                 goto End;
5821             errmsg = "unexpected end of data";
5822             startinpos = ((const char *)q) - starts;
5823             endinpos = ((const char *)e) - starts;
5824             break;
5825         case 2:
5826             errmsg = "illegal encoding";
5827             startinpos = ((const char *)q) - 2 - starts;
5828             endinpos = startinpos + 2;
5829             break;
5830         case 3:
5831             errmsg = "illegal UTF-16 surrogate";
5832             startinpos = ((const char *)q) - 4 - starts;
5833             endinpos = startinpos + 2;
5834             break;
5835         default:
5836             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5837                 goto onError;
5838             continue;
5839         }
5840 
5841         if (unicode_decode_call_errorhandler_writer(
5842                 errors,
5843                 &errorHandler,
5844                 encoding, errmsg,
5845                 &starts,
5846                 (const char **)&e,
5847                 &startinpos,
5848                 &endinpos,
5849                 &exc,
5850                 (const char **)&q,
5851                 &writer))
5852             goto onError;
5853     }
5854 
5855 End:
5856     if (consumed)
5857         *consumed = (const char *)q-starts;
5858 
5859     Py_XDECREF(errorHandler);
5860     Py_XDECREF(exc);
5861     return _PyUnicodeWriter_Finish(&writer);
5862 
5863   onError:
5864     _PyUnicodeWriter_Dealloc(&writer);
5865     Py_XDECREF(errorHandler);
5866     Py_XDECREF(exc);
5867     return NULL;
5868 }
5869 
5870 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5871 _PyUnicode_EncodeUTF16(PyObject *str,
5872                        const char *errors,
5873                        int byteorder)
5874 {
5875     enum PyUnicode_Kind kind;
5876     const void *data;
5877     Py_ssize_t len;
5878     PyObject *v;
5879     unsigned short *out;
5880     Py_ssize_t pairs;
5881 #if PY_BIG_ENDIAN
5882     int native_ordering = byteorder >= 0;
5883 #else
5884     int native_ordering = byteorder <= 0;
5885 #endif
5886     const char *encoding;
5887     Py_ssize_t nsize, pos;
5888     PyObject *errorHandler = NULL;
5889     PyObject *exc = NULL;
5890     PyObject *rep = NULL;
5891 
5892     if (!PyUnicode_Check(str)) {
5893         PyErr_BadArgument();
5894         return NULL;
5895     }
5896     if (PyUnicode_READY(str) == -1)
5897         return NULL;
5898     kind = PyUnicode_KIND(str);
5899     data = PyUnicode_DATA(str);
5900     len = PyUnicode_GET_LENGTH(str);
5901 
5902     pairs = 0;
5903     if (kind == PyUnicode_4BYTE_KIND) {
5904         const Py_UCS4 *in = (const Py_UCS4 *)data;
5905         const Py_UCS4 *end = in + len;
5906         while (in < end) {
5907             if (*in++ >= 0x10000) {
5908                 pairs++;
5909             }
5910         }
5911     }
5912     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5913         return PyErr_NoMemory();
5914     }
5915     nsize = len + pairs + (byteorder == 0);
5916     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5917     if (v == NULL) {
5918         return NULL;
5919     }
5920 
5921     /* output buffer is 2-bytes aligned */
5922     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5923     out = (unsigned short *)PyBytes_AS_STRING(v);
5924     if (byteorder == 0) {
5925         *out++ = 0xFEFF;
5926     }
5927     if (len == 0) {
5928         goto done;
5929     }
5930 
5931     if (kind == PyUnicode_1BYTE_KIND) {
5932         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5933         goto done;
5934     }
5935 
5936     if (byteorder < 0) {
5937         encoding = "utf-16-le";
5938     }
5939     else if (byteorder > 0) {
5940         encoding = "utf-16-be";
5941     }
5942     else {
5943         encoding = "utf-16";
5944     }
5945 
5946     pos = 0;
5947     while (pos < len) {
5948         Py_ssize_t repsize, moreunits;
5949 
5950         if (kind == PyUnicode_2BYTE_KIND) {
5951             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5952                                         &out, native_ordering);
5953         }
5954         else {
5955             assert(kind == PyUnicode_4BYTE_KIND);
5956             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5957                                         &out, native_ordering);
5958         }
5959         if (pos == len)
5960             break;
5961 
5962         rep = unicode_encode_call_errorhandler(
5963                 errors, &errorHandler,
5964                 encoding, "surrogates not allowed",
5965                 str, &exc, pos, pos + 1, &pos);
5966         if (!rep)
5967             goto error;
5968 
5969         if (PyBytes_Check(rep)) {
5970             repsize = PyBytes_GET_SIZE(rep);
5971             if (repsize & 1) {
5972                 raise_encode_exception(&exc, encoding,
5973                                        str, pos - 1, pos,
5974                                        "surrogates not allowed");
5975                 goto error;
5976             }
5977             moreunits = repsize / 2;
5978         }
5979         else {
5980             assert(PyUnicode_Check(rep));
5981             if (PyUnicode_READY(rep) < 0)
5982                 goto error;
5983             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5984             if (!PyUnicode_IS_ASCII(rep)) {
5985                 raise_encode_exception(&exc, encoding,
5986                                        str, pos - 1, pos,
5987                                        "surrogates not allowed");
5988                 goto error;
5989             }
5990         }
5991 
5992         /* two bytes are reserved for each surrogate */
5993         if (moreunits > 1) {
5994             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5995             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5996                 /* integer overflow */
5997                 PyErr_NoMemory();
5998                 goto error;
5999             }
6000             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6001                 goto error;
6002             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6003         }
6004 
6005         if (PyBytes_Check(rep)) {
6006             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6007             out += moreunits;
6008         } else /* rep is unicode */ {
6009             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6010             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6011                                  &out, native_ordering);
6012         }
6013 
6014         Py_CLEAR(rep);
6015     }
6016 
6017     /* Cut back to size actually needed. This is necessary for, for example,
6018     encoding of a string containing isolated surrogates and the 'ignore' handler
6019     is used. */
6020     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6021     if (nsize != PyBytes_GET_SIZE(v))
6022       _PyBytes_Resize(&v, nsize);
6023     Py_XDECREF(errorHandler);
6024     Py_XDECREF(exc);
6025   done:
6026     return v;
6027   error:
6028     Py_XDECREF(rep);
6029     Py_XDECREF(errorHandler);
6030     Py_XDECREF(exc);
6031     Py_XDECREF(v);
6032     return NULL;
6033 #undef STORECHAR
6034 }
6035 
6036 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6037 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6038                       Py_ssize_t size,
6039                       const char *errors,
6040                       int byteorder)
6041 {
6042     PyObject *result;
6043     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6044     if (tmp == NULL)
6045         return NULL;
6046     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6047     Py_DECREF(tmp);
6048     return result;
6049 }
6050 
6051 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6052 PyUnicode_AsUTF16String(PyObject *unicode)
6053 {
6054     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6055 }
6056 
6057 /* --- Unicode Escape Codec ----------------------------------------------- */
6058 
/* File-local pointer to the Unicode name C API; starts out NULL.
   NOTE(review): presumably filled in lazily the first time a \N{...}
   escape has to be resolved -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6060 
6061 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)6062 _PyUnicode_DecodeUnicodeEscape(const char *s,
6063                                Py_ssize_t size,
6064                                const char *errors,
6065                                const char **first_invalid_escape)
6066 {
6067     const char *starts = s;
6068     _PyUnicodeWriter writer;
6069     const char *end;
6070     PyObject *errorHandler = NULL;
6071     PyObject *exc = NULL;
6072 
6073     // so we can remember if we've seen an invalid escape char or not
6074     *first_invalid_escape = NULL;
6075 
6076     if (size == 0) {
6077         _Py_RETURN_UNICODE_EMPTY();
6078     }
6079     /* Escaped strings will always be longer than the resulting
6080        Unicode string, so we start with size here and then reduce the
6081        length after conversion to the true value.
6082        (but if the error callback returns a long replacement string
6083        we'll have to allocate more space) */
6084     _PyUnicodeWriter_Init(&writer);
6085     writer.min_length = size;
6086     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6087         goto onError;
6088     }
6089 
6090     end = s + size;
6091     while (s < end) {
6092         unsigned char c = (unsigned char) *s++;
6093         Py_UCS4 ch;
6094         int count;
6095         Py_ssize_t startinpos;
6096         Py_ssize_t endinpos;
6097         const char *message;
6098 
6099 #define WRITE_ASCII_CHAR(ch)                                                  \
6100             do {                                                              \
6101                 assert(ch <= 127);                                            \
6102                 assert(writer.pos < writer.size);                             \
6103                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6104             } while(0)
6105 
6106 #define WRITE_CHAR(ch)                                                        \
6107             do {                                                              \
6108                 if (ch <= writer.maxchar) {                                   \
6109                     assert(writer.pos < writer.size);                         \
6110                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6111                 }                                                             \
6112                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6113                     goto onError;                                             \
6114                 }                                                             \
6115             } while(0)
6116 
6117         /* Non-escape characters are interpreted as Unicode ordinals */
6118         if (c != '\\') {
6119             WRITE_CHAR(c);
6120             continue;
6121         }
6122 
6123         startinpos = s - starts - 1;
6124         /* \ - Escapes */
6125         if (s >= end) {
6126             message = "\\ at end of string";
6127             goto error;
6128         }
6129         c = (unsigned char) *s++;
6130 
6131         assert(writer.pos < writer.size);
6132         switch (c) {
6133 
6134             /* \x escapes */
6135         case '\n': continue;
6136         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6137         case '\'': WRITE_ASCII_CHAR('\''); continue;
6138         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6139         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6140         /* FF */
6141         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6142         case 't': WRITE_ASCII_CHAR('\t'); continue;
6143         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6144         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6145         /* VT */
6146         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6147         /* BEL, not classic C */
6148         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6149 
6150             /* \OOO (octal) escapes */
6151         case '0': case '1': case '2': case '3':
6152         case '4': case '5': case '6': case '7':
6153             ch = c - '0';
6154             if (s < end && '0' <= *s && *s <= '7') {
6155                 ch = (ch<<3) + *s++ - '0';
6156                 if (s < end && '0' <= *s && *s <= '7') {
6157                     ch = (ch<<3) + *s++ - '0';
6158                 }
6159             }
6160             WRITE_CHAR(ch);
6161             continue;
6162 
6163             /* hex escapes */
6164             /* \xXX */
6165         case 'x':
6166             count = 2;
6167             message = "truncated \\xXX escape";
6168             goto hexescape;
6169 
6170             /* \uXXXX */
6171         case 'u':
6172             count = 4;
6173             message = "truncated \\uXXXX escape";
6174             goto hexescape;
6175 
6176             /* \UXXXXXXXX */
6177         case 'U':
6178             count = 8;
6179             message = "truncated \\UXXXXXXXX escape";
6180         hexescape:
6181             for (ch = 0; count && s < end; ++s, --count) {
6182                 c = (unsigned char)*s;
6183                 ch <<= 4;
6184                 if (c >= '0' && c <= '9') {
6185                     ch += c - '0';
6186                 }
6187                 else if (c >= 'a' && c <= 'f') {
6188                     ch += c - ('a' - 10);
6189                 }
6190                 else if (c >= 'A' && c <= 'F') {
6191                     ch += c - ('A' - 10);
6192                 }
6193                 else {
6194                     break;
6195                 }
6196             }
6197             if (count) {
6198                 goto error;
6199             }
6200 
6201             /* when we get here, ch is a 32-bit unicode character */
6202             if (ch > MAX_UNICODE) {
6203                 message = "illegal Unicode character";
6204                 goto error;
6205             }
6206 
6207             WRITE_CHAR(ch);
6208             continue;
6209 
6210             /* \N{name} */
6211         case 'N':
6212             if (ucnhash_CAPI == NULL) {
6213                 /* load the unicode data module */
6214                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6215                                                 PyUnicodeData_CAPSULE_NAME, 1);
6216                 if (ucnhash_CAPI == NULL) {
6217                     PyErr_SetString(
6218                         PyExc_UnicodeError,
6219                         "\\N escapes not supported (can't load unicodedata module)"
6220                         );
6221                     goto onError;
6222                 }
6223             }
6224 
6225             message = "malformed \\N character escape";
6226             if (s < end && *s == '{') {
6227                 const char *start = ++s;
6228                 size_t namelen;
6229                 /* look for the closing brace */
6230                 while (s < end && *s != '}')
6231                     s++;
6232                 namelen = s - start;
6233                 if (namelen && s < end) {
6234                     /* found a name.  look it up in the unicode database */
6235                     s++;
6236                     ch = 0xffffffff; /* in case 'getcode' messes up */
6237                     if (namelen <= INT_MAX &&
6238                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6239                                               &ch, 0)) {
6240                         assert(ch <= MAX_UNICODE);
6241                         WRITE_CHAR(ch);
6242                         continue;
6243                     }
6244                     message = "unknown Unicode character name";
6245                 }
6246             }
6247             goto error;
6248 
6249         default:
6250             if (*first_invalid_escape == NULL) {
6251                 *first_invalid_escape = s-1; /* Back up one char, since we've
6252                                                 already incremented s. */
6253             }
6254             WRITE_ASCII_CHAR('\\');
6255             WRITE_CHAR(c);
6256             continue;
6257         }
6258 
6259       error:
6260         endinpos = s-starts;
6261         writer.min_length = end - s + writer.pos;
6262         if (unicode_decode_call_errorhandler_writer(
6263                 errors, &errorHandler,
6264                 "unicodeescape", message,
6265                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6266                 &writer)) {
6267             goto onError;
6268         }
6269         assert(end - s <= writer.size - writer.pos);
6270 
6271 #undef WRITE_ASCII_CHAR
6272 #undef WRITE_CHAR
6273     }
6274 
6275     Py_XDECREF(errorHandler);
6276     Py_XDECREF(exc);
6277     return _PyUnicodeWriter_Finish(&writer);
6278 
6279   onError:
6280     _PyUnicodeWriter_Dealloc(&writer);
6281     Py_XDECREF(errorHandler);
6282     Py_XDECREF(exc);
6283     return NULL;
6284 }
6285 
6286 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6287 PyUnicode_DecodeUnicodeEscape(const char *s,
6288                               Py_ssize_t size,
6289                               const char *errors)
6290 {
6291     const char *first_invalid_escape;
6292     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6293                                                       &first_invalid_escape);
6294     if (result == NULL)
6295         return NULL;
6296     if (first_invalid_escape != NULL) {
6297         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6298                              "invalid escape sequence '\\%c'",
6299                              (unsigned char)*first_invalid_escape) < 0) {
6300             Py_DECREF(result);
6301             return NULL;
6302         }
6303     }
6304     return result;
6305 }
6306 
6307 /* Return a Unicode-Escape string version of the Unicode object. */
6308 
6309 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6310 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6311 {
6312     Py_ssize_t i, len;
6313     PyObject *repr;
6314     char *p;
6315     enum PyUnicode_Kind kind;
6316     void *data;
6317     Py_ssize_t expandsize;
6318 
6319     /* Initial allocation is based on the longest-possible character
6320        escape.
6321 
6322        For UCS1 strings it's '\xxx', 4 bytes per source character.
6323        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6324        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6325     */
6326 
6327     if (!PyUnicode_Check(unicode)) {
6328         PyErr_BadArgument();
6329         return NULL;
6330     }
6331     if (PyUnicode_READY(unicode) == -1) {
6332         return NULL;
6333     }
6334 
6335     len = PyUnicode_GET_LENGTH(unicode);
6336     if (len == 0) {
6337         return PyBytes_FromStringAndSize(NULL, 0);
6338     }
6339 
6340     kind = PyUnicode_KIND(unicode);
6341     data = PyUnicode_DATA(unicode);
6342     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6343        bytes, and 1 byte characters 4. */
6344     expandsize = kind * 2 + 2;
6345     if (len > PY_SSIZE_T_MAX / expandsize) {
6346         return PyErr_NoMemory();
6347     }
6348     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6349     if (repr == NULL) {
6350         return NULL;
6351     }
6352 
6353     p = PyBytes_AS_STRING(repr);
6354     for (i = 0; i < len; i++) {
6355         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6356 
6357         /* U+0000-U+00ff range */
6358         if (ch < 0x100) {
6359             if (ch >= ' ' && ch < 127) {
6360                 if (ch != '\\') {
6361                     /* Copy printable US ASCII as-is */
6362                     *p++ = (char) ch;
6363                 }
6364                 /* Escape backslashes */
6365                 else {
6366                     *p++ = '\\';
6367                     *p++ = '\\';
6368                 }
6369             }
6370 
6371             /* Map special whitespace to '\t', \n', '\r' */
6372             else if (ch == '\t') {
6373                 *p++ = '\\';
6374                 *p++ = 't';
6375             }
6376             else if (ch == '\n') {
6377                 *p++ = '\\';
6378                 *p++ = 'n';
6379             }
6380             else if (ch == '\r') {
6381                 *p++ = '\\';
6382                 *p++ = 'r';
6383             }
6384 
6385             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6386             else {
6387                 *p++ = '\\';
6388                 *p++ = 'x';
6389                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390                 *p++ = Py_hexdigits[ch & 0x000F];
6391             }
6392         }
6393         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6394         else if (ch < 0x10000) {
6395             *p++ = '\\';
6396             *p++ = 'u';
6397             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6398             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6399             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6400             *p++ = Py_hexdigits[ch & 0x000F];
6401         }
6402         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6403         else {
6404 
6405             /* Make sure that the first two digits are zero */
6406             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6407             *p++ = '\\';
6408             *p++ = 'U';
6409             *p++ = '0';
6410             *p++ = '0';
6411             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6412             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6413             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6414             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6415             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6416             *p++ = Py_hexdigits[ch & 0x0000000F];
6417         }
6418     }
6419 
6420     assert(p - PyBytes_AS_STRING(repr) > 0);
6421     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6422         return NULL;
6423     }
6424     return repr;
6425 }
6426 
6427 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6428 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6429                               Py_ssize_t size)
6430 {
6431     PyObject *result;
6432     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6433     if (tmp == NULL) {
6434         return NULL;
6435     }
6436 
6437     result = PyUnicode_AsUnicodeEscapeString(tmp);
6438     Py_DECREF(tmp);
6439     return result;
6440 }
6441 
6442 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6443 
6444 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6445 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6446                                  Py_ssize_t size,
6447                                  const char *errors)
6448 {
6449     const char *starts = s;
6450     _PyUnicodeWriter writer;
6451     const char *end;
6452     PyObject *errorHandler = NULL;
6453     PyObject *exc = NULL;
6454 
6455     if (size == 0) {
6456         _Py_RETURN_UNICODE_EMPTY();
6457     }
6458 
6459     /* Escaped strings will always be longer than the resulting
6460        Unicode string, so we start with size here and then reduce the
6461        length after conversion to the true value. (But decoding error
6462        handler might have to resize the string) */
6463     _PyUnicodeWriter_Init(&writer);
6464      writer.min_length = size;
6465     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6466         goto onError;
6467     }
6468 
6469     end = s + size;
6470     while (s < end) {
6471         unsigned char c = (unsigned char) *s++;
6472         Py_UCS4 ch;
6473         int count;
6474         Py_ssize_t startinpos;
6475         Py_ssize_t endinpos;
6476         const char *message;
6477 
6478 #define WRITE_CHAR(ch)                                                        \
6479             do {                                                              \
6480                 if (ch <= writer.maxchar) {                                   \
6481                     assert(writer.pos < writer.size);                         \
6482                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483                 }                                                             \
6484                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485                     goto onError;                                             \
6486                 }                                                             \
6487             } while(0)
6488 
6489         /* Non-escape characters are interpreted as Unicode ordinals */
6490         if (c != '\\' || s >= end) {
6491             WRITE_CHAR(c);
6492             continue;
6493         }
6494 
6495         c = (unsigned char) *s++;
6496         if (c == 'u') {
6497             count = 4;
6498             message = "truncated \\uXXXX escape";
6499         }
6500         else if (c == 'U') {
6501             count = 8;
6502             message = "truncated \\UXXXXXXXX escape";
6503         }
6504         else {
6505             assert(writer.pos < writer.size);
6506             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6507             WRITE_CHAR(c);
6508             continue;
6509         }
6510         startinpos = s - starts - 2;
6511 
6512         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6513         for (ch = 0; count && s < end; ++s, --count) {
6514             c = (unsigned char)*s;
6515             ch <<= 4;
6516             if (c >= '0' && c <= '9') {
6517                 ch += c - '0';
6518             }
6519             else if (c >= 'a' && c <= 'f') {
6520                 ch += c - ('a' - 10);
6521             }
6522             else if (c >= 'A' && c <= 'F') {
6523                 ch += c - ('A' - 10);
6524             }
6525             else {
6526                 break;
6527             }
6528         }
6529         if (!count) {
6530             if (ch <= MAX_UNICODE) {
6531                 WRITE_CHAR(ch);
6532                 continue;
6533             }
6534             message = "\\Uxxxxxxxx out of range";
6535         }
6536 
6537         endinpos = s-starts;
6538         writer.min_length = end - s + writer.pos;
6539         if (unicode_decode_call_errorhandler_writer(
6540                 errors, &errorHandler,
6541                 "rawunicodeescape", message,
6542                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6543                 &writer)) {
6544             goto onError;
6545         }
6546         assert(end - s <= writer.size - writer.pos);
6547 
6548 #undef WRITE_CHAR
6549     }
6550     Py_XDECREF(errorHandler);
6551     Py_XDECREF(exc);
6552     return _PyUnicodeWriter_Finish(&writer);
6553 
6554   onError:
6555     _PyUnicodeWriter_Dealloc(&writer);
6556     Py_XDECREF(errorHandler);
6557     Py_XDECREF(exc);
6558     return NULL;
6559 
6560 }
6561 
6562 
6563 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6564 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6565 {
6566     PyObject *repr;
6567     char *p;
6568     Py_ssize_t expandsize, pos;
6569     int kind;
6570     void *data;
6571     Py_ssize_t len;
6572 
6573     if (!PyUnicode_Check(unicode)) {
6574         PyErr_BadArgument();
6575         return NULL;
6576     }
6577     if (PyUnicode_READY(unicode) == -1) {
6578         return NULL;
6579     }
6580     kind = PyUnicode_KIND(unicode);
6581     data = PyUnicode_DATA(unicode);
6582     len = PyUnicode_GET_LENGTH(unicode);
6583     if (kind == PyUnicode_1BYTE_KIND) {
6584         return PyBytes_FromStringAndSize(data, len);
6585     }
6586 
6587     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6588        bytes, and 1 byte characters 4. */
6589     expandsize = kind * 2 + 2;
6590 
6591     if (len > PY_SSIZE_T_MAX / expandsize) {
6592         return PyErr_NoMemory();
6593     }
6594     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6595     if (repr == NULL) {
6596         return NULL;
6597     }
6598     if (len == 0) {
6599         return repr;
6600     }
6601 
6602     p = PyBytes_AS_STRING(repr);
6603     for (pos = 0; pos < len; pos++) {
6604         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6605 
6606         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6607         if (ch < 0x100) {
6608             *p++ = (char) ch;
6609         }
6610         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6611         else if (ch < 0x10000) {
6612             *p++ = '\\';
6613             *p++ = 'u';
6614             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6615             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6616             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6617             *p++ = Py_hexdigits[ch & 15];
6618         }
6619         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6620         else {
6621             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6622             *p++ = '\\';
6623             *p++ = 'U';
6624             *p++ = '0';
6625             *p++ = '0';
6626             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6627             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6628             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6629             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6630             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6631             *p++ = Py_hexdigits[ch & 15];
6632         }
6633     }
6634 
6635     assert(p > PyBytes_AS_STRING(repr));
6636     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6637         return NULL;
6638     }
6639     return repr;
6640 }
6641 
6642 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6643 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6644                                  Py_ssize_t size)
6645 {
6646     PyObject *result;
6647     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6648     if (tmp == NULL)
6649         return NULL;
6650     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6651     Py_DECREF(tmp);
6652     return result;
6653 }
6654 
6655 /* --- Latin-1 Codec ------------------------------------------------------ */
6656 
6657 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6658 PyUnicode_DecodeLatin1(const char *s,
6659                        Py_ssize_t size,
6660                        const char *errors)
6661 {
6662     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6663     return _PyUnicode_FromUCS1((unsigned char*)s, size);
6664 }
6665 
6666 /* create or adjust a UnicodeEncodeError */
6667 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6668 make_encode_exception(PyObject **exceptionObject,
6669                       const char *encoding,
6670                       PyObject *unicode,
6671                       Py_ssize_t startpos, Py_ssize_t endpos,
6672                       const char *reason)
6673 {
6674     if (*exceptionObject == NULL) {
6675         *exceptionObject = PyObject_CallFunction(
6676             PyExc_UnicodeEncodeError, "sOnns",
6677             encoding, unicode, startpos, endpos, reason);
6678     }
6679     else {
6680         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6681             goto onError;
6682         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6683             goto onError;
6684         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6685             goto onError;
6686         return;
6687       onError:
6688         Py_CLEAR(*exceptionObject);
6689     }
6690 }
6691 
6692 /* raises a UnicodeEncodeError */
6693 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6694 raise_encode_exception(PyObject **exceptionObject,
6695                        const char *encoding,
6696                        PyObject *unicode,
6697                        Py_ssize_t startpos, Py_ssize_t endpos,
6698                        const char *reason)
6699 {
6700     make_encode_exception(exceptionObject,
6701                           encoding, unicode, startpos, endpos, reason);
6702     if (*exceptionObject != NULL)
6703         PyCodec_StrictErrors(*exceptionObject);
6704 }
6705 
6706 /* error handling callback helper:
6707    build arguments, call the callback and check the arguments,
6708    put the result into newpos and return the replacement string, which
6709    has to be freed by the caller */
6710 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6711 unicode_encode_call_errorhandler(const char *errors,
6712                                  PyObject **errorHandler,
6713                                  const char *encoding, const char *reason,
6714                                  PyObject *unicode, PyObject **exceptionObject,
6715                                  Py_ssize_t startpos, Py_ssize_t endpos,
6716                                  Py_ssize_t *newpos)
6717 {
6718     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6719     Py_ssize_t len;
6720     PyObject *restuple;
6721     PyObject *resunicode;
6722 
6723     if (*errorHandler == NULL) {
6724         *errorHandler = PyCodec_LookupError(errors);
6725         if (*errorHandler == NULL)
6726             return NULL;
6727     }
6728 
6729     if (PyUnicode_READY(unicode) == -1)
6730         return NULL;
6731     len = PyUnicode_GET_LENGTH(unicode);
6732 
6733     make_encode_exception(exceptionObject,
6734                           encoding, unicode, startpos, endpos, reason);
6735     if (*exceptionObject == NULL)
6736         return NULL;
6737 
6738     restuple = PyObject_CallFunctionObjArgs(
6739         *errorHandler, *exceptionObject, NULL);
6740     if (restuple == NULL)
6741         return NULL;
6742     if (!PyTuple_Check(restuple)) {
6743         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6744         Py_DECREF(restuple);
6745         return NULL;
6746     }
6747     if (!PyArg_ParseTuple(restuple, argparse,
6748                           &resunicode, newpos)) {
6749         Py_DECREF(restuple);
6750         return NULL;
6751     }
6752     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6753         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6754         Py_DECREF(restuple);
6755         return NULL;
6756     }
6757     if (*newpos<0)
6758         *newpos = len + *newpos;
6759     if (*newpos<0 || *newpos>len) {
6760         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6761         Py_DECREF(restuple);
6762         return NULL;
6763     }
6764     Py_INCREF(resunicode);
6765     Py_DECREF(restuple);
6766     return resunicode;
6767 }
6768 
6769 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6770 unicode_encode_ucs1(PyObject *unicode,
6771                     const char *errors,
6772                     const Py_UCS4 limit)
6773 {
6774     /* input state */
6775     Py_ssize_t pos=0, size;
6776     int kind;
6777     void *data;
6778     /* pointer into the output */
6779     char *str;
6780     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6781     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6782     PyObject *error_handler_obj = NULL;
6783     PyObject *exc = NULL;
6784     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6785     PyObject *rep = NULL;
6786     /* output object */
6787     _PyBytesWriter writer;
6788 
6789     if (PyUnicode_READY(unicode) == -1)
6790         return NULL;
6791     size = PyUnicode_GET_LENGTH(unicode);
6792     kind = PyUnicode_KIND(unicode);
6793     data = PyUnicode_DATA(unicode);
6794     /* allocate enough for a simple encoding without
6795        replacements, if we need more, we'll resize */
6796     if (size == 0)
6797         return PyBytes_FromStringAndSize(NULL, 0);
6798 
6799     _PyBytesWriter_Init(&writer);
6800     str = _PyBytesWriter_Alloc(&writer, size);
6801     if (str == NULL)
6802         return NULL;
6803 
6804     while (pos < size) {
6805         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6806 
6807         /* can we encode this? */
6808         if (ch < limit) {
6809             /* no overflow check, because we know that the space is enough */
6810             *str++ = (char)ch;
6811             ++pos;
6812         }
6813         else {
6814             Py_ssize_t newpos, i;
6815             /* startpos for collecting unencodable chars */
6816             Py_ssize_t collstart = pos;
6817             Py_ssize_t collend = collstart + 1;
6818             /* find all unecodable characters */
6819 
6820             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6821                 ++collend;
6822 
6823             /* Only overallocate the buffer if it's not the last write */
6824             writer.overallocate = (collend < size);
6825 
6826             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6827             if (error_handler == _Py_ERROR_UNKNOWN)
6828                 error_handler = _Py_GetErrorHandler(errors);
6829 
6830             switch (error_handler) {
6831             case _Py_ERROR_STRICT:
6832                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6833                 goto onError;
6834 
6835             case _Py_ERROR_REPLACE:
6836                 memset(str, '?', collend - collstart);
6837                 str += (collend - collstart);
6838                 /* fall through */
6839             case _Py_ERROR_IGNORE:
6840                 pos = collend;
6841                 break;
6842 
6843             case _Py_ERROR_BACKSLASHREPLACE:
6844                 /* subtract preallocated bytes */
6845                 writer.min_size -= (collend - collstart);
6846                 str = backslashreplace(&writer, str,
6847                                        unicode, collstart, collend);
6848                 if (str == NULL)
6849                     goto onError;
6850                 pos = collend;
6851                 break;
6852 
6853             case _Py_ERROR_XMLCHARREFREPLACE:
6854                 /* subtract preallocated bytes */
6855                 writer.min_size -= (collend - collstart);
6856                 str = xmlcharrefreplace(&writer, str,
6857                                         unicode, collstart, collend);
6858                 if (str == NULL)
6859                     goto onError;
6860                 pos = collend;
6861                 break;
6862 
6863             case _Py_ERROR_SURROGATEESCAPE:
6864                 for (i = collstart; i < collend; ++i) {
6865                     ch = PyUnicode_READ(kind, data, i);
6866                     if (ch < 0xdc80 || 0xdcff < ch) {
6867                         /* Not a UTF-8b surrogate */
6868                         break;
6869                     }
6870                     *str++ = (char)(ch - 0xdc00);
6871                     ++pos;
6872                 }
6873                 if (i >= collend)
6874                     break;
6875                 collstart = pos;
6876                 assert(collstart != collend);
6877                 /* fall through */
6878 
6879             default:
6880                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6881                                                        encoding, reason, unicode, &exc,
6882                                                        collstart, collend, &newpos);
6883                 if (rep == NULL)
6884                     goto onError;
6885 
6886                 /* subtract preallocated bytes */
6887                 writer.min_size -= newpos - collstart;
6888 
6889                 if (PyBytes_Check(rep)) {
6890                     /* Directly copy bytes result to output. */
6891                     str = _PyBytesWriter_WriteBytes(&writer, str,
6892                                                     PyBytes_AS_STRING(rep),
6893                                                     PyBytes_GET_SIZE(rep));
6894                 }
6895                 else {
6896                     assert(PyUnicode_Check(rep));
6897 
6898                     if (PyUnicode_READY(rep) < 0)
6899                         goto onError;
6900 
6901                     if (limit == 256 ?
6902                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6903                         !PyUnicode_IS_ASCII(rep))
6904                     {
6905                         /* Not all characters are smaller than limit */
6906                         raise_encode_exception(&exc, encoding, unicode,
6907                                                collstart, collend, reason);
6908                         goto onError;
6909                     }
6910                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6911                     str = _PyBytesWriter_WriteBytes(&writer, str,
6912                                                     PyUnicode_DATA(rep),
6913                                                     PyUnicode_GET_LENGTH(rep));
6914                 }
6915                 if (str == NULL)
6916                     goto onError;
6917 
6918                 pos = newpos;
6919                 Py_CLEAR(rep);
6920             }
6921 
6922             /* If overallocation was disabled, ensure that it was the last
6923                write. Otherwise, we missed an optimization */
6924             assert(writer.overallocate || pos == size);
6925         }
6926     }
6927 
6928     Py_XDECREF(error_handler_obj);
6929     Py_XDECREF(exc);
6930     return _PyBytesWriter_Finish(&writer, str);
6931 
6932   onError:
6933     Py_XDECREF(rep);
6934     _PyBytesWriter_Dealloc(&writer);
6935     Py_XDECREF(error_handler_obj);
6936     Py_XDECREF(exc);
6937     return NULL;
6938 }
6939 
6940 /* Deprecated */
6941 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6942 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6943                        Py_ssize_t size,
6944                        const char *errors)
6945 {
6946     PyObject *result;
6947     PyObject *unicode = PyUnicode_FromWideChar(p, size);
6948     if (unicode == NULL)
6949         return NULL;
6950     result = unicode_encode_ucs1(unicode, errors, 256);
6951     Py_DECREF(unicode);
6952     return result;
6953 }
6954 
6955 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6956 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6957 {
6958     if (!PyUnicode_Check(unicode)) {
6959         PyErr_BadArgument();
6960         return NULL;
6961     }
6962     if (PyUnicode_READY(unicode) == -1)
6963         return NULL;
6964     /* Fast path: if it is a one-byte string, construct
6965        bytes object directly. */
6966     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6967         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6968                                          PyUnicode_GET_LENGTH(unicode));
6969     /* Non-Latin-1 characters present. Defer to above function to
6970        raise the exception. */
6971     return unicode_encode_ucs1(unicode, errors, 256);
6972 }
6973 
6974 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6975 PyUnicode_AsLatin1String(PyObject *unicode)
6976 {
6977     return _PyUnicode_AsLatin1String(unicode, NULL);
6978 }
6979 
6980 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6981 
6982 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6983 PyUnicode_DecodeASCII(const char *s,
6984                       Py_ssize_t size,
6985                       const char *errors)
6986 {
6987     const char *starts = s;
6988     _PyUnicodeWriter writer;
6989     int kind;
6990     void *data;
6991     Py_ssize_t startinpos;
6992     Py_ssize_t endinpos;
6993     Py_ssize_t outpos;
6994     const char *e;
6995     PyObject *error_handler_obj = NULL;
6996     PyObject *exc = NULL;
6997     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6998 
6999     if (size == 0)
7000         _Py_RETURN_UNICODE_EMPTY();
7001 
7002     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7003     if (size == 1 && (unsigned char)s[0] < 128)
7004         return get_latin1_char((unsigned char)s[0]);
7005 
7006     _PyUnicodeWriter_Init(&writer);
7007     writer.min_length = size;
7008     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
7009         return NULL;
7010 
7011     e = s + size;
7012     data = writer.data;
7013     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
7014     writer.pos = outpos;
7015     if (writer.pos == size)
7016         return _PyUnicodeWriter_Finish(&writer);
7017 
7018     s += writer.pos;
7019     kind = writer.kind;
7020     while (s < e) {
7021         unsigned char c = (unsigned char)*s;
7022         if (c < 128) {
7023             PyUnicode_WRITE(kind, data, writer.pos, c);
7024             writer.pos++;
7025             ++s;
7026             continue;
7027         }
7028 
7029         /* byte outsize range 0x00..0x7f: call the error handler */
7030 
7031         if (error_handler == _Py_ERROR_UNKNOWN)
7032             error_handler = _Py_GetErrorHandler(errors);
7033 
7034         switch (error_handler)
7035         {
7036         case _Py_ERROR_REPLACE:
7037         case _Py_ERROR_SURROGATEESCAPE:
7038             /* Fast-path: the error handler only writes one character,
7039                but we may switch to UCS2 at the first write */
7040             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7041                 goto onError;
7042             kind = writer.kind;
7043             data = writer.data;
7044 
7045             if (error_handler == _Py_ERROR_REPLACE)
7046                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7047             else
7048                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7049             writer.pos++;
7050             ++s;
7051             break;
7052 
7053         case _Py_ERROR_IGNORE:
7054             ++s;
7055             break;
7056 
7057         default:
7058             startinpos = s-starts;
7059             endinpos = startinpos + 1;
7060             if (unicode_decode_call_errorhandler_writer(
7061                     errors, &error_handler_obj,
7062                     "ascii", "ordinal not in range(128)",
7063                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7064                     &writer))
7065                 goto onError;
7066             kind = writer.kind;
7067             data = writer.data;
7068         }
7069     }
7070     Py_XDECREF(error_handler_obj);
7071     Py_XDECREF(exc);
7072     return _PyUnicodeWriter_Finish(&writer);
7073 
7074   onError:
7075     _PyUnicodeWriter_Dealloc(&writer);
7076     Py_XDECREF(error_handler_obj);
7077     Py_XDECREF(exc);
7078     return NULL;
7079 }
7080 
7081 /* Deprecated */
7082 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7083 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7084                       Py_ssize_t size,
7085                       const char *errors)
7086 {
7087     PyObject *result;
7088     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7089     if (unicode == NULL)
7090         return NULL;
7091     result = unicode_encode_ucs1(unicode, errors, 128);
7092     Py_DECREF(unicode);
7093     return result;
7094 }
7095 
7096 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7097 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7098 {
7099     if (!PyUnicode_Check(unicode)) {
7100         PyErr_BadArgument();
7101         return NULL;
7102     }
7103     if (PyUnicode_READY(unicode) == -1)
7104         return NULL;
7105     /* Fast path: if it is an ASCII-only string, construct bytes object
7106        directly. Else defer to above function to raise the exception. */
7107     if (PyUnicode_IS_ASCII(unicode))
7108         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7109                                          PyUnicode_GET_LENGTH(unicode));
7110     return unicode_encode_ucs1(unicode, errors, 128);
7111 }
7112 
7113 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7114 PyUnicode_AsASCIIString(PyObject *unicode)
7115 {
7116     return _PyUnicode_AsASCIIString(unicode, NULL);
7117 }
7118 
7119 #ifdef MS_WINDOWS
7120 
7121 /* --- MBCS codecs for Windows -------------------------------------------- */
7122 
7123 #if SIZEOF_INT < SIZEOF_SIZE_T
7124 #define NEED_RETRY
7125 #endif
7126 
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7131 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7132 
7133 #ifndef WC_ERR_INVALID_CHARS
7134 #  define WC_ERR_INVALID_CHARS 0x0080
7135 #endif
7136 
7137 static const char*
code_page_name(UINT code_page,PyObject ** obj)7138 code_page_name(UINT code_page, PyObject **obj)
7139 {
7140     *obj = NULL;
7141     if (code_page == CP_ACP)
7142         return "mbcs";
7143     if (code_page == CP_UTF7)
7144         return "CP_UTF7";
7145     if (code_page == CP_UTF8)
7146         return "CP_UTF8";
7147 
7148     *obj = PyBytes_FromFormat("cp%u", code_page);
7149     if (*obj == NULL)
7150         return NULL;
7151     return PyBytes_AS_STRING(*obj);
7152 }
7153 
7154 static DWORD
decode_code_page_flags(UINT code_page)7155 decode_code_page_flags(UINT code_page)
7156 {
7157     if (code_page == CP_UTF7) {
7158         /* The CP_UTF7 decoder only supports flags=0 */
7159         return 0;
7160     }
7161     else
7162         return MB_ERR_INVALID_CHARS;
7163 }
7164 
7165 /*
7166  * Decode a byte string from a Windows code page into unicode object in strict
7167  * mode.
7168  *
7169  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7170  * OSError and returns -1 on other error.
7171  */
7172 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7173 decode_code_page_strict(UINT code_page,
7174                         wchar_t **buf,
7175                         Py_ssize_t *bufsize,
7176                         const char *in,
7177                         int insize)
7178 {
7179     DWORD flags = MB_ERR_INVALID_CHARS;
7180     wchar_t *out;
7181     DWORD outsize;
7182 
7183     /* First get the size of the result */
7184     assert(insize > 0);
7185     while ((outsize = MultiByteToWideChar(code_page, flags,
7186                                           in, insize, NULL, 0)) <= 0)
7187     {
7188         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7189             goto error;
7190         }
7191         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7192         flags = 0;
7193     }
7194 
7195     /* Extend a wchar_t* buffer */
7196     Py_ssize_t n = *bufsize;   /* Get the current length */
7197     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7198         return -1;
7199     }
7200     out = *buf + n;
7201 
7202     /* Do the conversion */
7203     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7204     if (outsize <= 0)
7205         goto error;
7206     return insize;
7207 
7208 error:
7209     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7210         return -2;
7211     PyErr_SetFromWindowsErr(0);
7212     return -1;
7213 }
7214 
7215 /*
7216  * Decode a byte string from a code page into unicode object with an error
7217  * handler.
7218  *
7219  * Returns consumed size if succeed, or raise an OSError or
7220  * UnicodeDecodeError exception and returns -1 on error.
7221  */
7222 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7223 decode_code_page_errors(UINT code_page,
7224                         wchar_t **buf,
7225                         Py_ssize_t *bufsize,
7226                         const char *in, const int size,
7227                         const char *errors, int final)
7228 {
7229     const char *startin = in;
7230     const char *endin = in + size;
7231     DWORD flags = MB_ERR_INVALID_CHARS;
7232     /* Ideally, we should get reason from FormatMessage. This is the Windows
7233        2000 English version of the message. */
7234     const char *reason = "No mapping for the Unicode character exists "
7235                          "in the target code page.";
7236     /* each step cannot decode more than 1 character, but a character can be
7237        represented as a surrogate pair */
7238     wchar_t buffer[2], *out;
7239     int insize;
7240     Py_ssize_t outsize;
7241     PyObject *errorHandler = NULL;
7242     PyObject *exc = NULL;
7243     PyObject *encoding_obj = NULL;
7244     const char *encoding;
7245     DWORD err;
7246     int ret = -1;
7247 
7248     assert(size > 0);
7249 
7250     encoding = code_page_name(code_page, &encoding_obj);
7251     if (encoding == NULL)
7252         return -1;
7253 
7254     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7255         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7256            UnicodeDecodeError. */
7257         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7258         if (exc != NULL) {
7259             PyCodec_StrictErrors(exc);
7260             Py_CLEAR(exc);
7261         }
7262         goto error;
7263     }
7264 
7265     /* Extend a wchar_t* buffer */
7266     Py_ssize_t n = *bufsize;   /* Get the current length */
7267     if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7268         PyErr_NoMemory();
7269         goto error;
7270     }
7271     if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7272         goto error;
7273     }
7274     out = *buf + n;
7275 
7276     /* Decode the byte string character per character */
7277     while (in < endin)
7278     {
7279         /* Decode a character */
7280         insize = 1;
7281         do
7282         {
7283             outsize = MultiByteToWideChar(code_page, flags,
7284                                           in, insize,
7285                                           buffer, Py_ARRAY_LENGTH(buffer));
7286             if (outsize > 0)
7287                 break;
7288             err = GetLastError();
7289             if (err == ERROR_INVALID_FLAGS && flags) {
7290                 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7291                 flags = 0;
7292                 continue;
7293             }
7294             if (err != ERROR_NO_UNICODE_TRANSLATION
7295                 && err != ERROR_INSUFFICIENT_BUFFER)
7296             {
7297                 PyErr_SetFromWindowsErr(0);
7298                 goto error;
7299             }
7300             insize++;
7301         }
7302         /* 4=maximum length of a UTF-8 sequence */
7303         while (insize <= 4 && (in + insize) <= endin);
7304 
7305         if (outsize <= 0) {
7306             Py_ssize_t startinpos, endinpos, outpos;
7307 
7308             /* last character in partial decode? */
7309             if (in + insize >= endin && !final)
7310                 break;
7311 
7312             startinpos = in - startin;
7313             endinpos = startinpos + 1;
7314             outpos = out - *buf;
7315             if (unicode_decode_call_errorhandler_wchar(
7316                     errors, &errorHandler,
7317                     encoding, reason,
7318                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
7319                     buf, bufsize, &outpos))
7320             {
7321                 goto error;
7322             }
7323             out = *buf + outpos;
7324         }
7325         else {
7326             in += insize;
7327             memcpy(out, buffer, outsize * sizeof(wchar_t));
7328             out += outsize;
7329         }
7330     }
7331 
7332     /* Shrink the buffer */
7333     assert(out - *buf <= *bufsize);
7334     *bufsize = out - *buf;
7335     /* (in - startin) <= size and size is an int */
7336     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7337 
7338 error:
7339     Py_XDECREF(encoding_obj);
7340     Py_XDECREF(errorHandler);
7341     Py_XDECREF(exc);
7342     return ret;
7343 }
7344 
7345 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7346 decode_code_page_stateful(int code_page,
7347                           const char *s, Py_ssize_t size,
7348                           const char *errors, Py_ssize_t *consumed)
7349 {
7350     wchar_t *buf = NULL;
7351     Py_ssize_t bufsize = 0;
7352     int chunk_size, final, converted, done;
7353 
7354     if (code_page < 0) {
7355         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7356         return NULL;
7357     }
7358     if (size < 0) {
7359         PyErr_BadInternalCall();
7360         return NULL;
7361     }
7362 
7363     if (consumed)
7364         *consumed = 0;
7365 
7366     do
7367     {
7368 #ifdef NEED_RETRY
7369         if (size > DECODING_CHUNK_SIZE) {
7370             chunk_size = DECODING_CHUNK_SIZE;
7371             final = 0;
7372             done = 0;
7373         }
7374         else
7375 #endif
7376         {
7377             chunk_size = (int)size;
7378             final = (consumed == NULL);
7379             done = 1;
7380         }
7381 
7382         if (chunk_size == 0 && done) {
7383             if (buf != NULL)
7384                 break;
7385             _Py_RETURN_UNICODE_EMPTY();
7386         }
7387 
7388         converted = decode_code_page_strict(code_page, &buf, &bufsize,
7389                                             s, chunk_size);
7390         if (converted == -2)
7391             converted = decode_code_page_errors(code_page, &buf, &bufsize,
7392                                                 s, chunk_size,
7393                                                 errors, final);
7394         assert(converted != 0 || done);
7395 
7396         if (converted < 0) {
7397             PyMem_Free(buf);
7398             return NULL;
7399         }
7400 
7401         if (consumed)
7402             *consumed += converted;
7403 
7404         s += converted;
7405         size -= converted;
7406     } while (!done);
7407 
7408     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7409     PyMem_Free(buf);
7410     return v;
7411 }
7412 
7413 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7414 PyUnicode_DecodeCodePageStateful(int code_page,
7415                                  const char *s,
7416                                  Py_ssize_t size,
7417                                  const char *errors,
7418                                  Py_ssize_t *consumed)
7419 {
7420     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7421 }
7422 
7423 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7424 PyUnicode_DecodeMBCSStateful(const char *s,
7425                              Py_ssize_t size,
7426                              const char *errors,
7427                              Py_ssize_t *consumed)
7428 {
7429     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7430 }
7431 
7432 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7433 PyUnicode_DecodeMBCS(const char *s,
7434                      Py_ssize_t size,
7435                      const char *errors)
7436 {
7437     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7438 }
7439 
7440 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7441 encode_code_page_flags(UINT code_page, const char *errors)
7442 {
7443     if (code_page == CP_UTF8) {
7444         return WC_ERR_INVALID_CHARS;
7445     }
7446     else if (code_page == CP_UTF7) {
7447         /* CP_UTF7 only supports flags=0 */
7448         return 0;
7449     }
7450     else {
7451         if (errors != NULL && strcmp(errors, "replace") == 0)
7452             return 0;
7453         else
7454             return WC_NO_BEST_FIT_CHARS;
7455     }
7456 }
7457 
7458 /*
7459  * Encode a Unicode string to a Windows code page into a byte string in strict
7460  * mode.
7461  *
7462  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7463  * an OSError and returns -1 on other error.
7464  */
7465 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7466 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7467                         PyObject *unicode, Py_ssize_t offset, int len,
7468                         const char* errors)
7469 {
7470     BOOL usedDefaultChar = FALSE;
7471     BOOL *pusedDefaultChar = &usedDefaultChar;
7472     int outsize;
7473     wchar_t *p;
7474     Py_ssize_t size;
7475     const DWORD flags = encode_code_page_flags(code_page, NULL);
7476     char *out;
7477     /* Create a substring so that we can get the UTF-16 representation
7478        of just the slice under consideration. */
7479     PyObject *substring;
7480 
7481     assert(len > 0);
7482 
7483     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7484         pusedDefaultChar = &usedDefaultChar;
7485     else
7486         pusedDefaultChar = NULL;
7487 
7488     substring = PyUnicode_Substring(unicode, offset, offset+len);
7489     if (substring == NULL)
7490         return -1;
7491     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7492     if (p == NULL) {
7493         Py_DECREF(substring);
7494         return -1;
7495     }
7496     assert(size <= INT_MAX);
7497 
7498     /* First get the size of the result */
7499     outsize = WideCharToMultiByte(code_page, flags,
7500                                   p, (int)size,
7501                                   NULL, 0,
7502                                   NULL, pusedDefaultChar);
7503     if (outsize <= 0)
7504         goto error;
7505     /* If we used a default char, then we failed! */
7506     if (pusedDefaultChar && *pusedDefaultChar) {
7507         Py_DECREF(substring);
7508         return -2;
7509     }
7510 
7511     if (*outbytes == NULL) {
7512         /* Create string object */
7513         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7514         if (*outbytes == NULL) {
7515             Py_DECREF(substring);
7516             return -1;
7517         }
7518         out = PyBytes_AS_STRING(*outbytes);
7519     }
7520     else {
7521         /* Extend string object */
7522         const Py_ssize_t n = PyBytes_Size(*outbytes);
7523         if (outsize > PY_SSIZE_T_MAX - n) {
7524             PyErr_NoMemory();
7525             Py_DECREF(substring);
7526             return -1;
7527         }
7528         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7529             Py_DECREF(substring);
7530             return -1;
7531         }
7532         out = PyBytes_AS_STRING(*outbytes) + n;
7533     }
7534 
7535     /* Do the conversion */
7536     outsize = WideCharToMultiByte(code_page, flags,
7537                                   p, (int)size,
7538                                   out, outsize,
7539                                   NULL, pusedDefaultChar);
7540     Py_CLEAR(substring);
7541     if (outsize <= 0)
7542         goto error;
7543     if (pusedDefaultChar && *pusedDefaultChar)
7544         return -2;
7545     return 0;
7546 
7547 error:
7548     Py_XDECREF(substring);
7549     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7550         return -2;
7551     PyErr_SetFromWindowsErr(0);
7552     return -1;
7553 }
7554 
7555 /*
7556  * Encode a Unicode string to a Windows code page into a byte string using an
7557  * error handler.
7558  *
7559  * Returns consumed characters if succeed, or raise an OSError and returns
7560  * -1 on other error.
7561  */
7562 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7563 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7564                         PyObject *unicode, Py_ssize_t unicode_offset,
7565                         Py_ssize_t insize, const char* errors)
7566 {
7567     const DWORD flags = encode_code_page_flags(code_page, errors);
7568     Py_ssize_t pos = unicode_offset;
7569     Py_ssize_t endin = unicode_offset + insize;
7570     /* Ideally, we should get reason from FormatMessage. This is the Windows
7571        2000 English version of the message. */
7572     const char *reason = "invalid character";
7573     /* 4=maximum length of a UTF-8 sequence */
7574     char buffer[4];
7575     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7576     Py_ssize_t outsize;
7577     char *out;
7578     PyObject *errorHandler = NULL;
7579     PyObject *exc = NULL;
7580     PyObject *encoding_obj = NULL;
7581     const char *encoding;
7582     Py_ssize_t newpos, newoutsize;
7583     PyObject *rep;
7584     int ret = -1;
7585 
7586     assert(insize > 0);
7587 
7588     encoding = code_page_name(code_page, &encoding_obj);
7589     if (encoding == NULL)
7590         return -1;
7591 
7592     if (errors == NULL || strcmp(errors, "strict") == 0) {
7593         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7594            then we raise a UnicodeEncodeError. */
7595         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7596         if (exc != NULL) {
7597             PyCodec_StrictErrors(exc);
7598             Py_DECREF(exc);
7599         }
7600         Py_XDECREF(encoding_obj);
7601         return -1;
7602     }
7603 
7604     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7605         pusedDefaultChar = &usedDefaultChar;
7606     else
7607         pusedDefaultChar = NULL;
7608 
7609     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7610         PyErr_NoMemory();
7611         goto error;
7612     }
7613     outsize = insize * Py_ARRAY_LENGTH(buffer);
7614 
7615     if (*outbytes == NULL) {
7616         /* Create string object */
7617         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7618         if (*outbytes == NULL)
7619             goto error;
7620         out = PyBytes_AS_STRING(*outbytes);
7621     }
7622     else {
7623         /* Extend string object */
7624         Py_ssize_t n = PyBytes_Size(*outbytes);
7625         if (n > PY_SSIZE_T_MAX - outsize) {
7626             PyErr_NoMemory();
7627             goto error;
7628         }
7629         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7630             goto error;
7631         out = PyBytes_AS_STRING(*outbytes) + n;
7632     }
7633 
7634     /* Encode the string character per character */
7635     while (pos < endin)
7636     {
7637         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7638         wchar_t chars[2];
7639         int charsize;
7640         if (ch < 0x10000) {
7641             chars[0] = (wchar_t)ch;
7642             charsize = 1;
7643         }
7644         else {
7645             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7646             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7647             charsize = 2;
7648         }
7649 
7650         outsize = WideCharToMultiByte(code_page, flags,
7651                                       chars, charsize,
7652                                       buffer, Py_ARRAY_LENGTH(buffer),
7653                                       NULL, pusedDefaultChar);
7654         if (outsize > 0) {
7655             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7656             {
7657                 pos++;
7658                 memcpy(out, buffer, outsize);
7659                 out += outsize;
7660                 continue;
7661             }
7662         }
7663         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7664             PyErr_SetFromWindowsErr(0);
7665             goto error;
7666         }
7667 
7668         rep = unicode_encode_call_errorhandler(
7669                   errors, &errorHandler, encoding, reason,
7670                   unicode, &exc,
7671                   pos, pos + 1, &newpos);
7672         if (rep == NULL)
7673             goto error;
7674         pos = newpos;
7675 
7676         if (PyBytes_Check(rep)) {
7677             outsize = PyBytes_GET_SIZE(rep);
7678             if (outsize != 1) {
7679                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7680                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7681                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7682                     Py_DECREF(rep);
7683                     goto error;
7684                 }
7685                 out = PyBytes_AS_STRING(*outbytes) + offset;
7686             }
7687             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7688             out += outsize;
7689         }
7690         else {
7691             Py_ssize_t i;
7692             enum PyUnicode_Kind kind;
7693             void *data;
7694 
7695             if (PyUnicode_READY(rep) == -1) {
7696                 Py_DECREF(rep);
7697                 goto error;
7698             }
7699 
7700             outsize = PyUnicode_GET_LENGTH(rep);
7701             if (outsize != 1) {
7702                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7703                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7704                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7705                     Py_DECREF(rep);
7706                     goto error;
7707                 }
7708                 out = PyBytes_AS_STRING(*outbytes) + offset;
7709             }
7710             kind = PyUnicode_KIND(rep);
7711             data = PyUnicode_DATA(rep);
7712             for (i=0; i < outsize; i++) {
7713                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7714                 if (ch > 127) {
7715                     raise_encode_exception(&exc,
7716                         encoding, unicode,
7717                         pos, pos + 1,
7718                         "unable to encode error handler result to ASCII");
7719                     Py_DECREF(rep);
7720                     goto error;
7721                 }
7722                 *out = (unsigned char)ch;
7723                 out++;
7724             }
7725         }
7726         Py_DECREF(rep);
7727     }
7728     /* write a NUL byte */
7729     *out = 0;
7730     outsize = out - PyBytes_AS_STRING(*outbytes);
7731     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7732     if (_PyBytes_Resize(outbytes, outsize) < 0)
7733         goto error;
7734     ret = 0;
7735 
7736 error:
7737     Py_XDECREF(encoding_obj);
7738     Py_XDECREF(errorHandler);
7739     Py_XDECREF(exc);
7740     return ret;
7741 }
7742 
7743 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7744 encode_code_page(int code_page,
7745                  PyObject *unicode,
7746                  const char *errors)
7747 {
7748     Py_ssize_t len;
7749     PyObject *outbytes = NULL;
7750     Py_ssize_t offset;
7751     int chunk_len, ret, done;
7752 
7753     if (!PyUnicode_Check(unicode)) {
7754         PyErr_BadArgument();
7755         return NULL;
7756     }
7757 
7758     if (PyUnicode_READY(unicode) == -1)
7759         return NULL;
7760     len = PyUnicode_GET_LENGTH(unicode);
7761 
7762     if (code_page < 0) {
7763         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7764         return NULL;
7765     }
7766 
7767     if (len == 0)
7768         return PyBytes_FromStringAndSize(NULL, 0);
7769 
7770     offset = 0;
7771     do
7772     {
7773 #ifdef NEED_RETRY
7774         if (len > DECODING_CHUNK_SIZE) {
7775             chunk_len = DECODING_CHUNK_SIZE;
7776             done = 0;
7777         }
7778         else
7779 #endif
7780         {
7781             chunk_len = (int)len;
7782             done = 1;
7783         }
7784 
7785         ret = encode_code_page_strict(code_page, &outbytes,
7786                                       unicode, offset, chunk_len,
7787                                       errors);
7788         if (ret == -2)
7789             ret = encode_code_page_errors(code_page, &outbytes,
7790                                           unicode, offset,
7791                                           chunk_len, errors);
7792         if (ret < 0) {
7793             Py_XDECREF(outbytes);
7794             return NULL;
7795         }
7796 
7797         offset += chunk_len;
7798         len -= chunk_len;
7799     } while (!done);
7800 
7801     return outbytes;
7802 }
7803 
7804 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7805 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7806                      Py_ssize_t size,
7807                      const char *errors)
7808 {
7809     PyObject *unicode, *res;
7810     unicode = PyUnicode_FromWideChar(p, size);
7811     if (unicode == NULL)
7812         return NULL;
7813     res = encode_code_page(CP_ACP, unicode, errors);
7814     Py_DECREF(unicode);
7815     return res;
7816 }
7817 
7818 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7819 PyUnicode_EncodeCodePage(int code_page,
7820                          PyObject *unicode,
7821                          const char *errors)
7822 {
7823     return encode_code_page(code_page, unicode, errors);
7824 }
7825 
7826 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7827 PyUnicode_AsMBCSString(PyObject *unicode)
7828 {
7829     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7830 }
7831 
7832 #undef NEED_RETRY
7833 
7834 #endif /* MS_WINDOWS */
7835 
7836 /* --- Character Mapping Codec -------------------------------------------- */
7837 
7838 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7839 charmap_decode_string(const char *s,
7840                       Py_ssize_t size,
7841                       PyObject *mapping,
7842                       const char *errors,
7843                       _PyUnicodeWriter *writer)
7844 {
7845     const char *starts = s;
7846     const char *e;
7847     Py_ssize_t startinpos, endinpos;
7848     PyObject *errorHandler = NULL, *exc = NULL;
7849     Py_ssize_t maplen;
7850     enum PyUnicode_Kind mapkind;
7851     void *mapdata;
7852     Py_UCS4 x;
7853     unsigned char ch;
7854 
7855     if (PyUnicode_READY(mapping) == -1)
7856         return -1;
7857 
7858     maplen = PyUnicode_GET_LENGTH(mapping);
7859     mapdata = PyUnicode_DATA(mapping);
7860     mapkind = PyUnicode_KIND(mapping);
7861 
7862     e = s + size;
7863 
7864     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7865         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7866          * is disabled in encoding aliases, latin1 is preferred because
7867          * its implementation is faster. */
7868         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7869         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7870         Py_UCS4 maxchar = writer->maxchar;
7871 
7872         assert (writer->kind == PyUnicode_1BYTE_KIND);
7873         while (s < e) {
7874             ch = *s;
7875             x = mapdata_ucs1[ch];
7876             if (x > maxchar) {
7877                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7878                     goto onError;
7879                 maxchar = writer->maxchar;
7880                 outdata = (Py_UCS1 *)writer->data;
7881             }
7882             outdata[writer->pos] = x;
7883             writer->pos++;
7884             ++s;
7885         }
7886         return 0;
7887     }
7888 
7889     while (s < e) {
7890         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7891             enum PyUnicode_Kind outkind = writer->kind;
7892             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7893             if (outkind == PyUnicode_1BYTE_KIND) {
7894                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7895                 Py_UCS4 maxchar = writer->maxchar;
7896                 while (s < e) {
7897                     ch = *s;
7898                     x = mapdata_ucs2[ch];
7899                     if (x > maxchar)
7900                         goto Error;
7901                     outdata[writer->pos] = x;
7902                     writer->pos++;
7903                     ++s;
7904                 }
7905                 break;
7906             }
7907             else if (outkind == PyUnicode_2BYTE_KIND) {
7908                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7909                 while (s < e) {
7910                     ch = *s;
7911                     x = mapdata_ucs2[ch];
7912                     if (x == 0xFFFE)
7913                         goto Error;
7914                     outdata[writer->pos] = x;
7915                     writer->pos++;
7916                     ++s;
7917                 }
7918                 break;
7919             }
7920         }
7921         ch = *s;
7922 
7923         if (ch < maplen)
7924             x = PyUnicode_READ(mapkind, mapdata, ch);
7925         else
7926             x = 0xfffe; /* invalid value */
7927 Error:
7928         if (x == 0xfffe)
7929         {
7930             /* undefined mapping */
7931             startinpos = s-starts;
7932             endinpos = startinpos+1;
7933             if (unicode_decode_call_errorhandler_writer(
7934                     errors, &errorHandler,
7935                     "charmap", "character maps to <undefined>",
7936                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7937                     writer)) {
7938                 goto onError;
7939             }
7940             continue;
7941         }
7942 
7943         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7944             goto onError;
7945         ++s;
7946     }
7947     Py_XDECREF(errorHandler);
7948     Py_XDECREF(exc);
7949     return 0;
7950 
7951 onError:
7952     Py_XDECREF(errorHandler);
7953     Py_XDECREF(exc);
7954     return -1;
7955 }
7956 
7957 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7958 charmap_decode_mapping(const char *s,
7959                        Py_ssize_t size,
7960                        PyObject *mapping,
7961                        const char *errors,
7962                        _PyUnicodeWriter *writer)
7963 {
7964     const char *starts = s;
7965     const char *e;
7966     Py_ssize_t startinpos, endinpos;
7967     PyObject *errorHandler = NULL, *exc = NULL;
7968     unsigned char ch;
7969     PyObject *key, *item = NULL;
7970 
7971     e = s + size;
7972 
7973     while (s < e) {
7974         ch = *s;
7975 
7976         /* Get mapping (char ordinal -> integer, Unicode char or None) */
7977         key = PyLong_FromLong((long)ch);
7978         if (key == NULL)
7979             goto onError;
7980 
7981         item = PyObject_GetItem(mapping, key);
7982         Py_DECREF(key);
7983         if (item == NULL) {
7984             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7985                 /* No mapping found means: mapping is undefined. */
7986                 PyErr_Clear();
7987                 goto Undefined;
7988             } else
7989                 goto onError;
7990         }
7991 
7992         /* Apply mapping */
7993         if (item == Py_None)
7994             goto Undefined;
7995         if (PyLong_Check(item)) {
7996             long value = PyLong_AS_LONG(item);
7997             if (value == 0xFFFE)
7998                 goto Undefined;
7999             if (value < 0 || value > MAX_UNICODE) {
8000                 PyErr_Format(PyExc_TypeError,
8001                              "character mapping must be in range(0x%lx)",
8002                              (unsigned long)MAX_UNICODE + 1);
8003                 goto onError;
8004             }
8005 
8006             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8007                 goto onError;
8008         }
8009         else if (PyUnicode_Check(item)) {
8010             if (PyUnicode_READY(item) == -1)
8011                 goto onError;
8012             if (PyUnicode_GET_LENGTH(item) == 1) {
8013                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8014                 if (value == 0xFFFE)
8015                     goto Undefined;
8016                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8017                     goto onError;
8018             }
8019             else {
8020                 writer->overallocate = 1;
8021                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8022                     goto onError;
8023             }
8024         }
8025         else {
8026             /* wrong return value */
8027             PyErr_SetString(PyExc_TypeError,
8028                             "character mapping must return integer, None or str");
8029             goto onError;
8030         }
8031         Py_CLEAR(item);
8032         ++s;
8033         continue;
8034 
8035 Undefined:
8036         /* undefined mapping */
8037         Py_CLEAR(item);
8038         startinpos = s-starts;
8039         endinpos = startinpos+1;
8040         if (unicode_decode_call_errorhandler_writer(
8041                 errors, &errorHandler,
8042                 "charmap", "character maps to <undefined>",
8043                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8044                 writer)) {
8045             goto onError;
8046         }
8047     }
8048     Py_XDECREF(errorHandler);
8049     Py_XDECREF(exc);
8050     return 0;
8051 
8052 onError:
8053     Py_XDECREF(item);
8054     Py_XDECREF(errorHandler);
8055     Py_XDECREF(exc);
8056     return -1;
8057 }
8058 
8059 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8060 PyUnicode_DecodeCharmap(const char *s,
8061                         Py_ssize_t size,
8062                         PyObject *mapping,
8063                         const char *errors)
8064 {
8065     _PyUnicodeWriter writer;
8066 
8067     /* Default to Latin-1 */
8068     if (mapping == NULL)
8069         return PyUnicode_DecodeLatin1(s, size, errors);
8070 
8071     if (size == 0)
8072         _Py_RETURN_UNICODE_EMPTY();
8073     _PyUnicodeWriter_Init(&writer);
8074     writer.min_length = size;
8075     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8076         goto onError;
8077 
8078     if (PyUnicode_CheckExact(mapping)) {
8079         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8080             goto onError;
8081     }
8082     else {
8083         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8084             goto onError;
8085     }
8086     return _PyUnicodeWriter_Finish(&writer);
8087 
8088   onError:
8089     _PyUnicodeWriter_Dealloc(&writer);
8090     return NULL;
8091 }
8092 
8093 /* Charmap encoding: the lookup table */
8094 
8095 struct encoding_map {
8096     PyObject_HEAD
8097     unsigned char level1[32];
8098     int count2, count3;
8099     unsigned char level23[1];
8100 };
8101 
8102 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8103 encoding_map_size(PyObject *obj, PyObject* args)
8104 {
8105     struct encoding_map *map = (struct encoding_map*)obj;
8106     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8107                            128*map->count3);
8108 }
8109 
8110 static PyMethodDef encoding_map_methods[] = {
8111     {"size", encoding_map_size, METH_NOARGS,
8112      PyDoc_STR("Return the size (in bytes) of this object") },
8113     { 0 }
8114 };
8115 
8116 static PyTypeObject EncodingMapType = {
8117     PyVarObject_HEAD_INIT(NULL, 0)
8118     "EncodingMap",          /*tp_name*/
8119     sizeof(struct encoding_map),   /*tp_basicsize*/
8120     0,                      /*tp_itemsize*/
8121     /* methods */
8122     0,                      /*tp_dealloc*/
8123     0,                      /*tp_vectorcall_offset*/
8124     0,                      /*tp_getattr*/
8125     0,                      /*tp_setattr*/
8126     0,                      /*tp_as_async*/
8127     0,                      /*tp_repr*/
8128     0,                      /*tp_as_number*/
8129     0,                      /*tp_as_sequence*/
8130     0,                      /*tp_as_mapping*/
8131     0,                      /*tp_hash*/
8132     0,                      /*tp_call*/
8133     0,                      /*tp_str*/
8134     0,                      /*tp_getattro*/
8135     0,                      /*tp_setattro*/
8136     0,                      /*tp_as_buffer*/
8137     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8138     0,                      /*tp_doc*/
8139     0,                      /*tp_traverse*/
8140     0,                      /*tp_clear*/
8141     0,                      /*tp_richcompare*/
8142     0,                      /*tp_weaklistoffset*/
8143     0,                      /*tp_iter*/
8144     0,                      /*tp_iternext*/
8145     encoding_map_methods,   /*tp_methods*/
8146     0,                      /*tp_members*/
8147     0,                      /*tp_getset*/
8148     0,                      /*tp_base*/
8149     0,                      /*tp_dict*/
8150     0,                      /*tp_descr_get*/
8151     0,                      /*tp_descr_set*/
8152     0,                      /*tp_dictoffset*/
8153     0,                      /*tp_init*/
8154     0,                      /*tp_alloc*/
8155     0,                      /*tp_new*/
8156     0,                      /*tp_free*/
8157     0,                      /*tp_is_gc*/
8158 };
8159 
8160 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8161 PyUnicode_BuildEncodingMap(PyObject* string)
8162 {
8163     PyObject *result;
8164     struct encoding_map *mresult;
8165     int i;
8166     int need_dict = 0;
8167     unsigned char level1[32];
8168     unsigned char level2[512];
8169     unsigned char *mlevel1, *mlevel2, *mlevel3;
8170     int count2 = 0, count3 = 0;
8171     int kind;
8172     void *data;
8173     Py_ssize_t length;
8174     Py_UCS4 ch;
8175 
8176     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8177         PyErr_BadArgument();
8178         return NULL;
8179     }
8180     kind = PyUnicode_KIND(string);
8181     data = PyUnicode_DATA(string);
8182     length = PyUnicode_GET_LENGTH(string);
8183     length = Py_MIN(length, 256);
8184     memset(level1, 0xFF, sizeof level1);
8185     memset(level2, 0xFF, sizeof level2);
8186 
8187     /* If there isn't a one-to-one mapping of NULL to \0,
8188        or if there are non-BMP characters, we need to use
8189        a mapping dictionary. */
8190     if (PyUnicode_READ(kind, data, 0) != 0)
8191         need_dict = 1;
8192     for (i = 1; i < length; i++) {
8193         int l1, l2;
8194         ch = PyUnicode_READ(kind, data, i);
8195         if (ch == 0 || ch > 0xFFFF) {
8196             need_dict = 1;
8197             break;
8198         }
8199         if (ch == 0xFFFE)
8200             /* unmapped character */
8201             continue;
8202         l1 = ch >> 11;
8203         l2 = ch >> 7;
8204         if (level1[l1] == 0xFF)
8205             level1[l1] = count2++;
8206         if (level2[l2] == 0xFF)
8207             level2[l2] = count3++;
8208     }
8209 
8210     if (count2 >= 0xFF || count3 >= 0xFF)
8211         need_dict = 1;
8212 
8213     if (need_dict) {
8214         PyObject *result = PyDict_New();
8215         PyObject *key, *value;
8216         if (!result)
8217             return NULL;
8218         for (i = 0; i < length; i++) {
8219             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8220             value = PyLong_FromLong(i);
8221             if (!key || !value)
8222                 goto failed1;
8223             if (PyDict_SetItem(result, key, value) == -1)
8224                 goto failed1;
8225             Py_DECREF(key);
8226             Py_DECREF(value);
8227         }
8228         return result;
8229       failed1:
8230         Py_XDECREF(key);
8231         Py_XDECREF(value);
8232         Py_DECREF(result);
8233         return NULL;
8234     }
8235 
8236     /* Create a three-level trie */
8237     result = PyObject_MALLOC(sizeof(struct encoding_map) +
8238                              16*count2 + 128*count3 - 1);
8239     if (!result)
8240         return PyErr_NoMemory();
8241     PyObject_Init(result, &EncodingMapType);
8242     mresult = (struct encoding_map*)result;
8243     mresult->count2 = count2;
8244     mresult->count3 = count3;
8245     mlevel1 = mresult->level1;
8246     mlevel2 = mresult->level23;
8247     mlevel3 = mresult->level23 + 16*count2;
8248     memcpy(mlevel1, level1, 32);
8249     memset(mlevel2, 0xFF, 16*count2);
8250     memset(mlevel3, 0, 128*count3);
8251     count3 = 0;
8252     for (i = 1; i < length; i++) {
8253         int o1, o2, o3, i2, i3;
8254         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8255         if (ch == 0xFFFE)
8256             /* unmapped character */
8257             continue;
8258         o1 = ch>>11;
8259         o2 = (ch>>7) & 0xF;
8260         i2 = 16*mlevel1[o1] + o2;
8261         if (mlevel2[i2] == 0xFF)
8262             mlevel2[i2] = count3++;
8263         o3 = ch & 0x7F;
8264         i3 = 128*mlevel2[i2] + o3;
8265         mlevel3[i3] = i;
8266     }
8267     return result;
8268 }
8269 
8270 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8271 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8272 {
8273     struct encoding_map *map = (struct encoding_map*)mapping;
8274     int l1 = c>>11;
8275     int l2 = (c>>7) & 0xF;
8276     int l3 = c & 0x7F;
8277     int i;
8278 
8279     if (c > 0xFFFF)
8280         return -1;
8281     if (c == 0)
8282         return 0;
8283     /* level 1*/
8284     i = map->level1[l1];
8285     if (i == 0xFF) {
8286         return -1;
8287     }
8288     /* level 2*/
8289     i = map->level23[16*i+l2];
8290     if (i == 0xFF) {
8291         return -1;
8292     }
8293     /* level 3 */
8294     i = map->level23[16*map->count2 + 128*i + l3];
8295     if (i == 0) {
8296         return -1;
8297     }
8298     return i;
8299 }
8300 
8301 /* Lookup the character ch in the mapping. If the character
8302    can't be found, Py_None is returned (or NULL, if another
8303    error occurred). */
8304 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8305 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8306 {
8307     PyObject *w = PyLong_FromLong((long)c);
8308     PyObject *x;
8309 
8310     if (w == NULL)
8311         return NULL;
8312     x = PyObject_GetItem(mapping, w);
8313     Py_DECREF(w);
8314     if (x == NULL) {
8315         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8316             /* No mapping found means: mapping is undefined. */
8317             PyErr_Clear();
8318             Py_RETURN_NONE;
8319         } else
8320             return NULL;
8321     }
8322     else if (x == Py_None)
8323         return x;
8324     else if (PyLong_Check(x)) {
8325         long value = PyLong_AS_LONG(x);
8326         if (value < 0 || value > 255) {
8327             PyErr_SetString(PyExc_TypeError,
8328                             "character mapping must be in range(256)");
8329             Py_DECREF(x);
8330             return NULL;
8331         }
8332         return x;
8333     }
8334     else if (PyBytes_Check(x))
8335         return x;
8336     else {
8337         /* wrong return value */
8338         PyErr_Format(PyExc_TypeError,
8339                      "character mapping must return integer, bytes or None, not %.400s",
8340                      x->ob_type->tp_name);
8341         Py_DECREF(x);
8342         return NULL;
8343     }
8344 }
8345 
8346 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8347 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8348 {
8349     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8350     /* exponentially overallocate to minimize reallocations */
8351     if (requiredsize < 2*outsize)
8352         requiredsize = 2*outsize;
8353     if (_PyBytes_Resize(outobj, requiredsize))
8354         return -1;
8355     return 0;
8356 }
8357 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8361 /* lookup the character, put the result in the output string and adjust
8362    various state variables. Resize the output bytes object if not enough
8363    space is available. Return a new reference to the object that
8364    was put in the output buffer, or Py_None, if the mapping was undefined
8365    (in which case no character was written) or NULL, if a
8366    reallocation error occurred. The caller must decref the result */
8367 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8368 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8369                      PyObject **outobj, Py_ssize_t *outpos)
8370 {
8371     PyObject *rep;
8372     char *outstart;
8373     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8374 
8375     if (Py_TYPE(mapping) == &EncodingMapType) {
8376         int res = encoding_map_lookup(c, mapping);
8377         Py_ssize_t requiredsize = *outpos+1;
8378         if (res == -1)
8379             return enc_FAILED;
8380         if (outsize<requiredsize)
8381             if (charmapencode_resize(outobj, outpos, requiredsize))
8382                 return enc_EXCEPTION;
8383         outstart = PyBytes_AS_STRING(*outobj);
8384         outstart[(*outpos)++] = (char)res;
8385         return enc_SUCCESS;
8386     }
8387 
8388     rep = charmapencode_lookup(c, mapping);
8389     if (rep==NULL)
8390         return enc_EXCEPTION;
8391     else if (rep==Py_None) {
8392         Py_DECREF(rep);
8393         return enc_FAILED;
8394     } else {
8395         if (PyLong_Check(rep)) {
8396             Py_ssize_t requiredsize = *outpos+1;
8397             if (outsize<requiredsize)
8398                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8399                     Py_DECREF(rep);
8400                     return enc_EXCEPTION;
8401                 }
8402             outstart = PyBytes_AS_STRING(*outobj);
8403             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8404         }
8405         else {
8406             const char *repchars = PyBytes_AS_STRING(rep);
8407             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8408             Py_ssize_t requiredsize = *outpos+repsize;
8409             if (outsize<requiredsize)
8410                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8411                     Py_DECREF(rep);
8412                     return enc_EXCEPTION;
8413                 }
8414             outstart = PyBytes_AS_STRING(*outobj);
8415             memcpy(outstart + *outpos, repchars, repsize);
8416             *outpos += repsize;
8417         }
8418     }
8419     Py_DECREF(rep);
8420     return enc_SUCCESS;
8421 }
8422 
8423 /* handle an error in PyUnicode_EncodeCharmap
8424    Return 0 on success, -1 on error */
8425 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8426 charmap_encoding_error(
8427     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8428     PyObject **exceptionObject,
8429     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8430     PyObject **res, Py_ssize_t *respos)
8431 {
8432     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8433     Py_ssize_t size, repsize;
8434     Py_ssize_t newpos;
8435     enum PyUnicode_Kind kind;
8436     void *data;
8437     Py_ssize_t index;
8438     /* startpos for collecting unencodable chars */
8439     Py_ssize_t collstartpos = *inpos;
8440     Py_ssize_t collendpos = *inpos+1;
8441     Py_ssize_t collpos;
8442     const char *encoding = "charmap";
8443     const char *reason = "character maps to <undefined>";
8444     charmapencode_result x;
8445     Py_UCS4 ch;
8446     int val;
8447 
8448     if (PyUnicode_READY(unicode) == -1)
8449         return -1;
8450     size = PyUnicode_GET_LENGTH(unicode);
8451     /* find all unencodable characters */
8452     while (collendpos < size) {
8453         PyObject *rep;
8454         if (Py_TYPE(mapping) == &EncodingMapType) {
8455             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8456             val = encoding_map_lookup(ch, mapping);
8457             if (val != -1)
8458                 break;
8459             ++collendpos;
8460             continue;
8461         }
8462 
8463         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8464         rep = charmapencode_lookup(ch, mapping);
8465         if (rep==NULL)
8466             return -1;
8467         else if (rep!=Py_None) {
8468             Py_DECREF(rep);
8469             break;
8470         }
8471         Py_DECREF(rep);
8472         ++collendpos;
8473     }
8474     /* cache callback name lookup
8475      * (if not done yet, i.e. it's the first error) */
8476     if (*error_handler == _Py_ERROR_UNKNOWN)
8477         *error_handler = _Py_GetErrorHandler(errors);
8478 
8479     switch (*error_handler) {
8480     case _Py_ERROR_STRICT:
8481         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8482         return -1;
8483 
8484     case _Py_ERROR_REPLACE:
8485         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8486             x = charmapencode_output('?', mapping, res, respos);
8487             if (x==enc_EXCEPTION) {
8488                 return -1;
8489             }
8490             else if (x==enc_FAILED) {
8491                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8492                 return -1;
8493             }
8494         }
8495         /* fall through */
8496     case _Py_ERROR_IGNORE:
8497         *inpos = collendpos;
8498         break;
8499 
8500     case _Py_ERROR_XMLCHARREFREPLACE:
8501         /* generate replacement (temporarily (mis)uses p) */
8502         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8503             char buffer[2+29+1+1];
8504             char *cp;
8505             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8506             for (cp = buffer; *cp; ++cp) {
8507                 x = charmapencode_output(*cp, mapping, res, respos);
8508                 if (x==enc_EXCEPTION)
8509                     return -1;
8510                 else if (x==enc_FAILED) {
8511                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8512                     return -1;
8513                 }
8514             }
8515         }
8516         *inpos = collendpos;
8517         break;
8518 
8519     default:
8520         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8521                                                       encoding, reason, unicode, exceptionObject,
8522                                                       collstartpos, collendpos, &newpos);
8523         if (repunicode == NULL)
8524             return -1;
8525         if (PyBytes_Check(repunicode)) {
8526             /* Directly copy bytes result to output. */
8527             Py_ssize_t outsize = PyBytes_Size(*res);
8528             Py_ssize_t requiredsize;
8529             repsize = PyBytes_Size(repunicode);
8530             requiredsize = *respos + repsize;
8531             if (requiredsize > outsize)
8532                 /* Make room for all additional bytes. */
8533                 if (charmapencode_resize(res, respos, requiredsize)) {
8534                     Py_DECREF(repunicode);
8535                     return -1;
8536                 }
8537             memcpy(PyBytes_AsString(*res) + *respos,
8538                    PyBytes_AsString(repunicode),  repsize);
8539             *respos += repsize;
8540             *inpos = newpos;
8541             Py_DECREF(repunicode);
8542             break;
8543         }
8544         /* generate replacement  */
8545         if (PyUnicode_READY(repunicode) == -1) {
8546             Py_DECREF(repunicode);
8547             return -1;
8548         }
8549         repsize = PyUnicode_GET_LENGTH(repunicode);
8550         data = PyUnicode_DATA(repunicode);
8551         kind = PyUnicode_KIND(repunicode);
8552         for (index = 0; index < repsize; index++) {
8553             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8554             x = charmapencode_output(repch, mapping, res, respos);
8555             if (x==enc_EXCEPTION) {
8556                 Py_DECREF(repunicode);
8557                 return -1;
8558             }
8559             else if (x==enc_FAILED) {
8560                 Py_DECREF(repunicode);
8561                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8562                 return -1;
8563             }
8564         }
8565         *inpos = newpos;
8566         Py_DECREF(repunicode);
8567     }
8568     return 0;
8569 }
8570 
8571 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8572 _PyUnicode_EncodeCharmap(PyObject *unicode,
8573                          PyObject *mapping,
8574                          const char *errors)
8575 {
8576     /* output object */
8577     PyObject *res = NULL;
8578     /* current input position */
8579     Py_ssize_t inpos = 0;
8580     Py_ssize_t size;
8581     /* current output position */
8582     Py_ssize_t respos = 0;
8583     PyObject *error_handler_obj = NULL;
8584     PyObject *exc = NULL;
8585     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8586     void *data;
8587     int kind;
8588 
8589     if (PyUnicode_READY(unicode) == -1)
8590         return NULL;
8591     size = PyUnicode_GET_LENGTH(unicode);
8592     data = PyUnicode_DATA(unicode);
8593     kind = PyUnicode_KIND(unicode);
8594 
8595     /* Default to Latin-1 */
8596     if (mapping == NULL)
8597         return unicode_encode_ucs1(unicode, errors, 256);
8598 
8599     /* allocate enough for a simple encoding without
8600        replacements, if we need more, we'll resize */
8601     res = PyBytes_FromStringAndSize(NULL, size);
8602     if (res == NULL)
8603         goto onError;
8604     if (size == 0)
8605         return res;
8606 
8607     while (inpos<size) {
8608         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8609         /* try to encode it */
8610         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8611         if (x==enc_EXCEPTION) /* error */
8612             goto onError;
8613         if (x==enc_FAILED) { /* unencodable character */
8614             if (charmap_encoding_error(unicode, &inpos, mapping,
8615                                        &exc,
8616                                        &error_handler, &error_handler_obj, errors,
8617                                        &res, &respos)) {
8618                 goto onError;
8619             }
8620         }
8621         else
8622             /* done with this character => adjust input position */
8623             ++inpos;
8624     }
8625 
8626     /* Resize if we allocated to much */
8627     if (respos<PyBytes_GET_SIZE(res))
8628         if (_PyBytes_Resize(&res, respos) < 0)
8629             goto onError;
8630 
8631     Py_XDECREF(exc);
8632     Py_XDECREF(error_handler_obj);
8633     return res;
8634 
8635   onError:
8636     Py_XDECREF(res);
8637     Py_XDECREF(exc);
8638     Py_XDECREF(error_handler_obj);
8639     return NULL;
8640 }
8641 
8642 /* Deprecated */
8643 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8644 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8645                         Py_ssize_t size,
8646                         PyObject *mapping,
8647                         const char *errors)
8648 {
8649     PyObject *result;
8650     PyObject *unicode = PyUnicode_FromWideChar(p, size);
8651     if (unicode == NULL)
8652         return NULL;
8653     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8654     Py_DECREF(unicode);
8655     return result;
8656 }
8657 
8658 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8659 PyUnicode_AsCharmapString(PyObject *unicode,
8660                           PyObject *mapping)
8661 {
8662     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8663         PyErr_BadArgument();
8664         return NULL;
8665     }
8666     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8667 }
8668 
8669 /* create or adjust a UnicodeTranslateError */
8670 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8671 make_translate_exception(PyObject **exceptionObject,
8672                          PyObject *unicode,
8673                          Py_ssize_t startpos, Py_ssize_t endpos,
8674                          const char *reason)
8675 {
8676     if (*exceptionObject == NULL) {
8677         *exceptionObject = _PyUnicodeTranslateError_Create(
8678             unicode, startpos, endpos, reason);
8679     }
8680     else {
8681         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8682             goto onError;
8683         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8684             goto onError;
8685         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8686             goto onError;
8687         return;
8688       onError:
8689         Py_CLEAR(*exceptionObject);
8690     }
8691 }
8692 
8693 /* error handling callback helper:
8694    build arguments, call the callback and check the arguments,
8695    put the result into newpos and return the replacement string, which
8696    has to be freed by the caller */
8697 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8698 unicode_translate_call_errorhandler(const char *errors,
8699                                     PyObject **errorHandler,
8700                                     const char *reason,
8701                                     PyObject *unicode, PyObject **exceptionObject,
8702                                     Py_ssize_t startpos, Py_ssize_t endpos,
8703                                     Py_ssize_t *newpos)
8704 {
8705     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8706 
8707     Py_ssize_t i_newpos;
8708     PyObject *restuple;
8709     PyObject *resunicode;
8710 
8711     if (*errorHandler == NULL) {
8712         *errorHandler = PyCodec_LookupError(errors);
8713         if (*errorHandler == NULL)
8714             return NULL;
8715     }
8716 
8717     make_translate_exception(exceptionObject,
8718                              unicode, startpos, endpos, reason);
8719     if (*exceptionObject == NULL)
8720         return NULL;
8721 
8722     restuple = PyObject_CallFunctionObjArgs(
8723         *errorHandler, *exceptionObject, NULL);
8724     if (restuple == NULL)
8725         return NULL;
8726     if (!PyTuple_Check(restuple)) {
8727         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8728         Py_DECREF(restuple);
8729         return NULL;
8730     }
8731     if (!PyArg_ParseTuple(restuple, argparse,
8732                           &resunicode, &i_newpos)) {
8733         Py_DECREF(restuple);
8734         return NULL;
8735     }
8736     if (i_newpos<0)
8737         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8738     else
8739         *newpos = i_newpos;
8740     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8741         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8742         Py_DECREF(restuple);
8743         return NULL;
8744     }
8745     Py_INCREF(resunicode);
8746     Py_DECREF(restuple);
8747     return resunicode;
8748 }
8749 
8750 /* Lookup the character ch in the mapping and put the result in result,
8751    which must be decrefed by the caller.
8752    Return 0 on success, -1 on error */
8753 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8754 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8755 {
8756     PyObject *w = PyLong_FromLong((long)c);
8757     PyObject *x;
8758 
8759     if (w == NULL)
8760         return -1;
8761     x = PyObject_GetItem(mapping, w);
8762     Py_DECREF(w);
8763     if (x == NULL) {
8764         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8765             /* No mapping found means: use 1:1 mapping. */
8766             PyErr_Clear();
8767             *result = NULL;
8768             return 0;
8769         } else
8770             return -1;
8771     }
8772     else if (x == Py_None) {
8773         *result = x;
8774         return 0;
8775     }
8776     else if (PyLong_Check(x)) {
8777         long value = PyLong_AS_LONG(x);
8778         if (value < 0 || value > MAX_UNICODE) {
8779             PyErr_Format(PyExc_ValueError,
8780                          "character mapping must be in range(0x%x)",
8781                          MAX_UNICODE+1);
8782             Py_DECREF(x);
8783             return -1;
8784         }
8785         *result = x;
8786         return 0;
8787     }
8788     else if (PyUnicode_Check(x)) {
8789         *result = x;
8790         return 0;
8791     }
8792     else {
8793         /* wrong return value */
8794         PyErr_SetString(PyExc_TypeError,
8795                         "character mapping must return integer, None or str");
8796         Py_DECREF(x);
8797         return -1;
8798     }
8799 }
8800 
8801 /* lookup the character, write the result into the writer.
8802    Return 1 if the result was written into the writer, return 0 if the mapping
8803    was undefined, raise an exception return -1 on error. */
8804 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8805 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8806                         _PyUnicodeWriter *writer)
8807 {
8808     PyObject *item;
8809 
8810     if (charmaptranslate_lookup(ch, mapping, &item))
8811         return -1;
8812 
8813     if (item == NULL) {
8814         /* not found => default to 1:1 mapping */
8815         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8816             return -1;
8817         }
8818         return 1;
8819     }
8820 
8821     if (item == Py_None) {
8822         Py_DECREF(item);
8823         return 0;
8824     }
8825 
8826     if (PyLong_Check(item)) {
8827         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8828         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8829            used it */
8830         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8831             Py_DECREF(item);
8832             return -1;
8833         }
8834         Py_DECREF(item);
8835         return 1;
8836     }
8837 
8838     if (!PyUnicode_Check(item)) {
8839         Py_DECREF(item);
8840         return -1;
8841     }
8842 
8843     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8844         Py_DECREF(item);
8845         return -1;
8846     }
8847 
8848     Py_DECREF(item);
8849     return 1;
8850 }
8851 
8852 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8853 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8854                               Py_UCS1 *translate)
8855 {
8856     PyObject *item = NULL;
8857     int ret = 0;
8858 
8859     if (charmaptranslate_lookup(ch, mapping, &item)) {
8860         return -1;
8861     }
8862 
8863     if (item == Py_None) {
8864         /* deletion */
8865         translate[ch] = 0xfe;
8866     }
8867     else if (item == NULL) {
8868         /* not found => default to 1:1 mapping */
8869         translate[ch] = ch;
8870         return 1;
8871     }
8872     else if (PyLong_Check(item)) {
8873         long replace = PyLong_AS_LONG(item);
8874         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8875            used it */
8876         if (127 < replace) {
8877             /* invalid character or character outside ASCII:
8878                skip the fast translate */
8879             goto exit;
8880         }
8881         translate[ch] = (Py_UCS1)replace;
8882     }
8883     else if (PyUnicode_Check(item)) {
8884         Py_UCS4 replace;
8885 
8886         if (PyUnicode_READY(item) == -1) {
8887             Py_DECREF(item);
8888             return -1;
8889         }
8890         if (PyUnicode_GET_LENGTH(item) != 1)
8891             goto exit;
8892 
8893         replace = PyUnicode_READ_CHAR(item, 0);
8894         if (replace > 127)
8895             goto exit;
8896         translate[ch] = (Py_UCS1)replace;
8897     }
8898     else {
8899         /* not None, NULL, long or unicode */
8900         goto exit;
8901     }
8902     ret = 1;
8903 
8904   exit:
8905     Py_DECREF(item);
8906     return ret;
8907 }
8908 
8909 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8910    was translated into writer, return 0 if the input string was partially
8911    translated into writer, raise an exception and return -1 on error. */
8912 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8913 unicode_fast_translate(PyObject *input, PyObject *mapping,
8914                        _PyUnicodeWriter *writer, int ignore,
8915                        Py_ssize_t *input_pos)
8916 {
8917     Py_UCS1 ascii_table[128], ch, ch2;
8918     Py_ssize_t len;
8919     Py_UCS1 *in, *end, *out;
8920     int res = 0;
8921 
8922     len = PyUnicode_GET_LENGTH(input);
8923 
8924     memset(ascii_table, 0xff, 128);
8925 
8926     in = PyUnicode_1BYTE_DATA(input);
8927     end = in + len;
8928 
8929     assert(PyUnicode_IS_ASCII(writer->buffer));
8930     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8931     out = PyUnicode_1BYTE_DATA(writer->buffer);
8932 
8933     for (; in < end; in++) {
8934         ch = *in;
8935         ch2 = ascii_table[ch];
8936         if (ch2 == 0xff) {
8937             int translate = unicode_fast_translate_lookup(mapping, ch,
8938                                                           ascii_table);
8939             if (translate < 0)
8940                 return -1;
8941             if (translate == 0)
8942                 goto exit;
8943             ch2 = ascii_table[ch];
8944         }
8945         if (ch2 == 0xfe) {
8946             if (ignore)
8947                 continue;
8948             goto exit;
8949         }
8950         assert(ch2 < 128);
8951         *out = ch2;
8952         out++;
8953     }
8954     res = 1;
8955 
8956 exit:
8957     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8958     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8959     return res;
8960 }
8961 
8962 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8963 _PyUnicode_TranslateCharmap(PyObject *input,
8964                             PyObject *mapping,
8965                             const char *errors)
8966 {
8967     /* input object */
8968     char *data;
8969     Py_ssize_t size, i;
8970     int kind;
8971     /* output buffer */
8972     _PyUnicodeWriter writer;
8973     /* error handler */
8974     const char *reason = "character maps to <undefined>";
8975     PyObject *errorHandler = NULL;
8976     PyObject *exc = NULL;
8977     int ignore;
8978     int res;
8979 
8980     if (mapping == NULL) {
8981         PyErr_BadArgument();
8982         return NULL;
8983     }
8984 
8985     if (PyUnicode_READY(input) == -1)
8986         return NULL;
8987     data = (char*)PyUnicode_DATA(input);
8988     kind = PyUnicode_KIND(input);
8989     size = PyUnicode_GET_LENGTH(input);
8990 
8991     if (size == 0)
8992         return PyUnicode_FromObject(input);
8993 
8994     /* allocate enough for a simple 1:1 translation without
8995        replacements, if we need more, we'll resize */
8996     _PyUnicodeWriter_Init(&writer);
8997     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8998         goto onError;
8999 
9000     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9001 
9002     if (PyUnicode_READY(input) == -1)
9003         return NULL;
9004     if (PyUnicode_IS_ASCII(input)) {
9005         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9006         if (res < 0) {
9007             _PyUnicodeWriter_Dealloc(&writer);
9008             return NULL;
9009         }
9010         if (res == 1)
9011             return _PyUnicodeWriter_Finish(&writer);
9012     }
9013     else {
9014         i = 0;
9015     }
9016 
9017     while (i<size) {
9018         /* try to encode it */
9019         int translate;
9020         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9021         Py_ssize_t newpos;
9022         /* startpos for collecting untranslatable chars */
9023         Py_ssize_t collstart;
9024         Py_ssize_t collend;
9025         Py_UCS4 ch;
9026 
9027         ch = PyUnicode_READ(kind, data, i);
9028         translate = charmaptranslate_output(ch, mapping, &writer);
9029         if (translate < 0)
9030             goto onError;
9031 
9032         if (translate != 0) {
9033             /* it worked => adjust input pointer */
9034             ++i;
9035             continue;
9036         }
9037 
9038         /* untranslatable character */
9039         collstart = i;
9040         collend = i+1;
9041 
9042         /* find all untranslatable characters */
9043         while (collend < size) {
9044             PyObject *x;
9045             ch = PyUnicode_READ(kind, data, collend);
9046             if (charmaptranslate_lookup(ch, mapping, &x))
9047                 goto onError;
9048             Py_XDECREF(x);
9049             if (x != Py_None)
9050                 break;
9051             ++collend;
9052         }
9053 
9054         if (ignore) {
9055             i = collend;
9056         }
9057         else {
9058             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9059                                                              reason, input, &exc,
9060                                                              collstart, collend, &newpos);
9061             if (repunicode == NULL)
9062                 goto onError;
9063             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9064                 Py_DECREF(repunicode);
9065                 goto onError;
9066             }
9067             Py_DECREF(repunicode);
9068             i = newpos;
9069         }
9070     }
9071     Py_XDECREF(exc);
9072     Py_XDECREF(errorHandler);
9073     return _PyUnicodeWriter_Finish(&writer);
9074 
9075   onError:
9076     _PyUnicodeWriter_Dealloc(&writer);
9077     Py_XDECREF(exc);
9078     Py_XDECREF(errorHandler);
9079     return NULL;
9080 }
9081 
9082 /* Deprecated. Use PyUnicode_Translate instead. */
9083 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9084 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9085                            Py_ssize_t size,
9086                            PyObject *mapping,
9087                            const char *errors)
9088 {
9089     PyObject *result;
9090     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9091     if (!unicode)
9092         return NULL;
9093     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9094     Py_DECREF(unicode);
9095     return result;
9096 }
9097 
9098 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9099 PyUnicode_Translate(PyObject *str,
9100                     PyObject *mapping,
9101                     const char *errors)
9102 {
9103     if (ensure_unicode(str) < 0)
9104         return NULL;
9105     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9106 }
9107 
9108 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9109 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9110 {
9111     if (!PyUnicode_Check(unicode)) {
9112         PyErr_BadInternalCall();
9113         return NULL;
9114     }
9115     if (PyUnicode_READY(unicode) == -1)
9116         return NULL;
9117     if (PyUnicode_IS_ASCII(unicode)) {
9118         /* If the string is already ASCII, just return the same string */
9119         Py_INCREF(unicode);
9120         return unicode;
9121     }
9122 
9123     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9124     PyObject *result = PyUnicode_New(len, 127);
9125     if (result == NULL) {
9126         return NULL;
9127     }
9128 
9129     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9130     int kind = PyUnicode_KIND(unicode);
9131     const void *data = PyUnicode_DATA(unicode);
9132     Py_ssize_t i;
9133     for (i = 0; i < len; ++i) {
9134         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9135         if (ch < 127) {
9136             out[i] = ch;
9137         }
9138         else if (Py_UNICODE_ISSPACE(ch)) {
9139             out[i] = ' ';
9140         }
9141         else {
9142             int decimal = Py_UNICODE_TODECIMAL(ch);
9143             if (decimal < 0) {
9144                 out[i] = '?';
9145                 out[i+1] = '\0';
9146                 _PyUnicode_LENGTH(result) = i + 1;
9147                 break;
9148             }
9149             out[i] = '0' + decimal;
9150         }
9151     }
9152 
9153     assert(_PyUnicode_CheckConsistency(result, 1));
9154     return result;
9155 }
9156 
9157 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9158 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9159                                   Py_ssize_t length)
9160 {
9161     PyObject *decimal;
9162     Py_ssize_t i;
9163     Py_UCS4 maxchar;
9164     enum PyUnicode_Kind kind;
9165     void *data;
9166 
9167     maxchar = 127;
9168     for (i = 0; i < length; i++) {
9169         Py_UCS4 ch = s[i];
9170         if (ch > 127) {
9171             int decimal = Py_UNICODE_TODECIMAL(ch);
9172             if (decimal >= 0)
9173                 ch = '0' + decimal;
9174             maxchar = Py_MAX(maxchar, ch);
9175         }
9176     }
9177 
9178     /* Copy to a new string */
9179     decimal = PyUnicode_New(length, maxchar);
9180     if (decimal == NULL)
9181         return decimal;
9182     kind = PyUnicode_KIND(decimal);
9183     data = PyUnicode_DATA(decimal);
9184     /* Iterate over code points */
9185     for (i = 0; i < length; i++) {
9186         Py_UCS4 ch = s[i];
9187         if (ch > 127) {
9188             int decimal = Py_UNICODE_TODECIMAL(ch);
9189             if (decimal >= 0)
9190                 ch = '0' + decimal;
9191         }
9192         PyUnicode_WRITE(kind, data, i, ch);
9193     }
9194     return unicode_result(decimal);
9195 }
9196 /* --- Decimal Encoder ---------------------------------------------------- */
9197 
9198 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9199 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9200                         Py_ssize_t length,
9201                         char *output,
9202                         const char *errors)
9203 {
9204     PyObject *unicode;
9205     Py_ssize_t i;
9206     enum PyUnicode_Kind kind;
9207     void *data;
9208 
9209     if (output == NULL) {
9210         PyErr_BadArgument();
9211         return -1;
9212     }
9213 
9214     unicode = PyUnicode_FromWideChar(s, length);
9215     if (unicode == NULL)
9216         return -1;
9217 
9218     kind = PyUnicode_KIND(unicode);
9219     data = PyUnicode_DATA(unicode);
9220 
9221     for (i=0; i < length; ) {
9222         PyObject *exc;
9223         Py_UCS4 ch;
9224         int decimal;
9225         Py_ssize_t startpos;
9226 
9227         ch = PyUnicode_READ(kind, data, i);
9228 
9229         if (Py_UNICODE_ISSPACE(ch)) {
9230             *output++ = ' ';
9231             i++;
9232             continue;
9233         }
9234         decimal = Py_UNICODE_TODECIMAL(ch);
9235         if (decimal >= 0) {
9236             *output++ = '0' + decimal;
9237             i++;
9238             continue;
9239         }
9240         if (0 < ch && ch < 256) {
9241             *output++ = (char)ch;
9242             i++;
9243             continue;
9244         }
9245 
9246         startpos = i;
9247         exc = NULL;
9248         raise_encode_exception(&exc, "decimal", unicode,
9249                                startpos, startpos+1,
9250                                "invalid decimal Unicode string");
9251         Py_XDECREF(exc);
9252         Py_DECREF(unicode);
9253         return -1;
9254     }
9255     /* 0-terminate the output string */
9256     *output++ = '\0';
9257     Py_DECREF(unicode);
9258     return 0;
9259 }
9260 
9261 /* --- Helpers ------------------------------------------------------------ */
9262 
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; they are adjusted in place
   against the sequence length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added and, if still negative,
       becomes 0.
   `start` is not clamped against `len`.  Every argument may be evaluated
   more than once, so do not pass expressions with side effects.

   Wrapped in do { } while (0) so the expansion is a single statement and
   can be used safely in unbraced if/else bodies; invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9277 
9278 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9279 any_find_slice(PyObject* s1, PyObject* s2,
9280                Py_ssize_t start,
9281                Py_ssize_t end,
9282                int direction)
9283 {
9284     int kind1, kind2;
9285     void *buf1, *buf2;
9286     Py_ssize_t len1, len2, result;
9287 
9288     kind1 = PyUnicode_KIND(s1);
9289     kind2 = PyUnicode_KIND(s2);
9290     if (kind1 < kind2)
9291         return -1;
9292 
9293     len1 = PyUnicode_GET_LENGTH(s1);
9294     len2 = PyUnicode_GET_LENGTH(s2);
9295     ADJUST_INDICES(start, end, len1);
9296     if (end - start < len2)
9297         return -1;
9298 
9299     buf1 = PyUnicode_DATA(s1);
9300     buf2 = PyUnicode_DATA(s2);
9301     if (len2 == 1) {
9302         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9303         result = findchar((const char *)buf1 + kind1*start,
9304                           kind1, end - start, ch, direction);
9305         if (result == -1)
9306             return -1;
9307         else
9308             return start + result;
9309     }
9310 
9311     if (kind2 != kind1) {
9312         buf2 = _PyUnicode_AsKind(s2, kind1);
9313         if (!buf2)
9314             return -2;
9315     }
9316 
9317     if (direction > 0) {
9318         switch (kind1) {
9319         case PyUnicode_1BYTE_KIND:
9320             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9321                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9322             else
9323                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9324             break;
9325         case PyUnicode_2BYTE_KIND:
9326             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9327             break;
9328         case PyUnicode_4BYTE_KIND:
9329             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9330             break;
9331         default:
9332             Py_UNREACHABLE();
9333         }
9334     }
9335     else {
9336         switch (kind1) {
9337         case PyUnicode_1BYTE_KIND:
9338             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9339                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340             else
9341                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342             break;
9343         case PyUnicode_2BYTE_KIND:
9344             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345             break;
9346         case PyUnicode_4BYTE_KIND:
9347             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9348             break;
9349         default:
9350             Py_UNREACHABLE();
9351         }
9352     }
9353 
9354     if (kind2 != kind1)
9355         PyMem_Free(buf2);
9356 
9357     return result;
9358 }
9359 
9360 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9361 #include "stringlib/localeutil.h"
9362 
9363 /**
9364  * InsertThousandsGrouping:
9365  * @writer: Unicode writer.
9366  * @n_buffer: Number of characters in @buffer.
9367  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9368  * @d_pos: Start of digits string.
9369  * @n_digits: The number of digits in the string, in which we want
9370  *            to put the grouping chars.
9371  * @min_width: The minimum width of the digits in the output string.
9372  *             Output will be zero-padded on the left to fill.
9373  * @grouping: see definition in localeconv().
9374  * @thousands_sep: see definition in localeconv().
9375  *
9376  * There are 2 modes: counting and filling. If @writer is NULL,
9377  *  we are in counting mode, else filling mode.
9378  * If counting, the required buffer size is returned.
9379  * If filling, we know the buffer will be large enough, so we don't
9380  *  need to pass in the buffer size.
9381  * Inserts thousand grouping characters (as defined by grouping and
9382  *  thousands_sep) into @writer.
9383  *
9384  * Return value: -1 on error, number of characters otherwise.
9385  **/
9386 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9387 _PyUnicode_InsertThousandsGrouping(
9388     _PyUnicodeWriter *writer,
9389     Py_ssize_t n_buffer,
9390     PyObject *digits,
9391     Py_ssize_t d_pos,
9392     Py_ssize_t n_digits,
9393     Py_ssize_t min_width,
9394     const char *grouping,
9395     PyObject *thousands_sep,
9396     Py_UCS4 *maxchar)
9397 {
9398     min_width = Py_MAX(0, min_width);
9399     if (writer) {
9400         assert(digits != NULL);
9401         assert(maxchar == NULL);
9402     }
9403     else {
9404         assert(digits == NULL);
9405         assert(maxchar != NULL);
9406     }
9407     assert(0 <= d_pos);
9408     assert(0 <= n_digits);
9409     assert(grouping != NULL);
9410 
9411     if (digits != NULL) {
9412         if (PyUnicode_READY(digits) == -1) {
9413             return -1;
9414         }
9415     }
9416     if (PyUnicode_READY(thousands_sep) == -1) {
9417         return -1;
9418     }
9419 
9420     Py_ssize_t count = 0;
9421     Py_ssize_t n_zeros;
9422     int loop_broken = 0;
9423     int use_separator = 0; /* First time through, don't append the
9424                               separator. They only go between
9425                               groups. */
9426     Py_ssize_t buffer_pos;
9427     Py_ssize_t digits_pos;
9428     Py_ssize_t len;
9429     Py_ssize_t n_chars;
9430     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9431                                         be looked at */
9432     /* A generator that returns all of the grouping widths, until it
9433        returns 0. */
9434     GroupGenerator groupgen;
9435     GroupGenerator_init(&groupgen, grouping);
9436     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9437 
9438     /* if digits are not grouped, thousands separator
9439        should be an empty string */
9440     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9441 
9442     digits_pos = d_pos + n_digits;
9443     if (writer) {
9444         buffer_pos = writer->pos + n_buffer;
9445         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9446         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9447     }
9448     else {
9449         buffer_pos = n_buffer;
9450     }
9451 
9452     if (!writer) {
9453         *maxchar = 127;
9454     }
9455 
9456     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9457         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9458         n_zeros = Py_MAX(0, len - remaining);
9459         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9460 
9461         /* Use n_zero zero's and n_chars chars */
9462 
9463         /* Count only, don't do anything. */
9464         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9465 
9466         /* Copy into the writer. */
9467         InsertThousandsGrouping_fill(writer, &buffer_pos,
9468                                      digits, &digits_pos,
9469                                      n_chars, n_zeros,
9470                                      use_separator ? thousands_sep : NULL,
9471                                      thousands_sep_len, maxchar);
9472 
9473         /* Use a separator next time. */
9474         use_separator = 1;
9475 
9476         remaining -= n_chars;
9477         min_width -= len;
9478 
9479         if (remaining <= 0 && min_width <= 0) {
9480             loop_broken = 1;
9481             break;
9482         }
9483         min_width -= thousands_sep_len;
9484     }
9485     if (!loop_broken) {
9486         /* We left the loop without using a break statement. */
9487 
9488         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9489         n_zeros = Py_MAX(0, len - remaining);
9490         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9491 
9492         /* Use n_zero zero's and n_chars chars */
9493         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9494 
9495         /* Copy into the writer. */
9496         InsertThousandsGrouping_fill(writer, &buffer_pos,
9497                                      digits, &digits_pos,
9498                                      n_chars, n_zeros,
9499                                      use_separator ? thousands_sep : NULL,
9500                                      thousands_sep_len, maxchar);
9501     }
9502     return count;
9503 }
9504 
9505 
9506 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9507 PyUnicode_Count(PyObject *str,
9508                 PyObject *substr,
9509                 Py_ssize_t start,
9510                 Py_ssize_t end)
9511 {
9512     Py_ssize_t result;
9513     int kind1, kind2;
9514     void *buf1 = NULL, *buf2 = NULL;
9515     Py_ssize_t len1, len2;
9516 
9517     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9518         return -1;
9519 
9520     kind1 = PyUnicode_KIND(str);
9521     kind2 = PyUnicode_KIND(substr);
9522     if (kind1 < kind2)
9523         return 0;
9524 
9525     len1 = PyUnicode_GET_LENGTH(str);
9526     len2 = PyUnicode_GET_LENGTH(substr);
9527     ADJUST_INDICES(start, end, len1);
9528     if (end - start < len2)
9529         return 0;
9530 
9531     buf1 = PyUnicode_DATA(str);
9532     buf2 = PyUnicode_DATA(substr);
9533     if (kind2 != kind1) {
9534         buf2 = _PyUnicode_AsKind(substr, kind1);
9535         if (!buf2)
9536             goto onError;
9537     }
9538 
9539     switch (kind1) {
9540     case PyUnicode_1BYTE_KIND:
9541         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9542             result = asciilib_count(
9543                 ((Py_UCS1*)buf1) + start, end - start,
9544                 buf2, len2, PY_SSIZE_T_MAX
9545                 );
9546         else
9547             result = ucs1lib_count(
9548                 ((Py_UCS1*)buf1) + start, end - start,
9549                 buf2, len2, PY_SSIZE_T_MAX
9550                 );
9551         break;
9552     case PyUnicode_2BYTE_KIND:
9553         result = ucs2lib_count(
9554             ((Py_UCS2*)buf1) + start, end - start,
9555             buf2, len2, PY_SSIZE_T_MAX
9556             );
9557         break;
9558     case PyUnicode_4BYTE_KIND:
9559         result = ucs4lib_count(
9560             ((Py_UCS4*)buf1) + start, end - start,
9561             buf2, len2, PY_SSIZE_T_MAX
9562             );
9563         break;
9564     default:
9565         Py_UNREACHABLE();
9566     }
9567 
9568     if (kind2 != kind1)
9569         PyMem_Free(buf2);
9570 
9571     return result;
9572   onError:
9573     if (kind2 != kind1 && buf2)
9574         PyMem_Free(buf2);
9575     return -1;
9576 }
9577 
9578 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9579 PyUnicode_Find(PyObject *str,
9580                PyObject *substr,
9581                Py_ssize_t start,
9582                Py_ssize_t end,
9583                int direction)
9584 {
9585     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9586         return -2;
9587 
9588     return any_find_slice(str, substr, start, end, direction);
9589 }
9590 
9591 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9592 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9593                    Py_ssize_t start, Py_ssize_t end,
9594                    int direction)
9595 {
9596     int kind;
9597     Py_ssize_t len, result;
9598     if (PyUnicode_READY(str) == -1)
9599         return -2;
9600     len = PyUnicode_GET_LENGTH(str);
9601     ADJUST_INDICES(start, end, len);
9602     if (end - start < 1)
9603         return -1;
9604     kind = PyUnicode_KIND(str);
9605     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9606                       kind, end-start, ch, direction);
9607     if (result == -1)
9608         return -1;
9609     else
9610         return start + result;
9611 }
9612 
9613 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9614 tailmatch(PyObject *self,
9615           PyObject *substring,
9616           Py_ssize_t start,
9617           Py_ssize_t end,
9618           int direction)
9619 {
9620     int kind_self;
9621     int kind_sub;
9622     void *data_self;
9623     void *data_sub;
9624     Py_ssize_t offset;
9625     Py_ssize_t i;
9626     Py_ssize_t end_sub;
9627 
9628     if (PyUnicode_READY(self) == -1 ||
9629         PyUnicode_READY(substring) == -1)
9630         return -1;
9631 
9632     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9633     end -= PyUnicode_GET_LENGTH(substring);
9634     if (end < start)
9635         return 0;
9636 
9637     if (PyUnicode_GET_LENGTH(substring) == 0)
9638         return 1;
9639 
9640     kind_self = PyUnicode_KIND(self);
9641     data_self = PyUnicode_DATA(self);
9642     kind_sub = PyUnicode_KIND(substring);
9643     data_sub = PyUnicode_DATA(substring);
9644     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9645 
9646     if (direction > 0)
9647         offset = end;
9648     else
9649         offset = start;
9650 
9651     if (PyUnicode_READ(kind_self, data_self, offset) ==
9652         PyUnicode_READ(kind_sub, data_sub, 0) &&
9653         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9654         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9655         /* If both are of the same kind, memcmp is sufficient */
9656         if (kind_self == kind_sub) {
9657             return ! memcmp((char *)data_self +
9658                                 (offset * PyUnicode_KIND(substring)),
9659                             data_sub,
9660                             PyUnicode_GET_LENGTH(substring) *
9661                                 PyUnicode_KIND(substring));
9662         }
9663         /* otherwise we have to compare each character by first accessing it */
9664         else {
9665             /* We do not need to compare 0 and len(substring)-1 because
9666                the if statement above ensured already that they are equal
9667                when we end up here. */
9668             for (i = 1; i < end_sub; ++i) {
9669                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9670                     PyUnicode_READ(kind_sub, data_sub, i))
9671                     return 0;
9672             }
9673             return 1;
9674         }
9675     }
9676 
9677     return 0;
9678 }
9679 
9680 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9681 PyUnicode_Tailmatch(PyObject *str,
9682                     PyObject *substr,
9683                     Py_ssize_t start,
9684                     Py_ssize_t end,
9685                     int direction)
9686 {
9687     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9688         return -1;
9689 
9690     return tailmatch(str, substr, start, end, direction);
9691 }
9692 
9693 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9694 ascii_upper_or_lower(PyObject *self, int lower)
9695 {
9696     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9697     char *resdata, *data = PyUnicode_DATA(self);
9698     PyObject *res;
9699 
9700     res = PyUnicode_New(len, 127);
9701     if (res == NULL)
9702         return NULL;
9703     resdata = PyUnicode_DATA(res);
9704     if (lower)
9705         _Py_bytes_lower(resdata, data, len);
9706     else
9707         _Py_bytes_upper(resdata, data, len);
9708     return res;
9709 }
9710 
9711 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9712 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9713 {
9714     Py_ssize_t j;
9715     int final_sigma;
9716     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9717     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9718 
9719      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9720 
9721     where ! is a negation and \p{xxx} is a character with property xxx.
9722     */
9723     for (j = i - 1; j >= 0; j--) {
9724         c = PyUnicode_READ(kind, data, j);
9725         if (!_PyUnicode_IsCaseIgnorable(c))
9726             break;
9727     }
9728     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9729     if (final_sigma) {
9730         for (j = i + 1; j < length; j++) {
9731             c = PyUnicode_READ(kind, data, j);
9732             if (!_PyUnicode_IsCaseIgnorable(c))
9733                 break;
9734         }
9735         final_sigma = j == length || !_PyUnicode_IsCased(c);
9736     }
9737     return (final_sigma) ? 0x3C2 : 0x3C3;
9738 }
9739 
9740 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9741 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9742            Py_UCS4 c, Py_UCS4 *mapped)
9743 {
9744     /* Obscure special case. */
9745     if (c == 0x3A3) {
9746         mapped[0] = handle_capital_sigma(kind, data, length, i);
9747         return 1;
9748     }
9749     return _PyUnicode_ToLowerFull(c, mapped);
9750 }
9751 
9752 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9753 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9754 {
9755     Py_ssize_t i, k = 0;
9756     int n_res, j;
9757     Py_UCS4 c, mapped[3];
9758 
9759     c = PyUnicode_READ(kind, data, 0);
9760     n_res = _PyUnicode_ToTitleFull(c, mapped);
9761     for (j = 0; j < n_res; j++) {
9762         *maxchar = Py_MAX(*maxchar, mapped[j]);
9763         res[k++] = mapped[j];
9764     }
9765     for (i = 1; i < length; i++) {
9766         c = PyUnicode_READ(kind, data, i);
9767         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768         for (j = 0; j < n_res; j++) {
9769             *maxchar = Py_MAX(*maxchar, mapped[j]);
9770             res[k++] = mapped[j];
9771         }
9772     }
9773     return k;
9774 }
9775 
9776 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9777 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9778     Py_ssize_t i, k = 0;
9779 
9780     for (i = 0; i < length; i++) {
9781         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9782         int n_res, j;
9783         if (Py_UNICODE_ISUPPER(c)) {
9784             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9785         }
9786         else if (Py_UNICODE_ISLOWER(c)) {
9787             n_res = _PyUnicode_ToUpperFull(c, mapped);
9788         }
9789         else {
9790             n_res = 1;
9791             mapped[0] = c;
9792         }
9793         for (j = 0; j < n_res; j++) {
9794             *maxchar = Py_MAX(*maxchar, mapped[j]);
9795             res[k++] = mapped[j];
9796         }
9797     }
9798     return k;
9799 }
9800 
9801 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9802 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9803                   Py_UCS4 *maxchar, int lower)
9804 {
9805     Py_ssize_t i, k = 0;
9806 
9807     for (i = 0; i < length; i++) {
9808         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809         int n_res, j;
9810         if (lower)
9811             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812         else
9813             n_res = _PyUnicode_ToUpperFull(c, mapped);
9814         for (j = 0; j < n_res; j++) {
9815             *maxchar = Py_MAX(*maxchar, mapped[j]);
9816             res[k++] = mapped[j];
9817         }
9818     }
9819     return k;
9820 }
9821 
9822 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9823 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824 {
9825     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9826 }
9827 
9828 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9829 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830 {
9831     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9832 }
9833 
9834 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9835 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836 {
9837     Py_ssize_t i, k = 0;
9838 
9839     for (i = 0; i < length; i++) {
9840         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9841         Py_UCS4 mapped[3];
9842         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9843         for (j = 0; j < n_res; j++) {
9844             *maxchar = Py_MAX(*maxchar, mapped[j]);
9845             res[k++] = mapped[j];
9846         }
9847     }
9848     return k;
9849 }
9850 
9851 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9852 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853 {
9854     Py_ssize_t i, k = 0;
9855     int previous_is_cased;
9856 
9857     previous_is_cased = 0;
9858     for (i = 0; i < length; i++) {
9859         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9860         Py_UCS4 mapped[3];
9861         int n_res, j;
9862 
9863         if (previous_is_cased)
9864             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9865         else
9866             n_res = _PyUnicode_ToTitleFull(c, mapped);
9867 
9868         for (j = 0; j < n_res; j++) {
9869             *maxchar = Py_MAX(*maxchar, mapped[j]);
9870             res[k++] = mapped[j];
9871         }
9872 
9873         previous_is_cased = _PyUnicode_IsCased(c);
9874     }
9875     return k;
9876 }
9877 
9878 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9879 case_operation(PyObject *self,
9880                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9881 {
9882     PyObject *res = NULL;
9883     Py_ssize_t length, newlength = 0;
9884     int kind, outkind;
9885     void *data, *outdata;
9886     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9887 
9888     assert(PyUnicode_IS_READY(self));
9889 
9890     kind = PyUnicode_KIND(self);
9891     data = PyUnicode_DATA(self);
9892     length = PyUnicode_GET_LENGTH(self);
9893     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9894         PyErr_SetString(PyExc_OverflowError, "string is too long");
9895         return NULL;
9896     }
9897     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9898     if (tmp == NULL)
9899         return PyErr_NoMemory();
9900     newlength = perform(kind, data, length, tmp, &maxchar);
9901     res = PyUnicode_New(newlength, maxchar);
9902     if (res == NULL)
9903         goto leave;
9904     tmpend = tmp + newlength;
9905     outdata = PyUnicode_DATA(res);
9906     outkind = PyUnicode_KIND(res);
9907     switch (outkind) {
9908     case PyUnicode_1BYTE_KIND:
9909         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9910         break;
9911     case PyUnicode_2BYTE_KIND:
9912         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9913         break;
9914     case PyUnicode_4BYTE_KIND:
9915         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9916         break;
9917     default:
9918         Py_UNREACHABLE();
9919     }
9920   leave:
9921     PyMem_FREE(tmp);
9922     return res;
9923 }
9924 
9925 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9926 PyUnicode_Join(PyObject *separator, PyObject *seq)
9927 {
9928     PyObject *res;
9929     PyObject *fseq;
9930     Py_ssize_t seqlen;
9931     PyObject **items;
9932 
9933     fseq = PySequence_Fast(seq, "can only join an iterable");
9934     if (fseq == NULL) {
9935         return NULL;
9936     }
9937 
9938     /* NOTE: the following code can't call back into Python code,
9939      * so we are sure that fseq won't be mutated.
9940      */
9941 
9942     items = PySequence_Fast_ITEMS(fseq);
9943     seqlen = PySequence_Fast_GET_SIZE(fseq);
9944     res = _PyUnicode_JoinArray(separator, items, seqlen);
9945     Py_DECREF(fseq);
9946     return res;
9947 }
9948 
9949 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9950 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9951 {
9952     PyObject *res = NULL; /* the result */
9953     PyObject *sep = NULL;
9954     Py_ssize_t seplen;
9955     PyObject *item;
9956     Py_ssize_t sz, i, res_offset;
9957     Py_UCS4 maxchar;
9958     Py_UCS4 item_maxchar;
9959     int use_memcpy;
9960     unsigned char *res_data = NULL, *sep_data = NULL;
9961     PyObject *last_obj;
9962     unsigned int kind = 0;
9963 
9964     /* If empty sequence, return u"". */
9965     if (seqlen == 0) {
9966         _Py_RETURN_UNICODE_EMPTY();
9967     }
9968 
9969     /* If singleton sequence with an exact Unicode, return that. */
9970     last_obj = NULL;
9971     if (seqlen == 1) {
9972         if (PyUnicode_CheckExact(items[0])) {
9973             res = items[0];
9974             Py_INCREF(res);
9975             return res;
9976         }
9977         seplen = 0;
9978         maxchar = 0;
9979     }
9980     else {
9981         /* Set up sep and seplen */
9982         if (separator == NULL) {
9983             /* fall back to a blank space separator */
9984             sep = PyUnicode_FromOrdinal(' ');
9985             if (!sep)
9986                 goto onError;
9987             seplen = 1;
9988             maxchar = 32;
9989         }
9990         else {
9991             if (!PyUnicode_Check(separator)) {
9992                 PyErr_Format(PyExc_TypeError,
9993                              "separator: expected str instance,"
9994                              " %.80s found",
9995                              Py_TYPE(separator)->tp_name);
9996                 goto onError;
9997             }
9998             if (PyUnicode_READY(separator))
9999                 goto onError;
10000             sep = separator;
10001             seplen = PyUnicode_GET_LENGTH(separator);
10002             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10003             /* inc refcount to keep this code path symmetric with the
10004                above case of a blank separator */
10005             Py_INCREF(sep);
10006         }
10007         last_obj = sep;
10008     }
10009 
10010     /* There are at least two things to join, or else we have a subclass
10011      * of str in the sequence.
10012      * Do a pre-pass to figure out the total amount of space we'll
10013      * need (sz), and see whether all argument are strings.
10014      */
10015     sz = 0;
10016 #ifdef Py_DEBUG
10017     use_memcpy = 0;
10018 #else
10019     use_memcpy = 1;
10020 #endif
10021     for (i = 0; i < seqlen; i++) {
10022         size_t add_sz;
10023         item = items[i];
10024         if (!PyUnicode_Check(item)) {
10025             PyErr_Format(PyExc_TypeError,
10026                          "sequence item %zd: expected str instance,"
10027                          " %.80s found",
10028                          i, Py_TYPE(item)->tp_name);
10029             goto onError;
10030         }
10031         if (PyUnicode_READY(item) == -1)
10032             goto onError;
10033         add_sz = PyUnicode_GET_LENGTH(item);
10034         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10035         maxchar = Py_MAX(maxchar, item_maxchar);
10036         if (i != 0) {
10037             add_sz += seplen;
10038         }
10039         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10040             PyErr_SetString(PyExc_OverflowError,
10041                             "join() result is too long for a Python string");
10042             goto onError;
10043         }
10044         sz += add_sz;
10045         if (use_memcpy && last_obj != NULL) {
10046             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10047                 use_memcpy = 0;
10048         }
10049         last_obj = item;
10050     }
10051 
10052     res = PyUnicode_New(sz, maxchar);
10053     if (res == NULL)
10054         goto onError;
10055 
10056     /* Catenate everything. */
10057 #ifdef Py_DEBUG
10058     use_memcpy = 0;
10059 #else
10060     if (use_memcpy) {
10061         res_data = PyUnicode_1BYTE_DATA(res);
10062         kind = PyUnicode_KIND(res);
10063         if (seplen != 0)
10064             sep_data = PyUnicode_1BYTE_DATA(sep);
10065     }
10066 #endif
10067     if (use_memcpy) {
10068         for (i = 0; i < seqlen; ++i) {
10069             Py_ssize_t itemlen;
10070             item = items[i];
10071 
10072             /* Copy item, and maybe the separator. */
10073             if (i && seplen != 0) {
10074                 memcpy(res_data,
10075                           sep_data,
10076                           kind * seplen);
10077                 res_data += kind * seplen;
10078             }
10079 
10080             itemlen = PyUnicode_GET_LENGTH(item);
10081             if (itemlen != 0) {
10082                 memcpy(res_data,
10083                           PyUnicode_DATA(item),
10084                           kind * itemlen);
10085                 res_data += kind * itemlen;
10086             }
10087         }
10088         assert(res_data == PyUnicode_1BYTE_DATA(res)
10089                            + kind * PyUnicode_GET_LENGTH(res));
10090     }
10091     else {
10092         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10093             Py_ssize_t itemlen;
10094             item = items[i];
10095 
10096             /* Copy item, and maybe the separator. */
10097             if (i && seplen != 0) {
10098                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10099                 res_offset += seplen;
10100             }
10101 
10102             itemlen = PyUnicode_GET_LENGTH(item);
10103             if (itemlen != 0) {
10104                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10105                 res_offset += itemlen;
10106             }
10107         }
10108         assert(res_offset == PyUnicode_GET_LENGTH(res));
10109     }
10110 
10111     Py_XDECREF(sep);
10112     assert(_PyUnicode_CheckConsistency(res, 1));
10113     return res;
10114 
10115   onError:
10116     Py_XDECREF(sep);
10117     Py_XDECREF(res);
10118     return NULL;
10119 }
10120 
10121 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10122 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123                     Py_UCS4 fill_char)
10124 {
10125     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10126     void *data = PyUnicode_DATA(unicode);
10127     assert(PyUnicode_IS_READY(unicode));
10128     assert(unicode_modifiable(unicode));
10129     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10130     assert(start >= 0);
10131     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10132     unicode_fill(kind, data, fill_char, start, length);
10133 }
10134 
10135 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10136 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10137                Py_UCS4 fill_char)
10138 {
10139     Py_ssize_t maxlen;
10140 
10141     if (!PyUnicode_Check(unicode)) {
10142         PyErr_BadInternalCall();
10143         return -1;
10144     }
10145     if (PyUnicode_READY(unicode) == -1)
10146         return -1;
10147     if (unicode_check_modifiable(unicode))
10148         return -1;
10149 
10150     if (start < 0) {
10151         PyErr_SetString(PyExc_IndexError, "string index out of range");
10152         return -1;
10153     }
10154     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10155         PyErr_SetString(PyExc_ValueError,
10156                          "fill character is bigger than "
10157                          "the string maximum character");
10158         return -1;
10159     }
10160 
10161     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10162     length = Py_MIN(maxlen, length);
10163     if (length <= 0)
10164         return 0;
10165 
10166     _PyUnicode_FastFill(unicode, start, length, fill_char);
10167     return length;
10168 }
10169 
10170 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10171 pad(PyObject *self,
10172     Py_ssize_t left,
10173     Py_ssize_t right,
10174     Py_UCS4 fill)
10175 {
10176     PyObject *u;
10177     Py_UCS4 maxchar;
10178     int kind;
10179     void *data;
10180 
10181     if (left < 0)
10182         left = 0;
10183     if (right < 0)
10184         right = 0;
10185 
10186     if (left == 0 && right == 0)
10187         return unicode_result_unchanged(self);
10188 
10189     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10190         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10191         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10192         return NULL;
10193     }
10194     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10195     maxchar = Py_MAX(maxchar, fill);
10196     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10197     if (!u)
10198         return NULL;
10199 
10200     kind = PyUnicode_KIND(u);
10201     data = PyUnicode_DATA(u);
10202     if (left)
10203         unicode_fill(kind, data, fill, 0, left);
10204     if (right)
10205         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10206     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10207     assert(_PyUnicode_CheckConsistency(u, 1));
10208     return u;
10209 }
10210 
10211 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10212 PyUnicode_Splitlines(PyObject *string, int keepends)
10213 {
10214     PyObject *list;
10215 
10216     if (ensure_unicode(string) < 0)
10217         return NULL;
10218 
10219     switch (PyUnicode_KIND(string)) {
10220     case PyUnicode_1BYTE_KIND:
10221         if (PyUnicode_IS_ASCII(string))
10222             list = asciilib_splitlines(
10223                 string, PyUnicode_1BYTE_DATA(string),
10224                 PyUnicode_GET_LENGTH(string), keepends);
10225         else
10226             list = ucs1lib_splitlines(
10227                 string, PyUnicode_1BYTE_DATA(string),
10228                 PyUnicode_GET_LENGTH(string), keepends);
10229         break;
10230     case PyUnicode_2BYTE_KIND:
10231         list = ucs2lib_splitlines(
10232             string, PyUnicode_2BYTE_DATA(string),
10233             PyUnicode_GET_LENGTH(string), keepends);
10234         break;
10235     case PyUnicode_4BYTE_KIND:
10236         list = ucs4lib_splitlines(
10237             string, PyUnicode_4BYTE_DATA(string),
10238             PyUnicode_GET_LENGTH(string), keepends);
10239         break;
10240     default:
10241         Py_UNREACHABLE();
10242     }
10243     return list;
10244 }
10245 
10246 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10247 split(PyObject *self,
10248       PyObject *substring,
10249       Py_ssize_t maxcount)
10250 {
10251     int kind1, kind2;
10252     void *buf1, *buf2;
10253     Py_ssize_t len1, len2;
10254     PyObject* out;
10255 
10256     if (maxcount < 0)
10257         maxcount = PY_SSIZE_T_MAX;
10258 
10259     if (PyUnicode_READY(self) == -1)
10260         return NULL;
10261 
10262     if (substring == NULL)
10263         switch (PyUnicode_KIND(self)) {
10264         case PyUnicode_1BYTE_KIND:
10265             if (PyUnicode_IS_ASCII(self))
10266                 return asciilib_split_whitespace(
10267                     self,  PyUnicode_1BYTE_DATA(self),
10268                     PyUnicode_GET_LENGTH(self), maxcount
10269                     );
10270             else
10271                 return ucs1lib_split_whitespace(
10272                     self,  PyUnicode_1BYTE_DATA(self),
10273                     PyUnicode_GET_LENGTH(self), maxcount
10274                     );
10275         case PyUnicode_2BYTE_KIND:
10276             return ucs2lib_split_whitespace(
10277                 self,  PyUnicode_2BYTE_DATA(self),
10278                 PyUnicode_GET_LENGTH(self), maxcount
10279                 );
10280         case PyUnicode_4BYTE_KIND:
10281             return ucs4lib_split_whitespace(
10282                 self,  PyUnicode_4BYTE_DATA(self),
10283                 PyUnicode_GET_LENGTH(self), maxcount
10284                 );
10285         default:
10286             Py_UNREACHABLE();
10287         }
10288 
10289     if (PyUnicode_READY(substring) == -1)
10290         return NULL;
10291 
10292     kind1 = PyUnicode_KIND(self);
10293     kind2 = PyUnicode_KIND(substring);
10294     len1 = PyUnicode_GET_LENGTH(self);
10295     len2 = PyUnicode_GET_LENGTH(substring);
10296     if (kind1 < kind2 || len1 < len2) {
10297         out = PyList_New(1);
10298         if (out == NULL)
10299             return NULL;
10300         Py_INCREF(self);
10301         PyList_SET_ITEM(out, 0, self);
10302         return out;
10303     }
10304     buf1 = PyUnicode_DATA(self);
10305     buf2 = PyUnicode_DATA(substring);
10306     if (kind2 != kind1) {
10307         buf2 = _PyUnicode_AsKind(substring, kind1);
10308         if (!buf2)
10309             return NULL;
10310     }
10311 
10312     switch (kind1) {
10313     case PyUnicode_1BYTE_KIND:
10314         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10315             out = asciilib_split(
10316                 self,  buf1, len1, buf2, len2, maxcount);
10317         else
10318             out = ucs1lib_split(
10319                 self,  buf1, len1, buf2, len2, maxcount);
10320         break;
10321     case PyUnicode_2BYTE_KIND:
10322         out = ucs2lib_split(
10323             self,  buf1, len1, buf2, len2, maxcount);
10324         break;
10325     case PyUnicode_4BYTE_KIND:
10326         out = ucs4lib_split(
10327             self,  buf1, len1, buf2, len2, maxcount);
10328         break;
10329     default:
10330         out = NULL;
10331     }
10332     if (kind2 != kind1)
10333         PyMem_Free(buf2);
10334     return out;
10335 }
10336 
10337 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10338 rsplit(PyObject *self,
10339        PyObject *substring,
10340        Py_ssize_t maxcount)
10341 {
10342     int kind1, kind2;
10343     void *buf1, *buf2;
10344     Py_ssize_t len1, len2;
10345     PyObject* out;
10346 
10347     if (maxcount < 0)
10348         maxcount = PY_SSIZE_T_MAX;
10349 
10350     if (PyUnicode_READY(self) == -1)
10351         return NULL;
10352 
10353     if (substring == NULL)
10354         switch (PyUnicode_KIND(self)) {
10355         case PyUnicode_1BYTE_KIND:
10356             if (PyUnicode_IS_ASCII(self))
10357                 return asciilib_rsplit_whitespace(
10358                     self,  PyUnicode_1BYTE_DATA(self),
10359                     PyUnicode_GET_LENGTH(self), maxcount
10360                     );
10361             else
10362                 return ucs1lib_rsplit_whitespace(
10363                     self,  PyUnicode_1BYTE_DATA(self),
10364                     PyUnicode_GET_LENGTH(self), maxcount
10365                     );
10366         case PyUnicode_2BYTE_KIND:
10367             return ucs2lib_rsplit_whitespace(
10368                 self,  PyUnicode_2BYTE_DATA(self),
10369                 PyUnicode_GET_LENGTH(self), maxcount
10370                 );
10371         case PyUnicode_4BYTE_KIND:
10372             return ucs4lib_rsplit_whitespace(
10373                 self,  PyUnicode_4BYTE_DATA(self),
10374                 PyUnicode_GET_LENGTH(self), maxcount
10375                 );
10376         default:
10377             Py_UNREACHABLE();
10378         }
10379 
10380     if (PyUnicode_READY(substring) == -1)
10381         return NULL;
10382 
10383     kind1 = PyUnicode_KIND(self);
10384     kind2 = PyUnicode_KIND(substring);
10385     len1 = PyUnicode_GET_LENGTH(self);
10386     len2 = PyUnicode_GET_LENGTH(substring);
10387     if (kind1 < kind2 || len1 < len2) {
10388         out = PyList_New(1);
10389         if (out == NULL)
10390             return NULL;
10391         Py_INCREF(self);
10392         PyList_SET_ITEM(out, 0, self);
10393         return out;
10394     }
10395     buf1 = PyUnicode_DATA(self);
10396     buf2 = PyUnicode_DATA(substring);
10397     if (kind2 != kind1) {
10398         buf2 = _PyUnicode_AsKind(substring, kind1);
10399         if (!buf2)
10400             return NULL;
10401     }
10402 
10403     switch (kind1) {
10404     case PyUnicode_1BYTE_KIND:
10405         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10406             out = asciilib_rsplit(
10407                 self,  buf1, len1, buf2, len2, maxcount);
10408         else
10409             out = ucs1lib_rsplit(
10410                 self,  buf1, len1, buf2, len2, maxcount);
10411         break;
10412     case PyUnicode_2BYTE_KIND:
10413         out = ucs2lib_rsplit(
10414             self,  buf1, len1, buf2, len2, maxcount);
10415         break;
10416     case PyUnicode_4BYTE_KIND:
10417         out = ucs4lib_rsplit(
10418             self,  buf1, len1, buf2, len2, maxcount);
10419         break;
10420     default:
10421         out = NULL;
10422     }
10423     if (kind2 != kind1)
10424         PyMem_Free(buf2);
10425     return out;
10426 }
10427 
10428 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10429 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10430             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10431 {
10432     switch (kind) {
10433     case PyUnicode_1BYTE_KIND:
10434         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10435             return asciilib_find(buf1, len1, buf2, len2, offset);
10436         else
10437             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10438     case PyUnicode_2BYTE_KIND:
10439         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10440     case PyUnicode_4BYTE_KIND:
10441         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10442     }
10443     Py_UNREACHABLE();
10444 }
10445 
10446 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10447 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10448              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10449 {
10450     switch (kind) {
10451     case PyUnicode_1BYTE_KIND:
10452         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10453             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10454         else
10455             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10456     case PyUnicode_2BYTE_KIND:
10457         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10458     case PyUnicode_4BYTE_KIND:
10459         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10460     }
10461     Py_UNREACHABLE();
10462 }
10463 
10464 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10465 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10466                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10467 {
10468     int kind = PyUnicode_KIND(u);
10469     void *data = PyUnicode_DATA(u);
10470     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10471     if (kind == PyUnicode_1BYTE_KIND) {
10472         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10473                                       (Py_UCS1 *)data + len,
10474                                       u1, u2, maxcount);
10475     }
10476     else if (kind == PyUnicode_2BYTE_KIND) {
10477         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10478                                       (Py_UCS2 *)data + len,
10479                                       u1, u2, maxcount);
10480     }
10481     else {
10482         assert(kind == PyUnicode_4BYTE_KIND);
10483         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10484                                       (Py_UCS4 *)data + len,
10485                                       u1, u2, maxcount);
10486     }
10487 }
10488 
10489 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10490 replace(PyObject *self, PyObject *str1,
10491         PyObject *str2, Py_ssize_t maxcount)
10492 {
10493     PyObject *u;
10494     char *sbuf = PyUnicode_DATA(self);
10495     char *buf1 = PyUnicode_DATA(str1);
10496     char *buf2 = PyUnicode_DATA(str2);
10497     int srelease = 0, release1 = 0, release2 = 0;
10498     int skind = PyUnicode_KIND(self);
10499     int kind1 = PyUnicode_KIND(str1);
10500     int kind2 = PyUnicode_KIND(str2);
10501     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10502     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10503     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10504     int mayshrink;
10505     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10506 
10507     if (maxcount < 0)
10508         maxcount = PY_SSIZE_T_MAX;
10509     else if (maxcount == 0 || slen == 0)
10510         goto nothing;
10511 
10512     if (str1 == str2)
10513         goto nothing;
10514 
10515     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10516     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10517     if (maxchar < maxchar_str1)
10518         /* substring too wide to be present */
10519         goto nothing;
10520     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10521     /* Replacing str1 with str2 may cause a maxchar reduction in the
10522        result string. */
10523     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10524     maxchar = Py_MAX(maxchar, maxchar_str2);
10525 
10526     if (len1 == len2) {
10527         /* same length */
10528         if (len1 == 0)
10529             goto nothing;
10530         if (len1 == 1) {
10531             /* replace characters */
10532             Py_UCS4 u1, u2;
10533             Py_ssize_t pos;
10534 
10535             u1 = PyUnicode_READ(kind1, buf1, 0);
10536             pos = findchar(sbuf, skind, slen, u1, 1);
10537             if (pos < 0)
10538                 goto nothing;
10539             u2 = PyUnicode_READ(kind2, buf2, 0);
10540             u = PyUnicode_New(slen, maxchar);
10541             if (!u)
10542                 goto error;
10543 
10544             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10545             replace_1char_inplace(u, pos, u1, u2, maxcount);
10546         }
10547         else {
10548             int rkind = skind;
10549             char *res;
10550             Py_ssize_t i;
10551 
10552             if (kind1 < rkind) {
10553                 /* widen substring */
10554                 buf1 = _PyUnicode_AsKind(str1, rkind);
10555                 if (!buf1) goto error;
10556                 release1 = 1;
10557             }
10558             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10559             if (i < 0)
10560                 goto nothing;
10561             if (rkind > kind2) {
10562                 /* widen replacement */
10563                 buf2 = _PyUnicode_AsKind(str2, rkind);
10564                 if (!buf2) goto error;
10565                 release2 = 1;
10566             }
10567             else if (rkind < kind2) {
10568                 /* widen self and buf1 */
10569                 rkind = kind2;
10570                 if (release1) PyMem_Free(buf1);
10571                 release1 = 0;
10572                 sbuf = _PyUnicode_AsKind(self, rkind);
10573                 if (!sbuf) goto error;
10574                 srelease = 1;
10575                 buf1 = _PyUnicode_AsKind(str1, rkind);
10576                 if (!buf1) goto error;
10577                 release1 = 1;
10578             }
10579             u = PyUnicode_New(slen, maxchar);
10580             if (!u)
10581                 goto error;
10582             assert(PyUnicode_KIND(u) == rkind);
10583             res = PyUnicode_DATA(u);
10584 
10585             memcpy(res, sbuf, rkind * slen);
10586             /* change everything in-place, starting with this one */
10587             memcpy(res + rkind * i,
10588                    buf2,
10589                    rkind * len2);
10590             i += len1;
10591 
10592             while ( --maxcount > 0) {
10593                 i = anylib_find(rkind, self,
10594                                 sbuf+rkind*i, slen-i,
10595                                 str1, buf1, len1, i);
10596                 if (i == -1)
10597                     break;
10598                 memcpy(res + rkind * i,
10599                        buf2,
10600                        rkind * len2);
10601                 i += len1;
10602             }
10603         }
10604     }
10605     else {
10606         Py_ssize_t n, i, j, ires;
10607         Py_ssize_t new_size;
10608         int rkind = skind;
10609         char *res;
10610 
10611         if (kind1 < rkind) {
10612             /* widen substring */
10613             buf1 = _PyUnicode_AsKind(str1, rkind);
10614             if (!buf1) goto error;
10615             release1 = 1;
10616         }
10617         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10618         if (n == 0)
10619             goto nothing;
10620         if (kind2 < rkind) {
10621             /* widen replacement */
10622             buf2 = _PyUnicode_AsKind(str2, rkind);
10623             if (!buf2) goto error;
10624             release2 = 1;
10625         }
10626         else if (kind2 > rkind) {
10627             /* widen self and buf1 */
10628             rkind = kind2;
10629             sbuf = _PyUnicode_AsKind(self, rkind);
10630             if (!sbuf) goto error;
10631             srelease = 1;
10632             if (release1) PyMem_Free(buf1);
10633             release1 = 0;
10634             buf1 = _PyUnicode_AsKind(str1, rkind);
10635             if (!buf1) goto error;
10636             release1 = 1;
10637         }
10638         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10639            PyUnicode_GET_LENGTH(str1))); */
10640         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10641                 PyErr_SetString(PyExc_OverflowError,
10642                                 "replace string is too long");
10643                 goto error;
10644         }
10645         new_size = slen + n * (len2 - len1);
10646         if (new_size == 0) {
10647             _Py_INCREF_UNICODE_EMPTY();
10648             if (!unicode_empty)
10649                 goto error;
10650             u = unicode_empty;
10651             goto done;
10652         }
10653         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10654             PyErr_SetString(PyExc_OverflowError,
10655                             "replace string is too long");
10656             goto error;
10657         }
10658         u = PyUnicode_New(new_size, maxchar);
10659         if (!u)
10660             goto error;
10661         assert(PyUnicode_KIND(u) == rkind);
10662         res = PyUnicode_DATA(u);
10663         ires = i = 0;
10664         if (len1 > 0) {
10665             while (n-- > 0) {
10666                 /* look for next match */
10667                 j = anylib_find(rkind, self,
10668                                 sbuf + rkind * i, slen-i,
10669                                 str1, buf1, len1, i);
10670                 if (j == -1)
10671                     break;
10672                 else if (j > i) {
10673                     /* copy unchanged part [i:j] */
10674                     memcpy(res + rkind * ires,
10675                            sbuf + rkind * i,
10676                            rkind * (j-i));
10677                     ires += j - i;
10678                 }
10679                 /* copy substitution string */
10680                 if (len2 > 0) {
10681                     memcpy(res + rkind * ires,
10682                            buf2,
10683                            rkind * len2);
10684                     ires += len2;
10685                 }
10686                 i = j + len1;
10687             }
10688             if (i < slen)
10689                 /* copy tail [i:] */
10690                 memcpy(res + rkind * ires,
10691                        sbuf + rkind * i,
10692                        rkind * (slen-i));
10693         }
10694         else {
10695             /* interleave */
10696             while (n > 0) {
10697                 memcpy(res + rkind * ires,
10698                        buf2,
10699                        rkind * len2);
10700                 ires += len2;
10701                 if (--n <= 0)
10702                     break;
10703                 memcpy(res + rkind * ires,
10704                        sbuf + rkind * i,
10705                        rkind);
10706                 ires++;
10707                 i++;
10708             }
10709             memcpy(res + rkind * ires,
10710                    sbuf + rkind * i,
10711                    rkind * (slen-i));
10712         }
10713     }
10714 
10715     if (mayshrink) {
10716         unicode_adjust_maxchar(&u);
10717         if (u == NULL)
10718             goto error;
10719     }
10720 
10721   done:
10722     if (srelease)
10723         PyMem_FREE(sbuf);
10724     if (release1)
10725         PyMem_FREE(buf1);
10726     if (release2)
10727         PyMem_FREE(buf2);
10728     assert(_PyUnicode_CheckConsistency(u, 1));
10729     return u;
10730 
10731   nothing:
10732     /* nothing to replace; return original string (when possible) */
10733     if (srelease)
10734         PyMem_FREE(sbuf);
10735     if (release1)
10736         PyMem_FREE(buf1);
10737     if (release2)
10738         PyMem_FREE(buf2);
10739     return unicode_result_unchanged(self);
10740 
10741   error:
10742     if (srelease && sbuf)
10743         PyMem_FREE(sbuf);
10744     if (release1 && buf1)
10745         PyMem_FREE(buf1);
10746     if (release2 && buf2)
10747         PyMem_FREE(buf2);
10748     return NULL;
10749 }
10750 
10751 /* --- Unicode Object Methods --------------------------------------------- */
10752 
10753 /*[clinic input]
10754 str.title as unicode_title
10755 
10756 Return a version of the string where each word is titlecased.
10757 
10758 More specifically, words start with uppercased characters and all remaining
10759 cased characters have lower case.
10760 [clinic start generated code]*/
10761 
10762 static PyObject *
unicode_title_impl(PyObject * self)10763 unicode_title_impl(PyObject *self)
10764 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10765 {
10766     if (PyUnicode_READY(self) == -1)
10767         return NULL;
10768     return case_operation(self, do_title);
10769 }
10770 
10771 /*[clinic input]
10772 str.capitalize as unicode_capitalize
10773 
10774 Return a capitalized version of the string.
10775 
10776 More specifically, make the first character have upper case and the rest lower
10777 case.
10778 [clinic start generated code]*/
10779 
10780 static PyObject *
unicode_capitalize_impl(PyObject * self)10781 unicode_capitalize_impl(PyObject *self)
10782 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10783 {
10784     if (PyUnicode_READY(self) == -1)
10785         return NULL;
10786     if (PyUnicode_GET_LENGTH(self) == 0)
10787         return unicode_result_unchanged(self);
10788     return case_operation(self, do_capitalize);
10789 }
10790 
10791 /*[clinic input]
10792 str.casefold as unicode_casefold
10793 
10794 Return a version of the string suitable for caseless comparisons.
10795 [clinic start generated code]*/
10796 
10797 static PyObject *
unicode_casefold_impl(PyObject * self)10798 unicode_casefold_impl(PyObject *self)
10799 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10800 {
10801     if (PyUnicode_READY(self) == -1)
10802         return NULL;
10803     if (PyUnicode_IS_ASCII(self))
10804         return ascii_upper_or_lower(self, 1);
10805     return case_operation(self, do_casefold);
10806 }
10807 
10808 
10809 /* Argument converter. Accepts a single Unicode character. */
10810 
10811 static int
convert_uc(PyObject * obj,void * addr)10812 convert_uc(PyObject *obj, void *addr)
10813 {
10814     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10815 
10816     if (!PyUnicode_Check(obj)) {
10817         PyErr_Format(PyExc_TypeError,
10818                      "The fill character must be a unicode character, "
10819                      "not %.100s", Py_TYPE(obj)->tp_name);
10820         return 0;
10821     }
10822     if (PyUnicode_READY(obj) < 0)
10823         return 0;
10824     if (PyUnicode_GET_LENGTH(obj) != 1) {
10825         PyErr_SetString(PyExc_TypeError,
10826                         "The fill character must be exactly one character long");
10827         return 0;
10828     }
10829     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10830     return 1;
10831 }
10832 
10833 /*[clinic input]
10834 str.center as unicode_center
10835 
10836     width: Py_ssize_t
10837     fillchar: Py_UCS4 = ' '
10838     /
10839 
10840 Return a centered string of length width.
10841 
10842 Padding is done using the specified fill character (default is a space).
10843 [clinic start generated code]*/
10844 
10845 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10846 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10847 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10848 {
10849     Py_ssize_t marg, left;
10850 
10851     if (PyUnicode_READY(self) == -1)
10852         return NULL;
10853 
10854     if (PyUnicode_GET_LENGTH(self) >= width)
10855         return unicode_result_unchanged(self);
10856 
10857     marg = width - PyUnicode_GET_LENGTH(self);
10858     left = marg / 2 + (marg & width & 1);
10859 
10860     return pad(self, left, marg - left, fillchar);
10861 }
10862 
10863 /* This function assumes that str1 and str2 are readied by the caller. */
10864 
10865 static int
unicode_compare(PyObject * str1,PyObject * str2)10866 unicode_compare(PyObject *str1, PyObject *str2)
10867 {
10868 #define COMPARE(TYPE1, TYPE2) \
10869     do { \
10870         TYPE1* p1 = (TYPE1 *)data1; \
10871         TYPE2* p2 = (TYPE2 *)data2; \
10872         TYPE1* end = p1 + len; \
10873         Py_UCS4 c1, c2; \
10874         for (; p1 != end; p1++, p2++) { \
10875             c1 = *p1; \
10876             c2 = *p2; \
10877             if (c1 != c2) \
10878                 return (c1 < c2) ? -1 : 1; \
10879         } \
10880     } \
10881     while (0)
10882 
10883     int kind1, kind2;
10884     void *data1, *data2;
10885     Py_ssize_t len1, len2, len;
10886 
10887     kind1 = PyUnicode_KIND(str1);
10888     kind2 = PyUnicode_KIND(str2);
10889     data1 = PyUnicode_DATA(str1);
10890     data2 = PyUnicode_DATA(str2);
10891     len1 = PyUnicode_GET_LENGTH(str1);
10892     len2 = PyUnicode_GET_LENGTH(str2);
10893     len = Py_MIN(len1, len2);
10894 
10895     switch(kind1) {
10896     case PyUnicode_1BYTE_KIND:
10897     {
10898         switch(kind2) {
10899         case PyUnicode_1BYTE_KIND:
10900         {
10901             int cmp = memcmp(data1, data2, len);
10902             /* normalize result of memcmp() into the range [-1; 1] */
10903             if (cmp < 0)
10904                 return -1;
10905             if (cmp > 0)
10906                 return 1;
10907             break;
10908         }
10909         case PyUnicode_2BYTE_KIND:
10910             COMPARE(Py_UCS1, Py_UCS2);
10911             break;
10912         case PyUnicode_4BYTE_KIND:
10913             COMPARE(Py_UCS1, Py_UCS4);
10914             break;
10915         default:
10916             Py_UNREACHABLE();
10917         }
10918         break;
10919     }
10920     case PyUnicode_2BYTE_KIND:
10921     {
10922         switch(kind2) {
10923         case PyUnicode_1BYTE_KIND:
10924             COMPARE(Py_UCS2, Py_UCS1);
10925             break;
10926         case PyUnicode_2BYTE_KIND:
10927         {
10928             COMPARE(Py_UCS2, Py_UCS2);
10929             break;
10930         }
10931         case PyUnicode_4BYTE_KIND:
10932             COMPARE(Py_UCS2, Py_UCS4);
10933             break;
10934         default:
10935             Py_UNREACHABLE();
10936         }
10937         break;
10938     }
10939     case PyUnicode_4BYTE_KIND:
10940     {
10941         switch(kind2) {
10942         case PyUnicode_1BYTE_KIND:
10943             COMPARE(Py_UCS4, Py_UCS1);
10944             break;
10945         case PyUnicode_2BYTE_KIND:
10946             COMPARE(Py_UCS4, Py_UCS2);
10947             break;
10948         case PyUnicode_4BYTE_KIND:
10949         {
10950 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10951             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10952             /* normalize result of wmemcmp() into the range [-1; 1] */
10953             if (cmp < 0)
10954                 return -1;
10955             if (cmp > 0)
10956                 return 1;
10957 #else
10958             COMPARE(Py_UCS4, Py_UCS4);
10959 #endif
10960             break;
10961         }
10962         default:
10963             Py_UNREACHABLE();
10964         }
10965         break;
10966     }
10967     default:
10968         Py_UNREACHABLE();
10969     }
10970 
10971     if (len1 == len2)
10972         return 0;
10973     if (len1 < len2)
10974         return -1;
10975     else
10976         return 1;
10977 
10978 #undef COMPARE
10979 }
10980 
10981 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10982 unicode_compare_eq(PyObject *str1, PyObject *str2)
10983 {
10984     int kind;
10985     void *data1, *data2;
10986     Py_ssize_t len;
10987     int cmp;
10988 
10989     len = PyUnicode_GET_LENGTH(str1);
10990     if (PyUnicode_GET_LENGTH(str2) != len)
10991         return 0;
10992     kind = PyUnicode_KIND(str1);
10993     if (PyUnicode_KIND(str2) != kind)
10994         return 0;
10995     data1 = PyUnicode_DATA(str1);
10996     data2 = PyUnicode_DATA(str2);
10997 
10998     cmp = memcmp(data1, data2, len * kind);
10999     return (cmp == 0);
11000 }
11001 
11002 
11003 int
PyUnicode_Compare(PyObject * left,PyObject * right)11004 PyUnicode_Compare(PyObject *left, PyObject *right)
11005 {
11006     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11007         if (PyUnicode_READY(left) == -1 ||
11008             PyUnicode_READY(right) == -1)
11009             return -1;
11010 
11011         /* a string is equal to itself */
11012         if (left == right)
11013             return 0;
11014 
11015         return unicode_compare(left, right);
11016     }
11017     PyErr_Format(PyExc_TypeError,
11018                  "Can't compare %.100s and %.100s",
11019                  left->ob_type->tp_name,
11020                  right->ob_type->tp_name);
11021     return -1;
11022 }
11023 
11024 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11025 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11026 {
11027     Py_ssize_t i;
11028     int kind;
11029     Py_UCS4 chr;
11030     const unsigned char *ustr = (const unsigned char *)str;
11031 
11032     assert(_PyUnicode_CHECK(uni));
11033     if (!PyUnicode_IS_READY(uni)) {
11034         const wchar_t *ws = _PyUnicode_WSTR(uni);
11035         /* Compare Unicode string and source character set string */
11036         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11037             if (chr != ustr[i])
11038                 return (chr < ustr[i]) ? -1 : 1;
11039         }
11040         /* This check keeps Python strings that end in '\0' from comparing equal
11041          to C strings identical up to that point. */
11042         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11043             return 1; /* uni is longer */
11044         if (ustr[i])
11045             return -1; /* str is longer */
11046         return 0;
11047     }
11048     kind = PyUnicode_KIND(uni);
11049     if (kind == PyUnicode_1BYTE_KIND) {
11050         const void *data = PyUnicode_1BYTE_DATA(uni);
11051         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11052         size_t len, len2 = strlen(str);
11053         int cmp;
11054 
11055         len = Py_MIN(len1, len2);
11056         cmp = memcmp(data, str, len);
11057         if (cmp != 0) {
11058             if (cmp < 0)
11059                 return -1;
11060             else
11061                 return 1;
11062         }
11063         if (len1 > len2)
11064             return 1; /* uni is longer */
11065         if (len1 < len2)
11066             return -1; /* str is longer */
11067         return 0;
11068     }
11069     else {
11070         void *data = PyUnicode_DATA(uni);
11071         /* Compare Unicode string and source character set string */
11072         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11073             if (chr != (unsigned char)str[i])
11074                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11075         /* This check keeps Python strings that end in '\0' from comparing equal
11076          to C strings identical up to that point. */
11077         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11078             return 1; /* uni is longer */
11079         if (str[i])
11080             return -1; /* str is longer */
11081         return 0;
11082     }
11083 }
11084 
11085 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11086 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11087 {
11088     size_t i, len;
11089     const wchar_t *p;
11090     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11091     if (strlen(str) != len)
11092         return 0;
11093     p = _PyUnicode_WSTR(unicode);
11094     assert(p);
11095     for (i = 0; i < len; i++) {
11096         unsigned char c = (unsigned char)str[i];
11097         if (c >= 128 || p[i] != (wchar_t)c)
11098             return 0;
11099     }
11100     return 1;
11101 }
11102 
11103 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11104 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11105 {
11106     size_t len;
11107     assert(_PyUnicode_CHECK(unicode));
11108     assert(str);
11109 #ifndef NDEBUG
11110     for (const char *p = str; *p; p++) {
11111         assert((unsigned char)*p < 128);
11112     }
11113 #endif
11114     if (PyUnicode_READY(unicode) == -1) {
11115         /* Memory error or bad data */
11116         PyErr_Clear();
11117         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11118     }
11119     if (!PyUnicode_IS_ASCII(unicode))
11120         return 0;
11121     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11122     return strlen(str) == len &&
11123            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11124 }
11125 
11126 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11127 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11128 {
11129     PyObject *right_uni;
11130     Py_hash_t hash;
11131 
11132     assert(_PyUnicode_CHECK(left));
11133     assert(right->string);
11134 #ifndef NDEBUG
11135     for (const char *p = right->string; *p; p++) {
11136         assert((unsigned char)*p < 128);
11137     }
11138 #endif
11139 
11140     if (PyUnicode_READY(left) == -1) {
11141         /* memory error or bad data */
11142         PyErr_Clear();
11143         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11144     }
11145 
11146     if (!PyUnicode_IS_ASCII(left))
11147         return 0;
11148 
11149     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11150     if (right_uni == NULL) {
11151         /* memory error or bad data */
11152         PyErr_Clear();
11153         return _PyUnicode_EqualToASCIIString(left, right->string);
11154     }
11155 
11156     if (left == right_uni)
11157         return 1;
11158 
11159     if (PyUnicode_CHECK_INTERNED(left))
11160         return 0;
11161 
11162     assert(_PyUnicode_HASH(right_uni) != -1);
11163     hash = _PyUnicode_HASH(left);
11164     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11165         return 0;
11166 
11167     return unicode_compare_eq(left, right_uni);
11168 }
11169 
11170 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11171 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11172 {
11173     int result;
11174 
11175     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11176         Py_RETURN_NOTIMPLEMENTED;
11177 
11178     if (PyUnicode_READY(left) == -1 ||
11179         PyUnicode_READY(right) == -1)
11180         return NULL;
11181 
11182     if (left == right) {
11183         switch (op) {
11184         case Py_EQ:
11185         case Py_LE:
11186         case Py_GE:
11187             /* a string is equal to itself */
11188             Py_RETURN_TRUE;
11189         case Py_NE:
11190         case Py_LT:
11191         case Py_GT:
11192             Py_RETURN_FALSE;
11193         default:
11194             PyErr_BadArgument();
11195             return NULL;
11196         }
11197     }
11198     else if (op == Py_EQ || op == Py_NE) {
11199         result = unicode_compare_eq(left, right);
11200         result ^= (op == Py_NE);
11201         return PyBool_FromLong(result);
11202     }
11203     else {
11204         result = unicode_compare(left, right);
11205         Py_RETURN_RICHCOMPARE(result, 0, op);
11206     }
11207 }
11208 
11209 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11210 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211 {
11212     return unicode_eq(aa, bb);
11213 }
11214 
11215 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11216 PyUnicode_Contains(PyObject *str, PyObject *substr)
11217 {
11218     int kind1, kind2;
11219     void *buf1, *buf2;
11220     Py_ssize_t len1, len2;
11221     int result;
11222 
11223     if (!PyUnicode_Check(substr)) {
11224         PyErr_Format(PyExc_TypeError,
11225                      "'in <string>' requires string as left operand, not %.100s",
11226                      Py_TYPE(substr)->tp_name);
11227         return -1;
11228     }
11229     if (PyUnicode_READY(substr) == -1)
11230         return -1;
11231     if (ensure_unicode(str) < 0)
11232         return -1;
11233 
11234     kind1 = PyUnicode_KIND(str);
11235     kind2 = PyUnicode_KIND(substr);
11236     if (kind1 < kind2)
11237         return 0;
11238     len1 = PyUnicode_GET_LENGTH(str);
11239     len2 = PyUnicode_GET_LENGTH(substr);
11240     if (len1 < len2)
11241         return 0;
11242     buf1 = PyUnicode_DATA(str);
11243     buf2 = PyUnicode_DATA(substr);
11244     if (len2 == 1) {
11245         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11247         return result;
11248     }
11249     if (kind2 != kind1) {
11250         buf2 = _PyUnicode_AsKind(substr, kind1);
11251         if (!buf2)
11252             return -1;
11253     }
11254 
11255     switch (kind1) {
11256     case PyUnicode_1BYTE_KIND:
11257         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258         break;
11259     case PyUnicode_2BYTE_KIND:
11260         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261         break;
11262     case PyUnicode_4BYTE_KIND:
11263         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264         break;
11265     default:
11266         Py_UNREACHABLE();
11267     }
11268 
11269     if (kind2 != kind1)
11270         PyMem_Free(buf2);
11271 
11272     return result;
11273 }
11274 
11275 /* Concat to string or Unicode object giving a new Unicode object. */
11276 
11277 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11278 PyUnicode_Concat(PyObject *left, PyObject *right)
11279 {
11280     PyObject *result;
11281     Py_UCS4 maxchar, maxchar2;
11282     Py_ssize_t left_len, right_len, new_len;
11283 
11284     if (ensure_unicode(left) < 0)
11285         return NULL;
11286 
11287     if (!PyUnicode_Check(right)) {
11288         PyErr_Format(PyExc_TypeError,
11289                      "can only concatenate str (not \"%.200s\") to str",
11290                      right->ob_type->tp_name);
11291         return NULL;
11292     }
11293     if (PyUnicode_READY(right) < 0)
11294         return NULL;
11295 
11296     /* Shortcuts */
11297     if (left == unicode_empty)
11298         return PyUnicode_FromObject(right);
11299     if (right == unicode_empty)
11300         return PyUnicode_FromObject(left);
11301 
11302     left_len = PyUnicode_GET_LENGTH(left);
11303     right_len = PyUnicode_GET_LENGTH(right);
11304     if (left_len > PY_SSIZE_T_MAX - right_len) {
11305         PyErr_SetString(PyExc_OverflowError,
11306                         "strings are too large to concat");
11307         return NULL;
11308     }
11309     new_len = left_len + right_len;
11310 
11311     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11312     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11313     maxchar = Py_MAX(maxchar, maxchar2);
11314 
11315     /* Concat the two Unicode strings */
11316     result = PyUnicode_New(new_len, maxchar);
11317     if (result == NULL)
11318         return NULL;
11319     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11320     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11321     assert(_PyUnicode_CheckConsistency(result, 1));
11322     return result;
11323 }
11324 
11325 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11326 PyUnicode_Append(PyObject **p_left, PyObject *right)
11327 {
11328     PyObject *left, *res;
11329     Py_UCS4 maxchar, maxchar2;
11330     Py_ssize_t left_len, right_len, new_len;
11331 
11332     if (p_left == NULL) {
11333         if (!PyErr_Occurred())
11334             PyErr_BadInternalCall();
11335         return;
11336     }
11337     left = *p_left;
11338     if (right == NULL || left == NULL
11339         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11340         if (!PyErr_Occurred())
11341             PyErr_BadInternalCall();
11342         goto error;
11343     }
11344 
11345     if (PyUnicode_READY(left) == -1)
11346         goto error;
11347     if (PyUnicode_READY(right) == -1)
11348         goto error;
11349 
11350     /* Shortcuts */
11351     if (left == unicode_empty) {
11352         Py_DECREF(left);
11353         Py_INCREF(right);
11354         *p_left = right;
11355         return;
11356     }
11357     if (right == unicode_empty)
11358         return;
11359 
11360     left_len = PyUnicode_GET_LENGTH(left);
11361     right_len = PyUnicode_GET_LENGTH(right);
11362     if (left_len > PY_SSIZE_T_MAX - right_len) {
11363         PyErr_SetString(PyExc_OverflowError,
11364                         "strings are too large to concat");
11365         goto error;
11366     }
11367     new_len = left_len + right_len;
11368 
11369     if (unicode_modifiable(left)
11370         && PyUnicode_CheckExact(right)
11371         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11372         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11373            to change the structure size, but characters are stored just after
11374            the structure, and so it requires to move all characters which is
11375            not so different than duplicating the string. */
11376         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11377     {
11378         /* append inplace */
11379         if (unicode_resize(p_left, new_len) != 0)
11380             goto error;
11381 
11382         /* copy 'right' into the newly allocated area of 'left' */
11383         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11384     }
11385     else {
11386         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11387         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11388         maxchar = Py_MAX(maxchar, maxchar2);
11389 
11390         /* Concat the two Unicode strings */
11391         res = PyUnicode_New(new_len, maxchar);
11392         if (res == NULL)
11393             goto error;
11394         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11395         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11396         Py_DECREF(left);
11397         *p_left = res;
11398     }
11399     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11400     return;
11401 
11402 error:
11403     Py_CLEAR(*p_left);
11404 }
11405 
11406 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11407 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11408 {
11409     PyUnicode_Append(pleft, right);
11410     Py_XDECREF(right);
11411 }
11412 
11413 /*
11414 Wraps stringlib_parse_args_finds() and additionally ensures that the
11415 first argument is a unicode object.
11416 */
11417 
11418 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11419 parse_args_finds_unicode(const char * function_name, PyObject *args,
11420                          PyObject **substring,
11421                          Py_ssize_t *start, Py_ssize_t *end)
11422 {
11423     if(stringlib_parse_args_finds(function_name, args, substring,
11424                                   start, end)) {
11425         if (ensure_unicode(*substring) < 0)
11426             return 0;
11427         return 1;
11428     }
11429     return 0;
11430 }
11431 
11432 PyDoc_STRVAR(count__doc__,
11433              "S.count(sub[, start[, end]]) -> int\n\
11434 \n\
11435 Return the number of non-overlapping occurrences of substring sub in\n\
11436 string S[start:end].  Optional arguments start and end are\n\
11437 interpreted as in slice notation.");
11438 
11439 static PyObject *
unicode_count(PyObject * self,PyObject * args)11440 unicode_count(PyObject *self, PyObject *args)
11441 {
11442     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11443     Py_ssize_t start = 0;
11444     Py_ssize_t end = PY_SSIZE_T_MAX;
11445     PyObject *result;
11446     int kind1, kind2;
11447     void *buf1, *buf2;
11448     Py_ssize_t len1, len2, iresult;
11449 
11450     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11451         return NULL;
11452 
11453     kind1 = PyUnicode_KIND(self);
11454     kind2 = PyUnicode_KIND(substring);
11455     if (kind1 < kind2)
11456         return PyLong_FromLong(0);
11457 
11458     len1 = PyUnicode_GET_LENGTH(self);
11459     len2 = PyUnicode_GET_LENGTH(substring);
11460     ADJUST_INDICES(start, end, len1);
11461     if (end - start < len2)
11462         return PyLong_FromLong(0);
11463 
11464     buf1 = PyUnicode_DATA(self);
11465     buf2 = PyUnicode_DATA(substring);
11466     if (kind2 != kind1) {
11467         buf2 = _PyUnicode_AsKind(substring, kind1);
11468         if (!buf2)
11469             return NULL;
11470     }
11471     switch (kind1) {
11472     case PyUnicode_1BYTE_KIND:
11473         iresult = ucs1lib_count(
11474             ((Py_UCS1*)buf1) + start, end - start,
11475             buf2, len2, PY_SSIZE_T_MAX
11476             );
11477         break;
11478     case PyUnicode_2BYTE_KIND:
11479         iresult = ucs2lib_count(
11480             ((Py_UCS2*)buf1) + start, end - start,
11481             buf2, len2, PY_SSIZE_T_MAX
11482             );
11483         break;
11484     case PyUnicode_4BYTE_KIND:
11485         iresult = ucs4lib_count(
11486             ((Py_UCS4*)buf1) + start, end - start,
11487             buf2, len2, PY_SSIZE_T_MAX
11488             );
11489         break;
11490     default:
11491         Py_UNREACHABLE();
11492     }
11493 
11494     result = PyLong_FromSsize_t(iresult);
11495 
11496     if (kind2 != kind1)
11497         PyMem_Free(buf2);
11498 
11499     return result;
11500 }
11501 
11502 /*[clinic input]
11503 str.encode as unicode_encode
11504 
11505     encoding: str(c_default="NULL") = 'utf-8'
11506         The encoding in which to encode the string.
11507     errors: str(c_default="NULL") = 'strict'
11508         The error handling scheme to use for encoding errors.
11509         The default is 'strict' meaning that encoding errors raise a
11510         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11511         'xmlcharrefreplace' as well as any other name registered with
11512         codecs.register_error that can handle UnicodeEncodeErrors.
11513 
11514 Encode the string using the codec registered for encoding.
11515 [clinic start generated code]*/
11516 
11517 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11518 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11519 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11520 {
11521     return PyUnicode_AsEncodedString(self, encoding, errors);
11522 }
11523 
11524 /*[clinic input]
11525 str.expandtabs as unicode_expandtabs
11526 
11527     tabsize: int = 8
11528 
11529 Return a copy where all tab characters are expanded using spaces.
11530 
11531 If tabsize is not given, a tab size of 8 characters is assumed.
11532 [clinic start generated code]*/
11533 
11534 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11535 unicode_expandtabs_impl(PyObject *self, int tabsize)
11536 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11537 {
11538     Py_ssize_t i, j, line_pos, src_len, incr;
11539     Py_UCS4 ch;
11540     PyObject *u;
11541     void *src_data, *dest_data;
11542     int kind;
11543     int found;
11544 
11545     if (PyUnicode_READY(self) == -1)
11546         return NULL;
11547 
11548     /* First pass: determine size of output string */
11549     src_len = PyUnicode_GET_LENGTH(self);
11550     i = j = line_pos = 0;
11551     kind = PyUnicode_KIND(self);
11552     src_data = PyUnicode_DATA(self);
11553     found = 0;
11554     for (; i < src_len; i++) {
11555         ch = PyUnicode_READ(kind, src_data, i);
11556         if (ch == '\t') {
11557             found = 1;
11558             if (tabsize > 0) {
11559                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11560                 if (j > PY_SSIZE_T_MAX - incr)
11561                     goto overflow;
11562                 line_pos += incr;
11563                 j += incr;
11564             }
11565         }
11566         else {
11567             if (j > PY_SSIZE_T_MAX - 1)
11568                 goto overflow;
11569             line_pos++;
11570             j++;
11571             if (ch == '\n' || ch == '\r')
11572                 line_pos = 0;
11573         }
11574     }
11575     if (!found)
11576         return unicode_result_unchanged(self);
11577 
11578     /* Second pass: create output string and fill it */
11579     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11580     if (!u)
11581         return NULL;
11582     dest_data = PyUnicode_DATA(u);
11583 
11584     i = j = line_pos = 0;
11585 
11586     for (; i < src_len; i++) {
11587         ch = PyUnicode_READ(kind, src_data, i);
11588         if (ch == '\t') {
11589             if (tabsize > 0) {
11590                 incr = tabsize - (line_pos % tabsize);
11591                 line_pos += incr;
11592                 unicode_fill(kind, dest_data, ' ', j, incr);
11593                 j += incr;
11594             }
11595         }
11596         else {
11597             line_pos++;
11598             PyUnicode_WRITE(kind, dest_data, j, ch);
11599             j++;
11600             if (ch == '\n' || ch == '\r')
11601                 line_pos = 0;
11602         }
11603     }
11604     assert (j == PyUnicode_GET_LENGTH(u));
11605     return unicode_result(u);
11606 
11607   overflow:
11608     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11609     return NULL;
11610 }
11611 
11612 PyDoc_STRVAR(find__doc__,
11613              "S.find(sub[, start[, end]]) -> int\n\
11614 \n\
11615 Return the lowest index in S where substring sub is found,\n\
11616 such that sub is contained within S[start:end].  Optional\n\
11617 arguments start and end are interpreted as in slice notation.\n\
11618 \n\
11619 Return -1 on failure.");
11620 
11621 static PyObject *
unicode_find(PyObject * self,PyObject * args)11622 unicode_find(PyObject *self, PyObject *args)
11623 {
11624     /* initialize variables to prevent gcc warning */
11625     PyObject *substring = NULL;
11626     Py_ssize_t start = 0;
11627     Py_ssize_t end = 0;
11628     Py_ssize_t result;
11629 
11630     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11631         return NULL;
11632 
11633     if (PyUnicode_READY(self) == -1)
11634         return NULL;
11635 
11636     result = any_find_slice(self, substring, start, end, 1);
11637 
11638     if (result == -2)
11639         return NULL;
11640 
11641     return PyLong_FromSsize_t(result);
11642 }
11643 
11644 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11645 unicode_getitem(PyObject *self, Py_ssize_t index)
11646 {
11647     void *data;
11648     enum PyUnicode_Kind kind;
11649     Py_UCS4 ch;
11650 
11651     if (!PyUnicode_Check(self)) {
11652         PyErr_BadArgument();
11653         return NULL;
11654     }
11655     if (PyUnicode_READY(self) == -1) {
11656         return NULL;
11657     }
11658     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11659         PyErr_SetString(PyExc_IndexError, "string index out of range");
11660         return NULL;
11661     }
11662     kind = PyUnicode_KIND(self);
11663     data = PyUnicode_DATA(self);
11664     ch = PyUnicode_READ(kind, data, index);
11665     return unicode_char(ch);
11666 }
11667 
11668 /* Believe it or not, this produces the same value for ASCII strings
11669    as bytes_hash(). */
11670 static Py_hash_t
unicode_hash(PyObject * self)11671 unicode_hash(PyObject *self)
11672 {
11673     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11674 
11675 #ifdef Py_DEBUG
11676     assert(_Py_HashSecret_Initialized);
11677 #endif
11678     if (_PyUnicode_HASH(self) != -1)
11679         return _PyUnicode_HASH(self);
11680     if (PyUnicode_READY(self) == -1)
11681         return -1;
11682 
11683     x = _Py_HashBytes(PyUnicode_DATA(self),
11684                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11685     _PyUnicode_HASH(self) = x;
11686     return x;
11687 }
11688 
11689 PyDoc_STRVAR(index__doc__,
11690              "S.index(sub[, start[, end]]) -> int\n\
11691 \n\
11692 Return the lowest index in S where substring sub is found,\n\
11693 such that sub is contained within S[start:end].  Optional\n\
11694 arguments start and end are interpreted as in slice notation.\n\
11695 \n\
11696 Raises ValueError when the substring is not found.");
11697 
11698 static PyObject *
unicode_index(PyObject * self,PyObject * args)11699 unicode_index(PyObject *self, PyObject *args)
11700 {
11701     /* initialize variables to prevent gcc warning */
11702     Py_ssize_t result;
11703     PyObject *substring = NULL;
11704     Py_ssize_t start = 0;
11705     Py_ssize_t end = 0;
11706 
11707     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11708         return NULL;
11709 
11710     if (PyUnicode_READY(self) == -1)
11711         return NULL;
11712 
11713     result = any_find_slice(self, substring, start, end, 1);
11714 
11715     if (result == -2)
11716         return NULL;
11717 
11718     if (result < 0) {
11719         PyErr_SetString(PyExc_ValueError, "substring not found");
11720         return NULL;
11721     }
11722 
11723     return PyLong_FromSsize_t(result);
11724 }
11725 
11726 /*[clinic input]
11727 str.isascii as unicode_isascii
11728 
11729 Return True if all characters in the string are ASCII, False otherwise.
11730 
11731 ASCII characters have code points in the range U+0000-U+007F.
11732 Empty string is ASCII too.
11733 [clinic start generated code]*/
11734 
11735 static PyObject *
unicode_isascii_impl(PyObject * self)11736 unicode_isascii_impl(PyObject *self)
11737 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11738 {
11739     if (PyUnicode_READY(self) == -1) {
11740         return NULL;
11741     }
11742     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11743 }
11744 
11745 /*[clinic input]
11746 str.islower as unicode_islower
11747 
11748 Return True if the string is a lowercase string, False otherwise.
11749 
11750 A string is lowercase if all cased characters in the string are lowercase and
11751 there is at least one cased character in the string.
11752 [clinic start generated code]*/
11753 
11754 static PyObject *
unicode_islower_impl(PyObject * self)11755 unicode_islower_impl(PyObject *self)
11756 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11757 {
11758     Py_ssize_t i, length;
11759     int kind;
11760     void *data;
11761     int cased;
11762 
11763     if (PyUnicode_READY(self) == -1)
11764         return NULL;
11765     length = PyUnicode_GET_LENGTH(self);
11766     kind = PyUnicode_KIND(self);
11767     data = PyUnicode_DATA(self);
11768 
11769     /* Shortcut for single character strings */
11770     if (length == 1)
11771         return PyBool_FromLong(
11772             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11773 
11774     /* Special case for empty strings */
11775     if (length == 0)
11776         Py_RETURN_FALSE;
11777 
11778     cased = 0;
11779     for (i = 0; i < length; i++) {
11780         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11781 
11782         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11783             Py_RETURN_FALSE;
11784         else if (!cased && Py_UNICODE_ISLOWER(ch))
11785             cased = 1;
11786     }
11787     return PyBool_FromLong(cased);
11788 }
11789 
11790 /*[clinic input]
11791 str.isupper as unicode_isupper
11792 
11793 Return True if the string is an uppercase string, False otherwise.
11794 
11795 A string is uppercase if all cased characters in the string are uppercase and
11796 there is at least one cased character in the string.
11797 [clinic start generated code]*/
11798 
11799 static PyObject *
unicode_isupper_impl(PyObject * self)11800 unicode_isupper_impl(PyObject *self)
11801 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11802 {
11803     Py_ssize_t i, length;
11804     int kind;
11805     void *data;
11806     int cased;
11807 
11808     if (PyUnicode_READY(self) == -1)
11809         return NULL;
11810     length = PyUnicode_GET_LENGTH(self);
11811     kind = PyUnicode_KIND(self);
11812     data = PyUnicode_DATA(self);
11813 
11814     /* Shortcut for single character strings */
11815     if (length == 1)
11816         return PyBool_FromLong(
11817             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11818 
11819     /* Special case for empty strings */
11820     if (length == 0)
11821         Py_RETURN_FALSE;
11822 
11823     cased = 0;
11824     for (i = 0; i < length; i++) {
11825         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11826 
11827         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11828             Py_RETURN_FALSE;
11829         else if (!cased && Py_UNICODE_ISUPPER(ch))
11830             cased = 1;
11831     }
11832     return PyBool_FromLong(cased);
11833 }
11834 
11835 /*[clinic input]
11836 str.istitle as unicode_istitle
11837 
11838 Return True if the string is a title-cased string, False otherwise.
11839 
11840 In a title-cased string, upper- and title-case characters may only
11841 follow uncased characters and lowercase characters only cased ones.
11842 [clinic start generated code]*/
11843 
11844 static PyObject *
unicode_istitle_impl(PyObject * self)11845 unicode_istitle_impl(PyObject *self)
11846 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11847 {
11848     Py_ssize_t i, length;
11849     int kind;
11850     void *data;
11851     int cased, previous_is_cased;
11852 
11853     if (PyUnicode_READY(self) == -1)
11854         return NULL;
11855     length = PyUnicode_GET_LENGTH(self);
11856     kind = PyUnicode_KIND(self);
11857     data = PyUnicode_DATA(self);
11858 
11859     /* Shortcut for single character strings */
11860     if (length == 1) {
11861         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11862         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11863                                (Py_UNICODE_ISUPPER(ch) != 0));
11864     }
11865 
11866     /* Special case for empty strings */
11867     if (length == 0)
11868         Py_RETURN_FALSE;
11869 
11870     cased = 0;
11871     previous_is_cased = 0;
11872     for (i = 0; i < length; i++) {
11873         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11874 
11875         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11876             if (previous_is_cased)
11877                 Py_RETURN_FALSE;
11878             previous_is_cased = 1;
11879             cased = 1;
11880         }
11881         else if (Py_UNICODE_ISLOWER(ch)) {
11882             if (!previous_is_cased)
11883                 Py_RETURN_FALSE;
11884             previous_is_cased = 1;
11885             cased = 1;
11886         }
11887         else
11888             previous_is_cased = 0;
11889     }
11890     return PyBool_FromLong(cased);
11891 }
11892 
11893 /*[clinic input]
11894 str.isspace as unicode_isspace
11895 
11896 Return True if the string is a whitespace string, False otherwise.
11897 
11898 A string is whitespace if all characters in the string are whitespace and there
11899 is at least one character in the string.
11900 [clinic start generated code]*/
11901 
11902 static PyObject *
unicode_isspace_impl(PyObject * self)11903 unicode_isspace_impl(PyObject *self)
11904 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11905 {
11906     Py_ssize_t i, length;
11907     int kind;
11908     void *data;
11909 
11910     if (PyUnicode_READY(self) == -1)
11911         return NULL;
11912     length = PyUnicode_GET_LENGTH(self);
11913     kind = PyUnicode_KIND(self);
11914     data = PyUnicode_DATA(self);
11915 
11916     /* Shortcut for single character strings */
11917     if (length == 1)
11918         return PyBool_FromLong(
11919             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11920 
11921     /* Special case for empty strings */
11922     if (length == 0)
11923         Py_RETURN_FALSE;
11924 
11925     for (i = 0; i < length; i++) {
11926         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11927         if (!Py_UNICODE_ISSPACE(ch))
11928             Py_RETURN_FALSE;
11929     }
11930     Py_RETURN_TRUE;
11931 }
11932 
11933 /*[clinic input]
11934 str.isalpha as unicode_isalpha
11935 
11936 Return True if the string is an alphabetic string, False otherwise.
11937 
11938 A string is alphabetic if all characters in the string are alphabetic and there
11939 is at least one character in the string.
11940 [clinic start generated code]*/
11941 
11942 static PyObject *
unicode_isalpha_impl(PyObject * self)11943 unicode_isalpha_impl(PyObject *self)
11944 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11945 {
11946     Py_ssize_t i, length;
11947     int kind;
11948     void *data;
11949 
11950     if (PyUnicode_READY(self) == -1)
11951         return NULL;
11952     length = PyUnicode_GET_LENGTH(self);
11953     kind = PyUnicode_KIND(self);
11954     data = PyUnicode_DATA(self);
11955 
11956     /* Shortcut for single character strings */
11957     if (length == 1)
11958         return PyBool_FromLong(
11959             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11960 
11961     /* Special case for empty strings */
11962     if (length == 0)
11963         Py_RETURN_FALSE;
11964 
11965     for (i = 0; i < length; i++) {
11966         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11967             Py_RETURN_FALSE;
11968     }
11969     Py_RETURN_TRUE;
11970 }
11971 
11972 /*[clinic input]
11973 str.isalnum as unicode_isalnum
11974 
11975 Return True if the string is an alpha-numeric string, False otherwise.
11976 
11977 A string is alpha-numeric if all characters in the string are alpha-numeric and
11978 there is at least one character in the string.
11979 [clinic start generated code]*/
11980 
11981 static PyObject *
unicode_isalnum_impl(PyObject * self)11982 unicode_isalnum_impl(PyObject *self)
11983 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11984 {
11985     int kind;
11986     void *data;
11987     Py_ssize_t len, i;
11988 
11989     if (PyUnicode_READY(self) == -1)
11990         return NULL;
11991 
11992     kind = PyUnicode_KIND(self);
11993     data = PyUnicode_DATA(self);
11994     len = PyUnicode_GET_LENGTH(self);
11995 
11996     /* Shortcut for single character strings */
11997     if (len == 1) {
11998         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11999         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12000     }
12001 
12002     /* Special case for empty strings */
12003     if (len == 0)
12004         Py_RETURN_FALSE;
12005 
12006     for (i = 0; i < len; i++) {
12007         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12008         if (!Py_UNICODE_ISALNUM(ch))
12009             Py_RETURN_FALSE;
12010     }
12011     Py_RETURN_TRUE;
12012 }
12013 
12014 /*[clinic input]
12015 str.isdecimal as unicode_isdecimal
12016 
12017 Return True if the string is a decimal string, False otherwise.
12018 
12019 A string is a decimal string if all characters in the string are decimal and
12020 there is at least one character in the string.
12021 [clinic start generated code]*/
12022 
12023 static PyObject *
unicode_isdecimal_impl(PyObject * self)12024 unicode_isdecimal_impl(PyObject *self)
12025 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12026 {
12027     Py_ssize_t i, length;
12028     int kind;
12029     void *data;
12030 
12031     if (PyUnicode_READY(self) == -1)
12032         return NULL;
12033     length = PyUnicode_GET_LENGTH(self);
12034     kind = PyUnicode_KIND(self);
12035     data = PyUnicode_DATA(self);
12036 
12037     /* Shortcut for single character strings */
12038     if (length == 1)
12039         return PyBool_FromLong(
12040             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12041 
12042     /* Special case for empty strings */
12043     if (length == 0)
12044         Py_RETURN_FALSE;
12045 
12046     for (i = 0; i < length; i++) {
12047         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12048             Py_RETURN_FALSE;
12049     }
12050     Py_RETURN_TRUE;
12051 }
12052 
12053 /*[clinic input]
12054 str.isdigit as unicode_isdigit
12055 
12056 Return True if the string is a digit string, False otherwise.
12057 
12058 A string is a digit string if all characters in the string are digits and there
12059 is at least one character in the string.
12060 [clinic start generated code]*/
12061 
12062 static PyObject *
unicode_isdigit_impl(PyObject * self)12063 unicode_isdigit_impl(PyObject *self)
12064 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12065 {
12066     Py_ssize_t i, length;
12067     int kind;
12068     void *data;
12069 
12070     if (PyUnicode_READY(self) == -1)
12071         return NULL;
12072     length = PyUnicode_GET_LENGTH(self);
12073     kind = PyUnicode_KIND(self);
12074     data = PyUnicode_DATA(self);
12075 
12076     /* Shortcut for single character strings */
12077     if (length == 1) {
12078         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12079         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12080     }
12081 
12082     /* Special case for empty strings */
12083     if (length == 0)
12084         Py_RETURN_FALSE;
12085 
12086     for (i = 0; i < length; i++) {
12087         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12088             Py_RETURN_FALSE;
12089     }
12090     Py_RETURN_TRUE;
12091 }
12092 
12093 /*[clinic input]
12094 str.isnumeric as unicode_isnumeric
12095 
12096 Return True if the string is a numeric string, False otherwise.
12097 
12098 A string is numeric if all characters in the string are numeric and there is at
12099 least one character in the string.
12100 [clinic start generated code]*/
12101 
12102 static PyObject *
unicode_isnumeric_impl(PyObject * self)12103 unicode_isnumeric_impl(PyObject *self)
12104 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12105 {
12106     Py_ssize_t i, length;
12107     int kind;
12108     void *data;
12109 
12110     if (PyUnicode_READY(self) == -1)
12111         return NULL;
12112     length = PyUnicode_GET_LENGTH(self);
12113     kind = PyUnicode_KIND(self);
12114     data = PyUnicode_DATA(self);
12115 
12116     /* Shortcut for single character strings */
12117     if (length == 1)
12118         return PyBool_FromLong(
12119             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12120 
12121     /* Special case for empty strings */
12122     if (length == 0)
12123         Py_RETURN_FALSE;
12124 
12125     for (i = 0; i < length; i++) {
12126         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12127             Py_RETURN_FALSE;
12128     }
12129     Py_RETURN_TRUE;
12130 }
12131 
12132 int
PyUnicode_IsIdentifier(PyObject * self)12133 PyUnicode_IsIdentifier(PyObject *self)
12134 {
12135     int kind;
12136     void *data;
12137     Py_ssize_t i;
12138     Py_UCS4 first;
12139 
12140     if (PyUnicode_READY(self) == -1) {
12141         Py_FatalError("identifier not ready");
12142         return 0;
12143     }
12144 
12145     /* Special case for empty strings */
12146     if (PyUnicode_GET_LENGTH(self) == 0)
12147         return 0;
12148     kind = PyUnicode_KIND(self);
12149     data = PyUnicode_DATA(self);
12150 
12151     /* PEP 3131 says that the first character must be in
12152        XID_Start and subsequent characters in XID_Continue,
12153        and for the ASCII range, the 2.x rules apply (i.e
12154        start with letters and underscore, continue with
12155        letters, digits, underscore). However, given the current
12156        definition of XID_Start and XID_Continue, it is sufficient
12157        to check just for these, except that _ must be allowed
12158        as starting an identifier.  */
12159     first = PyUnicode_READ(kind, data, 0);
12160     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12161         return 0;
12162 
12163     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12164         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12165             return 0;
12166     return 1;
12167 }
12168 
12169 /*[clinic input]
12170 str.isidentifier as unicode_isidentifier
12171 
12172 Return True if the string is a valid Python identifier, False otherwise.
12173 
12174 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12175 such as "def" or "class".
12176 [clinic start generated code]*/
12177 
12178 static PyObject *
unicode_isidentifier_impl(PyObject * self)12179 unicode_isidentifier_impl(PyObject *self)
12180 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12181 {
12182     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12183 }
12184 
12185 /*[clinic input]
12186 str.isprintable as unicode_isprintable
12187 
12188 Return True if the string is printable, False otherwise.
12189 
12190 A string is printable if all of its characters are considered printable in
12191 repr() or if it is empty.
12192 [clinic start generated code]*/
12193 
12194 static PyObject *
unicode_isprintable_impl(PyObject * self)12195 unicode_isprintable_impl(PyObject *self)
12196 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12197 {
12198     Py_ssize_t i, length;
12199     int kind;
12200     void *data;
12201 
12202     if (PyUnicode_READY(self) == -1)
12203         return NULL;
12204     length = PyUnicode_GET_LENGTH(self);
12205     kind = PyUnicode_KIND(self);
12206     data = PyUnicode_DATA(self);
12207 
12208     /* Shortcut for single character strings */
12209     if (length == 1)
12210         return PyBool_FromLong(
12211             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12212 
12213     for (i = 0; i < length; i++) {
12214         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12215             Py_RETURN_FALSE;
12216         }
12217     }
12218     Py_RETURN_TRUE;
12219 }
12220 
12221 /*[clinic input]
12222 str.join as unicode_join
12223 
12224     iterable: object
12225     /
12226 
12227 Concatenate any number of strings.
12228 
12229 The string whose method is called is inserted in between each given string.
12230 The result is returned as a new string.
12231 
12232 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12233 [clinic start generated code]*/
12234 
12235 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12236 unicode_join(PyObject *self, PyObject *iterable)
12237 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12238 {
12239     return PyUnicode_Join(self, iterable);
12240 }
12241 
12242 static Py_ssize_t
unicode_length(PyObject * self)12243 unicode_length(PyObject *self)
12244 {
12245     if (PyUnicode_READY(self) == -1)
12246         return -1;
12247     return PyUnicode_GET_LENGTH(self);
12248 }
12249 
12250 /*[clinic input]
12251 str.ljust as unicode_ljust
12252 
12253     width: Py_ssize_t
12254     fillchar: Py_UCS4 = ' '
12255     /
12256 
12257 Return a left-justified string of length width.
12258 
12259 Padding is done using the specified fill character (default is a space).
12260 [clinic start generated code]*/
12261 
12262 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12263 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12264 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12265 {
12266     if (PyUnicode_READY(self) == -1)
12267         return NULL;
12268 
12269     if (PyUnicode_GET_LENGTH(self) >= width)
12270         return unicode_result_unchanged(self);
12271 
12272     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12273 }
12274 
12275 /*[clinic input]
12276 str.lower as unicode_lower
12277 
12278 Return a copy of the string converted to lowercase.
12279 [clinic start generated code]*/
12280 
12281 static PyObject *
unicode_lower_impl(PyObject * self)12282 unicode_lower_impl(PyObject *self)
12283 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12284 {
12285     if (PyUnicode_READY(self) == -1)
12286         return NULL;
12287     if (PyUnicode_IS_ASCII(self))
12288         return ascii_upper_or_lower(self, 1);
12289     return case_operation(self, do_lower);
12290 }
12291 
/* Selectors for the strip family; used as the 'striptype' argument. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  The table is never
   modified, so both the strings and the pointers are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for selector i (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).
   i is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
12300 
12301 /* externally visible for str.strip(unicode) */
12302 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12303 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12304 {
12305     void *data;
12306     int kind;
12307     Py_ssize_t i, j, len;
12308     BLOOM_MASK sepmask;
12309     Py_ssize_t seplen;
12310 
12311     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12312         return NULL;
12313 
12314     kind = PyUnicode_KIND(self);
12315     data = PyUnicode_DATA(self);
12316     len = PyUnicode_GET_LENGTH(self);
12317     seplen = PyUnicode_GET_LENGTH(sepobj);
12318     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12319                               PyUnicode_DATA(sepobj),
12320                               seplen);
12321 
12322     i = 0;
12323     if (striptype != RIGHTSTRIP) {
12324         while (i < len) {
12325             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12326             if (!BLOOM(sepmask, ch))
12327                 break;
12328             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12329                 break;
12330             i++;
12331         }
12332     }
12333 
12334     j = len;
12335     if (striptype != LEFTSTRIP) {
12336         j--;
12337         while (j >= i) {
12338             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12339             if (!BLOOM(sepmask, ch))
12340                 break;
12341             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12342                 break;
12343             j--;
12344         }
12345 
12346         j++;
12347     }
12348 
12349     return PyUnicode_Substring(self, i, j);
12350 }
12351 
12352 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12353 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12354 {
12355     unsigned char *data;
12356     int kind;
12357     Py_ssize_t length;
12358 
12359     if (PyUnicode_READY(self) == -1)
12360         return NULL;
12361 
12362     length = PyUnicode_GET_LENGTH(self);
12363     end = Py_MIN(end, length);
12364 
12365     if (start == 0 && end == length)
12366         return unicode_result_unchanged(self);
12367 
12368     if (start < 0 || end < 0) {
12369         PyErr_SetString(PyExc_IndexError, "string index out of range");
12370         return NULL;
12371     }
12372     if (start >= length || end < start)
12373         _Py_RETURN_UNICODE_EMPTY();
12374 
12375     length = end - start;
12376     if (PyUnicode_IS_ASCII(self)) {
12377         data = PyUnicode_1BYTE_DATA(self);
12378         return _PyUnicode_FromASCII((char*)(data + start), length);
12379     }
12380     else {
12381         kind = PyUnicode_KIND(self);
12382         data = PyUnicode_1BYTE_DATA(self);
12383         return PyUnicode_FromKindAndData(kind,
12384                                          data + kind * start,
12385                                          length);
12386     }
12387 }
12388 
12389 static PyObject *
do_strip(PyObject * self,int striptype)12390 do_strip(PyObject *self, int striptype)
12391 {
12392     Py_ssize_t len, i, j;
12393 
12394     if (PyUnicode_READY(self) == -1)
12395         return NULL;
12396 
12397     len = PyUnicode_GET_LENGTH(self);
12398 
12399     if (PyUnicode_IS_ASCII(self)) {
12400         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12401 
12402         i = 0;
12403         if (striptype != RIGHTSTRIP) {
12404             while (i < len) {
12405                 Py_UCS1 ch = data[i];
12406                 if (!_Py_ascii_whitespace[ch])
12407                     break;
12408                 i++;
12409             }
12410         }
12411 
12412         j = len;
12413         if (striptype != LEFTSTRIP) {
12414             j--;
12415             while (j >= i) {
12416                 Py_UCS1 ch = data[j];
12417                 if (!_Py_ascii_whitespace[ch])
12418                     break;
12419                 j--;
12420             }
12421             j++;
12422         }
12423     }
12424     else {
12425         int kind = PyUnicode_KIND(self);
12426         void *data = PyUnicode_DATA(self);
12427 
12428         i = 0;
12429         if (striptype != RIGHTSTRIP) {
12430             while (i < len) {
12431                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12432                 if (!Py_UNICODE_ISSPACE(ch))
12433                     break;
12434                 i++;
12435             }
12436         }
12437 
12438         j = len;
12439         if (striptype != LEFTSTRIP) {
12440             j--;
12441             while (j >= i) {
12442                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12443                 if (!Py_UNICODE_ISSPACE(ch))
12444                     break;
12445                 j--;
12446             }
12447             j++;
12448         }
12449     }
12450 
12451     return PyUnicode_Substring(self, i, j);
12452 }
12453 
12454 
12455 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12456 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12457 {
12458     if (sep != Py_None) {
12459         if (PyUnicode_Check(sep))
12460             return _PyUnicode_XStrip(self, striptype, sep);
12461         else {
12462             PyErr_Format(PyExc_TypeError,
12463                          "%s arg must be None or str",
12464                          STRIPNAME(striptype));
12465             return NULL;
12466         }
12467     }
12468 
12469     return do_strip(self, striptype);
12470 }
12471 
12472 
12473 /*[clinic input]
12474 str.strip as unicode_strip
12475 
12476     chars: object = None
12477     /
12478 
12479 Return a copy of the string with leading and trailing whitespace removed.
12480 
12481 If chars is given and not None, remove characters in chars instead.
12482 [clinic start generated code]*/
12483 
12484 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12485 unicode_strip_impl(PyObject *self, PyObject *chars)
12486 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12487 {
12488     return do_argstrip(self, BOTHSTRIP, chars);
12489 }
12490 
12491 
12492 /*[clinic input]
12493 str.lstrip as unicode_lstrip
12494 
12495     chars: object = None
12496     /
12497 
12498 Return a copy of the string with leading whitespace removed.
12499 
12500 If chars is given and not None, remove characters in chars instead.
12501 [clinic start generated code]*/
12502 
12503 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12504 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12505 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12506 {
12507     return do_argstrip(self, LEFTSTRIP, chars);
12508 }
12509 
12510 
12511 /*[clinic input]
12512 str.rstrip as unicode_rstrip
12513 
12514     chars: object = None
12515     /
12516 
12517 Return a copy of the string with trailing whitespace removed.
12518 
12519 If chars is given and not None, remove characters in chars instead.
12520 [clinic start generated code]*/
12521 
12522 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12523 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12524 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12525 {
12526     return do_argstrip(self, RIGHTSTRIP, chars);
12527 }
12528 
12529 
12530 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12531 unicode_repeat(PyObject *str, Py_ssize_t len)
12532 {
12533     PyObject *u;
12534     Py_ssize_t nchars, n;
12535 
12536     if (len < 1)
12537         _Py_RETURN_UNICODE_EMPTY();
12538 
12539     /* no repeat, return original string */
12540     if (len == 1)
12541         return unicode_result_unchanged(str);
12542 
12543     if (PyUnicode_READY(str) == -1)
12544         return NULL;
12545 
12546     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12547         PyErr_SetString(PyExc_OverflowError,
12548                         "repeated string is too long");
12549         return NULL;
12550     }
12551     nchars = len * PyUnicode_GET_LENGTH(str);
12552 
12553     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12554     if (!u)
12555         return NULL;
12556     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12557 
12558     if (PyUnicode_GET_LENGTH(str) == 1) {
12559         const int kind = PyUnicode_KIND(str);
12560         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12561         if (kind == PyUnicode_1BYTE_KIND) {
12562             void *to = PyUnicode_DATA(u);
12563             memset(to, (unsigned char)fill_char, len);
12564         }
12565         else if (kind == PyUnicode_2BYTE_KIND) {
12566             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12567             for (n = 0; n < len; ++n)
12568                 ucs2[n] = fill_char;
12569         } else {
12570             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12571             assert(kind == PyUnicode_4BYTE_KIND);
12572             for (n = 0; n < len; ++n)
12573                 ucs4[n] = fill_char;
12574         }
12575     }
12576     else {
12577         /* number of characters copied this far */
12578         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12579         const Py_ssize_t char_size = PyUnicode_KIND(str);
12580         char *to = (char *) PyUnicode_DATA(u);
12581         memcpy(to, PyUnicode_DATA(str),
12582                   PyUnicode_GET_LENGTH(str) * char_size);
12583         while (done < nchars) {
12584             n = (done <= nchars-done) ? done : nchars-done;
12585             memcpy(to + (done * char_size), to, n * char_size);
12586             done += n;
12587         }
12588     }
12589 
12590     assert(_PyUnicode_CheckConsistency(u, 1));
12591     return u;
12592 }
12593 
12594 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12595 PyUnicode_Replace(PyObject *str,
12596                   PyObject *substr,
12597                   PyObject *replstr,
12598                   Py_ssize_t maxcount)
12599 {
12600     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12601             ensure_unicode(replstr) < 0)
12602         return NULL;
12603     return replace(str, substr, replstr, maxcount);
12604 }
12605 
12606 /*[clinic input]
12607 str.replace as unicode_replace
12608 
12609     old: unicode
12610     new: unicode
12611     count: Py_ssize_t = -1
12612         Maximum number of occurrences to replace.
12613         -1 (the default value) means replace all occurrences.
12614     /
12615 
12616 Return a copy with all occurrences of substring old replaced by new.
12617 
12618 If the optional argument count is given, only the first count occurrences are
12619 replaced.
12620 [clinic start generated code]*/
12621 
12622 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12623 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12624                      Py_ssize_t count)
12625 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12626 {
12627     if (PyUnicode_READY(self) == -1)
12628         return NULL;
12629     return replace(self, old, new, count);
12630 }
12631 
12632 static PyObject *
unicode_repr(PyObject * unicode)12633 unicode_repr(PyObject *unicode)
12634 {
12635     PyObject *repr;
12636     Py_ssize_t isize;
12637     Py_ssize_t osize, squote, dquote, i, o;
12638     Py_UCS4 max, quote;
12639     int ikind, okind, unchanged;
12640     void *idata, *odata;
12641 
12642     if (PyUnicode_READY(unicode) == -1)
12643         return NULL;
12644 
12645     isize = PyUnicode_GET_LENGTH(unicode);
12646     idata = PyUnicode_DATA(unicode);
12647 
12648     /* Compute length of output, quote characters, and
12649        maximum character */
12650     osize = 0;
12651     max = 127;
12652     squote = dquote = 0;
12653     ikind = PyUnicode_KIND(unicode);
12654     for (i = 0; i < isize; i++) {
12655         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12656         Py_ssize_t incr = 1;
12657         switch (ch) {
12658         case '\'': squote++; break;
12659         case '"':  dquote++; break;
12660         case '\\': case '\t': case '\r': case '\n':
12661             incr = 2;
12662             break;
12663         default:
12664             /* Fast-path ASCII */
12665             if (ch < ' ' || ch == 0x7f)
12666                 incr = 4; /* \xHH */
12667             else if (ch < 0x7f)
12668                 ;
12669             else if (Py_UNICODE_ISPRINTABLE(ch))
12670                 max = ch > max ? ch : max;
12671             else if (ch < 0x100)
12672                 incr = 4; /* \xHH */
12673             else if (ch < 0x10000)
12674                 incr = 6; /* \uHHHH */
12675             else
12676                 incr = 10; /* \uHHHHHHHH */
12677         }
12678         if (osize > PY_SSIZE_T_MAX - incr) {
12679             PyErr_SetString(PyExc_OverflowError,
12680                             "string is too long to generate repr");
12681             return NULL;
12682         }
12683         osize += incr;
12684     }
12685 
12686     quote = '\'';
12687     unchanged = (osize == isize);
12688     if (squote) {
12689         unchanged = 0;
12690         if (dquote)
12691             /* Both squote and dquote present. Use squote,
12692                and escape them */
12693             osize += squote;
12694         else
12695             quote = '"';
12696     }
12697     osize += 2;   /* quotes */
12698 
12699     repr = PyUnicode_New(osize, max);
12700     if (repr == NULL)
12701         return NULL;
12702     okind = PyUnicode_KIND(repr);
12703     odata = PyUnicode_DATA(repr);
12704 
12705     PyUnicode_WRITE(okind, odata, 0, quote);
12706     PyUnicode_WRITE(okind, odata, osize-1, quote);
12707     if (unchanged) {
12708         _PyUnicode_FastCopyCharacters(repr, 1,
12709                                       unicode, 0,
12710                                       isize);
12711     }
12712     else {
12713         for (i = 0, o = 1; i < isize; i++) {
12714             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12715 
12716             /* Escape quotes and backslashes */
12717             if ((ch == quote) || (ch == '\\')) {
12718                 PyUnicode_WRITE(okind, odata, o++, '\\');
12719                 PyUnicode_WRITE(okind, odata, o++, ch);
12720                 continue;
12721             }
12722 
12723             /* Map special whitespace to '\t', \n', '\r' */
12724             if (ch == '\t') {
12725                 PyUnicode_WRITE(okind, odata, o++, '\\');
12726                 PyUnicode_WRITE(okind, odata, o++, 't');
12727             }
12728             else if (ch == '\n') {
12729                 PyUnicode_WRITE(okind, odata, o++, '\\');
12730                 PyUnicode_WRITE(okind, odata, o++, 'n');
12731             }
12732             else if (ch == '\r') {
12733                 PyUnicode_WRITE(okind, odata, o++, '\\');
12734                 PyUnicode_WRITE(okind, odata, o++, 'r');
12735             }
12736 
12737             /* Map non-printable US ASCII to '\xhh' */
12738             else if (ch < ' ' || ch == 0x7F) {
12739                 PyUnicode_WRITE(okind, odata, o++, '\\');
12740                 PyUnicode_WRITE(okind, odata, o++, 'x');
12741                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12742                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12743             }
12744 
12745             /* Copy ASCII characters as-is */
12746             else if (ch < 0x7F) {
12747                 PyUnicode_WRITE(okind, odata, o++, ch);
12748             }
12749 
12750             /* Non-ASCII characters */
12751             else {
12752                 /* Map Unicode whitespace and control characters
12753                    (categories Z* and C* except ASCII space)
12754                 */
12755                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12756                     PyUnicode_WRITE(okind, odata, o++, '\\');
12757                     /* Map 8-bit characters to '\xhh' */
12758                     if (ch <= 0xff) {
12759                         PyUnicode_WRITE(okind, odata, o++, 'x');
12760                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12761                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12762                     }
12763                     /* Map 16-bit characters to '\uxxxx' */
12764                     else if (ch <= 0xffff) {
12765                         PyUnicode_WRITE(okind, odata, o++, 'u');
12766                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12767                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12768                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12769                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12770                     }
12771                     /* Map 21-bit characters to '\U00xxxxxx' */
12772                     else {
12773                         PyUnicode_WRITE(okind, odata, o++, 'U');
12774                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12775                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12776                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12777                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12778                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12779                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12780                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12781                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12782                     }
12783                 }
12784                 /* Copy characters as-is */
12785                 else {
12786                     PyUnicode_WRITE(okind, odata, o++, ch);
12787                 }
12788             }
12789         }
12790     }
12791     /* Closing quote already added at the beginning */
12792     assert(_PyUnicode_CheckConsistency(repr, 1));
12793     return repr;
12794 }
12795 
12796 PyDoc_STRVAR(rfind__doc__,
12797              "S.rfind(sub[, start[, end]]) -> int\n\
12798 \n\
12799 Return the highest index in S where substring sub is found,\n\
12800 such that sub is contained within S[start:end].  Optional\n\
12801 arguments start and end are interpreted as in slice notation.\n\
12802 \n\
12803 Return -1 on failure.");
12804 
12805 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12806 unicode_rfind(PyObject *self, PyObject *args)
12807 {
12808     /* initialize variables to prevent gcc warning */
12809     PyObject *substring = NULL;
12810     Py_ssize_t start = 0;
12811     Py_ssize_t end = 0;
12812     Py_ssize_t result;
12813 
12814     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12815         return NULL;
12816 
12817     if (PyUnicode_READY(self) == -1)
12818         return NULL;
12819 
12820     result = any_find_slice(self, substring, start, end, -1);
12821 
12822     if (result == -2)
12823         return NULL;
12824 
12825     return PyLong_FromSsize_t(result);
12826 }
12827 
12828 PyDoc_STRVAR(rindex__doc__,
12829              "S.rindex(sub[, start[, end]]) -> int\n\
12830 \n\
12831 Return the highest index in S where substring sub is found,\n\
12832 such that sub is contained within S[start:end].  Optional\n\
12833 arguments start and end are interpreted as in slice notation.\n\
12834 \n\
12835 Raises ValueError when the substring is not found.");
12836 
12837 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12838 unicode_rindex(PyObject *self, PyObject *args)
12839 {
12840     /* initialize variables to prevent gcc warning */
12841     PyObject *substring = NULL;
12842     Py_ssize_t start = 0;
12843     Py_ssize_t end = 0;
12844     Py_ssize_t result;
12845 
12846     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12847         return NULL;
12848 
12849     if (PyUnicode_READY(self) == -1)
12850         return NULL;
12851 
12852     result = any_find_slice(self, substring, start, end, -1);
12853 
12854     if (result == -2)
12855         return NULL;
12856 
12857     if (result < 0) {
12858         PyErr_SetString(PyExc_ValueError, "substring not found");
12859         return NULL;
12860     }
12861 
12862     return PyLong_FromSsize_t(result);
12863 }
12864 
12865 /*[clinic input]
12866 str.rjust as unicode_rjust
12867 
12868     width: Py_ssize_t
12869     fillchar: Py_UCS4 = ' '
12870     /
12871 
12872 Return a right-justified string of length width.
12873 
12874 Padding is done using the specified fill character (default is a space).
12875 [clinic start generated code]*/
12876 
12877 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12878 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12879 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12880 {
12881     if (PyUnicode_READY(self) == -1)
12882         return NULL;
12883 
12884     if (PyUnicode_GET_LENGTH(self) >= width)
12885         return unicode_result_unchanged(self);
12886 
12887     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12888 }
12889 
12890 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12891 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12892 {
12893     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12894         return NULL;
12895 
12896     return split(s, sep, maxsplit);
12897 }
12898 
12899 /*[clinic input]
12900 str.split as unicode_split
12901 
12902     sep: object = None
12903         The delimiter according which to split the string.
12904         None (the default value) means split according to any whitespace,
12905         and discard empty strings from the result.
12906     maxsplit: Py_ssize_t = -1
12907         Maximum number of splits to do.
12908         -1 (the default value) means no limit.
12909 
12910 Return a list of the words in the string, using sep as the delimiter string.
12911 [clinic start generated code]*/
12912 
12913 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12914 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12915 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12916 {
12917     if (sep == Py_None)
12918         return split(self, NULL, maxsplit);
12919     if (PyUnicode_Check(sep))
12920         return split(self, sep, maxsplit);
12921 
12922     PyErr_Format(PyExc_TypeError,
12923                  "must be str or None, not %.100s",
12924                  Py_TYPE(sep)->tp_name);
12925     return NULL;
12926 }
12927 
12928 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12929 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12930 {
12931     PyObject* out;
12932     int kind1, kind2;
12933     void *buf1, *buf2;
12934     Py_ssize_t len1, len2;
12935 
12936     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12937         return NULL;
12938 
12939     kind1 = PyUnicode_KIND(str_obj);
12940     kind2 = PyUnicode_KIND(sep_obj);
12941     len1 = PyUnicode_GET_LENGTH(str_obj);
12942     len2 = PyUnicode_GET_LENGTH(sep_obj);
12943     if (kind1 < kind2 || len1 < len2) {
12944         _Py_INCREF_UNICODE_EMPTY();
12945         if (!unicode_empty)
12946             out = NULL;
12947         else {
12948             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12949             Py_DECREF(unicode_empty);
12950         }
12951         return out;
12952     }
12953     buf1 = PyUnicode_DATA(str_obj);
12954     buf2 = PyUnicode_DATA(sep_obj);
12955     if (kind2 != kind1) {
12956         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12957         if (!buf2)
12958             return NULL;
12959     }
12960 
12961     switch (kind1) {
12962     case PyUnicode_1BYTE_KIND:
12963         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12964             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965         else
12966             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12967         break;
12968     case PyUnicode_2BYTE_KIND:
12969         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970         break;
12971     case PyUnicode_4BYTE_KIND:
12972         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12973         break;
12974     default:
12975         Py_UNREACHABLE();
12976     }
12977 
12978     if (kind2 != kind1)
12979         PyMem_Free(buf2);
12980 
12981     return out;
12982 }
12983 
12984 
12985 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12986 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12987 {
12988     PyObject* out;
12989     int kind1, kind2;
12990     void *buf1, *buf2;
12991     Py_ssize_t len1, len2;
12992 
12993     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12994         return NULL;
12995 
12996     kind1 = PyUnicode_KIND(str_obj);
12997     kind2 = PyUnicode_KIND(sep_obj);
12998     len1 = PyUnicode_GET_LENGTH(str_obj);
12999     len2 = PyUnicode_GET_LENGTH(sep_obj);
13000     if (kind1 < kind2 || len1 < len2) {
13001         _Py_INCREF_UNICODE_EMPTY();
13002         if (!unicode_empty)
13003             out = NULL;
13004         else {
13005             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13006             Py_DECREF(unicode_empty);
13007         }
13008         return out;
13009     }
13010     buf1 = PyUnicode_DATA(str_obj);
13011     buf2 = PyUnicode_DATA(sep_obj);
13012     if (kind2 != kind1) {
13013         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014         if (!buf2)
13015             return NULL;
13016     }
13017 
13018     switch (kind1) {
13019     case PyUnicode_1BYTE_KIND:
13020         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022         else
13023             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024         break;
13025     case PyUnicode_2BYTE_KIND:
13026         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027         break;
13028     case PyUnicode_4BYTE_KIND:
13029         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030         break;
13031     default:
13032         Py_UNREACHABLE();
13033     }
13034 
13035     if (kind2 != kind1)
13036         PyMem_Free(buf2);
13037 
13038     return out;
13039 }
13040 
13041 /*[clinic input]
13042 str.partition as unicode_partition
13043 
13044     sep: object
13045     /
13046 
13047 Partition the string into three parts using the given separator.
13048 
13049 This will search for the separator in the string.  If the separator is found,
13050 returns a 3-tuple containing the part before the separator, the separator
13051 itself, and the part after it.
13052 
13053 If the separator is not found, returns a 3-tuple containing the original string
13054 and two empty strings.
13055 [clinic start generated code]*/
13056 
13057 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13058 unicode_partition(PyObject *self, PyObject *sep)
13059 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13060 {
13061     return PyUnicode_Partition(self, sep);
13062 }
13063 
13064 /*[clinic input]
13065 str.rpartition as unicode_rpartition = str.partition
13066 
13067 Partition the string into three parts using the given separator.
13068 
13069 This will search for the separator in the string, starting at the end. If
13070 the separator is found, returns a 3-tuple containing the part before the
13071 separator, the separator itself, and the part after it.
13072 
13073 If the separator is not found, returns a 3-tuple containing two empty strings
13074 and the original string.
13075 [clinic start generated code]*/
13076 
13077 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13078 unicode_rpartition(PyObject *self, PyObject *sep)
13079 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13080 {
13081     return PyUnicode_RPartition(self, sep);
13082 }
13083 
13084 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13085 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13086 {
13087     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13088         return NULL;
13089 
13090     return rsplit(s, sep, maxsplit);
13091 }
13092 
13093 /*[clinic input]
13094 str.rsplit as unicode_rsplit = str.split
13095 
13096 Return a list of the words in the string, using sep as the delimiter string.
13097 
13098 Splits are done starting at the end of the string and working to the front.
13099 [clinic start generated code]*/
13100 
13101 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13102 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13103 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13104 {
13105     if (sep == Py_None)
13106         return rsplit(self, NULL, maxsplit);
13107     if (PyUnicode_Check(sep))
13108         return rsplit(self, sep, maxsplit);
13109 
13110     PyErr_Format(PyExc_TypeError,
13111                  "must be str or None, not %.100s",
13112                  Py_TYPE(sep)->tp_name);
13113     return NULL;
13114 }
13115 
13116 /*[clinic input]
13117 str.splitlines as unicode_splitlines
13118 
13119     keepends: bool(accept={int}) = False
13120 
13121 Return a list of the lines in the string, breaking at line boundaries.
13122 
13123 Line breaks are not included in the resulting list unless keepends is given and
13124 true.
13125 [clinic start generated code]*/
13126 
13127 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13128 unicode_splitlines_impl(PyObject *self, int keepends)
13129 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13130 {
13131     return PyUnicode_Splitlines(self, keepends);
13132 }
13133 
13134 static
unicode_str(PyObject * self)13135 PyObject *unicode_str(PyObject *self)
13136 {
13137     return unicode_result_unchanged(self);
13138 }
13139 
13140 /*[clinic input]
13141 str.swapcase as unicode_swapcase
13142 
13143 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13144 [clinic start generated code]*/
13145 
13146 static PyObject *
unicode_swapcase_impl(PyObject * self)13147 unicode_swapcase_impl(PyObject *self)
13148 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13149 {
13150     if (PyUnicode_READY(self) == -1)
13151         return NULL;
13152     return case_operation(self, do_swapcase);
13153 }
13154 
13155 /*[clinic input]
13156 
13157 @staticmethod
13158 str.maketrans as unicode_maketrans
13159 
13160   x: object
13161 
13162   y: unicode=NULL
13163 
13164   z: unicode=NULL
13165 
13166   /
13167 
13168 Return a translation table usable for str.translate().
13169 
13170 If there is only one argument, it must be a dictionary mapping Unicode
13171 ordinals (integers) or characters to Unicode ordinals, strings or None.
13172 Character keys will be then converted to ordinals.
13173 If there are two arguments, they must be strings of equal length, and
13174 in the resulting dictionary, each character in x will be mapped to the
13175 character at the same position in y. If there is a third argument, it
13176 must be a string, whose characters will be mapped to None in the result.
13177 [clinic start generated code]*/
13178 
13179 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13180 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13181 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13182 {
13183     PyObject *new = NULL, *key, *value;
13184     Py_ssize_t i = 0;
13185     int res;
13186 
13187     new = PyDict_New();
13188     if (!new)
13189         return NULL;
13190     if (y != NULL) {
13191         int x_kind, y_kind, z_kind;
13192         void *x_data, *y_data, *z_data;
13193 
13194         /* x must be a string too, of equal length */
13195         if (!PyUnicode_Check(x)) {
13196             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13197                             "be a string if there is a second argument");
13198             goto err;
13199         }
13200         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13201             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13202                             "arguments must have equal length");
13203             goto err;
13204         }
13205         /* create entries for translating chars in x to those in y */
13206         x_kind = PyUnicode_KIND(x);
13207         y_kind = PyUnicode_KIND(y);
13208         x_data = PyUnicode_DATA(x);
13209         y_data = PyUnicode_DATA(y);
13210         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13211             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13212             if (!key)
13213                 goto err;
13214             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13215             if (!value) {
13216                 Py_DECREF(key);
13217                 goto err;
13218             }
13219             res = PyDict_SetItem(new, key, value);
13220             Py_DECREF(key);
13221             Py_DECREF(value);
13222             if (res < 0)
13223                 goto err;
13224         }
13225         /* create entries for deleting chars in z */
13226         if (z != NULL) {
13227             z_kind = PyUnicode_KIND(z);
13228             z_data = PyUnicode_DATA(z);
13229             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13230                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13231                 if (!key)
13232                     goto err;
13233                 res = PyDict_SetItem(new, key, Py_None);
13234                 Py_DECREF(key);
13235                 if (res < 0)
13236                     goto err;
13237             }
13238         }
13239     } else {
13240         int kind;
13241         void *data;
13242 
13243         /* x must be a dict */
13244         if (!PyDict_CheckExact(x)) {
13245             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13246                             "to maketrans it must be a dict");
13247             goto err;
13248         }
13249         /* copy entries into the new dict, converting string keys to int keys */
13250         while (PyDict_Next(x, &i, &key, &value)) {
13251             if (PyUnicode_Check(key)) {
13252                 /* convert string keys to integer keys */
13253                 PyObject *newkey;
13254                 if (PyUnicode_GET_LENGTH(key) != 1) {
13255                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13256                                     "table must be of length 1");
13257                     goto err;
13258                 }
13259                 kind = PyUnicode_KIND(key);
13260                 data = PyUnicode_DATA(key);
13261                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13262                 if (!newkey)
13263                     goto err;
13264                 res = PyDict_SetItem(new, newkey, value);
13265                 Py_DECREF(newkey);
13266                 if (res < 0)
13267                     goto err;
13268             } else if (PyLong_Check(key)) {
13269                 /* just keep integer keys */
13270                 if (PyDict_SetItem(new, key, value) < 0)
13271                     goto err;
13272             } else {
13273                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13274                                 "be strings or integers");
13275                 goto err;
13276             }
13277         }
13278     }
13279     return new;
13280   err:
13281     Py_DECREF(new);
13282     return NULL;
13283 }
13284 
13285 /*[clinic input]
13286 str.translate as unicode_translate
13287 
13288     table: object
13289         Translation table, which must be a mapping of Unicode ordinals to
13290         Unicode ordinals, strings, or None.
13291     /
13292 
13293 Replace each character in the string using the given translation table.
13294 
13295 The table must implement lookup/indexing via __getitem__, for instance a
13296 dictionary or list.  If this operation raises LookupError, the character is
13297 left untouched.  Characters mapped to None are deleted.
13298 [clinic start generated code]*/
13299 
13300 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13301 unicode_translate(PyObject *self, PyObject *table)
13302 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13303 {
13304     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13305 }
13306 
13307 /*[clinic input]
13308 str.upper as unicode_upper
13309 
13310 Return a copy of the string converted to uppercase.
13311 [clinic start generated code]*/
13312 
13313 static PyObject *
unicode_upper_impl(PyObject * self)13314 unicode_upper_impl(PyObject *self)
13315 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13316 {
13317     if (PyUnicode_READY(self) == -1)
13318         return NULL;
13319     if (PyUnicode_IS_ASCII(self))
13320         return ascii_upper_or_lower(self, 0);
13321     return case_operation(self, do_upper);
13322 }
13323 
13324 /*[clinic input]
13325 str.zfill as unicode_zfill
13326 
13327     width: Py_ssize_t
13328     /
13329 
13330 Pad a numeric string with zeros on the left, to fill a field of the given width.
13331 
13332 The string is never truncated.
13333 [clinic start generated code]*/
13334 
13335 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13336 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13337 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13338 {
13339     Py_ssize_t fill;
13340     PyObject *u;
13341     int kind;
13342     void *data;
13343     Py_UCS4 chr;
13344 
13345     if (PyUnicode_READY(self) == -1)
13346         return NULL;
13347 
13348     if (PyUnicode_GET_LENGTH(self) >= width)
13349         return unicode_result_unchanged(self);
13350 
13351     fill = width - PyUnicode_GET_LENGTH(self);
13352 
13353     u = pad(self, fill, 0, '0');
13354 
13355     if (u == NULL)
13356         return NULL;
13357 
13358     kind = PyUnicode_KIND(u);
13359     data = PyUnicode_DATA(u);
13360     chr = PyUnicode_READ(kind, data, fill);
13361 
13362     if (chr == '+' || chr == '-') {
13363         /* move sign to beginning of string */
13364         PyUnicode_WRITE(kind, data, 0, chr);
13365         PyUnicode_WRITE(kind, data, fill, '0');
13366     }
13367 
13368     assert(_PyUnicode_CheckConsistency(u, 1));
13369     return u;
13370 }
13371 
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII() for the "_decimal2ascii"
   debugging method. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13379 
13380 PyDoc_STRVAR(startswith__doc__,
13381              "S.startswith(prefix[, start[, end]]) -> bool\n\
13382 \n\
13383 Return True if S starts with the specified prefix, False otherwise.\n\
13384 With optional start, test S beginning at that position.\n\
13385 With optional end, stop comparing S at that position.\n\
13386 prefix can also be a tuple of strings to try.");
13387 
13388 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13389 unicode_startswith(PyObject *self,
13390                    PyObject *args)
13391 {
13392     PyObject *subobj;
13393     PyObject *substring;
13394     Py_ssize_t start = 0;
13395     Py_ssize_t end = PY_SSIZE_T_MAX;
13396     int result;
13397 
13398     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13399         return NULL;
13400     if (PyTuple_Check(subobj)) {
13401         Py_ssize_t i;
13402         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13403             substring = PyTuple_GET_ITEM(subobj, i);
13404             if (!PyUnicode_Check(substring)) {
13405                 PyErr_Format(PyExc_TypeError,
13406                              "tuple for startswith must only contain str, "
13407                              "not %.100s",
13408                              Py_TYPE(substring)->tp_name);
13409                 return NULL;
13410             }
13411             result = tailmatch(self, substring, start, end, -1);
13412             if (result == -1)
13413                 return NULL;
13414             if (result) {
13415                 Py_RETURN_TRUE;
13416             }
13417         }
13418         /* nothing matched */
13419         Py_RETURN_FALSE;
13420     }
13421     if (!PyUnicode_Check(subobj)) {
13422         PyErr_Format(PyExc_TypeError,
13423                      "startswith first arg must be str or "
13424                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13425         return NULL;
13426     }
13427     result = tailmatch(self, subobj, start, end, -1);
13428     if (result == -1)
13429         return NULL;
13430     return PyBool_FromLong(result);
13431 }
13432 
13433 
13434 PyDoc_STRVAR(endswith__doc__,
13435              "S.endswith(suffix[, start[, end]]) -> bool\n\
13436 \n\
13437 Return True if S ends with the specified suffix, False otherwise.\n\
13438 With optional start, test S beginning at that position.\n\
13439 With optional end, stop comparing S at that position.\n\
13440 suffix can also be a tuple of strings to try.");
13441 
13442 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13443 unicode_endswith(PyObject *self,
13444                  PyObject *args)
13445 {
13446     PyObject *subobj;
13447     PyObject *substring;
13448     Py_ssize_t start = 0;
13449     Py_ssize_t end = PY_SSIZE_T_MAX;
13450     int result;
13451 
13452     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13453         return NULL;
13454     if (PyTuple_Check(subobj)) {
13455         Py_ssize_t i;
13456         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13457             substring = PyTuple_GET_ITEM(subobj, i);
13458             if (!PyUnicode_Check(substring)) {
13459                 PyErr_Format(PyExc_TypeError,
13460                              "tuple for endswith must only contain str, "
13461                              "not %.100s",
13462                              Py_TYPE(substring)->tp_name);
13463                 return NULL;
13464             }
13465             result = tailmatch(self, substring, start, end, +1);
13466             if (result == -1)
13467                 return NULL;
13468             if (result) {
13469                 Py_RETURN_TRUE;
13470             }
13471         }
13472         Py_RETURN_FALSE;
13473     }
13474     if (!PyUnicode_Check(subobj)) {
13475         PyErr_Format(PyExc_TypeError,
13476                      "endswith first arg must be str or "
13477                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13478         return NULL;
13479     }
13480     result = tailmatch(self, subobj, start, end, +1);
13481     if (result == -1)
13482         return NULL;
13483     return PyBool_FromLong(result);
13484 }
13485 
13486 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13487 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13488 {
13489     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13490     writer->data = PyUnicode_DATA(writer->buffer);
13491 
13492     if (!writer->readonly) {
13493         writer->kind = PyUnicode_KIND(writer->buffer);
13494         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13495     }
13496     else {
13497         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13498            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13499         writer->kind = PyUnicode_WCHAR_KIND;
13500         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13501 
13502         /* Copy-on-write mode: set buffer size to 0 so
13503          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13504          * next write. */
13505         writer->size = 0;
13506     }
13507 }
13508 
13509 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13510 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13511 {
13512     memset(writer, 0, sizeof(*writer));
13513 
13514     /* ASCII is the bare minimum */
13515     writer->min_char = 127;
13516 
13517     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13518        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13519     writer->kind = PyUnicode_WCHAR_KIND;
13520     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13521 }
13522 
13523 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13524 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13525                                  Py_ssize_t length, Py_UCS4 maxchar)
13526 {
13527     Py_ssize_t newlen;
13528     PyObject *newbuffer;
13529 
13530     assert(maxchar <= MAX_UNICODE);
13531 
13532     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13533     assert((maxchar > writer->maxchar && length >= 0)
13534            || length > 0);
13535 
13536     if (length > PY_SSIZE_T_MAX - writer->pos) {
13537         PyErr_NoMemory();
13538         return -1;
13539     }
13540     newlen = writer->pos + length;
13541 
13542     maxchar = Py_MAX(maxchar, writer->min_char);
13543 
13544     if (writer->buffer == NULL) {
13545         assert(!writer->readonly);
13546         if (writer->overallocate
13547             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13548             /* overallocate to limit the number of realloc() */
13549             newlen += newlen / OVERALLOCATE_FACTOR;
13550         }
13551         if (newlen < writer->min_length)
13552             newlen = writer->min_length;
13553 
13554         writer->buffer = PyUnicode_New(newlen, maxchar);
13555         if (writer->buffer == NULL)
13556             return -1;
13557     }
13558     else if (newlen > writer->size) {
13559         if (writer->overallocate
13560             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13561             /* overallocate to limit the number of realloc() */
13562             newlen += newlen / OVERALLOCATE_FACTOR;
13563         }
13564         if (newlen < writer->min_length)
13565             newlen = writer->min_length;
13566 
13567         if (maxchar > writer->maxchar || writer->readonly) {
13568             /* resize + widen */
13569             maxchar = Py_MAX(maxchar, writer->maxchar);
13570             newbuffer = PyUnicode_New(newlen, maxchar);
13571             if (newbuffer == NULL)
13572                 return -1;
13573             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13574                                           writer->buffer, 0, writer->pos);
13575             Py_DECREF(writer->buffer);
13576             writer->readonly = 0;
13577         }
13578         else {
13579             newbuffer = resize_compact(writer->buffer, newlen);
13580             if (newbuffer == NULL)
13581                 return -1;
13582         }
13583         writer->buffer = newbuffer;
13584     }
13585     else if (maxchar > writer->maxchar) {
13586         assert(!writer->readonly);
13587         newbuffer = PyUnicode_New(writer->size, maxchar);
13588         if (newbuffer == NULL)
13589             return -1;
13590         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13591                                       writer->buffer, 0, writer->pos);
13592         Py_SETREF(writer->buffer, newbuffer);
13593     }
13594     _PyUnicodeWriter_Update(writer);
13595     return 0;
13596 
13597 #undef OVERALLOCATE_FACTOR
13598 }
13599 
13600 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13601 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13602                                      enum PyUnicode_Kind kind)
13603 {
13604     Py_UCS4 maxchar;
13605 
13606     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13607     assert(writer->kind < kind);
13608 
13609     switch (kind)
13610     {
13611     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13612     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13613     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13614     default:
13615         Py_UNREACHABLE();
13616     }
13617 
13618     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13619 }
13620 
13621 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13622 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13623 {
13624     assert(ch <= MAX_UNICODE);
13625     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13626         return -1;
13627     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13628     writer->pos++;
13629     return 0;
13630 }
13631 
13632 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13633 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13634 {
13635     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13636 }
13637 
13638 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13639 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13640 {
13641     Py_UCS4 maxchar;
13642     Py_ssize_t len;
13643 
13644     if (PyUnicode_READY(str) == -1)
13645         return -1;
13646     len = PyUnicode_GET_LENGTH(str);
13647     if (len == 0)
13648         return 0;
13649     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13650     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13651         if (writer->buffer == NULL && !writer->overallocate) {
13652             assert(_PyUnicode_CheckConsistency(str, 1));
13653             writer->readonly = 1;
13654             Py_INCREF(str);
13655             writer->buffer = str;
13656             _PyUnicodeWriter_Update(writer);
13657             writer->pos += len;
13658             return 0;
13659         }
13660         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13661             return -1;
13662     }
13663     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13664                                   str, 0, len);
13665     writer->pos += len;
13666     return 0;
13667 }
13668 
13669 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13670 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13671                                 Py_ssize_t start, Py_ssize_t end)
13672 {
13673     Py_UCS4 maxchar;
13674     Py_ssize_t len;
13675 
13676     if (PyUnicode_READY(str) == -1)
13677         return -1;
13678 
13679     assert(0 <= start);
13680     assert(end <= PyUnicode_GET_LENGTH(str));
13681     assert(start <= end);
13682 
13683     if (end == 0)
13684         return 0;
13685 
13686     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13687         return _PyUnicodeWriter_WriteStr(writer, str);
13688 
13689     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13690         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13691     else
13692         maxchar = writer->maxchar;
13693     len = end - start;
13694 
13695     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13696         return -1;
13697 
13698     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13699                                   str, start, len);
13700     writer->pos += len;
13701     return 0;
13702 }
13703 
13704 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13705 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13706                                   const char *ascii, Py_ssize_t len)
13707 {
13708     if (len == -1)
13709         len = strlen(ascii);
13710 
13711     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13712 
13713     if (writer->buffer == NULL && !writer->overallocate) {
13714         PyObject *str;
13715 
13716         str = _PyUnicode_FromASCII(ascii, len);
13717         if (str == NULL)
13718             return -1;
13719 
13720         writer->readonly = 1;
13721         writer->buffer = str;
13722         _PyUnicodeWriter_Update(writer);
13723         writer->pos += len;
13724         return 0;
13725     }
13726 
13727     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13728         return -1;
13729 
13730     switch (writer->kind)
13731     {
13732     case PyUnicode_1BYTE_KIND:
13733     {
13734         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13735         Py_UCS1 *data = writer->data;
13736 
13737         memcpy(data + writer->pos, str, len);
13738         break;
13739     }
13740     case PyUnicode_2BYTE_KIND:
13741     {
13742         _PyUnicode_CONVERT_BYTES(
13743             Py_UCS1, Py_UCS2,
13744             ascii, ascii + len,
13745             (Py_UCS2 *)writer->data + writer->pos);
13746         break;
13747     }
13748     case PyUnicode_4BYTE_KIND:
13749     {
13750         _PyUnicode_CONVERT_BYTES(
13751             Py_UCS1, Py_UCS4,
13752             ascii, ascii + len,
13753             (Py_UCS4 *)writer->data + writer->pos);
13754         break;
13755     }
13756     default:
13757         Py_UNREACHABLE();
13758     }
13759 
13760     writer->pos += len;
13761     return 0;
13762 }
13763 
13764 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13765 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13766                                    const char *str, Py_ssize_t len)
13767 {
13768     Py_UCS4 maxchar;
13769 
13770     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13771     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13772         return -1;
13773     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13774     writer->pos += len;
13775     return 0;
13776 }
13777 
13778 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13779 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13780 {
13781     PyObject *str;
13782 
13783     if (writer->pos == 0) {
13784         Py_CLEAR(writer->buffer);
13785         _Py_RETURN_UNICODE_EMPTY();
13786     }
13787 
13788     str = writer->buffer;
13789     writer->buffer = NULL;
13790 
13791     if (writer->readonly) {
13792         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13793         return str;
13794     }
13795 
13796     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13797         PyObject *str2;
13798         str2 = resize_compact(str, writer->pos);
13799         if (str2 == NULL) {
13800             Py_DECREF(str);
13801             return NULL;
13802         }
13803         str = str2;
13804     }
13805 
13806     assert(_PyUnicode_CheckConsistency(str, 1));
13807     return unicode_result_ready(str);
13808 }
13809 
13810 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13811 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13812 {
13813     Py_CLEAR(writer->buffer);
13814 }
13815 
13816 #include "stringlib/unicode_format.h"
13817 
13818 PyDoc_STRVAR(format__doc__,
13819              "S.format(*args, **kwargs) -> str\n\
13820 \n\
13821 Return a formatted version of S, using substitutions from args and kwargs.\n\
13822 The substitutions are identified by braces ('{' and '}').");
13823 
13824 PyDoc_STRVAR(format_map__doc__,
13825              "S.format_map(mapping) -> str\n\
13826 \n\
13827 Return a formatted version of S, using substitutions from mapping.\n\
13828 The substitutions are identified by braces ('{' and '}').");
13829 
13830 /*[clinic input]
13831 str.__format__ as unicode___format__
13832 
13833     format_spec: unicode
13834     /
13835 
13836 Return a formatted version of the string as described by format_spec.
13837 [clinic start generated code]*/
13838 
13839 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13840 unicode___format___impl(PyObject *self, PyObject *format_spec)
13841 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13842 {
13843     _PyUnicodeWriter writer;
13844     int ret;
13845 
13846     if (PyUnicode_READY(self) == -1)
13847         return NULL;
13848     _PyUnicodeWriter_Init(&writer);
13849     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13850                                           self, format_spec, 0,
13851                                           PyUnicode_GET_LENGTH(format_spec));
13852     if (ret == -1) {
13853         _PyUnicodeWriter_Dealloc(&writer);
13854         return NULL;
13855     }
13856     return _PyUnicodeWriter_Finish(&writer);
13857 }
13858 
13859 /*[clinic input]
13860 str.__sizeof__ as unicode_sizeof
13861 
13862 Return the size of the string in memory, in bytes.
13863 [clinic start generated code]*/
13864 
13865 static PyObject *
unicode_sizeof_impl(PyObject * self)13866 unicode_sizeof_impl(PyObject *self)
13867 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13868 {
13869     Py_ssize_t size;
13870 
13871     /* If it's a compact object, account for base structure +
13872        character data. */
13873     if (PyUnicode_IS_COMPACT_ASCII(self))
13874         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13875     else if (PyUnicode_IS_COMPACT(self))
13876         size = sizeof(PyCompactUnicodeObject) +
13877             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13878     else {
13879         /* If it is a two-block object, account for base object, and
13880            for character block if present. */
13881         size = sizeof(PyUnicodeObject);
13882         if (_PyUnicode_DATA_ANY(self))
13883             size += (PyUnicode_GET_LENGTH(self) + 1) *
13884                 PyUnicode_KIND(self);
13885     }
13886     /* If the wstr pointer is present, account for it unless it is shared
13887        with the data pointer. Check if the data is not shared. */
13888     if (_PyUnicode_HAS_WSTR_MEMORY(self))
13889         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13890     if (_PyUnicode_HAS_UTF8_MEMORY(self))
13891         size += PyUnicode_UTF8_LENGTH(self) + 1;
13892 
13893     return PyLong_FromSsize_t(size);
13894 }
13895 
13896 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13897 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13898 {
13899     PyObject *copy = _PyUnicode_Copy(v);
13900     if (!copy)
13901         return NULL;
13902     return Py_BuildValue("(N)", copy);
13903 }
13904 
13905 static PyMethodDef unicode_methods[] = {
13906     UNICODE_ENCODE_METHODDEF
13907     UNICODE_REPLACE_METHODDEF
13908     UNICODE_SPLIT_METHODDEF
13909     UNICODE_RSPLIT_METHODDEF
13910     UNICODE_JOIN_METHODDEF
13911     UNICODE_CAPITALIZE_METHODDEF
13912     UNICODE_CASEFOLD_METHODDEF
13913     UNICODE_TITLE_METHODDEF
13914     UNICODE_CENTER_METHODDEF
13915     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13916     UNICODE_EXPANDTABS_METHODDEF
13917     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13918     UNICODE_PARTITION_METHODDEF
13919     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13920     UNICODE_LJUST_METHODDEF
13921     UNICODE_LOWER_METHODDEF
13922     UNICODE_LSTRIP_METHODDEF
13923     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13924     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13925     UNICODE_RJUST_METHODDEF
13926     UNICODE_RSTRIP_METHODDEF
13927     UNICODE_RPARTITION_METHODDEF
13928     UNICODE_SPLITLINES_METHODDEF
13929     UNICODE_STRIP_METHODDEF
13930     UNICODE_SWAPCASE_METHODDEF
13931     UNICODE_TRANSLATE_METHODDEF
13932     UNICODE_UPPER_METHODDEF
13933     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13934     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13935     UNICODE_ISASCII_METHODDEF
13936     UNICODE_ISLOWER_METHODDEF
13937     UNICODE_ISUPPER_METHODDEF
13938     UNICODE_ISTITLE_METHODDEF
13939     UNICODE_ISSPACE_METHODDEF
13940     UNICODE_ISDECIMAL_METHODDEF
13941     UNICODE_ISDIGIT_METHODDEF
13942     UNICODE_ISNUMERIC_METHODDEF
13943     UNICODE_ISALPHA_METHODDEF
13944     UNICODE_ISALNUM_METHODDEF
13945     UNICODE_ISIDENTIFIER_METHODDEF
13946     UNICODE_ISPRINTABLE_METHODDEF
13947     UNICODE_ZFILL_METHODDEF
13948     {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13949     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13950     UNICODE___FORMAT___METHODDEF
13951     UNICODE_MAKETRANS_METHODDEF
13952     UNICODE_SIZEOF_METHODDEF
13953 #if 0
13954     /* These methods are just used for debugging the implementation. */
13955     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13956 #endif
13957 
13958     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13959     {NULL, NULL}
13960 };
13961 
13962 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13963 unicode_mod(PyObject *v, PyObject *w)
13964 {
13965     if (!PyUnicode_Check(v))
13966         Py_RETURN_NOTIMPLEMENTED;
13967     return PyUnicode_Format(v, w);
13968 }
13969 
13970 static PyNumberMethods unicode_as_number = {
13971     0,              /*nb_add*/
13972     0,              /*nb_subtract*/
13973     0,              /*nb_multiply*/
13974     unicode_mod,            /*nb_remainder*/
13975 };
13976 
13977 static PySequenceMethods unicode_as_sequence = {
13978     (lenfunc) unicode_length,       /* sq_length */
13979     PyUnicode_Concat,           /* sq_concat */
13980     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13981     (ssizeargfunc) unicode_getitem,     /* sq_item */
13982     0,                  /* sq_slice */
13983     0,                  /* sq_ass_item */
13984     0,                  /* sq_ass_slice */
13985     PyUnicode_Contains,         /* sq_contains */
13986 };
13987 
13988 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13989 unicode_subscript(PyObject* self, PyObject* item)
13990 {
13991     if (PyUnicode_READY(self) == -1)
13992         return NULL;
13993 
13994     if (PyIndex_Check(item)) {
13995         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13996         if (i == -1 && PyErr_Occurred())
13997             return NULL;
13998         if (i < 0)
13999             i += PyUnicode_GET_LENGTH(self);
14000         return unicode_getitem(self, i);
14001     } else if (PySlice_Check(item)) {
14002         Py_ssize_t start, stop, step, slicelength, i;
14003         size_t cur;
14004         PyObject *result;
14005         void *src_data, *dest_data;
14006         int src_kind, dest_kind;
14007         Py_UCS4 ch, max_char, kind_limit;
14008 
14009         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14010             return NULL;
14011         }
14012         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14013                                             &start, &stop, step);
14014 
14015         if (slicelength <= 0) {
14016             _Py_RETURN_UNICODE_EMPTY();
14017         } else if (start == 0 && step == 1 &&
14018                    slicelength == PyUnicode_GET_LENGTH(self)) {
14019             return unicode_result_unchanged(self);
14020         } else if (step == 1) {
14021             return PyUnicode_Substring(self,
14022                                        start, start + slicelength);
14023         }
14024         /* General case */
14025         src_kind = PyUnicode_KIND(self);
14026         src_data = PyUnicode_DATA(self);
14027         if (!PyUnicode_IS_ASCII(self)) {
14028             kind_limit = kind_maxchar_limit(src_kind);
14029             max_char = 0;
14030             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14031                 ch = PyUnicode_READ(src_kind, src_data, cur);
14032                 if (ch > max_char) {
14033                     max_char = ch;
14034                     if (max_char >= kind_limit)
14035                         break;
14036                 }
14037             }
14038         }
14039         else
14040             max_char = 127;
14041         result = PyUnicode_New(slicelength, max_char);
14042         if (result == NULL)
14043             return NULL;
14044         dest_kind = PyUnicode_KIND(result);
14045         dest_data = PyUnicode_DATA(result);
14046 
14047         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14048             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14049             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14050         }
14051         assert(_PyUnicode_CheckConsistency(result, 1));
14052         return result;
14053     } else {
14054         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14055         return NULL;
14056     }
14057 }
14058 
14059 static PyMappingMethods unicode_as_mapping = {
14060     (lenfunc)unicode_length,        /* mp_length */
14061     (binaryfunc)unicode_subscript,  /* mp_subscript */
14062     (objobjargproc)0,           /* mp_ass_subscript */
14063 };
14064 
14065 
14066 /* Helpers for PyUnicode_Format() */
14067 
/* State shared by the helpers that implement PyUnicode_Format(). */
struct unicode_formatter_t {
    /* Source of argument values: a tuple indexed by argidx, or (when
       arglen < 0) a single object that is returned as-is.  For "%(name)"
       conversions it is replaced by the value looked up in dict. */
    PyObject *args;
    /* Non-zero when this context holds a reference to args that must be
       released (set after a successful mapping lookup). */
    int args_owned;
    /* arglen: number of items available in args, or negative when args is
       a single value.  argidx: index of the next argument to hand out;
       an argument is available while argidx < arglen. */
    Py_ssize_t arglen, argidx;
    /* Mapping used to resolve "%(name)" keys, or NULL if none is available
       (in which case "%(name)" raises TypeError). */
    PyObject *dict;

    /* Kind of the format string's character data. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: count of format characters not yet consumed (drops below
       zero when the parser reads past the end).  fmtpos: index of the
       next character to read from fmtdata. */
    Py_ssize_t fmtcnt, fmtpos;
    /* Raw character data of fmtstr. */
    void *fmtdata;
    /* The format string object itself. */
    PyObject *fmtstr;

    /* Writer accumulating the formatted output. */
    _PyUnicodeWriter writer;
};
14081 
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    /* Most recently read format character; once parsing is done this is
       the conversion specifier ('s', 'd', 'x', ...). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Minimum field width, or -1 when none was given. */
    Py_ssize_t width;
    /* Precision, or -1 when none was given. */
    int prec;
    /* Set to 1 for numeric conversions that take the padded output path;
       the output step clears it again when no sign character has to be
       emitted. */
    int sign;
};
14089 
14090 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14091 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14092 {
14093     Py_ssize_t argidx = ctx->argidx;
14094 
14095     if (argidx < ctx->arglen) {
14096         ctx->argidx++;
14097         if (ctx->arglen < 0)
14098             return ctx->args;
14099         else
14100             return PyTuple_GetItem(ctx->args, argidx);
14101     }
14102     PyErr_SetString(PyExc_TypeError,
14103                     "not enough arguments for format string");
14104     return NULL;
14105 }
14106 
14107 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14108 
14109 /* Format a float into the writer if the writer is not NULL, or into *p_output
14110    otherwise.
14111 
14112    Return 0 on success, raise an exception and return -1 on error. */
14113 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14114 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14115             PyObject **p_output,
14116             _PyUnicodeWriter *writer)
14117 {
14118     char *p;
14119     double x;
14120     Py_ssize_t len;
14121     int prec;
14122     int dtoa_flags;
14123 
14124     x = PyFloat_AsDouble(v);
14125     if (x == -1.0 && PyErr_Occurred())
14126         return -1;
14127 
14128     prec = arg->prec;
14129     if (prec < 0)
14130         prec = 6;
14131 
14132     if (arg->flags & F_ALT)
14133         dtoa_flags = Py_DTSF_ALT;
14134     else
14135         dtoa_flags = 0;
14136     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14137     if (p == NULL)
14138         return -1;
14139     len = strlen(p);
14140     if (writer) {
14141         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14142             PyMem_Free(p);
14143             return -1;
14144         }
14145     }
14146     else
14147         *p_output = _PyUnicode_FromASCII(p, len);
14148     PyMem_Free(p);
14149     return 0;
14150 }
14151 
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate ("#") form, for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions when
 *         alt is non-zero.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must satisfy PyLong_Check()
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the result of o, x and X conversions may start with a '-'
 * sign, since the value is an unbounded Python int.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep "numnondigits + prec" (computed below as an int, with
       numnondigits at most 3: sign plus two-character base marker)
       from overflowing. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* Account for the two-character base marker in the result. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* Account for the two-character base marker in the result. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below uses int; refuse strings that do not fit. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; if there was a sign, re-write it over the
           second marker character so it directly precedes the digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits
       (prec).  The padded text is built in a temporary bytes object. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* Sign and base marker first, then the zeros, then the digits. */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes object, or no longer
       starts at the beginning of the string's buffer, build a fresh str
       from it; otherwise only the length may need adjusting. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14297 
14298 /* Format an integer or a float as an integer.
14299  * Return 1 if the number has been formatted into the writer,
14300  *        0 if the number has been formatted into *p_output
14301  *       -1 and raise an exception on error */
14302 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14303 mainformatlong(PyObject *v,
14304                struct unicode_format_arg_t *arg,
14305                PyObject **p_output,
14306                _PyUnicodeWriter *writer)
14307 {
14308     PyObject *iobj, *res;
14309     char type = (char)arg->ch;
14310 
14311     if (!PyNumber_Check(v))
14312         goto wrongtype;
14313 
14314     /* make sure number is a type of integer for o, x, and X */
14315     if (!PyLong_Check(v)) {
14316         if (type == 'o' || type == 'x' || type == 'X') {
14317             iobj = PyNumber_Index(v);
14318             if (iobj == NULL) {
14319                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14320                     goto wrongtype;
14321                 return -1;
14322             }
14323         }
14324         else {
14325             iobj = PyNumber_Long(v);
14326             if (iobj == NULL ) {
14327                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14328                     goto wrongtype;
14329                 return -1;
14330             }
14331         }
14332         assert(PyLong_Check(iobj));
14333     }
14334     else {
14335         iobj = v;
14336         Py_INCREF(iobj);
14337     }
14338 
14339     if (PyLong_CheckExact(v)
14340         && arg->width == -1 && arg->prec == -1
14341         && !(arg->flags & (F_SIGN | F_BLANK))
14342         && type != 'X')
14343     {
14344         /* Fast path */
14345         int alternate = arg->flags & F_ALT;
14346         int base;
14347 
14348         switch(type)
14349         {
14350             default:
14351                 Py_UNREACHABLE();
14352             case 'd':
14353             case 'i':
14354             case 'u':
14355                 base = 10;
14356                 break;
14357             case 'o':
14358                 base = 8;
14359                 break;
14360             case 'x':
14361             case 'X':
14362                 base = 16;
14363                 break;
14364         }
14365 
14366         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14367             Py_DECREF(iobj);
14368             return -1;
14369         }
14370         Py_DECREF(iobj);
14371         return 1;
14372     }
14373 
14374     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14375     Py_DECREF(iobj);
14376     if (res == NULL)
14377         return -1;
14378     *p_output = res;
14379     return 0;
14380 
14381 wrongtype:
14382     switch(type)
14383     {
14384         case 'o':
14385         case 'x':
14386         case 'X':
14387             PyErr_Format(PyExc_TypeError,
14388                     "%%%c format: an integer is required, "
14389                     "not %.200s",
14390                     type, Py_TYPE(v)->tp_name);
14391             break;
14392         default:
14393             PyErr_Format(PyExc_TypeError,
14394                     "%%%c format: a number is required, "
14395                     "not %.200s",
14396                     type, Py_TYPE(v)->tp_name);
14397             break;
14398     }
14399     return -1;
14400 }
14401 
14402 static Py_UCS4
formatchar(PyObject * v)14403 formatchar(PyObject *v)
14404 {
14405     /* presume that the buffer is at least 3 characters long */
14406     if (PyUnicode_Check(v)) {
14407         if (PyUnicode_GET_LENGTH(v) == 1) {
14408             return PyUnicode_READ_CHAR(v, 0);
14409         }
14410         goto onError;
14411     }
14412     else {
14413         PyObject *iobj;
14414         long x;
14415         /* make sure number is a type of integer */
14416         if (!PyLong_Check(v)) {
14417             iobj = PyNumber_Index(v);
14418             if (iobj == NULL) {
14419                 goto onError;
14420             }
14421             x = PyLong_AsLong(iobj);
14422             Py_DECREF(iobj);
14423         }
14424         else {
14425             x = PyLong_AsLong(v);
14426         }
14427         if (x == -1 && PyErr_Occurred())
14428             goto onError;
14429 
14430         if (x < 0 || x > MAX_UNICODE) {
14431             PyErr_SetString(PyExc_OverflowError,
14432                             "%c arg not in range(0x110000)");
14433             return (Py_UCS4) -1;
14434         }
14435 
14436         return (Py_UCS4) x;
14437     }
14438 
14439   onError:
14440     PyErr_SetString(PyExc_TypeError,
14441                     "%c requires int or char");
14442     return (Py_UCS4) -1;
14443 }
14444 
14445 /* Parse options of an argument: flags, width, precision.
14446    Handle also "%(name)" syntax.
14447 
14448    Return 0 if the argument has been formatted into arg->str.
14449    Return 1 if the argument has been written into ctx->writer,
14450    Raise an exception and return -1 on error. */
14451 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14452 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14453                          struct unicode_format_arg_t *arg)
14454 {
14455 #define FORMAT_READ(ctx) \
14456         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14457 
14458     PyObject *v;
14459 
14460     if (arg->ch == '(') {
14461         /* Get argument value from a dictionary. Example: "%(name)s". */
14462         Py_ssize_t keystart;
14463         Py_ssize_t keylen;
14464         PyObject *key;
14465         int pcount = 1;
14466 
14467         if (ctx->dict == NULL) {
14468             PyErr_SetString(PyExc_TypeError,
14469                             "format requires a mapping");
14470             return -1;
14471         }
14472         ++ctx->fmtpos;
14473         --ctx->fmtcnt;
14474         keystart = ctx->fmtpos;
14475         /* Skip over balanced parentheses */
14476         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14477             arg->ch = FORMAT_READ(ctx);
14478             if (arg->ch == ')')
14479                 --pcount;
14480             else if (arg->ch == '(')
14481                 ++pcount;
14482             ctx->fmtpos++;
14483         }
14484         keylen = ctx->fmtpos - keystart - 1;
14485         if (ctx->fmtcnt < 0 || pcount > 0) {
14486             PyErr_SetString(PyExc_ValueError,
14487                             "incomplete format key");
14488             return -1;
14489         }
14490         key = PyUnicode_Substring(ctx->fmtstr,
14491                                   keystart, keystart + keylen);
14492         if (key == NULL)
14493             return -1;
14494         if (ctx->args_owned) {
14495             ctx->args_owned = 0;
14496             Py_DECREF(ctx->args);
14497         }
14498         ctx->args = PyObject_GetItem(ctx->dict, key);
14499         Py_DECREF(key);
14500         if (ctx->args == NULL)
14501             return -1;
14502         ctx->args_owned = 1;
14503         ctx->arglen = -1;
14504         ctx->argidx = -2;
14505     }
14506 
14507     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14508     while (--ctx->fmtcnt >= 0) {
14509         arg->ch = FORMAT_READ(ctx);
14510         ctx->fmtpos++;
14511         switch (arg->ch) {
14512         case '-': arg->flags |= F_LJUST; continue;
14513         case '+': arg->flags |= F_SIGN; continue;
14514         case ' ': arg->flags |= F_BLANK; continue;
14515         case '#': arg->flags |= F_ALT; continue;
14516         case '0': arg->flags |= F_ZERO; continue;
14517         }
14518         break;
14519     }
14520 
14521     /* Parse width. Example: "%10s" => width=10 */
14522     if (arg->ch == '*') {
14523         v = unicode_format_getnextarg(ctx);
14524         if (v == NULL)
14525             return -1;
14526         if (!PyLong_Check(v)) {
14527             PyErr_SetString(PyExc_TypeError,
14528                             "* wants int");
14529             return -1;
14530         }
14531         arg->width = PyLong_AsSsize_t(v);
14532         if (arg->width == -1 && PyErr_Occurred())
14533             return -1;
14534         if (arg->width < 0) {
14535             arg->flags |= F_LJUST;
14536             arg->width = -arg->width;
14537         }
14538         if (--ctx->fmtcnt >= 0) {
14539             arg->ch = FORMAT_READ(ctx);
14540             ctx->fmtpos++;
14541         }
14542     }
14543     else if (arg->ch >= '0' && arg->ch <= '9') {
14544         arg->width = arg->ch - '0';
14545         while (--ctx->fmtcnt >= 0) {
14546             arg->ch = FORMAT_READ(ctx);
14547             ctx->fmtpos++;
14548             if (arg->ch < '0' || arg->ch > '9')
14549                 break;
14550             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14551                mixing signed and unsigned comparison. Since arg->ch is between
14552                '0' and '9', casting to int is safe. */
14553             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14554                 PyErr_SetString(PyExc_ValueError,
14555                                 "width too big");
14556                 return -1;
14557             }
14558             arg->width = arg->width*10 + (arg->ch - '0');
14559         }
14560     }
14561 
14562     /* Parse precision. Example: "%.3f" => prec=3 */
14563     if (arg->ch == '.') {
14564         arg->prec = 0;
14565         if (--ctx->fmtcnt >= 0) {
14566             arg->ch = FORMAT_READ(ctx);
14567             ctx->fmtpos++;
14568         }
14569         if (arg->ch == '*') {
14570             v = unicode_format_getnextarg(ctx);
14571             if (v == NULL)
14572                 return -1;
14573             if (!PyLong_Check(v)) {
14574                 PyErr_SetString(PyExc_TypeError,
14575                                 "* wants int");
14576                 return -1;
14577             }
14578             arg->prec = _PyLong_AsInt(v);
14579             if (arg->prec == -1 && PyErr_Occurred())
14580                 return -1;
14581             if (arg->prec < 0)
14582                 arg->prec = 0;
14583             if (--ctx->fmtcnt >= 0) {
14584                 arg->ch = FORMAT_READ(ctx);
14585                 ctx->fmtpos++;
14586             }
14587         }
14588         else if (arg->ch >= '0' && arg->ch <= '9') {
14589             arg->prec = arg->ch - '0';
14590             while (--ctx->fmtcnt >= 0) {
14591                 arg->ch = FORMAT_READ(ctx);
14592                 ctx->fmtpos++;
14593                 if (arg->ch < '0' || arg->ch > '9')
14594                     break;
14595                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14596                     PyErr_SetString(PyExc_ValueError,
14597                                     "precision too big");
14598                     return -1;
14599                 }
14600                 arg->prec = arg->prec*10 + (arg->ch - '0');
14601             }
14602         }
14603     }
14604 
14605     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14606     if (ctx->fmtcnt >= 0) {
14607         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14608             if (--ctx->fmtcnt >= 0) {
14609                 arg->ch = FORMAT_READ(ctx);
14610                 ctx->fmtpos++;
14611             }
14612         }
14613     }
14614     if (ctx->fmtcnt < 0) {
14615         PyErr_SetString(PyExc_ValueError,
14616                         "incomplete format");
14617         return -1;
14618     }
14619     return 0;
14620 
14621 #undef FORMAT_READ
14622 }
14623 
14624 /* Format one argument. Supported conversion specifiers:
14625 
14626    - "s", "r", "a": any type
14627    - "i", "d", "u": int or float
14628    - "o", "x", "X": int
14629    - "e", "E", "f", "F", "g", "G": float
14630    - "c": int or str (1 character)
14631 
14632    When possible, the output is written directly into the Unicode writer
14633    (ctx->writer). A string is created when padding is required.
14634 
14635    Return 0 if the argument has been formatted into *p_str,
14636           1 if the argument has been written into ctx->writer,
14637          -1 on error. */
14638 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14639 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14640                           struct unicode_format_arg_t *arg,
14641                           PyObject **p_str)
14642 {
14643     PyObject *v;
14644     _PyUnicodeWriter *writer = &ctx->writer;
14645 
14646     if (ctx->fmtcnt == 0)
14647         ctx->writer.overallocate = 0;
14648 
14649     v = unicode_format_getnextarg(ctx);
14650     if (v == NULL)
14651         return -1;
14652 
14653 
14654     switch (arg->ch) {
14655     case 's':
14656     case 'r':
14657     case 'a':
14658         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14659             /* Fast path */
14660             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14661                 return -1;
14662             return 1;
14663         }
14664 
14665         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14666             *p_str = v;
14667             Py_INCREF(*p_str);
14668         }
14669         else {
14670             if (arg->ch == 's')
14671                 *p_str = PyObject_Str(v);
14672             else if (arg->ch == 'r')
14673                 *p_str = PyObject_Repr(v);
14674             else
14675                 *p_str = PyObject_ASCII(v);
14676         }
14677         break;
14678 
14679     case 'i':
14680     case 'd':
14681     case 'u':
14682     case 'o':
14683     case 'x':
14684     case 'X':
14685     {
14686         int ret = mainformatlong(v, arg, p_str, writer);
14687         if (ret != 0)
14688             return ret;
14689         arg->sign = 1;
14690         break;
14691     }
14692 
14693     case 'e':
14694     case 'E':
14695     case 'f':
14696     case 'F':
14697     case 'g':
14698     case 'G':
14699         if (arg->width == -1 && arg->prec == -1
14700             && !(arg->flags & (F_SIGN | F_BLANK)))
14701         {
14702             /* Fast path */
14703             if (formatfloat(v, arg, NULL, writer) == -1)
14704                 return -1;
14705             return 1;
14706         }
14707 
14708         arg->sign = 1;
14709         if (formatfloat(v, arg, p_str, NULL) == -1)
14710             return -1;
14711         break;
14712 
14713     case 'c':
14714     {
14715         Py_UCS4 ch = formatchar(v);
14716         if (ch == (Py_UCS4) -1)
14717             return -1;
14718         if (arg->width == -1 && arg->prec == -1) {
14719             /* Fast path */
14720             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14721                 return -1;
14722             return 1;
14723         }
14724         *p_str = PyUnicode_FromOrdinal(ch);
14725         break;
14726     }
14727 
14728     default:
14729         PyErr_Format(PyExc_ValueError,
14730                      "unsupported format character '%c' (0x%x) "
14731                      "at index %zd",
14732                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14733                      (int)arg->ch,
14734                      ctx->fmtpos - 1);
14735         return -1;
14736     }
14737     if (*p_str == NULL)
14738         return -1;
14739     assert (PyUnicode_Check(*p_str));
14740     return 0;
14741 }
14742 
/* Write the already formatted string `str` of one conversion into
   ctx->writer, applying precision truncation (for s, r and a), sign
   handling, placement of the alternate-form numeric prefix, and padding
   to arg->width.

   arg->width and arg->sign are used as scratch state and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only honoured when arg->sign is set. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no forced sign: copy str verbatim. */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string carries its own sign: remember it and skip it,
               so that len counts only what follows. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to emit for this value. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only matters for maxchar if left padding will
       actually be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* The sign character needs one extra slot when the remaining text
       alone already fills the width. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero fill the sign goes before the padding; with space
           fill it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        /* As with the sign: the prefix precedes zero padding, but follows
           space padding (handled further down). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* From here on width and len exclude the two prefix characters. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done; this disables the right padding below. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed.  Right padding always uses spaces, never the
       zero fill character. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14897 
14898 /* Helper of PyUnicode_Format(): format one arg.
14899    Return 0 on success, raise an exception and return -1 on error. */
14900 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14901 unicode_format_arg(struct unicode_formatter_t *ctx)
14902 {
14903     struct unicode_format_arg_t arg;
14904     PyObject *str;
14905     int ret;
14906 
14907     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14908     if (arg.ch == '%') {
14909         ctx->fmtpos++;
14910         ctx->fmtcnt--;
14911         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14912             return -1;
14913         return 0;
14914     }
14915     arg.flags = 0;
14916     arg.width = -1;
14917     arg.prec = -1;
14918     arg.sign = 0;
14919     str = NULL;
14920 
14921     ret = unicode_format_arg_parse(ctx, &arg);
14922     if (ret == -1)
14923         return -1;
14924 
14925     ret = unicode_format_arg_format(ctx, &arg, &str);
14926     if (ret == -1)
14927         return -1;
14928 
14929     if (ret != 1) {
14930         ret = unicode_format_arg_output(ctx, &arg, str);
14931         Py_DECREF(str);
14932         if (ret == -1)
14933             return -1;
14934     }
14935 
14936     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14937         PyErr_SetString(PyExc_TypeError,
14938                         "not all arguments converted during string formatting");
14939         return -1;
14940     }
14941     return 0;
14942 }
14943 
14944 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14945 PyUnicode_Format(PyObject *format, PyObject *args)
14946 {
14947     struct unicode_formatter_t ctx;
14948 
14949     if (format == NULL || args == NULL) {
14950         PyErr_BadInternalCall();
14951         return NULL;
14952     }
14953 
14954     if (ensure_unicode(format) < 0)
14955         return NULL;
14956 
14957     ctx.fmtstr = format;
14958     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14959     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14960     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14961     ctx.fmtpos = 0;
14962 
14963     _PyUnicodeWriter_Init(&ctx.writer);
14964     ctx.writer.min_length = ctx.fmtcnt + 100;
14965     ctx.writer.overallocate = 1;
14966 
14967     if (PyTuple_Check(args)) {
14968         ctx.arglen = PyTuple_Size(args);
14969         ctx.argidx = 0;
14970     }
14971     else {
14972         ctx.arglen = -1;
14973         ctx.argidx = -2;
14974     }
14975     ctx.args_owned = 0;
14976     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14977         ctx.dict = args;
14978     else
14979         ctx.dict = NULL;
14980     ctx.args = args;
14981 
14982     while (--ctx.fmtcnt >= 0) {
14983         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14984             Py_ssize_t nonfmtpos;
14985 
14986             nonfmtpos = ctx.fmtpos++;
14987             while (ctx.fmtcnt >= 0 &&
14988                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14989                 ctx.fmtpos++;
14990                 ctx.fmtcnt--;
14991             }
14992             if (ctx.fmtcnt < 0) {
14993                 ctx.fmtpos--;
14994                 ctx.writer.overallocate = 0;
14995             }
14996 
14997             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14998                                                 nonfmtpos, ctx.fmtpos) < 0)
14999                 goto onError;
15000         }
15001         else {
15002             ctx.fmtpos++;
15003             if (unicode_format_arg(&ctx) == -1)
15004                 goto onError;
15005         }
15006     }
15007 
15008     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15009         PyErr_SetString(PyExc_TypeError,
15010                         "not all arguments converted during string formatting");
15011         goto onError;
15012     }
15013 
15014     if (ctx.args_owned) {
15015         Py_DECREF(ctx.args);
15016     }
15017     return _PyUnicodeWriter_Finish(&ctx.writer);
15018 
15019   onError:
15020     _PyUnicodeWriter_Dealloc(&ctx.writer);
15021     if (ctx.args_owned) {
15022         Py_DECREF(ctx.args);
15023     }
15024     return NULL;
15025 }
15026 
15027 static PyObject *
15028 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15029 
15030 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15031 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15032 {
15033     PyObject *x = NULL;
15034     static char *kwlist[] = {"object", "encoding", "errors", 0};
15035     char *encoding = NULL;
15036     char *errors = NULL;
15037 
15038     if (type != &PyUnicode_Type)
15039         return unicode_subtype_new(type, args, kwds);
15040     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15041                                      kwlist, &x, &encoding, &errors))
15042         return NULL;
15043     if (x == NULL)
15044         _Py_RETURN_UNICODE_EMPTY();
15045     if (encoding == NULL && errors == NULL)
15046         return PyObject_Str(x);
15047     else
15048         return PyUnicode_FromEncodedObject(x, encoding, errors);
15049 }
15050 
15051 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15052 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15053 {
15054     PyObject *unicode, *self;
15055     Py_ssize_t length, char_size;
15056     int share_wstr, share_utf8;
15057     unsigned int kind;
15058     void *data;
15059 
15060     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15061 
15062     unicode = unicode_new(&PyUnicode_Type, args, kwds);
15063     if (unicode == NULL)
15064         return NULL;
15065     assert(_PyUnicode_CHECK(unicode));
15066     if (PyUnicode_READY(unicode) == -1) {
15067         Py_DECREF(unicode);
15068         return NULL;
15069     }
15070 
15071     self = type->tp_alloc(type, 0);
15072     if (self == NULL) {
15073         Py_DECREF(unicode);
15074         return NULL;
15075     }
15076     kind = PyUnicode_KIND(unicode);
15077     length = PyUnicode_GET_LENGTH(unicode);
15078 
15079     _PyUnicode_LENGTH(self) = length;
15080 #ifdef Py_DEBUG
15081     _PyUnicode_HASH(self) = -1;
15082 #else
15083     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15084 #endif
15085     _PyUnicode_STATE(self).interned = 0;
15086     _PyUnicode_STATE(self).kind = kind;
15087     _PyUnicode_STATE(self).compact = 0;
15088     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15089     _PyUnicode_STATE(self).ready = 1;
15090     _PyUnicode_WSTR(self) = NULL;
15091     _PyUnicode_UTF8_LENGTH(self) = 0;
15092     _PyUnicode_UTF8(self) = NULL;
15093     _PyUnicode_WSTR_LENGTH(self) = 0;
15094     _PyUnicode_DATA_ANY(self) = NULL;
15095 
15096     share_utf8 = 0;
15097     share_wstr = 0;
15098     if (kind == PyUnicode_1BYTE_KIND) {
15099         char_size = 1;
15100         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15101             share_utf8 = 1;
15102     }
15103     else if (kind == PyUnicode_2BYTE_KIND) {
15104         char_size = 2;
15105         if (sizeof(wchar_t) == 2)
15106             share_wstr = 1;
15107     }
15108     else {
15109         assert(kind == PyUnicode_4BYTE_KIND);
15110         char_size = 4;
15111         if (sizeof(wchar_t) == 4)
15112             share_wstr = 1;
15113     }
15114 
15115     /* Ensure we won't overflow the length. */
15116     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15117         PyErr_NoMemory();
15118         goto onError;
15119     }
15120     data = PyObject_MALLOC((length + 1) * char_size);
15121     if (data == NULL) {
15122         PyErr_NoMemory();
15123         goto onError;
15124     }
15125 
15126     _PyUnicode_DATA_ANY(self) = data;
15127     if (share_utf8) {
15128         _PyUnicode_UTF8_LENGTH(self) = length;
15129         _PyUnicode_UTF8(self) = data;
15130     }
15131     if (share_wstr) {
15132         _PyUnicode_WSTR_LENGTH(self) = length;
15133         _PyUnicode_WSTR(self) = (wchar_t *)data;
15134     }
15135 
15136     memcpy(data, PyUnicode_DATA(unicode),
15137               kind * (length + 1));
15138     assert(_PyUnicode_CheckConsistency(self, 1));
15139 #ifdef Py_DEBUG
15140     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15141 #endif
15142     Py_DECREF(unicode);
15143     return self;
15144 
15145 onError:
15146     Py_DECREF(unicode);
15147     Py_DECREF(self);
15148     return NULL;
15149 }
15150 
15151 PyDoc_STRVAR(unicode_doc,
15152 "str(object='') -> str\n\
15153 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15154 \n\
15155 Create a new string object from the given object. If encoding or\n\
15156 errors is specified, then the object must expose a data buffer\n\
15157 that will be decoded using the given encoding and error handler.\n\
15158 Otherwise, returns the result of object.__str__() (if defined)\n\
15159 or repr(object).\n\
15160 encoding defaults to sys.getdefaultencoding().\n\
15161 errors defaults to 'strict'.");
15162 
15163 static PyObject *unicode_iter(PyObject *seq);
15164 
15165 PyTypeObject PyUnicode_Type = {
15166     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15167     "str",                        /* tp_name */
15168     sizeof(PyUnicodeObject),      /* tp_basicsize */
15169     0,                            /* tp_itemsize */
15170     /* Slots */
15171     (destructor)unicode_dealloc,  /* tp_dealloc */
15172     0,                            /* tp_vectorcall_offset */
15173     0,                            /* tp_getattr */
15174     0,                            /* tp_setattr */
15175     0,                            /* tp_as_async */
15176     unicode_repr,                 /* tp_repr */
15177     &unicode_as_number,           /* tp_as_number */
15178     &unicode_as_sequence,         /* tp_as_sequence */
15179     &unicode_as_mapping,          /* tp_as_mapping */
15180     (hashfunc) unicode_hash,      /* tp_hash*/
15181     0,                            /* tp_call*/
15182     (reprfunc) unicode_str,       /* tp_str */
15183     PyObject_GenericGetAttr,      /* tp_getattro */
15184     0,                            /* tp_setattro */
15185     0,                            /* tp_as_buffer */
15186     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15187     Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */
15188     unicode_doc,                  /* tp_doc */
15189     0,                            /* tp_traverse */
15190     0,                            /* tp_clear */
15191     PyUnicode_RichCompare,        /* tp_richcompare */
15192     0,                            /* tp_weaklistoffset */
15193     unicode_iter,                 /* tp_iter */
15194     0,                            /* tp_iternext */
15195     unicode_methods,              /* tp_methods */
15196     0,                            /* tp_members */
15197     0,                            /* tp_getset */
15198     &PyBaseObject_Type,           /* tp_base */
15199     0,                            /* tp_dict */
15200     0,                            /* tp_descr_get */
15201     0,                            /* tp_descr_set */
15202     0,                            /* tp_dictoffset */
15203     0,                            /* tp_init */
15204     0,                            /* tp_alloc */
15205     unicode_new,                  /* tp_new */
15206     PyObject_Del,                 /* tp_free */
15207 };
15208 
15209 /* Initialize the Unicode implementation */
15210 
15211 PyStatus
_PyUnicode_Init(void)15212 _PyUnicode_Init(void)
15213 {
15214     /* XXX - move this array to unicodectype.c ? */
15215     Py_UCS2 linebreak[] = {
15216         0x000A, /* LINE FEED */
15217         0x000D, /* CARRIAGE RETURN */
15218         0x001C, /* FILE SEPARATOR */
15219         0x001D, /* GROUP SEPARATOR */
15220         0x001E, /* RECORD SEPARATOR */
15221         0x0085, /* NEXT LINE */
15222         0x2028, /* LINE SEPARATOR */
15223         0x2029, /* PARAGRAPH SEPARATOR */
15224     };
15225 
15226     /* Init the implementation */
15227     _Py_INCREF_UNICODE_EMPTY();
15228     if (!unicode_empty) {
15229         return _PyStatus_ERR("Can't create empty string");
15230     }
15231     Py_DECREF(unicode_empty);
15232 
15233     if (PyType_Ready(&PyUnicode_Type) < 0) {
15234         return _PyStatus_ERR("Can't initialize unicode type");
15235     }
15236 
15237     /* initialize the linebreak bloom filter */
15238     bloom_linebreak = make_bloom_mask(
15239         PyUnicode_2BYTE_KIND, linebreak,
15240         Py_ARRAY_LENGTH(linebreak));
15241 
15242     if (PyType_Ready(&EncodingMapType) < 0) {
15243          return _PyStatus_ERR("Can't initialize encoding map type");
15244     }
15245     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15246         return _PyStatus_ERR("Can't initialize field name iterator type");
15247     }
15248     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15249         return _PyStatus_ERR("Can't initialize formatter iter type");
15250     }
15251     return _PyStatus_OK();
15252 }
15253 
/* Clear the str free list.  This implementation does nothing and always
   reports that zero items were freed. */

int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15261 
15262 
15263 void
PyUnicode_InternInPlace(PyObject ** p)15264 PyUnicode_InternInPlace(PyObject **p)
15265 {
15266     PyObject *s = *p;
15267     PyObject *t;
15268 #ifdef Py_DEBUG
15269     assert(s != NULL);
15270     assert(_PyUnicode_CHECK(s));
15271 #else
15272     if (s == NULL || !PyUnicode_Check(s))
15273         return;
15274 #endif
15275     /* If it's a subclass, we don't really know what putting
15276        it in the interned dict might do. */
15277     if (!PyUnicode_CheckExact(s))
15278         return;
15279     if (PyUnicode_CHECK_INTERNED(s))
15280         return;
15281     if (interned == NULL) {
15282         interned = PyDict_New();
15283         if (interned == NULL) {
15284             PyErr_Clear(); /* Don't leave an exception */
15285             return;
15286         }
15287     }
15288     Py_ALLOW_RECURSION
15289     t = PyDict_SetDefault(interned, s, s);
15290     Py_END_ALLOW_RECURSION
15291     if (t == NULL) {
15292         PyErr_Clear();
15293         return;
15294     }
15295     if (t != s) {
15296         Py_INCREF(t);
15297         Py_SETREF(*p, t);
15298         return;
15299     }
15300     /* The two references in interned are not counted by refcnt.
15301        The deallocator will take care of this */
15302     Py_REFCNT(s) -= 2;
15303     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15304 }
15305 
15306 void
PyUnicode_InternImmortal(PyObject ** p)15307 PyUnicode_InternImmortal(PyObject **p)
15308 {
15309     PyUnicode_InternInPlace(p);
15310     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15311         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15312         Py_INCREF(*p);
15313     }
15314 }
15315 
15316 PyObject *
PyUnicode_InternFromString(const char * cp)15317 PyUnicode_InternFromString(const char *cp)
15318 {
15319     PyObject *s = PyUnicode_FromString(cp);
15320     if (s == NULL)
15321         return NULL;
15322     PyUnicode_InternInPlace(&s);
15323     return s;
15324 }
15325 
15326 
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Un-intern every string held in the `interned` dict and drop the dict.

   Since unicode_release_interned() is intended to help a leak
   detector, interned unicode strings are not forcibly deallocated;
   rather, we give them their stolen references back, and then clear
   and DECREF the interned dict. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t count, idx;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            /* Restore the two references that interning removed. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15386 
15387 
15388 /********************* Unicode Iterator **************************/
15389 
/* State of a str iterator: the string being iterated (a strong reference)
   and the index of the next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15395 
15396 static void
unicodeiter_dealloc(unicodeiterobject * it)15397 unicodeiter_dealloc(unicodeiterobject *it)
15398 {
15399     _PyObject_GC_UNTRACK(it);
15400     Py_XDECREF(it->it_seq);
15401     PyObject_GC_Del(it);
15402 }
15403 
15404 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15405 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406 {
15407     Py_VISIT(it->it_seq);
15408     return 0;
15409 }
15410 
15411 static PyObject *
unicodeiter_next(unicodeiterobject * it)15412 unicodeiter_next(unicodeiterobject *it)
15413 {
15414     PyObject *seq, *item;
15415 
15416     assert(it != NULL);
15417     seq = it->it_seq;
15418     if (seq == NULL)
15419         return NULL;
15420     assert(_PyUnicode_CHECK(seq));
15421 
15422     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423         int kind = PyUnicode_KIND(seq);
15424         void *data = PyUnicode_DATA(seq);
15425         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426         item = PyUnicode_FromOrdinal(chr);
15427         if (item != NULL)
15428             ++it->it_index;
15429         return item;
15430     }
15431 
15432     it->it_seq = NULL;
15433     Py_DECREF(seq);
15434     return NULL;
15435 }
15436 
15437 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15438 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15439 {
15440     Py_ssize_t len = 0;
15441     if (it->it_seq)
15442         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15443     return PyLong_FromSsize_t(len);
15444 }
15445 
15446 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447 
15448 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15449 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15450 {
15451     _Py_IDENTIFIER(iter);
15452     if (it->it_seq != NULL) {
15453         return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15454                              it->it_seq, it->it_index);
15455     } else {
15456         PyObject *u = (PyObject *)_PyUnicode_New(0);
15457         if (u == NULL)
15458             return NULL;
15459         return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15460     }
15461 }
15462 
15463 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15464 
15465 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15466 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15467 {
15468     Py_ssize_t index = PyLong_AsSsize_t(state);
15469     if (index == -1 && PyErr_Occurred())
15470         return NULL;
15471     if (it->it_seq != NULL) {
15472         if (index < 0)
15473             index = 0;
15474         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15475             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15476         it->it_index = index;
15477     }
15478     Py_RETURN_NONE;
15479 }
15480 
15481 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15482 
15483 static PyMethodDef unicodeiter_methods[] = {
15484     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15485      length_hint_doc},
15486     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15487      reduce_doc},
15488     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15489      setstate_doc},
15490     {NULL,      NULL}       /* sentinel */
15491 };
15492 
15493 PyTypeObject PyUnicodeIter_Type = {
15494     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15495     "str_iterator",         /* tp_name */
15496     sizeof(unicodeiterobject),      /* tp_basicsize */
15497     0,                  /* tp_itemsize */
15498     /* methods */
15499     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15500     0,                  /* tp_vectorcall_offset */
15501     0,                  /* tp_getattr */
15502     0,                  /* tp_setattr */
15503     0,                  /* tp_as_async */
15504     0,                  /* tp_repr */
15505     0,                  /* tp_as_number */
15506     0,                  /* tp_as_sequence */
15507     0,                  /* tp_as_mapping */
15508     0,                  /* tp_hash */
15509     0,                  /* tp_call */
15510     0,                  /* tp_str */
15511     PyObject_GenericGetAttr,        /* tp_getattro */
15512     0,                  /* tp_setattro */
15513     0,                  /* tp_as_buffer */
15514     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15515     0,                  /* tp_doc */
15516     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15517     0,                  /* tp_clear */
15518     0,                  /* tp_richcompare */
15519     0,                  /* tp_weaklistoffset */
15520     PyObject_SelfIter,          /* tp_iter */
15521     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15522     unicodeiter_methods,            /* tp_methods */
15523     0,
15524 };
15525 
15526 static PyObject *
unicode_iter(PyObject * seq)15527 unicode_iter(PyObject *seq)
15528 {
15529     unicodeiterobject *it;
15530 
15531     if (!PyUnicode_Check(seq)) {
15532         PyErr_BadInternalCall();
15533         return NULL;
15534     }
15535     if (PyUnicode_READY(seq) == -1)
15536         return NULL;
15537     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15538     if (it == NULL)
15539         return NULL;
15540     it->it_index = 0;
15541     Py_INCREF(seq);
15542     it->it_seq = seq;
15543     _PyObject_GC_TRACK(it);
15544     return (PyObject *)it;
15545 }
15546 
15547 
15548 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15549 Py_UNICODE_strlen(const Py_UNICODE *u)
15550 {
15551     return wcslen(u);
15552 }
15553 
15554 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15555 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15556 {
15557     Py_UNICODE *u = s1;
15558     while ((*u++ = *s2++));
15559     return s1;
15560 }
15561 
15562 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15563 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15564 {
15565     Py_UNICODE *u = s1;
15566     while ((*u++ = *s2++))
15567         if (n-- == 0)
15568             break;
15569     return s1;
15570 }
15571 
15572 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15573 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15574 {
15575     Py_UNICODE *u1 = s1;
15576     u1 += wcslen(u1);
15577     while ((*u1++ = *s2++));
15578     return s1;
15579 }
15580 
15581 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15582 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15583 {
15584     while (*s1 && *s2 && *s1 == *s2)
15585         s1++, s2++;
15586     if (*s1 && *s2)
15587         return (*s1 < *s2) ? -1 : +1;
15588     if (*s1)
15589         return 1;
15590     if (*s2)
15591         return -1;
15592     return 0;
15593 }
15594 
15595 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15596 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15597 {
15598     Py_UNICODE u1, u2;
15599     for (; n != 0; n--) {
15600         u1 = *s1;
15601         u2 = *s2;
15602         if (u1 != u2)
15603             return (u1 < u2) ? -1 : +1;
15604         if (u1 == '\0')
15605             return 0;
15606         s1++;
15607         s2++;
15608     }
15609     return 0;
15610 }
15611 
15612 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15613 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15614 {
15615     const Py_UNICODE *p;
15616     for (p = s; *p; p++)
15617         if (*p == c)
15618             return (Py_UNICODE*)p;
15619     return NULL;
15620 }
15621 
15622 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15623 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15624 {
15625     const Py_UNICODE *p;
15626     p = s + wcslen(s);
15627     while (p != s) {
15628         p--;
15629         if (*p == c)
15630             return (Py_UNICODE*)p;
15631     }
15632     return NULL;
15633 }
15634 
15635 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15636 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15637 {
15638     Py_UNICODE *u, *copy;
15639     Py_ssize_t len, size;
15640 
15641     if (!PyUnicode_Check(unicode)) {
15642         PyErr_BadArgument();
15643         return NULL;
15644     }
15645     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15646     if (u == NULL)
15647         return NULL;
15648     /* Ensure we won't overflow the size. */
15649     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15650         PyErr_NoMemory();
15651         return NULL;
15652     }
15653     size = len + 1; /* copy the null character */
15654     size *= sizeof(Py_UNICODE);
15655     copy = PyMem_Malloc(size);
15656     if (copy == NULL) {
15657         PyErr_NoMemory();
15658         return NULL;
15659     }
15660     memcpy(copy, u, size);
15661     return copy;
15662 }
15663 
15664 
15665 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15666 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15667 {
15668     int res;
15669     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15670     if (res == -2) {
15671         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15672         return -1;
15673     }
15674     if (res < 0) {
15675         PyErr_NoMemory();
15676         return -1;
15677     }
15678     return 0;
15679 }
15680 
15681 
15682 static int
config_get_codec_name(wchar_t ** config_encoding)15683 config_get_codec_name(wchar_t **config_encoding)
15684 {
15685     char *encoding;
15686     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15687         return -1;
15688     }
15689 
15690     PyObject *name_obj = NULL;
15691     PyObject *codec = _PyCodec_Lookup(encoding);
15692     PyMem_RawFree(encoding);
15693 
15694     if (!codec)
15695         goto error;
15696 
15697     name_obj = PyObject_GetAttrString(codec, "name");
15698     Py_CLEAR(codec);
15699     if (!name_obj) {
15700         goto error;
15701     }
15702 
15703     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15704     Py_DECREF(name_obj);
15705     if (wname == NULL) {
15706         goto error;
15707     }
15708 
15709     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15710     if (raw_wname == NULL) {
15711         PyMem_Free(wname);
15712         PyErr_NoMemory();
15713         goto error;
15714     }
15715 
15716     PyMem_RawFree(*config_encoding);
15717     *config_encoding = raw_wname;
15718 
15719     PyMem_Free(wname);
15720     return 0;
15721 
15722 error:
15723     Py_XDECREF(codec);
15724     Py_XDECREF(name_obj);
15725     return -1;
15726 }
15727 
15728 
15729 static PyStatus
init_stdio_encoding(PyThreadState * tstate)15730 init_stdio_encoding(PyThreadState *tstate)
15731 {
15732     /* Update the stdio encoding to the normalized Python codec name. */
15733     PyConfig *config = &tstate->interp->config;
15734     if (config_get_codec_name(&config->stdio_encoding) < 0) {
15735         return _PyStatus_ERR("failed to get the Python codec name "
15736                              "of the stdio encoding");
15737     }
15738     return _PyStatus_OK();
15739 }
15740 
15741 
15742 static int
init_fs_codec(PyInterpreterState * interp)15743 init_fs_codec(PyInterpreterState *interp)
15744 {
15745     PyConfig *config = &interp->config;
15746 
15747     _Py_error_handler error_handler;
15748     error_handler = get_error_handler_wide(config->filesystem_errors);
15749     if (error_handler == _Py_ERROR_UNKNOWN) {
15750         PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15751         return -1;
15752     }
15753 
15754     char *encoding, *errors;
15755     if (encode_wstr_utf8(config->filesystem_encoding,
15756                          &encoding,
15757                          "filesystem_encoding") < 0) {
15758         return -1;
15759     }
15760 
15761     if (encode_wstr_utf8(config->filesystem_errors,
15762                          &errors,
15763                          "filesystem_errors") < 0) {
15764         PyMem_RawFree(encoding);
15765         return -1;
15766     }
15767 
15768     PyMem_RawFree(interp->fs_codec.encoding);
15769     interp->fs_codec.encoding = encoding;
15770     PyMem_RawFree(interp->fs_codec.errors);
15771     interp->fs_codec.errors = errors;
15772     interp->fs_codec.error_handler = error_handler;
15773 
15774     /* At this point, PyUnicode_EncodeFSDefault() and
15775        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15776        the C implementation of the filesystem encoding. */
15777 
15778     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15779        global configuration variables. */
15780     if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15781                                   interp->fs_codec.errors) < 0) {
15782         PyErr_NoMemory();
15783         return -1;
15784     }
15785     return 0;
15786 }
15787 
15788 
15789 static PyStatus
init_fs_encoding(PyThreadState * tstate)15790 init_fs_encoding(PyThreadState *tstate)
15791 {
15792     PyInterpreterState *interp = tstate->interp;
15793 
15794     /* Update the filesystem encoding to the normalized Python codec name.
15795        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15796        (Python codec name). */
15797     PyConfig *config = &interp->config;
15798     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15799         _Py_DumpPathConfig(tstate);
15800         return _PyStatus_ERR("failed to get the Python codec "
15801                              "of the filesystem encoding");
15802     }
15803 
15804     if (init_fs_codec(interp) < 0) {
15805         return _PyStatus_ERR("cannot initialize filesystem codec");
15806     }
15807     return _PyStatus_OK();
15808 }
15809 
15810 
15811 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15812 _PyUnicode_InitEncodings(PyThreadState *tstate)
15813 {
15814     PyStatus status = init_fs_encoding(tstate);
15815     if (_PyStatus_EXCEPTION(status)) {
15816         return status;
15817     }
15818 
15819     return init_stdio_encoding(tstate);
15820 }
15821 
15822 
#ifdef MS_WINDOWS
/* Switch the filesystem encoding to mbcs/replace (PEP 529) and
   re-run init_fs_codec() for the current interpreter.

   Returns -1 after calling PyErr_NoMemory() if either new string cannot
   be allocated; otherwise returns the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacement strings up front so the configuration is
       left untouched when either allocation fails. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Drop the previous strings and store the new ones in the config. */
    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif /* MS_WINDOWS */
15848 
15849 
15850 void
_PyUnicode_Fini(void)15851 _PyUnicode_Fini(void)
15852 {
15853 #if defined(WITH_VALGRIND) || defined(__INSURE__)
15854     /* Insure++ is a memory analysis tool that aids in discovering
15855      * memory leaks and other memory problems.  On Python exit, the
15856      * interned string dictionaries are flagged as being in use at exit
15857      * (which it is).  Under normal circumstances, this is fine because
15858      * the memory will be automatically reclaimed by the system.  Under
15859      * memory debugging, it's a huge source of useless noise, so we
15860      * trade off slower shutdown for less distraction in the memory
15861      * reports.  -baw
15862      */
15863     unicode_release_interned();
15864 #endif /* __INSURE__ */
15865 
15866     Py_CLEAR(unicode_empty);
15867 
15868     for (Py_ssize_t i = 0; i < 256; i++) {
15869         Py_CLEAR(unicode_latin1[i]);
15870     }
15871     _PyUnicode_ClearStaticStrings();
15872     (void)PyUnicode_ClearFreeList();
15873 
15874     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15875     PyMem_RawFree(interp->fs_codec.encoding);
15876     interp->fs_codec.encoding = NULL;
15877     PyMem_RawFree(interp->fs_codec.errors);
15878     interp->fs_codec.errors = NULL;
15879 }
15880 
15881 
15882 /* A _string module, to export formatter_parser and formatter_field_name_split
15883    to the string.Formatter class implemented in Python. */
15884 
15885 static PyMethodDef _string_methods[] = {
15886     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15887      METH_O, PyDoc_STR("split the argument as a field name")},
15888     {"formatter_parser", (PyCFunction) formatter_parser,
15889      METH_O, PyDoc_STR("parse the argument as a format string")},
15890     {NULL, NULL}
15891 };
15892 
15893 static struct PyModuleDef _string_module = {
15894     PyModuleDef_HEAD_INIT,
15895     "_string",
15896     PyDoc_STR("string helper module"),
15897     0,
15898     _string_methods,
15899     NULL,
15900     NULL,
15901     NULL,
15902     NULL
15903 };
15904 
15905 PyMODINIT_FUNC
PyInit__string(void)15906 PyInit__string(void)
15907 {
15908     return PyModule_Create(&_string_module);
15909 }
15910 
15911 
15912 #ifdef __cplusplus
15913 }
15914 #endif
15915