• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h"      // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
46 #include "pycore_format.h"        // F_LJUST
47 #include "pycore_initconfig.h"    // _PyStatus_OK()
48 #include "pycore_interp.h"        // PyInterpreterState.fs_codec
49 #include "pycore_object.h"        // _PyObject_GC_TRACK()
50 #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
51 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
52 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
53 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
54 #include "stringlib/eq.h"         // unicode_eq()
55 
56 #ifdef MS_WINDOWS
57 #include <windows.h>
58 #endif
59 
60 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61 #include "pycore_fileutils.h"     // _Py_LocaleUsesNonUnicodeWchar()
62 #endif
63 
64 /* Uncomment to display statistics on interned strings at exit
65    in _PyUnicode_ClearInterned(). */
66 /* #define INTERNED_STATS 1 */
67 
68 
69 /*[clinic input]
70 class str "PyObject *" "&PyUnicode_Type"
71 [clinic start generated code]*/
72 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73 
74 /*[python input]
75 class Py_UCS4_converter(CConverter):
76     type = 'Py_UCS4'
77     converter = 'convert_uc'
78 
79     def converter_init(self):
80         if self.default is not unspecified:
81             self.c_default = ascii(self.default)
82             if len(self.c_default) > 4 or self.c_default[0] != "'":
83                 self.c_default = hex(ord(self.default))
84 
85 [python start generated code]*/
86 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
87 
88 /* --- Globals ------------------------------------------------------------
89 
90 NOTE: In the interpreter's initialization phase, some globals are currently
91       initialized dynamically as needed. In the process Unicode objects may
92       be created before the Unicode type is ready.
93 
94 */
95 
96 
97 #ifdef __cplusplus
98 extern "C" {
99 #endif
100 
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the checked accessor macros below: the structural
   consistency check (without the content scan) in debug builds, a plain
   type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for a compact ASCII string this is the
   memory that immediately follows the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: the string length for a compact ASCII string,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr field. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* wstr length: the string length for a compact ASCII string, otherwise the
   wstr_length field.  The argument is parenthesised before being cast so
   that an expression argument is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked accessors for the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
152 
/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* true if the utf8 pointer is the same address as the character data
   (must not be used on compact ASCII strings) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same address as the character data.
   Both sides use the macro argument; the previous definition read a
   variable named "unicode" from the expansion site instead of "op". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
181 
/* Copy a run of characters while converting the element type.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (as "from_type *"); to points at the destination
   buffer (as "to_type *").  Each element is converted with a plain cast.
   The bulk of the run is copied four elements per iteration, the remaining
   0..3 elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_dst = (to_type *)(to);                \
        const from_type *_src = (const from_type *)(begin);\
        const from_type *_stop = (const from_type *)(end);\
        Py_ssize_t _count = _stop - _src;               \
        const from_type *_quad_stop =                   \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);      \
        for (; _src < _quad_stop; _src += 4, _dst += 4) { \
            _dst[0] = (to_type) _src[0];                \
            _dst[1] = (to_type) _src[1];                \
            _dst[2] = (to_type) _src[2];                \
            _dst[3] = (to_type) _src[3];                \
        }                                               \
        for (; _src < _stop; _src++, _dst++) {          \
            *_dst = (to_type) *_src;                    \
        }                                               \
    } while (0)
205 
206 #ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best */
208 #  define OVERALLOCATE_FACTOR 2
209 #else
   /* On Linux, overallocating by 25% works best */
211 #  define OVERALLOCATE_FACTOR 4
212 #endif
213 
214 /* bpo-40521: Interned strings are shared by all interpreters. */
215 #ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
216 #  define INTERNED_STRINGS
217 #endif
218 
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
227 #ifdef INTERNED_STRINGS
228 static PyObject *interned = NULL;
229 #endif
230 
231 static struct _Py_unicode_state*
get_unicode_state(void)232 get_unicode_state(void)
233 {
234     PyInterpreterState *interp = _PyInterpreterState_GET();
235     return &interp->unicode;
236 }
237 
238 
239 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)240 static inline PyObject* unicode_get_empty(void)
241 {
242     struct _Py_unicode_state *state = get_unicode_state();
243     // unicode_get_empty() must not be called before _PyUnicode_Init()
244     // or after _PyUnicode_Fini()
245     assert(state->empty_string != NULL);
246     return state->empty_string;
247 }
248 
249 
250 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)251 static inline PyObject* unicode_new_empty(void)
252 {
253     PyObject *empty = unicode_get_empty();
254     Py_INCREF(empty);
255     return empty;
256 }
257 
258 #define _Py_RETURN_UNICODE_EMPTY()   \
259     do {                             \
260         return unicode_new_empty();  \
261     } while (0)
262 
263 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)264 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265              Py_ssize_t start, Py_ssize_t length)
266 {
267     assert(0 <= start);
268     assert(kind != PyUnicode_WCHAR_KIND);
269     switch (kind) {
270     case PyUnicode_1BYTE_KIND: {
271         assert(value <= 0xff);
272         Py_UCS1 ch = (unsigned char)value;
273         Py_UCS1 *to = (Py_UCS1 *)data + start;
274         memset(to, ch, length);
275         break;
276     }
277     case PyUnicode_2BYTE_KIND: {
278         assert(value <= 0xffff);
279         Py_UCS2 ch = (Py_UCS2)value;
280         Py_UCS2 *to = (Py_UCS2 *)data + start;
281         const Py_UCS2 *end = to + length;
282         for (; to < end; ++to) *to = ch;
283         break;
284     }
285     case PyUnicode_4BYTE_KIND: {
286         assert(value <= MAX_UNICODE);
287         Py_UCS4 ch = value;
288         Py_UCS4 * to = (Py_UCS4 *)data + start;
289         const Py_UCS4 *end = to + length;
290         for (; to < end; ++to) *to = ch;
291         break;
292     }
293     default: Py_UNREACHABLE();
294     }
295 }
296 
297 
298 /* Forward declaration */
299 static inline int
300 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
301 static inline void
302 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
303 static PyObject *
304 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
305                     const char *errors);
306 static PyObject *
307 unicode_decode_utf8(const char *s, Py_ssize_t size,
308                     _Py_error_handler error_handler, const char *errors,
309                     Py_ssize_t *consumed);
310 
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry of 1 marks a whitespace character,
   every entry not listed below is 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
341 
342 /* forward */
343 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
344 static PyObject* get_latin1_char(unsigned char ch);
345 static int unicode_modifiable(PyObject *unicode);
346 
347 
348 static PyObject *
349 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
350 static PyObject *
351 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
352 static PyObject *
353 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
354 
355 static PyObject *
356 unicode_encode_call_errorhandler(const char *errors,
357        PyObject **errorHandler,const char *encoding, const char *reason,
358        PyObject *unicode, PyObject **exceptionObject,
359        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
360 
361 static void
362 raise_encode_exception(PyObject **exceptionObject,
363                        const char *encoding,
364                        PyObject *unicode,
365                        Py_ssize_t startpos, Py_ssize_t endpos,
366                        const char *reason);
367 
/* Same for linebreaks.
   Indexed by code point 0..127; an entry of 1 marks a line break character,
   every entry not listed below is 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
395 
396 static int convert_uc(PyObject *obj, void *addr);
397 
398 #include "clinic/unicodeobject.c.h"
399 
400 _Py_error_handler
_Py_GetErrorHandler(const char * errors)401 _Py_GetErrorHandler(const char *errors)
402 {
403     if (errors == NULL || strcmp(errors, "strict") == 0) {
404         return _Py_ERROR_STRICT;
405     }
406     if (strcmp(errors, "surrogateescape") == 0) {
407         return _Py_ERROR_SURROGATEESCAPE;
408     }
409     if (strcmp(errors, "replace") == 0) {
410         return _Py_ERROR_REPLACE;
411     }
412     if (strcmp(errors, "ignore") == 0) {
413         return _Py_ERROR_IGNORE;
414     }
415     if (strcmp(errors, "backslashreplace") == 0) {
416         return _Py_ERROR_BACKSLASHREPLACE;
417     }
418     if (strcmp(errors, "surrogatepass") == 0) {
419         return _Py_ERROR_SURROGATEPASS;
420     }
421     if (strcmp(errors, "xmlcharrefreplace") == 0) {
422         return _Py_ERROR_XMLCHARREFREPLACE;
423     }
424     return _Py_ERROR_OTHER;
425 }
426 
427 
428 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)429 get_error_handler_wide(const wchar_t *errors)
430 {
431     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
432         return _Py_ERROR_STRICT;
433     }
434     if (wcscmp(errors, L"surrogateescape") == 0) {
435         return _Py_ERROR_SURROGATEESCAPE;
436     }
437     if (wcscmp(errors, L"replace") == 0) {
438         return _Py_ERROR_REPLACE;
439     }
440     if (wcscmp(errors, L"ignore") == 0) {
441         return _Py_ERROR_IGNORE;
442     }
443     if (wcscmp(errors, L"backslashreplace") == 0) {
444         return _Py_ERROR_BACKSLASHREPLACE;
445     }
446     if (wcscmp(errors, L"surrogatepass") == 0) {
447         return _Py_ERROR_SURROGATEPASS;
448     }
449     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
450         return _Py_ERROR_XMLCHARREFREPLACE;
451     }
452     return _Py_ERROR_OTHER;
453 }
454 
455 
456 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)457 unicode_check_encoding_errors(const char *encoding, const char *errors)
458 {
459     if (encoding == NULL && errors == NULL) {
460         return 0;
461     }
462 
463     PyInterpreterState *interp = _PyInterpreterState_GET();
464 #ifndef Py_DEBUG
465     /* In release mode, only check in development mode (-X dev) */
466     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
467         return 0;
468     }
469 #else
470     /* Always check in debug mode */
471 #endif
472 
473     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
474        codec registry is ready: before_PyUnicode_InitEncodings() is called. */
475     if (!interp->unicode.fs_codec.encoding) {
476         return 0;
477     }
478 
479     /* Disable checks during Python finalization. For example, it allows to
480        call _PyObject_Dump() during finalization for debugging purpose. */
481     if (interp->finalizing) {
482         return 0;
483     }
484 
485     if (encoding != NULL) {
486         PyObject *handler = _PyCodec_Lookup(encoding);
487         if (handler == NULL) {
488             return -1;
489         }
490         Py_DECREF(handler);
491     }
492 
493     if (errors != NULL) {
494         PyObject *handler = PyCodec_LookupError(errors);
495         if (handler == NULL) {
496             return -1;
497         }
498         Py_DECREF(handler);
499     }
500     return 0;
501 }
502 
503 
504 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)505 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
506 {
507 #define CHECK(expr) \
508     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
509 
510     PyASCIIObject *ascii;
511     unsigned int kind;
512 
513     assert(op != NULL);
514     CHECK(PyUnicode_Check(op));
515 
516     ascii = (PyASCIIObject *)op;
517     kind = ascii->state.kind;
518 
519     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
520         CHECK(kind == PyUnicode_1BYTE_KIND);
521         CHECK(ascii->state.ready == 1);
522     }
523     else {
524         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
525         void *data;
526 
527         if (ascii->state.compact == 1) {
528             data = compact + 1;
529             CHECK(kind == PyUnicode_1BYTE_KIND
530                                  || kind == PyUnicode_2BYTE_KIND
531                                  || kind == PyUnicode_4BYTE_KIND);
532             CHECK(ascii->state.ascii == 0);
533             CHECK(ascii->state.ready == 1);
534             CHECK(compact->utf8 != data);
535         }
536         else {
537             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
538 
539             data = unicode->data.any;
540             if (kind == PyUnicode_WCHAR_KIND) {
541                 CHECK(ascii->length == 0);
542                 CHECK(ascii->hash == -1);
543                 CHECK(ascii->state.compact == 0);
544                 CHECK(ascii->state.ascii == 0);
545                 CHECK(ascii->state.ready == 0);
546                 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
547                 CHECK(ascii->wstr != NULL);
548                 CHECK(data == NULL);
549                 CHECK(compact->utf8 == NULL);
550             }
551             else {
552                 CHECK(kind == PyUnicode_1BYTE_KIND
553                                      || kind == PyUnicode_2BYTE_KIND
554                                      || kind == PyUnicode_4BYTE_KIND);
555                 CHECK(ascii->state.compact == 0);
556                 CHECK(ascii->state.ready == 1);
557                 CHECK(data != NULL);
558                 if (ascii->state.ascii) {
559                     CHECK(compact->utf8 == data);
560                     CHECK(compact->utf8_length == ascii->length);
561                 }
562                 else
563                     CHECK(compact->utf8 != data);
564             }
565         }
566         if (kind != PyUnicode_WCHAR_KIND) {
567             if (
568 #if SIZEOF_WCHAR_T == 2
569                 kind == PyUnicode_2BYTE_KIND
570 #else
571                 kind == PyUnicode_4BYTE_KIND
572 #endif
573                )
574             {
575                 CHECK(ascii->wstr == data);
576                 CHECK(compact->wstr_length == ascii->length);
577             } else
578                 CHECK(ascii->wstr != data);
579         }
580 
581         if (compact->utf8 == NULL)
582             CHECK(compact->utf8_length == 0);
583         if (ascii->wstr == NULL)
584             CHECK(compact->wstr_length == 0);
585     }
586 
587     /* check that the best kind is used: O(n) operation */
588     if (check_content && kind != PyUnicode_WCHAR_KIND) {
589         Py_ssize_t i;
590         Py_UCS4 maxchar = 0;
591         const void *data;
592         Py_UCS4 ch;
593 
594         data = PyUnicode_DATA(ascii);
595         for (i=0; i < ascii->length; i++)
596         {
597             ch = PyUnicode_READ(kind, data, i);
598             if (ch > maxchar)
599                 maxchar = ch;
600         }
601         if (kind == PyUnicode_1BYTE_KIND) {
602             if (ascii->state.ascii == 0) {
603                 CHECK(maxchar >= 128);
604                 CHECK(maxchar <= 255);
605             }
606             else
607                 CHECK(maxchar < 128);
608         }
609         else if (kind == PyUnicode_2BYTE_KIND) {
610             CHECK(maxchar >= 0x100);
611             CHECK(maxchar <= 0xFFFF);
612         }
613         else {
614             CHECK(maxchar >= 0x10000);
615             CHECK(maxchar <= MAX_UNICODE);
616         }
617         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
618     }
619     return 1;
620 
621 #undef CHECK
622 }
623 
624 
625 static PyObject*
unicode_result_wchar(PyObject * unicode)626 unicode_result_wchar(PyObject *unicode)
627 {
628 #ifndef Py_DEBUG
629     Py_ssize_t len;
630 
631     len = _PyUnicode_WSTR_LENGTH(unicode);
632     if (len == 0) {
633         Py_DECREF(unicode);
634         _Py_RETURN_UNICODE_EMPTY();
635     }
636 
637     if (len == 1) {
638         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
639         if ((Py_UCS4)ch < 256) {
640             Py_DECREF(unicode);
641             return get_latin1_char((unsigned char)ch);
642         }
643     }
644 
645     if (_PyUnicode_Ready(unicode) < 0) {
646         Py_DECREF(unicode);
647         return NULL;
648     }
649 #else
650     assert(Py_REFCNT(unicode) == 1);
651 
652     /* don't make the result ready in debug mode to ensure that the caller
653        makes the string ready before using it */
654     assert(_PyUnicode_CheckConsistency(unicode, 1));
655 #endif
656     return unicode;
657 }
658 
659 static PyObject*
unicode_result_ready(PyObject * unicode)660 unicode_result_ready(PyObject *unicode)
661 {
662     Py_ssize_t length;
663 
664     length = PyUnicode_GET_LENGTH(unicode);
665     if (length == 0) {
666         PyObject *empty = unicode_get_empty();
667         if (unicode != empty) {
668             Py_DECREF(unicode);
669             Py_INCREF(empty);
670         }
671         return empty;
672     }
673 
674     if (length == 1) {
675         int kind = PyUnicode_KIND(unicode);
676         if (kind == PyUnicode_1BYTE_KIND) {
677             const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
678             Py_UCS1 ch = data[0];
679             struct _Py_unicode_state *state = get_unicode_state();
680             PyObject *latin1_char = state->latin1[ch];
681             if (latin1_char != NULL) {
682                 if (unicode != latin1_char) {
683                     Py_INCREF(latin1_char);
684                     Py_DECREF(unicode);
685                 }
686                 return latin1_char;
687             }
688             else {
689                 assert(_PyUnicode_CheckConsistency(unicode, 1));
690                 Py_INCREF(unicode);
691                 state->latin1[ch] = unicode;
692                 return unicode;
693             }
694         }
695         else {
696             assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
697         }
698     }
699 
700     assert(_PyUnicode_CheckConsistency(unicode, 1));
701     return unicode;
702 }
703 
704 static PyObject*
unicode_result(PyObject * unicode)705 unicode_result(PyObject *unicode)
706 {
707     assert(_PyUnicode_CHECK(unicode));
708     if (PyUnicode_IS_READY(unicode))
709         return unicode_result_ready(unicode);
710     else
711         return unicode_result_wchar(unicode);
712 }
713 
714 static PyObject*
unicode_result_unchanged(PyObject * unicode)715 unicode_result_unchanged(PyObject *unicode)
716 {
717     if (PyUnicode_CheckExact(unicode)) {
718         if (PyUnicode_READY(unicode) == -1)
719             return NULL;
720         Py_INCREF(unicode);
721         return unicode;
722     }
723     else
724         /* Subtype -- return genuine unicode string with the same value. */
725         return _PyUnicode_Copy(unicode);
726 }
727 
728 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
729    ASCII, Latin1, UTF-8, etc. */
730 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)731 backslashreplace(_PyBytesWriter *writer, char *str,
732                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
733 {
734     Py_ssize_t size, i;
735     Py_UCS4 ch;
736     enum PyUnicode_Kind kind;
737     const void *data;
738 
739     assert(PyUnicode_IS_READY(unicode));
740     kind = PyUnicode_KIND(unicode);
741     data = PyUnicode_DATA(unicode);
742 
743     size = 0;
744     /* determine replacement size */
745     for (i = collstart; i < collend; ++i) {
746         Py_ssize_t incr;
747 
748         ch = PyUnicode_READ(kind, data, i);
749         if (ch < 0x100)
750             incr = 2+2;
751         else if (ch < 0x10000)
752             incr = 2+4;
753         else {
754             assert(ch <= MAX_UNICODE);
755             incr = 2+8;
756         }
757         if (size > PY_SSIZE_T_MAX - incr) {
758             PyErr_SetString(PyExc_OverflowError,
759                             "encoded result is too long for a Python string");
760             return NULL;
761         }
762         size += incr;
763     }
764 
765     str = _PyBytesWriter_Prepare(writer, str, size);
766     if (str == NULL)
767         return NULL;
768 
769     /* generate replacement */
770     for (i = collstart; i < collend; ++i) {
771         ch = PyUnicode_READ(kind, data, i);
772         *str++ = '\\';
773         if (ch >= 0x00010000) {
774             *str++ = 'U';
775             *str++ = Py_hexdigits[(ch>>28)&0xf];
776             *str++ = Py_hexdigits[(ch>>24)&0xf];
777             *str++ = Py_hexdigits[(ch>>20)&0xf];
778             *str++ = Py_hexdigits[(ch>>16)&0xf];
779             *str++ = Py_hexdigits[(ch>>12)&0xf];
780             *str++ = Py_hexdigits[(ch>>8)&0xf];
781         }
782         else if (ch >= 0x100) {
783             *str++ = 'u';
784             *str++ = Py_hexdigits[(ch>>12)&0xf];
785             *str++ = Py_hexdigits[(ch>>8)&0xf];
786         }
787         else
788             *str++ = 'x';
789         *str++ = Py_hexdigits[(ch>>4)&0xf];
790         *str++ = Py_hexdigits[ch&0xf];
791     }
792     return str;
793 }
794 
795 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
796    ASCII, Latin1, UTF-8, etc. */
797 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)798 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
799                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
800 {
801     Py_ssize_t size, i;
802     Py_UCS4 ch;
803     enum PyUnicode_Kind kind;
804     const void *data;
805 
806     assert(PyUnicode_IS_READY(unicode));
807     kind = PyUnicode_KIND(unicode);
808     data = PyUnicode_DATA(unicode);
809 
810     size = 0;
811     /* determine replacement size */
812     for (i = collstart; i < collend; ++i) {
813         Py_ssize_t incr;
814 
815         ch = PyUnicode_READ(kind, data, i);
816         if (ch < 10)
817             incr = 2+1+1;
818         else if (ch < 100)
819             incr = 2+2+1;
820         else if (ch < 1000)
821             incr = 2+3+1;
822         else if (ch < 10000)
823             incr = 2+4+1;
824         else if (ch < 100000)
825             incr = 2+5+1;
826         else if (ch < 1000000)
827             incr = 2+6+1;
828         else {
829             assert(ch <= MAX_UNICODE);
830             incr = 2+7+1;
831         }
832         if (size > PY_SSIZE_T_MAX - incr) {
833             PyErr_SetString(PyExc_OverflowError,
834                             "encoded result is too long for a Python string");
835             return NULL;
836         }
837         size += incr;
838     }
839 
840     str = _PyBytesWriter_Prepare(writer, str, size);
841     if (str == NULL)
842         return NULL;
843 
844     /* generate replacement */
845     for (i = collstart; i < collend; ++i) {
846         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
847         if (size < 0) {
848             return NULL;
849         }
850         str += size;
851     }
852     return str;
853 }
854 
855 /* --- Bloom Filters ----------------------------------------------------- */
856 
857 /* stuff to implement simple "bloom filters" for Unicode characters.
858    to keep things simple, we use a single bitmask, using the least 5
859    bits from each unicode characters as the bit index. */
860 
861 /* the linebreak mask is set up by _PyUnicode_Init() below */
862 
863 #if LONG_BIT >= 128
864 #define BLOOM_WIDTH 128
865 #elif LONG_BIT >= 64
866 #define BLOOM_WIDTH 64
867 #elif LONG_BIT >= 32
868 #define BLOOM_WIDTH 32
869 #else
870 #error "LONG_BIT is smaller than 32"
871 #endif
872 
873 #define BLOOM_MASK unsigned long
874 
875 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
876 
877 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
878 
879 #define BLOOM_LINEBREAK(ch)                                             \
880     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
881      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
882 
883 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)884 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
885 {
886 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
887     do {                                               \
888         TYPE *data = (TYPE *)PTR;                      \
889         TYPE *end = data + LEN;                        \
890         Py_UCS4 ch;                                    \
891         for (; data != end; data++) {                  \
892             ch = *data;                                \
893             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894         }                                              \
895         break;                                         \
896     } while (0)
897 
898     /* calculate simple bloom-style bitmask for a given unicode string */
899 
900     BLOOM_MASK mask;
901 
902     mask = 0;
903     switch (kind) {
904     case PyUnicode_1BYTE_KIND:
905         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906         break;
907     case PyUnicode_2BYTE_KIND:
908         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909         break;
910     case PyUnicode_4BYTE_KIND:
911         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912         break;
913     default:
914         Py_UNREACHABLE();
915     }
916     return mask;
917 
918 #undef BLOOM_UPDATE
919 }
920 
921 static int
ensure_unicode(PyObject * obj)922 ensure_unicode(PyObject *obj)
923 {
924     if (!PyUnicode_Check(obj)) {
925         PyErr_Format(PyExc_TypeError,
926                      "must be str, not %.100s",
927                      Py_TYPE(obj)->tp_name);
928         return -1;
929     }
930     return PyUnicode_READY(obj);
931 }
932 
933 /* Compilation of templated routines */
934 
935 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
936 
937 #include "stringlib/asciilib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/find_max_char.h"
944 #include "stringlib/undef.h"
945 
946 #include "stringlib/ucs1lib.h"
947 #include "stringlib/fastsearch.h"
948 #include "stringlib/partition.h"
949 #include "stringlib/split.h"
950 #include "stringlib/count.h"
951 #include "stringlib/find.h"
952 #include "stringlib/replace.h"
953 #include "stringlib/find_max_char.h"
954 #include "stringlib/undef.h"
955 
956 #include "stringlib/ucs2lib.h"
957 #include "stringlib/fastsearch.h"
958 #include "stringlib/partition.h"
959 #include "stringlib/split.h"
960 #include "stringlib/count.h"
961 #include "stringlib/find.h"
962 #include "stringlib/replace.h"
963 #include "stringlib/find_max_char.h"
964 #include "stringlib/undef.h"
965 
966 #include "stringlib/ucs4lib.h"
967 #include "stringlib/fastsearch.h"
968 #include "stringlib/partition.h"
969 #include "stringlib/split.h"
970 #include "stringlib/count.h"
971 #include "stringlib/find.h"
972 #include "stringlib/replace.h"
973 #include "stringlib/find_max_char.h"
974 #include "stringlib/undef.h"
975 
976 _Py_COMP_DIAG_PUSH
977 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
978 #include "stringlib/unicodedefs.h"
979 #include "stringlib/fastsearch.h"
980 #include "stringlib/count.h"
981 #include "stringlib/find.h"
982 #include "stringlib/undef.h"
983 _Py_COMP_DIAG_POP
984 
985 #undef STRINGLIB_GET_EMPTY
986 
987 /* --- Unicode Object ----------------------------------------------------- */
988 
989 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)990 findchar(const void *s, int kind,
991          Py_ssize_t size, Py_UCS4 ch,
992          int direction)
993 {
994     switch (kind) {
995     case PyUnicode_1BYTE_KIND:
996         if ((Py_UCS1) ch != ch)
997             return -1;
998         if (direction > 0)
999             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000         else
1001             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002     case PyUnicode_2BYTE_KIND:
1003         if ((Py_UCS2) ch != ch)
1004             return -1;
1005         if (direction > 0)
1006             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007         else
1008             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009     case PyUnicode_4BYTE_KIND:
1010         if (direction > 0)
1011             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012         else
1013             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014     default:
1015         Py_UNREACHABLE();
1016     }
1017 }
1018 
1019 #ifdef Py_DEBUG
1020 /* Fill the data of a Unicode string with invalid characters to detect bugs
1021    earlier.
1022 
1023    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1024    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1025    invalid character in Unicode 6.0. */
1026 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1027 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1028 {
1029     int kind = PyUnicode_KIND(unicode);
1030     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1031     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1032     if (length <= old_length)
1033         return;
1034     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1035 }
1036 #endif
1037 
1038 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1039 resize_compact(PyObject *unicode, Py_ssize_t length)
1040 {
1041     Py_ssize_t char_size;
1042     Py_ssize_t struct_size;
1043     Py_ssize_t new_size;
1044     int share_wstr;
1045     PyObject *new_unicode;
1046 #ifdef Py_DEBUG
1047     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1048 #endif
1049 
1050     assert(unicode_modifiable(unicode));
1051     assert(PyUnicode_IS_READY(unicode));
1052     assert(PyUnicode_IS_COMPACT(unicode));
1053 
1054     char_size = PyUnicode_KIND(unicode);
1055     if (PyUnicode_IS_ASCII(unicode))
1056         struct_size = sizeof(PyASCIIObject);
1057     else
1058         struct_size = sizeof(PyCompactUnicodeObject);
1059     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1060 
1061     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1062         PyErr_NoMemory();
1063         return NULL;
1064     }
1065     new_size = (struct_size + (length + 1) * char_size);
1066 
1067     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1068         PyObject_Free(_PyUnicode_UTF8(unicode));
1069         _PyUnicode_UTF8(unicode) = NULL;
1070         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1071     }
1072 #ifdef Py_REF_DEBUG
1073     _Py_RefTotal--;
1074 #endif
1075 #ifdef Py_TRACE_REFS
1076     _Py_ForgetReference(unicode);
1077 #endif
1078 
1079     new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1080     if (new_unicode == NULL) {
1081         _Py_NewReference(unicode);
1082         PyErr_NoMemory();
1083         return NULL;
1084     }
1085     unicode = new_unicode;
1086     _Py_NewReference(unicode);
1087 
1088     _PyUnicode_LENGTH(unicode) = length;
1089     if (share_wstr) {
1090         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1091         if (!PyUnicode_IS_ASCII(unicode))
1092             _PyUnicode_WSTR_LENGTH(unicode) = length;
1093     }
1094     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1095         PyObject_Free(_PyUnicode_WSTR(unicode));
1096         _PyUnicode_WSTR(unicode) = NULL;
1097         if (!PyUnicode_IS_ASCII(unicode))
1098             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1099     }
1100 #ifdef Py_DEBUG
1101     unicode_fill_invalid(unicode, old_length);
1102 #endif
1103     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1104                     length, 0);
1105     assert(_PyUnicode_CheckConsistency(unicode, 0));
1106     return unicode;
1107 }
1108 
1109 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1110 resize_inplace(PyObject *unicode, Py_ssize_t length)
1111 {
1112     wchar_t *wstr;
1113     Py_ssize_t new_size;
1114     assert(!PyUnicode_IS_COMPACT(unicode));
1115     assert(Py_REFCNT(unicode) == 1);
1116 
1117     if (PyUnicode_IS_READY(unicode)) {
1118         Py_ssize_t char_size;
1119         int share_wstr, share_utf8;
1120         void *data;
1121 #ifdef Py_DEBUG
1122         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1123 #endif
1124 
1125         data = _PyUnicode_DATA_ANY(unicode);
1126         char_size = PyUnicode_KIND(unicode);
1127         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1128         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1129 
1130         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1131             PyErr_NoMemory();
1132             return -1;
1133         }
1134         new_size = (length + 1) * char_size;
1135 
1136         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1137         {
1138             PyObject_Free(_PyUnicode_UTF8(unicode));
1139             _PyUnicode_UTF8(unicode) = NULL;
1140             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141         }
1142 
1143         data = (PyObject *)PyObject_Realloc(data, new_size);
1144         if (data == NULL) {
1145             PyErr_NoMemory();
1146             return -1;
1147         }
1148         _PyUnicode_DATA_ANY(unicode) = data;
1149         if (share_wstr) {
1150             _PyUnicode_WSTR(unicode) = data;
1151             _PyUnicode_WSTR_LENGTH(unicode) = length;
1152         }
1153         if (share_utf8) {
1154             _PyUnicode_UTF8(unicode) = data;
1155             _PyUnicode_UTF8_LENGTH(unicode) = length;
1156         }
1157         _PyUnicode_LENGTH(unicode) = length;
1158         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1159 #ifdef Py_DEBUG
1160         unicode_fill_invalid(unicode, old_length);
1161 #endif
1162         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1163             assert(_PyUnicode_CheckConsistency(unicode, 0));
1164             return 0;
1165         }
1166     }
1167     assert(_PyUnicode_WSTR(unicode) != NULL);
1168 
1169     /* check for integer overflow */
1170     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171         PyErr_NoMemory();
1172         return -1;
1173     }
1174     new_size = sizeof(wchar_t) * (length + 1);
1175     wstr =  _PyUnicode_WSTR(unicode);
1176     wstr = PyObject_Realloc(wstr, new_size);
1177     if (!wstr) {
1178         PyErr_NoMemory();
1179         return -1;
1180     }
1181     _PyUnicode_WSTR(unicode) = wstr;
1182     _PyUnicode_WSTR(unicode)[length] = 0;
1183     _PyUnicode_WSTR_LENGTH(unicode) = length;
1184     assert(_PyUnicode_CheckConsistency(unicode, 0));
1185     return 0;
1186 }
1187 
1188 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1189 resize_copy(PyObject *unicode, Py_ssize_t length)
1190 {
1191     Py_ssize_t copy_length;
1192     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193         PyObject *copy;
1194 
1195         assert(PyUnicode_IS_READY(unicode));
1196 
1197         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198         if (copy == NULL)
1199             return NULL;
1200 
1201         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203         return copy;
1204     }
1205     else {
1206         PyObject *w;
1207 
1208         w = (PyObject*)_PyUnicode_New(length);
1209         if (w == NULL)
1210             return NULL;
1211         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212         copy_length = Py_MIN(copy_length, length);
1213         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214                   copy_length * sizeof(wchar_t));
1215         return w;
1216     }
1217 }
1218 
1219 /* We allocate one more byte to make sure the string is
1220    Ux0000 terminated; some code (e.g. new_identifier)
1221    relies on that.
1222 
1223    XXX This allocator could further be enhanced by assuring that the
1224    free list never reduces its size below 1.
1225 
1226 */
1227 
1228 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1229 _PyUnicode_New(Py_ssize_t length)
1230 {
1231     PyUnicodeObject *unicode;
1232     size_t new_size;
1233 
1234     /* Optimization for empty strings */
1235     if (length == 0) {
1236         return (PyUnicodeObject *)unicode_new_empty();
1237     }
1238 
1239     /* Ensure we won't overflow the size. */
1240     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1241         return (PyUnicodeObject *)PyErr_NoMemory();
1242     }
1243     if (length < 0) {
1244         PyErr_SetString(PyExc_SystemError,
1245                         "Negative size passed to _PyUnicode_New");
1246         return NULL;
1247     }
1248 
1249     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1250     if (unicode == NULL)
1251         return NULL;
1252     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1253 
1254     _PyUnicode_WSTR_LENGTH(unicode) = length;
1255     _PyUnicode_HASH(unicode) = -1;
1256     _PyUnicode_STATE(unicode).interned = 0;
1257     _PyUnicode_STATE(unicode).kind = 0;
1258     _PyUnicode_STATE(unicode).compact = 0;
1259     _PyUnicode_STATE(unicode).ready = 0;
1260     _PyUnicode_STATE(unicode).ascii = 0;
1261     _PyUnicode_DATA_ANY(unicode) = NULL;
1262     _PyUnicode_LENGTH(unicode) = 0;
1263     _PyUnicode_UTF8(unicode) = NULL;
1264     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1265 
1266     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1267     if (!_PyUnicode_WSTR(unicode)) {
1268         Py_DECREF(unicode);
1269         PyErr_NoMemory();
1270         return NULL;
1271     }
1272 
1273     /* Initialize the first element to guard against cases where
1274      * the caller fails before initializing str -- unicode_resize()
1275      * reads str[0], and the Keep-Alive optimization can keep memory
1276      * allocated for str alive across a call to unicode_dealloc(unicode).
1277      * We don't want unicode_resize to read uninitialized memory in
1278      * that case.
1279      */
1280     _PyUnicode_WSTR(unicode)[0] = 0;
1281     _PyUnicode_WSTR(unicode)[length] = 0;
1282 
1283     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1284     return unicode;
1285 }
1286 
1287 static const char*
unicode_kind_name(PyObject * unicode)1288 unicode_kind_name(PyObject *unicode)
1289 {
1290     /* don't check consistency: unicode_kind_name() is called from
1291        _PyUnicode_Dump() */
1292     if (!PyUnicode_IS_COMPACT(unicode))
1293     {
1294         if (!PyUnicode_IS_READY(unicode))
1295             return "wstr";
1296         switch (PyUnicode_KIND(unicode))
1297         {
1298         case PyUnicode_1BYTE_KIND:
1299             if (PyUnicode_IS_ASCII(unicode))
1300                 return "legacy ascii";
1301             else
1302                 return "legacy latin1";
1303         case PyUnicode_2BYTE_KIND:
1304             return "legacy UCS2";
1305         case PyUnicode_4BYTE_KIND:
1306             return "legacy UCS4";
1307         default:
1308             return "<legacy invalid kind>";
1309         }
1310     }
1311     assert(PyUnicode_IS_READY(unicode));
1312     switch (PyUnicode_KIND(unicode)) {
1313     case PyUnicode_1BYTE_KIND:
1314         if (PyUnicode_IS_ASCII(unicode))
1315             return "ascii";
1316         else
1317             return "latin1";
1318     case PyUnicode_2BYTE_KIND:
1319         return "UCS2";
1320     case PyUnicode_4BYTE_KIND:
1321         return "UCS4";
1322     default:
1323         return "<invalid compact kind>";
1324     }
1325 }
1326 
1327 #ifdef Py_DEBUG
1328 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1333 
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
_PyUnicode_data(void * unicode_raw)1338 const void *_PyUnicode_data(void *unicode_raw) {
1339     PyObject *unicode = _PyObject_CAST(unicode_raw);
1340     printf("obj %p\n", (void*)unicode);
1341     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1342     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1343     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1344     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1345     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1346     return PyUnicode_DATA(unicode);
1347 }
1348 
1349 void
_PyUnicode_Dump(PyObject * op)1350 _PyUnicode_Dump(PyObject *op)
1351 {
1352     PyASCIIObject *ascii = (PyASCIIObject *)op;
1353     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1354     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1355     const void *data;
1356 
1357     if (ascii->state.compact)
1358     {
1359         if (ascii->state.ascii)
1360             data = (ascii + 1);
1361         else
1362             data = (compact + 1);
1363     }
1364     else
1365         data = unicode->data.any;
1366     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1367 
1368     if (ascii->wstr == data)
1369         printf("shared ");
1370     printf("wstr=%p", (void *)ascii->wstr);
1371 
1372     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1373         printf(" (%zu), ", compact->wstr_length);
1374         if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1375             printf("shared ");
1376         }
1377         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1378     }
1379     printf(", data=%p\n", data);
1380 }
1381 #endif
1382 
1383 static int
unicode_create_empty_string_singleton(struct _Py_unicode_state * state)1384 unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1385 {
1386     // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1387     // optimized to always use state->empty_string without having to check if
1388     // it is NULL or not.
1389     PyObject *empty = PyUnicode_New(1, 0);
1390     if (empty == NULL) {
1391         return -1;
1392     }
1393     PyUnicode_1BYTE_DATA(empty)[0] = 0;
1394     _PyUnicode_LENGTH(empty) = 0;
1395     assert(_PyUnicode_CheckConsistency(empty, 1));
1396 
1397     assert(state->empty_string == NULL);
1398     state->empty_string = empty;
1399     return 0;
1400 }
1401 
1402 
1403 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1404 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1405 {
1406     /* Optimization for empty strings */
1407     if (size == 0) {
1408         return unicode_new_empty();
1409     }
1410 
1411     PyObject *obj;
1412     PyCompactUnicodeObject *unicode;
1413     void *data;
1414     enum PyUnicode_Kind kind;
1415     int is_sharing, is_ascii;
1416     Py_ssize_t char_size;
1417     Py_ssize_t struct_size;
1418 
1419     is_ascii = 0;
1420     is_sharing = 0;
1421     struct_size = sizeof(PyCompactUnicodeObject);
1422     if (maxchar < 128) {
1423         kind = PyUnicode_1BYTE_KIND;
1424         char_size = 1;
1425         is_ascii = 1;
1426         struct_size = sizeof(PyASCIIObject);
1427     }
1428     else if (maxchar < 256) {
1429         kind = PyUnicode_1BYTE_KIND;
1430         char_size = 1;
1431     }
1432     else if (maxchar < 65536) {
1433         kind = PyUnicode_2BYTE_KIND;
1434         char_size = 2;
1435         if (sizeof(wchar_t) == 2)
1436             is_sharing = 1;
1437     }
1438     else {
1439         if (maxchar > MAX_UNICODE) {
1440             PyErr_SetString(PyExc_SystemError,
1441                             "invalid maximum character passed to PyUnicode_New");
1442             return NULL;
1443         }
1444         kind = PyUnicode_4BYTE_KIND;
1445         char_size = 4;
1446         if (sizeof(wchar_t) == 4)
1447             is_sharing = 1;
1448     }
1449 
1450     /* Ensure we won't overflow the size. */
1451     if (size < 0) {
1452         PyErr_SetString(PyExc_SystemError,
1453                         "Negative size passed to PyUnicode_New");
1454         return NULL;
1455     }
1456     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1457         return PyErr_NoMemory();
1458 
1459     /* Duplicated allocation code from _PyObject_New() instead of a call to
1460      * PyObject_New() so we are able to allocate space for the object and
1461      * it's data buffer.
1462      */
1463     obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1464     if (obj == NULL) {
1465         return PyErr_NoMemory();
1466     }
1467     _PyObject_Init(obj, &PyUnicode_Type);
1468 
1469     unicode = (PyCompactUnicodeObject *)obj;
1470     if (is_ascii)
1471         data = ((PyASCIIObject*)obj) + 1;
1472     else
1473         data = unicode + 1;
1474     _PyUnicode_LENGTH(unicode) = size;
1475     _PyUnicode_HASH(unicode) = -1;
1476     _PyUnicode_STATE(unicode).interned = 0;
1477     _PyUnicode_STATE(unicode).kind = kind;
1478     _PyUnicode_STATE(unicode).compact = 1;
1479     _PyUnicode_STATE(unicode).ready = 1;
1480     _PyUnicode_STATE(unicode).ascii = is_ascii;
1481     if (is_ascii) {
1482         ((char*)data)[size] = 0;
1483         _PyUnicode_WSTR(unicode) = NULL;
1484     }
1485     else if (kind == PyUnicode_1BYTE_KIND) {
1486         ((char*)data)[size] = 0;
1487         _PyUnicode_WSTR(unicode) = NULL;
1488         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489         unicode->utf8 = NULL;
1490         unicode->utf8_length = 0;
1491     }
1492     else {
1493         unicode->utf8 = NULL;
1494         unicode->utf8_length = 0;
1495         if (kind == PyUnicode_2BYTE_KIND)
1496             ((Py_UCS2*)data)[size] = 0;
1497         else /* kind == PyUnicode_4BYTE_KIND */
1498             ((Py_UCS4*)data)[size] = 0;
1499         if (is_sharing) {
1500             _PyUnicode_WSTR_LENGTH(unicode) = size;
1501             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1502         }
1503         else {
1504             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1505             _PyUnicode_WSTR(unicode) = NULL;
1506         }
1507     }
1508 #ifdef Py_DEBUG
1509     unicode_fill_invalid((PyObject*)unicode, 0);
1510 #endif
1511     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1512     return obj;
1513 }
1514 
1515 #if SIZEOF_WCHAR_T == 2
1516 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1517    will decode surrogate pairs, the other conversions are implemented as macros
1518    for efficiency.
1519 
1520    This function assumes that unicode can hold one more code point than wstr
1521    characters for a terminating null character. */
1522 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1523 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1524                               PyObject *unicode)
1525 {
1526     const wchar_t *iter;
1527     Py_UCS4 *ucs4_out;
1528 
1529     assert(unicode != NULL);
1530     assert(_PyUnicode_CHECK(unicode));
1531     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1532     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1533 
1534     for (iter = begin; iter < end; ) {
1535         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1536                            _PyUnicode_GET_LENGTH(unicode)));
1537         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1538             && (iter+1) < end
1539             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1540         {
1541             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1542             iter += 2;
1543         }
1544         else {
1545             *ucs4_out++ = *iter;
1546             iter++;
1547         }
1548     }
1549     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1550                         _PyUnicode_GET_LENGTH(unicode)));
1551 
1552 }
1553 #endif
1554 
1555 static int
unicode_check_modifiable(PyObject * unicode)1556 unicode_check_modifiable(PyObject *unicode)
1557 {
1558     if (!unicode_modifiable(unicode)) {
1559         PyErr_SetString(PyExc_SystemError,
1560                         "Cannot modify a string currently used");
1561         return -1;
1562     }
1563     return 0;
1564 }
1565 
1566 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1567 _copy_characters(PyObject *to, Py_ssize_t to_start,
1568                  PyObject *from, Py_ssize_t from_start,
1569                  Py_ssize_t how_many, int check_maxchar)
1570 {
1571     unsigned int from_kind, to_kind;
1572     const void *from_data;
1573     void *to_data;
1574 
1575     assert(0 <= how_many);
1576     assert(0 <= from_start);
1577     assert(0 <= to_start);
1578     assert(PyUnicode_Check(from));
1579     assert(PyUnicode_IS_READY(from));
1580     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1581 
1582     assert(PyUnicode_Check(to));
1583     assert(PyUnicode_IS_READY(to));
1584     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1585 
1586     if (how_many == 0)
1587         return 0;
1588 
1589     from_kind = PyUnicode_KIND(from);
1590     from_data = PyUnicode_DATA(from);
1591     to_kind = PyUnicode_KIND(to);
1592     to_data = PyUnicode_DATA(to);
1593 
1594 #ifdef Py_DEBUG
1595     if (!check_maxchar
1596         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1597     {
1598         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1599         Py_UCS4 ch;
1600         Py_ssize_t i;
1601         for (i=0; i < how_many; i++) {
1602             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1603             assert(ch <= to_maxchar);
1604         }
1605     }
1606 #endif
1607 
1608     if (from_kind == to_kind) {
1609         if (check_maxchar
1610             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1611         {
1612             /* Writing Latin-1 characters into an ASCII string requires to
1613                check that all written characters are pure ASCII */
1614             Py_UCS4 max_char;
1615             max_char = ucs1lib_find_max_char(from_data,
1616                                              (const Py_UCS1*)from_data + how_many);
1617             if (max_char >= 128)
1618                 return -1;
1619         }
1620         memcpy((char*)to_data + to_kind * to_start,
1621                   (const char*)from_data + from_kind * from_start,
1622                   to_kind * how_many);
1623     }
1624     else if (from_kind == PyUnicode_1BYTE_KIND
1625              && to_kind == PyUnicode_2BYTE_KIND)
1626     {
1627         _PyUnicode_CONVERT_BYTES(
1628             Py_UCS1, Py_UCS2,
1629             PyUnicode_1BYTE_DATA(from) + from_start,
1630             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1631             PyUnicode_2BYTE_DATA(to) + to_start
1632             );
1633     }
1634     else if (from_kind == PyUnicode_1BYTE_KIND
1635              && to_kind == PyUnicode_4BYTE_KIND)
1636     {
1637         _PyUnicode_CONVERT_BYTES(
1638             Py_UCS1, Py_UCS4,
1639             PyUnicode_1BYTE_DATA(from) + from_start,
1640             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1641             PyUnicode_4BYTE_DATA(to) + to_start
1642             );
1643     }
1644     else if (from_kind == PyUnicode_2BYTE_KIND
1645              && to_kind == PyUnicode_4BYTE_KIND)
1646     {
1647         _PyUnicode_CONVERT_BYTES(
1648             Py_UCS2, Py_UCS4,
1649             PyUnicode_2BYTE_DATA(from) + from_start,
1650             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1651             PyUnicode_4BYTE_DATA(to) + to_start
1652             );
1653     }
1654     else {
1655         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1656 
1657         if (!check_maxchar) {
1658             if (from_kind == PyUnicode_2BYTE_KIND
1659                 && to_kind == PyUnicode_1BYTE_KIND)
1660             {
1661                 _PyUnicode_CONVERT_BYTES(
1662                     Py_UCS2, Py_UCS1,
1663                     PyUnicode_2BYTE_DATA(from) + from_start,
1664                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1665                     PyUnicode_1BYTE_DATA(to) + to_start
1666                     );
1667             }
1668             else if (from_kind == PyUnicode_4BYTE_KIND
1669                      && to_kind == PyUnicode_1BYTE_KIND)
1670             {
1671                 _PyUnicode_CONVERT_BYTES(
1672                     Py_UCS4, Py_UCS1,
1673                     PyUnicode_4BYTE_DATA(from) + from_start,
1674                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1675                     PyUnicode_1BYTE_DATA(to) + to_start
1676                     );
1677             }
1678             else if (from_kind == PyUnicode_4BYTE_KIND
1679                      && to_kind == PyUnicode_2BYTE_KIND)
1680             {
1681                 _PyUnicode_CONVERT_BYTES(
1682                     Py_UCS4, Py_UCS2,
1683                     PyUnicode_4BYTE_DATA(from) + from_start,
1684                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1685                     PyUnicode_2BYTE_DATA(to) + to_start
1686                     );
1687             }
1688             else {
1689                 Py_UNREACHABLE();
1690             }
1691         }
1692         else {
1693             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1694             Py_UCS4 ch;
1695             Py_ssize_t i;
1696 
1697             for (i=0; i < how_many; i++) {
1698                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1699                 if (ch > to_maxchar)
1700                     return -1;
1701                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1702             }
1703         }
1704     }
1705     return 0;
1706 }
1707 
1708 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1709 _PyUnicode_FastCopyCharacters(
1710     PyObject *to, Py_ssize_t to_start,
1711     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1712 {
1713     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1714 }
1715 
1716 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1717 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1718                          PyObject *from, Py_ssize_t from_start,
1719                          Py_ssize_t how_many)
1720 {
1721     int err;
1722 
1723     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1724         PyErr_BadInternalCall();
1725         return -1;
1726     }
1727 
1728     if (PyUnicode_READY(from) == -1)
1729         return -1;
1730     if (PyUnicode_READY(to) == -1)
1731         return -1;
1732 
1733     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1734         PyErr_SetString(PyExc_IndexError, "string index out of range");
1735         return -1;
1736     }
1737     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1738         PyErr_SetString(PyExc_IndexError, "string index out of range");
1739         return -1;
1740     }
1741     if (how_many < 0) {
1742         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1743         return -1;
1744     }
1745     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1746     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1747         PyErr_Format(PyExc_SystemError,
1748                      "Cannot write %zi characters at %zi "
1749                      "in a string of %zi characters",
1750                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1751         return -1;
1752     }
1753 
1754     if (how_many == 0)
1755         return 0;
1756 
1757     if (unicode_check_modifiable(to))
1758         return -1;
1759 
1760     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1761     if (err) {
1762         PyErr_Format(PyExc_SystemError,
1763                      "Cannot copy %s characters "
1764                      "into a string of %s characters",
1765                      unicode_kind_name(from),
1766                      unicode_kind_name(to));
1767         return -1;
1768     }
1769     return how_many;
1770 }
1771 
1772 /* Find the maximum code point and count the number of surrogate pairs so a
1773    correct string length can be computed before converting a string to UCS4.
1774    This function counts single surrogates as a character and not as a pair.
1775 
1776    Return 0 on success, or -1 on error. */
1777 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1778 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1779                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1780 {
1781     const wchar_t *iter;
1782     Py_UCS4 ch;
1783 
1784     assert(num_surrogates != NULL && maxchar != NULL);
1785     *num_surrogates = 0;
1786     *maxchar = 0;
1787 
1788     for (iter = begin; iter < end; ) {
1789 #if SIZEOF_WCHAR_T == 2
1790         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1791             && (iter+1) < end
1792             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1793         {
1794             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1795             ++(*num_surrogates);
1796             iter += 2;
1797         }
1798         else
1799 #endif
1800         {
1801             ch = *iter;
1802             iter++;
1803         }
1804         if (ch > *maxchar) {
1805             *maxchar = ch;
1806             if (*maxchar > MAX_UNICODE) {
1807                 PyErr_Format(PyExc_ValueError,
1808                              "character U+%x is not in range [U+0000; U+%x]",
1809                              ch, MAX_UNICODE);
1810                 return -1;
1811             }
1812         }
1813     }
1814     return 0;
1815 }
1816 
1817 int
_PyUnicode_Ready(PyObject * unicode)1818 _PyUnicode_Ready(PyObject *unicode)
1819 {
1820     wchar_t *end;
1821     Py_UCS4 maxchar = 0;
1822     Py_ssize_t num_surrogates;
1823 #if SIZEOF_WCHAR_T == 2
1824     Py_ssize_t length_wo_surrogates;
1825 #endif
1826 
1827     /* _PyUnicode_Ready() is only intended for old-style API usage where
1828        strings were created using _PyObject_New() and where no canonical
1829        representation (the str field) has been set yet aka strings
1830        which are not yet ready. */
1831     assert(_PyUnicode_CHECK(unicode));
1832     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1833     assert(_PyUnicode_WSTR(unicode) != NULL);
1834     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1835     assert(_PyUnicode_UTF8(unicode) == NULL);
1836     /* Actually, it should neither be interned nor be anything else: */
1837     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1838 
1839     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1840     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1841                                 &maxchar, &num_surrogates) == -1)
1842         return -1;
1843 
1844     if (maxchar < 256) {
1845         _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1846         if (!_PyUnicode_DATA_ANY(unicode)) {
1847             PyErr_NoMemory();
1848             return -1;
1849         }
1850         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1851                                 _PyUnicode_WSTR(unicode), end,
1852                                 PyUnicode_1BYTE_DATA(unicode));
1853         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1854         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1856         if (maxchar < 128) {
1857             _PyUnicode_STATE(unicode).ascii = 1;
1858             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1859             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860         }
1861         else {
1862             _PyUnicode_STATE(unicode).ascii = 0;
1863             _PyUnicode_UTF8(unicode) = NULL;
1864             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1865         }
1866         PyObject_Free(_PyUnicode_WSTR(unicode));
1867         _PyUnicode_WSTR(unicode) = NULL;
1868         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1869     }
1870     /* In this case we might have to convert down from 4-byte native
1871        wchar_t to 2-byte unicode. */
1872     else if (maxchar < 65536) {
1873         assert(num_surrogates == 0 &&
1874                "FindMaxCharAndNumSurrogatePairs() messed up");
1875 
1876 #if SIZEOF_WCHAR_T == 2
1877         /* We can share representations and are done. */
1878         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1879         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882         _PyUnicode_UTF8(unicode) = NULL;
1883         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1884 #else
1885         /* sizeof(wchar_t) == 4 */
1886         _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1887             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1888         if (!_PyUnicode_DATA_ANY(unicode)) {
1889             PyErr_NoMemory();
1890             return -1;
1891         }
1892         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1893                                 _PyUnicode_WSTR(unicode), end,
1894                                 PyUnicode_2BYTE_DATA(unicode));
1895         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1896         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1898         _PyUnicode_UTF8(unicode) = NULL;
1899         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1900         PyObject_Free(_PyUnicode_WSTR(unicode));
1901         _PyUnicode_WSTR(unicode) = NULL;
1902         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1903 #endif
1904     }
1905     /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1906     else {
1907 #if SIZEOF_WCHAR_T == 2
1908         /* in case the native representation is 2-bytes, we need to allocate a
1909            new normalized 4-byte version. */
1910         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1911         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1912             PyErr_NoMemory();
1913             return -1;
1914         }
1915         _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1916         if (!_PyUnicode_DATA_ANY(unicode)) {
1917             PyErr_NoMemory();
1918             return -1;
1919         }
1920         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1921         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922         _PyUnicode_UTF8(unicode) = NULL;
1923         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1924         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1925         _PyUnicode_STATE(unicode).ready = 1;
1926         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1927         PyObject_Free(_PyUnicode_WSTR(unicode));
1928         _PyUnicode_WSTR(unicode) = NULL;
1929         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1930 #else
1931         assert(num_surrogates == 0);
1932 
1933         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1934         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1935         _PyUnicode_UTF8(unicode) = NULL;
1936         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1937         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1938 #endif
1939         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1940     }
1941     _PyUnicode_STATE(unicode).ready = 1;
1942     assert(_PyUnicode_CheckConsistency(unicode, 1));
1943     return 0;
1944 }
1945 
1946 static void
unicode_dealloc(PyObject * unicode)1947 unicode_dealloc(PyObject *unicode)
1948 {
1949     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1950     case SSTATE_NOT_INTERNED:
1951         break;
1952 
1953     case SSTATE_INTERNED_MORTAL:
1954     {
1955 #ifdef INTERNED_STRINGS
1956         /* Revive the dead object temporarily. PyDict_DelItem() removes two
1957            references (key and value) which were ignored by
1958            PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1959            to prevent calling unicode_dealloc() again. Adjust refcnt after
1960            PyDict_DelItem(). */
1961         assert(Py_REFCNT(unicode) == 0);
1962         Py_SET_REFCNT(unicode, 3);
1963         if (PyDict_DelItem(interned, unicode) != 0) {
1964             _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1965                                       NULL);
1966         }
1967         assert(Py_REFCNT(unicode) == 1);
1968         Py_SET_REFCNT(unicode, 0);
1969 #endif
1970         break;
1971     }
1972 
1973     case SSTATE_INTERNED_IMMORTAL:
1974         _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1975         break;
1976 
1977     default:
1978         Py_UNREACHABLE();
1979     }
1980 
1981     if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1982         PyObject_Free(_PyUnicode_WSTR(unicode));
1983     }
1984     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1985         PyObject_Free(_PyUnicode_UTF8(unicode));
1986     }
1987     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1988         PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1989     }
1990 
1991     Py_TYPE(unicode)->tp_free(unicode);
1992 }
1993 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the cached shared
   objects held in the unicode state (the empty string, or the cached
   one-character string for a code point below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *header = (PyASCIIObject *)unicode;

    if (unicode == state->empty_string) {
        return 1;
    }

    /* Only a one-character string that is past the wchar_t-only stage
       can match an entry of the latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 only_char = PyUnicode_READ_CHAR(unicode, 0);
    if (only_char >= 256) {
        return 0;
    }
    return state->latin1[only_char] == unicode ? 1 : 0;
}
#endif
2013 
2014 static int
unicode_modifiable(PyObject * unicode)2015 unicode_modifiable(PyObject *unicode)
2016 {
2017     assert(_PyUnicode_CHECK(unicode));
2018     if (Py_REFCNT(unicode) != 1)
2019         return 0;
2020     if (_PyUnicode_HASH(unicode) != -1)
2021         return 0;
2022     if (PyUnicode_CHECK_INTERNED(unicode))
2023         return 0;
2024     if (!PyUnicode_CheckExact(unicode))
2025         return 0;
2026 #ifdef Py_DEBUG
2027     /* singleton refcount is greater than 1 */
2028     assert(!unicode_is_singleton(unicode));
2029 #endif
2030     return 1;
2031 }
2032 
2033 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)2034 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2035 {
2036     PyObject *unicode;
2037     Py_ssize_t old_length;
2038 
2039     assert(p_unicode != NULL);
2040     unicode = *p_unicode;
2041 
2042     assert(unicode != NULL);
2043     assert(PyUnicode_Check(unicode));
2044     assert(0 <= length);
2045 
2046     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2047         old_length = PyUnicode_WSTR_LENGTH(unicode);
2048     else
2049         old_length = PyUnicode_GET_LENGTH(unicode);
2050     if (old_length == length)
2051         return 0;
2052 
2053     if (length == 0) {
2054         PyObject *empty = unicode_new_empty();
2055         Py_SETREF(*p_unicode, empty);
2056         return 0;
2057     }
2058 
2059     if (!unicode_modifiable(unicode)) {
2060         PyObject *copy = resize_copy(unicode, length);
2061         if (copy == NULL)
2062             return -1;
2063         Py_SETREF(*p_unicode, copy);
2064         return 0;
2065     }
2066 
2067     if (PyUnicode_IS_COMPACT(unicode)) {
2068         PyObject *new_unicode = resize_compact(unicode, length);
2069         if (new_unicode == NULL)
2070             return -1;
2071         *p_unicode = new_unicode;
2072         return 0;
2073     }
2074     return resize_inplace(unicode, length);
2075 }
2076 
2077 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2078 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2079 {
2080     PyObject *unicode;
2081     if (p_unicode == NULL) {
2082         PyErr_BadInternalCall();
2083         return -1;
2084     }
2085     unicode = *p_unicode;
2086     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2087     {
2088         PyErr_BadInternalCall();
2089         return -1;
2090     }
2091     return unicode_resize(p_unicode, length);
2092 }
2093 
2094 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2095 
2096    WARNING: The function doesn't copy the terminating null character and
2097    doesn't check the maximum character (may write a latin1 character in an
2098    ASCII string). */
2099 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2100 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2101                    const char *str, Py_ssize_t len)
2102 {
2103     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2104     const void *data = PyUnicode_DATA(unicode);
2105     const char *end = str + len;
2106 
2107     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2108     switch (kind) {
2109     case PyUnicode_1BYTE_KIND: {
2110 #ifdef Py_DEBUG
2111         if (PyUnicode_IS_ASCII(unicode)) {
2112             Py_UCS4 maxchar = ucs1lib_find_max_char(
2113                 (const Py_UCS1*)str,
2114                 (const Py_UCS1*)str + len);
2115             assert(maxchar < 128);
2116         }
2117 #endif
2118         memcpy((char *) data + index, str, len);
2119         break;
2120     }
2121     case PyUnicode_2BYTE_KIND: {
2122         Py_UCS2 *start = (Py_UCS2 *)data + index;
2123         Py_UCS2 *ucs2 = start;
2124 
2125         for (; str < end; ++ucs2, ++str)
2126             *ucs2 = (Py_UCS2)*str;
2127 
2128         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2129         break;
2130     }
2131     case PyUnicode_4BYTE_KIND: {
2132         Py_UCS4 *start = (Py_UCS4 *)data + index;
2133         Py_UCS4 *ucs4 = start;
2134 
2135         for (; str < end; ++ucs4, ++str)
2136             *ucs4 = (Py_UCS4)*str;
2137 
2138         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2139         break;
2140     }
2141     default:
2142         Py_UNREACHABLE();
2143     }
2144 }
2145 
2146 static PyObject*
get_latin1_char(Py_UCS1 ch)2147 get_latin1_char(Py_UCS1 ch)
2148 {
2149     struct _Py_unicode_state *state = get_unicode_state();
2150 
2151     PyObject *unicode = state->latin1[ch];
2152     if (unicode) {
2153         Py_INCREF(unicode);
2154         return unicode;
2155     }
2156 
2157     unicode = PyUnicode_New(1, ch);
2158     if (!unicode) {
2159         return NULL;
2160     }
2161 
2162     PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2163     assert(_PyUnicode_CheckConsistency(unicode, 1));
2164 
2165     Py_INCREF(unicode);
2166     state->latin1[ch] = unicode;
2167     return unicode;
2168 }
2169 
2170 static PyObject*
unicode_char(Py_UCS4 ch)2171 unicode_char(Py_UCS4 ch)
2172 {
2173     PyObject *unicode;
2174 
2175     assert(ch <= MAX_UNICODE);
2176 
2177     if (ch < 256) {
2178         return get_latin1_char(ch);
2179     }
2180 
2181     unicode = PyUnicode_New(1, ch);
2182     if (unicode == NULL)
2183         return NULL;
2184 
2185     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2186     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2187         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2188     } else {
2189         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2190         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2191     }
2192     assert(_PyUnicode_CheckConsistency(unicode, 1));
2193     return unicode;
2194 }
2195 
2196 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2197 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2198 {
2199     if (u == NULL) {
2200         if (size > 0) {
2201             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2202                     "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2203                     "use PyUnicode_New() instead", 1) < 0) {
2204                 return NULL;
2205             }
2206         }
2207         return (PyObject*)_PyUnicode_New(size);
2208     }
2209 
2210     if (size < 0) {
2211         PyErr_BadInternalCall();
2212         return NULL;
2213     }
2214 
2215     return PyUnicode_FromWideChar(u, size);
2216 }
2217 
2218 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2219 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2220 {
2221     PyObject *unicode;
2222     Py_UCS4 maxchar = 0;
2223     Py_ssize_t num_surrogates;
2224 
2225     if (u == NULL && size != 0) {
2226         PyErr_BadInternalCall();
2227         return NULL;
2228     }
2229 
2230     if (size == -1) {
2231         size = wcslen(u);
2232     }
2233 
2234     /* If the Unicode data is known at construction time, we can apply
2235        some optimizations which share commonly used objects. */
2236 
2237     /* Optimization for empty strings */
2238     if (size == 0)
2239         _Py_RETURN_UNICODE_EMPTY();
2240 
2241 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2242     /* Oracle Solaris uses non-Unicode internal wchar_t form for
2243        non-Unicode locales and hence needs conversion to UCS-4 first. */
2244     if (_Py_LocaleUsesNonUnicodeWchar()) {
2245         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2246         if (!converted) {
2247             return NULL;
2248         }
2249         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2250         PyMem_Free(converted);
2251         return unicode;
2252     }
2253 #endif
2254 
2255     /* Single character Unicode objects in the Latin-1 range are
2256        shared when using this constructor */
2257     if (size == 1 && (Py_UCS4)*u < 256)
2258         return get_latin1_char((unsigned char)*u);
2259 
2260     /* If not empty and not single character, copy the Unicode data
2261        into the new object */
2262     if (find_maxchar_surrogates(u, u + size,
2263                                 &maxchar, &num_surrogates) == -1)
2264         return NULL;
2265 
2266     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2267     if (!unicode)
2268         return NULL;
2269 
2270     switch (PyUnicode_KIND(unicode)) {
2271     case PyUnicode_1BYTE_KIND:
2272         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2273                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2274         break;
2275     case PyUnicode_2BYTE_KIND:
2276 #if Py_UNICODE_SIZE == 2
2277         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2278 #else
2279         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2280                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2281 #endif
2282         break;
2283     case PyUnicode_4BYTE_KIND:
2284 #if SIZEOF_WCHAR_T == 2
2285         /* This is the only case which has to process surrogates, thus
2286            a simple copy loop is not enough and we need a function. */
2287         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2288 #else
2289         assert(num_surrogates == 0);
2290         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2291 #endif
2292         break;
2293     default:
2294         Py_UNREACHABLE();
2295     }
2296 
2297     return unicode_result(unicode);
2298 }
2299 
2300 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2301 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2302 {
2303     if (size < 0) {
2304         PyErr_SetString(PyExc_SystemError,
2305                         "Negative size passed to PyUnicode_FromStringAndSize");
2306         return NULL;
2307     }
2308     if (u != NULL) {
2309         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2310     }
2311     else {
2312         if (size > 0) {
2313             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2314                     "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2315                     "use PyUnicode_New() instead", 1) < 0) {
2316                 return NULL;
2317             }
2318         }
2319         return (PyObject *)_PyUnicode_New(size);
2320     }
2321 }
2322 
2323 PyObject *
PyUnicode_FromString(const char * u)2324 PyUnicode_FromString(const char *u)
2325 {
2326     size_t size = strlen(u);
2327     if (size > PY_SSIZE_T_MAX) {
2328         PyErr_SetString(PyExc_OverflowError, "input too long");
2329         return NULL;
2330     }
2331     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2332 }
2333 
2334 
2335 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2336 _PyUnicode_FromId(_Py_Identifier *id)
2337 {
2338     PyInterpreterState *interp = _PyInterpreterState_GET();
2339     struct _Py_unicode_ids *ids = &interp->unicode.ids;
2340 
2341     Py_ssize_t index = _Py_atomic_size_get(&id->index);
2342     if (index < 0) {
2343         struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2344 
2345         PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2346         // Check again to detect concurrent access. Another thread can have
2347         // initialized the index while this thread waited for the lock.
2348         index = _Py_atomic_size_get(&id->index);
2349         if (index < 0) {
2350             assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2351             index = rt_ids->next_index;
2352             rt_ids->next_index++;
2353             _Py_atomic_size_set(&id->index, index);
2354         }
2355         PyThread_release_lock(rt_ids->lock);
2356     }
2357     assert(index >= 0);
2358 
2359     PyObject *obj;
2360     if (index < ids->size) {
2361         obj = ids->array[index];
2362         if (obj) {
2363             // Return a borrowed reference
2364             return obj;
2365         }
2366     }
2367 
2368     obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2369                                        NULL, NULL);
2370     if (!obj) {
2371         return NULL;
2372     }
2373     PyUnicode_InternInPlace(&obj);
2374 
2375     if (index >= ids->size) {
2376         // Overallocate to reduce the number of realloc
2377         Py_ssize_t new_size = Py_MAX(index * 2, 16);
2378         Py_ssize_t item_size = sizeof(ids->array[0]);
2379         PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2380         if (new_array == NULL) {
2381             PyErr_NoMemory();
2382             return NULL;
2383         }
2384         memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2385         ids->array = new_array;
2386         ids->size = new_size;
2387     }
2388 
2389     // The array stores a strong reference
2390     ids->array[index] = obj;
2391 
2392     // Return a borrowed reference
2393     return obj;
2394 }
2395 
2396 
2397 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2398 unicode_clear_identifiers(struct _Py_unicode_state *state)
2399 {
2400     struct _Py_unicode_ids *ids = &state->ids;
2401     for (Py_ssize_t i=0; i < ids->size; i++) {
2402         Py_XDECREF(ids->array[i]);
2403     }
2404     ids->size = 0;
2405     PyMem_Free(ids->array);
2406     ids->array = NULL;
2407     // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2408     // after Py_Finalize().
2409 }
2410 
2411 
2412 /* Internal function, doesn't check maximum character */
2413 
2414 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2415 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2416 {
2417     const unsigned char *s = (const unsigned char *)buffer;
2418     PyObject *unicode;
2419     if (size == 1) {
2420 #ifdef Py_DEBUG
2421         assert((unsigned char)s[0] < 128);
2422 #endif
2423         return get_latin1_char(s[0]);
2424     }
2425     unicode = PyUnicode_New(size, 127);
2426     if (!unicode)
2427         return NULL;
2428     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2429     assert(_PyUnicode_CheckConsistency(unicode, 1));
2430     return unicode;
2431 }
2432 
2433 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2434 kind_maxchar_limit(unsigned int kind)
2435 {
2436     switch (kind) {
2437     case PyUnicode_1BYTE_KIND:
2438         return 0x80;
2439     case PyUnicode_2BYTE_KIND:
2440         return 0x100;
2441     case PyUnicode_4BYTE_KIND:
2442         return 0x10000;
2443     default:
2444         Py_UNREACHABLE();
2445     }
2446 }
2447 
2448 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2449 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2450 {
2451     PyObject *res;
2452     unsigned char max_char;
2453 
2454     if (size == 0) {
2455         _Py_RETURN_UNICODE_EMPTY();
2456     }
2457     assert(size > 0);
2458     if (size == 1) {
2459         return get_latin1_char(u[0]);
2460     }
2461 
2462     max_char = ucs1lib_find_max_char(u, u + size);
2463     res = PyUnicode_New(size, max_char);
2464     if (!res)
2465         return NULL;
2466     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2467     assert(_PyUnicode_CheckConsistency(res, 1));
2468     return res;
2469 }
2470 
2471 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2472 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2473 {
2474     PyObject *res;
2475     Py_UCS2 max_char;
2476 
2477     if (size == 0)
2478         _Py_RETURN_UNICODE_EMPTY();
2479     assert(size > 0);
2480     if (size == 1)
2481         return unicode_char(u[0]);
2482 
2483     max_char = ucs2lib_find_max_char(u, u + size);
2484     res = PyUnicode_New(size, max_char);
2485     if (!res)
2486         return NULL;
2487     if (max_char >= 256)
2488         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2489     else {
2490         _PyUnicode_CONVERT_BYTES(
2491             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2492     }
2493     assert(_PyUnicode_CheckConsistency(res, 1));
2494     return res;
2495 }
2496 
2497 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2498 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2499 {
2500     PyObject *res;
2501     Py_UCS4 max_char;
2502 
2503     if (size == 0)
2504         _Py_RETURN_UNICODE_EMPTY();
2505     assert(size > 0);
2506     if (size == 1)
2507         return unicode_char(u[0]);
2508 
2509     max_char = ucs4lib_find_max_char(u, u + size);
2510     res = PyUnicode_New(size, max_char);
2511     if (!res)
2512         return NULL;
2513     if (max_char < 256)
2514         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2515                                  PyUnicode_1BYTE_DATA(res));
2516     else if (max_char < 0x10000)
2517         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2518                                  PyUnicode_2BYTE_DATA(res));
2519     else
2520         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2521     assert(_PyUnicode_CheckConsistency(res, 1));
2522     return res;
2523 }
2524 
2525 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2526 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2527 {
2528     if (size < 0) {
2529         PyErr_SetString(PyExc_ValueError, "size must be positive");
2530         return NULL;
2531     }
2532     switch (kind) {
2533     case PyUnicode_1BYTE_KIND:
2534         return _PyUnicode_FromUCS1(buffer, size);
2535     case PyUnicode_2BYTE_KIND:
2536         return _PyUnicode_FromUCS2(buffer, size);
2537     case PyUnicode_4BYTE_KIND:
2538         return _PyUnicode_FromUCS4(buffer, size);
2539     default:
2540         PyErr_SetString(PyExc_SystemError, "invalid kind");
2541         return NULL;
2542     }
2543 }
2544 
2545 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2546 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2547 {
2548     enum PyUnicode_Kind kind;
2549     const void *startptr, *endptr;
2550 
2551     assert(PyUnicode_IS_READY(unicode));
2552     assert(0 <= start);
2553     assert(end <= PyUnicode_GET_LENGTH(unicode));
2554     assert(start <= end);
2555 
2556     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2557         return PyUnicode_MAX_CHAR_VALUE(unicode);
2558 
2559     if (start == end)
2560         return 127;
2561 
2562     if (PyUnicode_IS_ASCII(unicode))
2563         return 127;
2564 
2565     kind = PyUnicode_KIND(unicode);
2566     startptr = PyUnicode_DATA(unicode);
2567     endptr = (char *)startptr + end * kind;
2568     startptr = (char *)startptr + start * kind;
2569     switch(kind) {
2570     case PyUnicode_1BYTE_KIND:
2571         return ucs1lib_find_max_char(startptr, endptr);
2572     case PyUnicode_2BYTE_KIND:
2573         return ucs2lib_find_max_char(startptr, endptr);
2574     case PyUnicode_4BYTE_KIND:
2575         return ucs4lib_find_max_char(startptr, endptr);
2576     default:
2577         Py_UNREACHABLE();
2578     }
2579 }
2580 
2581 /* Ensure that a string uses the most efficient storage, if it is not the
2582    case: create a new string with of the right kind. Write NULL into *p_unicode
2583    on error. */
2584 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2585 unicode_adjust_maxchar(PyObject **p_unicode)
2586 {
2587     PyObject *unicode, *copy;
2588     Py_UCS4 max_char;
2589     Py_ssize_t len;
2590     unsigned int kind;
2591 
2592     assert(p_unicode != NULL);
2593     unicode = *p_unicode;
2594     assert(PyUnicode_IS_READY(unicode));
2595     if (PyUnicode_IS_ASCII(unicode))
2596         return;
2597 
2598     len = PyUnicode_GET_LENGTH(unicode);
2599     kind = PyUnicode_KIND(unicode);
2600     if (kind == PyUnicode_1BYTE_KIND) {
2601         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2602         max_char = ucs1lib_find_max_char(u, u + len);
2603         if (max_char >= 128)
2604             return;
2605     }
2606     else if (kind == PyUnicode_2BYTE_KIND) {
2607         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2608         max_char = ucs2lib_find_max_char(u, u + len);
2609         if (max_char >= 256)
2610             return;
2611     }
2612     else if (kind == PyUnicode_4BYTE_KIND) {
2613         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2614         max_char = ucs4lib_find_max_char(u, u + len);
2615         if (max_char >= 0x10000)
2616             return;
2617     }
2618     else
2619         Py_UNREACHABLE();
2620 
2621     copy = PyUnicode_New(len, max_char);
2622     if (copy != NULL)
2623         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2624     Py_DECREF(unicode);
2625     *p_unicode = copy;
2626 }
2627 
2628 PyObject*
_PyUnicode_Copy(PyObject * unicode)2629 _PyUnicode_Copy(PyObject *unicode)
2630 {
2631     Py_ssize_t length;
2632     PyObject *copy;
2633 
2634     if (!PyUnicode_Check(unicode)) {
2635         PyErr_BadInternalCall();
2636         return NULL;
2637     }
2638     if (PyUnicode_READY(unicode) == -1)
2639         return NULL;
2640 
2641     length = PyUnicode_GET_LENGTH(unicode);
2642     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2643     if (!copy)
2644         return NULL;
2645     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2646 
2647     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2648               length * PyUnicode_KIND(unicode));
2649     assert(_PyUnicode_CheckConsistency(copy, 1));
2650     return copy;
2651 }
2652 
2653 
2654 /* Widen Unicode objects to larger buffers. Don't write terminating null
2655    character. Return NULL on error. */
2656 
2657 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2658 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2659 {
2660     void *result;
2661 
2662     assert(skind < kind);
2663     switch (kind) {
2664     case PyUnicode_2BYTE_KIND:
2665         result = PyMem_New(Py_UCS2, len);
2666         if (!result)
2667             return PyErr_NoMemory();
2668         assert(skind == PyUnicode_1BYTE_KIND);
2669         _PyUnicode_CONVERT_BYTES(
2670             Py_UCS1, Py_UCS2,
2671             (const Py_UCS1 *)data,
2672             ((const Py_UCS1 *)data) + len,
2673             result);
2674         return result;
2675     case PyUnicode_4BYTE_KIND:
2676         result = PyMem_New(Py_UCS4, len);
2677         if (!result)
2678             return PyErr_NoMemory();
2679         if (skind == PyUnicode_2BYTE_KIND) {
2680             _PyUnicode_CONVERT_BYTES(
2681                 Py_UCS2, Py_UCS4,
2682                 (const Py_UCS2 *)data,
2683                 ((const Py_UCS2 *)data) + len,
2684                 result);
2685         }
2686         else {
2687             assert(skind == PyUnicode_1BYTE_KIND);
2688             _PyUnicode_CONVERT_BYTES(
2689                 Py_UCS1, Py_UCS4,
2690                 (const Py_UCS1 *)data,
2691                 ((const Py_UCS1 *)data) + len,
2692                 result);
2693         }
2694         return result;
2695     default:
2696         Py_UNREACHABLE();
2697         return NULL;
2698     }
2699 }
2700 
2701 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2702 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2703         int copy_null)
2704 {
2705     int kind;
2706     const void *data;
2707     Py_ssize_t len, targetlen;
2708     if (PyUnicode_READY(string) == -1)
2709         return NULL;
2710     kind = PyUnicode_KIND(string);
2711     data = PyUnicode_DATA(string);
2712     len = PyUnicode_GET_LENGTH(string);
2713     targetlen = len;
2714     if (copy_null)
2715         targetlen++;
2716     if (!target) {
2717         target = PyMem_New(Py_UCS4, targetlen);
2718         if (!target) {
2719             PyErr_NoMemory();
2720             return NULL;
2721         }
2722     }
2723     else {
2724         if (targetsize < targetlen) {
2725             PyErr_Format(PyExc_SystemError,
2726                          "string is longer than the buffer");
2727             if (copy_null && 0 < targetsize)
2728                 target[0] = 0;
2729             return NULL;
2730         }
2731     }
2732     if (kind == PyUnicode_1BYTE_KIND) {
2733         const Py_UCS1 *start = (const Py_UCS1 *) data;
2734         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2735     }
2736     else if (kind == PyUnicode_2BYTE_KIND) {
2737         const Py_UCS2 *start = (const Py_UCS2 *) data;
2738         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2739     }
2740     else if (kind == PyUnicode_4BYTE_KIND) {
2741         memcpy(target, data, len * sizeof(Py_UCS4));
2742     }
2743     else {
2744         Py_UNREACHABLE();
2745     }
2746     if (copy_null)
2747         target[len] = 0;
2748     return target;
2749 }
2750 
2751 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2752 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2753                  int copy_null)
2754 {
2755     if (target == NULL || targetsize < 0) {
2756         PyErr_BadInternalCall();
2757         return NULL;
2758     }
2759     return as_ucs4(string, target, targetsize, copy_null);
2760 }
2761 
2762 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2763 PyUnicode_AsUCS4Copy(PyObject *string)
2764 {
2765     return as_ucs4(string, NULL, 0, 1);
2766 }
2767 
2768 /* maximum number of characters required for output of %lld or %p.
2769    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2770    plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2771 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2772 
2773 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2774 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2775                              Py_ssize_t width, Py_ssize_t precision)
2776 {
2777     Py_ssize_t length, fill, arglen;
2778     Py_UCS4 maxchar;
2779 
2780     if (PyUnicode_READY(str) == -1)
2781         return -1;
2782 
2783     length = PyUnicode_GET_LENGTH(str);
2784     if ((precision == -1 || precision >= length)
2785         && width <= length)
2786         return _PyUnicodeWriter_WriteStr(writer, str);
2787 
2788     if (precision != -1)
2789         length = Py_MIN(precision, length);
2790 
2791     arglen = Py_MAX(length, width);
2792     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2793         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2794     else
2795         maxchar = writer->maxchar;
2796 
2797     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2798         return -1;
2799 
2800     if (width > length) {
2801         fill = width - length;
2802         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2803             return -1;
2804         writer->pos += fill;
2805     }
2806 
2807     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2808                                   str, 0, length);
2809     writer->pos += length;
2810     return 0;
2811 }
2812 
2813 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2814 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2815                               Py_ssize_t width, Py_ssize_t precision)
2816 {
2817     /* UTF-8 */
2818     Py_ssize_t length;
2819     PyObject *unicode;
2820     int res;
2821 
2822     if (precision == -1) {
2823         length = strlen(str);
2824     }
2825     else {
2826         length = 0;
2827         while (length < precision && str[length]) {
2828             length++;
2829         }
2830     }
2831     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2832     if (unicode == NULL)
2833         return -1;
2834 
2835     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2836     Py_DECREF(unicode);
2837     return res;
2838 }
2839 
2840 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2841 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2842                        const char *f, va_list *vargs)
2843 {
2844     const char *p;
2845     Py_ssize_t len;
2846     int zeropad;
2847     Py_ssize_t width;
2848     Py_ssize_t precision;
2849     int longflag;
2850     int longlongflag;
2851     int size_tflag;
2852     Py_ssize_t fill;
2853 
2854     p = f;
2855     f++;
2856     zeropad = 0;
2857     if (*f == '0') {
2858         zeropad = 1;
2859         f++;
2860     }
2861 
2862     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2863     width = -1;
2864     if (Py_ISDIGIT((unsigned)*f)) {
2865         width = *f - '0';
2866         f++;
2867         while (Py_ISDIGIT((unsigned)*f)) {
2868             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2869                 PyErr_SetString(PyExc_ValueError,
2870                                 "width too big");
2871                 return NULL;
2872             }
2873             width = (width * 10) + (*f - '0');
2874             f++;
2875         }
2876     }
2877     precision = -1;
2878     if (*f == '.') {
2879         f++;
2880         if (Py_ISDIGIT((unsigned)*f)) {
2881             precision = (*f - '0');
2882             f++;
2883             while (Py_ISDIGIT((unsigned)*f)) {
2884                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2885                     PyErr_SetString(PyExc_ValueError,
2886                                     "precision too big");
2887                     return NULL;
2888                 }
2889                 precision = (precision * 10) + (*f - '0');
2890                 f++;
2891             }
2892         }
2893         if (*f == '%') {
2894             /* "%.3%s" => f points to "3" */
2895             f--;
2896         }
2897     }
2898     if (*f == '\0') {
2899         /* bogus format "%.123" => go backward, f points to "3" */
2900         f--;
2901     }
2902 
2903     /* Handle %ld, %lu, %lld and %llu. */
2904     longflag = 0;
2905     longlongflag = 0;
2906     size_tflag = 0;
2907     if (*f == 'l') {
2908         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2909             longflag = 1;
2910             ++f;
2911         }
2912         else if (f[1] == 'l' &&
2913                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2914             longlongflag = 1;
2915             f += 2;
2916         }
2917     }
2918     /* handle the size_t flag. */
2919     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2920         size_tflag = 1;
2921         ++f;
2922     }
2923 
2924     if (f[1] == '\0')
2925         writer->overallocate = 0;
2926 
2927     switch (*f) {
2928     case 'c':
2929     {
2930         int ordinal = va_arg(*vargs, int);
2931         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2932             PyErr_SetString(PyExc_OverflowError,
2933                             "character argument not in range(0x110000)");
2934             return NULL;
2935         }
2936         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2937             return NULL;
2938         break;
2939     }
2940 
2941     case 'i':
2942     case 'd':
2943     case 'u':
2944     case 'x':
2945     {
2946         /* used by sprintf */
2947         char buffer[MAX_LONG_LONG_CHARS];
2948         Py_ssize_t arglen;
2949 
2950         if (*f == 'u') {
2951             if (longflag) {
2952                 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2953             }
2954             else if (longlongflag) {
2955                 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2956             }
2957             else if (size_tflag) {
2958                 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2959             }
2960             else {
2961                 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2962             }
2963         }
2964         else if (*f == 'x') {
2965             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2966         }
2967         else {
2968             if (longflag) {
2969                 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2970             }
2971             else if (longlongflag) {
2972                 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2973             }
2974             else if (size_tflag) {
2975                 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2976             }
2977             else {
2978                 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2979             }
2980         }
2981         assert(len >= 0);
2982 
2983         if (precision < len)
2984             precision = len;
2985 
2986         arglen = Py_MAX(precision, width);
2987         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2988             return NULL;
2989 
2990         if (width > precision) {
2991             Py_UCS4 fillchar;
2992             fill = width - precision;
2993             fillchar = zeropad?'0':' ';
2994             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2995                 return NULL;
2996             writer->pos += fill;
2997         }
2998         if (precision > len) {
2999             fill = precision - len;
3000             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
3001                 return NULL;
3002             writer->pos += fill;
3003         }
3004 
3005         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
3006             return NULL;
3007         break;
3008     }
3009 
3010     case 'p':
3011     {
3012         char number[MAX_LONG_LONG_CHARS];
3013 
3014         len = sprintf(number, "%p", va_arg(*vargs, void*));
3015         assert(len >= 0);
3016 
3017         /* %p is ill-defined:  ensure leading 0x. */
3018         if (number[1] == 'X')
3019             number[1] = 'x';
3020         else if (number[1] != 'x') {
3021             memmove(number + 2, number,
3022                     strlen(number) + 1);
3023             number[0] = '0';
3024             number[1] = 'x';
3025             len += 2;
3026         }
3027 
3028         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3029             return NULL;
3030         break;
3031     }
3032 
3033     case 's':
3034     {
3035         /* UTF-8 */
3036         const char *s = va_arg(*vargs, const char*);
3037         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
3038             return NULL;
3039         break;
3040     }
3041 
3042     case 'U':
3043     {
3044         PyObject *obj = va_arg(*vargs, PyObject *);
3045         assert(obj && _PyUnicode_CHECK(obj));
3046 
3047         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3048             return NULL;
3049         break;
3050     }
3051 
3052     case 'V':
3053     {
3054         PyObject *obj = va_arg(*vargs, PyObject *);
3055         const char *str = va_arg(*vargs, const char *);
3056         if (obj) {
3057             assert(_PyUnicode_CHECK(obj));
3058             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3059                 return NULL;
3060         }
3061         else {
3062             assert(str != NULL);
3063             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3064                 return NULL;
3065         }
3066         break;
3067     }
3068 
3069     case 'S':
3070     {
3071         PyObject *obj = va_arg(*vargs, PyObject *);
3072         PyObject *str;
3073         assert(obj);
3074         str = PyObject_Str(obj);
3075         if (!str)
3076             return NULL;
3077         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3078             Py_DECREF(str);
3079             return NULL;
3080         }
3081         Py_DECREF(str);
3082         break;
3083     }
3084 
3085     case 'R':
3086     {
3087         PyObject *obj = va_arg(*vargs, PyObject *);
3088         PyObject *repr;
3089         assert(obj);
3090         repr = PyObject_Repr(obj);
3091         if (!repr)
3092             return NULL;
3093         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3094             Py_DECREF(repr);
3095             return NULL;
3096         }
3097         Py_DECREF(repr);
3098         break;
3099     }
3100 
3101     case 'A':
3102     {
3103         PyObject *obj = va_arg(*vargs, PyObject *);
3104         PyObject *ascii;
3105         assert(obj);
3106         ascii = PyObject_ASCII(obj);
3107         if (!ascii)
3108             return NULL;
3109         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3110             Py_DECREF(ascii);
3111             return NULL;
3112         }
3113         Py_DECREF(ascii);
3114         break;
3115     }
3116 
3117     case '%':
3118         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3119             return NULL;
3120         break;
3121 
3122     default:
3123         /* if we stumble upon an unknown formatting code, copy the rest
3124            of the format string to the output string. (we cannot just
3125            skip the code, since there's no way to know what's in the
3126            argument list) */
3127         len = strlen(p);
3128         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3129             return NULL;
3130         f = p+len;
3131         return f;
3132     }
3133 
3134     f++;
3135     return f;
3136 }
3137 
3138 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3139 PyUnicode_FromFormatV(const char *format, va_list vargs)
3140 {
3141     va_list vargs2;
3142     const char *f;
3143     _PyUnicodeWriter writer;
3144 
3145     _PyUnicodeWriter_Init(&writer);
3146     writer.min_length = strlen(format) + 100;
3147     writer.overallocate = 1;
3148 
3149     // Copy varags to be able to pass a reference to a subfunction.
3150     va_copy(vargs2, vargs);
3151 
3152     for (f = format; *f; ) {
3153         if (*f == '%') {
3154             f = unicode_fromformat_arg(&writer, f, &vargs2);
3155             if (f == NULL)
3156                 goto fail;
3157         }
3158         else {
3159             const char *p;
3160             Py_ssize_t len;
3161 
3162             p = f;
3163             do
3164             {
3165                 if ((unsigned char)*p > 127) {
3166                     PyErr_Format(PyExc_ValueError,
3167                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3168                         "string, got a non-ASCII byte: 0x%02x",
3169                         (unsigned char)*p);
3170                     goto fail;
3171                 }
3172                 p++;
3173             }
3174             while (*p != '\0' && *p != '%');
3175             len = p - f;
3176 
3177             if (*p == '\0')
3178                 writer.overallocate = 0;
3179 
3180             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3181                 goto fail;
3182 
3183             f = p;
3184         }
3185     }
3186     va_end(vargs2);
3187     return _PyUnicodeWriter_Finish(&writer);
3188 
3189   fail:
3190     va_end(vargs2);
3191     _PyUnicodeWriter_Dealloc(&writer);
3192     return NULL;
3193 }
3194 
3195 PyObject *
PyUnicode_FromFormat(const char * format,...)3196 PyUnicode_FromFormat(const char *format, ...)
3197 {
3198     PyObject* ret;
3199     va_list vargs;
3200 
3201 #ifdef HAVE_STDARG_PROTOTYPES
3202     va_start(vargs, format);
3203 #else
3204     va_start(vargs);
3205 #endif
3206     ret = PyUnicode_FromFormatV(format, vargs);
3207     va_end(vargs);
3208     return ret;
3209 }
3210 
3211 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3212 unicode_get_widechar_size(PyObject *unicode)
3213 {
3214     Py_ssize_t res;
3215 
3216     assert(unicode != NULL);
3217     assert(_PyUnicode_CHECK(unicode));
3218 
3219 #if USE_UNICODE_WCHAR_CACHE
3220     if (_PyUnicode_WSTR(unicode) != NULL) {
3221         return PyUnicode_WSTR_LENGTH(unicode);
3222     }
3223 #endif /* USE_UNICODE_WCHAR_CACHE */
3224     assert(PyUnicode_IS_READY(unicode));
3225 
3226     res = _PyUnicode_LENGTH(unicode);
3227 #if SIZEOF_WCHAR_T == 2
3228     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3229         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3230         const Py_UCS4 *end = s + res;
3231         for (; s < end; ++s) {
3232             if (*s > 0xFFFF) {
3233                 ++res;
3234             }
3235         }
3236     }
3237 #endif
3238     return res;
3239 }
3240 
3241 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3242 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3243 {
3244     assert(unicode != NULL);
3245     assert(_PyUnicode_CHECK(unicode));
3246 
3247 #if USE_UNICODE_WCHAR_CACHE
3248     const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3249     if (wstr != NULL) {
3250         memcpy(w, wstr, size * sizeof(wchar_t));
3251         return;
3252     }
3253 #else /* USE_UNICODE_WCHAR_CACHE */
3254     if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3255         memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3256         return;
3257     }
3258 #endif /* USE_UNICODE_WCHAR_CACHE */
3259     assert(PyUnicode_IS_READY(unicode));
3260 
3261     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3262         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3263         for (; size--; ++s, ++w) {
3264             *w = *s;
3265         }
3266     }
3267     else {
3268 #if SIZEOF_WCHAR_T == 4
3269         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3270         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3271         for (; size--; ++s, ++w) {
3272             *w = *s;
3273         }
3274 #else
3275         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3276         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3277         for (; size--; ++s, ++w) {
3278             Py_UCS4 ch = *s;
3279             if (ch > 0xFFFF) {
3280                 assert(ch <= MAX_UNICODE);
3281                 /* encode surrogate pair in this case */
3282                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3283                 if (!size--)
3284                     break;
3285                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3286             }
3287             else {
3288                 *w = ch;
3289             }
3290         }
3291 #endif
3292     }
3293 }
3294 
3295 #ifdef HAVE_WCHAR_H
3296 
3297 /* Convert a Unicode object to a wide character string.
3298 
3299    - If w is NULL: return the number of wide characters (including the null
3300      character) required to convert the unicode object. Ignore size argument.
3301 
3302    - Otherwise: return the number of wide characters (excluding the null
3303      character) written into w. Write at most size wide characters (including
3304      the null character). */
3305 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3306 PyUnicode_AsWideChar(PyObject *unicode,
3307                      wchar_t *w,
3308                      Py_ssize_t size)
3309 {
3310     Py_ssize_t res;
3311 
3312     if (unicode == NULL) {
3313         PyErr_BadInternalCall();
3314         return -1;
3315     }
3316     if (!PyUnicode_Check(unicode)) {
3317         PyErr_BadArgument();
3318         return -1;
3319     }
3320 
3321     res = unicode_get_widechar_size(unicode);
3322     if (w == NULL) {
3323         return res + 1;
3324     }
3325 
3326     if (size > res) {
3327         size = res + 1;
3328     }
3329     else {
3330         res = size;
3331     }
3332     unicode_copy_as_widechar(unicode, w, size);
3333 
3334 #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3335     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3336        non-Unicode locales and hence needs conversion first. */
3337     if (_Py_LocaleUsesNonUnicodeWchar()) {
3338         if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3339             return -1;
3340         }
3341     }
3342 #endif
3343 
3344     return res;
3345 }
3346 
3347 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3348 PyUnicode_AsWideCharString(PyObject *unicode,
3349                            Py_ssize_t *size)
3350 {
3351     wchar_t *buffer;
3352     Py_ssize_t buflen;
3353 
3354     if (unicode == NULL) {
3355         PyErr_BadInternalCall();
3356         return NULL;
3357     }
3358     if (!PyUnicode_Check(unicode)) {
3359         PyErr_BadArgument();
3360         return NULL;
3361     }
3362 
3363     buflen = unicode_get_widechar_size(unicode);
3364     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3365     if (buffer == NULL) {
3366         PyErr_NoMemory();
3367         return NULL;
3368     }
3369     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3370 
3371 #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3372     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3373        non-Unicode locales and hence needs conversion first. */
3374     if (_Py_LocaleUsesNonUnicodeWchar()) {
3375         if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3376             return NULL;
3377         }
3378     }
3379 #endif
3380 
3381     if (size != NULL) {
3382         *size = buflen;
3383     }
3384     else if (wcslen(buffer) != (size_t)buflen) {
3385         PyMem_Free(buffer);
3386         PyErr_SetString(PyExc_ValueError,
3387                         "embedded null character");
3388         return NULL;
3389     }
3390     return buffer;
3391 }
3392 
3393 #endif /* HAVE_WCHAR_H */
3394 
3395 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3396 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3397 {
3398     wchar_t **p = (wchar_t **)ptr;
3399     if (obj == NULL) {
3400 #if !USE_UNICODE_WCHAR_CACHE
3401         PyMem_Free(*p);
3402 #endif /* USE_UNICODE_WCHAR_CACHE */
3403         *p = NULL;
3404         return 1;
3405     }
3406     if (PyUnicode_Check(obj)) {
3407 #if USE_UNICODE_WCHAR_CACHE
3408         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3409         if (*p == NULL) {
3410             return 0;
3411         }
3412         return 1;
3413 #else /* USE_UNICODE_WCHAR_CACHE */
3414         *p = PyUnicode_AsWideCharString(obj, NULL);
3415         if (*p == NULL) {
3416             return 0;
3417         }
3418         return Py_CLEANUP_SUPPORTED;
3419 #endif /* USE_UNICODE_WCHAR_CACHE */
3420     }
3421     PyErr_Format(PyExc_TypeError,
3422                  "argument must be str, not %.50s",
3423                  Py_TYPE(obj)->tp_name);
3424     return 0;
3425 }
3426 
3427 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3428 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3429 {
3430     wchar_t **p = (wchar_t **)ptr;
3431     if (obj == NULL) {
3432 #if !USE_UNICODE_WCHAR_CACHE
3433         PyMem_Free(*p);
3434 #endif /* USE_UNICODE_WCHAR_CACHE */
3435         *p = NULL;
3436         return 1;
3437     }
3438     if (obj == Py_None) {
3439         *p = NULL;
3440         return 1;
3441     }
3442     if (PyUnicode_Check(obj)) {
3443 #if USE_UNICODE_WCHAR_CACHE
3444         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3445         if (*p == NULL) {
3446             return 0;
3447         }
3448         return 1;
3449 #else /* USE_UNICODE_WCHAR_CACHE */
3450         *p = PyUnicode_AsWideCharString(obj, NULL);
3451         if (*p == NULL) {
3452             return 0;
3453         }
3454         return Py_CLEANUP_SUPPORTED;
3455 #endif /* USE_UNICODE_WCHAR_CACHE */
3456     }
3457     PyErr_Format(PyExc_TypeError,
3458                  "argument must be str or None, not %.50s",
3459                  Py_TYPE(obj)->tp_name);
3460     return 0;
3461 }
3462 
3463 PyObject *
PyUnicode_FromOrdinal(int ordinal)3464 PyUnicode_FromOrdinal(int ordinal)
3465 {
3466     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3467         PyErr_SetString(PyExc_ValueError,
3468                         "chr() arg not in range(0x110000)");
3469         return NULL;
3470     }
3471 
3472     return unicode_char((Py_UCS4)ordinal);
3473 }
3474 
3475 PyObject *
PyUnicode_FromObject(PyObject * obj)3476 PyUnicode_FromObject(PyObject *obj)
3477 {
3478     /* XXX Perhaps we should make this API an alias of
3479        PyObject_Str() instead ?! */
3480     if (PyUnicode_CheckExact(obj)) {
3481         if (PyUnicode_READY(obj) == -1)
3482             return NULL;
3483         Py_INCREF(obj);
3484         return obj;
3485     }
3486     if (PyUnicode_Check(obj)) {
3487         /* For a Unicode subtype that's not a Unicode object,
3488            return a true Unicode object with the same data. */
3489         return _PyUnicode_Copy(obj);
3490     }
3491     PyErr_Format(PyExc_TypeError,
3492                  "Can't convert '%.100s' object to str implicitly",
3493                  Py_TYPE(obj)->tp_name);
3494     return NULL;
3495 }
3496 
3497 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3498 PyUnicode_FromEncodedObject(PyObject *obj,
3499                             const char *encoding,
3500                             const char *errors)
3501 {
3502     Py_buffer buffer;
3503     PyObject *v;
3504 
3505     if (obj == NULL) {
3506         PyErr_BadInternalCall();
3507         return NULL;
3508     }
3509 
3510     /* Decoding bytes objects is the most common case and should be fast */
3511     if (PyBytes_Check(obj)) {
3512         if (PyBytes_GET_SIZE(obj) == 0) {
3513             if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514                 return NULL;
3515             }
3516             _Py_RETURN_UNICODE_EMPTY();
3517         }
3518         return PyUnicode_Decode(
3519                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3520                 encoding, errors);
3521     }
3522 
3523     if (PyUnicode_Check(obj)) {
3524         PyErr_SetString(PyExc_TypeError,
3525                         "decoding str is not supported");
3526         return NULL;
3527     }
3528 
3529     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3530     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3531         PyErr_Format(PyExc_TypeError,
3532                      "decoding to str: need a bytes-like object, %.80s found",
3533                      Py_TYPE(obj)->tp_name);
3534         return NULL;
3535     }
3536 
3537     if (buffer.len == 0) {
3538         PyBuffer_Release(&buffer);
3539         if (unicode_check_encoding_errors(encoding, errors) < 0) {
3540             return NULL;
3541         }
3542         _Py_RETURN_UNICODE_EMPTY();
3543     }
3544 
3545     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3546     PyBuffer_Release(&buffer);
3547     return v;
3548 }
3549 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`; every
   run of other characters between two copied characters is replaced by a
   single '_', and such runs at the start or end of the name are dropped.
   The result is null-terminated.

   `lower_len` is the size of the `lower` buffer in bytes, including the room
   for the terminating null byte.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the null byte fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around and the final
       null byte would be written outside of the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* last usable position: reserved for the terminating null byte */
    char *const out_end = &lower[lower_len - 1];
    /* set while skipping a run of separator characters */
    int punct = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* a separator run is only materialized between two name parts */
        if (punct && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        punct = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }
    *out = '\0';
    return 1;
}
3598 
3599 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3600 PyUnicode_Decode(const char *s,
3601                  Py_ssize_t size,
3602                  const char *encoding,
3603                  const char *errors)
3604 {
3605     PyObject *buffer = NULL, *unicode;
3606     Py_buffer info;
3607     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3608 
3609     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3610         return NULL;
3611     }
3612 
3613     if (size == 0) {
3614         _Py_RETURN_UNICODE_EMPTY();
3615     }
3616 
3617     if (encoding == NULL) {
3618         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3619     }
3620 
3621     /* Shortcuts for common default encodings */
3622     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3623         char *lower = buflower;
3624 
3625         /* Fast paths */
3626         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3627             lower += 3;
3628             if (*lower == '_') {
3629                 /* Match "utf8" and "utf_8" */
3630                 lower++;
3631             }
3632 
3633             if (lower[0] == '8' && lower[1] == 0) {
3634                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3635             }
3636             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3637                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3638             }
3639             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3640                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3641             }
3642         }
3643         else {
3644             if (strcmp(lower, "ascii") == 0
3645                 || strcmp(lower, "us_ascii") == 0) {
3646                 return PyUnicode_DecodeASCII(s, size, errors);
3647             }
3648     #ifdef MS_WINDOWS
3649             else if (strcmp(lower, "mbcs") == 0) {
3650                 return PyUnicode_DecodeMBCS(s, size, errors);
3651             }
3652     #endif
3653             else if (strcmp(lower, "latin1") == 0
3654                      || strcmp(lower, "latin_1") == 0
3655                      || strcmp(lower, "iso_8859_1") == 0
3656                      || strcmp(lower, "iso8859_1") == 0) {
3657                 return PyUnicode_DecodeLatin1(s, size, errors);
3658             }
3659         }
3660     }
3661 
3662     /* Decode via the codec registry */
3663     buffer = NULL;
3664     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3665         goto onError;
3666     buffer = PyMemoryView_FromBuffer(&info);
3667     if (buffer == NULL)
3668         goto onError;
3669     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3670     if (unicode == NULL)
3671         goto onError;
3672     if (!PyUnicode_Check(unicode)) {
3673         PyErr_Format(PyExc_TypeError,
3674                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3675                      "use codecs.decode() to decode to arbitrary types",
3676                      encoding,
3677                      Py_TYPE(unicode)->tp_name);
3678         Py_DECREF(unicode);
3679         goto onError;
3680     }
3681     Py_DECREF(buffer);
3682     return unicode_result(unicode);
3683 
3684   onError:
3685     Py_XDECREF(buffer);
3686     return NULL;
3687 }
3688 
3689 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3690 PyUnicode_AsDecodedObject(PyObject *unicode,
3691                           const char *encoding,
3692                           const char *errors)
3693 {
3694     if (!PyUnicode_Check(unicode)) {
3695         PyErr_BadArgument();
3696         return NULL;
3697     }
3698 
3699     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3700                      "PyUnicode_AsDecodedObject() is deprecated; "
3701                      "use PyCodec_Decode() to decode from str", 1) < 0)
3702         return NULL;
3703 
3704     if (encoding == NULL)
3705         encoding = PyUnicode_GetDefaultEncoding();
3706 
3707     /* Decode via the codec registry */
3708     return PyCodec_Decode(unicode, encoding, errors);
3709 }
3710 
3711 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3712 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3713                            const char *encoding,
3714                            const char *errors)
3715 {
3716     PyObject *v;
3717 
3718     if (!PyUnicode_Check(unicode)) {
3719         PyErr_BadArgument();
3720         goto onError;
3721     }
3722 
3723     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3724                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3725                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3726         return NULL;
3727 
3728     if (encoding == NULL)
3729         encoding = PyUnicode_GetDefaultEncoding();
3730 
3731     /* Decode via the codec registry */
3732     v = PyCodec_Decode(unicode, encoding, errors);
3733     if (v == NULL)
3734         goto onError;
3735     if (!PyUnicode_Check(v)) {
3736         PyErr_Format(PyExc_TypeError,
3737                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3738                      "use codecs.decode() to decode to arbitrary types",
3739                      encoding,
3740                      Py_TYPE(unicode)->tp_name);
3741         Py_DECREF(v);
3742         goto onError;
3743     }
3744     return unicode_result(v);
3745 
3746   onError:
3747     return NULL;
3748 }
3749 
3750 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3751 PyUnicode_Encode(const Py_UNICODE *s,
3752                  Py_ssize_t size,
3753                  const char *encoding,
3754                  const char *errors)
3755 {
3756     PyObject *v, *unicode;
3757 
3758     unicode = PyUnicode_FromWideChar(s, size);
3759     if (unicode == NULL)
3760         return NULL;
3761     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3762     Py_DECREF(unicode);
3763     return v;
3764 }
3765 
3766 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3767 PyUnicode_AsEncodedObject(PyObject *unicode,
3768                           const char *encoding,
3769                           const char *errors)
3770 {
3771     PyObject *v;
3772 
3773     if (!PyUnicode_Check(unicode)) {
3774         PyErr_BadArgument();
3775         goto onError;
3776     }
3777 
3778     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3779                      "PyUnicode_AsEncodedObject() is deprecated; "
3780                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3781                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3782         return NULL;
3783 
3784     if (encoding == NULL)
3785         encoding = PyUnicode_GetDefaultEncoding();
3786 
3787     /* Encode via the codec registry */
3788     v = PyCodec_Encode(unicode, encoding, errors);
3789     if (v == NULL)
3790         goto onError;
3791     return v;
3792 
3793   onError:
3794     return NULL;
3795 }
3796 
3797 
3798 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3799 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3800                       int current_locale)
3801 {
3802     Py_ssize_t wlen;
3803     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3804     if (wstr == NULL) {
3805         return NULL;
3806     }
3807 
3808     if ((size_t)wlen != wcslen(wstr)) {
3809         PyErr_SetString(PyExc_ValueError, "embedded null character");
3810         PyMem_Free(wstr);
3811         return NULL;
3812     }
3813 
3814     char *str;
3815     size_t error_pos;
3816     const char *reason;
3817     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3818                                  current_locale, error_handler);
3819     PyMem_Free(wstr);
3820 
3821     if (res != 0) {
3822         if (res == -2) {
3823             PyObject *exc;
3824             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3825                     "locale", unicode,
3826                     (Py_ssize_t)error_pos,
3827                     (Py_ssize_t)(error_pos+1),
3828                     reason);
3829             if (exc != NULL) {
3830                 PyCodec_StrictErrors(exc);
3831                 Py_DECREF(exc);
3832             }
3833         }
3834         else if (res == -3) {
3835             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3836         }
3837         else {
3838             PyErr_NoMemory();
3839         }
3840         return NULL;
3841     }
3842 
3843     PyObject *bytes = PyBytes_FromString(str);
3844     PyMem_RawFree(str);
3845     return bytes;
3846 }
3847 
3848 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3849 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3850 {
3851     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3852     return unicode_encode_locale(unicode, error_handler, 1);
3853 }
3854 
3855 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3856 PyUnicode_EncodeFSDefault(PyObject *unicode)
3857 {
3858     PyInterpreterState *interp = _PyInterpreterState_GET();
3859     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3860     if (fs_codec->utf8) {
3861         return unicode_encode_utf8(unicode,
3862                                    fs_codec->error_handler,
3863                                    fs_codec->errors);
3864     }
3865 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3866     else if (fs_codec->encoding) {
3867         return PyUnicode_AsEncodedString(unicode,
3868                                          fs_codec->encoding,
3869                                          fs_codec->errors);
3870     }
3871 #endif
3872     else {
3873         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3874            machinery is not ready and so cannot be used:
3875            use wcstombs() in this case. */
3876         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3877         const wchar_t *filesystem_errors = config->filesystem_errors;
3878         assert(filesystem_errors != NULL);
3879         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3880         assert(errors != _Py_ERROR_UNKNOWN);
3881 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3882         return unicode_encode_utf8(unicode, errors, NULL);
3883 #else
3884         return unicode_encode_locale(unicode, errors, 0);
3885 #endif
3886     }
3887 }
3888 
3889 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3890 PyUnicode_AsEncodedString(PyObject *unicode,
3891                           const char *encoding,
3892                           const char *errors)
3893 {
3894     PyObject *v;
3895     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3896 
3897     if (!PyUnicode_Check(unicode)) {
3898         PyErr_BadArgument();
3899         return NULL;
3900     }
3901 
3902     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3903         return NULL;
3904     }
3905 
3906     if (encoding == NULL) {
3907         return _PyUnicode_AsUTF8String(unicode, errors);
3908     }
3909 
3910     /* Shortcuts for common default encodings */
3911     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3912         char *lower = buflower;
3913 
3914         /* Fast paths */
3915         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3916             lower += 3;
3917             if (*lower == '_') {
3918                 /* Match "utf8" and "utf_8" */
3919                 lower++;
3920             }
3921 
3922             if (lower[0] == '8' && lower[1] == 0) {
3923                 return _PyUnicode_AsUTF8String(unicode, errors);
3924             }
3925             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3926                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3927             }
3928             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3929                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3930             }
3931         }
3932         else {
3933             if (strcmp(lower, "ascii") == 0
3934                 || strcmp(lower, "us_ascii") == 0) {
3935                 return _PyUnicode_AsASCIIString(unicode, errors);
3936             }
3937 #ifdef MS_WINDOWS
3938             else if (strcmp(lower, "mbcs") == 0) {
3939                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3940             }
3941 #endif
3942             else if (strcmp(lower, "latin1") == 0 ||
3943                      strcmp(lower, "latin_1") == 0 ||
3944                      strcmp(lower, "iso_8859_1") == 0 ||
3945                      strcmp(lower, "iso8859_1") == 0) {
3946                 return _PyUnicode_AsLatin1String(unicode, errors);
3947             }
3948         }
3949     }
3950 
3951     /* Encode via the codec registry */
3952     v = _PyCodec_EncodeText(unicode, encoding, errors);
3953     if (v == NULL)
3954         return NULL;
3955 
3956     /* The normal path */
3957     if (PyBytes_Check(v))
3958         return v;
3959 
3960     /* If the codec returns a buffer, raise a warning and convert to bytes */
3961     if (PyByteArray_Check(v)) {
3962         int error;
3963         PyObject *b;
3964 
3965         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3966             "encoder %s returned bytearray instead of bytes; "
3967             "use codecs.encode() to encode to arbitrary types",
3968             encoding);
3969         if (error) {
3970             Py_DECREF(v);
3971             return NULL;
3972         }
3973 
3974         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3975                                       PyByteArray_GET_SIZE(v));
3976         Py_DECREF(v);
3977         return b;
3978     }
3979 
3980     PyErr_Format(PyExc_TypeError,
3981                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3982                  "use codecs.encode() to encode to arbitrary types",
3983                  encoding,
3984                  Py_TYPE(v)->tp_name);
3985     Py_DECREF(v);
3986     return NULL;
3987 }
3988 
3989 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3990 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3991                            const char *encoding,
3992                            const char *errors)
3993 {
3994     PyObject *v;
3995 
3996     if (!PyUnicode_Check(unicode)) {
3997         PyErr_BadArgument();
3998         goto onError;
3999     }
4000 
4001     if (PyErr_WarnEx(PyExc_DeprecationWarning,
4002                      "PyUnicode_AsEncodedUnicode() is deprecated; "
4003                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
4004         return NULL;
4005 
4006     if (encoding == NULL)
4007         encoding = PyUnicode_GetDefaultEncoding();
4008 
4009     /* Encode via the codec registry */
4010     v = PyCodec_Encode(unicode, encoding, errors);
4011     if (v == NULL)
4012         goto onError;
4013     if (!PyUnicode_Check(v)) {
4014         PyErr_Format(PyExc_TypeError,
4015                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
4016                      "use codecs.encode() to encode to arbitrary types",
4017                      encoding,
4018                      Py_TYPE(v)->tp_name);
4019         Py_DECREF(v);
4020         goto onError;
4021     }
4022     return v;
4023 
4024   onError:
4025     return NULL;
4026 }
4027 
4028 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)4029 unicode_decode_locale(const char *str, Py_ssize_t len,
4030                       _Py_error_handler errors, int current_locale)
4031 {
4032     if (str[len] != '\0' || (size_t)len != strlen(str))  {
4033         PyErr_SetString(PyExc_ValueError, "embedded null byte");
4034         return NULL;
4035     }
4036 
4037     wchar_t *wstr;
4038     size_t wlen;
4039     const char *reason;
4040     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4041                                  current_locale, errors);
4042     if (res != 0) {
4043         if (res == -2) {
4044             PyObject *exc;
4045             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4046                                         "locale", str, len,
4047                                         (Py_ssize_t)wlen,
4048                                         (Py_ssize_t)(wlen + 1),
4049                                         reason);
4050             if (exc != NULL) {
4051                 PyCodec_StrictErrors(exc);
4052                 Py_DECREF(exc);
4053             }
4054         }
4055         else if (res == -3) {
4056             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4057         }
4058         else {
4059             PyErr_NoMemory();
4060         }
4061         return NULL;
4062     }
4063 
4064     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4065     PyMem_RawFree(wstr);
4066     return unicode;
4067 }
4068 
4069 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4070 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4071                               const char *errors)
4072 {
4073     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4074     return unicode_decode_locale(str, len, error_handler, 1);
4075 }
4076 
4077 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4078 PyUnicode_DecodeLocale(const char *str, const char *errors)
4079 {
4080     Py_ssize_t size = (Py_ssize_t)strlen(str);
4081     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4082     return unicode_decode_locale(str, size, error_handler, 1);
4083 }
4084 
4085 
4086 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4087 PyUnicode_DecodeFSDefault(const char *s) {
4088     Py_ssize_t size = (Py_ssize_t)strlen(s);
4089     return PyUnicode_DecodeFSDefaultAndSize(s, size);
4090 }
4091 
4092 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4093 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4094 {
4095     PyInterpreterState *interp = _PyInterpreterState_GET();
4096     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4097     if (fs_codec->utf8) {
4098         return unicode_decode_utf8(s, size,
4099                                    fs_codec->error_handler,
4100                                    fs_codec->errors,
4101                                    NULL);
4102     }
4103 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4104     else if (fs_codec->encoding) {
4105         return PyUnicode_Decode(s, size,
4106                                 fs_codec->encoding,
4107                                 fs_codec->errors);
4108     }
4109 #endif
4110     else {
4111         /* Before _PyUnicode_InitEncodings() is called, the Python codec
4112            machinery is not ready and so cannot be used:
4113            use mbstowcs() in this case. */
4114         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4115         const wchar_t *filesystem_errors = config->filesystem_errors;
4116         assert(filesystem_errors != NULL);
4117         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4118         assert(errors != _Py_ERROR_UNKNOWN);
4119 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4120         return unicode_decode_utf8(s, size, errors, NULL, NULL);
4121 #else
4122         return unicode_decode_locale(s, size, errors, 0);
4123 #endif
4124     }
4125 }
4126 
4127 
4128 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4129 PyUnicode_FSConverter(PyObject* arg, void* addr)
4130 {
4131     PyObject *path = NULL;
4132     PyObject *output = NULL;
4133     Py_ssize_t size;
4134     const char *data;
4135     if (arg == NULL) {
4136         Py_DECREF(*(PyObject**)addr);
4137         *(PyObject**)addr = NULL;
4138         return 1;
4139     }
4140     path = PyOS_FSPath(arg);
4141     if (path == NULL) {
4142         return 0;
4143     }
4144     if (PyBytes_Check(path)) {
4145         output = path;
4146     }
4147     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4148         output = PyUnicode_EncodeFSDefault(path);
4149         Py_DECREF(path);
4150         if (!output) {
4151             return 0;
4152         }
4153         assert(PyBytes_Check(output));
4154     }
4155 
4156     size = PyBytes_GET_SIZE(output);
4157     data = PyBytes_AS_STRING(output);
4158     if ((size_t)size != strlen(data)) {
4159         PyErr_SetString(PyExc_ValueError, "embedded null byte");
4160         Py_DECREF(output);
4161         return 0;
4162     }
4163     *(PyObject**)addr = output;
4164     return Py_CLEANUP_SUPPORTED;
4165 }
4166 
4167 
4168 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4169 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4170 {
4171     int is_buffer = 0;
4172     PyObject *path = NULL;
4173     PyObject *output = NULL;
4174     if (arg == NULL) {
4175         Py_DECREF(*(PyObject**)addr);
4176         *(PyObject**)addr = NULL;
4177         return 1;
4178     }
4179 
4180     is_buffer = PyObject_CheckBuffer(arg);
4181     if (!is_buffer) {
4182         path = PyOS_FSPath(arg);
4183         if (path == NULL) {
4184             return 0;
4185         }
4186     }
4187     else {
4188         path = arg;
4189         Py_INCREF(arg);
4190     }
4191 
4192     if (PyUnicode_Check(path)) {
4193         output = path;
4194     }
4195     else if (PyBytes_Check(path) || is_buffer) {
4196         PyObject *path_bytes = NULL;
4197 
4198         if (!PyBytes_Check(path) &&
4199             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4200             "path should be string, bytes, or os.PathLike, not %.200s",
4201             Py_TYPE(arg)->tp_name)) {
4202                 Py_DECREF(path);
4203             return 0;
4204         }
4205         path_bytes = PyBytes_FromObject(path);
4206         Py_DECREF(path);
4207         if (!path_bytes) {
4208             return 0;
4209         }
4210         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4211                                                   PyBytes_GET_SIZE(path_bytes));
4212         Py_DECREF(path_bytes);
4213         if (!output) {
4214             return 0;
4215         }
4216     }
4217     else {
4218         PyErr_Format(PyExc_TypeError,
4219                      "path should be string, bytes, or os.PathLike, not %.200s",
4220                      Py_TYPE(arg)->tp_name);
4221         Py_DECREF(path);
4222         return 0;
4223     }
4224     if (PyUnicode_READY(output) == -1) {
4225         Py_DECREF(output);
4226         return 0;
4227     }
4228     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4229                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4230         PyErr_SetString(PyExc_ValueError, "embedded null character");
4231         Py_DECREF(output);
4232         return 0;
4233     }
4234     *(PyObject**)addr = output;
4235     return Py_CLEANUP_SUPPORTED;
4236 }
4237 
4238 
4239 static int unicode_fill_utf8(PyObject *unicode);
4240 
4241 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4242 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4243 {
4244     if (!PyUnicode_Check(unicode)) {
4245         PyErr_BadArgument();
4246         return NULL;
4247     }
4248     if (PyUnicode_READY(unicode) == -1)
4249         return NULL;
4250 
4251     if (PyUnicode_UTF8(unicode) == NULL) {
4252         if (unicode_fill_utf8(unicode) == -1) {
4253             return NULL;
4254         }
4255     }
4256 
4257     if (psize)
4258         *psize = PyUnicode_UTF8_LENGTH(unicode);
4259     return PyUnicode_UTF8(unicode);
4260 }
4261 
4262 const char *
PyUnicode_AsUTF8(PyObject * unicode)4263 PyUnicode_AsUTF8(PyObject *unicode)
4264 {
4265     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4266 }
4267 
4268 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4269 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4270 {
4271     if (!PyUnicode_Check(unicode)) {
4272         PyErr_BadArgument();
4273         return NULL;
4274     }
4275     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4276     if (w == NULL) {
4277         /* Non-ASCII compact unicode object */
4278         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4279         assert(PyUnicode_IS_READY(unicode));
4280 
4281         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4282         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4283             PyErr_NoMemory();
4284             return NULL;
4285         }
4286         w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4287         if (w == NULL) {
4288             PyErr_NoMemory();
4289             return NULL;
4290         }
4291         unicode_copy_as_widechar(unicode, w, wlen + 1);
4292         _PyUnicode_WSTR(unicode) = w;
4293         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4294             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4295         }
4296     }
4297     if (size != NULL)
4298         *size = PyUnicode_WSTR_LENGTH(unicode);
4299     return w;
4300 }
4301 
4302 /* Deprecated APIs */
4303 
4304 _Py_COMP_DIAG_PUSH
4305 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4306 
4307 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4308 PyUnicode_AsUnicode(PyObject *unicode)
4309 {
4310     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4311 }
4312 
4313 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4314 _PyUnicode_AsUnicode(PyObject *unicode)
4315 {
4316     Py_ssize_t size;
4317     const Py_UNICODE *wstr;
4318 
4319     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4320     if (wstr && wcslen(wstr) != (size_t)size) {
4321         PyErr_SetString(PyExc_ValueError, "embedded null character");
4322         return NULL;
4323     }
4324     return wstr;
4325 }
4326 
4327 
4328 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4329 PyUnicode_GetSize(PyObject *unicode)
4330 {
4331     if (!PyUnicode_Check(unicode)) {
4332         PyErr_BadArgument();
4333         goto onError;
4334     }
4335     if (_PyUnicode_WSTR(unicode) == NULL) {
4336         if (PyUnicode_AsUnicode(unicode) == NULL)
4337             goto onError;
4338     }
4339     return PyUnicode_WSTR_LENGTH(unicode);
4340 
4341   onError:
4342     return -1;
4343 }
4344 
4345 _Py_COMP_DIAG_POP
4346 
4347 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4348 PyUnicode_GetLength(PyObject *unicode)
4349 {
4350     if (!PyUnicode_Check(unicode)) {
4351         PyErr_BadArgument();
4352         return -1;
4353     }
4354     if (PyUnicode_READY(unicode) == -1)
4355         return -1;
4356     return PyUnicode_GET_LENGTH(unicode);
4357 }
4358 
4359 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4360 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4361 {
4362     const void *data;
4363     int kind;
4364 
4365     if (!PyUnicode_Check(unicode)) {
4366         PyErr_BadArgument();
4367         return (Py_UCS4)-1;
4368     }
4369     if (PyUnicode_READY(unicode) == -1) {
4370         return (Py_UCS4)-1;
4371     }
4372     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4373         PyErr_SetString(PyExc_IndexError, "string index out of range");
4374         return (Py_UCS4)-1;
4375     }
4376     data = PyUnicode_DATA(unicode);
4377     kind = PyUnicode_KIND(unicode);
4378     return PyUnicode_READ(kind, data, index);
4379 }
4380 
4381 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4382 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4383 {
4384     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4385         PyErr_BadArgument();
4386         return -1;
4387     }
4388     assert(PyUnicode_IS_READY(unicode));
4389     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4390         PyErr_SetString(PyExc_IndexError, "string index out of range");
4391         return -1;
4392     }
4393     if (unicode_check_modifiable(unicode))
4394         return -1;
4395     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4396         PyErr_SetString(PyExc_ValueError, "character out of range");
4397         return -1;
4398     }
4399     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4400                     index, ch);
4401     return 0;
4402 }
4403 
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4409 
4410 /* create or adjust a UnicodeDecodeError */
4411 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4412 make_decode_exception(PyObject **exceptionObject,
4413                       const char *encoding,
4414                       const char *input, Py_ssize_t length,
4415                       Py_ssize_t startpos, Py_ssize_t endpos,
4416                       const char *reason)
4417 {
4418     if (*exceptionObject == NULL) {
4419         *exceptionObject = PyUnicodeDecodeError_Create(
4420             encoding, input, length, startpos, endpos, reason);
4421     }
4422     else {
4423         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4424             goto onError;
4425         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4426             goto onError;
4427         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4428             goto onError;
4429     }
4430     return;
4431 
4432 onError:
4433     Py_CLEAR(*exceptionObject);
4434 }
4435 
4436 #ifdef MS_WINDOWS
4437 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4438 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4439 {
4440     if (newsize > *size) {
4441         wchar_t *newbuf = *buf;
4442         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4443             PyErr_NoMemory();
4444             return -1;
4445         }
4446         *buf = newbuf;
4447     }
4448     *size = newsize;
4449     return 0;
4450 }
4451 
4452 /* error handling callback helper:
4453    build arguments, call the callback and check the arguments,
4454    if no exception occurred, copy the replacement to the output
4455    and adjust various state variables.
4456    return 0 on success, -1 on error
4457 */
4458 
4459 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4460 unicode_decode_call_errorhandler_wchar(
4461     const char *errors, PyObject **errorHandler,
4462     const char *encoding, const char *reason,
4463     const char **input, const char **inend, Py_ssize_t *startinpos,
4464     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4465     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4466 {
4467     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4468 
4469     PyObject *restuple = NULL;
4470     PyObject *repunicode = NULL;
4471     Py_ssize_t outsize;
4472     Py_ssize_t insize;
4473     Py_ssize_t requiredsize;
4474     Py_ssize_t newpos;
4475     PyObject *inputobj = NULL;
4476     Py_ssize_t repwlen;
4477 
4478     if (*errorHandler == NULL) {
4479         *errorHandler = PyCodec_LookupError(errors);
4480         if (*errorHandler == NULL)
4481             goto onError;
4482     }
4483 
4484     make_decode_exception(exceptionObject,
4485         encoding,
4486         *input, *inend - *input,
4487         *startinpos, *endinpos,
4488         reason);
4489     if (*exceptionObject == NULL)
4490         goto onError;
4491 
4492     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4493     if (restuple == NULL)
4494         goto onError;
4495     if (!PyTuple_Check(restuple)) {
4496         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4497         goto onError;
4498     }
4499     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4500         goto onError;
4501 
4502     /* Copy back the bytes variables, which might have been modified by the
4503        callback */
4504     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4505     if (!inputobj)
4506         goto onError;
4507     *input = PyBytes_AS_STRING(inputobj);
4508     insize = PyBytes_GET_SIZE(inputobj);
4509     *inend = *input + insize;
4510     /* we can DECREF safely, as the exception has another reference,
4511        so the object won't go away. */
4512     Py_DECREF(inputobj);
4513 
4514     if (newpos<0)
4515         newpos = insize+newpos;
4516     if (newpos<0 || newpos>insize) {
4517         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4518         goto onError;
4519     }
4520 
4521 #if USE_UNICODE_WCHAR_CACHE
4522 _Py_COMP_DIAG_PUSH
4523 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4524     repwlen = PyUnicode_GetSize(repunicode);
4525     if (repwlen < 0)
4526         goto onError;
4527 _Py_COMP_DIAG_POP
4528 #else /* USE_UNICODE_WCHAR_CACHE */
4529     repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4530     if (repwlen < 0)
4531         goto onError;
4532     repwlen--;
4533 #endif /* USE_UNICODE_WCHAR_CACHE */
4534     /* need more space? (at least enough for what we
4535        have+the replacement+the rest of the string (starting
4536        at the new input position), so we won't have to check space
4537        when there are no errors in the rest of the string) */
4538     requiredsize = *outpos;
4539     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4540         goto overflow;
4541     requiredsize += repwlen;
4542     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4543         goto overflow;
4544     requiredsize += insize - newpos;
4545     outsize = *bufsize;
4546     if (requiredsize > outsize) {
4547         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4548             requiredsize = 2*outsize;
4549         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4550             goto onError;
4551         }
4552     }
4553     PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4554     *outpos += repwlen;
4555     *endinpos = newpos;
4556     *inptr = *input + newpos;
4557 
4558     /* we made it! */
4559     Py_DECREF(restuple);
4560     return 0;
4561 
4562   overflow:
4563     PyErr_SetString(PyExc_OverflowError,
4564                     "decoded result is too long for a Python string");
4565 
4566   onError:
4567     Py_XDECREF(restuple);
4568     return -1;
4569 }
4570 #endif   /* MS_WINDOWS */
4571 
4572 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4573 unicode_decode_call_errorhandler_writer(
4574     const char *errors, PyObject **errorHandler,
4575     const char *encoding, const char *reason,
4576     const char **input, const char **inend, Py_ssize_t *startinpos,
4577     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4578     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4579 {
4580     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4581 
4582     PyObject *restuple = NULL;
4583     PyObject *repunicode = NULL;
4584     Py_ssize_t insize;
4585     Py_ssize_t newpos;
4586     Py_ssize_t replen;
4587     Py_ssize_t remain;
4588     PyObject *inputobj = NULL;
4589     int need_to_grow = 0;
4590     const char *new_inptr;
4591 
4592     if (*errorHandler == NULL) {
4593         *errorHandler = PyCodec_LookupError(errors);
4594         if (*errorHandler == NULL)
4595             goto onError;
4596     }
4597 
4598     make_decode_exception(exceptionObject,
4599         encoding,
4600         *input, *inend - *input,
4601         *startinpos, *endinpos,
4602         reason);
4603     if (*exceptionObject == NULL)
4604         goto onError;
4605 
4606     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4607     if (restuple == NULL)
4608         goto onError;
4609     if (!PyTuple_Check(restuple)) {
4610         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4611         goto onError;
4612     }
4613     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4614         goto onError;
4615 
4616     /* Copy back the bytes variables, which might have been modified by the
4617        callback */
4618     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4619     if (!inputobj)
4620         goto onError;
4621     remain = *inend - *input - *endinpos;
4622     *input = PyBytes_AS_STRING(inputobj);
4623     insize = PyBytes_GET_SIZE(inputobj);
4624     *inend = *input + insize;
4625     /* we can DECREF safely, as the exception has another reference,
4626        so the object won't go away. */
4627     Py_DECREF(inputobj);
4628 
4629     if (newpos<0)
4630         newpos = insize+newpos;
4631     if (newpos<0 || newpos>insize) {
4632         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4633         goto onError;
4634     }
4635 
4636     replen = PyUnicode_GET_LENGTH(repunicode);
4637     if (replen > 1) {
4638         writer->min_length += replen - 1;
4639         need_to_grow = 1;
4640     }
4641     new_inptr = *input + newpos;
4642     if (*inend - new_inptr > remain) {
4643         /* We don't know the decoding algorithm here so we make the worst
4644            assumption that one byte decodes to one unicode character.
4645            If unfortunately one byte could decode to more unicode characters,
4646            the decoder may write out-of-bound then.  Is it possible for the
4647            algorithms using this function? */
4648         writer->min_length += *inend - new_inptr - remain;
4649         need_to_grow = 1;
4650     }
4651     if (need_to_grow) {
4652         writer->overallocate = 1;
4653         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4654                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4655             goto onError;
4656     }
4657     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4658         goto onError;
4659 
4660     *endinpos = newpos;
4661     *inptr = new_inptr;
4662 
4663     /* we made it! */
4664     Py_DECREF(restuple);
4665     return 0;
4666 
4667   onError:
4668     Py_XDECREF(restuple);
4669     return -1;
4670 }
4671 
4672 /* --- UTF-7 Codec -------------------------------------------------------- */
4673 
4674 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4675 
4676 /* Three simple macros defining base-64. */
4677 
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note that c is evaluated several times. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
4685 
/* Map a base-64 character c to its 6-bit value (0..63).  The caller must
   already know that c is a base-64 character: anything that is not a
   letter, a digit or '+' is treated as '/' and yields 63. */

#define FROM_BASE64(c)                                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) == '+') ? 62 : 63)
4693 
/* The base-64 character encoding the bottom 6 bits of n; higher bits of n
   are masked off. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4698 
/* DECODE_DIRECT: true if a byte found outside a shift sequence in a UTF-7
 * string decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4706 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4740 
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Only non-NUL ASCII characters can qualify.  "Set D" characters
 * (category 0) always do; "Set O" characters (category 1) only when
 * directO is true, and whitespace (category 2) only when directWS is
 * true.  RFC2152 makes it clear that these two choices vary between
 * applications, so they are left to the caller.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
4752 
4753 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4754 PyUnicode_DecodeUTF7(const char *s,
4755                      Py_ssize_t size,
4756                      const char *errors)
4757 {
4758     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4759 }
4760 
4761 /* The decoder.  The only state we preserve is our read position,
4762  * i.e. how many characters we have consumed.  So if we end in the
4763  * middle of a shift sequence we have to back off the read position
4764  * and the output to the beginning of the sequence, otherwise we lose
4765  * all the shift state (seen bits, number of bits seen, high
4766  * surrogate). */
4767 
4768 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4769 PyUnicode_DecodeUTF7Stateful(const char *s,
4770                              Py_ssize_t size,
4771                              const char *errors,
4772                              Py_ssize_t *consumed)
4773 {
4774     const char *starts = s;
4775     Py_ssize_t startinpos;
4776     Py_ssize_t endinpos;
4777     const char *e;
4778     _PyUnicodeWriter writer;
4779     const char *errmsg = "";
4780     int inShift = 0;
4781     Py_ssize_t shiftOutStart;
4782     unsigned int base64bits = 0;
4783     unsigned long base64buffer = 0;
4784     Py_UCS4 surrogate = 0;
4785     PyObject *errorHandler = NULL;
4786     PyObject *exc = NULL;
4787 
4788     if (size == 0) {
4789         if (consumed)
4790             *consumed = 0;
4791         _Py_RETURN_UNICODE_EMPTY();
4792     }
4793 
4794     /* Start off assuming it's all ASCII. Widen later as necessary. */
4795     _PyUnicodeWriter_Init(&writer);
4796     writer.min_length = size;
4797 
4798     shiftOutStart = 0;
4799     e = s + size;
4800 
4801     while (s < e) {
4802         Py_UCS4 ch;
4803       restart:
4804         ch = (unsigned char) *s;
4805 
4806         if (inShift) { /* in a base-64 section */
4807             if (IS_BASE64(ch)) { /* consume a base-64 character */
4808                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4809                 base64bits += 6;
4810                 s++;
4811                 if (base64bits >= 16) {
4812                     /* we have enough bits for a UTF-16 value */
4813                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4814                     base64bits -= 16;
4815                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4816                     assert(outCh <= 0xffff);
4817                     if (surrogate) {
4818                         /* expecting a second surrogate */
4819                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4820                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4821                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4822                                 goto onError;
4823                             surrogate = 0;
4824                             continue;
4825                         }
4826                         else {
4827                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4828                                 goto onError;
4829                             surrogate = 0;
4830                         }
4831                     }
4832                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4833                         /* first surrogate */
4834                         surrogate = outCh;
4835                     }
4836                     else {
4837                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4838                             goto onError;
4839                     }
4840                 }
4841             }
4842             else { /* now leaving a base-64 section */
4843                 inShift = 0;
4844                 if (base64bits > 0) { /* left-over bits */
4845                     if (base64bits >= 6) {
4846                         /* We've seen at least one base-64 character */
4847                         s++;
4848                         errmsg = "partial character in shift sequence";
4849                         goto utf7Error;
4850                     }
4851                     else {
4852                         /* Some bits remain; they should be zero */
4853                         if (base64buffer != 0) {
4854                             s++;
4855                             errmsg = "non-zero padding bits in shift sequence";
4856                             goto utf7Error;
4857                         }
4858                     }
4859                 }
4860                 if (surrogate && DECODE_DIRECT(ch)) {
4861                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4862                         goto onError;
4863                 }
4864                 surrogate = 0;
4865                 if (ch == '-') {
4866                     /* '-' is absorbed; other terminating
4867                        characters are preserved */
4868                     s++;
4869                 }
4870             }
4871         }
4872         else if ( ch == '+' ) {
4873             startinpos = s-starts;
4874             s++; /* consume '+' */
4875             if (s < e && *s == '-') { /* '+-' encodes '+' */
4876                 s++;
4877                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4878                     goto onError;
4879             }
4880             else if (s < e && !IS_BASE64(*s)) {
4881                 s++;
4882                 errmsg = "ill-formed sequence";
4883                 goto utf7Error;
4884             }
4885             else { /* begin base64-encoded section */
4886                 inShift = 1;
4887                 surrogate = 0;
4888                 shiftOutStart = writer.pos;
4889                 base64bits = 0;
4890                 base64buffer = 0;
4891             }
4892         }
4893         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4894             s++;
4895             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4896                 goto onError;
4897         }
4898         else {
4899             startinpos = s-starts;
4900             s++;
4901             errmsg = "unexpected special character";
4902             goto utf7Error;
4903         }
4904         continue;
4905 utf7Error:
4906         endinpos = s-starts;
4907         if (unicode_decode_call_errorhandler_writer(
4908                 errors, &errorHandler,
4909                 "utf7", errmsg,
4910                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4911                 &writer))
4912             goto onError;
4913     }
4914 
4915     /* end of string */
4916 
4917     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4918         /* if we're in an inconsistent state, that's an error */
4919         inShift = 0;
4920         if (surrogate ||
4921                 (base64bits >= 6) ||
4922                 (base64bits > 0 && base64buffer != 0)) {
4923             endinpos = size;
4924             if (unicode_decode_call_errorhandler_writer(
4925                     errors, &errorHandler,
4926                     "utf7", "unterminated shift sequence",
4927                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4928                     &writer))
4929                 goto onError;
4930             if (s < e)
4931                 goto restart;
4932         }
4933     }
4934 
4935     /* return state */
4936     if (consumed) {
4937         if (inShift) {
4938             *consumed = startinpos;
4939             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4940                 PyObject *result = PyUnicode_FromKindAndData(
4941                         writer.kind, writer.data, shiftOutStart);
4942                 Py_XDECREF(errorHandler);
4943                 Py_XDECREF(exc);
4944                 _PyUnicodeWriter_Dealloc(&writer);
4945                 return result;
4946             }
4947             writer.pos = shiftOutStart; /* back off output */
4948         }
4949         else {
4950             *consumed = s-starts;
4951         }
4952     }
4953 
4954     Py_XDECREF(errorHandler);
4955     Py_XDECREF(exc);
4956     return _PyUnicodeWriter_Finish(&writer);
4957 
4958   onError:
4959     Py_XDECREF(errorHandler);
4960     Py_XDECREF(exc);
4961     _PyUnicodeWriter_Dealloc(&writer);
4962     return NULL;
4963 }
4964 
4965 
4966 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4967 _PyUnicode_EncodeUTF7(PyObject *str,
4968                       int base64SetO,
4969                       int base64WhiteSpace,
4970                       const char *errors)
4971 {
4972     int kind;
4973     const void *data;
4974     Py_ssize_t len;
4975     PyObject *v;
4976     int inShift = 0;
4977     Py_ssize_t i;
4978     unsigned int base64bits = 0;
4979     unsigned long base64buffer = 0;
4980     char * out;
4981     const char * start;
4982 
4983     if (PyUnicode_READY(str) == -1)
4984         return NULL;
4985     kind = PyUnicode_KIND(str);
4986     data = PyUnicode_DATA(str);
4987     len = PyUnicode_GET_LENGTH(str);
4988 
4989     if (len == 0)
4990         return PyBytes_FromStringAndSize(NULL, 0);
4991 
4992     /* It might be possible to tighten this worst case */
4993     if (len > PY_SSIZE_T_MAX / 8)
4994         return PyErr_NoMemory();
4995     v = PyBytes_FromStringAndSize(NULL, len * 8);
4996     if (v == NULL)
4997         return NULL;
4998 
4999     start = out = PyBytes_AS_STRING(v);
5000     for (i = 0; i < len; ++i) {
5001         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5002 
5003         if (inShift) {
5004             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5005                 /* shifting out */
5006                 if (base64bits) { /* output remaining bits */
5007                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
5008                     base64buffer = 0;
5009                     base64bits = 0;
5010                 }
5011                 inShift = 0;
5012                 /* Characters not in the BASE64 set implicitly unshift the sequence
5013                    so no '-' is required, except if the character is itself a '-' */
5014                 if (IS_BASE64(ch) || ch == '-') {
5015                     *out++ = '-';
5016                 }
5017                 *out++ = (char) ch;
5018             }
5019             else {
5020                 goto encode_char;
5021             }
5022         }
5023         else { /* not in a shift sequence */
5024             if (ch == '+') {
5025                 *out++ = '+';
5026                         *out++ = '-';
5027             }
5028             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5029                 *out++ = (char) ch;
5030             }
5031             else {
5032                 *out++ = '+';
5033                 inShift = 1;
5034                 goto encode_char;
5035             }
5036         }
5037         continue;
5038 encode_char:
5039         if (ch >= 0x10000) {
5040             assert(ch <= MAX_UNICODE);
5041 
5042             /* code first surrogate */
5043             base64bits += 16;
5044             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
5045             while (base64bits >= 6) {
5046                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5047                 base64bits -= 6;
5048             }
5049             /* prepare second surrogate */
5050             ch = Py_UNICODE_LOW_SURROGATE(ch);
5051         }
5052         base64bits += 16;
5053         base64buffer = (base64buffer << 16) | ch;
5054         while (base64bits >= 6) {
5055             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5056             base64bits -= 6;
5057         }
5058     }
5059     if (base64bits)
5060         *out++= TO_BASE64(base64buffer << (6-base64bits) );
5061     if (inShift)
5062         *out++ = '-';
5063     if (_PyBytes_Resize(&v, out - start) < 0)
5064         return NULL;
5065     return v;
5066 }
5067 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)5068 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5069                      Py_ssize_t size,
5070                      int base64SetO,
5071                      int base64WhiteSpace,
5072                      const char *errors)
5073 {
5074     PyObject *result;
5075     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5076     if (tmp == NULL)
5077         return NULL;
5078     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
5079                                    base64WhiteSpace, errors);
5080     Py_DECREF(tmp);
5081     return result;
5082 }
5083 
5084 #undef IS_BASE64
5085 #undef FROM_BASE64
5086 #undef TO_BASE64
5087 #undef DECODE_DIRECT
5088 #undef ENCODE_DIRECT
5089 
5090 /* --- UTF-8 Codec -------------------------------------------------------- */
5091 
5092 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5093 PyUnicode_DecodeUTF8(const char *s,
5094                      Py_ssize_t size,
5095                      const char *errors)
5096 {
5097     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5098 }
5099 
5100 #include "stringlib/asciilib.h"
5101 #include "stringlib/codecs.h"
5102 #include "stringlib/undef.h"
5103 
5104 #include "stringlib/ucs1lib.h"
5105 #include "stringlib/codecs.h"
5106 #include "stringlib/undef.h"
5107 
5108 #include "stringlib/ucs2lib.h"
5109 #include "stringlib/codecs.h"
5110 #include "stringlib/undef.h"
5111 
5112 #include "stringlib/ucs4lib.h"
5113 #include "stringlib/codecs.h"
5114 #include "stringlib/undef.h"
5115 
5116 /* Mask to quickly check whether a C 'size_t' contains a
5117    non-ASCII, UTF8-encoded char. */
5118 #if (SIZEOF_SIZE_T == 8)
5119 # define ASCII_CHAR_MASK 0x8080808080808080ULL
5120 #elif (SIZEOF_SIZE_T == 4)
5121 # define ASCII_CHAR_MASK 0x80808080U
5122 #else
5123 # error C 'size_t' size should be either 4 or 8!
5124 #endif
5125 
5126 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)5127 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5128 {
5129     const char *p = start;
5130 
5131 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5132     assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5133     if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5134         /* Fast path, see in STRINGLIB(utf8_decode) for
5135            an explanation. */
5136         /* Help allocation */
5137         const char *_p = p;
5138         Py_UCS1 * q = dest;
5139         while (_p + SIZEOF_SIZE_T <= end) {
5140             size_t value = *(const size_t *) _p;
5141             if (value & ASCII_CHAR_MASK)
5142                 break;
5143             *((size_t *)q) = value;
5144             _p += SIZEOF_SIZE_T;
5145             q += SIZEOF_SIZE_T;
5146         }
5147         p = _p;
5148         while (p < end) {
5149             if ((unsigned char)*p & 0x80)
5150                 break;
5151             *q++ = *p++;
5152         }
5153         return p - start;
5154     }
5155 #endif
5156     while (p < end) {
5157         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5158            for an explanation. */
5159         if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5160             /* Help allocation */
5161             const char *_p = p;
5162             while (_p + SIZEOF_SIZE_T <= end) {
5163                 size_t value = *(const size_t *) _p;
5164                 if (value & ASCII_CHAR_MASK)
5165                     break;
5166                 _p += SIZEOF_SIZE_T;
5167             }
5168             p = _p;
5169             if (_p == end)
5170                 break;
5171         }
5172         if ((unsigned char)*p & 0x80)
5173             break;
5174         ++p;
5175     }
5176     memcpy(dest, start, p - start);
5177     return p - start;
5178 }
5179 
5180 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5181 unicode_decode_utf8(const char *s, Py_ssize_t size,
5182                     _Py_error_handler error_handler, const char *errors,
5183                     Py_ssize_t *consumed)
5184 {
5185     if (size == 0) {
5186         if (consumed)
5187             *consumed = 0;
5188         _Py_RETURN_UNICODE_EMPTY();
5189     }
5190 
5191     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5192     if (size == 1 && (unsigned char)s[0] < 128) {
5193         if (consumed) {
5194             *consumed = 1;
5195         }
5196         return get_latin1_char((unsigned char)s[0]);
5197     }
5198 
5199     const char *starts = s;
5200     const char *end = s + size;
5201 
5202     // fast path: try ASCII string.
5203     PyObject *u = PyUnicode_New(size, 127);
5204     if (u == NULL) {
5205         return NULL;
5206     }
5207     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5208     if (s == end) {
5209         return u;
5210     }
5211 
5212     // Use _PyUnicodeWriter after fast path is failed.
5213     _PyUnicodeWriter writer;
5214     _PyUnicodeWriter_InitWithBuffer(&writer, u);
5215     writer.pos = s - starts;
5216 
5217     Py_ssize_t startinpos, endinpos;
5218     const char *errmsg = "";
5219     PyObject *error_handler_obj = NULL;
5220     PyObject *exc = NULL;
5221 
5222     while (s < end) {
5223         Py_UCS4 ch;
5224         int kind = writer.kind;
5225 
5226         if (kind == PyUnicode_1BYTE_KIND) {
5227             if (PyUnicode_IS_ASCII(writer.buffer))
5228                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5229             else
5230                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5231         } else if (kind == PyUnicode_2BYTE_KIND) {
5232             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5233         } else {
5234             assert(kind == PyUnicode_4BYTE_KIND);
5235             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5236         }
5237 
5238         switch (ch) {
5239         case 0:
5240             if (s == end || consumed)
5241                 goto End;
5242             errmsg = "unexpected end of data";
5243             startinpos = s - starts;
5244             endinpos = end - starts;
5245             break;
5246         case 1:
5247             errmsg = "invalid start byte";
5248             startinpos = s - starts;
5249             endinpos = startinpos + 1;
5250             break;
5251         case 2:
5252             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5253                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5254             {
5255                 /* Truncated surrogate code in range D800-DFFF */
5256                 goto End;
5257             }
5258             /* fall through */
5259         case 3:
5260         case 4:
5261             errmsg = "invalid continuation byte";
5262             startinpos = s - starts;
5263             endinpos = startinpos + ch - 1;
5264             break;
5265         default:
5266             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5267                 goto onError;
5268             continue;
5269         }
5270 
5271         if (error_handler == _Py_ERROR_UNKNOWN)
5272             error_handler = _Py_GetErrorHandler(errors);
5273 
5274         switch (error_handler) {
5275         case _Py_ERROR_IGNORE:
5276             s += (endinpos - startinpos);
5277             break;
5278 
5279         case _Py_ERROR_REPLACE:
5280             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5281                 goto onError;
5282             s += (endinpos - startinpos);
5283             break;
5284 
5285         case _Py_ERROR_SURROGATEESCAPE:
5286         {
5287             Py_ssize_t i;
5288 
5289             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5290                 goto onError;
5291             for (i=startinpos; i<endinpos; i++) {
5292                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5293                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5294                                 ch + 0xdc00);
5295                 writer.pos++;
5296             }
5297             s += (endinpos - startinpos);
5298             break;
5299         }
5300 
5301         default:
5302             if (unicode_decode_call_errorhandler_writer(
5303                     errors, &error_handler_obj,
5304                     "utf-8", errmsg,
5305                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5306                     &writer))
5307                 goto onError;
5308         }
5309     }
5310 
5311 End:
5312     if (consumed)
5313         *consumed = s - starts;
5314 
5315     Py_XDECREF(error_handler_obj);
5316     Py_XDECREF(exc);
5317     return _PyUnicodeWriter_Finish(&writer);
5318 
5319 onError:
5320     Py_XDECREF(error_handler_obj);
5321     Py_XDECREF(exc);
5322     _PyUnicodeWriter_Dealloc(&writer);
5323     return NULL;
5324 }
5325 
5326 
5327 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5328 PyUnicode_DecodeUTF8Stateful(const char *s,
5329                              Py_ssize_t size,
5330                              const char *errors,
5331                              Py_ssize_t *consumed)
5332 {
5333     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5334 }
5335 
5336 
5337 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5338    non-zero, use strict error handler otherwise.
5339 
5340    On success, write a pointer to a newly allocated wide character string into
5341    *wstr (use PyMem_RawFree() to free the memory) and write the output length
5342    (in number of wchar_t units) into *wlen (if wlen is set).
5343 
5344    On memory allocation failure, return -1.
5345 
5346    On decoding error (if surrogateescape is zero), return -2. If wlen is
5347    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5348    is not NULL, write the decoding error message into *reason. */
5349 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5350 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5351                  const char **reason, _Py_error_handler errors)
5352 {
5353     const char *orig_s = s;
5354     const char *e;
5355     wchar_t *unicode;
5356     Py_ssize_t outpos;
5357 
5358     int surrogateescape = 0;
5359     int surrogatepass = 0;
5360     switch (errors)
5361     {
5362     case _Py_ERROR_STRICT:
5363         break;
5364     case _Py_ERROR_SURROGATEESCAPE:
5365         surrogateescape = 1;
5366         break;
5367     case _Py_ERROR_SURROGATEPASS:
5368         surrogatepass = 1;
5369         break;
5370     default:
5371         return -3;
5372     }
5373 
5374     /* Note: size will always be longer than the resulting Unicode
5375        character count */
5376     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5377         return -1;
5378     }
5379 
5380     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5381     if (!unicode) {
5382         return -1;
5383     }
5384 
5385     /* Unpack UTF-8 encoded data */
5386     e = s + size;
5387     outpos = 0;
5388     while (s < e) {
5389         Py_UCS4 ch;
5390 #if SIZEOF_WCHAR_T == 4
5391         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5392 #else
5393         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5394 #endif
5395         if (ch > 0xFF) {
5396 #if SIZEOF_WCHAR_T == 4
5397             Py_UNREACHABLE();
5398 #else
5399             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5400             /* write a surrogate pair */
5401             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5402             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5403 #endif
5404         }
5405         else {
5406             if (!ch && s == e) {
5407                 break;
5408             }
5409 
5410             if (surrogateescape) {
5411                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5412             }
5413             else {
5414                 /* Is it a valid three-byte code? */
5415                 if (surrogatepass
5416                     && (e - s) >= 3
5417                     && (s[0] & 0xf0) == 0xe0
5418                     && (s[1] & 0xc0) == 0x80
5419                     && (s[2] & 0xc0) == 0x80)
5420                 {
5421                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5422                     s += 3;
5423                     unicode[outpos++] = ch;
5424                 }
5425                 else {
5426                     PyMem_RawFree(unicode );
5427                     if (reason != NULL) {
5428                         switch (ch) {
5429                         case 0:
5430                             *reason = "unexpected end of data";
5431                             break;
5432                         case 1:
5433                             *reason = "invalid start byte";
5434                             break;
5435                         /* 2, 3, 4 */
5436                         default:
5437                             *reason = "invalid continuation byte";
5438                             break;
5439                         }
5440                     }
5441                     if (wlen != NULL) {
5442                         *wlen = s - orig_s;
5443                     }
5444                     return -2;
5445                 }
5446             }
5447         }
5448     }
5449     unicode[outpos] = L'\0';
5450     if (wlen) {
5451         *wlen = outpos;
5452     }
5453     *wstr = unicode;
5454     return 0;
5455 }
5456 
5457 
5458 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5459 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5460                                size_t *wlen)
5461 {
5462     wchar_t *wstr;
5463     int res = _Py_DecodeUTF8Ex(arg, arglen,
5464                                &wstr, wlen,
5465                                NULL, _Py_ERROR_SURROGATEESCAPE);
5466     if (res != 0) {
5467         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5468         assert(res != -3);
5469         if (wlen) {
5470             *wlen = (size_t)res;
5471         }
5472         return NULL;
5473     }
5474     return wstr;
5475 }
5476 
5477 
5478 /* UTF-8 encoder using the surrogateescape error handler .
5479 
5480    On success, return 0 and write the newly allocated character string (use
5481    PyMem_Free() to free the memory) into *str.
5482 
5483    On encoding failure, return -2 and write the position of the invalid
5484    surrogate character into *error_pos (if error_pos is set) and the decoding
5485    error message into *reason (if reason is set).
5486 
5487    On memory allocation failure, return -1. */
5488 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5489 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5490                  const char **reason, int raw_malloc, _Py_error_handler errors)
5491 {
5492     const Py_ssize_t max_char_size = 4;
5493     Py_ssize_t len = wcslen(text);
5494 
5495     assert(len >= 0);
5496 
5497     int surrogateescape = 0;
5498     int surrogatepass = 0;
5499     switch (errors)
5500     {
5501     case _Py_ERROR_STRICT:
5502         break;
5503     case _Py_ERROR_SURROGATEESCAPE:
5504         surrogateescape = 1;
5505         break;
5506     case _Py_ERROR_SURROGATEPASS:
5507         surrogatepass = 1;
5508         break;
5509     default:
5510         return -3;
5511     }
5512 
5513     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5514         return -1;
5515     }
5516     char *bytes;
5517     if (raw_malloc) {
5518         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5519     }
5520     else {
5521         bytes = PyMem_Malloc((len + 1) * max_char_size);
5522     }
5523     if (bytes == NULL) {
5524         return -1;
5525     }
5526 
5527     char *p = bytes;
5528     Py_ssize_t i;
5529     for (i = 0; i < len; ) {
5530         Py_ssize_t ch_pos = i;
5531         Py_UCS4 ch = text[i];
5532         i++;
5533 #if Py_UNICODE_SIZE == 2
5534         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5535             && i < len
5536             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5537         {
5538             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5539             i++;
5540         }
5541 #endif
5542 
5543         if (ch < 0x80) {
5544             /* Encode ASCII */
5545             *p++ = (char) ch;
5546 
5547         }
5548         else if (ch < 0x0800) {
5549             /* Encode Latin-1 */
5550             *p++ = (char)(0xc0 | (ch >> 6));
5551             *p++ = (char)(0x80 | (ch & 0x3f));
5552         }
5553         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5554             /* surrogateescape error handler */
5555             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5556                 if (error_pos != NULL) {
5557                     *error_pos = (size_t)ch_pos;
5558                 }
5559                 if (reason != NULL) {
5560                     *reason = "encoding error";
5561                 }
5562                 if (raw_malloc) {
5563                     PyMem_RawFree(bytes);
5564                 }
5565                 else {
5566                     PyMem_Free(bytes);
5567                 }
5568                 return -2;
5569             }
5570             *p++ = (char)(ch & 0xff);
5571         }
5572         else if (ch < 0x10000) {
5573             *p++ = (char)(0xe0 | (ch >> 12));
5574             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5575             *p++ = (char)(0x80 | (ch & 0x3f));
5576         }
5577         else {  /* ch >= 0x10000 */
5578             assert(ch <= MAX_UNICODE);
5579             /* Encode UCS4 Unicode ordinals */
5580             *p++ = (char)(0xf0 | (ch >> 18));
5581             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5582             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5583             *p++ = (char)(0x80 | (ch & 0x3f));
5584         }
5585     }
5586     *p++ = '\0';
5587 
5588     size_t final_size = (p - bytes);
5589     char *bytes2;
5590     if (raw_malloc) {
5591         bytes2 = PyMem_RawRealloc(bytes, final_size);
5592     }
5593     else {
5594         bytes2 = PyMem_Realloc(bytes, final_size);
5595     }
5596     if (bytes2 == NULL) {
5597         if (error_pos != NULL) {
5598             *error_pos = (size_t)-1;
5599         }
5600         if (raw_malloc) {
5601             PyMem_RawFree(bytes);
5602         }
5603         else {
5604             PyMem_Free(bytes);
5605         }
5606         return -1;
5607     }
5608     *str = bytes2;
5609     return 0;
5610 }
5611 
5612 
5613 /* Primary internal function which creates utf8 encoded bytes objects.
5614 
5615    Allocation strategy:  if the string is short, convert into a stack buffer
5616    and allocate exactly as much space needed at the end.  Else allocate the
5617    maximum possible needed (4 result bytes per Unicode character), and return
5618    the excess memory at the end.
5619 */
5620 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5621 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5622                     const char *errors)
5623 {
5624     if (!PyUnicode_Check(unicode)) {
5625         PyErr_BadArgument();
5626         return NULL;
5627     }
5628 
5629     if (PyUnicode_READY(unicode) == -1)
5630         return NULL;
5631 
5632     if (PyUnicode_UTF8(unicode))
5633         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5634                                          PyUnicode_UTF8_LENGTH(unicode));
5635 
5636     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5637     const void *data = PyUnicode_DATA(unicode);
5638     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5639 
5640     _PyBytesWriter writer;
5641     char *end;
5642 
5643     switch (kind) {
5644     default:
5645         Py_UNREACHABLE();
5646     case PyUnicode_1BYTE_KIND:
5647         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5648         assert(!PyUnicode_IS_ASCII(unicode));
5649         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5650         break;
5651     case PyUnicode_2BYTE_KIND:
5652         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5653         break;
5654     case PyUnicode_4BYTE_KIND:
5655         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5656         break;
5657     }
5658 
5659     if (end == NULL) {
5660         _PyBytesWriter_Dealloc(&writer);
5661         return NULL;
5662     }
5663     return _PyBytesWriter_Finish(&writer, end);
5664 }
5665 
5666 static int
unicode_fill_utf8(PyObject * unicode)5667 unicode_fill_utf8(PyObject *unicode)
5668 {
5669     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5670     assert(!PyUnicode_IS_ASCII(unicode));
5671 
5672     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5673     const void *data = PyUnicode_DATA(unicode);
5674     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5675 
5676     _PyBytesWriter writer;
5677     char *end;
5678 
5679     switch (kind) {
5680     default:
5681         Py_UNREACHABLE();
5682     case PyUnicode_1BYTE_KIND:
5683         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5684                                    _Py_ERROR_STRICT, NULL);
5685         break;
5686     case PyUnicode_2BYTE_KIND:
5687         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5688                                    _Py_ERROR_STRICT, NULL);
5689         break;
5690     case PyUnicode_4BYTE_KIND:
5691         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5692                                    _Py_ERROR_STRICT, NULL);
5693         break;
5694     }
5695     if (end == NULL) {
5696         _PyBytesWriter_Dealloc(&writer);
5697         return -1;
5698     }
5699 
5700     const char *start = writer.use_small_buffer ? writer.small_buffer :
5701                     PyBytes_AS_STRING(writer.buffer);
5702     Py_ssize_t len = end - start;
5703 
5704     char *cache = PyObject_Malloc(len + 1);
5705     if (cache == NULL) {
5706         _PyBytesWriter_Dealloc(&writer);
5707         PyErr_NoMemory();
5708         return -1;
5709     }
5710     _PyUnicode_UTF8(unicode) = cache;
5711     _PyUnicode_UTF8_LENGTH(unicode) = len;
5712     memcpy(cache, start, len);
5713     cache[len] = '\0';
5714     _PyBytesWriter_Dealloc(&writer);
5715     return 0;
5716 }
5717 
5718 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5719 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5720 {
5721     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5722 }
5723 
5724 
5725 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5726 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5727                      Py_ssize_t size,
5728                      const char *errors)
5729 {
5730     PyObject *v, *unicode;
5731 
5732     unicode = PyUnicode_FromWideChar(s, size);
5733     if (unicode == NULL)
5734         return NULL;
5735     v = _PyUnicode_AsUTF8String(unicode, errors);
5736     Py_DECREF(unicode);
5737     return v;
5738 }
5739 
5740 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5741 PyUnicode_AsUTF8String(PyObject *unicode)
5742 {
5743     return _PyUnicode_AsUTF8String(unicode, NULL);
5744 }
5745 
5746 /* --- UTF-32 Codec ------------------------------------------------------- */
5747 
5748 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5749 PyUnicode_DecodeUTF32(const char *s,
5750                       Py_ssize_t size,
5751                       const char *errors,
5752                       int *byteorder)
5753 {
5754     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5755 }
5756 
5757 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5758 PyUnicode_DecodeUTF32Stateful(const char *s,
5759                               Py_ssize_t size,
5760                               const char *errors,
5761                               int *byteorder,
5762                               Py_ssize_t *consumed)
5763 {
5764     const char *starts = s;
5765     Py_ssize_t startinpos;
5766     Py_ssize_t endinpos;
5767     _PyUnicodeWriter writer;
5768     const unsigned char *q, *e;
5769     int le, bo = 0;       /* assume native ordering by default */
5770     const char *encoding;
5771     const char *errmsg = "";
5772     PyObject *errorHandler = NULL;
5773     PyObject *exc = NULL;
5774 
5775     q = (const unsigned char *)s;
5776     e = q + size;
5777 
5778     if (byteorder)
5779         bo = *byteorder;
5780 
5781     /* Check for BOM marks (U+FEFF) in the input and adjust current
5782        byte order setting accordingly. In native mode, the leading BOM
5783        mark is skipped, in all other modes, it is copied to the output
5784        stream as-is (giving a ZWNBSP character). */
5785     if (bo == 0 && size >= 4) {
5786         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5787         if (bom == 0x0000FEFF) {
5788             bo = -1;
5789             q += 4;
5790         }
5791         else if (bom == 0xFFFE0000) {
5792             bo = 1;
5793             q += 4;
5794         }
5795         if (byteorder)
5796             *byteorder = bo;
5797     }
5798 
5799     if (q == e) {
5800         if (consumed)
5801             *consumed = size;
5802         _Py_RETURN_UNICODE_EMPTY();
5803     }
5804 
5805 #ifdef WORDS_BIGENDIAN
5806     le = bo < 0;
5807 #else
5808     le = bo <= 0;
5809 #endif
5810     encoding = le ? "utf-32-le" : "utf-32-be";
5811 
5812     _PyUnicodeWriter_Init(&writer);
5813     writer.min_length = (e - q + 3) / 4;
5814     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5815         goto onError;
5816 
5817     while (1) {
5818         Py_UCS4 ch = 0;
5819         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5820 
5821         if (e - q >= 4) {
5822             enum PyUnicode_Kind kind = writer.kind;
5823             void *data = writer.data;
5824             const unsigned char *last = e - 4;
5825             Py_ssize_t pos = writer.pos;
5826             if (le) {
5827                 do {
5828                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5829                     if (ch > maxch)
5830                         break;
5831                     if (kind != PyUnicode_1BYTE_KIND &&
5832                         Py_UNICODE_IS_SURROGATE(ch))
5833                         break;
5834                     PyUnicode_WRITE(kind, data, pos++, ch);
5835                     q += 4;
5836                 } while (q <= last);
5837             }
5838             else {
5839                 do {
5840                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5841                     if (ch > maxch)
5842                         break;
5843                     if (kind != PyUnicode_1BYTE_KIND &&
5844                         Py_UNICODE_IS_SURROGATE(ch))
5845                         break;
5846                     PyUnicode_WRITE(kind, data, pos++, ch);
5847                     q += 4;
5848                 } while (q <= last);
5849             }
5850             writer.pos = pos;
5851         }
5852 
5853         if (Py_UNICODE_IS_SURROGATE(ch)) {
5854             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5855             startinpos = ((const char *)q) - starts;
5856             endinpos = startinpos + 4;
5857         }
5858         else if (ch <= maxch) {
5859             if (q == e || consumed)
5860                 break;
5861             /* remaining bytes at the end? (size should be divisible by 4) */
5862             errmsg = "truncated data";
5863             startinpos = ((const char *)q) - starts;
5864             endinpos = ((const char *)e) - starts;
5865         }
5866         else {
5867             if (ch < 0x110000) {
5868                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5869                     goto onError;
5870                 q += 4;
5871                 continue;
5872             }
5873             errmsg = "code point not in range(0x110000)";
5874             startinpos = ((const char *)q) - starts;
5875             endinpos = startinpos + 4;
5876         }
5877 
5878         /* The remaining input chars are ignored if the callback
5879            chooses to skip the input */
5880         if (unicode_decode_call_errorhandler_writer(
5881                 errors, &errorHandler,
5882                 encoding, errmsg,
5883                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5884                 &writer))
5885             goto onError;
5886     }
5887 
5888     if (consumed)
5889         *consumed = (const char *)q-starts;
5890 
5891     Py_XDECREF(errorHandler);
5892     Py_XDECREF(exc);
5893     return _PyUnicodeWriter_Finish(&writer);
5894 
5895   onError:
5896     _PyUnicodeWriter_Dealloc(&writer);
5897     Py_XDECREF(errorHandler);
5898     Py_XDECREF(exc);
5899     return NULL;
5900 }
5901 
5902 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5903 _PyUnicode_EncodeUTF32(PyObject *str,
5904                        const char *errors,
5905                        int byteorder)
5906 {
5907     enum PyUnicode_Kind kind;
5908     const void *data;
5909     Py_ssize_t len;
5910     PyObject *v;
5911     uint32_t *out;
5912 #if PY_LITTLE_ENDIAN
5913     int native_ordering = byteorder <= 0;
5914 #else
5915     int native_ordering = byteorder >= 0;
5916 #endif
5917     const char *encoding;
5918     Py_ssize_t nsize, pos;
5919     PyObject *errorHandler = NULL;
5920     PyObject *exc = NULL;
5921     PyObject *rep = NULL;
5922 
5923     if (!PyUnicode_Check(str)) {
5924         PyErr_BadArgument();
5925         return NULL;
5926     }
5927     if (PyUnicode_READY(str) == -1)
5928         return NULL;
5929     kind = PyUnicode_KIND(str);
5930     data = PyUnicode_DATA(str);
5931     len = PyUnicode_GET_LENGTH(str);
5932 
5933     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5934         return PyErr_NoMemory();
5935     nsize = len + (byteorder == 0);
5936     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5937     if (v == NULL)
5938         return NULL;
5939 
5940     /* output buffer is 4-bytes aligned */
5941     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5942     out = (uint32_t *)PyBytes_AS_STRING(v);
5943     if (byteorder == 0)
5944         *out++ = 0xFEFF;
5945     if (len == 0)
5946         goto done;
5947 
5948     if (byteorder == -1)
5949         encoding = "utf-32-le";
5950     else if (byteorder == 1)
5951         encoding = "utf-32-be";
5952     else
5953         encoding = "utf-32";
5954 
5955     if (kind == PyUnicode_1BYTE_KIND) {
5956         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5957         goto done;
5958     }
5959 
5960     pos = 0;
5961     while (pos < len) {
5962         Py_ssize_t repsize, moreunits;
5963 
5964         if (kind == PyUnicode_2BYTE_KIND) {
5965             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5966                                         &out, native_ordering);
5967         }
5968         else {
5969             assert(kind == PyUnicode_4BYTE_KIND);
5970             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5971                                         &out, native_ordering);
5972         }
5973         if (pos == len)
5974             break;
5975 
5976         rep = unicode_encode_call_errorhandler(
5977                 errors, &errorHandler,
5978                 encoding, "surrogates not allowed",
5979                 str, &exc, pos, pos + 1, &pos);
5980         if (!rep)
5981             goto error;
5982 
5983         if (PyBytes_Check(rep)) {
5984             repsize = PyBytes_GET_SIZE(rep);
5985             if (repsize & 3) {
5986                 raise_encode_exception(&exc, encoding,
5987                                        str, pos - 1, pos,
5988                                        "surrogates not allowed");
5989                 goto error;
5990             }
5991             moreunits = repsize / 4;
5992         }
5993         else {
5994             assert(PyUnicode_Check(rep));
5995             if (PyUnicode_READY(rep) < 0)
5996                 goto error;
5997             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5998             if (!PyUnicode_IS_ASCII(rep)) {
5999                 raise_encode_exception(&exc, encoding,
6000                                        str, pos - 1, pos,
6001                                        "surrogates not allowed");
6002                 goto error;
6003             }
6004         }
6005 
6006         /* four bytes are reserved for each surrogate */
6007         if (moreunits > 1) {
6008             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6009             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
6010                 /* integer overflow */
6011                 PyErr_NoMemory();
6012                 goto error;
6013             }
6014             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
6015                 goto error;
6016             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
6017         }
6018 
6019         if (PyBytes_Check(rep)) {
6020             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6021             out += moreunits;
6022         } else /* rep is unicode */ {
6023             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6024             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6025                                  &out, native_ordering);
6026         }
6027 
6028         Py_CLEAR(rep);
6029     }
6030 
6031     /* Cut back to size actually needed. This is necessary for, for example,
6032        encoding of a string containing isolated surrogates and the 'ignore'
6033        handler is used. */
6034     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6035     if (nsize != PyBytes_GET_SIZE(v))
6036       _PyBytes_Resize(&v, nsize);
6037     Py_XDECREF(errorHandler);
6038     Py_XDECREF(exc);
6039   done:
6040     return v;
6041   error:
6042     Py_XDECREF(rep);
6043     Py_XDECREF(errorHandler);
6044     Py_XDECREF(exc);
6045     Py_XDECREF(v);
6046     return NULL;
6047 }
6048 
6049 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6050 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6051                       Py_ssize_t size,
6052                       const char *errors,
6053                       int byteorder)
6054 {
6055     PyObject *result;
6056     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6057     if (tmp == NULL)
6058         return NULL;
6059     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6060     Py_DECREF(tmp);
6061     return result;
6062 }
6063 
6064 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)6065 PyUnicode_AsUTF32String(PyObject *unicode)
6066 {
6067     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6068 }
6069 
6070 /* --- UTF-16 Codec ------------------------------------------------------- */
6071 
6072 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)6073 PyUnicode_DecodeUTF16(const char *s,
6074                       Py_ssize_t size,
6075                       const char *errors,
6076                       int *byteorder)
6077 {
6078     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6079 }
6080 
6081 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)6082 PyUnicode_DecodeUTF16Stateful(const char *s,
6083                               Py_ssize_t size,
6084                               const char *errors,
6085                               int *byteorder,
6086                               Py_ssize_t *consumed)
6087 {
6088     const char *starts = s;
6089     Py_ssize_t startinpos;
6090     Py_ssize_t endinpos;
6091     _PyUnicodeWriter writer;
6092     const unsigned char *q, *e;
6093     int bo = 0;       /* assume native ordering by default */
6094     int native_ordering;
6095     const char *errmsg = "";
6096     PyObject *errorHandler = NULL;
6097     PyObject *exc = NULL;
6098     const char *encoding;
6099 
6100     q = (const unsigned char *)s;
6101     e = q + size;
6102 
6103     if (byteorder)
6104         bo = *byteorder;
6105 
6106     /* Check for BOM marks (U+FEFF) in the input and adjust current
6107        byte order setting accordingly. In native mode, the leading BOM
6108        mark is skipped, in all other modes, it is copied to the output
6109        stream as-is (giving a ZWNBSP character). */
6110     if (bo == 0 && size >= 2) {
6111         const Py_UCS4 bom = (q[1] << 8) | q[0];
6112         if (bom == 0xFEFF) {
6113             q += 2;
6114             bo = -1;
6115         }
6116         else if (bom == 0xFFFE) {
6117             q += 2;
6118             bo = 1;
6119         }
6120         if (byteorder)
6121             *byteorder = bo;
6122     }
6123 
6124     if (q == e) {
6125         if (consumed)
6126             *consumed = size;
6127         _Py_RETURN_UNICODE_EMPTY();
6128     }
6129 
6130 #if PY_LITTLE_ENDIAN
6131     native_ordering = bo <= 0;
6132     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6133 #else
6134     native_ordering = bo >= 0;
6135     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6136 #endif
6137 
6138     /* Note: size will always be longer than the resulting Unicode
6139        character count normally.  Error handler will take care of
6140        resizing when needed. */
6141     _PyUnicodeWriter_Init(&writer);
6142     writer.min_length = (e - q + 1) / 2;
6143     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6144         goto onError;
6145 
6146     while (1) {
6147         Py_UCS4 ch = 0;
6148         if (e - q >= 2) {
6149             int kind = writer.kind;
6150             if (kind == PyUnicode_1BYTE_KIND) {
6151                 if (PyUnicode_IS_ASCII(writer.buffer))
6152                     ch = asciilib_utf16_decode(&q, e,
6153                             (Py_UCS1*)writer.data, &writer.pos,
6154                             native_ordering);
6155                 else
6156                     ch = ucs1lib_utf16_decode(&q, e,
6157                             (Py_UCS1*)writer.data, &writer.pos,
6158                             native_ordering);
6159             } else if (kind == PyUnicode_2BYTE_KIND) {
6160                 ch = ucs2lib_utf16_decode(&q, e,
6161                         (Py_UCS2*)writer.data, &writer.pos,
6162                         native_ordering);
6163             } else {
6164                 assert(kind == PyUnicode_4BYTE_KIND);
6165                 ch = ucs4lib_utf16_decode(&q, e,
6166                         (Py_UCS4*)writer.data, &writer.pos,
6167                         native_ordering);
6168             }
6169         }
6170 
6171         switch (ch)
6172         {
6173         case 0:
6174             /* remaining byte at the end? (size should be even) */
6175             if (q == e || consumed)
6176                 goto End;
6177             errmsg = "truncated data";
6178             startinpos = ((const char *)q) - starts;
6179             endinpos = ((const char *)e) - starts;
6180             break;
6181             /* The remaining input chars are ignored if the callback
6182                chooses to skip the input */
6183         case 1:
6184             q -= 2;
6185             if (consumed)
6186                 goto End;
6187             errmsg = "unexpected end of data";
6188             startinpos = ((const char *)q) - starts;
6189             endinpos = ((const char *)e) - starts;
6190             break;
6191         case 2:
6192             errmsg = "illegal encoding";
6193             startinpos = ((const char *)q) - 2 - starts;
6194             endinpos = startinpos + 2;
6195             break;
6196         case 3:
6197             errmsg = "illegal UTF-16 surrogate";
6198             startinpos = ((const char *)q) - 4 - starts;
6199             endinpos = startinpos + 2;
6200             break;
6201         default:
6202             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6203                 goto onError;
6204             continue;
6205         }
6206 
6207         if (unicode_decode_call_errorhandler_writer(
6208                 errors,
6209                 &errorHandler,
6210                 encoding, errmsg,
6211                 &starts,
6212                 (const char **)&e,
6213                 &startinpos,
6214                 &endinpos,
6215                 &exc,
6216                 (const char **)&q,
6217                 &writer))
6218             goto onError;
6219     }
6220 
6221 End:
6222     if (consumed)
6223         *consumed = (const char *)q-starts;
6224 
6225     Py_XDECREF(errorHandler);
6226     Py_XDECREF(exc);
6227     return _PyUnicodeWriter_Finish(&writer);
6228 
6229   onError:
6230     _PyUnicodeWriter_Dealloc(&writer);
6231     Py_XDECREF(errorHandler);
6232     Py_XDECREF(exc);
6233     return NULL;
6234 }
6235 
6236 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6237 _PyUnicode_EncodeUTF16(PyObject *str,
6238                        const char *errors,
6239                        int byteorder)
6240 {
6241     enum PyUnicode_Kind kind;
6242     const void *data;
6243     Py_ssize_t len;
6244     PyObject *v;
6245     unsigned short *out;
6246     Py_ssize_t pairs;
6247 #if PY_BIG_ENDIAN
6248     int native_ordering = byteorder >= 0;
6249 #else
6250     int native_ordering = byteorder <= 0;
6251 #endif
6252     const char *encoding;
6253     Py_ssize_t nsize, pos;
6254     PyObject *errorHandler = NULL;
6255     PyObject *exc = NULL;
6256     PyObject *rep = NULL;
6257 
6258     if (!PyUnicode_Check(str)) {
6259         PyErr_BadArgument();
6260         return NULL;
6261     }
6262     if (PyUnicode_READY(str) == -1)
6263         return NULL;
6264     kind = PyUnicode_KIND(str);
6265     data = PyUnicode_DATA(str);
6266     len = PyUnicode_GET_LENGTH(str);
6267 
6268     pairs = 0;
6269     if (kind == PyUnicode_4BYTE_KIND) {
6270         const Py_UCS4 *in = (const Py_UCS4 *)data;
6271         const Py_UCS4 *end = in + len;
6272         while (in < end) {
6273             if (*in++ >= 0x10000) {
6274                 pairs++;
6275             }
6276         }
6277     }
6278     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6279         return PyErr_NoMemory();
6280     }
6281     nsize = len + pairs + (byteorder == 0);
6282     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6283     if (v == NULL) {
6284         return NULL;
6285     }
6286 
6287     /* output buffer is 2-bytes aligned */
6288     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6289     out = (unsigned short *)PyBytes_AS_STRING(v);
6290     if (byteorder == 0) {
6291         *out++ = 0xFEFF;
6292     }
6293     if (len == 0) {
6294         goto done;
6295     }
6296 
6297     if (kind == PyUnicode_1BYTE_KIND) {
6298         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6299         goto done;
6300     }
6301 
6302     if (byteorder < 0) {
6303         encoding = "utf-16-le";
6304     }
6305     else if (byteorder > 0) {
6306         encoding = "utf-16-be";
6307     }
6308     else {
6309         encoding = "utf-16";
6310     }
6311 
6312     pos = 0;
6313     while (pos < len) {
6314         Py_ssize_t repsize, moreunits;
6315 
6316         if (kind == PyUnicode_2BYTE_KIND) {
6317             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6318                                         &out, native_ordering);
6319         }
6320         else {
6321             assert(kind == PyUnicode_4BYTE_KIND);
6322             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6323                                         &out, native_ordering);
6324         }
6325         if (pos == len)
6326             break;
6327 
6328         rep = unicode_encode_call_errorhandler(
6329                 errors, &errorHandler,
6330                 encoding, "surrogates not allowed",
6331                 str, &exc, pos, pos + 1, &pos);
6332         if (!rep)
6333             goto error;
6334 
6335         if (PyBytes_Check(rep)) {
6336             repsize = PyBytes_GET_SIZE(rep);
6337             if (repsize & 1) {
6338                 raise_encode_exception(&exc, encoding,
6339                                        str, pos - 1, pos,
6340                                        "surrogates not allowed");
6341                 goto error;
6342             }
6343             moreunits = repsize / 2;
6344         }
6345         else {
6346             assert(PyUnicode_Check(rep));
6347             if (PyUnicode_READY(rep) < 0)
6348                 goto error;
6349             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6350             if (!PyUnicode_IS_ASCII(rep)) {
6351                 raise_encode_exception(&exc, encoding,
6352                                        str, pos - 1, pos,
6353                                        "surrogates not allowed");
6354                 goto error;
6355             }
6356         }
6357 
6358         /* two bytes are reserved for each surrogate */
6359         if (moreunits > 1) {
6360             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6361             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6362                 /* integer overflow */
6363                 PyErr_NoMemory();
6364                 goto error;
6365             }
6366             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6367                 goto error;
6368             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6369         }
6370 
6371         if (PyBytes_Check(rep)) {
6372             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6373             out += moreunits;
6374         } else /* rep is unicode */ {
6375             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6376             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6377                                  &out, native_ordering);
6378         }
6379 
6380         Py_CLEAR(rep);
6381     }
6382 
6383     /* Cut back to size actually needed. This is necessary for, for example,
6384     encoding of a string containing isolated surrogates and the 'ignore' handler
6385     is used. */
6386     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6387     if (nsize != PyBytes_GET_SIZE(v))
6388       _PyBytes_Resize(&v, nsize);
6389     Py_XDECREF(errorHandler);
6390     Py_XDECREF(exc);
6391   done:
6392     return v;
6393   error:
6394     Py_XDECREF(rep);
6395     Py_XDECREF(errorHandler);
6396     Py_XDECREF(exc);
6397     Py_XDECREF(v);
6398     return NULL;
6399 #undef STORECHAR
6400 }
6401 
6402 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6403 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6404                       Py_ssize_t size,
6405                       const char *errors,
6406                       int byteorder)
6407 {
6408     PyObject *result;
6409     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6410     if (tmp == NULL)
6411         return NULL;
6412     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6413     Py_DECREF(tmp);
6414     return result;
6415 }
6416 
6417 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6418 PyUnicode_AsUTF16String(PyObject *unicode)
6419 {
6420     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6421 }
6422 
6423 /* --- Unicode Escape Codec ----------------------------------------------- */
6424 
/* Name-lookup C API obtained from the unicodedata capsule; used to resolve
   \N{name} escapes.  Starts out NULL and is filled in by
   _PyUnicode_DecodeUnicodeEscapeInternal() through
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) when a \N escape is met
   while it is still NULL. */
6425 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6426 
6427 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6428 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6429                                Py_ssize_t size,
6430                                const char *errors,
6431                                Py_ssize_t *consumed,
6432                                const char **first_invalid_escape)
6433 {
6434     const char *starts = s;
6435     _PyUnicodeWriter writer;
6436     const char *end;
6437     PyObject *errorHandler = NULL;
6438     PyObject *exc = NULL;
6439 
6440     // so we can remember if we've seen an invalid escape char or not
6441     *first_invalid_escape = NULL;
6442 
6443     if (size == 0) {
6444         if (consumed) {
6445             *consumed = 0;
6446         }
6447         _Py_RETURN_UNICODE_EMPTY();
6448     }
6449     /* Escaped strings will always be longer than the resulting
6450        Unicode string, so we start with size here and then reduce the
6451        length after conversion to the true value.
6452        (but if the error callback returns a long replacement string
6453        we'll have to allocate more space) */
6454     _PyUnicodeWriter_Init(&writer);
6455     writer.min_length = size;
6456     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6457         goto onError;
6458     }
6459 
6460     end = s + size;
6461     while (s < end) {
6462         unsigned char c = (unsigned char) *s++;
6463         Py_UCS4 ch;
6464         int count;
6465         const char *message;
6466 
6467 #define WRITE_ASCII_CHAR(ch)                                                  \
6468             do {                                                              \
6469                 assert(ch <= 127);                                            \
6470                 assert(writer.pos < writer.size);                             \
6471                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6472             } while(0)
6473 
6474 #define WRITE_CHAR(ch)                                                        \
6475             do {                                                              \
6476                 if (ch <= writer.maxchar) {                                   \
6477                     assert(writer.pos < writer.size);                         \
6478                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6479                 }                                                             \
6480                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6481                     goto onError;                                             \
6482                 }                                                             \
6483             } while(0)
6484 
6485         /* Non-escape characters are interpreted as Unicode ordinals */
6486         if (c != '\\') {
6487             WRITE_CHAR(c);
6488             continue;
6489         }
6490 
6491         Py_ssize_t startinpos = s - starts - 1;
6492         /* \ - Escapes */
6493         if (s >= end) {
6494             message = "\\ at end of string";
6495             goto incomplete;
6496         }
6497         c = (unsigned char) *s++;
6498 
6499         assert(writer.pos < writer.size);
6500         switch (c) {
6501 
6502             /* \x escapes */
6503         case '\n': continue;
6504         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6505         case '\'': WRITE_ASCII_CHAR('\''); continue;
6506         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6507         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6508         /* FF */
6509         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6510         case 't': WRITE_ASCII_CHAR('\t'); continue;
6511         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6512         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6513         /* VT */
6514         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6515         /* BEL, not classic C */
6516         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6517 
6518             /* \OOO (octal) escapes */
6519         case '0': case '1': case '2': case '3':
6520         case '4': case '5': case '6': case '7':
6521             ch = c - '0';
6522             if (s < end && '0' <= *s && *s <= '7') {
6523                 ch = (ch<<3) + *s++ - '0';
6524                 if (s < end && '0' <= *s && *s <= '7') {
6525                     ch = (ch<<3) + *s++ - '0';
6526                 }
6527             }
6528             WRITE_CHAR(ch);
6529             continue;
6530 
6531             /* hex escapes */
6532             /* \xXX */
6533         case 'x':
6534             count = 2;
6535             message = "truncated \\xXX escape";
6536             goto hexescape;
6537 
6538             /* \uXXXX */
6539         case 'u':
6540             count = 4;
6541             message = "truncated \\uXXXX escape";
6542             goto hexescape;
6543 
6544             /* \UXXXXXXXX */
6545         case 'U':
6546             count = 8;
6547             message = "truncated \\UXXXXXXXX escape";
6548         hexescape:
6549             for (ch = 0; count; ++s, --count) {
6550                 if (s >= end) {
6551                     goto incomplete;
6552                 }
6553                 c = (unsigned char)*s;
6554                 ch <<= 4;
6555                 if (c >= '0' && c <= '9') {
6556                     ch += c - '0';
6557                 }
6558                 else if (c >= 'a' && c <= 'f') {
6559                     ch += c - ('a' - 10);
6560                 }
6561                 else if (c >= 'A' && c <= 'F') {
6562                     ch += c - ('A' - 10);
6563                 }
6564                 else {
6565                     goto error;
6566                 }
6567             }
6568 
6569             /* when we get here, ch is a 32-bit unicode character */
6570             if (ch > MAX_UNICODE) {
6571                 message = "illegal Unicode character";
6572                 goto error;
6573             }
6574 
6575             WRITE_CHAR(ch);
6576             continue;
6577 
6578             /* \N{name} */
6579         case 'N':
6580             if (ucnhash_capi == NULL) {
6581                 /* load the unicode data module */
6582                 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6583                                                 PyUnicodeData_CAPSULE_NAME, 1);
6584                 if (ucnhash_capi == NULL) {
6585                     PyErr_SetString(
6586                         PyExc_UnicodeError,
6587                         "\\N escapes not supported (can't load unicodedata module)"
6588                         );
6589                     goto onError;
6590                 }
6591             }
6592 
6593             message = "malformed \\N character escape";
6594             if (s >= end) {
6595                 goto incomplete;
6596             }
6597             if (*s == '{') {
6598                 const char *start = ++s;
6599                 size_t namelen;
6600                 /* look for the closing brace */
6601                 while (s < end && *s != '}')
6602                     s++;
6603                 if (s >= end) {
6604                     goto incomplete;
6605                 }
6606                 namelen = s - start;
6607                 if (namelen) {
6608                     /* found a name.  look it up in the unicode database */
6609                     s++;
6610                     ch = 0xffffffff; /* in case 'getcode' messes up */
6611                     if (namelen <= INT_MAX &&
6612                         ucnhash_capi->getcode(start, (int)namelen,
6613                                               &ch, 0)) {
6614                         assert(ch <= MAX_UNICODE);
6615                         WRITE_CHAR(ch);
6616                         continue;
6617                     }
6618                     message = "unknown Unicode character name";
6619                 }
6620             }
6621             goto error;
6622 
6623         default:
6624             if (*first_invalid_escape == NULL) {
6625                 *first_invalid_escape = s-1; /* Back up one char, since we've
6626                                                 already incremented s. */
6627             }
6628             WRITE_ASCII_CHAR('\\');
6629             WRITE_CHAR(c);
6630             continue;
6631         }
6632 
6633       incomplete:
6634         if (consumed) {
6635             *consumed = startinpos;
6636             break;
6637         }
6638       error:;
6639         Py_ssize_t endinpos = s-starts;
6640         writer.min_length = end - s + writer.pos;
6641         if (unicode_decode_call_errorhandler_writer(
6642                 errors, &errorHandler,
6643                 "unicodeescape", message,
6644                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6645                 &writer)) {
6646             goto onError;
6647         }
6648         assert(end - s <= writer.size - writer.pos);
6649 
6650 #undef WRITE_ASCII_CHAR
6651 #undef WRITE_CHAR
6652     }
6653 
6654     Py_XDECREF(errorHandler);
6655     Py_XDECREF(exc);
6656     return _PyUnicodeWriter_Finish(&writer);
6657 
6658   onError:
6659     _PyUnicodeWriter_Dealloc(&writer);
6660     Py_XDECREF(errorHandler);
6661     Py_XDECREF(exc);
6662     return NULL;
6663 }
6664 
6665 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6666 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6667                               Py_ssize_t size,
6668                               const char *errors,
6669                               Py_ssize_t *consumed)
6670 {
6671     const char *first_invalid_escape;
6672     PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6673                                                       consumed,
6674                                                       &first_invalid_escape);
6675     if (result == NULL)
6676         return NULL;
6677     if (first_invalid_escape != NULL) {
6678         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6679                              "invalid escape sequence '\\%c'",
6680                              (unsigned char)*first_invalid_escape) < 0) {
6681             Py_DECREF(result);
6682             return NULL;
6683         }
6684     }
6685     return result;
6686 }
6687 
6688 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6689 PyUnicode_DecodeUnicodeEscape(const char *s,
6690                               Py_ssize_t size,
6691                               const char *errors)
6692 {
6693     return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6694 }
6695 
6696 /* Return a Unicode-Escape string version of the Unicode object. */
6697 
6698 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6699 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6700 {
6701     Py_ssize_t i, len;
6702     PyObject *repr;
6703     char *p;
6704     enum PyUnicode_Kind kind;
6705     const void *data;
6706     Py_ssize_t expandsize;
6707 
6708     /* Initial allocation is based on the longest-possible character
6709        escape.
6710 
6711        For UCS1 strings it's '\xxx', 4 bytes per source character.
6712        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6713        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6714     */
6715 
6716     if (!PyUnicode_Check(unicode)) {
6717         PyErr_BadArgument();
6718         return NULL;
6719     }
6720     if (PyUnicode_READY(unicode) == -1) {
6721         return NULL;
6722     }
6723 
6724     len = PyUnicode_GET_LENGTH(unicode);
6725     if (len == 0) {
6726         return PyBytes_FromStringAndSize(NULL, 0);
6727     }
6728 
6729     kind = PyUnicode_KIND(unicode);
6730     data = PyUnicode_DATA(unicode);
6731     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6732        bytes, and 1 byte characters 4. */
6733     expandsize = kind * 2 + 2;
6734     if (len > PY_SSIZE_T_MAX / expandsize) {
6735         return PyErr_NoMemory();
6736     }
6737     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6738     if (repr == NULL) {
6739         return NULL;
6740     }
6741 
6742     p = PyBytes_AS_STRING(repr);
6743     for (i = 0; i < len; i++) {
6744         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6745 
6746         /* U+0000-U+00ff range */
6747         if (ch < 0x100) {
6748             if (ch >= ' ' && ch < 127) {
6749                 if (ch != '\\') {
6750                     /* Copy printable US ASCII as-is */
6751                     *p++ = (char) ch;
6752                 }
6753                 /* Escape backslashes */
6754                 else {
6755                     *p++ = '\\';
6756                     *p++ = '\\';
6757                 }
6758             }
6759 
6760             /* Map special whitespace to '\t', \n', '\r' */
6761             else if (ch == '\t') {
6762                 *p++ = '\\';
6763                 *p++ = 't';
6764             }
6765             else if (ch == '\n') {
6766                 *p++ = '\\';
6767                 *p++ = 'n';
6768             }
6769             else if (ch == '\r') {
6770                 *p++ = '\\';
6771                 *p++ = 'r';
6772             }
6773 
6774             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6775             else {
6776                 *p++ = '\\';
6777                 *p++ = 'x';
6778                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6779                 *p++ = Py_hexdigits[ch & 0x000F];
6780             }
6781         }
6782         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6783         else if (ch < 0x10000) {
6784             *p++ = '\\';
6785             *p++ = 'u';
6786             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6787             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6788             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6789             *p++ = Py_hexdigits[ch & 0x000F];
6790         }
6791         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6792         else {
6793 
6794             /* Make sure that the first two digits are zero */
6795             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6796             *p++ = '\\';
6797             *p++ = 'U';
6798             *p++ = '0';
6799             *p++ = '0';
6800             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6801             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6802             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6803             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6804             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6805             *p++ = Py_hexdigits[ch & 0x0000000F];
6806         }
6807     }
6808 
6809     assert(p - PyBytes_AS_STRING(repr) > 0);
6810     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6811         return NULL;
6812     }
6813     return repr;
6814 }
6815 
6816 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6817 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6818                               Py_ssize_t size)
6819 {
6820     PyObject *result;
6821     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6822     if (tmp == NULL) {
6823         return NULL;
6824     }
6825 
6826     result = PyUnicode_AsUnicodeEscapeString(tmp);
6827     Py_DECREF(tmp);
6828     return result;
6829 }
6830 
6831 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6832 
6833 PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6834 _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6835                                           Py_ssize_t size,
6836                                           const char *errors,
6837                                           Py_ssize_t *consumed)
6838 {
6839     const char *starts = s;
6840     _PyUnicodeWriter writer;
6841     const char *end;
6842     PyObject *errorHandler = NULL;
6843     PyObject *exc = NULL;
6844 
6845     if (size == 0) {
6846         if (consumed) {
6847             *consumed = 0;
6848         }
6849         _Py_RETURN_UNICODE_EMPTY();
6850     }
6851 
6852     /* Escaped strings will always be longer than the resulting
6853        Unicode string, so we start with size here and then reduce the
6854        length after conversion to the true value. (But decoding error
6855        handler might have to resize the string) */
6856     _PyUnicodeWriter_Init(&writer);
6857     writer.min_length = size;
6858     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6859         goto onError;
6860     }
6861 
6862     end = s + size;
6863     while (s < end) {
6864         unsigned char c = (unsigned char) *s++;
6865         Py_UCS4 ch;
6866         int count;
6867         const char *message;
6868 
6869 #define WRITE_CHAR(ch)                                                        \
6870             do {                                                              \
6871                 if (ch <= writer.maxchar) {                                   \
6872                     assert(writer.pos < writer.size);                         \
6873                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6874                 }                                                             \
6875                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6876                     goto onError;                                             \
6877                 }                                                             \
6878             } while(0)
6879 
6880         /* Non-escape characters are interpreted as Unicode ordinals */
6881         if (c != '\\' || (s >= end && !consumed)) {
6882             WRITE_CHAR(c);
6883             continue;
6884         }
6885 
6886         Py_ssize_t startinpos = s - starts - 1;
6887         /* \ - Escapes */
6888         if (s >= end) {
6889             assert(consumed);
6890             // Set message to silent compiler warning.
6891             // Actually it is never used.
6892             message = "\\ at end of string";
6893             goto incomplete;
6894         }
6895 
6896         c = (unsigned char) *s++;
6897         if (c == 'u') {
6898             count = 4;
6899             message = "truncated \\uXXXX escape";
6900         }
6901         else if (c == 'U') {
6902             count = 8;
6903             message = "truncated \\UXXXXXXXX escape";
6904         }
6905         else {
6906             assert(writer.pos < writer.size);
6907             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6908             WRITE_CHAR(c);
6909             continue;
6910         }
6911 
6912         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6913         for (ch = 0; count; ++s, --count) {
6914             if (s >= end) {
6915                 goto incomplete;
6916             }
6917             c = (unsigned char)*s;
6918             ch <<= 4;
6919             if (c >= '0' && c <= '9') {
6920                 ch += c - '0';
6921             }
6922             else if (c >= 'a' && c <= 'f') {
6923                 ch += c - ('a' - 10);
6924             }
6925             else if (c >= 'A' && c <= 'F') {
6926                 ch += c - ('A' - 10);
6927             }
6928             else {
6929                 goto error;
6930             }
6931         }
6932         if (ch > MAX_UNICODE) {
6933             message = "\\Uxxxxxxxx out of range";
6934             goto error;
6935         }
6936         WRITE_CHAR(ch);
6937         continue;
6938 
6939       incomplete:
6940         if (consumed) {
6941             *consumed = startinpos;
6942             break;
6943         }
6944       error:;
6945         Py_ssize_t endinpos = s-starts;
6946         writer.min_length = end - s + writer.pos;
6947         if (unicode_decode_call_errorhandler_writer(
6948                 errors, &errorHandler,
6949                 "rawunicodeescape", message,
6950                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6951                 &writer)) {
6952             goto onError;
6953         }
6954         assert(end - s <= writer.size - writer.pos);
6955 
6956 #undef WRITE_CHAR
6957     }
6958     Py_XDECREF(errorHandler);
6959     Py_XDECREF(exc);
6960     return _PyUnicodeWriter_Finish(&writer);
6961 
6962   onError:
6963     _PyUnicodeWriter_Dealloc(&writer);
6964     Py_XDECREF(errorHandler);
6965     Py_XDECREF(exc);
6966     return NULL;
6967 }
6968 
6969 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6970 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6971                                  Py_ssize_t size,
6972                                  const char *errors)
6973 {
6974     return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6975 }
6976 
6977 
6978 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6979 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6980 {
6981     PyObject *repr;
6982     char *p;
6983     Py_ssize_t expandsize, pos;
6984     int kind;
6985     const void *data;
6986     Py_ssize_t len;
6987 
6988     if (!PyUnicode_Check(unicode)) {
6989         PyErr_BadArgument();
6990         return NULL;
6991     }
6992     if (PyUnicode_READY(unicode) == -1) {
6993         return NULL;
6994     }
6995     kind = PyUnicode_KIND(unicode);
6996     data = PyUnicode_DATA(unicode);
6997     len = PyUnicode_GET_LENGTH(unicode);
6998     if (kind == PyUnicode_1BYTE_KIND) {
6999         return PyBytes_FromStringAndSize(data, len);
7000     }
7001 
7002     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7003        bytes, and 1 byte characters 4. */
7004     expandsize = kind * 2 + 2;
7005 
7006     if (len > PY_SSIZE_T_MAX / expandsize) {
7007         return PyErr_NoMemory();
7008     }
7009     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
7010     if (repr == NULL) {
7011         return NULL;
7012     }
7013     if (len == 0) {
7014         return repr;
7015     }
7016 
7017     p = PyBytes_AS_STRING(repr);
7018     for (pos = 0; pos < len; pos++) {
7019         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7020 
7021         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7022         if (ch < 0x100) {
7023             *p++ = (char) ch;
7024         }
7025         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7026         else if (ch < 0x10000) {
7027             *p++ = '\\';
7028             *p++ = 'u';
7029             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7030             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7031             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7032             *p++ = Py_hexdigits[ch & 15];
7033         }
7034         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7035         else {
7036             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7037             *p++ = '\\';
7038             *p++ = 'U';
7039             *p++ = '0';
7040             *p++ = '0';
7041             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7042             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7043             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7044             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7045             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7046             *p++ = Py_hexdigits[ch & 15];
7047         }
7048     }
7049 
7050     assert(p > PyBytes_AS_STRING(repr));
7051     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7052         return NULL;
7053     }
7054     return repr;
7055 }
7056 
7057 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)7058 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7059                                  Py_ssize_t size)
7060 {
7061     PyObject *result;
7062     PyObject *tmp = PyUnicode_FromWideChar(s, size);
7063     if (tmp == NULL)
7064         return NULL;
7065     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7066     Py_DECREF(tmp);
7067     return result;
7068 }
7069 
7070 /* --- Latin-1 Codec ------------------------------------------------------ */
7071 
7072 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)7073 PyUnicode_DecodeLatin1(const char *s,
7074                        Py_ssize_t size,
7075                        const char *errors)
7076 {
7077     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7078     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7079 }
7080 
7081 /* create or adjust a UnicodeEncodeError */
7082 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7083 make_encode_exception(PyObject **exceptionObject,
7084                       const char *encoding,
7085                       PyObject *unicode,
7086                       Py_ssize_t startpos, Py_ssize_t endpos,
7087                       const char *reason)
7088 {
7089     if (*exceptionObject == NULL) {
7090         *exceptionObject = PyObject_CallFunction(
7091             PyExc_UnicodeEncodeError, "sOnns",
7092             encoding, unicode, startpos, endpos, reason);
7093     }
7094     else {
7095         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7096             goto onError;
7097         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7098             goto onError;
7099         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7100             goto onError;
7101         return;
7102       onError:
7103         Py_CLEAR(*exceptionObject);
7104     }
7105 }
7106 
7107 /* raises a UnicodeEncodeError */
7108 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7109 raise_encode_exception(PyObject **exceptionObject,
7110                        const char *encoding,
7111                        PyObject *unicode,
7112                        Py_ssize_t startpos, Py_ssize_t endpos,
7113                        const char *reason)
7114 {
7115     make_encode_exception(exceptionObject,
7116                           encoding, unicode, startpos, endpos, reason);
7117     if (*exceptionObject != NULL)
7118         PyCodec_StrictErrors(*exceptionObject);
7119 }
7120 
7121 /* error handling callback helper:
7122    build arguments, call the callback and check the arguments,
7123    put the result into newpos and return the replacement string, which
7124    has to be freed by the caller */
7125 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)7126 unicode_encode_call_errorhandler(const char *errors,
7127                                  PyObject **errorHandler,
7128                                  const char *encoding, const char *reason,
7129                                  PyObject *unicode, PyObject **exceptionObject,
7130                                  Py_ssize_t startpos, Py_ssize_t endpos,
7131                                  Py_ssize_t *newpos)
7132 {
7133     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7134     Py_ssize_t len;
7135     PyObject *restuple;
7136     PyObject *resunicode;
7137 
7138     if (*errorHandler == NULL) {
7139         *errorHandler = PyCodec_LookupError(errors);
7140         if (*errorHandler == NULL)
7141             return NULL;
7142     }
7143 
7144     if (PyUnicode_READY(unicode) == -1)
7145         return NULL;
7146     len = PyUnicode_GET_LENGTH(unicode);
7147 
7148     make_encode_exception(exceptionObject,
7149                           encoding, unicode, startpos, endpos, reason);
7150     if (*exceptionObject == NULL)
7151         return NULL;
7152 
7153     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154     if (restuple == NULL)
7155         return NULL;
7156     if (!PyTuple_Check(restuple)) {
7157         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158         Py_DECREF(restuple);
7159         return NULL;
7160     }
7161     if (!PyArg_ParseTuple(restuple, argparse,
7162                           &resunicode, newpos)) {
7163         Py_DECREF(restuple);
7164         return NULL;
7165     }
7166     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168         Py_DECREF(restuple);
7169         return NULL;
7170     }
7171     if (*newpos<0)
7172         *newpos = len + *newpos;
7173     if (*newpos<0 || *newpos>len) {
7174         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175         Py_DECREF(restuple);
7176         return NULL;
7177     }
7178     Py_INCREF(resunicode);
7179     Py_DECREF(restuple);
7180     return resunicode;
7181 }
7182 
7183 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)7184 unicode_encode_ucs1(PyObject *unicode,
7185                     const char *errors,
7186                     const Py_UCS4 limit)
7187 {
7188     /* input state */
7189     Py_ssize_t pos=0, size;
7190     int kind;
7191     const void *data;
7192     /* pointer into the output */
7193     char *str;
7194     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7195     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7196     PyObject *error_handler_obj = NULL;
7197     PyObject *exc = NULL;
7198     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7199     PyObject *rep = NULL;
7200     /* output object */
7201     _PyBytesWriter writer;
7202 
7203     if (PyUnicode_READY(unicode) == -1)
7204         return NULL;
7205     size = PyUnicode_GET_LENGTH(unicode);
7206     kind = PyUnicode_KIND(unicode);
7207     data = PyUnicode_DATA(unicode);
7208     /* allocate enough for a simple encoding without
7209        replacements, if we need more, we'll resize */
7210     if (size == 0)
7211         return PyBytes_FromStringAndSize(NULL, 0);
7212 
7213     _PyBytesWriter_Init(&writer);
7214     str = _PyBytesWriter_Alloc(&writer, size);
7215     if (str == NULL)
7216         return NULL;
7217 
7218     while (pos < size) {
7219         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7220 
7221         /* can we encode this? */
7222         if (ch < limit) {
7223             /* no overflow check, because we know that the space is enough */
7224             *str++ = (char)ch;
7225             ++pos;
7226         }
7227         else {
7228             Py_ssize_t newpos, i;
7229             /* startpos for collecting unencodable chars */
7230             Py_ssize_t collstart = pos;
7231             Py_ssize_t collend = collstart + 1;
7232             /* find all unecodable characters */
7233 
7234             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7235                 ++collend;
7236 
7237             /* Only overallocate the buffer if it's not the last write */
7238             writer.overallocate = (collend < size);
7239 
7240             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7241             if (error_handler == _Py_ERROR_UNKNOWN)
7242                 error_handler = _Py_GetErrorHandler(errors);
7243 
7244             switch (error_handler) {
7245             case _Py_ERROR_STRICT:
7246                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7247                 goto onError;
7248 
7249             case _Py_ERROR_REPLACE:
7250                 memset(str, '?', collend - collstart);
7251                 str += (collend - collstart);
7252                 /* fall through */
7253             case _Py_ERROR_IGNORE:
7254                 pos = collend;
7255                 break;
7256 
7257             case _Py_ERROR_BACKSLASHREPLACE:
7258                 /* subtract preallocated bytes */
7259                 writer.min_size -= (collend - collstart);
7260                 str = backslashreplace(&writer, str,
7261                                        unicode, collstart, collend);
7262                 if (str == NULL)
7263                     goto onError;
7264                 pos = collend;
7265                 break;
7266 
7267             case _Py_ERROR_XMLCHARREFREPLACE:
7268                 /* subtract preallocated bytes */
7269                 writer.min_size -= (collend - collstart);
7270                 str = xmlcharrefreplace(&writer, str,
7271                                         unicode, collstart, collend);
7272                 if (str == NULL)
7273                     goto onError;
7274                 pos = collend;
7275                 break;
7276 
7277             case _Py_ERROR_SURROGATEESCAPE:
7278                 for (i = collstart; i < collend; ++i) {
7279                     ch = PyUnicode_READ(kind, data, i);
7280                     if (ch < 0xdc80 || 0xdcff < ch) {
7281                         /* Not a UTF-8b surrogate */
7282                         break;
7283                     }
7284                     *str++ = (char)(ch - 0xdc00);
7285                     ++pos;
7286                 }
7287                 if (i >= collend)
7288                     break;
7289                 collstart = pos;
7290                 assert(collstart != collend);
7291                 /* fall through */
7292 
7293             default:
7294                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7295                                                        encoding, reason, unicode, &exc,
7296                                                        collstart, collend, &newpos);
7297                 if (rep == NULL)
7298                     goto onError;
7299 
7300                 /* subtract preallocated bytes */
7301                 writer.min_size -= newpos - collstart;
7302 
7303                 if (PyBytes_Check(rep)) {
7304                     /* Directly copy bytes result to output. */
7305                     str = _PyBytesWriter_WriteBytes(&writer, str,
7306                                                     PyBytes_AS_STRING(rep),
7307                                                     PyBytes_GET_SIZE(rep));
7308                 }
7309                 else {
7310                     assert(PyUnicode_Check(rep));
7311 
7312                     if (PyUnicode_READY(rep) < 0)
7313                         goto onError;
7314 
7315                     if (limit == 256 ?
7316                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317                         !PyUnicode_IS_ASCII(rep))
7318                     {
7319                         /* Not all characters are smaller than limit */
7320                         raise_encode_exception(&exc, encoding, unicode,
7321                                                collstart, collend, reason);
7322                         goto onError;
7323                     }
7324                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325                     str = _PyBytesWriter_WriteBytes(&writer, str,
7326                                                     PyUnicode_DATA(rep),
7327                                                     PyUnicode_GET_LENGTH(rep));
7328                 }
7329                 if (str == NULL)
7330                     goto onError;
7331 
7332                 pos = newpos;
7333                 Py_CLEAR(rep);
7334             }
7335 
7336             /* If overallocation was disabled, ensure that it was the last
7337                write. Otherwise, we missed an optimization */
7338             assert(writer.overallocate || pos == size);
7339         }
7340     }
7341 
7342     Py_XDECREF(error_handler_obj);
7343     Py_XDECREF(exc);
7344     return _PyBytesWriter_Finish(&writer, str);
7345 
7346   onError:
7347     Py_XDECREF(rep);
7348     _PyBytesWriter_Dealloc(&writer);
7349     Py_XDECREF(error_handler_obj);
7350     Py_XDECREF(exc);
7351     return NULL;
7352 }
7353 
7354 /* Deprecated */
7355 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7356 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7357                        Py_ssize_t size,
7358                        const char *errors)
7359 {
7360     PyObject *result;
7361     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7362     if (unicode == NULL)
7363         return NULL;
7364     result = unicode_encode_ucs1(unicode, errors, 256);
7365     Py_DECREF(unicode);
7366     return result;
7367 }
7368 
7369 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7370 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7371 {
7372     if (!PyUnicode_Check(unicode)) {
7373         PyErr_BadArgument();
7374         return NULL;
7375     }
7376     if (PyUnicode_READY(unicode) == -1)
7377         return NULL;
7378     /* Fast path: if it is a one-byte string, construct
7379        bytes object directly. */
7380     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7381         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7382                                          PyUnicode_GET_LENGTH(unicode));
7383     /* Non-Latin-1 characters present. Defer to above function to
7384        raise the exception. */
7385     return unicode_encode_ucs1(unicode, errors, 256);
7386 }
7387 
7388 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7389 PyUnicode_AsLatin1String(PyObject *unicode)
7390 {
7391     return _PyUnicode_AsLatin1String(unicode, NULL);
7392 }
7393 
7394 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7395 
7396 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7397 PyUnicode_DecodeASCII(const char *s,
7398                       Py_ssize_t size,
7399                       const char *errors)
7400 {
7401     const char *starts = s;
7402     const char *e = s + size;
7403     PyObject *error_handler_obj = NULL;
7404     PyObject *exc = NULL;
7405     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7406 
7407     if (size == 0)
7408         _Py_RETURN_UNICODE_EMPTY();
7409 
7410     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7411     if (size == 1 && (unsigned char)s[0] < 128) {
7412         return get_latin1_char((unsigned char)s[0]);
7413     }
7414 
7415     // Shortcut for simple case
7416     PyObject *u = PyUnicode_New(size, 127);
7417     if (u == NULL) {
7418         return NULL;
7419     }
7420     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7421     if (outpos == size) {
7422         return u;
7423     }
7424 
7425     _PyUnicodeWriter writer;
7426     _PyUnicodeWriter_InitWithBuffer(&writer, u);
7427     writer.pos = outpos;
7428 
7429     s += outpos;
7430     int kind = writer.kind;
7431     void *data = writer.data;
7432     Py_ssize_t startinpos, endinpos;
7433 
7434     while (s < e) {
7435         unsigned char c = (unsigned char)*s;
7436         if (c < 128) {
7437             PyUnicode_WRITE(kind, data, writer.pos, c);
7438             writer.pos++;
7439             ++s;
7440             continue;
7441         }
7442 
7443         /* byte outsize range 0x00..0x7f: call the error handler */
7444 
7445         if (error_handler == _Py_ERROR_UNKNOWN)
7446             error_handler = _Py_GetErrorHandler(errors);
7447 
7448         switch (error_handler)
7449         {
7450         case _Py_ERROR_REPLACE:
7451         case _Py_ERROR_SURROGATEESCAPE:
7452             /* Fast-path: the error handler only writes one character,
7453                but we may switch to UCS2 at the first write */
7454             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7455                 goto onError;
7456             kind = writer.kind;
7457             data = writer.data;
7458 
7459             if (error_handler == _Py_ERROR_REPLACE)
7460                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7461             else
7462                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7463             writer.pos++;
7464             ++s;
7465             break;
7466 
7467         case _Py_ERROR_IGNORE:
7468             ++s;
7469             break;
7470 
7471         default:
7472             startinpos = s-starts;
7473             endinpos = startinpos + 1;
7474             if (unicode_decode_call_errorhandler_writer(
7475                     errors, &error_handler_obj,
7476                     "ascii", "ordinal not in range(128)",
7477                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7478                     &writer))
7479                 goto onError;
7480             kind = writer.kind;
7481             data = writer.data;
7482         }
7483     }
7484     Py_XDECREF(error_handler_obj);
7485     Py_XDECREF(exc);
7486     return _PyUnicodeWriter_Finish(&writer);
7487 
7488   onError:
7489     _PyUnicodeWriter_Dealloc(&writer);
7490     Py_XDECREF(error_handler_obj);
7491     Py_XDECREF(exc);
7492     return NULL;
7493 }
7494 
7495 /* Deprecated */
7496 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7497 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7498                       Py_ssize_t size,
7499                       const char *errors)
7500 {
7501     PyObject *result;
7502     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7503     if (unicode == NULL)
7504         return NULL;
7505     result = unicode_encode_ucs1(unicode, errors, 128);
7506     Py_DECREF(unicode);
7507     return result;
7508 }
7509 
7510 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7511 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7512 {
7513     if (!PyUnicode_Check(unicode)) {
7514         PyErr_BadArgument();
7515         return NULL;
7516     }
7517     if (PyUnicode_READY(unicode) == -1)
7518         return NULL;
7519     /* Fast path: if it is an ASCII-only string, construct bytes object
7520        directly. Else defer to above function to raise the exception. */
7521     if (PyUnicode_IS_ASCII(unicode))
7522         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7523                                          PyUnicode_GET_LENGTH(unicode));
7524     return unicode_encode_ucs1(unicode, errors, 128);
7525 }
7526 
7527 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7528 PyUnicode_AsASCIIString(PyObject *unicode)
7529 {
7530     return _PyUnicode_AsASCIIString(unicode, NULL);
7531 }
7532 
7533 #ifdef MS_WINDOWS
7534 
7535 /* --- MBCS codecs for Windows -------------------------------------------- */
7536 
7537 #if SIZEOF_INT < SIZEOF_SIZE_T
7538 #define NEED_RETRY
7539 #endif
7540 
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7545 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7546 
7547 #ifndef WC_ERR_INVALID_CHARS
7548 #  define WC_ERR_INVALID_CHARS 0x0080
7549 #endif
7550 
7551 static const char*
code_page_name(UINT code_page,PyObject ** obj)7552 code_page_name(UINT code_page, PyObject **obj)
7553 {
7554     *obj = NULL;
7555     if (code_page == CP_ACP)
7556         return "mbcs";
7557     if (code_page == CP_UTF7)
7558         return "CP_UTF7";
7559     if (code_page == CP_UTF8)
7560         return "CP_UTF8";
7561 
7562     *obj = PyBytes_FromFormat("cp%u", code_page);
7563     if (*obj == NULL)
7564         return NULL;
7565     return PyBytes_AS_STRING(*obj);
7566 }
7567 
7568 static DWORD
decode_code_page_flags(UINT code_page)7569 decode_code_page_flags(UINT code_page)
7570 {
7571     if (code_page == CP_UTF7) {
7572         /* The CP_UTF7 decoder only supports flags=0 */
7573         return 0;
7574     }
7575     else
7576         return MB_ERR_INVALID_CHARS;
7577 }
7578 
7579 /*
7580  * Decode a byte string from a Windows code page into unicode object in strict
7581  * mode.
7582  *
7583  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7584  * OSError and returns -1 on other error.
7585  */
7586 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7587 decode_code_page_strict(UINT code_page,
7588                         wchar_t **buf,
7589                         Py_ssize_t *bufsize,
7590                         const char *in,
7591                         int insize)
7592 {
7593     DWORD flags = MB_ERR_INVALID_CHARS;
7594     wchar_t *out;
7595     DWORD outsize;
7596 
7597     /* First get the size of the result */
7598     assert(insize > 0);
7599     while ((outsize = MultiByteToWideChar(code_page, flags,
7600                                           in, insize, NULL, 0)) <= 0)
7601     {
7602         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7603             goto error;
7604         }
7605         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7606         flags = 0;
7607     }
7608 
7609     /* Extend a wchar_t* buffer */
7610     Py_ssize_t n = *bufsize;   /* Get the current length */
7611     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7612         return -1;
7613     }
7614     out = *buf + n;
7615 
7616     /* Do the conversion */
7617     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7618     if (outsize <= 0)
7619         goto error;
7620     return insize;
7621 
7622 error:
7623     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7624         return -2;
7625     PyErr_SetFromWindowsErr(0);
7626     return -1;
7627 }
7628 
7629 /*
7630  * Decode a byte string from a code page into unicode object with an error
7631  * handler.
7632  *
7633  * Returns consumed size if succeed, or raise an OSError or
7634  * UnicodeDecodeError exception and returns -1 on error.
7635  */
7636 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7637 decode_code_page_errors(UINT code_page,
7638                         wchar_t **buf,
7639                         Py_ssize_t *bufsize,
7640                         const char *in, const int size,
7641                         const char *errors, int final)
7642 {
7643     const char *startin = in;
7644     const char *endin = in + size;
7645     DWORD flags = MB_ERR_INVALID_CHARS;
7646     /* Ideally, we should get reason from FormatMessage. This is the Windows
7647        2000 English version of the message. */
7648     const char *reason = "No mapping for the Unicode character exists "
7649                          "in the target code page.";
7650     /* each step cannot decode more than 1 character, but a character can be
7651        represented as a surrogate pair */
7652     wchar_t buffer[2], *out;
7653     int insize;
7654     Py_ssize_t outsize;
7655     PyObject *errorHandler = NULL;
7656     PyObject *exc = NULL;
7657     PyObject *encoding_obj = NULL;
7658     const char *encoding;
7659     DWORD err;
7660     int ret = -1;
7661 
7662     assert(size > 0);
7663 
7664     encoding = code_page_name(code_page, &encoding_obj);
7665     if (encoding == NULL)
7666         return -1;
7667 
7668     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7669         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7670            UnicodeDecodeError. */
7671         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7672         if (exc != NULL) {
7673             PyCodec_StrictErrors(exc);
7674             Py_CLEAR(exc);
7675         }
7676         goto error;
7677     }
7678 
7679     /* Extend a wchar_t* buffer */
7680     Py_ssize_t n = *bufsize;   /* Get the current length */
7681     if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7682         PyErr_NoMemory();
7683         goto error;
7684     }
7685     if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7686         goto error;
7687     }
7688     out = *buf + n;
7689 
7690     /* Decode the byte string character per character */
7691     while (in < endin)
7692     {
7693         /* Decode a character */
7694         insize = 1;
7695         do
7696         {
7697             outsize = MultiByteToWideChar(code_page, flags,
7698                                           in, insize,
7699                                           buffer, Py_ARRAY_LENGTH(buffer));
7700             if (outsize > 0)
7701                 break;
7702             err = GetLastError();
7703             if (err == ERROR_INVALID_FLAGS && flags) {
7704                 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7705                 flags = 0;
7706                 continue;
7707             }
7708             if (err != ERROR_NO_UNICODE_TRANSLATION
7709                 && err != ERROR_INSUFFICIENT_BUFFER)
7710             {
7711                 PyErr_SetFromWindowsErr(0);
7712                 goto error;
7713             }
7714             insize++;
7715         }
7716         /* 4=maximum length of a UTF-8 sequence */
7717         while (insize <= 4 && (in + insize) <= endin);
7718 
7719         if (outsize <= 0) {
7720             Py_ssize_t startinpos, endinpos, outpos;
7721 
7722             /* last character in partial decode? */
7723             if (in + insize >= endin && !final)
7724                 break;
7725 
7726             startinpos = in - startin;
7727             endinpos = startinpos + 1;
7728             outpos = out - *buf;
7729             if (unicode_decode_call_errorhandler_wchar(
7730                     errors, &errorHandler,
7731                     encoding, reason,
7732                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
7733                     buf, bufsize, &outpos))
7734             {
7735                 goto error;
7736             }
7737             out = *buf + outpos;
7738         }
7739         else {
7740             in += insize;
7741             memcpy(out, buffer, outsize * sizeof(wchar_t));
7742             out += outsize;
7743         }
7744     }
7745 
7746     /* Shrink the buffer */
7747     assert(out - *buf <= *bufsize);
7748     *bufsize = out - *buf;
7749     /* (in - startin) <= size and size is an int */
7750     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7751 
7752 error:
7753     Py_XDECREF(encoding_obj);
7754     Py_XDECREF(errorHandler);
7755     Py_XDECREF(exc);
7756     return ret;
7757 }
7758 
7759 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7760 decode_code_page_stateful(int code_page,
7761                           const char *s, Py_ssize_t size,
7762                           const char *errors, Py_ssize_t *consumed)
7763 {
7764     wchar_t *buf = NULL;
7765     Py_ssize_t bufsize = 0;
7766     int chunk_size, final, converted, done;
7767 
7768     if (code_page < 0) {
7769         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7770         return NULL;
7771     }
7772     if (size < 0) {
7773         PyErr_BadInternalCall();
7774         return NULL;
7775     }
7776 
7777     if (consumed)
7778         *consumed = 0;
7779 
7780     do
7781     {
7782 #ifdef NEED_RETRY
7783         if (size > DECODING_CHUNK_SIZE) {
7784             chunk_size = DECODING_CHUNK_SIZE;
7785             final = 0;
7786             done = 0;
7787         }
7788         else
7789 #endif
7790         {
7791             chunk_size = (int)size;
7792             final = (consumed == NULL);
7793             done = 1;
7794         }
7795 
7796         if (chunk_size == 0 && done) {
7797             if (buf != NULL)
7798                 break;
7799             _Py_RETURN_UNICODE_EMPTY();
7800         }
7801 
7802         converted = decode_code_page_strict(code_page, &buf, &bufsize,
7803                                             s, chunk_size);
7804         if (converted == -2)
7805             converted = decode_code_page_errors(code_page, &buf, &bufsize,
7806                                                 s, chunk_size,
7807                                                 errors, final);
7808         assert(converted != 0 || done);
7809 
7810         if (converted < 0) {
7811             PyMem_Free(buf);
7812             return NULL;
7813         }
7814 
7815         if (consumed)
7816             *consumed += converted;
7817 
7818         s += converted;
7819         size -= converted;
7820     } while (!done);
7821 
7822     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7823     PyMem_Free(buf);
7824     return v;
7825 }
7826 
7827 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7828 PyUnicode_DecodeCodePageStateful(int code_page,
7829                                  const char *s,
7830                                  Py_ssize_t size,
7831                                  const char *errors,
7832                                  Py_ssize_t *consumed)
7833 {
7834     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7835 }
7836 
7837 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7838 PyUnicode_DecodeMBCSStateful(const char *s,
7839                              Py_ssize_t size,
7840                              const char *errors,
7841                              Py_ssize_t *consumed)
7842 {
7843     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7844 }
7845 
7846 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7847 PyUnicode_DecodeMBCS(const char *s,
7848                      Py_ssize_t size,
7849                      const char *errors)
7850 {
7851     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7852 }
7853 
7854 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7855 encode_code_page_flags(UINT code_page, const char *errors)
7856 {
7857     if (code_page == CP_UTF8) {
7858         return WC_ERR_INVALID_CHARS;
7859     }
7860     else if (code_page == CP_UTF7) {
7861         /* CP_UTF7 only supports flags=0 */
7862         return 0;
7863     }
7864     else {
7865         if (errors != NULL && strcmp(errors, "replace") == 0)
7866             return 0;
7867         else
7868             return WC_NO_BEST_FIT_CHARS;
7869     }
7870 }
7871 
/*
 * Encode the slice unicode[offset:offset+len] to a Windows code page in
 * strict mode.  The encoded bytes are stored in a new bytes object if
 * *outbytes is NULL, otherwise they are appended to *outbytes.
 *
 * The errors argument is not used by this function: the conversion flags
 * are always computed as for errors=NULL.
 *
 * Returns 0 on success, -2 on encode error (encode_code_page() then retries
 * the chunk with encode_code_page_errors()), or -1 with an exception set on
 * any other error.
 */
static int
encode_code_page_strict(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t offset, int len,
                        const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar = &usedDefaultChar;
    int outsize;
    wchar_t *p;
    Py_ssize_t size;
    const DWORD flags = encode_code_page_flags(code_page, NULL);
    char *out;
    /* Create a substring so that we can get the UTF-16 representation
       of just the slice under consideration. */
    PyObject *substring;
    int ret = -1;

    assert(len > 0);

    /* The "used default char" output is only requested for code pages
       other than CP_UTF8 and CP_UTF7; for those two NULL is passed. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    substring = PyUnicode_Substring(unicode, offset, offset+len);
    if (substring == NULL)
        return -1;
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    /* In this configuration substring is kept until the "done" label,
       where it is released. */
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
    if (p == NULL) {
        Py_DECREF(substring);
        return -1;
    }
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* In this configuration the substring is released right away and p is
       freed with PyMem_Free() at the "done" label. */
    p = PyUnicode_AsWideCharString(substring, &size);
    Py_CLEAR(substring);
    if (p == NULL) {
        return -1;
    }
#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(size <= INT_MAX);

    /* First get the size of the result */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  NULL, 0,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    /* If we used a default char, then we failed! */
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL) {
            goto done;
        }
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        const Py_ssize_t n = PyBytes_Size(*outbytes);
        /* Refuse a resize whose total size would exceed PY_SSIZE_T_MAX. */
        if (outsize > PY_SSIZE_T_MAX - n) {
            PyErr_NoMemory();
            goto done;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
            goto done;
        }
        /* New bytes are written after the n bytes already present. */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Do the conversion */
    outsize = WideCharToMultiByte(code_page, flags,
                                  p, (int)size,
                                  out, outsize,
                                  NULL, pusedDefaultChar);
    if (outsize <= 0)
        goto error;
    if (pusedDefaultChar && *pusedDefaultChar) {
        ret = -2;
        goto done;
    }
    ret = 0;

done:
    /* Common exit: release the wide-character source and return ret. */
#if USE_UNICODE_WCHAR_CACHE
    Py_DECREF(substring);
#else /* USE_UNICODE_WCHAR_CACHE */
    PyMem_Free(p);
#endif /* USE_UNICODE_WCHAR_CACHE */
    return ret;

error:
    /* WideCharToMultiByte() failed.  ERROR_NO_UNICODE_TRANSLATION is
       reported as an encode error (-2); any other Windows error is raised
       as an exception and ret keeps its initial value of -1. */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
        ret = -2;
        goto done;
    }
    PyErr_SetFromWindowsErr(0);
    goto done;
}
7987 
/*
 * Encode unicode[unicode_offset:unicode_offset+insize] to a Windows code
 * page one character at a time, calling the error handler named by errors
 * for every character that cannot be encoded.  The output is stored in a
 * new bytes object if *outbytes is NULL, otherwise it is appended to
 * *outbytes.
 *
 * Returns 0 on success, or -1 with an exception set on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The "used default char" output is only requested for code pages
       other than CP_UTF8 and CP_UTF7; for those two NULL is passed. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, after
       checking that the multiplication cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        /* New bytes are written after the n bytes already present. */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Code points below 0x10000 take one wchar_t; the others are
           split into a high/low surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Encoded without falling back to the default character:
               copy the bytes and move to the next character. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character could not be encoded: ask the error handler for a
           replacement and for the position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim into the output */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output size by (outsize - 1).  The resize may
                   move the buffer, so out is recomputed from its offset. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: every character must be ASCII and is
               written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same size adjustment as for a bytes replacement. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Shrink the bytes object to the number of bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Reached on success as well as on failure; ret tells which. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8175 
8176 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8177 encode_code_page(int code_page,
8178                  PyObject *unicode,
8179                  const char *errors)
8180 {
8181     Py_ssize_t len;
8182     PyObject *outbytes = NULL;
8183     Py_ssize_t offset;
8184     int chunk_len, ret, done;
8185 
8186     if (!PyUnicode_Check(unicode)) {
8187         PyErr_BadArgument();
8188         return NULL;
8189     }
8190 
8191     if (PyUnicode_READY(unicode) == -1)
8192         return NULL;
8193     len = PyUnicode_GET_LENGTH(unicode);
8194 
8195     if (code_page < 0) {
8196         PyErr_SetString(PyExc_ValueError, "invalid code page number");
8197         return NULL;
8198     }
8199 
8200     if (len == 0)
8201         return PyBytes_FromStringAndSize(NULL, 0);
8202 
8203     offset = 0;
8204     do
8205     {
8206 #ifdef NEED_RETRY
8207         if (len > DECODING_CHUNK_SIZE) {
8208             chunk_len = DECODING_CHUNK_SIZE;
8209             done = 0;
8210         }
8211         else
8212 #endif
8213         {
8214             chunk_len = (int)len;
8215             done = 1;
8216         }
8217 
8218         ret = encode_code_page_strict(code_page, &outbytes,
8219                                       unicode, offset, chunk_len,
8220                                       errors);
8221         if (ret == -2)
8222             ret = encode_code_page_errors(code_page, &outbytes,
8223                                           unicode, offset,
8224                                           chunk_len, errors);
8225         if (ret < 0) {
8226             Py_XDECREF(outbytes);
8227             return NULL;
8228         }
8229 
8230         offset += chunk_len;
8231         len -= chunk_len;
8232     } while (!done);
8233 
8234     return outbytes;
8235 }
8236 
8237 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)8238 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8239                      Py_ssize_t size,
8240                      const char *errors)
8241 {
8242     PyObject *unicode, *res;
8243     unicode = PyUnicode_FromWideChar(p, size);
8244     if (unicode == NULL)
8245         return NULL;
8246     res = encode_code_page(CP_ACP, unicode, errors);
8247     Py_DECREF(unicode);
8248     return res;
8249 }
8250 
8251 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8252 PyUnicode_EncodeCodePage(int code_page,
8253                          PyObject *unicode,
8254                          const char *errors)
8255 {
8256     return encode_code_page(code_page, unicode, errors);
8257 }
8258 
8259 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8260 PyUnicode_AsMBCSString(PyObject *unicode)
8261 {
8262     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8263 }
8264 
8265 #undef NEED_RETRY
8266 
8267 #endif /* MS_WINDOWS */
8268 
8269 /* --- Character Mapping Codec -------------------------------------------- */
8270 
/* Decode the `size` bytes at `s` through a str mapping table: the byte
   value b is decoded to mapping[b].  A byte with no entry in the table
   (b >= len(mapping)) or whose entry is U+FFFE is undefined and is passed
   to the error handler named by `errors`.  The decoded characters are
   written to `writer`.

   Returns 0 on success, -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* Every byte value has an entry here, and a Py_UCS1 entry can never
           be 0xFFFE, so no undefined-mapping check is done in this loop. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Ask the writer to accept characters up to 0xff, then
                   reload the cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte table covering all byte values.
               They run until the input is exhausted (break) or until a
               character needs the generic code below (goto Error). */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* x does not fit the writer's current maxchar: leave
                       the tight loop with x already looked up. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* undefined mapping: handled at the Error label */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: look up one byte, with a bounds check. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also entered from the tight loops above, with x already set. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8389 
8390 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8391 charmap_decode_mapping(const char *s,
8392                        Py_ssize_t size,
8393                        PyObject *mapping,
8394                        const char *errors,
8395                        _PyUnicodeWriter *writer)
8396 {
8397     const char *starts = s;
8398     const char *e;
8399     Py_ssize_t startinpos, endinpos;
8400     PyObject *errorHandler = NULL, *exc = NULL;
8401     unsigned char ch;
8402     PyObject *key, *item = NULL;
8403 
8404     e = s + size;
8405 
8406     while (s < e) {
8407         ch = *s;
8408 
8409         /* Get mapping (char ordinal -> integer, Unicode char or None) */
8410         key = PyLong_FromLong((long)ch);
8411         if (key == NULL)
8412             goto onError;
8413 
8414         item = PyObject_GetItem(mapping, key);
8415         Py_DECREF(key);
8416         if (item == NULL) {
8417             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8418                 /* No mapping found means: mapping is undefined. */
8419                 PyErr_Clear();
8420                 goto Undefined;
8421             } else
8422                 goto onError;
8423         }
8424 
8425         /* Apply mapping */
8426         if (item == Py_None)
8427             goto Undefined;
8428         if (PyLong_Check(item)) {
8429             long value = PyLong_AS_LONG(item);
8430             if (value == 0xFFFE)
8431                 goto Undefined;
8432             if (value < 0 || value > MAX_UNICODE) {
8433                 PyErr_Format(PyExc_TypeError,
8434                              "character mapping must be in range(0x%x)",
8435                              (unsigned long)MAX_UNICODE + 1);
8436                 goto onError;
8437             }
8438 
8439             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8440                 goto onError;
8441         }
8442         else if (PyUnicode_Check(item)) {
8443             if (PyUnicode_READY(item) == -1)
8444                 goto onError;
8445             if (PyUnicode_GET_LENGTH(item) == 1) {
8446                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8447                 if (value == 0xFFFE)
8448                     goto Undefined;
8449                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8450                     goto onError;
8451             }
8452             else {
8453                 writer->overallocate = 1;
8454                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8455                     goto onError;
8456             }
8457         }
8458         else {
8459             /* wrong return value */
8460             PyErr_SetString(PyExc_TypeError,
8461                             "character mapping must return integer, None or str");
8462             goto onError;
8463         }
8464         Py_CLEAR(item);
8465         ++s;
8466         continue;
8467 
8468 Undefined:
8469         /* undefined mapping */
8470         Py_CLEAR(item);
8471         startinpos = s-starts;
8472         endinpos = startinpos+1;
8473         if (unicode_decode_call_errorhandler_writer(
8474                 errors, &errorHandler,
8475                 "charmap", "character maps to <undefined>",
8476                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8477                 writer)) {
8478             goto onError;
8479         }
8480     }
8481     Py_XDECREF(errorHandler);
8482     Py_XDECREF(exc);
8483     return 0;
8484 
8485 onError:
8486     Py_XDECREF(item);
8487     Py_XDECREF(errorHandler);
8488     Py_XDECREF(exc);
8489     return -1;
8490 }
8491 
8492 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8493 PyUnicode_DecodeCharmap(const char *s,
8494                         Py_ssize_t size,
8495                         PyObject *mapping,
8496                         const char *errors)
8497 {
8498     _PyUnicodeWriter writer;
8499 
8500     /* Default to Latin-1 */
8501     if (mapping == NULL)
8502         return PyUnicode_DecodeLatin1(s, size, errors);
8503 
8504     if (size == 0)
8505         _Py_RETURN_UNICODE_EMPTY();
8506     _PyUnicodeWriter_Init(&writer);
8507     writer.min_length = size;
8508     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8509         goto onError;
8510 
8511     if (PyUnicode_CheckExact(mapping)) {
8512         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8513             goto onError;
8514     }
8515     else {
8516         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8517             goto onError;
8518     }
8519     return _PyUnicodeWriter_Finish(&writer);
8520 
8521   onError:
8522     _PyUnicodeWriter_Dealloc(&writer);
8523     return NULL;
8524 }
8525 
8526 /* Charmap encoding: the lookup table */
8527 
8528 struct encoding_map {
8529     PyObject_HEAD
8530     unsigned char level1[32];
8531     int count2, count3;
8532     unsigned char level23[1];
8533 };
8534 
8535 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8536 encoding_map_size(PyObject *obj, PyObject* args)
8537 {
8538     struct encoding_map *map = (struct encoding_map*)obj;
8539     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8540                            128*map->count3);
8541 }
8542 
8543 static PyMethodDef encoding_map_methods[] = {
8544     {"size", encoding_map_size, METH_NOARGS,
8545      PyDoc_STR("Return the size (in bytes) of this object") },
8546     { 0 }
8547 };
8548 
8549 static PyTypeObject EncodingMapType = {
8550     PyVarObject_HEAD_INIT(NULL, 0)
8551     "EncodingMap",          /*tp_name*/
8552     sizeof(struct encoding_map),   /*tp_basicsize*/
8553     0,                      /*tp_itemsize*/
8554     /* methods */
8555     0,                      /*tp_dealloc*/
8556     0,                      /*tp_vectorcall_offset*/
8557     0,                      /*tp_getattr*/
8558     0,                      /*tp_setattr*/
8559     0,                      /*tp_as_async*/
8560     0,                      /*tp_repr*/
8561     0,                      /*tp_as_number*/
8562     0,                      /*tp_as_sequence*/
8563     0,                      /*tp_as_mapping*/
8564     0,                      /*tp_hash*/
8565     0,                      /*tp_call*/
8566     0,                      /*tp_str*/
8567     0,                      /*tp_getattro*/
8568     0,                      /*tp_setattro*/
8569     0,                      /*tp_as_buffer*/
8570     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8571     0,                      /*tp_doc*/
8572     0,                      /*tp_traverse*/
8573     0,                      /*tp_clear*/
8574     0,                      /*tp_richcompare*/
8575     0,                      /*tp_weaklistoffset*/
8576     0,                      /*tp_iter*/
8577     0,                      /*tp_iternext*/
8578     encoding_map_methods,   /*tp_methods*/
8579     0,                      /*tp_members*/
8580     0,                      /*tp_getset*/
8581     0,                      /*tp_base*/
8582     0,                      /*tp_dict*/
8583     0,                      /*tp_descr_get*/
8584     0,                      /*tp_descr_set*/
8585     0,                      /*tp_dictoffset*/
8586     0,                      /*tp_init*/
8587     0,                      /*tp_alloc*/
8588     0,                      /*tp_new*/
8589     0,                      /*tp_free*/
8590     0,                      /*tp_is_gc*/
8591 };
8592 
8593 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8594 PyUnicode_BuildEncodingMap(PyObject* string)
8595 {
8596     PyObject *result;
8597     struct encoding_map *mresult;
8598     int i;
8599     int need_dict = 0;
8600     unsigned char level1[32];
8601     unsigned char level2[512];
8602     unsigned char *mlevel1, *mlevel2, *mlevel3;
8603     int count2 = 0, count3 = 0;
8604     int kind;
8605     const void *data;
8606     Py_ssize_t length;
8607     Py_UCS4 ch;
8608 
8609     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8610         PyErr_BadArgument();
8611         return NULL;
8612     }
8613     kind = PyUnicode_KIND(string);
8614     data = PyUnicode_DATA(string);
8615     length = PyUnicode_GET_LENGTH(string);
8616     length = Py_MIN(length, 256);
8617     memset(level1, 0xFF, sizeof level1);
8618     memset(level2, 0xFF, sizeof level2);
8619 
8620     /* If there isn't a one-to-one mapping of NULL to \0,
8621        or if there are non-BMP characters, we need to use
8622        a mapping dictionary. */
8623     if (PyUnicode_READ(kind, data, 0) != 0)
8624         need_dict = 1;
8625     for (i = 1; i < length; i++) {
8626         int l1, l2;
8627         ch = PyUnicode_READ(kind, data, i);
8628         if (ch == 0 || ch > 0xFFFF) {
8629             need_dict = 1;
8630             break;
8631         }
8632         if (ch == 0xFFFE)
8633             /* unmapped character */
8634             continue;
8635         l1 = ch >> 11;
8636         l2 = ch >> 7;
8637         if (level1[l1] == 0xFF)
8638             level1[l1] = count2++;
8639         if (level2[l2] == 0xFF)
8640             level2[l2] = count3++;
8641     }
8642 
8643     if (count2 >= 0xFF || count3 >= 0xFF)
8644         need_dict = 1;
8645 
8646     if (need_dict) {
8647         PyObject *result = PyDict_New();
8648         PyObject *key, *value;
8649         if (!result)
8650             return NULL;
8651         for (i = 0; i < length; i++) {
8652             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8653             value = PyLong_FromLong(i);
8654             if (!key || !value)
8655                 goto failed1;
8656             if (PyDict_SetItem(result, key, value) == -1)
8657                 goto failed1;
8658             Py_DECREF(key);
8659             Py_DECREF(value);
8660         }
8661         return result;
8662       failed1:
8663         Py_XDECREF(key);
8664         Py_XDECREF(value);
8665         Py_DECREF(result);
8666         return NULL;
8667     }
8668 
8669     /* Create a three-level trie */
8670     result = PyObject_Malloc(sizeof(struct encoding_map) +
8671                              16*count2 + 128*count3 - 1);
8672     if (!result) {
8673         return PyErr_NoMemory();
8674     }
8675 
8676     _PyObject_Init(result, &EncodingMapType);
8677     mresult = (struct encoding_map*)result;
8678     mresult->count2 = count2;
8679     mresult->count3 = count3;
8680     mlevel1 = mresult->level1;
8681     mlevel2 = mresult->level23;
8682     mlevel3 = mresult->level23 + 16*count2;
8683     memcpy(mlevel1, level1, 32);
8684     memset(mlevel2, 0xFF, 16*count2);
8685     memset(mlevel3, 0, 128*count3);
8686     count3 = 0;
8687     for (i = 1; i < length; i++) {
8688         int o1, o2, o3, i2, i3;
8689         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8690         if (ch == 0xFFFE)
8691             /* unmapped character */
8692             continue;
8693         o1 = ch>>11;
8694         o2 = (ch>>7) & 0xF;
8695         i2 = 16*mlevel1[o1] + o2;
8696         if (mlevel2[i2] == 0xFF)
8697             mlevel2[i2] = count3++;
8698         o3 = ch & 0x7F;
8699         i3 = 128*mlevel2[i2] + o3;
8700         mlevel3[i3] = i;
8701     }
8702     return result;
8703 }
8704 
8705 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8706 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8707 {
8708     struct encoding_map *map = (struct encoding_map*)mapping;
8709     int l1 = c>>11;
8710     int l2 = (c>>7) & 0xF;
8711     int l3 = c & 0x7F;
8712     int i;
8713 
8714     if (c > 0xFFFF)
8715         return -1;
8716     if (c == 0)
8717         return 0;
8718     /* level 1*/
8719     i = map->level1[l1];
8720     if (i == 0xFF) {
8721         return -1;
8722     }
8723     /* level 2*/
8724     i = map->level23[16*i+l2];
8725     if (i == 0xFF) {
8726         return -1;
8727     }
8728     /* level 3 */
8729     i = map->level23[16*map->count2 + 128*i + l3];
8730     if (i == 0) {
8731         return -1;
8732     }
8733     return i;
8734 }
8735 
8736 /* Lookup the character ch in the mapping. If the character
8737    can't be found, Py_None is returned (or NULL, if another
8738    error occurred). */
8739 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8740 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8741 {
8742     PyObject *w = PyLong_FromLong((long)c);
8743     PyObject *x;
8744 
8745     if (w == NULL)
8746         return NULL;
8747     x = PyObject_GetItem(mapping, w);
8748     Py_DECREF(w);
8749     if (x == NULL) {
8750         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8751             /* No mapping found means: mapping is undefined. */
8752             PyErr_Clear();
8753             Py_RETURN_NONE;
8754         } else
8755             return NULL;
8756     }
8757     else if (x == Py_None)
8758         return x;
8759     else if (PyLong_Check(x)) {
8760         long value = PyLong_AS_LONG(x);
8761         if (value < 0 || value > 255) {
8762             PyErr_SetString(PyExc_TypeError,
8763                             "character mapping must be in range(256)");
8764             Py_DECREF(x);
8765             return NULL;
8766         }
8767         return x;
8768     }
8769     else if (PyBytes_Check(x))
8770         return x;
8771     else {
8772         /* wrong return value */
8773         PyErr_Format(PyExc_TypeError,
8774                      "character mapping must return integer, bytes or None, not %.400s",
8775                      Py_TYPE(x)->tp_name);
8776         Py_DECREF(x);
8777         return NULL;
8778     }
8779 }
8780 
8781 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8782 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8783 {
8784     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8785     /* exponentially overallocate to minimize reallocations */
8786     if (requiredsize < 2*outsize)
8787         requiredsize = 2*outsize;
8788     if (_PyBytes_Resize(outobj, requiredsize))
8789         return -1;
8790     return 0;
8791 }
8792 
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output buffer */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
8796 /* lookup the character, put the result in the output string and adjust
8797    various state variables. Resize the output bytes object if not enough
8798    space is available. Return a new reference to the object that
8799    was put in the output buffer, or Py_None, if the mapping was undefined
8800    (in which case no character was written) or NULL, if a
8801    reallocation error occurred. The caller must decref the result */
8802 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8803 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8804                      PyObject **outobj, Py_ssize_t *outpos)
8805 {
8806     PyObject *rep;
8807     char *outstart;
8808     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8809 
8810     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8811         int res = encoding_map_lookup(c, mapping);
8812         Py_ssize_t requiredsize = *outpos+1;
8813         if (res == -1)
8814             return enc_FAILED;
8815         if (outsize<requiredsize)
8816             if (charmapencode_resize(outobj, outpos, requiredsize))
8817                 return enc_EXCEPTION;
8818         outstart = PyBytes_AS_STRING(*outobj);
8819         outstart[(*outpos)++] = (char)res;
8820         return enc_SUCCESS;
8821     }
8822 
8823     rep = charmapencode_lookup(c, mapping);
8824     if (rep==NULL)
8825         return enc_EXCEPTION;
8826     else if (rep==Py_None) {
8827         Py_DECREF(rep);
8828         return enc_FAILED;
8829     } else {
8830         if (PyLong_Check(rep)) {
8831             Py_ssize_t requiredsize = *outpos+1;
8832             if (outsize<requiredsize)
8833                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8834                     Py_DECREF(rep);
8835                     return enc_EXCEPTION;
8836                 }
8837             outstart = PyBytes_AS_STRING(*outobj);
8838             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8839         }
8840         else {
8841             const char *repchars = PyBytes_AS_STRING(rep);
8842             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8843             Py_ssize_t requiredsize = *outpos+repsize;
8844             if (outsize<requiredsize)
8845                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8846                     Py_DECREF(rep);
8847                     return enc_EXCEPTION;
8848                 }
8849             outstart = PyBytes_AS_STRING(*outobj);
8850             memcpy(outstart + *outpos, repchars, repsize);
8851             *outpos += repsize;
8852         }
8853     }
8854     Py_DECREF(rep);
8855     return enc_SUCCESS;
8856 }
8857 
8858 /* handle an error in PyUnicode_EncodeCharmap
8859    Return 0 on success, -1 on error */
8860 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8861 charmap_encoding_error(
8862     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8863     PyObject **exceptionObject,
8864     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8865     PyObject **res, Py_ssize_t *respos)
8866 {
8867     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8868     Py_ssize_t size, repsize;
8869     Py_ssize_t newpos;
8870     enum PyUnicode_Kind kind;
8871     const void *data;
8872     Py_ssize_t index;
8873     /* startpos for collecting unencodable chars */
8874     Py_ssize_t collstartpos = *inpos;
8875     Py_ssize_t collendpos = *inpos+1;
8876     Py_ssize_t collpos;
8877     const char *encoding = "charmap";
8878     const char *reason = "character maps to <undefined>";
8879     charmapencode_result x;
8880     Py_UCS4 ch;
8881     int val;
8882 
8883     if (PyUnicode_READY(unicode) == -1)
8884         return -1;
8885     size = PyUnicode_GET_LENGTH(unicode);
8886     /* find all unencodable characters */
8887     while (collendpos < size) {
8888         PyObject *rep;
8889         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8890             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8891             val = encoding_map_lookup(ch, mapping);
8892             if (val != -1)
8893                 break;
8894             ++collendpos;
8895             continue;
8896         }
8897 
8898         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8899         rep = charmapencode_lookup(ch, mapping);
8900         if (rep==NULL)
8901             return -1;
8902         else if (rep!=Py_None) {
8903             Py_DECREF(rep);
8904             break;
8905         }
8906         Py_DECREF(rep);
8907         ++collendpos;
8908     }
8909     /* cache callback name lookup
8910      * (if not done yet, i.e. it's the first error) */
8911     if (*error_handler == _Py_ERROR_UNKNOWN)
8912         *error_handler = _Py_GetErrorHandler(errors);
8913 
8914     switch (*error_handler) {
8915     case _Py_ERROR_STRICT:
8916         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8917         return -1;
8918 
8919     case _Py_ERROR_REPLACE:
8920         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8921             x = charmapencode_output('?', mapping, res, respos);
8922             if (x==enc_EXCEPTION) {
8923                 return -1;
8924             }
8925             else if (x==enc_FAILED) {
8926                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8927                 return -1;
8928             }
8929         }
8930         /* fall through */
8931     case _Py_ERROR_IGNORE:
8932         *inpos = collendpos;
8933         break;
8934 
8935     case _Py_ERROR_XMLCHARREFREPLACE:
8936         /* generate replacement (temporarily (mis)uses p) */
8937         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8938             char buffer[2+29+1+1];
8939             char *cp;
8940             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8941             for (cp = buffer; *cp; ++cp) {
8942                 x = charmapencode_output(*cp, mapping, res, respos);
8943                 if (x==enc_EXCEPTION)
8944                     return -1;
8945                 else if (x==enc_FAILED) {
8946                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8947                     return -1;
8948                 }
8949             }
8950         }
8951         *inpos = collendpos;
8952         break;
8953 
8954     default:
8955         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8956                                                       encoding, reason, unicode, exceptionObject,
8957                                                       collstartpos, collendpos, &newpos);
8958         if (repunicode == NULL)
8959             return -1;
8960         if (PyBytes_Check(repunicode)) {
8961             /* Directly copy bytes result to output. */
8962             Py_ssize_t outsize = PyBytes_Size(*res);
8963             Py_ssize_t requiredsize;
8964             repsize = PyBytes_Size(repunicode);
8965             requiredsize = *respos + repsize;
8966             if (requiredsize > outsize)
8967                 /* Make room for all additional bytes. */
8968                 if (charmapencode_resize(res, respos, requiredsize)) {
8969                     Py_DECREF(repunicode);
8970                     return -1;
8971                 }
8972             memcpy(PyBytes_AsString(*res) + *respos,
8973                    PyBytes_AsString(repunicode),  repsize);
8974             *respos += repsize;
8975             *inpos = newpos;
8976             Py_DECREF(repunicode);
8977             break;
8978         }
8979         /* generate replacement  */
8980         if (PyUnicode_READY(repunicode) == -1) {
8981             Py_DECREF(repunicode);
8982             return -1;
8983         }
8984         repsize = PyUnicode_GET_LENGTH(repunicode);
8985         data = PyUnicode_DATA(repunicode);
8986         kind = PyUnicode_KIND(repunicode);
8987         for (index = 0; index < repsize; index++) {
8988             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8989             x = charmapencode_output(repch, mapping, res, respos);
8990             if (x==enc_EXCEPTION) {
8991                 Py_DECREF(repunicode);
8992                 return -1;
8993             }
8994             else if (x==enc_FAILED) {
8995                 Py_DECREF(repunicode);
8996                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8997                 return -1;
8998             }
8999         }
9000         *inpos = newpos;
9001         Py_DECREF(repunicode);
9002     }
9003     return 0;
9004 }
9005 
9006 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)9007 _PyUnicode_EncodeCharmap(PyObject *unicode,
9008                          PyObject *mapping,
9009                          const char *errors)
9010 {
9011     /* output object */
9012     PyObject *res = NULL;
9013     /* current input position */
9014     Py_ssize_t inpos = 0;
9015     Py_ssize_t size;
9016     /* current output position */
9017     Py_ssize_t respos = 0;
9018     PyObject *error_handler_obj = NULL;
9019     PyObject *exc = NULL;
9020     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9021     const void *data;
9022     int kind;
9023 
9024     if (PyUnicode_READY(unicode) == -1)
9025         return NULL;
9026     size = PyUnicode_GET_LENGTH(unicode);
9027     data = PyUnicode_DATA(unicode);
9028     kind = PyUnicode_KIND(unicode);
9029 
9030     /* Default to Latin-1 */
9031     if (mapping == NULL)
9032         return unicode_encode_ucs1(unicode, errors, 256);
9033 
9034     /* allocate enough for a simple encoding without
9035        replacements, if we need more, we'll resize */
9036     res = PyBytes_FromStringAndSize(NULL, size);
9037     if (res == NULL)
9038         goto onError;
9039     if (size == 0)
9040         return res;
9041 
9042     while (inpos<size) {
9043         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9044         /* try to encode it */
9045         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
9046         if (x==enc_EXCEPTION) /* error */
9047             goto onError;
9048         if (x==enc_FAILED) { /* unencodable character */
9049             if (charmap_encoding_error(unicode, &inpos, mapping,
9050                                        &exc,
9051                                        &error_handler, &error_handler_obj, errors,
9052                                        &res, &respos)) {
9053                 goto onError;
9054             }
9055         }
9056         else
9057             /* done with this character => adjust input position */
9058             ++inpos;
9059     }
9060 
9061     /* Resize if we allocated to much */
9062     if (respos<PyBytes_GET_SIZE(res))
9063         if (_PyBytes_Resize(&res, respos) < 0)
9064             goto onError;
9065 
9066     Py_XDECREF(exc);
9067     Py_XDECREF(error_handler_obj);
9068     return res;
9069 
9070   onError:
9071     Py_XDECREF(res);
9072     Py_XDECREF(exc);
9073     Py_XDECREF(error_handler_obj);
9074     return NULL;
9075 }
9076 
9077 /* Deprecated */
9078 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9079 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9080                         Py_ssize_t size,
9081                         PyObject *mapping,
9082                         const char *errors)
9083 {
9084     PyObject *result;
9085     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9086     if (unicode == NULL)
9087         return NULL;
9088     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9089     Py_DECREF(unicode);
9090     return result;
9091 }
9092 
9093 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)9094 PyUnicode_AsCharmapString(PyObject *unicode,
9095                           PyObject *mapping)
9096 {
9097     if (!PyUnicode_Check(unicode) || mapping == NULL) {
9098         PyErr_BadArgument();
9099         return NULL;
9100     }
9101     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9102 }
9103 
9104 /* create or adjust a UnicodeTranslateError */
9105 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)9106 make_translate_exception(PyObject **exceptionObject,
9107                          PyObject *unicode,
9108                          Py_ssize_t startpos, Py_ssize_t endpos,
9109                          const char *reason)
9110 {
9111     if (*exceptionObject == NULL) {
9112         *exceptionObject = _PyUnicodeTranslateError_Create(
9113             unicode, startpos, endpos, reason);
9114     }
9115     else {
9116         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9117             goto onError;
9118         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9119             goto onError;
9120         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9121             goto onError;
9122         return;
9123       onError:
9124         Py_CLEAR(*exceptionObject);
9125     }
9126 }
9127 
9128 /* error handling callback helper:
9129    build arguments, call the callback and check the arguments,
9130    put the result into newpos and return the replacement string, which
9131    has to be freed by the caller */
9132 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)9133 unicode_translate_call_errorhandler(const char *errors,
9134                                     PyObject **errorHandler,
9135                                     const char *reason,
9136                                     PyObject *unicode, PyObject **exceptionObject,
9137                                     Py_ssize_t startpos, Py_ssize_t endpos,
9138                                     Py_ssize_t *newpos)
9139 {
9140     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9141 
9142     Py_ssize_t i_newpos;
9143     PyObject *restuple;
9144     PyObject *resunicode;
9145 
9146     if (*errorHandler == NULL) {
9147         *errorHandler = PyCodec_LookupError(errors);
9148         if (*errorHandler == NULL)
9149             return NULL;
9150     }
9151 
9152     make_translate_exception(exceptionObject,
9153                              unicode, startpos, endpos, reason);
9154     if (*exceptionObject == NULL)
9155         return NULL;
9156 
9157     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9158     if (restuple == NULL)
9159         return NULL;
9160     if (!PyTuple_Check(restuple)) {
9161         PyErr_SetString(PyExc_TypeError, &argparse[3]);
9162         Py_DECREF(restuple);
9163         return NULL;
9164     }
9165     if (!PyArg_ParseTuple(restuple, argparse,
9166                           &resunicode, &i_newpos)) {
9167         Py_DECREF(restuple);
9168         return NULL;
9169     }
9170     if (i_newpos<0)
9171         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9172     else
9173         *newpos = i_newpos;
9174     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9175         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9176         Py_DECREF(restuple);
9177         return NULL;
9178     }
9179     Py_INCREF(resunicode);
9180     Py_DECREF(restuple);
9181     return resunicode;
9182 }
9183 
9184 /* Lookup the character ch in the mapping and put the result in result,
9185    which must be decrefed by the caller.
9186    Return 0 on success, -1 on error */
9187 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)9188 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9189 {
9190     PyObject *w = PyLong_FromLong((long)c);
9191     PyObject *x;
9192 
9193     if (w == NULL)
9194         return -1;
9195     x = PyObject_GetItem(mapping, w);
9196     Py_DECREF(w);
9197     if (x == NULL) {
9198         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9199             /* No mapping found means: use 1:1 mapping. */
9200             PyErr_Clear();
9201             *result = NULL;
9202             return 0;
9203         } else
9204             return -1;
9205     }
9206     else if (x == Py_None) {
9207         *result = x;
9208         return 0;
9209     }
9210     else if (PyLong_Check(x)) {
9211         long value = PyLong_AS_LONG(x);
9212         if (value < 0 || value > MAX_UNICODE) {
9213             PyErr_Format(PyExc_ValueError,
9214                          "character mapping must be in range(0x%x)",
9215                          MAX_UNICODE+1);
9216             Py_DECREF(x);
9217             return -1;
9218         }
9219         *result = x;
9220         return 0;
9221     }
9222     else if (PyUnicode_Check(x)) {
9223         *result = x;
9224         return 0;
9225     }
9226     else {
9227         /* wrong return value */
9228         PyErr_SetString(PyExc_TypeError,
9229                         "character mapping must return integer, None or str");
9230         Py_DECREF(x);
9231         return -1;
9232     }
9233 }
9234 
9235 /* lookup the character, write the result into the writer.
9236    Return 1 if the result was written into the writer, return 0 if the mapping
9237    was undefined, raise an exception return -1 on error. */
9238 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9239 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9240                         _PyUnicodeWriter *writer)
9241 {
9242     PyObject *item;
9243 
9244     if (charmaptranslate_lookup(ch, mapping, &item))
9245         return -1;
9246 
9247     if (item == NULL) {
9248         /* not found => default to 1:1 mapping */
9249         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9250             return -1;
9251         }
9252         return 1;
9253     }
9254 
9255     if (item == Py_None) {
9256         Py_DECREF(item);
9257         return 0;
9258     }
9259 
9260     if (PyLong_Check(item)) {
9261         long ch = (Py_UCS4)PyLong_AS_LONG(item);
9262         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9263            used it */
9264         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9265             Py_DECREF(item);
9266             return -1;
9267         }
9268         Py_DECREF(item);
9269         return 1;
9270     }
9271 
9272     if (!PyUnicode_Check(item)) {
9273         Py_DECREF(item);
9274         return -1;
9275     }
9276 
9277     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9278         Py_DECREF(item);
9279         return -1;
9280     }
9281 
9282     Py_DECREF(item);
9283     return 1;
9284 }
9285 
9286 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9287 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9288                               Py_UCS1 *translate)
9289 {
9290     PyObject *item = NULL;
9291     int ret = 0;
9292 
9293     if (charmaptranslate_lookup(ch, mapping, &item)) {
9294         return -1;
9295     }
9296 
9297     if (item == Py_None) {
9298         /* deletion */
9299         translate[ch] = 0xfe;
9300     }
9301     else if (item == NULL) {
9302         /* not found => default to 1:1 mapping */
9303         translate[ch] = ch;
9304         return 1;
9305     }
9306     else if (PyLong_Check(item)) {
9307         long replace = PyLong_AS_LONG(item);
9308         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9309            used it */
9310         if (127 < replace) {
9311             /* invalid character or character outside ASCII:
9312                skip the fast translate */
9313             goto exit;
9314         }
9315         translate[ch] = (Py_UCS1)replace;
9316     }
9317     else if (PyUnicode_Check(item)) {
9318         Py_UCS4 replace;
9319 
9320         if (PyUnicode_READY(item) == -1) {
9321             Py_DECREF(item);
9322             return -1;
9323         }
9324         if (PyUnicode_GET_LENGTH(item) != 1)
9325             goto exit;
9326 
9327         replace = PyUnicode_READ_CHAR(item, 0);
9328         if (replace > 127)
9329             goto exit;
9330         translate[ch] = (Py_UCS1)replace;
9331     }
9332     else {
9333         /* not None, NULL, long or unicode */
9334         goto exit;
9335     }
9336     ret = 1;
9337 
9338   exit:
9339     Py_DECREF(item);
9340     return ret;
9341 }
9342 
9343 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9344    was translated into writer, return 0 if the input string was partially
9345    translated into writer, raise an exception and return -1 on error. */
9346 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9347 unicode_fast_translate(PyObject *input, PyObject *mapping,
9348                        _PyUnicodeWriter *writer, int ignore,
9349                        Py_ssize_t *input_pos)
9350 {
9351     Py_UCS1 ascii_table[128], ch, ch2;
9352     Py_ssize_t len;
9353     const Py_UCS1 *in, *end;
9354     Py_UCS1 *out;
9355     int res = 0;
9356 
9357     len = PyUnicode_GET_LENGTH(input);
9358 
9359     memset(ascii_table, 0xff, 128);
9360 
9361     in = PyUnicode_1BYTE_DATA(input);
9362     end = in + len;
9363 
9364     assert(PyUnicode_IS_ASCII(writer->buffer));
9365     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9366     out = PyUnicode_1BYTE_DATA(writer->buffer);
9367 
9368     for (; in < end; in++) {
9369         ch = *in;
9370         ch2 = ascii_table[ch];
9371         if (ch2 == 0xff) {
9372             int translate = unicode_fast_translate_lookup(mapping, ch,
9373                                                           ascii_table);
9374             if (translate < 0)
9375                 return -1;
9376             if (translate == 0)
9377                 goto exit;
9378             ch2 = ascii_table[ch];
9379         }
9380         if (ch2 == 0xfe) {
9381             if (ignore)
9382                 continue;
9383             goto exit;
9384         }
9385         assert(ch2 < 128);
9386         *out = ch2;
9387         out++;
9388     }
9389     res = 1;
9390 
9391 exit:
9392     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9393     *input_pos = in - PyUnicode_1BYTE_DATA(input);
9394     return res;
9395 }
9396 
9397 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9398 _PyUnicode_TranslateCharmap(PyObject *input,
9399                             PyObject *mapping,
9400                             const char *errors)
9401 {
9402     /* input object */
9403     const void *data;
9404     Py_ssize_t size, i;
9405     int kind;
9406     /* output buffer */
9407     _PyUnicodeWriter writer;
9408     /* error handler */
9409     const char *reason = "character maps to <undefined>";
9410     PyObject *errorHandler = NULL;
9411     PyObject *exc = NULL;
9412     int ignore;
9413     int res;
9414 
9415     if (mapping == NULL) {
9416         PyErr_BadArgument();
9417         return NULL;
9418     }
9419 
9420     if (PyUnicode_READY(input) == -1)
9421         return NULL;
9422     data = PyUnicode_DATA(input);
9423     kind = PyUnicode_KIND(input);
9424     size = PyUnicode_GET_LENGTH(input);
9425 
9426     if (size == 0)
9427         return PyUnicode_FromObject(input);
9428 
9429     /* allocate enough for a simple 1:1 translation without
9430        replacements, if we need more, we'll resize */
9431     _PyUnicodeWriter_Init(&writer);
9432     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9433         goto onError;
9434 
9435     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9436 
9437     if (PyUnicode_READY(input) == -1)
9438         return NULL;
9439     if (PyUnicode_IS_ASCII(input)) {
9440         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9441         if (res < 0) {
9442             _PyUnicodeWriter_Dealloc(&writer);
9443             return NULL;
9444         }
9445         if (res == 1)
9446             return _PyUnicodeWriter_Finish(&writer);
9447     }
9448     else {
9449         i = 0;
9450     }
9451 
9452     while (i<size) {
9453         /* try to encode it */
9454         int translate;
9455         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9456         Py_ssize_t newpos;
9457         /* startpos for collecting untranslatable chars */
9458         Py_ssize_t collstart;
9459         Py_ssize_t collend;
9460         Py_UCS4 ch;
9461 
9462         ch = PyUnicode_READ(kind, data, i);
9463         translate = charmaptranslate_output(ch, mapping, &writer);
9464         if (translate < 0)
9465             goto onError;
9466 
9467         if (translate != 0) {
9468             /* it worked => adjust input pointer */
9469             ++i;
9470             continue;
9471         }
9472 
9473         /* untranslatable character */
9474         collstart = i;
9475         collend = i+1;
9476 
9477         /* find all untranslatable characters */
9478         while (collend < size) {
9479             PyObject *x;
9480             ch = PyUnicode_READ(kind, data, collend);
9481             if (charmaptranslate_lookup(ch, mapping, &x))
9482                 goto onError;
9483             Py_XDECREF(x);
9484             if (x != Py_None)
9485                 break;
9486             ++collend;
9487         }
9488 
9489         if (ignore) {
9490             i = collend;
9491         }
9492         else {
9493             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9494                                                              reason, input, &exc,
9495                                                              collstart, collend, &newpos);
9496             if (repunicode == NULL)
9497                 goto onError;
9498             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9499                 Py_DECREF(repunicode);
9500                 goto onError;
9501             }
9502             Py_DECREF(repunicode);
9503             i = newpos;
9504         }
9505     }
9506     Py_XDECREF(exc);
9507     Py_XDECREF(errorHandler);
9508     return _PyUnicodeWriter_Finish(&writer);
9509 
9510   onError:
9511     _PyUnicodeWriter_Dealloc(&writer);
9512     Py_XDECREF(exc);
9513     Py_XDECREF(errorHandler);
9514     return NULL;
9515 }
9516 
9517 /* Deprecated. Use PyUnicode_Translate instead. */
9518 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9519 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9520                            Py_ssize_t size,
9521                            PyObject *mapping,
9522                            const char *errors)
9523 {
9524     PyObject *result;
9525     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9526     if (!unicode)
9527         return NULL;
9528     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9529     Py_DECREF(unicode);
9530     return result;
9531 }
9532 
9533 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9534 PyUnicode_Translate(PyObject *str,
9535                     PyObject *mapping,
9536                     const char *errors)
9537 {
9538     if (ensure_unicode(str) < 0)
9539         return NULL;
9540     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9541 }
9542 
9543 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9544 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9545 {
9546     if (!PyUnicode_Check(unicode)) {
9547         PyErr_BadInternalCall();
9548         return NULL;
9549     }
9550     if (PyUnicode_READY(unicode) == -1)
9551         return NULL;
9552     if (PyUnicode_IS_ASCII(unicode)) {
9553         /* If the string is already ASCII, just return the same string */
9554         Py_INCREF(unicode);
9555         return unicode;
9556     }
9557 
9558     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9559     PyObject *result = PyUnicode_New(len, 127);
9560     if (result == NULL) {
9561         return NULL;
9562     }
9563 
9564     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9565     int kind = PyUnicode_KIND(unicode);
9566     const void *data = PyUnicode_DATA(unicode);
9567     Py_ssize_t i;
9568     for (i = 0; i < len; ++i) {
9569         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9570         if (ch < 127) {
9571             out[i] = ch;
9572         }
9573         else if (Py_UNICODE_ISSPACE(ch)) {
9574             out[i] = ' ';
9575         }
9576         else {
9577             int decimal = Py_UNICODE_TODECIMAL(ch);
9578             if (decimal < 0) {
9579                 out[i] = '?';
9580                 out[i+1] = '\0';
9581                 _PyUnicode_LENGTH(result) = i + 1;
9582                 break;
9583             }
9584             out[i] = '0' + decimal;
9585         }
9586     }
9587 
9588     assert(_PyUnicode_CheckConsistency(result, 1));
9589     return result;
9590 }
9591 
9592 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9593 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9594                                   Py_ssize_t length)
9595 {
9596     PyObject *decimal;
9597     Py_ssize_t i;
9598     Py_UCS4 maxchar;
9599     enum PyUnicode_Kind kind;
9600     const void *data;
9601 
9602     maxchar = 127;
9603     for (i = 0; i < length; i++) {
9604         Py_UCS4 ch = s[i];
9605         if (ch > 127) {
9606             int decimal = Py_UNICODE_TODECIMAL(ch);
9607             if (decimal >= 0)
9608                 ch = '0' + decimal;
9609             maxchar = Py_MAX(maxchar, ch);
9610         }
9611     }
9612 
9613     /* Copy to a new string */
9614     decimal = PyUnicode_New(length, maxchar);
9615     if (decimal == NULL)
9616         return decimal;
9617     kind = PyUnicode_KIND(decimal);
9618     data = PyUnicode_DATA(decimal);
9619     /* Iterate over code points */
9620     for (i = 0; i < length; i++) {
9621         Py_UCS4 ch = s[i];
9622         if (ch > 127) {
9623             int decimal = Py_UNICODE_TODECIMAL(ch);
9624             if (decimal >= 0)
9625                 ch = '0' + decimal;
9626         }
9627         PyUnicode_WRITE(kind, data, i, ch);
9628     }
9629     return unicode_result(decimal);
9630 }
9631 /* --- Decimal Encoder ---------------------------------------------------- */
9632 
9633 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9634 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9635                         Py_ssize_t length,
9636                         char *output,
9637                         const char *errors)
9638 {
9639     PyObject *unicode;
9640     Py_ssize_t i;
9641     enum PyUnicode_Kind kind;
9642     const void *data;
9643 
9644     if (output == NULL) {
9645         PyErr_BadArgument();
9646         return -1;
9647     }
9648 
9649     unicode = PyUnicode_FromWideChar(s, length);
9650     if (unicode == NULL)
9651         return -1;
9652 
9653     kind = PyUnicode_KIND(unicode);
9654     data = PyUnicode_DATA(unicode);
9655 
9656     for (i=0; i < length; ) {
9657         PyObject *exc;
9658         Py_UCS4 ch;
9659         int decimal;
9660         Py_ssize_t startpos;
9661 
9662         ch = PyUnicode_READ(kind, data, i);
9663 
9664         if (Py_UNICODE_ISSPACE(ch)) {
9665             *output++ = ' ';
9666             i++;
9667             continue;
9668         }
9669         decimal = Py_UNICODE_TODECIMAL(ch);
9670         if (decimal >= 0) {
9671             *output++ = '0' + decimal;
9672             i++;
9673             continue;
9674         }
9675         if (0 < ch && ch < 256) {
9676             *output++ = (char)ch;
9677             i++;
9678             continue;
9679         }
9680 
9681         startpos = i;
9682         exc = NULL;
9683         raise_encode_exception(&exc, "decimal", unicode,
9684                                startpos, startpos+1,
9685                                "invalid decimal Unicode string");
9686         Py_XDECREF(exc);
9687         Py_DECREF(unicode);
9688         return -1;
9689     }
9690     /* 0-terminate the output string */
9691     *output++ = '\0';
9692     Py_DECREF(unicode);
9693     return 0;
9694 }
9695 
9696 /* --- Helpers ------------------------------------------------------------ */
9697 
/* helper macro to fixup start/end slice values

   `end` is capped at `len`; a negative `end` or `start` has `len` added
   once and is then floored at 0.  `start` is not capped at `len`.
   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once, so pass expressions without side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as
   a single statement (safe inside an unbraced if/else); the invocation
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9712 
9713 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9714 any_find_slice(PyObject* s1, PyObject* s2,
9715                Py_ssize_t start,
9716                Py_ssize_t end,
9717                int direction)
9718 {
9719     int kind1, kind2;
9720     const void *buf1, *buf2;
9721     Py_ssize_t len1, len2, result;
9722 
9723     kind1 = PyUnicode_KIND(s1);
9724     kind2 = PyUnicode_KIND(s2);
9725     if (kind1 < kind2)
9726         return -1;
9727 
9728     len1 = PyUnicode_GET_LENGTH(s1);
9729     len2 = PyUnicode_GET_LENGTH(s2);
9730     ADJUST_INDICES(start, end, len1);
9731     if (end - start < len2)
9732         return -1;
9733 
9734     buf1 = PyUnicode_DATA(s1);
9735     buf2 = PyUnicode_DATA(s2);
9736     if (len2 == 1) {
9737         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9738         result = findchar((const char *)buf1 + kind1*start,
9739                           kind1, end - start, ch, direction);
9740         if (result == -1)
9741             return -1;
9742         else
9743             return start + result;
9744     }
9745 
9746     if (kind2 != kind1) {
9747         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9748         if (!buf2)
9749             return -2;
9750     }
9751 
9752     if (direction > 0) {
9753         switch (kind1) {
9754         case PyUnicode_1BYTE_KIND:
9755             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9756                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9757             else
9758                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9759             break;
9760         case PyUnicode_2BYTE_KIND:
9761             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9762             break;
9763         case PyUnicode_4BYTE_KIND:
9764             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9765             break;
9766         default:
9767             Py_UNREACHABLE();
9768         }
9769     }
9770     else {
9771         switch (kind1) {
9772         case PyUnicode_1BYTE_KIND:
9773             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9774                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9775             else
9776                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9777             break;
9778         case PyUnicode_2BYTE_KIND:
9779             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9780             break;
9781         case PyUnicode_4BYTE_KIND:
9782             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9783             break;
9784         default:
9785             Py_UNREACHABLE();
9786         }
9787     }
9788 
9789     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9790     if (kind2 != kind1)
9791         PyMem_Free((void *)buf2);
9792 
9793     return result;
9794 }
9795 
9796 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9797 #include "stringlib/localeutil.h"
9798 
9799 /**
9800  * InsertThousandsGrouping:
9801  * @writer: Unicode writer.
9802  * @n_buffer: Number of characters in @buffer.
9803  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9804  * @d_pos: Start of digits string.
9805  * @n_digits: The number of digits in the string, in which we want
9806  *            to put the grouping chars.
9807  * @min_width: The minimum width of the digits in the output string.
9808  *             Output will be zero-padded on the left to fill.
9809  * @grouping: see definition in localeconv().
9810  * @thousands_sep: see definition in localeconv().
9811  *
9812  * There are 2 modes: counting and filling. If @writer is NULL,
9813  *  we are in counting mode, else filling mode.
9814  * If counting, the required buffer size is returned.
9815  * If filling, we know the buffer will be large enough, so we don't
9816  *  need to pass in the buffer size.
9817  * Inserts thousand grouping characters (as defined by grouping and
9818  *  thousands_sep) into @writer.
9819  *
9820  * Return value: -1 on error, number of characters otherwise.
9821  **/
9822 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9823 _PyUnicode_InsertThousandsGrouping(
9824     _PyUnicodeWriter *writer,
9825     Py_ssize_t n_buffer,
9826     PyObject *digits,
9827     Py_ssize_t d_pos,
9828     Py_ssize_t n_digits,
9829     Py_ssize_t min_width,
9830     const char *grouping,
9831     PyObject *thousands_sep,
9832     Py_UCS4 *maxchar)
9833 {
9834     min_width = Py_MAX(0, min_width);
9835     if (writer) {
9836         assert(digits != NULL);
9837         assert(maxchar == NULL);
9838     }
9839     else {
9840         assert(digits == NULL);
9841         assert(maxchar != NULL);
9842     }
9843     assert(0 <= d_pos);
9844     assert(0 <= n_digits);
9845     assert(grouping != NULL);
9846 
9847     if (digits != NULL) {
9848         if (PyUnicode_READY(digits) == -1) {
9849             return -1;
9850         }
9851     }
9852     if (PyUnicode_READY(thousands_sep) == -1) {
9853         return -1;
9854     }
9855 
9856     Py_ssize_t count = 0;
9857     Py_ssize_t n_zeros;
9858     int loop_broken = 0;
9859     int use_separator = 0; /* First time through, don't append the
9860                               separator. They only go between
9861                               groups. */
9862     Py_ssize_t buffer_pos;
9863     Py_ssize_t digits_pos;
9864     Py_ssize_t len;
9865     Py_ssize_t n_chars;
9866     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9867                                         be looked at */
9868     /* A generator that returns all of the grouping widths, until it
9869        returns 0. */
9870     GroupGenerator groupgen;
9871     GroupGenerator_init(&groupgen, grouping);
9872     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9873 
9874     /* if digits are not grouped, thousands separator
9875        should be an empty string */
9876     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9877 
9878     digits_pos = d_pos + n_digits;
9879     if (writer) {
9880         buffer_pos = writer->pos + n_buffer;
9881         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9882         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9883     }
9884     else {
9885         buffer_pos = n_buffer;
9886     }
9887 
9888     if (!writer) {
9889         *maxchar = 127;
9890     }
9891 
9892     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9893         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9894         n_zeros = Py_MAX(0, len - remaining);
9895         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9896 
9897         /* Use n_zero zero's and n_chars chars */
9898 
9899         /* Count only, don't do anything. */
9900         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9901 
9902         /* Copy into the writer. */
9903         InsertThousandsGrouping_fill(writer, &buffer_pos,
9904                                      digits, &digits_pos,
9905                                      n_chars, n_zeros,
9906                                      use_separator ? thousands_sep : NULL,
9907                                      thousands_sep_len, maxchar);
9908 
9909         /* Use a separator next time. */
9910         use_separator = 1;
9911 
9912         remaining -= n_chars;
9913         min_width -= len;
9914 
9915         if (remaining <= 0 && min_width <= 0) {
9916             loop_broken = 1;
9917             break;
9918         }
9919         min_width -= thousands_sep_len;
9920     }
9921     if (!loop_broken) {
9922         /* We left the loop without using a break statement. */
9923 
9924         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9925         n_zeros = Py_MAX(0, len - remaining);
9926         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9927 
9928         /* Use n_zero zero's and n_chars chars */
9929         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9930 
9931         /* Copy into the writer. */
9932         InsertThousandsGrouping_fill(writer, &buffer_pos,
9933                                      digits, &digits_pos,
9934                                      n_chars, n_zeros,
9935                                      use_separator ? thousands_sep : NULL,
9936                                      thousands_sep_len, maxchar);
9937     }
9938     return count;
9939 }
9940 
9941 
9942 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9943 PyUnicode_Count(PyObject *str,
9944                 PyObject *substr,
9945                 Py_ssize_t start,
9946                 Py_ssize_t end)
9947 {
9948     Py_ssize_t result;
9949     int kind1, kind2;
9950     const void *buf1 = NULL, *buf2 = NULL;
9951     Py_ssize_t len1, len2;
9952 
9953     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9954         return -1;
9955 
9956     kind1 = PyUnicode_KIND(str);
9957     kind2 = PyUnicode_KIND(substr);
9958     if (kind1 < kind2)
9959         return 0;
9960 
9961     len1 = PyUnicode_GET_LENGTH(str);
9962     len2 = PyUnicode_GET_LENGTH(substr);
9963     ADJUST_INDICES(start, end, len1);
9964     if (end - start < len2)
9965         return 0;
9966 
9967     buf1 = PyUnicode_DATA(str);
9968     buf2 = PyUnicode_DATA(substr);
9969     if (kind2 != kind1) {
9970         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9971         if (!buf2)
9972             goto onError;
9973     }
9974 
9975     switch (kind1) {
9976     case PyUnicode_1BYTE_KIND:
9977         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9978             result = asciilib_count(
9979                 ((const Py_UCS1*)buf1) + start, end - start,
9980                 buf2, len2, PY_SSIZE_T_MAX
9981                 );
9982         else
9983             result = ucs1lib_count(
9984                 ((const Py_UCS1*)buf1) + start, end - start,
9985                 buf2, len2, PY_SSIZE_T_MAX
9986                 );
9987         break;
9988     case PyUnicode_2BYTE_KIND:
9989         result = ucs2lib_count(
9990             ((const Py_UCS2*)buf1) + start, end - start,
9991             buf2, len2, PY_SSIZE_T_MAX
9992             );
9993         break;
9994     case PyUnicode_4BYTE_KIND:
9995         result = ucs4lib_count(
9996             ((const Py_UCS4*)buf1) + start, end - start,
9997             buf2, len2, PY_SSIZE_T_MAX
9998             );
9999         break;
10000     default:
10001         Py_UNREACHABLE();
10002     }
10003 
10004     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10005     if (kind2 != kind1)
10006         PyMem_Free((void *)buf2);
10007 
10008     return result;
10009   onError:
10010     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10011     if (kind2 != kind1)
10012         PyMem_Free((void *)buf2);
10013     return -1;
10014 }
10015 
10016 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10017 PyUnicode_Find(PyObject *str,
10018                PyObject *substr,
10019                Py_ssize_t start,
10020                Py_ssize_t end,
10021                int direction)
10022 {
10023     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10024         return -2;
10025 
10026     return any_find_slice(str, substr, start, end, direction);
10027 }
10028 
10029 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)10030 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10031                    Py_ssize_t start, Py_ssize_t end,
10032                    int direction)
10033 {
10034     int kind;
10035     Py_ssize_t len, result;
10036     if (PyUnicode_READY(str) == -1)
10037         return -2;
10038     len = PyUnicode_GET_LENGTH(str);
10039     ADJUST_INDICES(start, end, len);
10040     if (end - start < 1)
10041         return -1;
10042     kind = PyUnicode_KIND(str);
10043     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10044                       kind, end-start, ch, direction);
10045     if (result == -1)
10046         return -1;
10047     else
10048         return start + result;
10049 }
10050 
10051 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)10052 tailmatch(PyObject *self,
10053           PyObject *substring,
10054           Py_ssize_t start,
10055           Py_ssize_t end,
10056           int direction)
10057 {
10058     int kind_self;
10059     int kind_sub;
10060     const void *data_self;
10061     const void *data_sub;
10062     Py_ssize_t offset;
10063     Py_ssize_t i;
10064     Py_ssize_t end_sub;
10065 
10066     if (PyUnicode_READY(self) == -1 ||
10067         PyUnicode_READY(substring) == -1)
10068         return -1;
10069 
10070     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10071     end -= PyUnicode_GET_LENGTH(substring);
10072     if (end < start)
10073         return 0;
10074 
10075     if (PyUnicode_GET_LENGTH(substring) == 0)
10076         return 1;
10077 
10078     kind_self = PyUnicode_KIND(self);
10079     data_self = PyUnicode_DATA(self);
10080     kind_sub = PyUnicode_KIND(substring);
10081     data_sub = PyUnicode_DATA(substring);
10082     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10083 
10084     if (direction > 0)
10085         offset = end;
10086     else
10087         offset = start;
10088 
10089     if (PyUnicode_READ(kind_self, data_self, offset) ==
10090         PyUnicode_READ(kind_sub, data_sub, 0) &&
10091         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10092         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10093         /* If both are of the same kind, memcmp is sufficient */
10094         if (kind_self == kind_sub) {
10095             return ! memcmp((char *)data_self +
10096                                 (offset * PyUnicode_KIND(substring)),
10097                             data_sub,
10098                             PyUnicode_GET_LENGTH(substring) *
10099                                 PyUnicode_KIND(substring));
10100         }
10101         /* otherwise we have to compare each character by first accessing it */
10102         else {
10103             /* We do not need to compare 0 and len(substring)-1 because
10104                the if statement above ensured already that they are equal
10105                when we end up here. */
10106             for (i = 1; i < end_sub; ++i) {
10107                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10108                     PyUnicode_READ(kind_sub, data_sub, i))
10109                     return 0;
10110             }
10111             return 1;
10112         }
10113     }
10114 
10115     return 0;
10116 }
10117 
10118 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10119 PyUnicode_Tailmatch(PyObject *str,
10120                     PyObject *substr,
10121                     Py_ssize_t start,
10122                     Py_ssize_t end,
10123                     int direction)
10124 {
10125     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10126         return -1;
10127 
10128     return tailmatch(str, substr, start, end, direction);
10129 }
10130 
10131 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)10132 ascii_upper_or_lower(PyObject *self, int lower)
10133 {
10134     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
10135     const char *data = PyUnicode_DATA(self);
10136     char *resdata;
10137     PyObject *res;
10138 
10139     res = PyUnicode_New(len, 127);
10140     if (res == NULL)
10141         return NULL;
10142     resdata = PyUnicode_DATA(res);
10143     if (lower)
10144         _Py_bytes_lower(resdata, data, len);
10145     else
10146         _Py_bytes_upper(resdata, data, len);
10147     return res;
10148 }
10149 
10150 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)10151 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10152 {
10153     Py_ssize_t j;
10154     int final_sigma;
10155     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
10156     /* U+03A3 is in the Final_Sigma context when, it is found like this:
10157 
10158      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10159 
10160     where ! is a negation and \p{xxx} is a character with property xxx.
10161     */
10162     for (j = i - 1; j >= 0; j--) {
10163         c = PyUnicode_READ(kind, data, j);
10164         if (!_PyUnicode_IsCaseIgnorable(c))
10165             break;
10166     }
10167     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10168     if (final_sigma) {
10169         for (j = i + 1; j < length; j++) {
10170             c = PyUnicode_READ(kind, data, j);
10171             if (!_PyUnicode_IsCaseIgnorable(c))
10172                 break;
10173         }
10174         final_sigma = j == length || !_PyUnicode_IsCased(c);
10175     }
10176     return (final_sigma) ? 0x3C2 : 0x3C3;
10177 }
10178 
10179 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)10180 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10181            Py_UCS4 c, Py_UCS4 *mapped)
10182 {
10183     /* Obscure special case. */
10184     if (c == 0x3A3) {
10185         mapped[0] = handle_capital_sigma(kind, data, length, i);
10186         return 1;
10187     }
10188     return _PyUnicode_ToLowerFull(c, mapped);
10189 }
10190 
10191 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10192 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10193 {
10194     Py_ssize_t i, k = 0;
10195     int n_res, j;
10196     Py_UCS4 c, mapped[3];
10197 
10198     c = PyUnicode_READ(kind, data, 0);
10199     n_res = _PyUnicode_ToTitleFull(c, mapped);
10200     for (j = 0; j < n_res; j++) {
10201         *maxchar = Py_MAX(*maxchar, mapped[j]);
10202         res[k++] = mapped[j];
10203     }
10204     for (i = 1; i < length; i++) {
10205         c = PyUnicode_READ(kind, data, i);
10206         n_res = lower_ucs4(kind, data, length, i, c, mapped);
10207         for (j = 0; j < n_res; j++) {
10208             *maxchar = Py_MAX(*maxchar, mapped[j]);
10209             res[k++] = mapped[j];
10210         }
10211     }
10212     return k;
10213 }
10214 
10215 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10216 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10217     Py_ssize_t i, k = 0;
10218 
10219     for (i = 0; i < length; i++) {
10220         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10221         int n_res, j;
10222         if (Py_UNICODE_ISUPPER(c)) {
10223             n_res = lower_ucs4(kind, data, length, i, c, mapped);
10224         }
10225         else if (Py_UNICODE_ISLOWER(c)) {
10226             n_res = _PyUnicode_ToUpperFull(c, mapped);
10227         }
10228         else {
10229             n_res = 1;
10230             mapped[0] = c;
10231         }
10232         for (j = 0; j < n_res; j++) {
10233             *maxchar = Py_MAX(*maxchar, mapped[j]);
10234             res[k++] = mapped[j];
10235         }
10236     }
10237     return k;
10238 }
10239 
10240 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)10241 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10242                   Py_UCS4 *maxchar, int lower)
10243 {
10244     Py_ssize_t i, k = 0;
10245 
10246     for (i = 0; i < length; i++) {
10247         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10248         int n_res, j;
10249         if (lower)
10250             n_res = lower_ucs4(kind, data, length, i, c, mapped);
10251         else
10252             n_res = _PyUnicode_ToUpperFull(c, mapped);
10253         for (j = 0; j < n_res; j++) {
10254             *maxchar = Py_MAX(*maxchar, mapped[j]);
10255             res[k++] = mapped[j];
10256         }
10257     }
10258     return k;
10259 }
10260 
10261 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10262 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10263 {
10264     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10265 }
10266 
10267 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10268 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10269 {
10270     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10271 }
10272 
10273 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10274 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10275 {
10276     Py_ssize_t i, k = 0;
10277 
10278     for (i = 0; i < length; i++) {
10279         Py_UCS4 c = PyUnicode_READ(kind, data, i);
10280         Py_UCS4 mapped[3];
10281         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10282         for (j = 0; j < n_res; j++) {
10283             *maxchar = Py_MAX(*maxchar, mapped[j]);
10284             res[k++] = mapped[j];
10285         }
10286     }
10287     return k;
10288 }
10289 
10290 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10291 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10292 {
10293     Py_ssize_t i, k = 0;
10294     int previous_is_cased;
10295 
10296     previous_is_cased = 0;
10297     for (i = 0; i < length; i++) {
10298         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10299         Py_UCS4 mapped[3];
10300         int n_res, j;
10301 
10302         if (previous_is_cased)
10303             n_res = lower_ucs4(kind, data, length, i, c, mapped);
10304         else
10305             n_res = _PyUnicode_ToTitleFull(c, mapped);
10306 
10307         for (j = 0; j < n_res; j++) {
10308             *maxchar = Py_MAX(*maxchar, mapped[j]);
10309             res[k++] = mapped[j];
10310         }
10311 
10312         previous_is_cased = _PyUnicode_IsCased(c);
10313     }
10314     return k;
10315 }
10316 
10317 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10318 case_operation(PyObject *self,
10319                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10320 {
10321     PyObject *res = NULL;
10322     Py_ssize_t length, newlength = 0;
10323     int kind, outkind;
10324     const void *data;
10325     void *outdata;
10326     Py_UCS4 maxchar = 0, *tmp, *tmpend;
10327 
10328     assert(PyUnicode_IS_READY(self));
10329 
10330     kind = PyUnicode_KIND(self);
10331     data = PyUnicode_DATA(self);
10332     length = PyUnicode_GET_LENGTH(self);
10333     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10334         PyErr_SetString(PyExc_OverflowError, "string is too long");
10335         return NULL;
10336     }
10337     tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10338     if (tmp == NULL)
10339         return PyErr_NoMemory();
10340     newlength = perform(kind, data, length, tmp, &maxchar);
10341     res = PyUnicode_New(newlength, maxchar);
10342     if (res == NULL)
10343         goto leave;
10344     tmpend = tmp + newlength;
10345     outdata = PyUnicode_DATA(res);
10346     outkind = PyUnicode_KIND(res);
10347     switch (outkind) {
10348     case PyUnicode_1BYTE_KIND:
10349         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10350         break;
10351     case PyUnicode_2BYTE_KIND:
10352         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10353         break;
10354     case PyUnicode_4BYTE_KIND:
10355         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10356         break;
10357     default:
10358         Py_UNREACHABLE();
10359     }
10360   leave:
10361     PyMem_Free(tmp);
10362     return res;
10363 }
10364 
10365 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10366 PyUnicode_Join(PyObject *separator, PyObject *seq)
10367 {
10368     PyObject *res;
10369     PyObject *fseq;
10370     Py_ssize_t seqlen;
10371     PyObject **items;
10372 
10373     fseq = PySequence_Fast(seq, "can only join an iterable");
10374     if (fseq == NULL) {
10375         return NULL;
10376     }
10377 
10378     /* NOTE: the following code can't call back into Python code,
10379      * so we are sure that fseq won't be mutated.
10380      */
10381 
10382     items = PySequence_Fast_ITEMS(fseq);
10383     seqlen = PySequence_Fast_GET_SIZE(fseq);
10384     res = _PyUnicode_JoinArray(separator, items, seqlen);
10385     Py_DECREF(fseq);
10386     return res;
10387 }
10388 
10389 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10390 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10391 {
10392     PyObject *res = NULL; /* the result */
10393     PyObject *sep = NULL;
10394     Py_ssize_t seplen;
10395     PyObject *item;
10396     Py_ssize_t sz, i, res_offset;
10397     Py_UCS4 maxchar;
10398     Py_UCS4 item_maxchar;
10399     int use_memcpy;
10400     unsigned char *res_data = NULL, *sep_data = NULL;
10401     PyObject *last_obj;
10402     unsigned int kind = 0;
10403 
10404     /* If empty sequence, return u"". */
10405     if (seqlen == 0) {
10406         _Py_RETURN_UNICODE_EMPTY();
10407     }
10408 
10409     /* If singleton sequence with an exact Unicode, return that. */
10410     last_obj = NULL;
10411     if (seqlen == 1) {
10412         if (PyUnicode_CheckExact(items[0])) {
10413             res = items[0];
10414             Py_INCREF(res);
10415             return res;
10416         }
10417         seplen = 0;
10418         maxchar = 0;
10419     }
10420     else {
10421         /* Set up sep and seplen */
10422         if (separator == NULL) {
10423             /* fall back to a blank space separator */
10424             sep = PyUnicode_FromOrdinal(' ');
10425             if (!sep)
10426                 goto onError;
10427             seplen = 1;
10428             maxchar = 32;
10429         }
10430         else {
10431             if (!PyUnicode_Check(separator)) {
10432                 PyErr_Format(PyExc_TypeError,
10433                              "separator: expected str instance,"
10434                              " %.80s found",
10435                              Py_TYPE(separator)->tp_name);
10436                 goto onError;
10437             }
10438             if (PyUnicode_READY(separator))
10439                 goto onError;
10440             sep = separator;
10441             seplen = PyUnicode_GET_LENGTH(separator);
10442             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10443             /* inc refcount to keep this code path symmetric with the
10444                above case of a blank separator */
10445             Py_INCREF(sep);
10446         }
10447         last_obj = sep;
10448     }
10449 
10450     /* There are at least two things to join, or else we have a subclass
10451      * of str in the sequence.
10452      * Do a pre-pass to figure out the total amount of space we'll
10453      * need (sz), and see whether all argument are strings.
10454      */
10455     sz = 0;
10456 #ifdef Py_DEBUG
10457     use_memcpy = 0;
10458 #else
10459     use_memcpy = 1;
10460 #endif
10461     for (i = 0; i < seqlen; i++) {
10462         size_t add_sz;
10463         item = items[i];
10464         if (!PyUnicode_Check(item)) {
10465             PyErr_Format(PyExc_TypeError,
10466                          "sequence item %zd: expected str instance,"
10467                          " %.80s found",
10468                          i, Py_TYPE(item)->tp_name);
10469             goto onError;
10470         }
10471         if (PyUnicode_READY(item) == -1)
10472             goto onError;
10473         add_sz = PyUnicode_GET_LENGTH(item);
10474         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10475         maxchar = Py_MAX(maxchar, item_maxchar);
10476         if (i != 0) {
10477             add_sz += seplen;
10478         }
10479         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10480             PyErr_SetString(PyExc_OverflowError,
10481                             "join() result is too long for a Python string");
10482             goto onError;
10483         }
10484         sz += add_sz;
10485         if (use_memcpy && last_obj != NULL) {
10486             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10487                 use_memcpy = 0;
10488         }
10489         last_obj = item;
10490     }
10491 
10492     res = PyUnicode_New(sz, maxchar);
10493     if (res == NULL)
10494         goto onError;
10495 
10496     /* Catenate everything. */
10497 #ifdef Py_DEBUG
10498     use_memcpy = 0;
10499 #else
10500     if (use_memcpy) {
10501         res_data = PyUnicode_1BYTE_DATA(res);
10502         kind = PyUnicode_KIND(res);
10503         if (seplen != 0)
10504             sep_data = PyUnicode_1BYTE_DATA(sep);
10505     }
10506 #endif
10507     if (use_memcpy) {
10508         for (i = 0; i < seqlen; ++i) {
10509             Py_ssize_t itemlen;
10510             item = items[i];
10511 
10512             /* Copy item, and maybe the separator. */
10513             if (i && seplen != 0) {
10514                 memcpy(res_data,
10515                           sep_data,
10516                           kind * seplen);
10517                 res_data += kind * seplen;
10518             }
10519 
10520             itemlen = PyUnicode_GET_LENGTH(item);
10521             if (itemlen != 0) {
10522                 memcpy(res_data,
10523                           PyUnicode_DATA(item),
10524                           kind * itemlen);
10525                 res_data += kind * itemlen;
10526             }
10527         }
10528         assert(res_data == PyUnicode_1BYTE_DATA(res)
10529                            + kind * PyUnicode_GET_LENGTH(res));
10530     }
10531     else {
10532         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10533             Py_ssize_t itemlen;
10534             item = items[i];
10535 
10536             /* Copy item, and maybe the separator. */
10537             if (i && seplen != 0) {
10538                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10539                 res_offset += seplen;
10540             }
10541 
10542             itemlen = PyUnicode_GET_LENGTH(item);
10543             if (itemlen != 0) {
10544                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10545                 res_offset += itemlen;
10546             }
10547         }
10548         assert(res_offset == PyUnicode_GET_LENGTH(res));
10549     }
10550 
10551     Py_XDECREF(sep);
10552     assert(_PyUnicode_CheckConsistency(res, 1));
10553     return res;
10554 
10555   onError:
10556     Py_XDECREF(sep);
10557     Py_XDECREF(res);
10558     return NULL;
10559 }
10560 
10561 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10562 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10563                     Py_UCS4 fill_char)
10564 {
10565     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10566     void *data = PyUnicode_DATA(unicode);
10567     assert(PyUnicode_IS_READY(unicode));
10568     assert(unicode_modifiable(unicode));
10569     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10570     assert(start >= 0);
10571     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10572     unicode_fill(kind, data, fill_char, start, length);
10573 }
10574 
10575 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10576 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10577                Py_UCS4 fill_char)
10578 {
10579     Py_ssize_t maxlen;
10580 
10581     if (!PyUnicode_Check(unicode)) {
10582         PyErr_BadInternalCall();
10583         return -1;
10584     }
10585     if (PyUnicode_READY(unicode) == -1)
10586         return -1;
10587     if (unicode_check_modifiable(unicode))
10588         return -1;
10589 
10590     if (start < 0) {
10591         PyErr_SetString(PyExc_IndexError, "string index out of range");
10592         return -1;
10593     }
10594     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10595         PyErr_SetString(PyExc_ValueError,
10596                          "fill character is bigger than "
10597                          "the string maximum character");
10598         return -1;
10599     }
10600 
10601     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10602     length = Py_MIN(maxlen, length);
10603     if (length <= 0)
10604         return 0;
10605 
10606     _PyUnicode_FastFill(unicode, start, length, fill_char);
10607     return length;
10608 }
10609 
10610 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10611 pad(PyObject *self,
10612     Py_ssize_t left,
10613     Py_ssize_t right,
10614     Py_UCS4 fill)
10615 {
10616     PyObject *u;
10617     Py_UCS4 maxchar;
10618     int kind;
10619     void *data;
10620 
10621     if (left < 0)
10622         left = 0;
10623     if (right < 0)
10624         right = 0;
10625 
10626     if (left == 0 && right == 0)
10627         return unicode_result_unchanged(self);
10628 
10629     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10630         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10631         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10632         return NULL;
10633     }
10634     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10635     maxchar = Py_MAX(maxchar, fill);
10636     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10637     if (!u)
10638         return NULL;
10639 
10640     kind = PyUnicode_KIND(u);
10641     data = PyUnicode_DATA(u);
10642     if (left)
10643         unicode_fill(kind, data, fill, 0, left);
10644     if (right)
10645         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10646     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10647     assert(_PyUnicode_CheckConsistency(u, 1));
10648     return u;
10649 }
10650 
10651 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10652 PyUnicode_Splitlines(PyObject *string, int keepends)
10653 {
10654     PyObject *list;
10655 
10656     if (ensure_unicode(string) < 0)
10657         return NULL;
10658 
10659     switch (PyUnicode_KIND(string)) {
10660     case PyUnicode_1BYTE_KIND:
10661         if (PyUnicode_IS_ASCII(string))
10662             list = asciilib_splitlines(
10663                 string, PyUnicode_1BYTE_DATA(string),
10664                 PyUnicode_GET_LENGTH(string), keepends);
10665         else
10666             list = ucs1lib_splitlines(
10667                 string, PyUnicode_1BYTE_DATA(string),
10668                 PyUnicode_GET_LENGTH(string), keepends);
10669         break;
10670     case PyUnicode_2BYTE_KIND:
10671         list = ucs2lib_splitlines(
10672             string, PyUnicode_2BYTE_DATA(string),
10673             PyUnicode_GET_LENGTH(string), keepends);
10674         break;
10675     case PyUnicode_4BYTE_KIND:
10676         list = ucs4lib_splitlines(
10677             string, PyUnicode_4BYTE_DATA(string),
10678             PyUnicode_GET_LENGTH(string), keepends);
10679         break;
10680     default:
10681         Py_UNREACHABLE();
10682     }
10683     return list;
10684 }
10685 
10686 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10687 split(PyObject *self,
10688       PyObject *substring,
10689       Py_ssize_t maxcount)
10690 {
10691     int kind1, kind2;
10692     const void *buf1, *buf2;
10693     Py_ssize_t len1, len2;
10694     PyObject* out;
10695 
10696     if (maxcount < 0)
10697         maxcount = PY_SSIZE_T_MAX;
10698 
10699     if (PyUnicode_READY(self) == -1)
10700         return NULL;
10701 
10702     if (substring == NULL)
10703         switch (PyUnicode_KIND(self)) {
10704         case PyUnicode_1BYTE_KIND:
10705             if (PyUnicode_IS_ASCII(self))
10706                 return asciilib_split_whitespace(
10707                     self,  PyUnicode_1BYTE_DATA(self),
10708                     PyUnicode_GET_LENGTH(self), maxcount
10709                     );
10710             else
10711                 return ucs1lib_split_whitespace(
10712                     self,  PyUnicode_1BYTE_DATA(self),
10713                     PyUnicode_GET_LENGTH(self), maxcount
10714                     );
10715         case PyUnicode_2BYTE_KIND:
10716             return ucs2lib_split_whitespace(
10717                 self,  PyUnicode_2BYTE_DATA(self),
10718                 PyUnicode_GET_LENGTH(self), maxcount
10719                 );
10720         case PyUnicode_4BYTE_KIND:
10721             return ucs4lib_split_whitespace(
10722                 self,  PyUnicode_4BYTE_DATA(self),
10723                 PyUnicode_GET_LENGTH(self), maxcount
10724                 );
10725         default:
10726             Py_UNREACHABLE();
10727         }
10728 
10729     if (PyUnicode_READY(substring) == -1)
10730         return NULL;
10731 
10732     kind1 = PyUnicode_KIND(self);
10733     kind2 = PyUnicode_KIND(substring);
10734     len1 = PyUnicode_GET_LENGTH(self);
10735     len2 = PyUnicode_GET_LENGTH(substring);
10736     if (kind1 < kind2 || len1 < len2) {
10737         out = PyList_New(1);
10738         if (out == NULL)
10739             return NULL;
10740         Py_INCREF(self);
10741         PyList_SET_ITEM(out, 0, self);
10742         return out;
10743     }
10744     buf1 = PyUnicode_DATA(self);
10745     buf2 = PyUnicode_DATA(substring);
10746     if (kind2 != kind1) {
10747         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10748         if (!buf2)
10749             return NULL;
10750     }
10751 
10752     switch (kind1) {
10753     case PyUnicode_1BYTE_KIND:
10754         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10755             out = asciilib_split(
10756                 self,  buf1, len1, buf2, len2, maxcount);
10757         else
10758             out = ucs1lib_split(
10759                 self,  buf1, len1, buf2, len2, maxcount);
10760         break;
10761     case PyUnicode_2BYTE_KIND:
10762         out = ucs2lib_split(
10763             self,  buf1, len1, buf2, len2, maxcount);
10764         break;
10765     case PyUnicode_4BYTE_KIND:
10766         out = ucs4lib_split(
10767             self,  buf1, len1, buf2, len2, maxcount);
10768         break;
10769     default:
10770         out = NULL;
10771     }
10772     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10773     if (kind2 != kind1)
10774         PyMem_Free((void *)buf2);
10775     return out;
10776 }
10777 
10778 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10779 rsplit(PyObject *self,
10780        PyObject *substring,
10781        Py_ssize_t maxcount)
10782 {
10783     int kind1, kind2;
10784     const void *buf1, *buf2;
10785     Py_ssize_t len1, len2;
10786     PyObject* out;
10787 
10788     if (maxcount < 0)
10789         maxcount = PY_SSIZE_T_MAX;
10790 
10791     if (PyUnicode_READY(self) == -1)
10792         return NULL;
10793 
10794     if (substring == NULL)
10795         switch (PyUnicode_KIND(self)) {
10796         case PyUnicode_1BYTE_KIND:
10797             if (PyUnicode_IS_ASCII(self))
10798                 return asciilib_rsplit_whitespace(
10799                     self,  PyUnicode_1BYTE_DATA(self),
10800                     PyUnicode_GET_LENGTH(self), maxcount
10801                     );
10802             else
10803                 return ucs1lib_rsplit_whitespace(
10804                     self,  PyUnicode_1BYTE_DATA(self),
10805                     PyUnicode_GET_LENGTH(self), maxcount
10806                     );
10807         case PyUnicode_2BYTE_KIND:
10808             return ucs2lib_rsplit_whitespace(
10809                 self,  PyUnicode_2BYTE_DATA(self),
10810                 PyUnicode_GET_LENGTH(self), maxcount
10811                 );
10812         case PyUnicode_4BYTE_KIND:
10813             return ucs4lib_rsplit_whitespace(
10814                 self,  PyUnicode_4BYTE_DATA(self),
10815                 PyUnicode_GET_LENGTH(self), maxcount
10816                 );
10817         default:
10818             Py_UNREACHABLE();
10819         }
10820 
10821     if (PyUnicode_READY(substring) == -1)
10822         return NULL;
10823 
10824     kind1 = PyUnicode_KIND(self);
10825     kind2 = PyUnicode_KIND(substring);
10826     len1 = PyUnicode_GET_LENGTH(self);
10827     len2 = PyUnicode_GET_LENGTH(substring);
10828     if (kind1 < kind2 || len1 < len2) {
10829         out = PyList_New(1);
10830         if (out == NULL)
10831             return NULL;
10832         Py_INCREF(self);
10833         PyList_SET_ITEM(out, 0, self);
10834         return out;
10835     }
10836     buf1 = PyUnicode_DATA(self);
10837     buf2 = PyUnicode_DATA(substring);
10838     if (kind2 != kind1) {
10839         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10840         if (!buf2)
10841             return NULL;
10842     }
10843 
10844     switch (kind1) {
10845     case PyUnicode_1BYTE_KIND:
10846         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10847             out = asciilib_rsplit(
10848                 self,  buf1, len1, buf2, len2, maxcount);
10849         else
10850             out = ucs1lib_rsplit(
10851                 self,  buf1, len1, buf2, len2, maxcount);
10852         break;
10853     case PyUnicode_2BYTE_KIND:
10854         out = ucs2lib_rsplit(
10855             self,  buf1, len1, buf2, len2, maxcount);
10856         break;
10857     case PyUnicode_4BYTE_KIND:
10858         out = ucs4lib_rsplit(
10859             self,  buf1, len1, buf2, len2, maxcount);
10860         break;
10861     default:
10862         out = NULL;
10863     }
10864     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10865     if (kind2 != kind1)
10866         PyMem_Free((void *)buf2);
10867     return out;
10868 }
10869 
10870 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10871 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10872             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10873 {
10874     switch (kind) {
10875     case PyUnicode_1BYTE_KIND:
10876         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10877             return asciilib_find(buf1, len1, buf2, len2, offset);
10878         else
10879             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10880     case PyUnicode_2BYTE_KIND:
10881         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10882     case PyUnicode_4BYTE_KIND:
10883         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10884     }
10885     Py_UNREACHABLE();
10886 }
10887 
10888 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10889 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10890              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10891 {
10892     switch (kind) {
10893     case PyUnicode_1BYTE_KIND:
10894         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10895             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10896         else
10897             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10898     case PyUnicode_2BYTE_KIND:
10899         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10900     case PyUnicode_4BYTE_KIND:
10901         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10902     }
10903     Py_UNREACHABLE();
10904 }
10905 
10906 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10907 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10908                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10909 {
10910     int kind = PyUnicode_KIND(u);
10911     void *data = PyUnicode_DATA(u);
10912     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10913     if (kind == PyUnicode_1BYTE_KIND) {
10914         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10915                                       (Py_UCS1 *)data + len,
10916                                       u1, u2, maxcount);
10917     }
10918     else if (kind == PyUnicode_2BYTE_KIND) {
10919         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10920                                       (Py_UCS2 *)data + len,
10921                                       u1, u2, maxcount);
10922     }
10923     else {
10924         assert(kind == PyUnicode_4BYTE_KIND);
10925         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10926                                       (Py_UCS4 *)data + len,
10927                                       u1, u2, maxcount);
10928     }
10929 }
10930 
10931 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10932 replace(PyObject *self, PyObject *str1,
10933         PyObject *str2, Py_ssize_t maxcount)
10934 {
10935     PyObject *u;
10936     const char *sbuf = PyUnicode_DATA(self);
10937     const void *buf1 = PyUnicode_DATA(str1);
10938     const void *buf2 = PyUnicode_DATA(str2);
10939     int srelease = 0, release1 = 0, release2 = 0;
10940     int skind = PyUnicode_KIND(self);
10941     int kind1 = PyUnicode_KIND(str1);
10942     int kind2 = PyUnicode_KIND(str2);
10943     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10944     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10945     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10946     int mayshrink;
10947     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10948 
10949     if (slen < len1)
10950         goto nothing;
10951 
10952     if (maxcount < 0)
10953         maxcount = PY_SSIZE_T_MAX;
10954     else if (maxcount == 0)
10955         goto nothing;
10956 
10957     if (str1 == str2)
10958         goto nothing;
10959 
10960     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10961     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10962     if (maxchar < maxchar_str1)
10963         /* substring too wide to be present */
10964         goto nothing;
10965     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10966     /* Replacing str1 with str2 may cause a maxchar reduction in the
10967        result string. */
10968     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10969     maxchar = Py_MAX(maxchar, maxchar_str2);
10970 
10971     if (len1 == len2) {
10972         /* same length */
10973         if (len1 == 0)
10974             goto nothing;
10975         if (len1 == 1) {
10976             /* replace characters */
10977             Py_UCS4 u1, u2;
10978             Py_ssize_t pos;
10979 
10980             u1 = PyUnicode_READ(kind1, buf1, 0);
10981             pos = findchar(sbuf, skind, slen, u1, 1);
10982             if (pos < 0)
10983                 goto nothing;
10984             u2 = PyUnicode_READ(kind2, buf2, 0);
10985             u = PyUnicode_New(slen, maxchar);
10986             if (!u)
10987                 goto error;
10988 
10989             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10990             replace_1char_inplace(u, pos, u1, u2, maxcount);
10991         }
10992         else {
10993             int rkind = skind;
10994             char *res;
10995             Py_ssize_t i;
10996 
10997             if (kind1 < rkind) {
10998                 /* widen substring */
10999                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11000                 if (!buf1) goto error;
11001                 release1 = 1;
11002             }
11003             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
11004             if (i < 0)
11005                 goto nothing;
11006             if (rkind > kind2) {
11007                 /* widen replacement */
11008                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
11009                 if (!buf2) goto error;
11010                 release2 = 1;
11011             }
11012             else if (rkind < kind2) {
11013                 /* widen self and buf1 */
11014                 rkind = kind2;
11015                 if (release1) {
11016                     assert(buf1 != PyUnicode_DATA(str1));
11017                     PyMem_Free((void *)buf1);
11018                     buf1 = PyUnicode_DATA(str1);
11019                     release1 = 0;
11020                 }
11021                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
11022                 if (!sbuf) goto error;
11023                 srelease = 1;
11024                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11025                 if (!buf1) goto error;
11026                 release1 = 1;
11027             }
11028             u = PyUnicode_New(slen, maxchar);
11029             if (!u)
11030                 goto error;
11031             assert(PyUnicode_KIND(u) == rkind);
11032             res = PyUnicode_DATA(u);
11033 
11034             memcpy(res, sbuf, rkind * slen);
11035             /* change everything in-place, starting with this one */
11036             memcpy(res + rkind * i,
11037                    buf2,
11038                    rkind * len2);
11039             i += len1;
11040 
11041             while ( --maxcount > 0) {
11042                 i = anylib_find(rkind, self,
11043                                 sbuf+rkind*i, slen-i,
11044                                 str1, buf1, len1, i);
11045                 if (i == -1)
11046                     break;
11047                 memcpy(res + rkind * i,
11048                        buf2,
11049                        rkind * len2);
11050                 i += len1;
11051             }
11052         }
11053     }
11054     else {
11055         Py_ssize_t n, i, j, ires;
11056         Py_ssize_t new_size;
11057         int rkind = skind;
11058         char *res;
11059 
11060         if (kind1 < rkind) {
11061             /* widen substring */
11062             buf1 = unicode_askind(kind1, buf1, len1, rkind);
11063             if (!buf1) goto error;
11064             release1 = 1;
11065         }
11066         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
11067         if (n == 0)
11068             goto nothing;
11069         if (kind2 < rkind) {
11070             /* widen replacement */
11071             buf2 = unicode_askind(kind2, buf2, len2, rkind);
11072             if (!buf2) goto error;
11073             release2 = 1;
11074         }
11075         else if (kind2 > rkind) {
11076             /* widen self and buf1 */
11077             rkind = kind2;
11078             sbuf = unicode_askind(skind, sbuf, slen, rkind);
11079             if (!sbuf) goto error;
11080             srelease = 1;
11081             if (release1) {
11082                 assert(buf1 != PyUnicode_DATA(str1));
11083                 PyMem_Free((void *)buf1);
11084                 buf1 = PyUnicode_DATA(str1);
11085                 release1 = 0;
11086             }
11087             buf1 = unicode_askind(kind1, buf1, len1, rkind);
11088             if (!buf1) goto error;
11089             release1 = 1;
11090         }
11091         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11092            PyUnicode_GET_LENGTH(str1)); */
11093         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
11094                 PyErr_SetString(PyExc_OverflowError,
11095                                 "replace string is too long");
11096                 goto error;
11097         }
11098         new_size = slen + n * (len2 - len1);
11099         if (new_size == 0) {
11100             u = unicode_new_empty();
11101             goto done;
11102         }
11103         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
11104             PyErr_SetString(PyExc_OverflowError,
11105                             "replace string is too long");
11106             goto error;
11107         }
11108         u = PyUnicode_New(new_size, maxchar);
11109         if (!u)
11110             goto error;
11111         assert(PyUnicode_KIND(u) == rkind);
11112         res = PyUnicode_DATA(u);
11113         ires = i = 0;
11114         if (len1 > 0) {
11115             while (n-- > 0) {
11116                 /* look for next match */
11117                 j = anylib_find(rkind, self,
11118                                 sbuf + rkind * i, slen-i,
11119                                 str1, buf1, len1, i);
11120                 if (j == -1)
11121                     break;
11122                 else if (j > i) {
11123                     /* copy unchanged part [i:j] */
11124                     memcpy(res + rkind * ires,
11125                            sbuf + rkind * i,
11126                            rkind * (j-i));
11127                     ires += j - i;
11128                 }
11129                 /* copy substitution string */
11130                 if (len2 > 0) {
11131                     memcpy(res + rkind * ires,
11132                            buf2,
11133                            rkind * len2);
11134                     ires += len2;
11135                 }
11136                 i = j + len1;
11137             }
11138             if (i < slen)
11139                 /* copy tail [i:] */
11140                 memcpy(res + rkind * ires,
11141                        sbuf + rkind * i,
11142                        rkind * (slen-i));
11143         }
11144         else {
11145             /* interleave */
11146             while (n > 0) {
11147                 memcpy(res + rkind * ires,
11148                        buf2,
11149                        rkind * len2);
11150                 ires += len2;
11151                 if (--n <= 0)
11152                     break;
11153                 memcpy(res + rkind * ires,
11154                        sbuf + rkind * i,
11155                        rkind);
11156                 ires++;
11157                 i++;
11158             }
11159             memcpy(res + rkind * ires,
11160                    sbuf + rkind * i,
11161                    rkind * (slen-i));
11162         }
11163     }
11164 
11165     if (mayshrink) {
11166         unicode_adjust_maxchar(&u);
11167         if (u == NULL)
11168             goto error;
11169     }
11170 
11171   done:
11172     assert(srelease == (sbuf != PyUnicode_DATA(self)));
11173     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11174     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11175     if (srelease)
11176         PyMem_Free((void *)sbuf);
11177     if (release1)
11178         PyMem_Free((void *)buf1);
11179     if (release2)
11180         PyMem_Free((void *)buf2);
11181     assert(_PyUnicode_CheckConsistency(u, 1));
11182     return u;
11183 
11184   nothing:
11185     /* nothing to replace; return original string (when possible) */
11186     assert(srelease == (sbuf != PyUnicode_DATA(self)));
11187     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11188     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11189     if (srelease)
11190         PyMem_Free((void *)sbuf);
11191     if (release1)
11192         PyMem_Free((void *)buf1);
11193     if (release2)
11194         PyMem_Free((void *)buf2);
11195     return unicode_result_unchanged(self);
11196 
11197   error:
11198     assert(srelease == (sbuf != PyUnicode_DATA(self)));
11199     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11200     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11201     if (srelease)
11202         PyMem_Free((void *)sbuf);
11203     if (release1)
11204         PyMem_Free((void *)buf1);
11205     if (release2)
11206         PyMem_Free((void *)buf2);
11207     return NULL;
11208 }
11209 
11210 /* --- Unicode Object Methods --------------------------------------------- */
11211 
11212 /*[clinic input]
11213 str.title as unicode_title
11214 
11215 Return a version of the string where each word is titlecased.
11216 
11217 More specifically, words start with uppercased characters and all remaining
11218 cased characters have lower case.
11219 [clinic start generated code]*/
11220 
11221 static PyObject *
unicode_title_impl(PyObject * self)11222 unicode_title_impl(PyObject *self)
11223 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
11224 {
11225     if (PyUnicode_READY(self) == -1)
11226         return NULL;
11227     return case_operation(self, do_title);
11228 }
11229 
11230 /*[clinic input]
11231 str.capitalize as unicode_capitalize
11232 
11233 Return a capitalized version of the string.
11234 
11235 More specifically, make the first character have upper case and the rest lower
11236 case.
11237 [clinic start generated code]*/
11238 
11239 static PyObject *
unicode_capitalize_impl(PyObject * self)11240 unicode_capitalize_impl(PyObject *self)
11241 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
11242 {
11243     if (PyUnicode_READY(self) == -1)
11244         return NULL;
11245     if (PyUnicode_GET_LENGTH(self) == 0)
11246         return unicode_result_unchanged(self);
11247     return case_operation(self, do_capitalize);
11248 }
11249 
11250 /*[clinic input]
11251 str.casefold as unicode_casefold
11252 
11253 Return a version of the string suitable for caseless comparisons.
11254 [clinic start generated code]*/
11255 
11256 static PyObject *
unicode_casefold_impl(PyObject * self)11257 unicode_casefold_impl(PyObject *self)
11258 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
11259 {
11260     if (PyUnicode_READY(self) == -1)
11261         return NULL;
11262     if (PyUnicode_IS_ASCII(self))
11263         return ascii_upper_or_lower(self, 1);
11264     return case_operation(self, do_casefold);
11265 }
11266 
11267 
11268 /* Argument converter. Accepts a single Unicode character. */
11269 
11270 static int
convert_uc(PyObject * obj,void * addr)11271 convert_uc(PyObject *obj, void *addr)
11272 {
11273     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11274 
11275     if (!PyUnicode_Check(obj)) {
11276         PyErr_Format(PyExc_TypeError,
11277                      "The fill character must be a unicode character, "
11278                      "not %.100s", Py_TYPE(obj)->tp_name);
11279         return 0;
11280     }
11281     if (PyUnicode_READY(obj) < 0)
11282         return 0;
11283     if (PyUnicode_GET_LENGTH(obj) != 1) {
11284         PyErr_SetString(PyExc_TypeError,
11285                         "The fill character must be exactly one character long");
11286         return 0;
11287     }
11288     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11289     return 1;
11290 }
11291 
11292 /*[clinic input]
11293 str.center as unicode_center
11294 
11295     width: Py_ssize_t
11296     fillchar: Py_UCS4 = ' '
11297     /
11298 
11299 Return a centered string of length width.
11300 
11301 Padding is done using the specified fill character (default is a space).
11302 [clinic start generated code]*/
11303 
11304 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11305 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11306 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11307 {
11308     Py_ssize_t marg, left;
11309 
11310     if (PyUnicode_READY(self) == -1)
11311         return NULL;
11312 
11313     if (PyUnicode_GET_LENGTH(self) >= width)
11314         return unicode_result_unchanged(self);
11315 
11316     marg = width - PyUnicode_GET_LENGTH(self);
11317     left = marg / 2 + (marg & width & 1);
11318 
11319     return pad(self, left, marg - left, fillchar);
11320 }
11321 
11322 /* This function assumes that str1 and str2 are readied by the caller. */
11323 
11324 static int
unicode_compare(PyObject * str1,PyObject * str2)11325 unicode_compare(PyObject *str1, PyObject *str2)
11326 {
11327 #define COMPARE(TYPE1, TYPE2) \
11328     do { \
11329         TYPE1* p1 = (TYPE1 *)data1; \
11330         TYPE2* p2 = (TYPE2 *)data2; \
11331         TYPE1* end = p1 + len; \
11332         Py_UCS4 c1, c2; \
11333         for (; p1 != end; p1++, p2++) { \
11334             c1 = *p1; \
11335             c2 = *p2; \
11336             if (c1 != c2) \
11337                 return (c1 < c2) ? -1 : 1; \
11338         } \
11339     } \
11340     while (0)
11341 
11342     int kind1, kind2;
11343     const void *data1, *data2;
11344     Py_ssize_t len1, len2, len;
11345 
11346     kind1 = PyUnicode_KIND(str1);
11347     kind2 = PyUnicode_KIND(str2);
11348     data1 = PyUnicode_DATA(str1);
11349     data2 = PyUnicode_DATA(str2);
11350     len1 = PyUnicode_GET_LENGTH(str1);
11351     len2 = PyUnicode_GET_LENGTH(str2);
11352     len = Py_MIN(len1, len2);
11353 
11354     switch(kind1) {
11355     case PyUnicode_1BYTE_KIND:
11356     {
11357         switch(kind2) {
11358         case PyUnicode_1BYTE_KIND:
11359         {
11360             int cmp = memcmp(data1, data2, len);
11361             /* normalize result of memcmp() into the range [-1; 1] */
11362             if (cmp < 0)
11363                 return -1;
11364             if (cmp > 0)
11365                 return 1;
11366             break;
11367         }
11368         case PyUnicode_2BYTE_KIND:
11369             COMPARE(Py_UCS1, Py_UCS2);
11370             break;
11371         case PyUnicode_4BYTE_KIND:
11372             COMPARE(Py_UCS1, Py_UCS4);
11373             break;
11374         default:
11375             Py_UNREACHABLE();
11376         }
11377         break;
11378     }
11379     case PyUnicode_2BYTE_KIND:
11380     {
11381         switch(kind2) {
11382         case PyUnicode_1BYTE_KIND:
11383             COMPARE(Py_UCS2, Py_UCS1);
11384             break;
11385         case PyUnicode_2BYTE_KIND:
11386         {
11387             COMPARE(Py_UCS2, Py_UCS2);
11388             break;
11389         }
11390         case PyUnicode_4BYTE_KIND:
11391             COMPARE(Py_UCS2, Py_UCS4);
11392             break;
11393         default:
11394             Py_UNREACHABLE();
11395         }
11396         break;
11397     }
11398     case PyUnicode_4BYTE_KIND:
11399     {
11400         switch(kind2) {
11401         case PyUnicode_1BYTE_KIND:
11402             COMPARE(Py_UCS4, Py_UCS1);
11403             break;
11404         case PyUnicode_2BYTE_KIND:
11405             COMPARE(Py_UCS4, Py_UCS2);
11406             break;
11407         case PyUnicode_4BYTE_KIND:
11408         {
11409 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11410             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11411             /* normalize result of wmemcmp() into the range [-1; 1] */
11412             if (cmp < 0)
11413                 return -1;
11414             if (cmp > 0)
11415                 return 1;
11416 #else
11417             COMPARE(Py_UCS4, Py_UCS4);
11418 #endif
11419             break;
11420         }
11421         default:
11422             Py_UNREACHABLE();
11423         }
11424         break;
11425     }
11426     default:
11427         Py_UNREACHABLE();
11428     }
11429 
11430     if (len1 == len2)
11431         return 0;
11432     if (len1 < len2)
11433         return -1;
11434     else
11435         return 1;
11436 
11437 #undef COMPARE
11438 }
11439 
11440 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11441 unicode_compare_eq(PyObject *str1, PyObject *str2)
11442 {
11443     int kind;
11444     const void *data1, *data2;
11445     Py_ssize_t len;
11446     int cmp;
11447 
11448     len = PyUnicode_GET_LENGTH(str1);
11449     if (PyUnicode_GET_LENGTH(str2) != len)
11450         return 0;
11451     kind = PyUnicode_KIND(str1);
11452     if (PyUnicode_KIND(str2) != kind)
11453         return 0;
11454     data1 = PyUnicode_DATA(str1);
11455     data2 = PyUnicode_DATA(str2);
11456 
11457     cmp = memcmp(data1, data2, len * kind);
11458     return (cmp == 0);
11459 }
11460 
11461 
11462 int
PyUnicode_Compare(PyObject * left,PyObject * right)11463 PyUnicode_Compare(PyObject *left, PyObject *right)
11464 {
11465     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11466         if (PyUnicode_READY(left) == -1 ||
11467             PyUnicode_READY(right) == -1)
11468             return -1;
11469 
11470         /* a string is equal to itself */
11471         if (left == right)
11472             return 0;
11473 
11474         return unicode_compare(left, right);
11475     }
11476     PyErr_Format(PyExc_TypeError,
11477                  "Can't compare %.100s and %.100s",
11478                  Py_TYPE(left)->tp_name,
11479                  Py_TYPE(right)->tp_name);
11480     return -1;
11481 }
11482 
11483 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11484 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11485 {
11486     Py_ssize_t i;
11487     int kind;
11488     Py_UCS4 chr;
11489     const unsigned char *ustr = (const unsigned char *)str;
11490 
11491     assert(_PyUnicode_CHECK(uni));
11492     if (!PyUnicode_IS_READY(uni)) {
11493         const wchar_t *ws = _PyUnicode_WSTR(uni);
11494         /* Compare Unicode string and source character set string */
11495         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11496             if (chr != ustr[i])
11497                 return (chr < ustr[i]) ? -1 : 1;
11498         }
11499         /* This check keeps Python strings that end in '\0' from comparing equal
11500          to C strings identical up to that point. */
11501         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11502             return 1; /* uni is longer */
11503         if (ustr[i])
11504             return -1; /* str is longer */
11505         return 0;
11506     }
11507     kind = PyUnicode_KIND(uni);
11508     if (kind == PyUnicode_1BYTE_KIND) {
11509         const void *data = PyUnicode_1BYTE_DATA(uni);
11510         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11511         size_t len, len2 = strlen(str);
11512         int cmp;
11513 
11514         len = Py_MIN(len1, len2);
11515         cmp = memcmp(data, str, len);
11516         if (cmp != 0) {
11517             if (cmp < 0)
11518                 return -1;
11519             else
11520                 return 1;
11521         }
11522         if (len1 > len2)
11523             return 1; /* uni is longer */
11524         if (len1 < len2)
11525             return -1; /* str is longer */
11526         return 0;
11527     }
11528     else {
11529         const void *data = PyUnicode_DATA(uni);
11530         /* Compare Unicode string and source character set string */
11531         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11532             if (chr != (unsigned char)str[i])
11533                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11534         /* This check keeps Python strings that end in '\0' from comparing equal
11535          to C strings identical up to that point. */
11536         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11537             return 1; /* uni is longer */
11538         if (str[i])
11539             return -1; /* str is longer */
11540         return 0;
11541     }
11542 }
11543 
11544 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11545 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11546 {
11547     size_t i, len;
11548     const wchar_t *p;
11549     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11550     if (strlen(str) != len)
11551         return 0;
11552     p = _PyUnicode_WSTR(unicode);
11553     assert(p);
11554     for (i = 0; i < len; i++) {
11555         unsigned char c = (unsigned char)str[i];
11556         if (c >= 128 || p[i] != (wchar_t)c)
11557             return 0;
11558     }
11559     return 1;
11560 }
11561 
11562 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11563 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11564 {
11565     size_t len;
11566     assert(_PyUnicode_CHECK(unicode));
11567     assert(str);
11568 #ifndef NDEBUG
11569     for (const char *p = str; *p; p++) {
11570         assert((unsigned char)*p < 128);
11571     }
11572 #endif
11573     if (PyUnicode_READY(unicode) == -1) {
11574         /* Memory error or bad data */
11575         PyErr_Clear();
11576         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11577     }
11578     if (!PyUnicode_IS_ASCII(unicode))
11579         return 0;
11580     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11581     return strlen(str) == len &&
11582            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11583 }
11584 
11585 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11586 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11587 {
11588     PyObject *right_uni;
11589 
11590     assert(_PyUnicode_CHECK(left));
11591     assert(right->string);
11592 #ifndef NDEBUG
11593     for (const char *p = right->string; *p; p++) {
11594         assert((unsigned char)*p < 128);
11595     }
11596 #endif
11597 
11598     if (PyUnicode_READY(left) == -1) {
11599         /* memory error or bad data */
11600         PyErr_Clear();
11601         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11602     }
11603 
11604     if (!PyUnicode_IS_ASCII(left))
11605         return 0;
11606 
11607     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11608     if (right_uni == NULL) {
11609         /* memory error or bad data */
11610         PyErr_Clear();
11611         return _PyUnicode_EqualToASCIIString(left, right->string);
11612     }
11613 
11614     if (left == right_uni)
11615         return 1;
11616 
11617     if (PyUnicode_CHECK_INTERNED(left))
11618         return 0;
11619 
11620 #ifdef INTERNED_STRINGS
11621     assert(_PyUnicode_HASH(right_uni) != -1);
11622     Py_hash_t hash = _PyUnicode_HASH(left);
11623     if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11624         return 0;
11625     }
11626 #endif
11627 
11628     return unicode_compare_eq(left, right_uni);
11629 }
11630 
11631 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11632 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11633 {
11634     int result;
11635 
11636     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11637         Py_RETURN_NOTIMPLEMENTED;
11638 
11639     if (PyUnicode_READY(left) == -1 ||
11640         PyUnicode_READY(right) == -1)
11641         return NULL;
11642 
11643     if (left == right) {
11644         switch (op) {
11645         case Py_EQ:
11646         case Py_LE:
11647         case Py_GE:
11648             /* a string is equal to itself */
11649             Py_RETURN_TRUE;
11650         case Py_NE:
11651         case Py_LT:
11652         case Py_GT:
11653             Py_RETURN_FALSE;
11654         default:
11655             PyErr_BadArgument();
11656             return NULL;
11657         }
11658     }
11659     else if (op == Py_EQ || op == Py_NE) {
11660         result = unicode_compare_eq(left, right);
11661         result ^= (op == Py_NE);
11662         return PyBool_FromLong(result);
11663     }
11664     else {
11665         result = unicode_compare(left, right);
11666         Py_RETURN_RICHCOMPARE(result, 0, op);
11667     }
11668 }
11669 
11670 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11671 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11672 {
11673     return unicode_eq(aa, bb);
11674 }
11675 
11676 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11677 PyUnicode_Contains(PyObject *str, PyObject *substr)
11678 {
11679     int kind1, kind2;
11680     const void *buf1, *buf2;
11681     Py_ssize_t len1, len2;
11682     int result;
11683 
11684     if (!PyUnicode_Check(substr)) {
11685         PyErr_Format(PyExc_TypeError,
11686                      "'in <string>' requires string as left operand, not %.100s",
11687                      Py_TYPE(substr)->tp_name);
11688         return -1;
11689     }
11690     if (PyUnicode_READY(substr) == -1)
11691         return -1;
11692     if (ensure_unicode(str) < 0)
11693         return -1;
11694 
11695     kind1 = PyUnicode_KIND(str);
11696     kind2 = PyUnicode_KIND(substr);
11697     if (kind1 < kind2)
11698         return 0;
11699     len1 = PyUnicode_GET_LENGTH(str);
11700     len2 = PyUnicode_GET_LENGTH(substr);
11701     if (len1 < len2)
11702         return 0;
11703     buf1 = PyUnicode_DATA(str);
11704     buf2 = PyUnicode_DATA(substr);
11705     if (len2 == 1) {
11706         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11707         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11708         return result;
11709     }
11710     if (kind2 != kind1) {
11711         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11712         if (!buf2)
11713             return -1;
11714     }
11715 
11716     switch (kind1) {
11717     case PyUnicode_1BYTE_KIND:
11718         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11719         break;
11720     case PyUnicode_2BYTE_KIND:
11721         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11722         break;
11723     case PyUnicode_4BYTE_KIND:
11724         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11725         break;
11726     default:
11727         Py_UNREACHABLE();
11728     }
11729 
11730     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11731     if (kind2 != kind1)
11732         PyMem_Free((void *)buf2);
11733 
11734     return result;
11735 }
11736 
11737 /* Concat to string or Unicode object giving a new Unicode object. */
11738 
11739 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11740 PyUnicode_Concat(PyObject *left, PyObject *right)
11741 {
11742     PyObject *result;
11743     Py_UCS4 maxchar, maxchar2;
11744     Py_ssize_t left_len, right_len, new_len;
11745 
11746     if (ensure_unicode(left) < 0)
11747         return NULL;
11748 
11749     if (!PyUnicode_Check(right)) {
11750         PyErr_Format(PyExc_TypeError,
11751                      "can only concatenate str (not \"%.200s\") to str",
11752                      Py_TYPE(right)->tp_name);
11753         return NULL;
11754     }
11755     if (PyUnicode_READY(right) < 0)
11756         return NULL;
11757 
11758     /* Shortcuts */
11759     PyObject *empty = unicode_get_empty();  // Borrowed reference
11760     if (left == empty) {
11761         return PyUnicode_FromObject(right);
11762     }
11763     if (right == empty) {
11764         return PyUnicode_FromObject(left);
11765     }
11766 
11767     left_len = PyUnicode_GET_LENGTH(left);
11768     right_len = PyUnicode_GET_LENGTH(right);
11769     if (left_len > PY_SSIZE_T_MAX - right_len) {
11770         PyErr_SetString(PyExc_OverflowError,
11771                         "strings are too large to concat");
11772         return NULL;
11773     }
11774     new_len = left_len + right_len;
11775 
11776     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11777     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11778     maxchar = Py_MAX(maxchar, maxchar2);
11779 
11780     /* Concat the two Unicode strings */
11781     result = PyUnicode_New(new_len, maxchar);
11782     if (result == NULL)
11783         return NULL;
11784     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11785     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11786     assert(_PyUnicode_CheckConsistency(result, 1));
11787     return result;
11788 }
11789 
11790 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11791 PyUnicode_Append(PyObject **p_left, PyObject *right)
11792 {
11793     PyObject *left, *res;
11794     Py_UCS4 maxchar, maxchar2;
11795     Py_ssize_t left_len, right_len, new_len;
11796 
11797     if (p_left == NULL) {
11798         if (!PyErr_Occurred())
11799             PyErr_BadInternalCall();
11800         return;
11801     }
11802     left = *p_left;
11803     if (right == NULL || left == NULL
11804         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11805         if (!PyErr_Occurred())
11806             PyErr_BadInternalCall();
11807         goto error;
11808     }
11809 
11810     if (PyUnicode_READY(left) == -1)
11811         goto error;
11812     if (PyUnicode_READY(right) == -1)
11813         goto error;
11814 
11815     /* Shortcuts */
11816     PyObject *empty = unicode_get_empty();  // Borrowed reference
11817     if (left == empty) {
11818         Py_DECREF(left);
11819         Py_INCREF(right);
11820         *p_left = right;
11821         return;
11822     }
11823     if (right == empty) {
11824         return;
11825     }
11826 
11827     left_len = PyUnicode_GET_LENGTH(left);
11828     right_len = PyUnicode_GET_LENGTH(right);
11829     if (left_len > PY_SSIZE_T_MAX - right_len) {
11830         PyErr_SetString(PyExc_OverflowError,
11831                         "strings are too large to concat");
11832         goto error;
11833     }
11834     new_len = left_len + right_len;
11835 
11836     if (unicode_modifiable(left)
11837         && PyUnicode_CheckExact(right)
11838         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11839         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11840            to change the structure size, but characters are stored just after
11841            the structure, and so it requires to move all characters which is
11842            not so different than duplicating the string. */
11843         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11844     {
11845         /* append inplace */
11846         if (unicode_resize(p_left, new_len) != 0)
11847             goto error;
11848 
11849         /* copy 'right' into the newly allocated area of 'left' */
11850         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11851     }
11852     else {
11853         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11854         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11855         maxchar = Py_MAX(maxchar, maxchar2);
11856 
11857         /* Concat the two Unicode strings */
11858         res = PyUnicode_New(new_len, maxchar);
11859         if (res == NULL)
11860             goto error;
11861         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11862         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11863         Py_DECREF(left);
11864         *p_left = res;
11865     }
11866     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11867     return;
11868 
11869 error:
11870     Py_CLEAR(*p_left);
11871 }
11872 
11873 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11874 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11875 {
11876     PyUnicode_Append(pleft, right);
11877     Py_XDECREF(right);
11878 }
11879 
11880 /*
11881 Wraps stringlib_parse_args_finds() and additionally ensures that the
11882 first argument is a unicode object.
11883 */
11884 
11885 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11886 parse_args_finds_unicode(const char * function_name, PyObject *args,
11887                          PyObject **substring,
11888                          Py_ssize_t *start, Py_ssize_t *end)
11889 {
11890     if(stringlib_parse_args_finds(function_name, args, substring,
11891                                   start, end)) {
11892         if (ensure_unicode(*substring) < 0)
11893             return 0;
11894         return 1;
11895     }
11896     return 0;
11897 }
11898 
11899 PyDoc_STRVAR(count__doc__,
11900              "S.count(sub[, start[, end]]) -> int\n\
11901 \n\
11902 Return the number of non-overlapping occurrences of substring sub in\n\
11903 string S[start:end].  Optional arguments start and end are\n\
11904 interpreted as in slice notation.");
11905 
11906 static PyObject *
unicode_count(PyObject * self,PyObject * args)11907 unicode_count(PyObject *self, PyObject *args)
11908 {
11909     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11910     Py_ssize_t start = 0;
11911     Py_ssize_t end = PY_SSIZE_T_MAX;
11912     PyObject *result;
11913     int kind1, kind2;
11914     const void *buf1, *buf2;
11915     Py_ssize_t len1, len2, iresult;
11916 
11917     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11918         return NULL;
11919 
11920     kind1 = PyUnicode_KIND(self);
11921     kind2 = PyUnicode_KIND(substring);
11922     if (kind1 < kind2)
11923         return PyLong_FromLong(0);
11924 
11925     len1 = PyUnicode_GET_LENGTH(self);
11926     len2 = PyUnicode_GET_LENGTH(substring);
11927     ADJUST_INDICES(start, end, len1);
11928     if (end - start < len2)
11929         return PyLong_FromLong(0);
11930 
11931     buf1 = PyUnicode_DATA(self);
11932     buf2 = PyUnicode_DATA(substring);
11933     if (kind2 != kind1) {
11934         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11935         if (!buf2)
11936             return NULL;
11937     }
11938     switch (kind1) {
11939     case PyUnicode_1BYTE_KIND:
11940         iresult = ucs1lib_count(
11941             ((const Py_UCS1*)buf1) + start, end - start,
11942             buf2, len2, PY_SSIZE_T_MAX
11943             );
11944         break;
11945     case PyUnicode_2BYTE_KIND:
11946         iresult = ucs2lib_count(
11947             ((const Py_UCS2*)buf1) + start, end - start,
11948             buf2, len2, PY_SSIZE_T_MAX
11949             );
11950         break;
11951     case PyUnicode_4BYTE_KIND:
11952         iresult = ucs4lib_count(
11953             ((const Py_UCS4*)buf1) + start, end - start,
11954             buf2, len2, PY_SSIZE_T_MAX
11955             );
11956         break;
11957     default:
11958         Py_UNREACHABLE();
11959     }
11960 
11961     result = PyLong_FromSsize_t(iresult);
11962 
11963     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11964     if (kind2 != kind1)
11965         PyMem_Free((void *)buf2);
11966 
11967     return result;
11968 }
11969 
11970 /*[clinic input]
11971 str.encode as unicode_encode
11972 
11973     encoding: str(c_default="NULL") = 'utf-8'
11974         The encoding in which to encode the string.
11975     errors: str(c_default="NULL") = 'strict'
11976         The error handling scheme to use for encoding errors.
11977         The default is 'strict' meaning that encoding errors raise a
11978         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11979         'xmlcharrefreplace' as well as any other name registered with
11980         codecs.register_error that can handle UnicodeEncodeErrors.
11981 
11982 Encode the string using the codec registered for encoding.
11983 [clinic start generated code]*/
11984 
11985 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11986 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11987 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11988 {
11989     return PyUnicode_AsEncodedString(self, encoding, errors);
11990 }
11991 
11992 /*[clinic input]
11993 str.expandtabs as unicode_expandtabs
11994 
11995     tabsize: int = 8
11996 
11997 Return a copy where all tab characters are expanded using spaces.
11998 
11999 If tabsize is not given, a tab size of 8 characters is assumed.
12000 [clinic start generated code]*/
12001 
12002 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)12003 unicode_expandtabs_impl(PyObject *self, int tabsize)
12004 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
12005 {
12006     Py_ssize_t i, j, line_pos, src_len, incr;
12007     Py_UCS4 ch;
12008     PyObject *u;
12009     const void *src_data;
12010     void *dest_data;
12011     int kind;
12012     int found;
12013 
12014     if (PyUnicode_READY(self) == -1)
12015         return NULL;
12016 
12017     /* First pass: determine size of output string */
12018     src_len = PyUnicode_GET_LENGTH(self);
12019     i = j = line_pos = 0;
12020     kind = PyUnicode_KIND(self);
12021     src_data = PyUnicode_DATA(self);
12022     found = 0;
12023     for (; i < src_len; i++) {
12024         ch = PyUnicode_READ(kind, src_data, i);
12025         if (ch == '\t') {
12026             found = 1;
12027             if (tabsize > 0) {
12028                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
12029                 if (j > PY_SSIZE_T_MAX - incr)
12030                     goto overflow;
12031                 line_pos += incr;
12032                 j += incr;
12033             }
12034         }
12035         else {
12036             if (j > PY_SSIZE_T_MAX - 1)
12037                 goto overflow;
12038             line_pos++;
12039             j++;
12040             if (ch == '\n' || ch == '\r')
12041                 line_pos = 0;
12042         }
12043     }
12044     if (!found)
12045         return unicode_result_unchanged(self);
12046 
12047     /* Second pass: create output string and fill it */
12048     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
12049     if (!u)
12050         return NULL;
12051     dest_data = PyUnicode_DATA(u);
12052 
12053     i = j = line_pos = 0;
12054 
12055     for (; i < src_len; i++) {
12056         ch = PyUnicode_READ(kind, src_data, i);
12057         if (ch == '\t') {
12058             if (tabsize > 0) {
12059                 incr = tabsize - (line_pos % tabsize);
12060                 line_pos += incr;
12061                 unicode_fill(kind, dest_data, ' ', j, incr);
12062                 j += incr;
12063             }
12064         }
12065         else {
12066             line_pos++;
12067             PyUnicode_WRITE(kind, dest_data, j, ch);
12068             j++;
12069             if (ch == '\n' || ch == '\r')
12070                 line_pos = 0;
12071         }
12072     }
12073     assert (j == PyUnicode_GET_LENGTH(u));
12074     return unicode_result(u);
12075 
12076   overflow:
12077     PyErr_SetString(PyExc_OverflowError, "new string is too long");
12078     return NULL;
12079 }
12080 
12081 PyDoc_STRVAR(find__doc__,
12082              "S.find(sub[, start[, end]]) -> int\n\
12083 \n\
12084 Return the lowest index in S where substring sub is found,\n\
12085 such that sub is contained within S[start:end].  Optional\n\
12086 arguments start and end are interpreted as in slice notation.\n\
12087 \n\
12088 Return -1 on failure.");
12089 
12090 static PyObject *
unicode_find(PyObject * self,PyObject * args)12091 unicode_find(PyObject *self, PyObject *args)
12092 {
12093     /* initialize variables to prevent gcc warning */
12094     PyObject *substring = NULL;
12095     Py_ssize_t start = 0;
12096     Py_ssize_t end = 0;
12097     Py_ssize_t result;
12098 
12099     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
12100         return NULL;
12101 
12102     if (PyUnicode_READY(self) == -1)
12103         return NULL;
12104 
12105     result = any_find_slice(self, substring, start, end, 1);
12106 
12107     if (result == -2)
12108         return NULL;
12109 
12110     return PyLong_FromSsize_t(result);
12111 }
12112 
12113 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)12114 unicode_getitem(PyObject *self, Py_ssize_t index)
12115 {
12116     const void *data;
12117     enum PyUnicode_Kind kind;
12118     Py_UCS4 ch;
12119 
12120     if (!PyUnicode_Check(self)) {
12121         PyErr_BadArgument();
12122         return NULL;
12123     }
12124     if (PyUnicode_READY(self) == -1) {
12125         return NULL;
12126     }
12127     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12128         PyErr_SetString(PyExc_IndexError, "string index out of range");
12129         return NULL;
12130     }
12131     kind = PyUnicode_KIND(self);
12132     data = PyUnicode_DATA(self);
12133     ch = PyUnicode_READ(kind, data, index);
12134     return unicode_char(ch);
12135 }
12136 
12137 /* Believe it or not, this produces the same value for ASCII strings
12138    as bytes_hash(). */
12139 static Py_hash_t
unicode_hash(PyObject * self)12140 unicode_hash(PyObject *self)
12141 {
12142     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
12143 
12144 #ifdef Py_DEBUG
12145     assert(_Py_HashSecret_Initialized);
12146 #endif
12147     if (_PyUnicode_HASH(self) != -1)
12148         return _PyUnicode_HASH(self);
12149     if (PyUnicode_READY(self) == -1)
12150         return -1;
12151 
12152     x = _Py_HashBytes(PyUnicode_DATA(self),
12153                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
12154     _PyUnicode_HASH(self) = x;
12155     return x;
12156 }
12157 
12158 PyDoc_STRVAR(index__doc__,
12159              "S.index(sub[, start[, end]]) -> int\n\
12160 \n\
12161 Return the lowest index in S where substring sub is found,\n\
12162 such that sub is contained within S[start:end].  Optional\n\
12163 arguments start and end are interpreted as in slice notation.\n\
12164 \n\
12165 Raises ValueError when the substring is not found.");
12166 
12167 static PyObject *
unicode_index(PyObject * self,PyObject * args)12168 unicode_index(PyObject *self, PyObject *args)
12169 {
12170     /* initialize variables to prevent gcc warning */
12171     Py_ssize_t result;
12172     PyObject *substring = NULL;
12173     Py_ssize_t start = 0;
12174     Py_ssize_t end = 0;
12175 
12176     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
12177         return NULL;
12178 
12179     if (PyUnicode_READY(self) == -1)
12180         return NULL;
12181 
12182     result = any_find_slice(self, substring, start, end, 1);
12183 
12184     if (result == -2)
12185         return NULL;
12186 
12187     if (result < 0) {
12188         PyErr_SetString(PyExc_ValueError, "substring not found");
12189         return NULL;
12190     }
12191 
12192     return PyLong_FromSsize_t(result);
12193 }
12194 
12195 /*[clinic input]
12196 str.isascii as unicode_isascii
12197 
12198 Return True if all characters in the string are ASCII, False otherwise.
12199 
12200 ASCII characters have code points in the range U+0000-U+007F.
12201 Empty string is ASCII too.
12202 [clinic start generated code]*/
12203 
12204 static PyObject *
unicode_isascii_impl(PyObject * self)12205 unicode_isascii_impl(PyObject *self)
12206 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12207 {
12208     if (PyUnicode_READY(self) == -1) {
12209         return NULL;
12210     }
12211     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12212 }
12213 
12214 /*[clinic input]
12215 str.islower as unicode_islower
12216 
12217 Return True if the string is a lowercase string, False otherwise.
12218 
12219 A string is lowercase if all cased characters in the string are lowercase and
12220 there is at least one cased character in the string.
12221 [clinic start generated code]*/
12222 
12223 static PyObject *
unicode_islower_impl(PyObject * self)12224 unicode_islower_impl(PyObject *self)
12225 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
12226 {
12227     Py_ssize_t i, length;
12228     int kind;
12229     const void *data;
12230     int cased;
12231 
12232     if (PyUnicode_READY(self) == -1)
12233         return NULL;
12234     length = PyUnicode_GET_LENGTH(self);
12235     kind = PyUnicode_KIND(self);
12236     data = PyUnicode_DATA(self);
12237 
12238     /* Shortcut for single character strings */
12239     if (length == 1)
12240         return PyBool_FromLong(
12241             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
12242 
12243     /* Special case for empty strings */
12244     if (length == 0)
12245         Py_RETURN_FALSE;
12246 
12247     cased = 0;
12248     for (i = 0; i < length; i++) {
12249         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12250 
12251         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
12252             Py_RETURN_FALSE;
12253         else if (!cased && Py_UNICODE_ISLOWER(ch))
12254             cased = 1;
12255     }
12256     return PyBool_FromLong(cased);
12257 }
12258 
12259 /*[clinic input]
12260 str.isupper as unicode_isupper
12261 
12262 Return True if the string is an uppercase string, False otherwise.
12263 
12264 A string is uppercase if all cased characters in the string are uppercase and
12265 there is at least one cased character in the string.
12266 [clinic start generated code]*/
12267 
12268 static PyObject *
unicode_isupper_impl(PyObject * self)12269 unicode_isupper_impl(PyObject *self)
12270 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
12271 {
12272     Py_ssize_t i, length;
12273     int kind;
12274     const void *data;
12275     int cased;
12276 
12277     if (PyUnicode_READY(self) == -1)
12278         return NULL;
12279     length = PyUnicode_GET_LENGTH(self);
12280     kind = PyUnicode_KIND(self);
12281     data = PyUnicode_DATA(self);
12282 
12283     /* Shortcut for single character strings */
12284     if (length == 1)
12285         return PyBool_FromLong(
12286             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12287 
12288     /* Special case for empty strings */
12289     if (length == 0)
12290         Py_RETURN_FALSE;
12291 
12292     cased = 0;
12293     for (i = 0; i < length; i++) {
12294         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12295 
12296         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12297             Py_RETURN_FALSE;
12298         else if (!cased && Py_UNICODE_ISUPPER(ch))
12299             cased = 1;
12300     }
12301     return PyBool_FromLong(cased);
12302 }
12303 
12304 /*[clinic input]
12305 str.istitle as unicode_istitle
12306 
12307 Return True if the string is a title-cased string, False otherwise.
12308 
12309 In a title-cased string, upper- and title-case characters may only
12310 follow uncased characters and lowercase characters only cased ones.
12311 [clinic start generated code]*/
12312 
12313 static PyObject *
unicode_istitle_impl(PyObject * self)12314 unicode_istitle_impl(PyObject *self)
12315 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12316 {
12317     Py_ssize_t i, length;
12318     int kind;
12319     const void *data;
12320     int cased, previous_is_cased;
12321 
12322     if (PyUnicode_READY(self) == -1)
12323         return NULL;
12324     length = PyUnicode_GET_LENGTH(self);
12325     kind = PyUnicode_KIND(self);
12326     data = PyUnicode_DATA(self);
12327 
12328     /* Shortcut for single character strings */
12329     if (length == 1) {
12330         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12331         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12332                                (Py_UNICODE_ISUPPER(ch) != 0));
12333     }
12334 
12335     /* Special case for empty strings */
12336     if (length == 0)
12337         Py_RETURN_FALSE;
12338 
12339     cased = 0;
12340     previous_is_cased = 0;
12341     for (i = 0; i < length; i++) {
12342         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12343 
12344         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12345             if (previous_is_cased)
12346                 Py_RETURN_FALSE;
12347             previous_is_cased = 1;
12348             cased = 1;
12349         }
12350         else if (Py_UNICODE_ISLOWER(ch)) {
12351             if (!previous_is_cased)
12352                 Py_RETURN_FALSE;
12353             previous_is_cased = 1;
12354             cased = 1;
12355         }
12356         else
12357             previous_is_cased = 0;
12358     }
12359     return PyBool_FromLong(cased);
12360 }
12361 
12362 /*[clinic input]
12363 str.isspace as unicode_isspace
12364 
12365 Return True if the string is a whitespace string, False otherwise.
12366 
12367 A string is whitespace if all characters in the string are whitespace and there
12368 is at least one character in the string.
12369 [clinic start generated code]*/
12370 
12371 static PyObject *
unicode_isspace_impl(PyObject * self)12372 unicode_isspace_impl(PyObject *self)
12373 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12374 {
12375     Py_ssize_t i, length;
12376     int kind;
12377     const void *data;
12378 
12379     if (PyUnicode_READY(self) == -1)
12380         return NULL;
12381     length = PyUnicode_GET_LENGTH(self);
12382     kind = PyUnicode_KIND(self);
12383     data = PyUnicode_DATA(self);
12384 
12385     /* Shortcut for single character strings */
12386     if (length == 1)
12387         return PyBool_FromLong(
12388             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12389 
12390     /* Special case for empty strings */
12391     if (length == 0)
12392         Py_RETURN_FALSE;
12393 
12394     for (i = 0; i < length; i++) {
12395         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12396         if (!Py_UNICODE_ISSPACE(ch))
12397             Py_RETURN_FALSE;
12398     }
12399     Py_RETURN_TRUE;
12400 }
12401 
12402 /*[clinic input]
12403 str.isalpha as unicode_isalpha
12404 
12405 Return True if the string is an alphabetic string, False otherwise.
12406 
12407 A string is alphabetic if all characters in the string are alphabetic and there
12408 is at least one character in the string.
12409 [clinic start generated code]*/
12410 
12411 static PyObject *
unicode_isalpha_impl(PyObject * self)12412 unicode_isalpha_impl(PyObject *self)
12413 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12414 {
12415     Py_ssize_t i, length;
12416     int kind;
12417     const void *data;
12418 
12419     if (PyUnicode_READY(self) == -1)
12420         return NULL;
12421     length = PyUnicode_GET_LENGTH(self);
12422     kind = PyUnicode_KIND(self);
12423     data = PyUnicode_DATA(self);
12424 
12425     /* Shortcut for single character strings */
12426     if (length == 1)
12427         return PyBool_FromLong(
12428             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12429 
12430     /* Special case for empty strings */
12431     if (length == 0)
12432         Py_RETURN_FALSE;
12433 
12434     for (i = 0; i < length; i++) {
12435         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12436             Py_RETURN_FALSE;
12437     }
12438     Py_RETURN_TRUE;
12439 }
12440 
12441 /*[clinic input]
12442 str.isalnum as unicode_isalnum
12443 
12444 Return True if the string is an alpha-numeric string, False otherwise.
12445 
12446 A string is alpha-numeric if all characters in the string are alpha-numeric and
12447 there is at least one character in the string.
12448 [clinic start generated code]*/
12449 
12450 static PyObject *
unicode_isalnum_impl(PyObject * self)12451 unicode_isalnum_impl(PyObject *self)
12452 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12453 {
12454     int kind;
12455     const void *data;
12456     Py_ssize_t len, i;
12457 
12458     if (PyUnicode_READY(self) == -1)
12459         return NULL;
12460 
12461     kind = PyUnicode_KIND(self);
12462     data = PyUnicode_DATA(self);
12463     len = PyUnicode_GET_LENGTH(self);
12464 
12465     /* Shortcut for single character strings */
12466     if (len == 1) {
12467         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12468         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12469     }
12470 
12471     /* Special case for empty strings */
12472     if (len == 0)
12473         Py_RETURN_FALSE;
12474 
12475     for (i = 0; i < len; i++) {
12476         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12477         if (!Py_UNICODE_ISALNUM(ch))
12478             Py_RETURN_FALSE;
12479     }
12480     Py_RETURN_TRUE;
12481 }
12482 
12483 /*[clinic input]
12484 str.isdecimal as unicode_isdecimal
12485 
12486 Return True if the string is a decimal string, False otherwise.
12487 
12488 A string is a decimal string if all characters in the string are decimal and
12489 there is at least one character in the string.
12490 [clinic start generated code]*/
12491 
12492 static PyObject *
unicode_isdecimal_impl(PyObject * self)12493 unicode_isdecimal_impl(PyObject *self)
12494 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12495 {
12496     Py_ssize_t i, length;
12497     int kind;
12498     const void *data;
12499 
12500     if (PyUnicode_READY(self) == -1)
12501         return NULL;
12502     length = PyUnicode_GET_LENGTH(self);
12503     kind = PyUnicode_KIND(self);
12504     data = PyUnicode_DATA(self);
12505 
12506     /* Shortcut for single character strings */
12507     if (length == 1)
12508         return PyBool_FromLong(
12509             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12510 
12511     /* Special case for empty strings */
12512     if (length == 0)
12513         Py_RETURN_FALSE;
12514 
12515     for (i = 0; i < length; i++) {
12516         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12517             Py_RETURN_FALSE;
12518     }
12519     Py_RETURN_TRUE;
12520 }
12521 
12522 /*[clinic input]
12523 str.isdigit as unicode_isdigit
12524 
12525 Return True if the string is a digit string, False otherwise.
12526 
12527 A string is a digit string if all characters in the string are digits and there
12528 is at least one character in the string.
12529 [clinic start generated code]*/
12530 
12531 static PyObject *
unicode_isdigit_impl(PyObject * self)12532 unicode_isdigit_impl(PyObject *self)
12533 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12534 {
12535     Py_ssize_t i, length;
12536     int kind;
12537     const void *data;
12538 
12539     if (PyUnicode_READY(self) == -1)
12540         return NULL;
12541     length = PyUnicode_GET_LENGTH(self);
12542     kind = PyUnicode_KIND(self);
12543     data = PyUnicode_DATA(self);
12544 
12545     /* Shortcut for single character strings */
12546     if (length == 1) {
12547         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12548         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12549     }
12550 
12551     /* Special case for empty strings */
12552     if (length == 0)
12553         Py_RETURN_FALSE;
12554 
12555     for (i = 0; i < length; i++) {
12556         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12557             Py_RETURN_FALSE;
12558     }
12559     Py_RETURN_TRUE;
12560 }
12561 
12562 /*[clinic input]
12563 str.isnumeric as unicode_isnumeric
12564 
12565 Return True if the string is a numeric string, False otherwise.
12566 
12567 A string is numeric if all characters in the string are numeric and there is at
12568 least one character in the string.
12569 [clinic start generated code]*/
12570 
12571 static PyObject *
unicode_isnumeric_impl(PyObject * self)12572 unicode_isnumeric_impl(PyObject *self)
12573 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12574 {
12575     Py_ssize_t i, length;
12576     int kind;
12577     const void *data;
12578 
12579     if (PyUnicode_READY(self) == -1)
12580         return NULL;
12581     length = PyUnicode_GET_LENGTH(self);
12582     kind = PyUnicode_KIND(self);
12583     data = PyUnicode_DATA(self);
12584 
12585     /* Shortcut for single character strings */
12586     if (length == 1)
12587         return PyBool_FromLong(
12588             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12589 
12590     /* Special case for empty strings */
12591     if (length == 0)
12592         Py_RETURN_FALSE;
12593 
12594     for (i = 0; i < length; i++) {
12595         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12596             Py_RETURN_FALSE;
12597     }
12598     Py_RETURN_TRUE;
12599 }
12600 
12601 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12602 _PyUnicode_ScanIdentifier(PyObject *self)
12603 {
12604     Py_ssize_t i;
12605     if (PyUnicode_READY(self) == -1)
12606         return -1;
12607 
12608     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12609     if (len == 0) {
12610         /* an empty string is not a valid identifier */
12611         return 0;
12612     }
12613 
12614     int kind = PyUnicode_KIND(self);
12615     const void *data = PyUnicode_DATA(self);
12616     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12617     /* PEP 3131 says that the first character must be in
12618        XID_Start and subsequent characters in XID_Continue,
12619        and for the ASCII range, the 2.x rules apply (i.e
12620        start with letters and underscore, continue with
12621        letters, digits, underscore). However, given the current
12622        definition of XID_Start and XID_Continue, it is sufficient
12623        to check just for these, except that _ must be allowed
12624        as starting an identifier.  */
12625     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12626         return 0;
12627     }
12628 
12629     for (i = 1; i < len; i++) {
12630         ch = PyUnicode_READ(kind, data, i);
12631         if (!_PyUnicode_IsXidContinue(ch)) {
12632             return i;
12633         }
12634     }
12635     return i;
12636 }
12637 
12638 int
PyUnicode_IsIdentifier(PyObject * self)12639 PyUnicode_IsIdentifier(PyObject *self)
12640 {
12641     if (PyUnicode_IS_READY(self)) {
12642         Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12643         Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12644         /* an empty string is not a valid identifier */
12645         return len && i == len;
12646     }
12647     else {
12648 _Py_COMP_DIAG_PUSH
12649 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12650         Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12651         if (len == 0) {
12652             /* an empty string is not a valid identifier */
12653             return 0;
12654         }
12655 
12656         const wchar_t *wstr = _PyUnicode_WSTR(self);
12657         Py_UCS4 ch = wstr[i++];
12658 #if SIZEOF_WCHAR_T == 2
12659         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12660             && i < len
12661             && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12662         {
12663             ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12664             i++;
12665         }
12666 #endif
12667         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12668             return 0;
12669         }
12670 
12671         while (i < len) {
12672             ch = wstr[i++];
12673 #if SIZEOF_WCHAR_T == 2
12674             if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12675                 && i < len
12676                 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12677             {
12678                 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12679                 i++;
12680             }
12681 #endif
12682             if (!_PyUnicode_IsXidContinue(ch)) {
12683                 return 0;
12684             }
12685         }
12686         return 1;
12687 _Py_COMP_DIAG_POP
12688     }
12689 }
12690 
12691 /*[clinic input]
12692 str.isidentifier as unicode_isidentifier
12693 
12694 Return True if the string is a valid Python identifier, False otherwise.
12695 
12696 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12697 such as "def" or "class".
12698 [clinic start generated code]*/
12699 
12700 static PyObject *
unicode_isidentifier_impl(PyObject * self)12701 unicode_isidentifier_impl(PyObject *self)
12702 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12703 {
12704     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12705 }
12706 
12707 /*[clinic input]
12708 str.isprintable as unicode_isprintable
12709 
12710 Return True if the string is printable, False otherwise.
12711 
12712 A string is printable if all of its characters are considered printable in
12713 repr() or if it is empty.
12714 [clinic start generated code]*/
12715 
12716 static PyObject *
unicode_isprintable_impl(PyObject * self)12717 unicode_isprintable_impl(PyObject *self)
12718 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12719 {
12720     Py_ssize_t i, length;
12721     int kind;
12722     const void *data;
12723 
12724     if (PyUnicode_READY(self) == -1)
12725         return NULL;
12726     length = PyUnicode_GET_LENGTH(self);
12727     kind = PyUnicode_KIND(self);
12728     data = PyUnicode_DATA(self);
12729 
12730     /* Shortcut for single character strings */
12731     if (length == 1)
12732         return PyBool_FromLong(
12733             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12734 
12735     for (i = 0; i < length; i++) {
12736         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12737             Py_RETURN_FALSE;
12738         }
12739     }
12740     Py_RETURN_TRUE;
12741 }
12742 
12743 /*[clinic input]
12744 str.join as unicode_join
12745 
12746     iterable: object
12747     /
12748 
12749 Concatenate any number of strings.
12750 
12751 The string whose method is called is inserted in between each given string.
12752 The result is returned as a new string.
12753 
12754 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12755 [clinic start generated code]*/
12756 
12757 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12758 unicode_join(PyObject *self, PyObject *iterable)
12759 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12760 {
12761     return PyUnicode_Join(self, iterable);
12762 }
12763 
12764 static Py_ssize_t
unicode_length(PyObject * self)12765 unicode_length(PyObject *self)
12766 {
12767     if (PyUnicode_READY(self) == -1)
12768         return -1;
12769     return PyUnicode_GET_LENGTH(self);
12770 }
12771 
12772 /*[clinic input]
12773 str.ljust as unicode_ljust
12774 
12775     width: Py_ssize_t
12776     fillchar: Py_UCS4 = ' '
12777     /
12778 
12779 Return a left-justified string of length width.
12780 
12781 Padding is done using the specified fill character (default is a space).
12782 [clinic start generated code]*/
12783 
12784 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12785 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12786 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12787 {
12788     if (PyUnicode_READY(self) == -1)
12789         return NULL;
12790 
12791     if (PyUnicode_GET_LENGTH(self) >= width)
12792         return unicode_result_unchanged(self);
12793 
12794     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12795 }
12796 
12797 /*[clinic input]
12798 str.lower as unicode_lower
12799 
12800 Return a copy of the string converted to lowercase.
12801 [clinic start generated code]*/
12802 
12803 static PyObject *
unicode_lower_impl(PyObject * self)12804 unicode_lower_impl(PyObject *self)
12805 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12806 {
12807     if (PyUnicode_READY(self) == -1)
12808         return NULL;
12809     if (PyUnicode_IS_ASCII(self))
12810         return ascii_upper_or_lower(self, 1);
12811     return case_operation(self, do_lower);
12812 }
12813 
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names used in error messages, keyed by the selectors above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

#define STRIPNAME(i) (stripfuncnames[i])
12822 
12823 /* externally visible for str.strip(unicode) */
12824 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12825 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12826 {
12827     const void *data;
12828     int kind;
12829     Py_ssize_t i, j, len;
12830     BLOOM_MASK sepmask;
12831     Py_ssize_t seplen;
12832 
12833     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12834         return NULL;
12835 
12836     kind = PyUnicode_KIND(self);
12837     data = PyUnicode_DATA(self);
12838     len = PyUnicode_GET_LENGTH(self);
12839     seplen = PyUnicode_GET_LENGTH(sepobj);
12840     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12841                               PyUnicode_DATA(sepobj),
12842                               seplen);
12843 
12844     i = 0;
12845     if (striptype != RIGHTSTRIP) {
12846         while (i < len) {
12847             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12848             if (!BLOOM(sepmask, ch))
12849                 break;
12850             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12851                 break;
12852             i++;
12853         }
12854     }
12855 
12856     j = len;
12857     if (striptype != LEFTSTRIP) {
12858         j--;
12859         while (j >= i) {
12860             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12861             if (!BLOOM(sepmask, ch))
12862                 break;
12863             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12864                 break;
12865             j--;
12866         }
12867 
12868         j++;
12869     }
12870 
12871     return PyUnicode_Substring(self, i, j);
12872 }
12873 
12874 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12875 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12876 {
12877     const unsigned char *data;
12878     int kind;
12879     Py_ssize_t length;
12880 
12881     if (PyUnicode_READY(self) == -1)
12882         return NULL;
12883 
12884     length = PyUnicode_GET_LENGTH(self);
12885     end = Py_MIN(end, length);
12886 
12887     if (start == 0 && end == length)
12888         return unicode_result_unchanged(self);
12889 
12890     if (start < 0 || end < 0) {
12891         PyErr_SetString(PyExc_IndexError, "string index out of range");
12892         return NULL;
12893     }
12894     if (start >= length || end < start)
12895         _Py_RETURN_UNICODE_EMPTY();
12896 
12897     length = end - start;
12898     if (PyUnicode_IS_ASCII(self)) {
12899         data = PyUnicode_1BYTE_DATA(self);
12900         return _PyUnicode_FromASCII((const char*)(data + start), length);
12901     }
12902     else {
12903         kind = PyUnicode_KIND(self);
12904         data = PyUnicode_1BYTE_DATA(self);
12905         return PyUnicode_FromKindAndData(kind,
12906                                          data + kind * start,
12907                                          length);
12908     }
12909 }
12910 
12911 static PyObject *
do_strip(PyObject * self,int striptype)12912 do_strip(PyObject *self, int striptype)
12913 {
12914     Py_ssize_t len, i, j;
12915 
12916     if (PyUnicode_READY(self) == -1)
12917         return NULL;
12918 
12919     len = PyUnicode_GET_LENGTH(self);
12920 
12921     if (PyUnicode_IS_ASCII(self)) {
12922         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12923 
12924         i = 0;
12925         if (striptype != RIGHTSTRIP) {
12926             while (i < len) {
12927                 Py_UCS1 ch = data[i];
12928                 if (!_Py_ascii_whitespace[ch])
12929                     break;
12930                 i++;
12931             }
12932         }
12933 
12934         j = len;
12935         if (striptype != LEFTSTRIP) {
12936             j--;
12937             while (j >= i) {
12938                 Py_UCS1 ch = data[j];
12939                 if (!_Py_ascii_whitespace[ch])
12940                     break;
12941                 j--;
12942             }
12943             j++;
12944         }
12945     }
12946     else {
12947         int kind = PyUnicode_KIND(self);
12948         const void *data = PyUnicode_DATA(self);
12949 
12950         i = 0;
12951         if (striptype != RIGHTSTRIP) {
12952             while (i < len) {
12953                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12954                 if (!Py_UNICODE_ISSPACE(ch))
12955                     break;
12956                 i++;
12957             }
12958         }
12959 
12960         j = len;
12961         if (striptype != LEFTSTRIP) {
12962             j--;
12963             while (j >= i) {
12964                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12965                 if (!Py_UNICODE_ISSPACE(ch))
12966                     break;
12967                 j--;
12968             }
12969             j++;
12970         }
12971     }
12972 
12973     return PyUnicode_Substring(self, i, j);
12974 }
12975 
12976 
12977 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12978 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12979 {
12980     if (sep != Py_None) {
12981         if (PyUnicode_Check(sep))
12982             return _PyUnicode_XStrip(self, striptype, sep);
12983         else {
12984             PyErr_Format(PyExc_TypeError,
12985                          "%s arg must be None or str",
12986                          STRIPNAME(striptype));
12987             return NULL;
12988         }
12989     }
12990 
12991     return do_strip(self, striptype);
12992 }
12993 
12994 
12995 /*[clinic input]
12996 str.strip as unicode_strip
12997 
12998     chars: object = None
12999     /
13000 
13001 Return a copy of the string with leading and trailing whitespace removed.
13002 
13003 If chars is given and not None, remove characters in chars instead.
13004 [clinic start generated code]*/
13005 
13006 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)13007 unicode_strip_impl(PyObject *self, PyObject *chars)
13008 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
13009 {
13010     return do_argstrip(self, BOTHSTRIP, chars);
13011 }
13012 
13013 
13014 /*[clinic input]
13015 str.lstrip as unicode_lstrip
13016 
13017     chars: object = None
13018     /
13019 
13020 Return a copy of the string with leading whitespace removed.
13021 
13022 If chars is given and not None, remove characters in chars instead.
13023 [clinic start generated code]*/
13024 
13025 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)13026 unicode_lstrip_impl(PyObject *self, PyObject *chars)
13027 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
13028 {
13029     return do_argstrip(self, LEFTSTRIP, chars);
13030 }
13031 
13032 
13033 /*[clinic input]
13034 str.rstrip as unicode_rstrip
13035 
13036     chars: object = None
13037     /
13038 
13039 Return a copy of the string with trailing whitespace removed.
13040 
13041 If chars is given and not None, remove characters in chars instead.
13042 [clinic start generated code]*/
13043 
13044 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)13045 unicode_rstrip_impl(PyObject *self, PyObject *chars)
13046 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
13047 {
13048     return do_argstrip(self, RIGHTSTRIP, chars);
13049 }
13050 
13051 
13052 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)13053 unicode_repeat(PyObject *str, Py_ssize_t len)
13054 {
13055     PyObject *u;
13056     Py_ssize_t nchars, n;
13057 
13058     if (len < 1)
13059         _Py_RETURN_UNICODE_EMPTY();
13060 
13061     /* no repeat, return original string */
13062     if (len == 1)
13063         return unicode_result_unchanged(str);
13064 
13065     if (PyUnicode_READY(str) == -1)
13066         return NULL;
13067 
13068     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
13069         PyErr_SetString(PyExc_OverflowError,
13070                         "repeated string is too long");
13071         return NULL;
13072     }
13073     nchars = len * PyUnicode_GET_LENGTH(str);
13074 
13075     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
13076     if (!u)
13077         return NULL;
13078     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
13079 
13080     if (PyUnicode_GET_LENGTH(str) == 1) {
13081         int kind = PyUnicode_KIND(str);
13082         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
13083         if (kind == PyUnicode_1BYTE_KIND) {
13084             void *to = PyUnicode_DATA(u);
13085             memset(to, (unsigned char)fill_char, len);
13086         }
13087         else if (kind == PyUnicode_2BYTE_KIND) {
13088             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
13089             for (n = 0; n < len; ++n)
13090                 ucs2[n] = fill_char;
13091         } else {
13092             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13093             assert(kind == PyUnicode_4BYTE_KIND);
13094             for (n = 0; n < len; ++n)
13095                 ucs4[n] = fill_char;
13096         }
13097     }
13098     else {
13099         /* number of characters copied this far */
13100         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
13101         Py_ssize_t char_size = PyUnicode_KIND(str);
13102         char *to = (char *) PyUnicode_DATA(u);
13103         memcpy(to, PyUnicode_DATA(str),
13104                   PyUnicode_GET_LENGTH(str) * char_size);
13105         while (done < nchars) {
13106             n = (done <= nchars-done) ? done : nchars-done;
13107             memcpy(to + (done * char_size), to, n * char_size);
13108             done += n;
13109         }
13110     }
13111 
13112     assert(_PyUnicode_CheckConsistency(u, 1));
13113     return u;
13114 }
13115 
13116 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)13117 PyUnicode_Replace(PyObject *str,
13118                   PyObject *substr,
13119                   PyObject *replstr,
13120                   Py_ssize_t maxcount)
13121 {
13122     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13123             ensure_unicode(replstr) < 0)
13124         return NULL;
13125     return replace(str, substr, replstr, maxcount);
13126 }
13127 
13128 /*[clinic input]
13129 str.replace as unicode_replace
13130 
13131     old: unicode
13132     new: unicode
13133     count: Py_ssize_t = -1
13134         Maximum number of occurrences to replace.
13135         -1 (the default value) means replace all occurrences.
13136     /
13137 
13138 Return a copy with all occurrences of substring old replaced by new.
13139 
13140 If the optional argument count is given, only the first count occurrences are
13141 replaced.
13142 [clinic start generated code]*/
13143 
13144 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)13145 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13146                      Py_ssize_t count)
13147 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
13148 {
13149     if (PyUnicode_READY(self) == -1)
13150         return NULL;
13151     return replace(self, old, new, count);
13152 }
13153 
13154 /*[clinic input]
13155 str.removeprefix as unicode_removeprefix
13156 
13157     prefix: unicode
13158     /
13159 
13160 Return a str with the given prefix string removed if present.
13161 
13162 If the string starts with the prefix string, return string[len(prefix):].
13163 Otherwise, return a copy of the original string.
13164 [clinic start generated code]*/
13165 
13166 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)13167 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13168 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13169 {
13170     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13171     if (match == -1) {
13172         return NULL;
13173     }
13174     if (match) {
13175         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13176                                    PyUnicode_GET_LENGTH(self));
13177     }
13178     return unicode_result_unchanged(self);
13179 }
13180 
13181 /*[clinic input]
13182 str.removesuffix as unicode_removesuffix
13183 
13184     suffix: unicode
13185     /
13186 
13187 Return a str with the given suffix string removed if present.
13188 
13189 If the string ends with the suffix string and that suffix is not empty,
13190 return string[:-len(suffix)]. Otherwise, return a copy of the original
13191 string.
13192 [clinic start generated code]*/
13193 
13194 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)13195 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13196 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13197 {
13198     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13199     if (match == -1) {
13200         return NULL;
13201     }
13202     if (match) {
13203         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13204                                             - PyUnicode_GET_LENGTH(suffix));
13205     }
13206     return unicode_result_unchanged(self);
13207 }
13208 
13209 static PyObject *
unicode_repr(PyObject * unicode)13210 unicode_repr(PyObject *unicode)
13211 {
13212     PyObject *repr;
13213     Py_ssize_t isize;
13214     Py_ssize_t osize, squote, dquote, i, o;
13215     Py_UCS4 max, quote;
13216     int ikind, okind, unchanged;
13217     const void *idata;
13218     void *odata;
13219 
13220     if (PyUnicode_READY(unicode) == -1)
13221         return NULL;
13222 
13223     isize = PyUnicode_GET_LENGTH(unicode);
13224     idata = PyUnicode_DATA(unicode);
13225 
13226     /* Compute length of output, quote characters, and
13227        maximum character */
13228     osize = 0;
13229     max = 127;
13230     squote = dquote = 0;
13231     ikind = PyUnicode_KIND(unicode);
13232     for (i = 0; i < isize; i++) {
13233         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13234         Py_ssize_t incr = 1;
13235         switch (ch) {
13236         case '\'': squote++; break;
13237         case '"':  dquote++; break;
13238         case '\\': case '\t': case '\r': case '\n':
13239             incr = 2;
13240             break;
13241         default:
13242             /* Fast-path ASCII */
13243             if (ch < ' ' || ch == 0x7f)
13244                 incr = 4; /* \xHH */
13245             else if (ch < 0x7f)
13246                 ;
13247             else if (Py_UNICODE_ISPRINTABLE(ch))
13248                 max = ch > max ? ch : max;
13249             else if (ch < 0x100)
13250                 incr = 4; /* \xHH */
13251             else if (ch < 0x10000)
13252                 incr = 6; /* \uHHHH */
13253             else
13254                 incr = 10; /* \uHHHHHHHH */
13255         }
13256         if (osize > PY_SSIZE_T_MAX - incr) {
13257             PyErr_SetString(PyExc_OverflowError,
13258                             "string is too long to generate repr");
13259             return NULL;
13260         }
13261         osize += incr;
13262     }
13263 
13264     quote = '\'';
13265     unchanged = (osize == isize);
13266     if (squote) {
13267         unchanged = 0;
13268         if (dquote)
13269             /* Both squote and dquote present. Use squote,
13270                and escape them */
13271             osize += squote;
13272         else
13273             quote = '"';
13274     }
13275     osize += 2;   /* quotes */
13276 
13277     repr = PyUnicode_New(osize, max);
13278     if (repr == NULL)
13279         return NULL;
13280     okind = PyUnicode_KIND(repr);
13281     odata = PyUnicode_DATA(repr);
13282 
13283     PyUnicode_WRITE(okind, odata, 0, quote);
13284     PyUnicode_WRITE(okind, odata, osize-1, quote);
13285     if (unchanged) {
13286         _PyUnicode_FastCopyCharacters(repr, 1,
13287                                       unicode, 0,
13288                                       isize);
13289     }
13290     else {
13291         for (i = 0, o = 1; i < isize; i++) {
13292             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13293 
13294             /* Escape quotes and backslashes */
13295             if ((ch == quote) || (ch == '\\')) {
13296                 PyUnicode_WRITE(okind, odata, o++, '\\');
13297                 PyUnicode_WRITE(okind, odata, o++, ch);
13298                 continue;
13299             }
13300 
13301             /* Map special whitespace to '\t', \n', '\r' */
13302             if (ch == '\t') {
13303                 PyUnicode_WRITE(okind, odata, o++, '\\');
13304                 PyUnicode_WRITE(okind, odata, o++, 't');
13305             }
13306             else if (ch == '\n') {
13307                 PyUnicode_WRITE(okind, odata, o++, '\\');
13308                 PyUnicode_WRITE(okind, odata, o++, 'n');
13309             }
13310             else if (ch == '\r') {
13311                 PyUnicode_WRITE(okind, odata, o++, '\\');
13312                 PyUnicode_WRITE(okind, odata, o++, 'r');
13313             }
13314 
13315             /* Map non-printable US ASCII to '\xhh' */
13316             else if (ch < ' ' || ch == 0x7F) {
13317                 PyUnicode_WRITE(okind, odata, o++, '\\');
13318                 PyUnicode_WRITE(okind, odata, o++, 'x');
13319                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13320                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13321             }
13322 
13323             /* Copy ASCII characters as-is */
13324             else if (ch < 0x7F) {
13325                 PyUnicode_WRITE(okind, odata, o++, ch);
13326             }
13327 
13328             /* Non-ASCII characters */
13329             else {
13330                 /* Map Unicode whitespace and control characters
13331                    (categories Z* and C* except ASCII space)
13332                 */
13333                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13334                     PyUnicode_WRITE(okind, odata, o++, '\\');
13335                     /* Map 8-bit characters to '\xhh' */
13336                     if (ch <= 0xff) {
13337                         PyUnicode_WRITE(okind, odata, o++, 'x');
13338                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13339                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13340                     }
13341                     /* Map 16-bit characters to '\uxxxx' */
13342                     else if (ch <= 0xffff) {
13343                         PyUnicode_WRITE(okind, odata, o++, 'u');
13344                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13345                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13346                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13347                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13348                     }
13349                     /* Map 21-bit characters to '\U00xxxxxx' */
13350                     else {
13351                         PyUnicode_WRITE(okind, odata, o++, 'U');
13352                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13353                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13354                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13355                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13356                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13357                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13358                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13359                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13360                     }
13361                 }
13362                 /* Copy characters as-is */
13363                 else {
13364                     PyUnicode_WRITE(okind, odata, o++, ch);
13365                 }
13366             }
13367         }
13368     }
13369     /* Closing quote already added at the beginning */
13370     assert(_PyUnicode_CheckConsistency(repr, 1));
13371     return repr;
13372 }
13373 
13374 PyDoc_STRVAR(rfind__doc__,
13375              "S.rfind(sub[, start[, end]]) -> int\n\
13376 \n\
13377 Return the highest index in S where substring sub is found,\n\
13378 such that sub is contained within S[start:end].  Optional\n\
13379 arguments start and end are interpreted as in slice notation.\n\
13380 \n\
13381 Return -1 on failure.");
13382 
13383 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13384 unicode_rfind(PyObject *self, PyObject *args)
13385 {
13386     /* initialize variables to prevent gcc warning */
13387     PyObject *substring = NULL;
13388     Py_ssize_t start = 0;
13389     Py_ssize_t end = 0;
13390     Py_ssize_t result;
13391 
13392     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13393         return NULL;
13394 
13395     if (PyUnicode_READY(self) == -1)
13396         return NULL;
13397 
13398     result = any_find_slice(self, substring, start, end, -1);
13399 
13400     if (result == -2)
13401         return NULL;
13402 
13403     return PyLong_FromSsize_t(result);
13404 }
13405 
13406 PyDoc_STRVAR(rindex__doc__,
13407              "S.rindex(sub[, start[, end]]) -> int\n\
13408 \n\
13409 Return the highest index in S where substring sub is found,\n\
13410 such that sub is contained within S[start:end].  Optional\n\
13411 arguments start and end are interpreted as in slice notation.\n\
13412 \n\
13413 Raises ValueError when the substring is not found.");
13414 
13415 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13416 unicode_rindex(PyObject *self, PyObject *args)
13417 {
13418     /* initialize variables to prevent gcc warning */
13419     PyObject *substring = NULL;
13420     Py_ssize_t start = 0;
13421     Py_ssize_t end = 0;
13422     Py_ssize_t result;
13423 
13424     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13425         return NULL;
13426 
13427     if (PyUnicode_READY(self) == -1)
13428         return NULL;
13429 
13430     result = any_find_slice(self, substring, start, end, -1);
13431 
13432     if (result == -2)
13433         return NULL;
13434 
13435     if (result < 0) {
13436         PyErr_SetString(PyExc_ValueError, "substring not found");
13437         return NULL;
13438     }
13439 
13440     return PyLong_FromSsize_t(result);
13441 }
13442 
13443 /*[clinic input]
13444 str.rjust as unicode_rjust
13445 
13446     width: Py_ssize_t
13447     fillchar: Py_UCS4 = ' '
13448     /
13449 
13450 Return a right-justified string of length width.
13451 
13452 Padding is done using the specified fill character (default is a space).
13453 [clinic start generated code]*/
13454 
13455 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13456 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13457 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13458 {
13459     if (PyUnicode_READY(self) == -1)
13460         return NULL;
13461 
13462     if (PyUnicode_GET_LENGTH(self) >= width)
13463         return unicode_result_unchanged(self);
13464 
13465     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13466 }
13467 
13468 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13469 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13470 {
13471     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13472         return NULL;
13473 
13474     return split(s, sep, maxsplit);
13475 }
13476 
13477 /*[clinic input]
13478 str.split as unicode_split
13479 
13480     sep: object = None
13481         The delimiter according which to split the string.
13482         None (the default value) means split according to any whitespace,
13483         and discard empty strings from the result.
13484     maxsplit: Py_ssize_t = -1
13485         Maximum number of splits to do.
13486         -1 (the default value) means no limit.
13487 
13488 Return a list of the words in the string, using sep as the delimiter string.
13489 [clinic start generated code]*/
13490 
13491 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13492 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13493 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
13494 {
13495     if (sep == Py_None)
13496         return split(self, NULL, maxsplit);
13497     if (PyUnicode_Check(sep))
13498         return split(self, sep, maxsplit);
13499 
13500     PyErr_Format(PyExc_TypeError,
13501                  "must be str or None, not %.100s",
13502                  Py_TYPE(sep)->tp_name);
13503     return NULL;
13504 }
13505 
13506 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13507 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13508 {
13509     PyObject* out;
13510     int kind1, kind2;
13511     const void *buf1, *buf2;
13512     Py_ssize_t len1, len2;
13513 
13514     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13515         return NULL;
13516 
13517     kind1 = PyUnicode_KIND(str_obj);
13518     kind2 = PyUnicode_KIND(sep_obj);
13519     len1 = PyUnicode_GET_LENGTH(str_obj);
13520     len2 = PyUnicode_GET_LENGTH(sep_obj);
13521     if (kind1 < kind2 || len1 < len2) {
13522         PyObject *empty = unicode_get_empty();  // Borrowed reference
13523         return PyTuple_Pack(3, str_obj, empty, empty);
13524     }
13525     buf1 = PyUnicode_DATA(str_obj);
13526     buf2 = PyUnicode_DATA(sep_obj);
13527     if (kind2 != kind1) {
13528         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13529         if (!buf2)
13530             return NULL;
13531     }
13532 
13533     switch (kind1) {
13534     case PyUnicode_1BYTE_KIND:
13535         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13536             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13537         else
13538             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13539         break;
13540     case PyUnicode_2BYTE_KIND:
13541         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13542         break;
13543     case PyUnicode_4BYTE_KIND:
13544         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13545         break;
13546     default:
13547         Py_UNREACHABLE();
13548     }
13549 
13550     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13551     if (kind2 != kind1)
13552         PyMem_Free((void *)buf2);
13553 
13554     return out;
13555 }
13556 
13557 
13558 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13559 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13560 {
13561     PyObject* out;
13562     int kind1, kind2;
13563     const void *buf1, *buf2;
13564     Py_ssize_t len1, len2;
13565 
13566     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13567         return NULL;
13568 
13569     kind1 = PyUnicode_KIND(str_obj);
13570     kind2 = PyUnicode_KIND(sep_obj);
13571     len1 = PyUnicode_GET_LENGTH(str_obj);
13572     len2 = PyUnicode_GET_LENGTH(sep_obj);
13573     if (kind1 < kind2 || len1 < len2) {
13574         PyObject *empty = unicode_get_empty();  // Borrowed reference
13575         return PyTuple_Pack(3, empty, empty, str_obj);
13576     }
13577     buf1 = PyUnicode_DATA(str_obj);
13578     buf2 = PyUnicode_DATA(sep_obj);
13579     if (kind2 != kind1) {
13580         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13581         if (!buf2)
13582             return NULL;
13583     }
13584 
13585     switch (kind1) {
13586     case PyUnicode_1BYTE_KIND:
13587         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13588             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13589         else
13590             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13591         break;
13592     case PyUnicode_2BYTE_KIND:
13593         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13594         break;
13595     case PyUnicode_4BYTE_KIND:
13596         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13597         break;
13598     default:
13599         Py_UNREACHABLE();
13600     }
13601 
13602     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13603     if (kind2 != kind1)
13604         PyMem_Free((void *)buf2);
13605 
13606     return out;
13607 }
13608 
13609 /*[clinic input]
13610 str.partition as unicode_partition
13611 
13612     sep: object
13613     /
13614 
13615 Partition the string into three parts using the given separator.
13616 
13617 This will search for the separator in the string.  If the separator is found,
13618 returns a 3-tuple containing the part before the separator, the separator
13619 itself, and the part after it.
13620 
13621 If the separator is not found, returns a 3-tuple containing the original string
13622 and two empty strings.
13623 [clinic start generated code]*/
13624 
13625 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13626 unicode_partition(PyObject *self, PyObject *sep)
13627 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13628 {
13629     return PyUnicode_Partition(self, sep);
13630 }
13631 
13632 /*[clinic input]
13633 str.rpartition as unicode_rpartition = str.partition
13634 
13635 Partition the string into three parts using the given separator.
13636 
13637 This will search for the separator in the string, starting at the end. If
13638 the separator is found, returns a 3-tuple containing the part before the
13639 separator, the separator itself, and the part after it.
13640 
13641 If the separator is not found, returns a 3-tuple containing two empty strings
13642 and the original string.
13643 [clinic start generated code]*/
13644 
13645 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13646 unicode_rpartition(PyObject *self, PyObject *sep)
13647 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13648 {
13649     return PyUnicode_RPartition(self, sep);
13650 }
13651 
13652 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13653 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13654 {
13655     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13656         return NULL;
13657 
13658     return rsplit(s, sep, maxsplit);
13659 }
13660 
13661 /*[clinic input]
13662 str.rsplit as unicode_rsplit = str.split
13663 
13664 Return a list of the words in the string, using sep as the delimiter string.
13665 
13666 Splits are done starting at the end of the string and working to the front.
13667 [clinic start generated code]*/
13668 
13669 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13670 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13671 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13672 {
13673     if (sep == Py_None)
13674         return rsplit(self, NULL, maxsplit);
13675     if (PyUnicode_Check(sep))
13676         return rsplit(self, sep, maxsplit);
13677 
13678     PyErr_Format(PyExc_TypeError,
13679                  "must be str or None, not %.100s",
13680                  Py_TYPE(sep)->tp_name);
13681     return NULL;
13682 }
13683 
13684 /*[clinic input]
13685 str.splitlines as unicode_splitlines
13686 
13687     keepends: bool(accept={int}) = False
13688 
13689 Return a list of the lines in the string, breaking at line boundaries.
13690 
13691 Line breaks are not included in the resulting list unless keepends is given and
13692 true.
13693 [clinic start generated code]*/
13694 
13695 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13696 unicode_splitlines_impl(PyObject *self, int keepends)
13697 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13698 {
13699     return PyUnicode_Splitlines(self, keepends);
13700 }
13701 
13702 static
unicode_str(PyObject * self)13703 PyObject *unicode_str(PyObject *self)
13704 {
13705     return unicode_result_unchanged(self);
13706 }
13707 
13708 /*[clinic input]
13709 str.swapcase as unicode_swapcase
13710 
13711 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13712 [clinic start generated code]*/
13713 
13714 static PyObject *
unicode_swapcase_impl(PyObject * self)13715 unicode_swapcase_impl(PyObject *self)
13716 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13717 {
13718     if (PyUnicode_READY(self) == -1)
13719         return NULL;
13720     return case_operation(self, do_swapcase);
13721 }
13722 
13723 /*[clinic input]
13724 
13725 @staticmethod
13726 str.maketrans as unicode_maketrans
13727 
13728   x: object
13729 
13730   y: unicode=NULL
13731 
13732   z: unicode=NULL
13733 
13734   /
13735 
13736 Return a translation table usable for str.translate().
13737 
13738 If there is only one argument, it must be a dictionary mapping Unicode
13739 ordinals (integers) or characters to Unicode ordinals, strings or None.
13740 Character keys will be then converted to ordinals.
13741 If there are two arguments, they must be strings of equal length, and
13742 in the resulting dictionary, each character in x will be mapped to the
13743 character at the same position in y. If there is a third argument, it
13744 must be a string, whose characters will be mapped to None in the result.
13745 [clinic start generated code]*/
13746 
13747 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13748 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13749 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13750 {
13751     PyObject *new = NULL, *key, *value;
13752     Py_ssize_t i = 0;
13753     int res;
13754 
13755     new = PyDict_New();
13756     if (!new)
13757         return NULL;
13758     if (y != NULL) {
13759         int x_kind, y_kind, z_kind;
13760         const void *x_data, *y_data, *z_data;
13761 
13762         /* x must be a string too, of equal length */
13763         if (!PyUnicode_Check(x)) {
13764             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13765                             "be a string if there is a second argument");
13766             goto err;
13767         }
13768         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13769             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13770                             "arguments must have equal length");
13771             goto err;
13772         }
13773         /* create entries for translating chars in x to those in y */
13774         x_kind = PyUnicode_KIND(x);
13775         y_kind = PyUnicode_KIND(y);
13776         x_data = PyUnicode_DATA(x);
13777         y_data = PyUnicode_DATA(y);
13778         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13779             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13780             if (!key)
13781                 goto err;
13782             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13783             if (!value) {
13784                 Py_DECREF(key);
13785                 goto err;
13786             }
13787             res = PyDict_SetItem(new, key, value);
13788             Py_DECREF(key);
13789             Py_DECREF(value);
13790             if (res < 0)
13791                 goto err;
13792         }
13793         /* create entries for deleting chars in z */
13794         if (z != NULL) {
13795             z_kind = PyUnicode_KIND(z);
13796             z_data = PyUnicode_DATA(z);
13797             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13798                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13799                 if (!key)
13800                     goto err;
13801                 res = PyDict_SetItem(new, key, Py_None);
13802                 Py_DECREF(key);
13803                 if (res < 0)
13804                     goto err;
13805             }
13806         }
13807     } else {
13808         int kind;
13809         const void *data;
13810 
13811         /* x must be a dict */
13812         if (!PyDict_CheckExact(x)) {
13813             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13814                             "to maketrans it must be a dict");
13815             goto err;
13816         }
13817         /* copy entries into the new dict, converting string keys to int keys */
13818         while (PyDict_Next(x, &i, &key, &value)) {
13819             if (PyUnicode_Check(key)) {
13820                 /* convert string keys to integer keys */
13821                 PyObject *newkey;
13822                 if (PyUnicode_GET_LENGTH(key) != 1) {
13823                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13824                                     "table must be of length 1");
13825                     goto err;
13826                 }
13827                 kind = PyUnicode_KIND(key);
13828                 data = PyUnicode_DATA(key);
13829                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13830                 if (!newkey)
13831                     goto err;
13832                 res = PyDict_SetItem(new, newkey, value);
13833                 Py_DECREF(newkey);
13834                 if (res < 0)
13835                     goto err;
13836             } else if (PyLong_Check(key)) {
13837                 /* just keep integer keys */
13838                 if (PyDict_SetItem(new, key, value) < 0)
13839                     goto err;
13840             } else {
13841                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13842                                 "be strings or integers");
13843                 goto err;
13844             }
13845         }
13846     }
13847     return new;
13848   err:
13849     Py_DECREF(new);
13850     return NULL;
13851 }
13852 
13853 /*[clinic input]
13854 str.translate as unicode_translate
13855 
13856     table: object
13857         Translation table, which must be a mapping of Unicode ordinals to
13858         Unicode ordinals, strings, or None.
13859     /
13860 
13861 Replace each character in the string using the given translation table.
13862 
13863 The table must implement lookup/indexing via __getitem__, for instance a
13864 dictionary or list.  If this operation raises LookupError, the character is
13865 left untouched.  Characters mapped to None are deleted.
13866 [clinic start generated code]*/
13867 
13868 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13869 unicode_translate(PyObject *self, PyObject *table)
13870 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13871 {
13872     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13873 }
13874 
13875 /*[clinic input]
13876 str.upper as unicode_upper
13877 
13878 Return a copy of the string converted to uppercase.
13879 [clinic start generated code]*/
13880 
13881 static PyObject *
unicode_upper_impl(PyObject * self)13882 unicode_upper_impl(PyObject *self)
13883 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13884 {
13885     if (PyUnicode_READY(self) == -1)
13886         return NULL;
13887     if (PyUnicode_IS_ASCII(self))
13888         return ascii_upper_or_lower(self, 0);
13889     return case_operation(self, do_upper);
13890 }
13891 
13892 /*[clinic input]
13893 str.zfill as unicode_zfill
13894 
13895     width: Py_ssize_t
13896     /
13897 
13898 Pad a numeric string with zeros on the left, to fill a field of the given width.
13899 
13900 The string is never truncated.
13901 [clinic start generated code]*/
13902 
13903 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13904 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13905 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13906 {
13907     Py_ssize_t fill;
13908     PyObject *u;
13909     int kind;
13910     const void *data;
13911     Py_UCS4 chr;
13912 
13913     if (PyUnicode_READY(self) == -1)
13914         return NULL;
13915 
13916     if (PyUnicode_GET_LENGTH(self) >= width)
13917         return unicode_result_unchanged(self);
13918 
13919     fill = width - PyUnicode_GET_LENGTH(self);
13920 
13921     u = pad(self, fill, 0, '0');
13922 
13923     if (u == NULL)
13924         return NULL;
13925 
13926     kind = PyUnicode_KIND(u);
13927     data = PyUnicode_DATA(u);
13928     chr = PyUnicode_READ(kind, data, fill);
13929 
13930     if (chr == '+' || chr == '-') {
13931         /* move sign to beginning of string */
13932         PyUnicode_WRITE(kind, data, 0, chr);
13933         PyUnicode_WRITE(kind, data, fill, '0');
13934     }
13935 
13936     assert(_PyUnicode_CheckConsistency(u, 1));
13937     return u;
13938 }
13939 
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13947 
13948 PyDoc_STRVAR(startswith__doc__,
13949              "S.startswith(prefix[, start[, end]]) -> bool\n\
13950 \n\
13951 Return True if S starts with the specified prefix, False otherwise.\n\
13952 With optional start, test S beginning at that position.\n\
13953 With optional end, stop comparing S at that position.\n\
13954 prefix can also be a tuple of strings to try.");
13955 
13956 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13957 unicode_startswith(PyObject *self,
13958                    PyObject *args)
13959 {
13960     PyObject *subobj;
13961     PyObject *substring;
13962     Py_ssize_t start = 0;
13963     Py_ssize_t end = PY_SSIZE_T_MAX;
13964     int result;
13965 
13966     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13967         return NULL;
13968     if (PyTuple_Check(subobj)) {
13969         Py_ssize_t i;
13970         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13971             substring = PyTuple_GET_ITEM(subobj, i);
13972             if (!PyUnicode_Check(substring)) {
13973                 PyErr_Format(PyExc_TypeError,
13974                              "tuple for startswith must only contain str, "
13975                              "not %.100s",
13976                              Py_TYPE(substring)->tp_name);
13977                 return NULL;
13978             }
13979             result = tailmatch(self, substring, start, end, -1);
13980             if (result == -1)
13981                 return NULL;
13982             if (result) {
13983                 Py_RETURN_TRUE;
13984             }
13985         }
13986         /* nothing matched */
13987         Py_RETURN_FALSE;
13988     }
13989     if (!PyUnicode_Check(subobj)) {
13990         PyErr_Format(PyExc_TypeError,
13991                      "startswith first arg must be str or "
13992                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13993         return NULL;
13994     }
13995     result = tailmatch(self, subobj, start, end, -1);
13996     if (result == -1)
13997         return NULL;
13998     return PyBool_FromLong(result);
13999 }
14000 
14001 
14002 PyDoc_STRVAR(endswith__doc__,
14003              "S.endswith(suffix[, start[, end]]) -> bool\n\
14004 \n\
14005 Return True if S ends with the specified suffix, False otherwise.\n\
14006 With optional start, test S beginning at that position.\n\
14007 With optional end, stop comparing S at that position.\n\
14008 suffix can also be a tuple of strings to try.");
14009 
14010 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)14011 unicode_endswith(PyObject *self,
14012                  PyObject *args)
14013 {
14014     PyObject *subobj;
14015     PyObject *substring;
14016     Py_ssize_t start = 0;
14017     Py_ssize_t end = PY_SSIZE_T_MAX;
14018     int result;
14019 
14020     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
14021         return NULL;
14022     if (PyTuple_Check(subobj)) {
14023         Py_ssize_t i;
14024         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
14025             substring = PyTuple_GET_ITEM(subobj, i);
14026             if (!PyUnicode_Check(substring)) {
14027                 PyErr_Format(PyExc_TypeError,
14028                              "tuple for endswith must only contain str, "
14029                              "not %.100s",
14030                              Py_TYPE(substring)->tp_name);
14031                 return NULL;
14032             }
14033             result = tailmatch(self, substring, start, end, +1);
14034             if (result == -1)
14035                 return NULL;
14036             if (result) {
14037                 Py_RETURN_TRUE;
14038             }
14039         }
14040         Py_RETURN_FALSE;
14041     }
14042     if (!PyUnicode_Check(subobj)) {
14043         PyErr_Format(PyExc_TypeError,
14044                      "endswith first arg must be str or "
14045                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14046         return NULL;
14047     }
14048     result = tailmatch(self, subobj, start, end, +1);
14049     if (result == -1)
14050         return NULL;
14051     return PyBool_FromLong(result);
14052 }
14053 
14054 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)14055 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
14056 {
14057     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14058     writer->data = PyUnicode_DATA(writer->buffer);
14059 
14060     if (!writer->readonly) {
14061         writer->kind = PyUnicode_KIND(writer->buffer);
14062         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
14063     }
14064     else {
14065         /* use a value smaller than PyUnicode_1BYTE_KIND() so
14066            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14067         writer->kind = PyUnicode_WCHAR_KIND;
14068         assert(writer->kind <= PyUnicode_1BYTE_KIND);
14069 
14070         /* Copy-on-write mode: set buffer size to 0 so
14071          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14072          * next write. */
14073         writer->size = 0;
14074     }
14075 }
14076 
14077 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)14078 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
14079 {
14080     memset(writer, 0, sizeof(*writer));
14081 
14082     /* ASCII is the bare minimum */
14083     writer->min_char = 127;
14084 
14085     /* use a value smaller than PyUnicode_1BYTE_KIND() so
14086        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14087     writer->kind = PyUnicode_WCHAR_KIND;
14088     assert(writer->kind <= PyUnicode_1BYTE_KIND);
14089 }
14090 
14091 // Initialize _PyUnicodeWriter with initial buffer
14092 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)14093 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14094 {
14095     memset(writer, 0, sizeof(*writer));
14096     writer->buffer = buffer;
14097     _PyUnicodeWriter_Update(writer);
14098     writer->min_length = writer->size;
14099 }
14100 
14101 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)14102 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14103                                  Py_ssize_t length, Py_UCS4 maxchar)
14104 {
14105     Py_ssize_t newlen;
14106     PyObject *newbuffer;
14107 
14108     assert(maxchar <= MAX_UNICODE);
14109 
14110     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
14111     assert((maxchar > writer->maxchar && length >= 0)
14112            || length > 0);
14113 
14114     if (length > PY_SSIZE_T_MAX - writer->pos) {
14115         PyErr_NoMemory();
14116         return -1;
14117     }
14118     newlen = writer->pos + length;
14119 
14120     maxchar = Py_MAX(maxchar, writer->min_char);
14121 
14122     if (writer->buffer == NULL) {
14123         assert(!writer->readonly);
14124         if (writer->overallocate
14125             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14126             /* overallocate to limit the number of realloc() */
14127             newlen += newlen / OVERALLOCATE_FACTOR;
14128         }
14129         if (newlen < writer->min_length)
14130             newlen = writer->min_length;
14131 
14132         writer->buffer = PyUnicode_New(newlen, maxchar);
14133         if (writer->buffer == NULL)
14134             return -1;
14135     }
14136     else if (newlen > writer->size) {
14137         if (writer->overallocate
14138             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14139             /* overallocate to limit the number of realloc() */
14140             newlen += newlen / OVERALLOCATE_FACTOR;
14141         }
14142         if (newlen < writer->min_length)
14143             newlen = writer->min_length;
14144 
14145         if (maxchar > writer->maxchar || writer->readonly) {
14146             /* resize + widen */
14147             maxchar = Py_MAX(maxchar, writer->maxchar);
14148             newbuffer = PyUnicode_New(newlen, maxchar);
14149             if (newbuffer == NULL)
14150                 return -1;
14151             _PyUnicode_FastCopyCharacters(newbuffer, 0,
14152                                           writer->buffer, 0, writer->pos);
14153             Py_DECREF(writer->buffer);
14154             writer->readonly = 0;
14155         }
14156         else {
14157             newbuffer = resize_compact(writer->buffer, newlen);
14158             if (newbuffer == NULL)
14159                 return -1;
14160         }
14161         writer->buffer = newbuffer;
14162     }
14163     else if (maxchar > writer->maxchar) {
14164         assert(!writer->readonly);
14165         newbuffer = PyUnicode_New(writer->size, maxchar);
14166         if (newbuffer == NULL)
14167             return -1;
14168         _PyUnicode_FastCopyCharacters(newbuffer, 0,
14169                                       writer->buffer, 0, writer->pos);
14170         Py_SETREF(writer->buffer, newbuffer);
14171     }
14172     _PyUnicodeWriter_Update(writer);
14173     return 0;
14174 
14175 #undef OVERALLOCATE_FACTOR
14176 }
14177 
14178 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)14179 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14180                                      enum PyUnicode_Kind kind)
14181 {
14182     Py_UCS4 maxchar;
14183 
14184     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14185     assert(writer->kind < kind);
14186 
14187     switch (kind)
14188     {
14189     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14190     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14191     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
14192     default:
14193         Py_UNREACHABLE();
14194     }
14195 
14196     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14197 }
14198 
14199 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)14200 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
14201 {
14202     assert(ch <= MAX_UNICODE);
14203     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14204         return -1;
14205     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14206     writer->pos++;
14207     return 0;
14208 }
14209 
14210 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)14211 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14212 {
14213     return _PyUnicodeWriter_WriteCharInline(writer, ch);
14214 }
14215 
14216 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)14217 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14218 {
14219     Py_UCS4 maxchar;
14220     Py_ssize_t len;
14221 
14222     if (PyUnicode_READY(str) == -1)
14223         return -1;
14224     len = PyUnicode_GET_LENGTH(str);
14225     if (len == 0)
14226         return 0;
14227     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14228     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
14229         if (writer->buffer == NULL && !writer->overallocate) {
14230             assert(_PyUnicode_CheckConsistency(str, 1));
14231             writer->readonly = 1;
14232             Py_INCREF(str);
14233             writer->buffer = str;
14234             _PyUnicodeWriter_Update(writer);
14235             writer->pos += len;
14236             return 0;
14237         }
14238         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14239             return -1;
14240     }
14241     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14242                                   str, 0, len);
14243     writer->pos += len;
14244     return 0;
14245 }
14246 
14247 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)14248 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14249                                 Py_ssize_t start, Py_ssize_t end)
14250 {
14251     Py_UCS4 maxchar;
14252     Py_ssize_t len;
14253 
14254     if (PyUnicode_READY(str) == -1)
14255         return -1;
14256 
14257     assert(0 <= start);
14258     assert(end <= PyUnicode_GET_LENGTH(str));
14259     assert(start <= end);
14260 
14261     if (end == 0)
14262         return 0;
14263 
14264     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14265         return _PyUnicodeWriter_WriteStr(writer, str);
14266 
14267     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14268         maxchar = _PyUnicode_FindMaxChar(str, start, end);
14269     else
14270         maxchar = writer->maxchar;
14271     len = end - start;
14272 
14273     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14274         return -1;
14275 
14276     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14277                                   str, start, len);
14278     writer->pos += len;
14279     return 0;
14280 }
14281 
14282 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)14283 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14284                                   const char *ascii, Py_ssize_t len)
14285 {
14286     if (len == -1)
14287         len = strlen(ascii);
14288 
14289     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14290 
14291     if (writer->buffer == NULL && !writer->overallocate) {
14292         PyObject *str;
14293 
14294         str = _PyUnicode_FromASCII(ascii, len);
14295         if (str == NULL)
14296             return -1;
14297 
14298         writer->readonly = 1;
14299         writer->buffer = str;
14300         _PyUnicodeWriter_Update(writer);
14301         writer->pos += len;
14302         return 0;
14303     }
14304 
14305     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14306         return -1;
14307 
14308     switch (writer->kind)
14309     {
14310     case PyUnicode_1BYTE_KIND:
14311     {
14312         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14313         Py_UCS1 *data = writer->data;
14314 
14315         memcpy(data + writer->pos, str, len);
14316         break;
14317     }
14318     case PyUnicode_2BYTE_KIND:
14319     {
14320         _PyUnicode_CONVERT_BYTES(
14321             Py_UCS1, Py_UCS2,
14322             ascii, ascii + len,
14323             (Py_UCS2 *)writer->data + writer->pos);
14324         break;
14325     }
14326     case PyUnicode_4BYTE_KIND:
14327     {
14328         _PyUnicode_CONVERT_BYTES(
14329             Py_UCS1, Py_UCS4,
14330             ascii, ascii + len,
14331             (Py_UCS4 *)writer->data + writer->pos);
14332         break;
14333     }
14334     default:
14335         Py_UNREACHABLE();
14336     }
14337 
14338     writer->pos += len;
14339     return 0;
14340 }
14341 
14342 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14343 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14344                                    const char *str, Py_ssize_t len)
14345 {
14346     Py_UCS4 maxchar;
14347 
14348     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14349     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14350         return -1;
14351     unicode_write_cstr(writer->buffer, writer->pos, str, len);
14352     writer->pos += len;
14353     return 0;
14354 }
14355 
14356 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14357 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14358 {
14359     PyObject *str;
14360 
14361     if (writer->pos == 0) {
14362         Py_CLEAR(writer->buffer);
14363         _Py_RETURN_UNICODE_EMPTY();
14364     }
14365 
14366     str = writer->buffer;
14367     writer->buffer = NULL;
14368 
14369     if (writer->readonly) {
14370         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14371         return str;
14372     }
14373 
14374     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14375         PyObject *str2;
14376         str2 = resize_compact(str, writer->pos);
14377         if (str2 == NULL) {
14378             Py_DECREF(str);
14379             return NULL;
14380         }
14381         str = str2;
14382     }
14383 
14384     assert(_PyUnicode_CheckConsistency(str, 1));
14385     return unicode_result_ready(str);
14386 }
14387 
14388 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14389 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14390 {
14391     Py_CLEAR(writer->buffer);
14392 }
14393 
14394 #include "stringlib/unicode_format.h"
14395 
14396 PyDoc_STRVAR(format__doc__,
14397              "S.format(*args, **kwargs) -> str\n\
14398 \n\
14399 Return a formatted version of S, using substitutions from args and kwargs.\n\
14400 The substitutions are identified by braces ('{' and '}').");
14401 
14402 PyDoc_STRVAR(format_map__doc__,
14403              "S.format_map(mapping) -> str\n\
14404 \n\
14405 Return a formatted version of S, using substitutions from mapping.\n\
14406 The substitutions are identified by braces ('{' and '}').");
14407 
14408 /*[clinic input]
14409 str.__format__ as unicode___format__
14410 
14411     format_spec: unicode
14412     /
14413 
14414 Return a formatted version of the string as described by format_spec.
14415 [clinic start generated code]*/
14416 
14417 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14418 unicode___format___impl(PyObject *self, PyObject *format_spec)
14419 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14420 {
14421     _PyUnicodeWriter writer;
14422     int ret;
14423 
14424     if (PyUnicode_READY(self) == -1)
14425         return NULL;
14426     _PyUnicodeWriter_Init(&writer);
14427     ret = _PyUnicode_FormatAdvancedWriter(&writer,
14428                                           self, format_spec, 0,
14429                                           PyUnicode_GET_LENGTH(format_spec));
14430     if (ret == -1) {
14431         _PyUnicodeWriter_Dealloc(&writer);
14432         return NULL;
14433     }
14434     return _PyUnicodeWriter_Finish(&writer);
14435 }
14436 
14437 /*[clinic input]
14438 str.__sizeof__ as unicode_sizeof
14439 
14440 Return the size of the string in memory, in bytes.
14441 [clinic start generated code]*/
14442 
14443 static PyObject *
unicode_sizeof_impl(PyObject * self)14444 unicode_sizeof_impl(PyObject *self)
14445 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14446 {
14447     Py_ssize_t size;
14448 
14449     /* If it's a compact object, account for base structure +
14450        character data. */
14451     if (PyUnicode_IS_COMPACT_ASCII(self))
14452         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14453     else if (PyUnicode_IS_COMPACT(self))
14454         size = sizeof(PyCompactUnicodeObject) +
14455             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14456     else {
14457         /* If it is a two-block object, account for base object, and
14458            for character block if present. */
14459         size = sizeof(PyUnicodeObject);
14460         if (_PyUnicode_DATA_ANY(self))
14461             size += (PyUnicode_GET_LENGTH(self) + 1) *
14462                 PyUnicode_KIND(self);
14463     }
14464     /* If the wstr pointer is present, account for it unless it is shared
14465        with the data pointer. Check if the data is not shared. */
14466     if (_PyUnicode_HAS_WSTR_MEMORY(self))
14467         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14468     if (_PyUnicode_HAS_UTF8_MEMORY(self))
14469         size += PyUnicode_UTF8_LENGTH(self) + 1;
14470 
14471     return PyLong_FromSsize_t(size);
14472 }
14473 
14474 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14475 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14476 {
14477     PyObject *copy = _PyUnicode_Copy(v);
14478     if (!copy)
14479         return NULL;
14480     return Py_BuildValue("(N)", copy);
14481 }
14482 
14483 static PyMethodDef unicode_methods[] = {
14484     UNICODE_ENCODE_METHODDEF
14485     UNICODE_REPLACE_METHODDEF
14486     UNICODE_SPLIT_METHODDEF
14487     UNICODE_RSPLIT_METHODDEF
14488     UNICODE_JOIN_METHODDEF
14489     UNICODE_CAPITALIZE_METHODDEF
14490     UNICODE_CASEFOLD_METHODDEF
14491     UNICODE_TITLE_METHODDEF
14492     UNICODE_CENTER_METHODDEF
14493     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14494     UNICODE_EXPANDTABS_METHODDEF
14495     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14496     UNICODE_PARTITION_METHODDEF
14497     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14498     UNICODE_LJUST_METHODDEF
14499     UNICODE_LOWER_METHODDEF
14500     UNICODE_LSTRIP_METHODDEF
14501     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14502     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14503     UNICODE_RJUST_METHODDEF
14504     UNICODE_RSTRIP_METHODDEF
14505     UNICODE_RPARTITION_METHODDEF
14506     UNICODE_SPLITLINES_METHODDEF
14507     UNICODE_STRIP_METHODDEF
14508     UNICODE_SWAPCASE_METHODDEF
14509     UNICODE_TRANSLATE_METHODDEF
14510     UNICODE_UPPER_METHODDEF
14511     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14512     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14513     UNICODE_REMOVEPREFIX_METHODDEF
14514     UNICODE_REMOVESUFFIX_METHODDEF
14515     UNICODE_ISASCII_METHODDEF
14516     UNICODE_ISLOWER_METHODDEF
14517     UNICODE_ISUPPER_METHODDEF
14518     UNICODE_ISTITLE_METHODDEF
14519     UNICODE_ISSPACE_METHODDEF
14520     UNICODE_ISDECIMAL_METHODDEF
14521     UNICODE_ISDIGIT_METHODDEF
14522     UNICODE_ISNUMERIC_METHODDEF
14523     UNICODE_ISALPHA_METHODDEF
14524     UNICODE_ISALNUM_METHODDEF
14525     UNICODE_ISIDENTIFIER_METHODDEF
14526     UNICODE_ISPRINTABLE_METHODDEF
14527     UNICODE_ZFILL_METHODDEF
14528     {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
14529     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14530     UNICODE___FORMAT___METHODDEF
14531     UNICODE_MAKETRANS_METHODDEF
14532     UNICODE_SIZEOF_METHODDEF
14533 #if 0
14534     /* These methods are just used for debugging the implementation. */
14535     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
14536 #endif
14537 
14538     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14539     {NULL, NULL}
14540 };
14541 
14542 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14543 unicode_mod(PyObject *v, PyObject *w)
14544 {
14545     if (!PyUnicode_Check(v))
14546         Py_RETURN_NOTIMPLEMENTED;
14547     return PyUnicode_Format(v, w);
14548 }
14549 
14550 static PyNumberMethods unicode_as_number = {
14551     0,              /*nb_add*/
14552     0,              /*nb_subtract*/
14553     0,              /*nb_multiply*/
14554     unicode_mod,            /*nb_remainder*/
14555 };
14556 
14557 static PySequenceMethods unicode_as_sequence = {
14558     (lenfunc) unicode_length,       /* sq_length */
14559     PyUnicode_Concat,           /* sq_concat */
14560     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14561     (ssizeargfunc) unicode_getitem,     /* sq_item */
14562     0,                  /* sq_slice */
14563     0,                  /* sq_ass_item */
14564     0,                  /* sq_ass_slice */
14565     PyUnicode_Contains,         /* sq_contains */
14566 };
14567 
14568 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14569 unicode_subscript(PyObject* self, PyObject* item)
14570 {
14571     if (PyUnicode_READY(self) == -1)
14572         return NULL;
14573 
14574     if (_PyIndex_Check(item)) {
14575         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14576         if (i == -1 && PyErr_Occurred())
14577             return NULL;
14578         if (i < 0)
14579             i += PyUnicode_GET_LENGTH(self);
14580         return unicode_getitem(self, i);
14581     } else if (PySlice_Check(item)) {
14582         Py_ssize_t start, stop, step, slicelength, i;
14583         size_t cur;
14584         PyObject *result;
14585         const void *src_data;
14586         void *dest_data;
14587         int src_kind, dest_kind;
14588         Py_UCS4 ch, max_char, kind_limit;
14589 
14590         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14591             return NULL;
14592         }
14593         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14594                                             &start, &stop, step);
14595 
14596         if (slicelength <= 0) {
14597             _Py_RETURN_UNICODE_EMPTY();
14598         } else if (start == 0 && step == 1 &&
14599                    slicelength == PyUnicode_GET_LENGTH(self)) {
14600             return unicode_result_unchanged(self);
14601         } else if (step == 1) {
14602             return PyUnicode_Substring(self,
14603                                        start, start + slicelength);
14604         }
14605         /* General case */
14606         src_kind = PyUnicode_KIND(self);
14607         src_data = PyUnicode_DATA(self);
14608         if (!PyUnicode_IS_ASCII(self)) {
14609             kind_limit = kind_maxchar_limit(src_kind);
14610             max_char = 0;
14611             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14612                 ch = PyUnicode_READ(src_kind, src_data, cur);
14613                 if (ch > max_char) {
14614                     max_char = ch;
14615                     if (max_char >= kind_limit)
14616                         break;
14617                 }
14618             }
14619         }
14620         else
14621             max_char = 127;
14622         result = PyUnicode_New(slicelength, max_char);
14623         if (result == NULL)
14624             return NULL;
14625         dest_kind = PyUnicode_KIND(result);
14626         dest_data = PyUnicode_DATA(result);
14627 
14628         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14629             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14630             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14631         }
14632         assert(_PyUnicode_CheckConsistency(result, 1));
14633         return result;
14634     } else {
14635         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14636         return NULL;
14637     }
14638 }
14639 
14640 static PyMappingMethods unicode_as_mapping = {
14641     (lenfunc)unicode_length,        /* mp_length */
14642     (binaryfunc)unicode_subscript,  /* mp_subscript */
14643     (objobjargproc)0,           /* mp_ass_subscript */
14644 };
14645 
14646 
14647 /* Helpers for PyUnicode_Format() */
14648 
14649 struct unicode_formatter_t {
14650     PyObject *args;
14651     int args_owned;
14652     Py_ssize_t arglen, argidx;
14653     PyObject *dict;
14654 
14655     enum PyUnicode_Kind fmtkind;
14656     Py_ssize_t fmtcnt, fmtpos;
14657     const void *fmtdata;
14658     PyObject *fmtstr;
14659 
14660     _PyUnicodeWriter writer;
14661 };
14662 
14663 struct unicode_format_arg_t {
14664     Py_UCS4 ch;
14665     int flags;
14666     Py_ssize_t width;
14667     int prec;
14668     int sign;
14669 };
14670 
14671 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14672 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14673 {
14674     Py_ssize_t argidx = ctx->argidx;
14675 
14676     if (argidx < ctx->arglen) {
14677         ctx->argidx++;
14678         if (ctx->arglen < 0)
14679             return ctx->args;
14680         else
14681             return PyTuple_GetItem(ctx->args, argidx);
14682     }
14683     PyErr_SetString(PyExc_TypeError,
14684                     "not enough arguments for format string");
14685     return NULL;
14686 }
14687 
14688 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14689 
14690 /* Format a float into the writer if the writer is not NULL, or into *p_output
14691    otherwise.
14692 
14693    Return 0 on success, raise an exception and return -1 on error. */
14694 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14695 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14696             PyObject **p_output,
14697             _PyUnicodeWriter *writer)
14698 {
14699     char *p;
14700     double x;
14701     Py_ssize_t len;
14702     int prec;
14703     int dtoa_flags;
14704 
14705     x = PyFloat_AsDouble(v);
14706     if (x == -1.0 && PyErr_Occurred())
14707         return -1;
14708 
14709     prec = arg->prec;
14710     if (prec < 0)
14711         prec = 6;
14712 
14713     if (arg->flags & F_ALT)
14714         dtoa_flags = Py_DTSF_ALT;
14715     else
14716         dtoa_flags = 0;
14717     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14718     if (p == NULL)
14719         return -1;
14720     len = strlen(p);
14721     if (writer) {
14722         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14723             PyMem_Free(p);
14724             return -1;
14725         }
14726     }
14727     else
14728         *p_output = _PyUnicode_FromASCII(p, len);
14729     PyMem_Free(p);
14730     return 0;
14731 }
14732 
14733 /* formatlong() emulates the format codes d, u, o, x and X, and
14734  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14735  * Python's regular ints.
14736  * Return value:  a new PyUnicodeObject*, or NULL if error.
14737  *     The output string is of the form
14738  *         "-"? ("0x" | "0X")? digit+
14739  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14740  *         set in flags.  The case of hex digits will be correct,
14741  *     There will be at least prec digits, zero-filled on the left if
14742  *         necessary to get that many.
14743  * val          object to be converted
14744  * flags        bitmask of format flags; only F_ALT is looked at
14745  * prec         minimum number of digits; 0-fill on left if needed
14746  * type         a character in [duoxX]; u acts the same as d
14747  *
14748  * CAUTION:  o, x and X conversions on regular ints can never
14749  * produce a '-' sign, but can for Python's unbounded ints.
14750  */
14751 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14752 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14753 {
14754     PyObject *result = NULL;
14755     char *buf;
14756     Py_ssize_t i;
14757     int sign;           /* 1 if '-', else 0 */
14758     int len;            /* number of characters */
14759     Py_ssize_t llen;
14760     int numdigits;      /* len == numnondigits + numdigits */
14761     int numnondigits = 0;
14762 
14763     /* Avoid exceeding SSIZE_T_MAX */
14764     if (prec > INT_MAX-3) {
14765         PyErr_SetString(PyExc_OverflowError,
14766                         "precision too large");
14767         return NULL;
14768     }
14769 
14770     assert(PyLong_Check(val));
14771 
14772     switch (type) {
14773     default:
14774         Py_UNREACHABLE();
14775     case 'd':
14776     case 'i':
14777     case 'u':
14778         /* int and int subclasses should print numerically when a numeric */
14779         /* format code is used (see issue18780) */
14780         result = PyNumber_ToBase(val, 10);
14781         break;
14782     case 'o':
14783         numnondigits = 2;
14784         result = PyNumber_ToBase(val, 8);
14785         break;
14786     case 'x':
14787     case 'X':
14788         numnondigits = 2;
14789         result = PyNumber_ToBase(val, 16);
14790         break;
14791     }
14792     if (!result)
14793         return NULL;
14794 
14795     assert(unicode_modifiable(result));
14796     assert(PyUnicode_IS_READY(result));
14797     assert(PyUnicode_IS_ASCII(result));
14798 
14799     /* To modify the string in-place, there can only be one reference. */
14800     if (Py_REFCNT(result) != 1) {
14801         Py_DECREF(result);
14802         PyErr_BadInternalCall();
14803         return NULL;
14804     }
14805     buf = PyUnicode_DATA(result);
14806     llen = PyUnicode_GET_LENGTH(result);
14807     if (llen > INT_MAX) {
14808         Py_DECREF(result);
14809         PyErr_SetString(PyExc_ValueError,
14810                         "string too large in _PyUnicode_FormatLong");
14811         return NULL;
14812     }
14813     len = (int)llen;
14814     sign = buf[0] == '-';
14815     numnondigits += sign;
14816     numdigits = len - numnondigits;
14817     assert(numdigits > 0);
14818 
14819     /* Get rid of base marker unless F_ALT */
14820     if (((alt) == 0 &&
14821         (type == 'o' || type == 'x' || type == 'X'))) {
14822         assert(buf[sign] == '0');
14823         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14824                buf[sign+1] == 'o');
14825         numnondigits -= 2;
14826         buf += 2;
14827         len -= 2;
14828         if (sign)
14829             buf[0] = '-';
14830         assert(len == numnondigits + numdigits);
14831         assert(numdigits > 0);
14832     }
14833 
14834     /* Fill with leading zeroes to meet minimum width. */
14835     if (prec > numdigits) {
14836         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14837                                 numnondigits + prec);
14838         char *b1;
14839         if (!r1) {
14840             Py_DECREF(result);
14841             return NULL;
14842         }
14843         b1 = PyBytes_AS_STRING(r1);
14844         for (i = 0; i < numnondigits; ++i)
14845             *b1++ = *buf++;
14846         for (i = 0; i < prec - numdigits; i++)
14847             *b1++ = '0';
14848         for (i = 0; i < numdigits; i++)
14849             *b1++ = *buf++;
14850         *b1 = '\0';
14851         Py_DECREF(result);
14852         result = r1;
14853         buf = PyBytes_AS_STRING(result);
14854         len = numnondigits + prec;
14855     }
14856 
14857     /* Fix up case for hex conversions. */
14858     if (type == 'X') {
14859         /* Need to convert all lower case letters to upper case.
14860            and need to convert 0x to 0X (and -0x to -0X). */
14861         for (i = 0; i < len; i++)
14862             if (buf[i] >= 'a' && buf[i] <= 'x')
14863                 buf[i] -= 'a'-'A';
14864     }
14865     if (!PyUnicode_Check(result)
14866         || buf != PyUnicode_DATA(result)) {
14867         PyObject *unicode;
14868         unicode = _PyUnicode_FromASCII(buf, len);
14869         Py_DECREF(result);
14870         result = unicode;
14871     }
14872     else if (len != PyUnicode_GET_LENGTH(result)) {
14873         if (PyUnicode_Resize(&result, len) < 0)
14874             Py_CLEAR(result);
14875     }
14876     return result;
14877 }
14878 
14879 /* Format an integer or a float as an integer.
14880  * Return 1 if the number has been formatted into the writer,
14881  *        0 if the number has been formatted into *p_output
14882  *       -1 and raise an exception on error */
14883 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14884 mainformatlong(PyObject *v,
14885                struct unicode_format_arg_t *arg,
14886                PyObject **p_output,
14887                _PyUnicodeWriter *writer)
14888 {
14889     PyObject *iobj, *res;
14890     char type = (char)arg->ch;
14891 
14892     if (!PyNumber_Check(v))
14893         goto wrongtype;
14894 
14895     /* make sure number is a type of integer for o, x, and X */
14896     if (!PyLong_Check(v)) {
14897         if (type == 'o' || type == 'x' || type == 'X') {
14898             iobj = _PyNumber_Index(v);
14899         }
14900         else {
14901             iobj = PyNumber_Long(v);
14902         }
14903         if (iobj == NULL ) {
14904             if (PyErr_ExceptionMatches(PyExc_TypeError))
14905                 goto wrongtype;
14906             return -1;
14907         }
14908         assert(PyLong_Check(iobj));
14909     }
14910     else {
14911         iobj = v;
14912         Py_INCREF(iobj);
14913     }
14914 
14915     if (PyLong_CheckExact(v)
14916         && arg->width == -1 && arg->prec == -1
14917         && !(arg->flags & (F_SIGN | F_BLANK))
14918         && type != 'X')
14919     {
14920         /* Fast path */
14921         int alternate = arg->flags & F_ALT;
14922         int base;
14923 
14924         switch(type)
14925         {
14926             default:
14927                 Py_UNREACHABLE();
14928             case 'd':
14929             case 'i':
14930             case 'u':
14931                 base = 10;
14932                 break;
14933             case 'o':
14934                 base = 8;
14935                 break;
14936             case 'x':
14937             case 'X':
14938                 base = 16;
14939                 break;
14940         }
14941 
14942         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14943             Py_DECREF(iobj);
14944             return -1;
14945         }
14946         Py_DECREF(iobj);
14947         return 1;
14948     }
14949 
14950     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14951     Py_DECREF(iobj);
14952     if (res == NULL)
14953         return -1;
14954     *p_output = res;
14955     return 0;
14956 
14957 wrongtype:
14958     switch(type)
14959     {
14960         case 'o':
14961         case 'x':
14962         case 'X':
14963             PyErr_Format(PyExc_TypeError,
14964                     "%%%c format: an integer is required, "
14965                     "not %.200s",
14966                     type, Py_TYPE(v)->tp_name);
14967             break;
14968         default:
14969             PyErr_Format(PyExc_TypeError,
14970                     "%%%c format: a real number is required, "
14971                     "not %.200s",
14972                     type, Py_TYPE(v)->tp_name);
14973             break;
14974     }
14975     return -1;
14976 }
14977 
14978 static Py_UCS4
formatchar(PyObject * v)14979 formatchar(PyObject *v)
14980 {
14981     /* presume that the buffer is at least 3 characters long */
14982     if (PyUnicode_Check(v)) {
14983         if (PyUnicode_GET_LENGTH(v) == 1) {
14984             return PyUnicode_READ_CHAR(v, 0);
14985         }
14986         goto onError;
14987     }
14988     else {
14989         int overflow;
14990         long x = PyLong_AsLongAndOverflow(v, &overflow);
14991         if (x == -1 && PyErr_Occurred()) {
14992             if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14993                 goto onError;
14994             }
14995             return (Py_UCS4) -1;
14996         }
14997 
14998         if (x < 0 || x > MAX_UNICODE) {
14999             /* this includes an overflow in converting to C long */
15000             PyErr_SetString(PyExc_OverflowError,
15001                             "%c arg not in range(0x110000)");
15002             return (Py_UCS4) -1;
15003         }
15004 
15005         return (Py_UCS4) x;
15006     }
15007 
15008   onError:
15009     PyErr_SetString(PyExc_TypeError,
15010                     "%c requires int or char");
15011     return (Py_UCS4) -1;
15012 }
15013 
/* Parse the options of one format specifier: an optional "%(name)" mapping
   key, the flags, the width, the precision and an optional "h"/"l"/"L"
   length prefix.

   The results are stored in arg->flags, arg->width and arg->prec, and
   arg->ch is left holding the last character read from the format string.
   ctx->fmtpos and ctx->fmtcnt are advanced past every character consumed.
   When a "(name)" key is present, ctx->args is replaced by the value looked
   up in ctx->dict (an owned reference, flagged by ctx->args_owned).

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format-string character at the current position (no advance). */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;     /* number of currently open parentheses */

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')'; exclude it from the key. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Drop the value fetched for a previous "%(name)" specifier. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* Same arglen/argidx values PyUnicode_Format() uses for a
           non-tuple argument. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* Not a flag character: leave it in arg->ch for the next stage. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* "%*s": the width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify within abs(width). */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            /* NOTE(review): if the argument can be PY_SSIZE_T_MIN this
               negation overflows a signed type -- confirm whether an
               explicit guard is wanted here. */
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            /* Reject the digit before width*10 + digit could exceed
               PY_SSIZE_T_MAX. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." with no digits gives a precision of 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* "%.*f": the precision is taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is clamped to 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Reject the digit before prec*10 + digit could exceed
                   INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* The format string ended before a conversion character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
15192 
15193 /* Format one argument. Supported conversion specifiers:
15194 
15195    - "s", "r", "a": any type
15196    - "i", "d", "u": int or float
15197    - "o", "x", "X": int
15198    - "e", "E", "f", "F", "g", "G": float
15199    - "c": int or str (1 character)
15200 
15201    When possible, the output is written directly into the Unicode writer
15202    (ctx->writer). A string is created when padding is required.
15203 
15204    Return 0 if the argument has been formatted into *p_str,
15205           1 if the argument has been written into ctx->writer,
15206          -1 on error. */
15207 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)15208 unicode_format_arg_format(struct unicode_formatter_t *ctx,
15209                           struct unicode_format_arg_t *arg,
15210                           PyObject **p_str)
15211 {
15212     PyObject *v;
15213     _PyUnicodeWriter *writer = &ctx->writer;
15214 
15215     if (ctx->fmtcnt == 0)
15216         ctx->writer.overallocate = 0;
15217 
15218     v = unicode_format_getnextarg(ctx);
15219     if (v == NULL)
15220         return -1;
15221 
15222 
15223     switch (arg->ch) {
15224     case 's':
15225     case 'r':
15226     case 'a':
15227         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15228             /* Fast path */
15229             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15230                 return -1;
15231             return 1;
15232         }
15233 
15234         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15235             *p_str = v;
15236             Py_INCREF(*p_str);
15237         }
15238         else {
15239             if (arg->ch == 's')
15240                 *p_str = PyObject_Str(v);
15241             else if (arg->ch == 'r')
15242                 *p_str = PyObject_Repr(v);
15243             else
15244                 *p_str = PyObject_ASCII(v);
15245         }
15246         break;
15247 
15248     case 'i':
15249     case 'd':
15250     case 'u':
15251     case 'o':
15252     case 'x':
15253     case 'X':
15254     {
15255         int ret = mainformatlong(v, arg, p_str, writer);
15256         if (ret != 0)
15257             return ret;
15258         arg->sign = 1;
15259         break;
15260     }
15261 
15262     case 'e':
15263     case 'E':
15264     case 'f':
15265     case 'F':
15266     case 'g':
15267     case 'G':
15268         if (arg->width == -1 && arg->prec == -1
15269             && !(arg->flags & (F_SIGN | F_BLANK)))
15270         {
15271             /* Fast path */
15272             if (formatfloat(v, arg, NULL, writer) == -1)
15273                 return -1;
15274             return 1;
15275         }
15276 
15277         arg->sign = 1;
15278         if (formatfloat(v, arg, p_str, NULL) == -1)
15279             return -1;
15280         break;
15281 
15282     case 'c':
15283     {
15284         Py_UCS4 ch = formatchar(v);
15285         if (ch == (Py_UCS4) -1)
15286             return -1;
15287         if (arg->width == -1 && arg->prec == -1) {
15288             /* Fast path */
15289             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15290                 return -1;
15291             return 1;
15292         }
15293         *p_str = PyUnicode_FromOrdinal(ch);
15294         break;
15295     }
15296 
15297     default:
15298         PyErr_Format(PyExc_ValueError,
15299                      "unsupported format character '%c' (0x%x) "
15300                      "at index %zd",
15301                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15302                      (int)arg->ch,
15303                      ctx->fmtpos - 1);
15304         return -1;
15305     }
15306     if (*p_str == NULL)
15307         return -1;
15308     assert (PyUnicode_Check(*p_str));
15309     return 0;
15310 }
15311 
/* Write the already formatted string `str` into ctx->writer, applying the
   width, precision, sign and alternate-form ("0x"/"0X"/"0o" prefix) handling
   described by `arg`.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero-fill applies only when arg->sign is set and the "0" flag
       was given. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        /* No padding, truncation or sign insertion is needed. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string already starts with a sign: remember it and
               exclude it from the characters copied later. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character will be written. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character only if left padding will be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* len does not include the sign character, so one extra slot is needed
       when the digits alone already fill the whole width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written later, after the padding. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign occupies one column of the requested width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are removed from both the remaining
           width and the count of characters still to copy. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Left padding consumed the slack: no right padding will follow. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* Right padding is always written with spaces, whatever `fill` is. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15466 
15467 /* Helper of PyUnicode_Format(): format one arg.
15468    Return 0 on success, raise an exception and return -1 on error. */
15469 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15470 unicode_format_arg(struct unicode_formatter_t *ctx)
15471 {
15472     struct unicode_format_arg_t arg;
15473     PyObject *str;
15474     int ret;
15475 
15476     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15477     if (arg.ch == '%') {
15478         ctx->fmtpos++;
15479         ctx->fmtcnt--;
15480         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15481             return -1;
15482         return 0;
15483     }
15484     arg.flags = 0;
15485     arg.width = -1;
15486     arg.prec = -1;
15487     arg.sign = 0;
15488     str = NULL;
15489 
15490     ret = unicode_format_arg_parse(ctx, &arg);
15491     if (ret == -1)
15492         return -1;
15493 
15494     ret = unicode_format_arg_format(ctx, &arg, &str);
15495     if (ret == -1)
15496         return -1;
15497 
15498     if (ret != 1) {
15499         ret = unicode_format_arg_output(ctx, &arg, str);
15500         Py_DECREF(str);
15501         if (ret == -1)
15502             return -1;
15503     }
15504 
15505     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15506         PyErr_SetString(PyExc_TypeError,
15507                         "not all arguments converted during string formatting");
15508         return -1;
15509     }
15510     return 0;
15511 }
15512 
15513 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15514 PyUnicode_Format(PyObject *format, PyObject *args)
15515 {
15516     struct unicode_formatter_t ctx;
15517 
15518     if (format == NULL || args == NULL) {
15519         PyErr_BadInternalCall();
15520         return NULL;
15521     }
15522 
15523     if (ensure_unicode(format) < 0)
15524         return NULL;
15525 
15526     ctx.fmtstr = format;
15527     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15528     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15529     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15530     ctx.fmtpos = 0;
15531 
15532     _PyUnicodeWriter_Init(&ctx.writer);
15533     ctx.writer.min_length = ctx.fmtcnt + 100;
15534     ctx.writer.overallocate = 1;
15535 
15536     if (PyTuple_Check(args)) {
15537         ctx.arglen = PyTuple_Size(args);
15538         ctx.argidx = 0;
15539     }
15540     else {
15541         ctx.arglen = -1;
15542         ctx.argidx = -2;
15543     }
15544     ctx.args_owned = 0;
15545     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15546         ctx.dict = args;
15547     else
15548         ctx.dict = NULL;
15549     ctx.args = args;
15550 
15551     while (--ctx.fmtcnt >= 0) {
15552         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15553             Py_ssize_t nonfmtpos;
15554 
15555             nonfmtpos = ctx.fmtpos++;
15556             while (ctx.fmtcnt >= 0 &&
15557                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15558                 ctx.fmtpos++;
15559                 ctx.fmtcnt--;
15560             }
15561             if (ctx.fmtcnt < 0) {
15562                 ctx.fmtpos--;
15563                 ctx.writer.overallocate = 0;
15564             }
15565 
15566             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15567                                                 nonfmtpos, ctx.fmtpos) < 0)
15568                 goto onError;
15569         }
15570         else {
15571             ctx.fmtpos++;
15572             if (unicode_format_arg(&ctx) == -1)
15573                 goto onError;
15574         }
15575     }
15576 
15577     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15578         PyErr_SetString(PyExc_TypeError,
15579                         "not all arguments converted during string formatting");
15580         goto onError;
15581     }
15582 
15583     if (ctx.args_owned) {
15584         Py_DECREF(ctx.args);
15585     }
15586     return _PyUnicodeWriter_Finish(&ctx.writer);
15587 
15588   onError:
15589     _PyUnicodeWriter_Dealloc(&ctx.writer);
15590     if (ctx.args_owned) {
15591         Py_DECREF(ctx.args);
15592     }
15593     return NULL;
15594 }
15595 
15596 static PyObject *
15597 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15598 
15599 /*[clinic input]
15600 @classmethod
15601 str.__new__ as unicode_new
15602 
15603     object as x: object = NULL
15604     encoding: str = NULL
15605     errors: str = NULL
15606 
15607 [clinic start generated code]*/
15608 
15609 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15610 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15611                  const char *errors)
15612 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15613 {
15614     PyObject *unicode;
15615     if (x == NULL) {
15616         unicode = unicode_new_empty();
15617     }
15618     else if (encoding == NULL && errors == NULL) {
15619         unicode = PyObject_Str(x);
15620     }
15621     else {
15622         unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15623     }
15624 
15625     if (unicode != NULL && type != &PyUnicode_Type) {
15626         Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15627     }
15628     return unicode;
15629 }
15630 
15631 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15632 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15633 {
15634     PyObject *self;
15635     Py_ssize_t length, char_size;
15636     int share_wstr, share_utf8;
15637     unsigned int kind;
15638     void *data;
15639 
15640     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15641     assert(_PyUnicode_CHECK(unicode));
15642     if (PyUnicode_READY(unicode) == -1) {
15643         return NULL;
15644     }
15645 
15646     self = type->tp_alloc(type, 0);
15647     if (self == NULL) {
15648         return NULL;
15649     }
15650     kind = PyUnicode_KIND(unicode);
15651     length = PyUnicode_GET_LENGTH(unicode);
15652 
15653     _PyUnicode_LENGTH(self) = length;
15654 #ifdef Py_DEBUG
15655     _PyUnicode_HASH(self) = -1;
15656 #else
15657     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15658 #endif
15659     _PyUnicode_STATE(self).interned = 0;
15660     _PyUnicode_STATE(self).kind = kind;
15661     _PyUnicode_STATE(self).compact = 0;
15662     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15663     _PyUnicode_STATE(self).ready = 1;
15664     _PyUnicode_WSTR(self) = NULL;
15665     _PyUnicode_UTF8_LENGTH(self) = 0;
15666     _PyUnicode_UTF8(self) = NULL;
15667     _PyUnicode_WSTR_LENGTH(self) = 0;
15668     _PyUnicode_DATA_ANY(self) = NULL;
15669 
15670     share_utf8 = 0;
15671     share_wstr = 0;
15672     if (kind == PyUnicode_1BYTE_KIND) {
15673         char_size = 1;
15674         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15675             share_utf8 = 1;
15676     }
15677     else if (kind == PyUnicode_2BYTE_KIND) {
15678         char_size = 2;
15679         if (sizeof(wchar_t) == 2)
15680             share_wstr = 1;
15681     }
15682     else {
15683         assert(kind == PyUnicode_4BYTE_KIND);
15684         char_size = 4;
15685         if (sizeof(wchar_t) == 4)
15686             share_wstr = 1;
15687     }
15688 
15689     /* Ensure we won't overflow the length. */
15690     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15691         PyErr_NoMemory();
15692         goto onError;
15693     }
15694     data = PyObject_Malloc((length + 1) * char_size);
15695     if (data == NULL) {
15696         PyErr_NoMemory();
15697         goto onError;
15698     }
15699 
15700     _PyUnicode_DATA_ANY(self) = data;
15701     if (share_utf8) {
15702         _PyUnicode_UTF8_LENGTH(self) = length;
15703         _PyUnicode_UTF8(self) = data;
15704     }
15705     if (share_wstr) {
15706         _PyUnicode_WSTR_LENGTH(self) = length;
15707         _PyUnicode_WSTR(self) = (wchar_t *)data;
15708     }
15709 
15710     memcpy(data, PyUnicode_DATA(unicode),
15711               kind * (length + 1));
15712     assert(_PyUnicode_CheckConsistency(self, 1));
15713 #ifdef Py_DEBUG
15714     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15715 #endif
15716     return self;
15717 
15718 onError:
15719     Py_DECREF(self);
15720     return NULL;
15721 }
15722 
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15734 
15735 static PyObject *unicode_iter(PyObject *seq);
15736 
/* Type object named "str". The initializer is positional: each entry is
   labelled with the slot it fills, and slots set to 0 are left empty here.
   The type can be subclassed (Py_TPFLAGS_BASETYPE) and has
   PyBaseObject_Type as its base. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15781 
15782 /* Initialize the Unicode implementation */
15783 
15784 PyStatus
_PyUnicode_Init(PyInterpreterState * interp)15785 _PyUnicode_Init(PyInterpreterState *interp)
15786 {
15787     struct _Py_unicode_state *state = &interp->unicode;
15788     if (unicode_create_empty_string_singleton(state) < 0) {
15789         return _PyStatus_NO_MEMORY();
15790     }
15791 
15792     if (_Py_IsMainInterpreter(interp)) {
15793         /* initialize the linebreak bloom filter */
15794         const Py_UCS2 linebreak[] = {
15795             0x000A, /* LINE FEED */
15796             0x000D, /* CARRIAGE RETURN */
15797             0x001C, /* FILE SEPARATOR */
15798             0x001D, /* GROUP SEPARATOR */
15799             0x001E, /* RECORD SEPARATOR */
15800             0x0085, /* NEXT LINE */
15801             0x2028, /* LINE SEPARATOR */
15802             0x2029, /* PARAGRAPH SEPARATOR */
15803         };
15804         bloom_linebreak = make_bloom_mask(
15805             PyUnicode_2BYTE_KIND, linebreak,
15806             Py_ARRAY_LENGTH(linebreak));
15807     }
15808 
15809     return _PyStatus_OK();
15810 }
15811 
15812 
15813 PyStatus
_PyUnicode_InitTypes(void)15814 _PyUnicode_InitTypes(void)
15815 {
15816     if (PyType_Ready(&PyUnicode_Type) < 0) {
15817         return _PyStatus_ERR("Can't initialize unicode type");
15818     }
15819     if (PyType_Ready(&EncodingMapType) < 0) {
15820          return _PyStatus_ERR("Can't initialize encoding map type");
15821     }
15822     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15823         return _PyStatus_ERR("Can't initialize field name iterator type");
15824     }
15825     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15826         return _PyStatus_ERR("Can't initialize formatter iter type");
15827     }
15828     return _PyStatus_OK();
15829 }
15830 
15831 
15832 void
PyUnicode_InternInPlace(PyObject ** p)15833 PyUnicode_InternInPlace(PyObject **p)
15834 {
15835     PyObject *s = *p;
15836 #ifdef Py_DEBUG
15837     assert(s != NULL);
15838     assert(_PyUnicode_CHECK(s));
15839 #else
15840     if (s == NULL || !PyUnicode_Check(s)) {
15841         return;
15842     }
15843 #endif
15844 
15845     /* If it's a subclass, we don't really know what putting
15846        it in the interned dict might do. */
15847     if (!PyUnicode_CheckExact(s)) {
15848         return;
15849     }
15850 
15851     if (PyUnicode_CHECK_INTERNED(s)) {
15852         return;
15853     }
15854 
15855 #ifdef INTERNED_STRINGS
15856     if (PyUnicode_READY(s) == -1) {
15857         PyErr_Clear();
15858         return;
15859     }
15860 
15861     if (interned == NULL) {
15862         interned = PyDict_New();
15863         if (interned == NULL) {
15864             PyErr_Clear(); /* Don't leave an exception */
15865             return;
15866         }
15867     }
15868 
15869     PyObject *t = PyDict_SetDefault(interned, s, s);
15870     if (t == NULL) {
15871         PyErr_Clear();
15872         return;
15873     }
15874 
15875     if (t != s) {
15876         Py_INCREF(t);
15877         Py_SETREF(*p, t);
15878         return;
15879     }
15880 
15881     /* The two references in interned dict (key and value) are not counted by
15882        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15883        this. */
15884     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15885     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15886 #else
15887     // PyDict expects that interned strings have their hash
15888     // (PyASCIIObject.hash) already computed.
15889     (void)unicode_hash(s);
15890 #endif
15891 }
15892 
15893 void
PyUnicode_InternImmortal(PyObject ** p)15894 PyUnicode_InternImmortal(PyObject **p)
15895 {
15896     if (PyErr_WarnEx(PyExc_DeprecationWarning,
15897             "PyUnicode_InternImmortal() is deprecated; "
15898             "use PyUnicode_InternInPlace() instead", 1) < 0)
15899     {
15900         // The function has no return value, the exception cannot
15901         // be reported to the caller, so just log it.
15902         PyErr_WriteUnraisable(NULL);
15903     }
15904 
15905     PyUnicode_InternInPlace(p);
15906     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15907         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15908         Py_INCREF(*p);
15909     }
15910 }
15911 
15912 PyObject *
PyUnicode_InternFromString(const char * cp)15913 PyUnicode_InternFromString(const char *cp)
15914 {
15915     PyObject *s = PyUnicode_FromString(cp);
15916     if (s == NULL)
15917         return NULL;
15918     PyUnicode_InternInPlace(&s);
15919     return s;
15920 }
15921 
15922 
15923 void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15924 _PyUnicode_ClearInterned(PyInterpreterState *interp)
15925 {
15926     if (!_Py_IsMainInterpreter(interp)) {
15927         // interned dict is shared by all interpreters
15928         return;
15929     }
15930 
15931     if (interned == NULL) {
15932         return;
15933     }
15934     assert(PyDict_CheckExact(interned));
15935 
15936     /* Interned unicode strings are not forcibly deallocated; rather, we give
15937        them their stolen references back, and then clear and DECREF the
15938        interned dict. */
15939 
15940 #ifdef INTERNED_STATS
15941     fprintf(stderr, "releasing %zd interned strings\n",
15942             PyDict_GET_SIZE(interned));
15943 
15944     Py_ssize_t immortal_size = 0, mortal_size = 0;
15945 #endif
15946     Py_ssize_t pos = 0;
15947     PyObject *s, *ignored_value;
15948     while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15949         assert(PyUnicode_IS_READY(s));
15950 
15951         switch (PyUnicode_CHECK_INTERNED(s)) {
15952         case SSTATE_INTERNED_IMMORTAL:
15953             Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15954 #ifdef INTERNED_STATS
15955             immortal_size += PyUnicode_GET_LENGTH(s);
15956 #endif
15957             break;
15958         case SSTATE_INTERNED_MORTAL:
15959             // Restore the two references (key and value) ignored
15960             // by PyUnicode_InternInPlace().
15961             Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15962 #ifdef INTERNED_STATS
15963             mortal_size += PyUnicode_GET_LENGTH(s);
15964 #endif
15965             break;
15966         case SSTATE_NOT_INTERNED:
15967             /* fall through */
15968         default:
15969             Py_UNREACHABLE();
15970         }
15971         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15972     }
15973 #ifdef INTERNED_STATS
15974     fprintf(stderr,
15975             "total size of all interned strings: %zd/%zd mortal/immortal\n",
15976             mortal_size, immortal_size);
15977 #endif
15978 
15979     PyDict_Clear(interned);
15980     Py_CLEAR(interned);
15981 }
15982 
15983 
15984 /********************* Unicode Iterator **************************/
15985 
15986 typedef struct {
15987     PyObject_HEAD
15988     Py_ssize_t it_index;
15989     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15990 } unicodeiterobject;
15991 
15992 static void
unicodeiter_dealloc(unicodeiterobject * it)15993 unicodeiter_dealloc(unicodeiterobject *it)
15994 {
15995     _PyObject_GC_UNTRACK(it);
15996     Py_XDECREF(it->it_seq);
15997     PyObject_GC_Del(it);
15998 }
15999 
16000 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)16001 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
16002 {
16003     Py_VISIT(it->it_seq);
16004     return 0;
16005 }
16006 
16007 static PyObject *
unicodeiter_next(unicodeiterobject * it)16008 unicodeiter_next(unicodeiterobject *it)
16009 {
16010     PyObject *seq, *item;
16011 
16012     assert(it != NULL);
16013     seq = it->it_seq;
16014     if (seq == NULL)
16015         return NULL;
16016     assert(_PyUnicode_CHECK(seq));
16017 
16018     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
16019         int kind = PyUnicode_KIND(seq);
16020         const void *data = PyUnicode_DATA(seq);
16021         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
16022         item = PyUnicode_FromOrdinal(chr);
16023         if (item != NULL)
16024             ++it->it_index;
16025         return item;
16026     }
16027 
16028     it->it_seq = NULL;
16029     Py_DECREF(seq);
16030     return NULL;
16031 }
16032 
16033 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16034 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16035 {
16036     Py_ssize_t len = 0;
16037     if (it->it_seq)
16038         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16039     return PyLong_FromSsize_t(len);
16040 }
16041 
16042 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16043 
16044 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16045 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16046 {
16047     _Py_IDENTIFIER(iter);
16048     if (it->it_seq != NULL) {
16049         return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
16050                              it->it_seq, it->it_index);
16051     } else {
16052         PyObject *u = (PyObject *)_PyUnicode_New(0);
16053         if (u == NULL)
16054             return NULL;
16055         return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
16056     }
16057 }
16058 
16059 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16060 
16061 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)16062 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16063 {
16064     Py_ssize_t index = PyLong_AsSsize_t(state);
16065     if (index == -1 && PyErr_Occurred())
16066         return NULL;
16067     if (it->it_seq != NULL) {
16068         if (index < 0)
16069             index = 0;
16070         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16071             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16072         it->it_index = index;
16073     }
16074     Py_RETURN_NONE;
16075 }
16076 
16077 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16078 
16079 static PyMethodDef unicodeiter_methods[] = {
16080     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
16081      length_hint_doc},
16082     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16083      reduce_doc},
16084     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
16085      setstate_doc},
16086     {NULL,      NULL}       /* sentinel */
16087 };
16088 
16089 PyTypeObject PyUnicodeIter_Type = {
16090     PyVarObject_HEAD_INIT(&PyType_Type, 0)
16091     "str_iterator",         /* tp_name */
16092     sizeof(unicodeiterobject),      /* tp_basicsize */
16093     0,                  /* tp_itemsize */
16094     /* methods */
16095     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
16096     0,                  /* tp_vectorcall_offset */
16097     0,                  /* tp_getattr */
16098     0,                  /* tp_setattr */
16099     0,                  /* tp_as_async */
16100     0,                  /* tp_repr */
16101     0,                  /* tp_as_number */
16102     0,                  /* tp_as_sequence */
16103     0,                  /* tp_as_mapping */
16104     0,                  /* tp_hash */
16105     0,                  /* tp_call */
16106     0,                  /* tp_str */
16107     PyObject_GenericGetAttr,        /* tp_getattro */
16108     0,                  /* tp_setattro */
16109     0,                  /* tp_as_buffer */
16110     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16111     0,                  /* tp_doc */
16112     (traverseproc)unicodeiter_traverse, /* tp_traverse */
16113     0,                  /* tp_clear */
16114     0,                  /* tp_richcompare */
16115     0,                  /* tp_weaklistoffset */
16116     PyObject_SelfIter,          /* tp_iter */
16117     (iternextfunc)unicodeiter_next,     /* tp_iternext */
16118     unicodeiter_methods,            /* tp_methods */
16119     0,
16120 };
16121 
16122 static PyObject *
unicode_iter(PyObject * seq)16123 unicode_iter(PyObject *seq)
16124 {
16125     unicodeiterobject *it;
16126 
16127     if (!PyUnicode_Check(seq)) {
16128         PyErr_BadInternalCall();
16129         return NULL;
16130     }
16131     if (PyUnicode_READY(seq) == -1)
16132         return NULL;
16133     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16134     if (it == NULL)
16135         return NULL;
16136     it->it_index = 0;
16137     Py_INCREF(seq);
16138     it->it_seq = seq;
16139     _PyObject_GC_TRACK(it);
16140     return (PyObject *)it;
16141 }
16142 
16143 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)16144 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16145 {
16146     int res;
16147     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16148     if (res == -2) {
16149         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16150         return -1;
16151     }
16152     if (res < 0) {
16153         PyErr_NoMemory();
16154         return -1;
16155     }
16156     return 0;
16157 }
16158 
16159 
16160 static int
config_get_codec_name(wchar_t ** config_encoding)16161 config_get_codec_name(wchar_t **config_encoding)
16162 {
16163     char *encoding;
16164     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16165         return -1;
16166     }
16167 
16168     PyObject *name_obj = NULL;
16169     PyObject *codec = _PyCodec_Lookup(encoding);
16170     PyMem_RawFree(encoding);
16171 
16172     if (!codec)
16173         goto error;
16174 
16175     name_obj = PyObject_GetAttrString(codec, "name");
16176     Py_CLEAR(codec);
16177     if (!name_obj) {
16178         goto error;
16179     }
16180 
16181     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16182     Py_DECREF(name_obj);
16183     if (wname == NULL) {
16184         goto error;
16185     }
16186 
16187     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16188     if (raw_wname == NULL) {
16189         PyMem_Free(wname);
16190         PyErr_NoMemory();
16191         goto error;
16192     }
16193 
16194     PyMem_RawFree(*config_encoding);
16195     *config_encoding = raw_wname;
16196 
16197     PyMem_Free(wname);
16198     return 0;
16199 
16200 error:
16201     Py_XDECREF(codec);
16202     Py_XDECREF(name_obj);
16203     return -1;
16204 }
16205 
16206 
16207 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)16208 init_stdio_encoding(PyInterpreterState *interp)
16209 {
16210     /* Update the stdio encoding to the normalized Python codec name. */
16211     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16212     if (config_get_codec_name(&config->stdio_encoding) < 0) {
16213         return _PyStatus_ERR("failed to get the Python codec name "
16214                              "of the stdio encoding");
16215     }
16216     return _PyStatus_OK();
16217 }
16218 
16219 
16220 static int
init_fs_codec(PyInterpreterState * interp)16221 init_fs_codec(PyInterpreterState *interp)
16222 {
16223     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16224 
16225     _Py_error_handler error_handler;
16226     error_handler = get_error_handler_wide(config->filesystem_errors);
16227     if (error_handler == _Py_ERROR_UNKNOWN) {
16228         PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16229         return -1;
16230     }
16231 
16232     char *encoding, *errors;
16233     if (encode_wstr_utf8(config->filesystem_encoding,
16234                          &encoding,
16235                          "filesystem_encoding") < 0) {
16236         return -1;
16237     }
16238 
16239     if (encode_wstr_utf8(config->filesystem_errors,
16240                          &errors,
16241                          "filesystem_errors") < 0) {
16242         PyMem_RawFree(encoding);
16243         return -1;
16244     }
16245 
16246     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16247     PyMem_RawFree(fs_codec->encoding);
16248     fs_codec->encoding = encoding;
16249     /* encoding has been normalized by init_fs_encoding() */
16250     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16251     PyMem_RawFree(fs_codec->errors);
16252     fs_codec->errors = errors;
16253     fs_codec->error_handler = error_handler;
16254 
16255 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16256     assert(fs_codec->utf8 == 1);
16257 #endif
16258 
16259     /* At this point, PyUnicode_EncodeFSDefault() and
16260        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16261        the C implementation of the filesystem encoding. */
16262 
16263     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16264        global configuration variables. */
16265     if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16266                                   fs_codec->errors) < 0) {
16267         PyErr_NoMemory();
16268         return -1;
16269     }
16270     return 0;
16271 }
16272 
16273 
16274 static PyStatus
init_fs_encoding(PyThreadState * tstate)16275 init_fs_encoding(PyThreadState *tstate)
16276 {
16277     PyInterpreterState *interp = tstate->interp;
16278 
16279     /* Update the filesystem encoding to the normalized Python codec name.
16280        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16281        (Python codec name). */
16282     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16283     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16284         _Py_DumpPathConfig(tstate);
16285         return _PyStatus_ERR("failed to get the Python codec "
16286                              "of the filesystem encoding");
16287     }
16288 
16289     if (init_fs_codec(interp) < 0) {
16290         return _PyStatus_ERR("cannot initialize filesystem codec");
16291     }
16292     return _PyStatus_OK();
16293 }
16294 
16295 
16296 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16297 _PyUnicode_InitEncodings(PyThreadState *tstate)
16298 {
16299     PyStatus status = init_fs_encoding(tstate);
16300     if (_PyStatus_EXCEPTION(status)) {
16301         return status;
16302     }
16303 
16304     return init_stdio_encoding(tstate->interp);
16305 }
16306 
16307 
16308 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16309 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16310 {
16311     PyMem_RawFree(fs_codec->encoding);
16312     fs_codec->encoding = NULL;
16313     fs_codec->utf8 = 0;
16314     PyMem_RawFree(fs_codec->errors);
16315     fs_codec->errors = NULL;
16316     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16317 }
16318 
16319 
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   "mbcs"/"replace" (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is then
   left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16345 
16346 
16347 void
_PyUnicode_Fini(PyInterpreterState * interp)16348 _PyUnicode_Fini(PyInterpreterState *interp)
16349 {
16350     struct _Py_unicode_state *state = &interp->unicode;
16351 
16352     if (_Py_IsMainInterpreter(interp)) {
16353         // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16354         assert(interned == NULL);
16355     }
16356 
16357     _PyUnicode_FiniEncodings(&state->fs_codec);
16358 
16359     unicode_clear_identifiers(state);
16360 
16361     for (Py_ssize_t i = 0; i < 256; i++) {
16362         Py_CLEAR(state->latin1[i]);
16363     }
16364     Py_CLEAR(state->empty_string);
16365 }
16366 
16367 
16368 /* A _string module, to export formatter_parser and formatter_field_name_split
16369    to the string.Formatter class implemented in Python. */
16370 
16371 static PyMethodDef _string_methods[] = {
16372     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16373      METH_O, PyDoc_STR("split the argument as a field name")},
16374     {"formatter_parser", (PyCFunction) formatter_parser,
16375      METH_O, PyDoc_STR("parse the argument as a format string")},
16376     {NULL, NULL}
16377 };
16378 
16379 static struct PyModuleDef _string_module = {
16380     PyModuleDef_HEAD_INIT,
16381     .m_name = "_string",
16382     .m_doc = PyDoc_STR("string helper module"),
16383     .m_size = 0,
16384     .m_methods = _string_methods,
16385 };
16386 
16387 PyMODINIT_FUNC
PyInit__string(void)16388 PyInit__string(void)
16389 {
16390     return PyModuleDef_Init(&_string_module);
16391 }
16392 
16393 
16394 #ifdef __cplusplus
16395 }
16396 #endif
16397