• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #include "Python.h"
42 #include "pycore_abstract.h"      // _PyIndex_Check()
43 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
44 #include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45 #include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46 #include "pycore_codecs.h"        // _PyCodec_Lookup()
47 #include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48 #include "pycore_format.h"        // F_LJUST
49 #include "pycore_initconfig.h"    // _PyStatus_OK()
50 #include "pycore_interp.h"        // PyInterpreterState.fs_codec
51 #include "pycore_long.h"          // _PyLong_FormatWriter()
52 #include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53 #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54 #include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
56 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
57 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
58 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
59 #include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
60 
61 #include "stringlib/eq.h"         // unicode_eq()
62 #include <stddef.h>               // ptrdiff_t
63 
64 #ifdef MS_WINDOWS
65 #include <windows.h>
66 #endif
67 
68 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
69 #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
70 #endif
71 
72 /* Uncomment to display statistics on interned strings at exit
73    in _PyUnicode_ClearInterned(). */
74 /* #define INTERNED_STATS 1 */
75 
76 
77 /*[clinic input]
78 class str "PyObject *" "&PyUnicode_Type"
79 [clinic start generated code]*/
80 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
81 
82 /*[python input]
83 class Py_UCS4_converter(CConverter):
84     type = 'Py_UCS4'
85     converter = 'convert_uc'
86 
87     def converter_init(self):
88         if self.default is not unspecified:
89             self.c_default = ascii(self.default)
90             if len(self.c_default) > 4 or self.c_default[0] != "'":
91                 self.c_default = hex(ord(self.default))
92 
93 [python start generated code]*/
94 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
95 
96 /* --- Globals ------------------------------------------------------------
97 
98 NOTE: In the interpreter's initialization phase, some globals are currently
99       initialized dynamically as needed. In the process Unicode objects may
100       be created before the Unicode type is ready.
101 
102 */
103 
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107 
/* Predicate used by the asserting accessors below.  Debug builds route
   through _PyUnicode_CheckConsistency() with check_content=0; other builds
   only test the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Plain field accessors: these do not assert _PyUnicode_CHECK(op). */
#define _PyUnicode_UTF8(op) \
    (_PyCompactUnicodeObject_CAST(op)->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
#define _PyUnicode_LENGTH(op) \
    (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \
    (_PyASCIIObject_CAST(op)->hash)
#define _PyUnicode_DATA_ANY(op) \
    (_PyUnicodeObject_CAST(op)->data.any)

/* UTF-8 buffer of op.  For a compact ASCII string this is the memory that
   directly follows the PyASCIIObject header; otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((char*)(_PyASCIIObject_CAST(op) + 1))          \
          : _PyUnicode_UTF8(op)))

/* Length of the UTF-8 buffer of op.  For a compact ASCII string this is the
   string length itself; otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? _PyASCIIObject_CAST(op)->length                 \
          : _PyUnicode_UTF8_LENGTH(op)))

/* Asserting variants of the kind and length accessors. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     _PyASCIIObject_CAST(op)->length)

/* true if the utf8 pointer aliases the character data of op
   (op must not be a compact ASCII string) */
#define _PyUnicode_SHARE_UTF8(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),               \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the Unicode object owns a separately allocated UTF-8 memory block
   (non-NULL and not shared with the character data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
155 
/* Copy a run of characters while converting each one from from_type to
   to_type with a plain cast.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (pointers of type "from_type *"); to points to the
   destination buffer (type "to_type *"), which the caller must have sized
   for (end - begin) elements.

   The bulk of the input is handled four elements per iteration; the
   remaining 0..3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t _count = _end - _iter;                            \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                  \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {        \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; ++_iter, ++_to) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
179 
180 #define LATIN1 _Py_LATIN1_CHR
181 
/* Platform-tuned overallocation divisor.
   NOTE(review): the users of this constant are outside this excerpt;
   presumably the extra space requested is size / OVERALLOCATE_FACTOR,
   which matches the percentages quoted below -- confirm at the use sites. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
189 
190 /* Forward declaration */
191 static inline int
192 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
193 static inline void
194 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
195 static PyObject *
196 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
197                     const char *errors);
198 static PyObject *
199 unicode_decode_utf8(const char *s, Py_ssize_t size,
200                     _Py_error_handler error_handler, const char *errors,
201                     Py_ssize_t *consumed);
202 #ifdef Py_DEBUG
203 static inline int unicode_is_finalizing(void);
204 static int unicode_is_singleton(PyObject *unicode);
205 #endif
206 
207 
208 // Return a reference to the immortal empty string singleton.
unicode_get_empty(void)209 static inline PyObject* unicode_get_empty(void)
210 {
211     _Py_DECLARE_STR(empty, "");
212     return &_Py_STR(empty);
213 }
214 
215 /* This dictionary holds per-interpreter interned strings.
216  * See InternalDocs/string_interning.md for details.
217  */
get_interned_dict(PyInterpreterState * interp)218 static inline PyObject *get_interned_dict(PyInterpreterState *interp)
219 {
220     return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
221 }
222 
223 /* This hashtable holds statically allocated interned strings.
224  * See InternalDocs/string_interning.md for details.
225  */
226 #define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
227 
228 /* Get number of all interned strings for the current interpreter. */
229 Py_ssize_t
_PyUnicode_InternedSize(void)230 _PyUnicode_InternedSize(void)
231 {
232     PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
233     return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
234 }
235 
236 /* Get number of immortal interned strings for the current interpreter. */
237 Py_ssize_t
_PyUnicode_InternedSize_Immortal(void)238 _PyUnicode_InternedSize_Immortal(void)
239 {
240     PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
241     PyObject *key, *value;
242     Py_ssize_t pos = 0;
243     Py_ssize_t count = 0;
244 
245     // It's tempting to keep a count and avoid a loop here. But, this function
246     // is intended for refleak tests. It spends extra work to report the true
247     // value, to help detect bugs in optimizations.
248 
249     while (PyDict_Next(dict, &pos, &key, &value)) {
250         assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
251         if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
252            count++;
253        }
254     }
255     return _Py_hashtable_len(INTERNED_STRINGS) + count;
256 }
257 
258 static Py_hash_t unicode_hash(PyObject *);
259 static int unicode_compare_eq(PyObject *, PyObject *);
260 
261 static Py_uhash_t
hashtable_unicode_hash(const void * key)262 hashtable_unicode_hash(const void *key)
263 {
264     return unicode_hash((PyObject *)key);
265 }
266 
267 static int
hashtable_unicode_compare(const void * key1,const void * key2)268 hashtable_unicode_compare(const void *key1, const void *key2)
269 {
270     PyObject *obj1 = (PyObject *)key1;
271     PyObject *obj2 = (PyObject *)key2;
272     if (obj1 != NULL && obj2 != NULL) {
273         return unicode_compare_eq(obj1, obj2);
274     }
275     else {
276         return obj1 == obj2;
277     }
278 }
279 
280 /* Return true if this interpreter should share the main interpreter's
281    intern_dict.  That's important for interpreters which load basic
282    single-phase init extension modules (m_size == -1).  There could be interned
283    immortal strings that are shared between interpreters, due to the
284    PyDict_Update(mdict, m_copy) call in import_find_extension().
285 
286    It's not safe to deallocate those strings until all interpreters that
287    potentially use them are freed.  By storing them in the main interpreter, we
288    ensure they get freed after all other interpreters are freed.
289 */
290 static bool
has_shared_intern_dict(PyInterpreterState * interp)291 has_shared_intern_dict(PyInterpreterState *interp)
292 {
293     PyInterpreterState *main_interp = _PyInterpreterState_Main();
294     return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
295 }
296 
297 static int
init_interned_dict(PyInterpreterState * interp)298 init_interned_dict(PyInterpreterState *interp)
299 {
300     assert(get_interned_dict(interp) == NULL);
301     PyObject *interned;
302     if (has_shared_intern_dict(interp)) {
303         interned = get_interned_dict(_PyInterpreterState_Main());
304         Py_INCREF(interned);
305     }
306     else {
307         interned = PyDict_New();
308         if (interned == NULL) {
309             return -1;
310         }
311     }
312     _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
313     return 0;
314 }
315 
316 static void
clear_interned_dict(PyInterpreterState * interp)317 clear_interned_dict(PyInterpreterState *interp)
318 {
319     PyObject *interned = get_interned_dict(interp);
320     if (interned != NULL) {
321         if (!has_shared_intern_dict(interp)) {
322             // only clear if the dict belongs to this interpreter
323             PyDict_Clear(interned);
324         }
325         Py_DECREF(interned);
326         _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
327     }
328 }
329 
330 static PyStatus
init_global_interned_strings(PyInterpreterState * interp)331 init_global_interned_strings(PyInterpreterState *interp)
332 {
333     assert(INTERNED_STRINGS == NULL);
334     _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
335 
336     INTERNED_STRINGS = _Py_hashtable_new_full(
337         hashtable_unicode_hash,
338         hashtable_unicode_compare,
339         // Objects stored here are immortal and statically allocated,
340         // so we don't need key_destroy_func & value_destroy_func:
341         NULL,
342         NULL,
343         &hashtable_alloc
344     );
345     if (INTERNED_STRINGS == NULL) {
346         PyErr_Clear();
347         return _PyStatus_ERR("failed to create global interned dict");
348     }
349 
350     /* Intern statically allocated string identifiers, deepfreeze strings,
351         * and one-byte latin-1 strings.
352         * This must be done before any module initialization so that statically
353         * allocated string identifiers are used instead of heap allocated strings.
354         * Deepfreeze uses the interned identifiers if present to save space
355         * else generates them and they are interned to speed up dict lookups.
356     */
357     _PyUnicode_InitStaticStrings(interp);
358 
359     for (int i = 0; i < 256; i++) {
360         PyObject *s = LATIN1(i);
361         _PyUnicode_InternStatic(interp, &s);
362         assert(s == LATIN1(i));
363     }
364 #ifdef Py_DEBUG
365     assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
366 
367     for (int i = 0; i < 256; i++) {
368         assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
369     }
370 #endif
371     return _PyStatus_OK();
372 }
373 
clear_global_interned_strings(void)374 static void clear_global_interned_strings(void)
375 {
376     if (INTERNED_STRINGS != NULL) {
377         _Py_hashtable_destroy(INTERNED_STRINGS);
378         INTERNED_STRINGS = NULL;
379     }
380 }
381 
/* Statement macro: return the empty-string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_get_empty(); } while (0)
386 
387 static inline void
unicode_fill(int kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)388 unicode_fill(int kind, void *data, Py_UCS4 value,
389              Py_ssize_t start, Py_ssize_t length)
390 {
391     assert(0 <= start);
392     switch (kind) {
393     case PyUnicode_1BYTE_KIND: {
394         assert(value <= 0xff);
395         Py_UCS1 ch = (unsigned char)value;
396         Py_UCS1 *to = (Py_UCS1 *)data + start;
397         memset(to, ch, length);
398         break;
399     }
400     case PyUnicode_2BYTE_KIND: {
401         assert(value <= 0xffff);
402         Py_UCS2 ch = (Py_UCS2)value;
403         Py_UCS2 *to = (Py_UCS2 *)data + start;
404         const Py_UCS2 *end = to + length;
405         for (; to < end; ++to) *to = ch;
406         break;
407     }
408     case PyUnicode_4BYTE_KIND: {
409         assert(value <= MAX_UNICODE);
410         Py_UCS4 ch = value;
411         Py_UCS4 * to = (Py_UCS4 *)data + start;
412         const Py_UCS4 *end = to + length;
413         for (; to < end; ++to) *to = ch;
414         break;
415     }
416     default: Py_UNREACHABLE();
417     }
418 }
419 
420 
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code points 0..127: an entry is 1 for the
   characters listed below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
451 
452 /* forward */
453 static PyObject* get_latin1_char(unsigned char ch);
454 static int unicode_modifiable(PyObject *unicode);
455 
456 
457 static PyObject *
458 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
459 static PyObject *
460 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
461 static PyObject *
462 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
463 
464 static PyObject *
465 unicode_encode_call_errorhandler(const char *errors,
466        PyObject **errorHandler,const char *encoding, const char *reason,
467        PyObject *unicode, PyObject **exceptionObject,
468        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
469 
470 static void
471 raise_encode_exception(PyObject **exceptionObject,
472                        const char *encoding,
473                        PyObject *unicode,
474                        Py_ssize_t startpos, Py_ssize_t endpos,
475                        const char *reason);
476 
/* Fast detection of the ASCII line break characters.
   Lookup table indexed by code points 0..127: an entry is 1 for the
   characters listed below and 0 for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
504 
505 static int convert_uc(PyObject *obj, void *addr);
506 
507 struct encoding_map;
508 #include "clinic/unicodeobject.c.h"
509 
510 _Py_error_handler
_Py_GetErrorHandler(const char * errors)511 _Py_GetErrorHandler(const char *errors)
512 {
513     if (errors == NULL || strcmp(errors, "strict") == 0) {
514         return _Py_ERROR_STRICT;
515     }
516     if (strcmp(errors, "surrogateescape") == 0) {
517         return _Py_ERROR_SURROGATEESCAPE;
518     }
519     if (strcmp(errors, "replace") == 0) {
520         return _Py_ERROR_REPLACE;
521     }
522     if (strcmp(errors, "ignore") == 0) {
523         return _Py_ERROR_IGNORE;
524     }
525     if (strcmp(errors, "backslashreplace") == 0) {
526         return _Py_ERROR_BACKSLASHREPLACE;
527     }
528     if (strcmp(errors, "surrogatepass") == 0) {
529         return _Py_ERROR_SURROGATEPASS;
530     }
531     if (strcmp(errors, "xmlcharrefreplace") == 0) {
532         return _Py_ERROR_XMLCHARREFREPLACE;
533     }
534     return _Py_ERROR_OTHER;
535 }
536 
537 
538 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)539 get_error_handler_wide(const wchar_t *errors)
540 {
541     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
542         return _Py_ERROR_STRICT;
543     }
544     if (wcscmp(errors, L"surrogateescape") == 0) {
545         return _Py_ERROR_SURROGATEESCAPE;
546     }
547     if (wcscmp(errors, L"replace") == 0) {
548         return _Py_ERROR_REPLACE;
549     }
550     if (wcscmp(errors, L"ignore") == 0) {
551         return _Py_ERROR_IGNORE;
552     }
553     if (wcscmp(errors, L"backslashreplace") == 0) {
554         return _Py_ERROR_BACKSLASHREPLACE;
555     }
556     if (wcscmp(errors, L"surrogatepass") == 0) {
557         return _Py_ERROR_SURROGATEPASS;
558     }
559     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
560         return _Py_ERROR_XMLCHARREFREPLACE;
561     }
562     return _Py_ERROR_OTHER;
563 }
564 
565 
566 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)567 unicode_check_encoding_errors(const char *encoding, const char *errors)
568 {
569     if (encoding == NULL && errors == NULL) {
570         return 0;
571     }
572 
573     PyInterpreterState *interp = _PyInterpreterState_GET();
574 #ifndef Py_DEBUG
575     /* In release mode, only check in development mode (-X dev) */
576     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
577         return 0;
578     }
579 #else
580     /* Always check in debug mode */
581 #endif
582 
583     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
584        codec registry is ready: before_PyUnicode_InitEncodings() is called. */
585     if (!interp->unicode.fs_codec.encoding) {
586         return 0;
587     }
588 
589     /* Disable checks during Python finalization. For example, it allows to
590        call _PyObject_Dump() during finalization for debugging purpose. */
591     if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
592         return 0;
593     }
594 
595     if (encoding != NULL
596         // Fast path for the most common built-in encodings. Even if the codec
597         // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
598         // create a temporary Unicode string (the key in the cache).
599         && strcmp(encoding, "utf-8") != 0
600         && strcmp(encoding, "utf8") != 0
601         && strcmp(encoding, "ascii") != 0)
602     {
603         PyObject *handler = _PyCodec_Lookup(encoding);
604         if (handler == NULL) {
605             return -1;
606         }
607         Py_DECREF(handler);
608     }
609 
610     if (errors != NULL
611         // Fast path for the most common built-in error handlers.
612         && strcmp(errors, "strict") != 0
613         && strcmp(errors, "ignore") != 0
614         && strcmp(errors, "replace") != 0
615         && strcmp(errors, "surrogateescape") != 0
616         && strcmp(errors, "surrogatepass") != 0)
617     {
618         PyObject *handler = PyCodec_LookupError(errors);
619         if (handler == NULL) {
620             return -1;
621         }
622         Py_DECREF(handler);
623     }
624     return 0;
625 }
626 
627 
628 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)629 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
630 {
631 #define CHECK(expr) \
632     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
633 
634     assert(op != NULL);
635     CHECK(PyUnicode_Check(op));
636 
637     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
638     int kind = ascii->state.kind;
639 
640     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
641         CHECK(kind == PyUnicode_1BYTE_KIND);
642     }
643     else {
644         PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
645         void *data;
646 
647         if (ascii->state.compact == 1) {
648             data = compact + 1;
649             CHECK(kind == PyUnicode_1BYTE_KIND
650                                  || kind == PyUnicode_2BYTE_KIND
651                                  || kind == PyUnicode_4BYTE_KIND);
652             CHECK(ascii->state.ascii == 0);
653             CHECK(compact->utf8 != data);
654         }
655         else {
656             PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
657 
658             data = unicode->data.any;
659             CHECK(kind == PyUnicode_1BYTE_KIND
660                      || kind == PyUnicode_2BYTE_KIND
661                      || kind == PyUnicode_4BYTE_KIND);
662             CHECK(ascii->state.compact == 0);
663             CHECK(data != NULL);
664             if (ascii->state.ascii) {
665                 CHECK(compact->utf8 == data);
666                 CHECK(compact->utf8_length == ascii->length);
667             }
668             else {
669                 CHECK(compact->utf8 != data);
670             }
671         }
672 
673         if (compact->utf8 == NULL)
674             CHECK(compact->utf8_length == 0);
675     }
676 
677     /* check that the best kind is used: O(n) operation */
678     if (check_content) {
679         Py_ssize_t i;
680         Py_UCS4 maxchar = 0;
681         const void *data;
682         Py_UCS4 ch;
683 
684         data = PyUnicode_DATA(ascii);
685         for (i=0; i < ascii->length; i++)
686         {
687             ch = PyUnicode_READ(kind, data, i);
688             if (ch > maxchar)
689                 maxchar = ch;
690         }
691         if (kind == PyUnicode_1BYTE_KIND) {
692             if (ascii->state.ascii == 0) {
693                 CHECK(maxchar >= 128);
694                 CHECK(maxchar <= 255);
695             }
696             else
697                 CHECK(maxchar < 128);
698         }
699         else if (kind == PyUnicode_2BYTE_KIND) {
700             CHECK(maxchar >= 0x100);
701             CHECK(maxchar <= 0xFFFF);
702         }
703         else {
704             CHECK(maxchar >= 0x10000);
705             CHECK(maxchar <= MAX_UNICODE);
706         }
707         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
708     }
709 
710     /* Check interning state */
711 #ifdef Py_DEBUG
712     // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
713     // extensions can make immortal strings mortal (but with a high enough
714     // refcount).
715     // The other way is extremely unlikely (worth a potential failed assertion
716     // in a debug build), so we do check `!_Py_IsImmortal(op)`.
717     switch (PyUnicode_CHECK_INTERNED(op)) {
718         case SSTATE_NOT_INTERNED:
719             if (ascii->state.statically_allocated) {
720                 // This state is for two exceptions:
721                 // - strings are currently checked before they're interned
722                 // - the 256 one-latin1-character strings
723                 //   are static but use SSTATE_NOT_INTERNED
724             }
725             else {
726                 CHECK(!_Py_IsImmortal(op));
727             }
728             break;
729         case SSTATE_INTERNED_MORTAL:
730             CHECK(!ascii->state.statically_allocated);
731             CHECK(!_Py_IsImmortal(op));
732             break;
733         case SSTATE_INTERNED_IMMORTAL:
734             CHECK(!ascii->state.statically_allocated);
735             break;
736         case SSTATE_INTERNED_IMMORTAL_STATIC:
737             CHECK(ascii->state.statically_allocated);
738             break;
739         default:
740             Py_UNREACHABLE();
741     }
742 #endif
743 
744     return 1;
745 
746 #undef CHECK
747 }
748 
749 static PyObject*
unicode_result(PyObject * unicode)750 unicode_result(PyObject *unicode)
751 {
752     assert(_PyUnicode_CHECK(unicode));
753 
754     Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
755     if (length == 0) {
756         PyObject *empty = unicode_get_empty();
757         if (unicode != empty) {
758             Py_DECREF(unicode);
759         }
760         return empty;
761     }
762 
763     if (length == 1) {
764         int kind = PyUnicode_KIND(unicode);
765         if (kind == PyUnicode_1BYTE_KIND) {
766             const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
767             Py_UCS1 ch = data[0];
768             PyObject *latin1_char = LATIN1(ch);
769             if (unicode != latin1_char) {
770                 Py_DECREF(unicode);
771             }
772             return latin1_char;
773         }
774     }
775 
776     assert(_PyUnicode_CheckConsistency(unicode, 1));
777     return unicode;
778 }
779 
780 static PyObject*
unicode_result_unchanged(PyObject * unicode)781 unicode_result_unchanged(PyObject *unicode)
782 {
783     if (PyUnicode_CheckExact(unicode)) {
784         return Py_NewRef(unicode);
785     }
786     else
787         /* Subtype -- return genuine unicode string with the same value. */
788         return _PyUnicode_Copy(unicode);
789 }
790 
791 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
792    ASCII, Latin1, UTF-8, etc. */
793 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)794 backslashreplace(_PyBytesWriter *writer, char *str,
795                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796 {
797     Py_ssize_t size, i;
798     Py_UCS4 ch;
799     int kind;
800     const void *data;
801 
802     kind = PyUnicode_KIND(unicode);
803     data = PyUnicode_DATA(unicode);
804 
805     size = 0;
806     /* determine replacement size */
807     for (i = collstart; i < collend; ++i) {
808         Py_ssize_t incr;
809 
810         ch = PyUnicode_READ(kind, data, i);
811         if (ch < 0x100)
812             incr = 2+2;
813         else if (ch < 0x10000)
814             incr = 2+4;
815         else {
816             assert(ch <= MAX_UNICODE);
817             incr = 2+8;
818         }
819         if (size > PY_SSIZE_T_MAX - incr) {
820             PyErr_SetString(PyExc_OverflowError,
821                             "encoded result is too long for a Python string");
822             return NULL;
823         }
824         size += incr;
825     }
826 
827     str = _PyBytesWriter_Prepare(writer, str, size);
828     if (str == NULL)
829         return NULL;
830 
831     /* generate replacement */
832     for (i = collstart; i < collend; ++i) {
833         ch = PyUnicode_READ(kind, data, i);
834         *str++ = '\\';
835         if (ch >= 0x00010000) {
836             *str++ = 'U';
837             *str++ = Py_hexdigits[(ch>>28)&0xf];
838             *str++ = Py_hexdigits[(ch>>24)&0xf];
839             *str++ = Py_hexdigits[(ch>>20)&0xf];
840             *str++ = Py_hexdigits[(ch>>16)&0xf];
841             *str++ = Py_hexdigits[(ch>>12)&0xf];
842             *str++ = Py_hexdigits[(ch>>8)&0xf];
843         }
844         else if (ch >= 0x100) {
845             *str++ = 'u';
846             *str++ = Py_hexdigits[(ch>>12)&0xf];
847             *str++ = Py_hexdigits[(ch>>8)&0xf];
848         }
849         else
850             *str++ = 'x';
851         *str++ = Py_hexdigits[(ch>>4)&0xf];
852         *str++ = Py_hexdigits[ch&0xf];
853     }
854     return str;
855 }
856 
857 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
858    ASCII, Latin1, UTF-8, etc. */
859 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)860 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
861                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
862 {
863     Py_ssize_t size, i;
864     Py_UCS4 ch;
865     int kind;
866     const void *data;
867 
868     kind = PyUnicode_KIND(unicode);
869     data = PyUnicode_DATA(unicode);
870 
871     size = 0;
872     /* determine replacement size */
873     for (i = collstart; i < collend; ++i) {
874         Py_ssize_t incr;
875 
876         ch = PyUnicode_READ(kind, data, i);
877         if (ch < 10)
878             incr = 2+1+1;
879         else if (ch < 100)
880             incr = 2+2+1;
881         else if (ch < 1000)
882             incr = 2+3+1;
883         else if (ch < 10000)
884             incr = 2+4+1;
885         else if (ch < 100000)
886             incr = 2+5+1;
887         else if (ch < 1000000)
888             incr = 2+6+1;
889         else {
890             assert(ch <= MAX_UNICODE);
891             incr = 2+7+1;
892         }
893         if (size > PY_SSIZE_T_MAX - incr) {
894             PyErr_SetString(PyExc_OverflowError,
895                             "encoded result is too long for a Python string");
896             return NULL;
897         }
898         size += incr;
899     }
900 
901     str = _PyBytesWriter_Prepare(writer, str, size);
902     if (str == NULL)
903         return NULL;
904 
905     /* generate replacement */
906     for (i = collstart; i < collend; ++i) {
907         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
908         if (size < 0) {
909             return NULL;
910         }
911         str += size;
912     }
913     return str;
914 }
915 
916 /* --- Bloom Filters ----------------------------------------------------- */
917 
918 /* stuff to implement simple "bloom filters" for Unicode characters.
919    to keep things simple, we use a single bitmask, using the least 5
920    bits from each unicode characters as the bit index. */
921 
922 /* the linebreak mask is set up by _PyUnicode_Init() below */
923 
924 #if LONG_BIT >= 128
925 #define BLOOM_WIDTH 128
926 #elif LONG_BIT >= 64
927 #define BLOOM_WIDTH 64
928 #elif LONG_BIT >= 32
929 #define BLOOM_WIDTH 32
930 #else
931 #error "LONG_BIT is smaller than 32"
932 #endif
933 
934 #define BLOOM_MASK unsigned long
935 
936 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
937 
938 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
939 
940 #define BLOOM_LINEBREAK(ch)                                             \
941     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
942      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
943 
944 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)945 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
946 {
947 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
948     do {                                               \
949         TYPE *data = (TYPE *)PTR;                      \
950         TYPE *end = data + LEN;                        \
951         Py_UCS4 ch;                                    \
952         for (; data != end; data++) {                  \
953             ch = *data;                                \
954             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
955         }                                              \
956         break;                                         \
957     } while (0)
958 
959     /* calculate simple bloom-style bitmask for a given unicode string */
960 
961     BLOOM_MASK mask;
962 
963     mask = 0;
964     switch (kind) {
965     case PyUnicode_1BYTE_KIND:
966         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
967         break;
968     case PyUnicode_2BYTE_KIND:
969         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
970         break;
971     case PyUnicode_4BYTE_KIND:
972         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
973         break;
974     default:
975         Py_UNREACHABLE();
976     }
977     return mask;
978 
979 #undef BLOOM_UPDATE
980 }
981 
982 static int
ensure_unicode(PyObject * obj)983 ensure_unicode(PyObject *obj)
984 {
985     if (!PyUnicode_Check(obj)) {
986         PyErr_Format(PyExc_TypeError,
987                      "must be str, not %.100s",
988                      Py_TYPE(obj)->tp_name);
989         return -1;
990     }
991     return 0;
992 }
993 
994 /* Compilation of templated routines */
995 
996 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
997 
998 #include "stringlib/asciilib.h"
999 #include "stringlib/fastsearch.h"
1000 #include "stringlib/partition.h"
1001 #include "stringlib/split.h"
1002 #include "stringlib/count.h"
1003 #include "stringlib/find.h"
1004 #include "stringlib/find_max_char.h"
1005 #include "stringlib/undef.h"
1006 
1007 #include "stringlib/ucs1lib.h"
1008 #include "stringlib/fastsearch.h"
1009 #include "stringlib/partition.h"
1010 #include "stringlib/split.h"
1011 #include "stringlib/count.h"
1012 #include "stringlib/find.h"
1013 #include "stringlib/replace.h"
1014 #include "stringlib/find_max_char.h"
1015 #include "stringlib/undef.h"
1016 
1017 #include "stringlib/ucs2lib.h"
1018 #include "stringlib/fastsearch.h"
1019 #include "stringlib/partition.h"
1020 #include "stringlib/split.h"
1021 #include "stringlib/count.h"
1022 #include "stringlib/find.h"
1023 #include "stringlib/replace.h"
1024 #include "stringlib/find_max_char.h"
1025 #include "stringlib/undef.h"
1026 
1027 #include "stringlib/ucs4lib.h"
1028 #include "stringlib/fastsearch.h"
1029 #include "stringlib/partition.h"
1030 #include "stringlib/split.h"
1031 #include "stringlib/count.h"
1032 #include "stringlib/find.h"
1033 #include "stringlib/replace.h"
1034 #include "stringlib/find_max_char.h"
1035 #include "stringlib/undef.h"
1036 
1037 #undef STRINGLIB_GET_EMPTY
1038 
1039 /* --- Unicode Object ----------------------------------------------------- */
1040 
1041 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)1042 findchar(const void *s, int kind,
1043          Py_ssize_t size, Py_UCS4 ch,
1044          int direction)
1045 {
1046     switch (kind) {
1047     case PyUnicode_1BYTE_KIND:
1048         if ((Py_UCS1) ch != ch)
1049             return -1;
1050         if (direction > 0)
1051             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1052         else
1053             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1054     case PyUnicode_2BYTE_KIND:
1055         if ((Py_UCS2) ch != ch)
1056             return -1;
1057         if (direction > 0)
1058             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1059         else
1060             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1061     case PyUnicode_4BYTE_KIND:
1062         if (direction > 0)
1063             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1064         else
1065             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1066     default:
1067         Py_UNREACHABLE();
1068     }
1069 }
1070 
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters in [old_length, length) of
   `unicode` with 0xff bytes so that code reading them before they are
   properly initialized is caught early.  Nothing is written when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value doubles as the size of one character in bytes. */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1089 
1090 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1091 resize_compact(PyObject *unicode, Py_ssize_t length)
1092 {
1093     Py_ssize_t char_size;
1094     Py_ssize_t struct_size;
1095     Py_ssize_t new_size;
1096     PyObject *new_unicode;
1097 #ifdef Py_DEBUG
1098     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1099 #endif
1100 
1101     assert(unicode_modifiable(unicode));
1102     assert(PyUnicode_IS_COMPACT(unicode));
1103 
1104     char_size = PyUnicode_KIND(unicode);
1105     if (PyUnicode_IS_ASCII(unicode))
1106         struct_size = sizeof(PyASCIIObject);
1107     else
1108         struct_size = sizeof(PyCompactUnicodeObject);
1109 
1110     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1111         PyErr_NoMemory();
1112         return NULL;
1113     }
1114     new_size = (struct_size + (length + 1) * char_size);
1115 
1116     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1117         PyMem_Free(_PyUnicode_UTF8(unicode));
1118         _PyUnicode_UTF8(unicode) = NULL;
1119         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1120     }
1121 #ifdef Py_TRACE_REFS
1122     _Py_ForgetReference(unicode);
1123 #endif
1124 
1125     new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1126     if (new_unicode == NULL) {
1127         _Py_NewReferenceNoTotal(unicode);
1128         PyErr_NoMemory();
1129         return NULL;
1130     }
1131     unicode = new_unicode;
1132     _Py_NewReferenceNoTotal(unicode);
1133 
1134     _PyUnicode_LENGTH(unicode) = length;
1135 #ifdef Py_DEBUG
1136     unicode_fill_invalid(unicode, old_length);
1137 #endif
1138     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1139                     length, 0);
1140     assert(_PyUnicode_CheckConsistency(unicode, 0));
1141     return unicode;
1142 }
1143 
1144 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1145 resize_inplace(PyObject *unicode, Py_ssize_t length)
1146 {
1147     assert(!PyUnicode_IS_COMPACT(unicode));
1148     assert(Py_REFCNT(unicode) == 1);
1149 
1150     Py_ssize_t new_size;
1151     Py_ssize_t char_size;
1152     int share_utf8;
1153     void *data;
1154 #ifdef Py_DEBUG
1155     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1156 #endif
1157 
1158     data = _PyUnicode_DATA_ANY(unicode);
1159     char_size = PyUnicode_KIND(unicode);
1160     share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1161 
1162     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1163         PyErr_NoMemory();
1164         return -1;
1165     }
1166     new_size = (length + 1) * char_size;
1167 
1168     if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1169     {
1170         PyMem_Free(_PyUnicode_UTF8(unicode));
1171         _PyUnicode_UTF8(unicode) = NULL;
1172         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1173     }
1174 
1175     data = (PyObject *)PyObject_Realloc(data, new_size);
1176     if (data == NULL) {
1177         PyErr_NoMemory();
1178         return -1;
1179     }
1180     _PyUnicode_DATA_ANY(unicode) = data;
1181     if (share_utf8) {
1182         _PyUnicode_UTF8(unicode) = data;
1183         _PyUnicode_UTF8_LENGTH(unicode) = length;
1184     }
1185     _PyUnicode_LENGTH(unicode) = length;
1186     PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1187 #ifdef Py_DEBUG
1188     unicode_fill_invalid(unicode, old_length);
1189 #endif
1190 
1191     /* check for integer overflow */
1192     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1193         PyErr_NoMemory();
1194         return -1;
1195     }
1196     assert(_PyUnicode_CheckConsistency(unicode, 0));
1197     return 0;
1198 }
1199 
1200 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1201 resize_copy(PyObject *unicode, Py_ssize_t length)
1202 {
1203     Py_ssize_t copy_length;
1204     PyObject *copy;
1205 
1206     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1207     if (copy == NULL)
1208         return NULL;
1209 
1210     copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1211     _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1212     return copy;
1213 }
1214 
1215 static const char*
unicode_kind_name(PyObject * unicode)1216 unicode_kind_name(PyObject *unicode)
1217 {
1218     /* don't check consistency: unicode_kind_name() is called from
1219        _PyUnicode_Dump() */
1220     if (!PyUnicode_IS_COMPACT(unicode))
1221     {
1222         switch (PyUnicode_KIND(unicode))
1223         {
1224         case PyUnicode_1BYTE_KIND:
1225             if (PyUnicode_IS_ASCII(unicode))
1226                 return "legacy ascii";
1227             else
1228                 return "legacy latin1";
1229         case PyUnicode_2BYTE_KIND:
1230             return "legacy UCS2";
1231         case PyUnicode_4BYTE_KIND:
1232             return "legacy UCS4";
1233         default:
1234             return "<legacy invalid kind>";
1235         }
1236     }
1237     switch (PyUnicode_KIND(unicode)) {
1238     case PyUnicode_1BYTE_KIND:
1239         if (PyUnicode_IS_ASCII(unicode))
1240             return "ascii";
1241         else
1242             return "latin1";
1243     case PyUnicode_2BYTE_KIND:
1244         return "UCS2";
1245     case PyUnicode_4BYTE_KIND:
1246         return "UCS4";
1247     default:
1248         return "<invalid compact kind>";
1249     }
1250 }
1251 
1252 #ifdef Py_DEBUG
1253 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1254 const char *_PyUnicode_utf8(void *unicode_raw){
1255     PyObject *unicode = _PyObject_CAST(unicode_raw);
1256     return PyUnicode_UTF8(unicode);
1257 }
1258 
_PyUnicode_compact_data(void * unicode_raw)1259 const void *_PyUnicode_compact_data(void *unicode_raw) {
1260     PyObject *unicode = _PyObject_CAST(unicode_raw);
1261     return _PyUnicode_COMPACT_DATA(unicode);
1262 }
_PyUnicode_data(void * unicode_raw)1263 const void *_PyUnicode_data(void *unicode_raw) {
1264     PyObject *unicode = _PyObject_CAST(unicode_raw);
1265     printf("obj %p\n", (void*)unicode);
1266     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1267     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1268     printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1269     printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1270     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1271     return PyUnicode_DATA(unicode);
1272 }
1273 
1274 void
_PyUnicode_Dump(PyObject * op)1275 _PyUnicode_Dump(PyObject *op)
1276 {
1277     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1278     PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1279     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1280     const void *data;
1281 
1282     if (ascii->state.compact)
1283     {
1284         if (ascii->state.ascii)
1285             data = (ascii + 1);
1286         else
1287             data = (compact + 1);
1288     }
1289     else
1290         data = unicode->data.any;
1291     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1292 
1293     if (!ascii->state.ascii) {
1294         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1295     }
1296     printf(", data=%p\n", data);
1297 }
1298 #endif
1299 
1300 
1301 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1302 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1303 {
1304     /* Optimization for empty strings */
1305     if (size == 0) {
1306         return unicode_get_empty();
1307     }
1308 
1309     PyObject *obj;
1310     PyCompactUnicodeObject *unicode;
1311     void *data;
1312     int kind;
1313     int is_ascii;
1314     Py_ssize_t char_size;
1315     Py_ssize_t struct_size;
1316 
1317     is_ascii = 0;
1318     struct_size = sizeof(PyCompactUnicodeObject);
1319     if (maxchar < 128) {
1320         kind = PyUnicode_1BYTE_KIND;
1321         char_size = 1;
1322         is_ascii = 1;
1323         struct_size = sizeof(PyASCIIObject);
1324     }
1325     else if (maxchar < 256) {
1326         kind = PyUnicode_1BYTE_KIND;
1327         char_size = 1;
1328     }
1329     else if (maxchar < 65536) {
1330         kind = PyUnicode_2BYTE_KIND;
1331         char_size = 2;
1332     }
1333     else {
1334         if (maxchar > MAX_UNICODE) {
1335             PyErr_SetString(PyExc_SystemError,
1336                             "invalid maximum character passed to PyUnicode_New");
1337             return NULL;
1338         }
1339         kind = PyUnicode_4BYTE_KIND;
1340         char_size = 4;
1341     }
1342 
1343     /* Ensure we won't overflow the size. */
1344     if (size < 0) {
1345         PyErr_SetString(PyExc_SystemError,
1346                         "Negative size passed to PyUnicode_New");
1347         return NULL;
1348     }
1349     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1350         return PyErr_NoMemory();
1351 
1352     /* Duplicated allocation code from _PyObject_New() instead of a call to
1353      * PyObject_New() so we are able to allocate space for the object and
1354      * it's data buffer.
1355      */
1356     obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1357     if (obj == NULL) {
1358         return PyErr_NoMemory();
1359     }
1360     _PyObject_Init(obj, &PyUnicode_Type);
1361 
1362     unicode = (PyCompactUnicodeObject *)obj;
1363     if (is_ascii)
1364         data = ((PyASCIIObject*)obj) + 1;
1365     else
1366         data = unicode + 1;
1367     _PyUnicode_LENGTH(unicode) = size;
1368     _PyUnicode_HASH(unicode) = -1;
1369     _PyUnicode_STATE(unicode).interned = 0;
1370     _PyUnicode_STATE(unicode).kind = kind;
1371     _PyUnicode_STATE(unicode).compact = 1;
1372     _PyUnicode_STATE(unicode).ascii = is_ascii;
1373     _PyUnicode_STATE(unicode).statically_allocated = 0;
1374     if (is_ascii) {
1375         ((char*)data)[size] = 0;
1376     }
1377     else if (kind == PyUnicode_1BYTE_KIND) {
1378         ((char*)data)[size] = 0;
1379         unicode->utf8 = NULL;
1380         unicode->utf8_length = 0;
1381     }
1382     else {
1383         unicode->utf8 = NULL;
1384         unicode->utf8_length = 0;
1385         if (kind == PyUnicode_2BYTE_KIND)
1386             ((Py_UCS2*)data)[size] = 0;
1387         else /* kind == PyUnicode_4BYTE_KIND */
1388             ((Py_UCS4*)data)[size] = 0;
1389     }
1390 #ifdef Py_DEBUG
1391     unicode_fill_invalid((PyObject*)unicode, 0);
1392 #endif
1393     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1394     return obj;
1395 }
1396 
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store it in
   the 4-byte data of `unicode`.  A high surrogate directly followed by a
   low surrogate is joined into one code point; every other unit, lone
   surrogates included, is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must be a 4-byte kind string whose length equals the number of
   code points produced (checked by assertions only).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *dst = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *src = begin;

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units become one output character. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1436 
1437 static int
unicode_check_modifiable(PyObject * unicode)1438 unicode_check_modifiable(PyObject *unicode)
1439 {
1440     if (!unicode_modifiable(unicode)) {
1441         PyErr_SetString(PyExc_SystemError,
1442                         "Cannot modify a string currently used");
1443         return -1;
1444     }
1445     return 0;
1446 }
1447 
1448 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1449 _copy_characters(PyObject *to, Py_ssize_t to_start,
1450                  PyObject *from, Py_ssize_t from_start,
1451                  Py_ssize_t how_many, int check_maxchar)
1452 {
1453     int from_kind, to_kind;
1454     const void *from_data;
1455     void *to_data;
1456 
1457     assert(0 <= how_many);
1458     assert(0 <= from_start);
1459     assert(0 <= to_start);
1460     assert(PyUnicode_Check(from));
1461     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1462 
1463     assert(PyUnicode_Check(to));
1464     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1465 
1466     if (how_many == 0)
1467         return 0;
1468 
1469     from_kind = PyUnicode_KIND(from);
1470     from_data = PyUnicode_DATA(from);
1471     to_kind = PyUnicode_KIND(to);
1472     to_data = PyUnicode_DATA(to);
1473 
1474 #ifdef Py_DEBUG
1475     if (!check_maxchar
1476         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1477     {
1478         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1479         Py_UCS4 ch;
1480         Py_ssize_t i;
1481         for (i=0; i < how_many; i++) {
1482             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1483             assert(ch <= to_maxchar);
1484         }
1485     }
1486 #endif
1487 
1488     if (from_kind == to_kind) {
1489         if (check_maxchar
1490             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1491         {
1492             /* Writing Latin-1 characters into an ASCII string requires to
1493                check that all written characters are pure ASCII */
1494             Py_UCS4 max_char;
1495             max_char = ucs1lib_find_max_char(from_data,
1496                                              (const Py_UCS1*)from_data + how_many);
1497             if (max_char >= 128)
1498                 return -1;
1499         }
1500         memcpy((char*)to_data + to_kind * to_start,
1501                   (const char*)from_data + from_kind * from_start,
1502                   to_kind * how_many);
1503     }
1504     else if (from_kind == PyUnicode_1BYTE_KIND
1505              && to_kind == PyUnicode_2BYTE_KIND)
1506     {
1507         _PyUnicode_CONVERT_BYTES(
1508             Py_UCS1, Py_UCS2,
1509             PyUnicode_1BYTE_DATA(from) + from_start,
1510             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1511             PyUnicode_2BYTE_DATA(to) + to_start
1512             );
1513     }
1514     else if (from_kind == PyUnicode_1BYTE_KIND
1515              && to_kind == PyUnicode_4BYTE_KIND)
1516     {
1517         _PyUnicode_CONVERT_BYTES(
1518             Py_UCS1, Py_UCS4,
1519             PyUnicode_1BYTE_DATA(from) + from_start,
1520             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1521             PyUnicode_4BYTE_DATA(to) + to_start
1522             );
1523     }
1524     else if (from_kind == PyUnicode_2BYTE_KIND
1525              && to_kind == PyUnicode_4BYTE_KIND)
1526     {
1527         _PyUnicode_CONVERT_BYTES(
1528             Py_UCS2, Py_UCS4,
1529             PyUnicode_2BYTE_DATA(from) + from_start,
1530             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1531             PyUnicode_4BYTE_DATA(to) + to_start
1532             );
1533     }
1534     else {
1535         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1536 
1537         if (!check_maxchar) {
1538             if (from_kind == PyUnicode_2BYTE_KIND
1539                 && to_kind == PyUnicode_1BYTE_KIND)
1540             {
1541                 _PyUnicode_CONVERT_BYTES(
1542                     Py_UCS2, Py_UCS1,
1543                     PyUnicode_2BYTE_DATA(from) + from_start,
1544                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1545                     PyUnicode_1BYTE_DATA(to) + to_start
1546                     );
1547             }
1548             else if (from_kind == PyUnicode_4BYTE_KIND
1549                      && to_kind == PyUnicode_1BYTE_KIND)
1550             {
1551                 _PyUnicode_CONVERT_BYTES(
1552                     Py_UCS4, Py_UCS1,
1553                     PyUnicode_4BYTE_DATA(from) + from_start,
1554                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1555                     PyUnicode_1BYTE_DATA(to) + to_start
1556                     );
1557             }
1558             else if (from_kind == PyUnicode_4BYTE_KIND
1559                      && to_kind == PyUnicode_2BYTE_KIND)
1560             {
1561                 _PyUnicode_CONVERT_BYTES(
1562                     Py_UCS4, Py_UCS2,
1563                     PyUnicode_4BYTE_DATA(from) + from_start,
1564                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1565                     PyUnicode_2BYTE_DATA(to) + to_start
1566                     );
1567             }
1568             else {
1569                 Py_UNREACHABLE();
1570             }
1571         }
1572         else {
1573             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1574             Py_UCS4 ch;
1575             Py_ssize_t i;
1576 
1577             for (i=0; i < how_many; i++) {
1578                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1579                 if (ch > to_maxchar)
1580                     return -1;
1581                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1582             }
1583         }
1584     }
1585     return 0;
1586 }
1587 
1588 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1589 _PyUnicode_FastCopyCharacters(
1590     PyObject *to, Py_ssize_t to_start,
1591     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1592 {
1593     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1594 }
1595 
1596 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1597 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1598                          PyObject *from, Py_ssize_t from_start,
1599                          Py_ssize_t how_many)
1600 {
1601     int err;
1602 
1603     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1604         PyErr_BadInternalCall();
1605         return -1;
1606     }
1607 
1608     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1609         PyErr_SetString(PyExc_IndexError, "string index out of range");
1610         return -1;
1611     }
1612     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1613         PyErr_SetString(PyExc_IndexError, "string index out of range");
1614         return -1;
1615     }
1616     if (how_many < 0) {
1617         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1618         return -1;
1619     }
1620     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1621     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1622         PyErr_Format(PyExc_SystemError,
1623                      "Cannot write %zi characters at %zi "
1624                      "in a string of %zi characters",
1625                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1626         return -1;
1627     }
1628 
1629     if (how_many == 0)
1630         return 0;
1631 
1632     if (unicode_check_modifiable(to))
1633         return -1;
1634 
1635     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1636     if (err) {
1637         PyErr_Format(PyExc_SystemError,
1638                      "Cannot copy %s characters "
1639                      "into a string of %s characters",
1640                      unicode_kind_name(from),
1641                      unicode_kind_name(to));
1642         return -1;
1643     }
1644     return how_many;
1645 }
1646 
1647 /* Find the maximum code point and count the number of surrogate pairs so a
1648    correct string length can be computed before converting a string to UCS4.
1649    This function counts single surrogates as a character and not as a pair.
1650 
1651    Return 0 on success, or -1 on error. */
1652 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1653 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1654                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1655 {
1656     const wchar_t *iter;
1657     Py_UCS4 ch;
1658 
1659     assert(num_surrogates != NULL && maxchar != NULL);
1660     *num_surrogates = 0;
1661     *maxchar = 0;
1662 
1663     for (iter = begin; iter < end; ) {
1664 #if SIZEOF_WCHAR_T == 2
1665         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1666             && (iter+1) < end
1667             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1668         {
1669             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1670             ++(*num_surrogates);
1671             iter += 2;
1672         }
1673         else
1674 #endif
1675         {
1676             ch = *iter;
1677             iter++;
1678         }
1679         if (ch > *maxchar) {
1680             *maxchar = ch;
1681             if (*maxchar > MAX_UNICODE) {
1682                 PyErr_Format(PyExc_ValueError,
1683                              "character U+%x is not in range [U+0000; U+%x]",
1684                              ch, MAX_UNICODE);
1685                 return -1;
1686             }
1687         }
1688     }
1689     return 0;
1690 }
1691 
1692 static void
unicode_dealloc(PyObject * unicode)1693 unicode_dealloc(PyObject *unicode)
1694 {
1695 #ifdef Py_DEBUG
1696     if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1697         _Py_FatalRefcountError("deallocating an Unicode singleton");
1698     }
1699 #endif
1700     if (_PyUnicode_STATE(unicode).statically_allocated) {
1701         /* This should never get called, but we also don't want to SEGV if
1702         * we accidentally decref an immortal string out of existence. Since
1703         * the string is an immortal object, just re-set the reference count.
1704         */
1705 #ifdef Py_DEBUG
1706         Py_UNREACHABLE();
1707 #endif
1708         _Py_SetImmortal(unicode);
1709         return;
1710     }
1711     switch (_PyUnicode_STATE(unicode).interned) {
1712         case SSTATE_NOT_INTERNED:
1713             break;
1714         case SSTATE_INTERNED_MORTAL:
1715             /* Remove the object from the intern dict.
1716              * Before doing so, we set the refcount to 2: the key and value
1717              * in the interned_dict.
1718              */
1719             assert(Py_REFCNT(unicode) == 0);
1720             Py_SET_REFCNT(unicode, 2);
1721 #ifdef Py_REF_DEBUG
1722             /* let's be pedantic with the ref total */
1723             _Py_IncRefTotal(_PyThreadState_GET());
1724             _Py_IncRefTotal(_PyThreadState_GET());
1725 #endif
1726             PyInterpreterState *interp = _PyInterpreterState_GET();
1727             PyObject *interned = get_interned_dict(interp);
1728             assert(interned != NULL);
1729             PyObject *popped;
1730             int r = PyDict_Pop(interned, unicode, &popped);
1731             if (r == -1) {
1732                 PyErr_WriteUnraisable(unicode);
1733                 // We don't know what happened to the string. It's probably
1734                 // best to leak it:
1735                 // - if it was popped, there are no more references to it
1736                 //   so it can't cause trouble (except wasted memory)
1737                 // - if it wasn't popped, it'll remain interned
1738                 _Py_SetImmortal(unicode);
1739                 _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1740                 return;
1741             }
1742             if (r == 0) {
1743                 // The interned string was not found in the interned_dict.
1744 #ifdef Py_DEBUG
1745                 Py_UNREACHABLE();
1746 #endif
1747                 _Py_SetImmortal(unicode);
1748                 return;
1749             }
1750             // Successfully popped.
1751             assert(popped == unicode);
1752             // Only our `popped` reference should be left; remove it too.
1753             assert(Py_REFCNT(unicode) == 1);
1754             Py_SET_REFCNT(unicode, 0);
1755 #ifdef Py_REF_DEBUG
1756             /* let's be pedantic with the ref total */
1757             _Py_DecRefTotal(_PyThreadState_GET());
1758 #endif
1759             break;
1760         default:
1761             // As with `statically_allocated` above.
1762 #ifdef Py_REF_DEBUG
1763             Py_UNREACHABLE();
1764 #endif
1765             _Py_SetImmortal(unicode);
1766             return;
1767     }
1768     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1769         PyMem_Free(_PyUnicode_UTF8(unicode));
1770     }
1771     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1772         PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1773     }
1774 
1775     Py_TYPE(unicode)->tp_free(unicode);
1776 }
1777 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the empty-string object
   (&_Py_STR(empty)) or the LATIN1() object for its single character,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    /* One-character string: compare identity with the LATIN1() entry. */
    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && LATIN1(first) == unicode) ? 1 : 0;
}
#endif
1796 
1797 static int
unicode_modifiable(PyObject * unicode)1798 unicode_modifiable(PyObject *unicode)
1799 {
1800     assert(_PyUnicode_CHECK(unicode));
1801     if (Py_REFCNT(unicode) != 1)
1802         return 0;
1803     if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1)
1804         return 0;
1805     if (PyUnicode_CHECK_INTERNED(unicode))
1806         return 0;
1807     if (!PyUnicode_CheckExact(unicode))
1808         return 0;
1809 #ifdef Py_DEBUG
1810     /* singleton refcount is greater than 1 */
1811     assert(!unicode_is_singleton(unicode));
1812 #endif
1813     return 1;
1814 }
1815 
1816 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1817 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1818 {
1819     PyObject *unicode;
1820     Py_ssize_t old_length;
1821 
1822     assert(p_unicode != NULL);
1823     unicode = *p_unicode;
1824 
1825     assert(unicode != NULL);
1826     assert(PyUnicode_Check(unicode));
1827     assert(0 <= length);
1828 
1829     old_length = PyUnicode_GET_LENGTH(unicode);
1830     if (old_length == length)
1831         return 0;
1832 
1833     if (length == 0) {
1834         PyObject *empty = unicode_get_empty();
1835         Py_SETREF(*p_unicode, empty);
1836         return 0;
1837     }
1838 
1839     if (!unicode_modifiable(unicode)) {
1840         PyObject *copy = resize_copy(unicode, length);
1841         if (copy == NULL)
1842             return -1;
1843         Py_SETREF(*p_unicode, copy);
1844         return 0;
1845     }
1846 
1847     if (PyUnicode_IS_COMPACT(unicode)) {
1848         PyObject *new_unicode = resize_compact(unicode, length);
1849         if (new_unicode == NULL)
1850             return -1;
1851         *p_unicode = new_unicode;
1852         return 0;
1853     }
1854     return resize_inplace(unicode, length);
1855 }
1856 
1857 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1858 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1859 {
1860     PyObject *unicode;
1861     if (p_unicode == NULL) {
1862         PyErr_BadInternalCall();
1863         return -1;
1864     }
1865     unicode = *p_unicode;
1866     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1867     {
1868         PyErr_BadInternalCall();
1869         return -1;
1870     }
1871     return unicode_resize(p_unicode, length);
1872 }
1873 
1874 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1875 
1876    WARNING: The function doesn't copy the terminating null character and
1877    doesn't check the maximum character (may write a latin1 character in an
1878    ASCII string). */
1879 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1880 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1881                    const char *str, Py_ssize_t len)
1882 {
1883     int kind = PyUnicode_KIND(unicode);
1884     const void *data = PyUnicode_DATA(unicode);
1885     const char *end = str + len;
1886 
1887     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1888     switch (kind) {
1889     case PyUnicode_1BYTE_KIND: {
1890 #ifdef Py_DEBUG
1891         if (PyUnicode_IS_ASCII(unicode)) {
1892             Py_UCS4 maxchar = ucs1lib_find_max_char(
1893                 (const Py_UCS1*)str,
1894                 (const Py_UCS1*)str + len);
1895             assert(maxchar < 128);
1896         }
1897 #endif
1898         memcpy((char *) data + index, str, len);
1899         break;
1900     }
1901     case PyUnicode_2BYTE_KIND: {
1902         Py_UCS2 *start = (Py_UCS2 *)data + index;
1903         Py_UCS2 *ucs2 = start;
1904 
1905         for (; str < end; ++ucs2, ++str)
1906             *ucs2 = (Py_UCS2)*str;
1907 
1908         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1909         break;
1910     }
1911     case PyUnicode_4BYTE_KIND: {
1912         Py_UCS4 *start = (Py_UCS4 *)data + index;
1913         Py_UCS4 *ucs4 = start;
1914 
1915         for (; str < end; ++ucs4, ++str)
1916             *ucs4 = (Py_UCS4)*str;
1917 
1918         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1919         break;
1920     }
1921     default:
1922         Py_UNREACHABLE();
1923     }
1924 }
1925 
1926 static PyObject*
get_latin1_char(Py_UCS1 ch)1927 get_latin1_char(Py_UCS1 ch)
1928 {
1929     PyObject *o = LATIN1(ch);
1930     return o;
1931 }
1932 
1933 static PyObject*
unicode_char(Py_UCS4 ch)1934 unicode_char(Py_UCS4 ch)
1935 {
1936     PyObject *unicode;
1937 
1938     assert(ch <= MAX_UNICODE);
1939 
1940     if (ch < 256) {
1941         return get_latin1_char(ch);
1942     }
1943 
1944     unicode = PyUnicode_New(1, ch);
1945     if (unicode == NULL)
1946         return NULL;
1947 
1948     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1949     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1950         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1951     } else {
1952         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1953         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1954     }
1955     assert(_PyUnicode_CheckConsistency(unicode, 1));
1956     return unicode;
1957 }
1958 
1959 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)1960 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1961 {
1962     PyObject *unicode;
1963     Py_UCS4 maxchar = 0;
1964     Py_ssize_t num_surrogates;
1965 
1966     if (u == NULL && size != 0) {
1967         PyErr_BadInternalCall();
1968         return NULL;
1969     }
1970 
1971     if (size == -1) {
1972         size = wcslen(u);
1973     }
1974 
1975     /* If the Unicode data is known at construction time, we can apply
1976        some optimizations which share commonly used objects. */
1977 
1978     /* Optimization for empty strings */
1979     if (size == 0)
1980         _Py_RETURN_UNICODE_EMPTY();
1981 
1982 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1983     /* Oracle Solaris uses non-Unicode internal wchar_t form for
1984        non-Unicode locales and hence needs conversion to UCS-4 first. */
1985     if (_Py_LocaleUsesNonUnicodeWchar()) {
1986         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1987         if (!converted) {
1988             return NULL;
1989         }
1990         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1991         PyMem_Free(converted);
1992         return unicode;
1993     }
1994 #endif
1995 
1996     /* Single character Unicode objects in the Latin-1 range are
1997        shared when using this constructor */
1998     if (size == 1 && (Py_UCS4)*u < 256)
1999         return get_latin1_char((unsigned char)*u);
2000 
2001     /* If not empty and not single character, copy the Unicode data
2002        into the new object */
2003     if (find_maxchar_surrogates(u, u + size,
2004                                 &maxchar, &num_surrogates) == -1)
2005         return NULL;
2006 
2007     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2008     if (!unicode)
2009         return NULL;
2010 
2011     switch (PyUnicode_KIND(unicode)) {
2012     case PyUnicode_1BYTE_KIND:
2013         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
2014                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2015         break;
2016     case PyUnicode_2BYTE_KIND:
2017 #if Py_UNICODE_SIZE == 2
2018         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2019 #else
2020         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
2021                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2022 #endif
2023         break;
2024     case PyUnicode_4BYTE_KIND:
2025 #if SIZEOF_WCHAR_T == 2
2026         /* This is the only case which has to process surrogates, thus
2027            a simple copy loop is not enough and we need a function. */
2028         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2029 #else
2030         assert(num_surrogates == 0);
2031         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2032 #endif
2033         break;
2034     default:
2035         Py_UNREACHABLE();
2036     }
2037 
2038     return unicode_result(unicode);
2039 }
2040 
2041 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2042 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2043 {
2044     if (size < 0) {
2045         PyErr_SetString(PyExc_SystemError,
2046                         "Negative size passed to PyUnicode_FromStringAndSize");
2047         return NULL;
2048     }
2049     if (u != NULL) {
2050         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2051     }
2052     if (size > 0) {
2053         PyErr_SetString(PyExc_SystemError,
2054             "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2055         return NULL;
2056     }
2057     return unicode_get_empty();
2058 }
2059 
2060 PyObject *
PyUnicode_FromString(const char * u)2061 PyUnicode_FromString(const char *u)
2062 {
2063     size_t size = strlen(u);
2064     if (size > PY_SSIZE_T_MAX) {
2065         PyErr_SetString(PyExc_OverflowError, "input too long");
2066         return NULL;
2067     }
2068     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2069 }
2070 
2071 
2072 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2073 _PyUnicode_FromId(_Py_Identifier *id)
2074 {
2075     PyMutex_Lock((PyMutex *)&id->mutex);
2076     PyInterpreterState *interp = _PyInterpreterState_GET();
2077     struct _Py_unicode_ids *ids = &interp->unicode.ids;
2078 
2079     Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2080     if (index < 0) {
2081         struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2082 
2083         PyMutex_Lock(&rt_ids->mutex);
2084         // Check again to detect concurrent access. Another thread can have
2085         // initialized the index while this thread waited for the lock.
2086         index = _Py_atomic_load_ssize(&id->index);
2087         if (index < 0) {
2088             assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2089             index = rt_ids->next_index;
2090             rt_ids->next_index++;
2091             _Py_atomic_store_ssize(&id->index, index);
2092         }
2093         PyMutex_Unlock(&rt_ids->mutex);
2094     }
2095     assert(index >= 0);
2096 
2097     PyObject *obj;
2098     if (index < ids->size) {
2099         obj = ids->array[index];
2100         if (obj) {
2101             // Return a borrowed reference
2102             goto end;
2103         }
2104     }
2105 
2106     obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2107                                        NULL, NULL);
2108     if (!obj) {
2109         goto end;
2110     }
2111     _PyUnicode_InternImmortal(interp, &obj);
2112 
2113     if (index >= ids->size) {
2114         // Overallocate to reduce the number of realloc
2115         Py_ssize_t new_size = Py_MAX(index * 2, 16);
2116         Py_ssize_t item_size = sizeof(ids->array[0]);
2117         PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2118         if (new_array == NULL) {
2119             PyErr_NoMemory();
2120             obj = NULL;
2121             goto end;
2122         }
2123         memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2124         ids->array = new_array;
2125         ids->size = new_size;
2126     }
2127 
2128     // The array stores a strong reference
2129     ids->array[index] = obj;
2130 
2131 end:
2132     PyMutex_Unlock((PyMutex *)&id->mutex);
2133     // Return a borrowed reference
2134     return obj;
2135 }
2136 
2137 
2138 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2139 unicode_clear_identifiers(struct _Py_unicode_state *state)
2140 {
2141     struct _Py_unicode_ids *ids = &state->ids;
2142     for (Py_ssize_t i=0; i < ids->size; i++) {
2143         Py_XDECREF(ids->array[i]);
2144     }
2145     ids->size = 0;
2146     PyMem_Free(ids->array);
2147     ids->array = NULL;
2148     // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2149     // after Py_Finalize().
2150 }
2151 
2152 
2153 /* Internal function, doesn't check maximum character */
2154 
2155 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2156 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2157 {
2158     const unsigned char *s = (const unsigned char *)buffer;
2159     PyObject *unicode;
2160     if (size == 1) {
2161 #ifdef Py_DEBUG
2162         assert((unsigned char)s[0] < 128);
2163 #endif
2164         return get_latin1_char(s[0]);
2165     }
2166     unicode = PyUnicode_New(size, 127);
2167     if (!unicode)
2168         return NULL;
2169     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2170     assert(_PyUnicode_CheckConsistency(unicode, 1));
2171     return unicode;
2172 }
2173 
2174 static Py_UCS4
kind_maxchar_limit(int kind)2175 kind_maxchar_limit(int kind)
2176 {
2177     switch (kind) {
2178     case PyUnicode_1BYTE_KIND:
2179         return 0x80;
2180     case PyUnicode_2BYTE_KIND:
2181         return 0x100;
2182     case PyUnicode_4BYTE_KIND:
2183         return 0x10000;
2184     default:
2185         Py_UNREACHABLE();
2186     }
2187 }
2188 
2189 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2190 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2191 {
2192     PyObject *res;
2193     unsigned char max_char;
2194 
2195     if (size == 0) {
2196         _Py_RETURN_UNICODE_EMPTY();
2197     }
2198     assert(size > 0);
2199     if (size == 1) {
2200         return get_latin1_char(u[0]);
2201     }
2202 
2203     max_char = ucs1lib_find_max_char(u, u + size);
2204     res = PyUnicode_New(size, max_char);
2205     if (!res)
2206         return NULL;
2207     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2208     assert(_PyUnicode_CheckConsistency(res, 1));
2209     return res;
2210 }
2211 
2212 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2213 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2214 {
2215     PyObject *res;
2216     Py_UCS2 max_char;
2217 
2218     if (size == 0)
2219         _Py_RETURN_UNICODE_EMPTY();
2220     assert(size > 0);
2221     if (size == 1)
2222         return unicode_char(u[0]);
2223 
2224     max_char = ucs2lib_find_max_char(u, u + size);
2225     res = PyUnicode_New(size, max_char);
2226     if (!res)
2227         return NULL;
2228     if (max_char >= 256)
2229         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2230     else {
2231         _PyUnicode_CONVERT_BYTES(
2232             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2233     }
2234     assert(_PyUnicode_CheckConsistency(res, 1));
2235     return res;
2236 }
2237 
2238 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2239 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2240 {
2241     PyObject *res;
2242     Py_UCS4 max_char;
2243 
2244     if (size == 0)
2245         _Py_RETURN_UNICODE_EMPTY();
2246     assert(size > 0);
2247     if (size == 1)
2248         return unicode_char(u[0]);
2249 
2250     max_char = ucs4lib_find_max_char(u, u + size);
2251     res = PyUnicode_New(size, max_char);
2252     if (!res)
2253         return NULL;
2254     if (max_char < 256)
2255         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2256                                  PyUnicode_1BYTE_DATA(res));
2257     else if (max_char < 0x10000)
2258         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2259                                  PyUnicode_2BYTE_DATA(res));
2260     else
2261         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2262     assert(_PyUnicode_CheckConsistency(res, 1));
2263     return res;
2264 }
2265 
2266 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2267 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2268 {
2269     if (size < 0) {
2270         PyErr_SetString(PyExc_ValueError, "size must be positive");
2271         return NULL;
2272     }
2273     switch (kind) {
2274     case PyUnicode_1BYTE_KIND:
2275         return _PyUnicode_FromUCS1(buffer, size);
2276     case PyUnicode_2BYTE_KIND:
2277         return _PyUnicode_FromUCS2(buffer, size);
2278     case PyUnicode_4BYTE_KIND:
2279         return _PyUnicode_FromUCS4(buffer, size);
2280     default:
2281         PyErr_SetString(PyExc_SystemError, "invalid kind");
2282         return NULL;
2283     }
2284 }
2285 
2286 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2287 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2288 {
2289     int kind;
2290     const void *startptr, *endptr;
2291 
2292     assert(0 <= start);
2293     assert(end <= PyUnicode_GET_LENGTH(unicode));
2294     assert(start <= end);
2295 
2296     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2297         return PyUnicode_MAX_CHAR_VALUE(unicode);
2298 
2299     if (start == end)
2300         return 127;
2301 
2302     if (PyUnicode_IS_ASCII(unicode))
2303         return 127;
2304 
2305     kind = PyUnicode_KIND(unicode);
2306     startptr = PyUnicode_DATA(unicode);
2307     endptr = (char *)startptr + end * kind;
2308     startptr = (char *)startptr + start * kind;
2309     switch(kind) {
2310     case PyUnicode_1BYTE_KIND:
2311         return ucs1lib_find_max_char(startptr, endptr);
2312     case PyUnicode_2BYTE_KIND:
2313         return ucs2lib_find_max_char(startptr, endptr);
2314     case PyUnicode_4BYTE_KIND:
2315         return ucs4lib_find_max_char(startptr, endptr);
2316     default:
2317         Py_UNREACHABLE();
2318     }
2319 }
2320 
2321 /* Ensure that a string uses the most efficient storage, if it is not the
2322    case: create a new string with of the right kind. Write NULL into *p_unicode
2323    on error. */
2324 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2325 unicode_adjust_maxchar(PyObject **p_unicode)
2326 {
2327     PyObject *unicode, *copy;
2328     Py_UCS4 max_char;
2329     Py_ssize_t len;
2330     int kind;
2331 
2332     assert(p_unicode != NULL);
2333     unicode = *p_unicode;
2334     if (PyUnicode_IS_ASCII(unicode))
2335         return;
2336 
2337     len = PyUnicode_GET_LENGTH(unicode);
2338     kind = PyUnicode_KIND(unicode);
2339     if (kind == PyUnicode_1BYTE_KIND) {
2340         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2341         max_char = ucs1lib_find_max_char(u, u + len);
2342         if (max_char >= 128)
2343             return;
2344     }
2345     else if (kind == PyUnicode_2BYTE_KIND) {
2346         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2347         max_char = ucs2lib_find_max_char(u, u + len);
2348         if (max_char >= 256)
2349             return;
2350     }
2351     else if (kind == PyUnicode_4BYTE_KIND) {
2352         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2353         max_char = ucs4lib_find_max_char(u, u + len);
2354         if (max_char >= 0x10000)
2355             return;
2356     }
2357     else
2358         Py_UNREACHABLE();
2359 
2360     copy = PyUnicode_New(len, max_char);
2361     if (copy != NULL)
2362         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2363     Py_DECREF(unicode);
2364     *p_unicode = copy;
2365 }
2366 
2367 PyObject*
_PyUnicode_Copy(PyObject * unicode)2368 _PyUnicode_Copy(PyObject *unicode)
2369 {
2370     Py_ssize_t length;
2371     PyObject *copy;
2372 
2373     if (!PyUnicode_Check(unicode)) {
2374         PyErr_BadInternalCall();
2375         return NULL;
2376     }
2377 
2378     length = PyUnicode_GET_LENGTH(unicode);
2379     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2380     if (!copy)
2381         return NULL;
2382     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2383 
2384     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2385               length * PyUnicode_KIND(unicode));
2386     assert(_PyUnicode_CheckConsistency(copy, 1));
2387     return copy;
2388 }
2389 
2390 
2391 /* Widen Unicode objects to larger buffers. Don't write terminating null
2392    character. Return NULL on error. */
2393 
2394 static void*
unicode_askind(int skind,void const * data,Py_ssize_t len,int kind)2395 unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2396 {
2397     void *result;
2398 
2399     assert(skind < kind);
2400     switch (kind) {
2401     case PyUnicode_2BYTE_KIND:
2402         result = PyMem_New(Py_UCS2, len);
2403         if (!result)
2404             return PyErr_NoMemory();
2405         assert(skind == PyUnicode_1BYTE_KIND);
2406         _PyUnicode_CONVERT_BYTES(
2407             Py_UCS1, Py_UCS2,
2408             (const Py_UCS1 *)data,
2409             ((const Py_UCS1 *)data) + len,
2410             result);
2411         return result;
2412     case PyUnicode_4BYTE_KIND:
2413         result = PyMem_New(Py_UCS4, len);
2414         if (!result)
2415             return PyErr_NoMemory();
2416         if (skind == PyUnicode_2BYTE_KIND) {
2417             _PyUnicode_CONVERT_BYTES(
2418                 Py_UCS2, Py_UCS4,
2419                 (const Py_UCS2 *)data,
2420                 ((const Py_UCS2 *)data) + len,
2421                 result);
2422         }
2423         else {
2424             assert(skind == PyUnicode_1BYTE_KIND);
2425             _PyUnicode_CONVERT_BYTES(
2426                 Py_UCS1, Py_UCS4,
2427                 (const Py_UCS1 *)data,
2428                 ((const Py_UCS1 *)data) + len,
2429                 result);
2430         }
2431         return result;
2432     default:
2433         Py_UNREACHABLE();
2434         return NULL;
2435     }
2436 }
2437 
2438 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2439 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2440         int copy_null)
2441 {
2442     int kind;
2443     const void *data;
2444     Py_ssize_t len, targetlen;
2445     kind = PyUnicode_KIND(string);
2446     data = PyUnicode_DATA(string);
2447     len = PyUnicode_GET_LENGTH(string);
2448     targetlen = len;
2449     if (copy_null)
2450         targetlen++;
2451     if (!target) {
2452         target = PyMem_New(Py_UCS4, targetlen);
2453         if (!target) {
2454             PyErr_NoMemory();
2455             return NULL;
2456         }
2457     }
2458     else {
2459         if (targetsize < targetlen) {
2460             PyErr_Format(PyExc_SystemError,
2461                          "string is longer than the buffer");
2462             if (copy_null && 0 < targetsize)
2463                 target[0] = 0;
2464             return NULL;
2465         }
2466     }
2467     if (kind == PyUnicode_1BYTE_KIND) {
2468         const Py_UCS1 *start = (const Py_UCS1 *) data;
2469         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2470     }
2471     else if (kind == PyUnicode_2BYTE_KIND) {
2472         const Py_UCS2 *start = (const Py_UCS2 *) data;
2473         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2474     }
2475     else if (kind == PyUnicode_4BYTE_KIND) {
2476         memcpy(target, data, len * sizeof(Py_UCS4));
2477     }
2478     else {
2479         Py_UNREACHABLE();
2480     }
2481     if (copy_null)
2482         target[len] = 0;
2483     return target;
2484 }
2485 
2486 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2487 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2488                  int copy_null)
2489 {
2490     if (target == NULL || targetsize < 0) {
2491         PyErr_BadInternalCall();
2492         return NULL;
2493     }
2494     return as_ucs4(string, target, targetsize, copy_null);
2495 }
2496 
2497 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2498 PyUnicode_AsUCS4Copy(PyObject *string)
2499 {
2500     return as_ucs4(string, NULL, 0, 1);
2501 }
2502 
/* Maximum number of characters required for the output of %jo, %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL.
   The ceiling is computed as (8*sizeof(intmax_t) - 1) / 3 + 1, so the
   constant 5 below is that 1 plus the 4 extra characters listed above. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2508 
2509 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision,int flags)2510 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2511                              Py_ssize_t width, Py_ssize_t precision, int flags)
2512 {
2513     Py_ssize_t length, fill, arglen;
2514     Py_UCS4 maxchar;
2515 
2516     length = PyUnicode_GET_LENGTH(str);
2517     if ((precision == -1 || precision >= length)
2518         && width <= length)
2519         return _PyUnicodeWriter_WriteStr(writer, str);
2520 
2521     if (precision != -1)
2522         length = Py_MIN(precision, length);
2523 
2524     arglen = Py_MAX(length, width);
2525     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527     else
2528         maxchar = writer->maxchar;
2529 
2530     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531         return -1;
2532 
2533     fill = Py_MAX(width - length, 0);
2534     if (fill && !(flags & F_LJUST)) {
2535         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536             return -1;
2537         writer->pos += fill;
2538     }
2539 
2540     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541                                   str, 0, length);
2542     writer->pos += length;
2543 
2544     if (fill && (flags & F_LJUST)) {
2545         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2546             return -1;
2547         writer->pos += fill;
2548     }
2549 
2550     return 0;
2551 }
2552 
2553 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision,int flags)2554 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2555                               Py_ssize_t width, Py_ssize_t precision, int flags)
2556 {
2557     /* UTF-8 */
2558     Py_ssize_t length;
2559     PyObject *unicode;
2560     int res;
2561 
2562     if (precision == -1) {
2563         length = strlen(str);
2564     }
2565     else {
2566         length = 0;
2567         while (length < precision && str[length]) {
2568             length++;
2569         }
2570     }
2571     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2572     if (unicode == NULL)
2573         return -1;
2574 
2575     res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2576     Py_DECREF(unicode);
2577     return res;
2578 }
2579 
2580 static int
unicode_fromformat_write_wcstr(_PyUnicodeWriter * writer,const wchar_t * str,Py_ssize_t width,Py_ssize_t precision,int flags)2581 unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2582                               Py_ssize_t width, Py_ssize_t precision, int flags)
2583 {
2584     /* UTF-8 */
2585     Py_ssize_t length;
2586     PyObject *unicode;
2587     int res;
2588 
2589     if (precision == -1) {
2590         length = wcslen(str);
2591     }
2592     else {
2593         length = 0;
2594         while (length < precision && str[length]) {
2595             length++;
2596         }
2597     }
2598     unicode = PyUnicode_FromWideChar(str, length);
2599     if (unicode == NULL)
2600         return -1;
2601 
2602     res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2603     Py_DECREF(unicode);
2604     return res;
2605 }
2606 
/* Length-modifier codes.  Each value is also the index of the entry with
   the matching modifier (l, ll, z, t, j) in the tables below; index 0 holds
   the entry without a length modifier. */
#define F_LONG 1
#define F_LONGLONG 2
#define F_SIZE 3
#define F_PTRDIFF 4
#define F_INTMAX 5
/* printf-style conversion strings, one table per conversion character
   (d, o, u, x, X), each with one entry per length modifier. */
static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
2617 
/* Format a single '%' conversion of a format string and append the result
   to writer.

   f points at the '%' character that starts the conversion.  The arguments
   the conversion needs are fetched from *vargs with va_arg(), in the order
   width ('*'), precision ('.*'), then the value(s) to format.

   Return a pointer to the first character after the conversion, or NULL on
   failure.  This function sets ValueError for an over-long literal
   width/precision, OverflowError for an out-of-range %c argument, TypeError
   for a non-type %N argument and SystemError for an unsupported conversion;
   other failures are propagated from the helpers it calls. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    /* Keep the start of the conversion for the "invalid format string"
       error message, then step over the '%'. */
    p = f;
    f++;
    /* "%%" writes a literal percent sign and consumes no argument. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        case '#': flags |= F_ALT; continue;
        }
        /* Not a flag character: un-read it and stop. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* width == -1 means "no width given". */
    width = -1;
    if (*f == '*') {
        /* Width supplied as an int argument; a negative value requests
           left justification with the absolute value as the width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* precision == -1 means "no precision given". */
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            /* Precision supplied as an int argument.  A negative value is
               stored as -2, which is distinct from the "absent" marker -1.
               NOTE(review): how -2 is treated by the string write helpers
               is not visible in this function -- confirm there. */
            precision = va_arg(*vargs, int);
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Parse the optional size modifier: "l", "ll", "z", "t" or "j". */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format string:
       nothing follows this output, so turn off writer overallocation. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* Validate the modifier/width/precision against the conversion before
       any value argument is consumed:
       - integer conversions accept every size modifier;
       - 'c' and 'p' accept no size modifier, width or precision;
       - 's' and 'V' accept only the 'l' modifier (wide strings);
       - everything else accepts no size modifier. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    switch (*f) {
    case 'c':
    {
        /* A single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        /* used by sprintf */
        char buffer[MAX_INTMAX_CHARS];
        const char *fmt = NULL;
        /* Pick the C format string for this conversion and size modifier;
           'd' and 'i' both take the default (signed decimal) table. */
        switch (*f) {
            case 'o': fmt = formats_o[sizemod]; break;
            case 'u': fmt = formats_u[sizemod]; break;
            case 'x': fmt = formats_x[sizemod]; break;
            case 'X': fmt = formats_X[sizemod]; break;
            default: fmt = formats[sizemod]; break;
        }
        int issigned = (*f == 'd' || *f == 'i');
        /* Fetch the argument with the C type matching the size modifier and
           signedness, and let sprintf() produce the bare digits (no width,
           precision or flags are part of fmt). */
        switch (sizemod) {
            case F_LONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
                break;
            case F_LONGLONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
                break;
            case F_SIZE:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, size_t));
                break;
            case F_PTRDIFF:
                /* The argument is read as ptrdiff_t for both the signed and
                   the unsigned conversions. */
                len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
                break;
            case F_INTMAX:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
                break;
            default:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, int)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
                break;
        }
        assert(len >= 0);

        /* From here on len counts digits only; the sign is written
           separately so that zero padding goes between sign and digits. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* precision becomes the minimum number of digits, width the total
           field size (never smaller than sign + digits).  A missing (-1) or
           negative (-2) precision is replaced by len here. */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        /* The '0' flag (ignored when left-justifying) fills the whole field
           with zeros, i.e. the digit count is widened to the field width. */
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        /* Reserve room for the whole field (ASCII only, max char 127) so the
           PyUnicode_Fill() calls below can write into writer->buffer at
           writer->pos. */
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Output order: [spaces] [sign] [zeros] digits [spaces]. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        /* A void* argument, formatted by the C library's "%p". */
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (including its NUL) two
               places right and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* A C string: wchar_t* with the 'l' modifier (the only modifier
           that passed validation), otherwise char*. */
        if (sizemod) {
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        /* A str object, written as is (subject to width/precision). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object that may be NULL,
           followed by a C string (wchar_t* with 'l', else char*) that is
           used only when the object is NULL.  Only the pointer matching
           sizemod is assigned, and only that one is read below. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case 'T':
    {
        /* Fully qualified name of the type of the argument.  A strong
           reference to the type is held while the name is computed.  With
           the '#' flag the private variant is called with ':' as its second
           argument (presumably the module/name separator -- confirm against
           _PyType_GetFullyQualifiedName()). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        Py_DECREF(type);
        if (!type_name) {
            return NULL;
        }

        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    case 'N':
    {
        /* Like 'T', but the argument is the type object itself; anything
           that is not a type raises TypeError. */
        PyObject *type_raw = va_arg(*vargs, PyObject *);
        assert(type_raw != NULL);

        if (!PyType_Check(type_raw)) {
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
            return NULL;
        }
        PyTypeObject *type = (PyTypeObject*)type_raw;

        PyObject *type_name;
        if (flags & F_ALT) {
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        if (!type_name) {
            return NULL;
        }
        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    default:
    invalid_format:
        /* Unknown conversion character, or a modifier/width/precision that
           the conversion does not accept.  p still points at the '%'. */
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3035 
3036 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3037 PyUnicode_FromFormatV(const char *format, va_list vargs)
3038 {
3039     va_list vargs2;
3040     const char *f;
3041     _PyUnicodeWriter writer;
3042 
3043     _PyUnicodeWriter_Init(&writer);
3044     writer.min_length = strlen(format) + 100;
3045     writer.overallocate = 1;
3046 
3047     // Copy varags to be able to pass a reference to a subfunction.
3048     va_copy(vargs2, vargs);
3049 
3050     for (f = format; *f; ) {
3051         if (*f == '%') {
3052             f = unicode_fromformat_arg(&writer, f, &vargs2);
3053             if (f == NULL)
3054                 goto fail;
3055         }
3056         else {
3057             const char *p;
3058             Py_ssize_t len;
3059 
3060             p = f;
3061             do
3062             {
3063                 if ((unsigned char)*p > 127) {
3064                     PyErr_Format(PyExc_ValueError,
3065                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066                         "string, got a non-ASCII byte: 0x%02x",
3067                         (unsigned char)*p);
3068                     goto fail;
3069                 }
3070                 p++;
3071             }
3072             while (*p != '\0' && *p != '%');
3073             len = p - f;
3074 
3075             if (*p == '\0')
3076                 writer.overallocate = 0;
3077 
3078             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3079                 goto fail;
3080 
3081             f = p;
3082         }
3083     }
3084     va_end(vargs2);
3085     return _PyUnicodeWriter_Finish(&writer);
3086 
3087   fail:
3088     va_end(vargs2);
3089     _PyUnicodeWriter_Dealloc(&writer);
3090     return NULL;
3091 }
3092 
3093 PyObject *
PyUnicode_FromFormat(const char * format,...)3094 PyUnicode_FromFormat(const char *format, ...)
3095 {
3096     PyObject* ret;
3097     va_list vargs;
3098 
3099     va_start(vargs, format);
3100     ret = PyUnicode_FromFormatV(format, vargs);
3101     va_end(vargs);
3102     return ret;
3103 }
3104 
3105 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3106 unicode_get_widechar_size(PyObject *unicode)
3107 {
3108     Py_ssize_t res;
3109 
3110     assert(unicode != NULL);
3111     assert(_PyUnicode_CHECK(unicode));
3112 
3113     res = _PyUnicode_LENGTH(unicode);
3114 #if SIZEOF_WCHAR_T == 2
3115     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3116         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3117         const Py_UCS4 *end = s + res;
3118         for (; s < end; ++s) {
3119             if (*s > 0xFFFF) {
3120                 ++res;
3121             }
3122         }
3123     }
3124 #endif
3125     return res;
3126 }
3127 
3128 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3129 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3130 {
3131     assert(unicode != NULL);
3132     assert(_PyUnicode_CHECK(unicode));
3133 
3134     if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3135         memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3136         return;
3137     }
3138 
3139     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3140         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3141         for (; size--; ++s, ++w) {
3142             *w = *s;
3143         }
3144     }
3145     else {
3146 #if SIZEOF_WCHAR_T == 4
3147         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3148         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3149         for (; size--; ++s, ++w) {
3150             *w = *s;
3151         }
3152 #else
3153         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3154         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3155         for (; size--; ++s, ++w) {
3156             Py_UCS4 ch = *s;
3157             if (ch > 0xFFFF) {
3158                 assert(ch <= MAX_UNICODE);
3159                 /* encode surrogate pair in this case */
3160                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3161                 if (!size--)
3162                     break;
3163                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3164             }
3165             else {
3166                 *w = ch;
3167             }
3168         }
3169 #endif
3170     }
3171 }
3172 
3173 #ifdef HAVE_WCHAR_H
3174 
3175 /* Convert a Unicode object to a wide character string.
3176 
3177    - If w is NULL: return the number of wide characters (including the null
3178      character) required to convert the unicode object. Ignore size argument.
3179 
3180    - Otherwise: return the number of wide characters (excluding the null
3181      character) written into w. Write at most size wide characters (including
3182      the null character). */
3183 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3184 PyUnicode_AsWideChar(PyObject *unicode,
3185                      wchar_t *w,
3186                      Py_ssize_t size)
3187 {
3188     Py_ssize_t res;
3189 
3190     if (unicode == NULL) {
3191         PyErr_BadInternalCall();
3192         return -1;
3193     }
3194     if (!PyUnicode_Check(unicode)) {
3195         PyErr_BadArgument();
3196         return -1;
3197     }
3198 
3199     res = unicode_get_widechar_size(unicode);
3200     if (w == NULL) {
3201         return res + 1;
3202     }
3203 
3204     if (size > res) {
3205         size = res + 1;
3206     }
3207     else {
3208         res = size;
3209     }
3210     unicode_copy_as_widechar(unicode, w, size);
3211 
3212 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3213     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3214        non-Unicode locales and hence needs conversion first. */
3215     if (_Py_LocaleUsesNonUnicodeWchar()) {
3216         if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3217             return -1;
3218         }
3219     }
3220 #endif
3221 
3222     return res;
3223 }
3224 
3225 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3226 PyUnicode_AsWideCharString(PyObject *unicode,
3227                            Py_ssize_t *size)
3228 {
3229     wchar_t *buffer;
3230     Py_ssize_t buflen;
3231 
3232     if (unicode == NULL) {
3233         PyErr_BadInternalCall();
3234         return NULL;
3235     }
3236     if (!PyUnicode_Check(unicode)) {
3237         PyErr_BadArgument();
3238         return NULL;
3239     }
3240 
3241     buflen = unicode_get_widechar_size(unicode);
3242     buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3243     if (buffer == NULL) {
3244         PyErr_NoMemory();
3245         return NULL;
3246     }
3247     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3248 
3249 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3250     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3251        non-Unicode locales and hence needs conversion first. */
3252     if (_Py_LocaleUsesNonUnicodeWchar()) {
3253         if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3254             return NULL;
3255         }
3256     }
3257 #endif
3258 
3259     if (size != NULL) {
3260         *size = buflen;
3261     }
3262     else if (wcslen(buffer) != (size_t)buflen) {
3263         PyMem_Free(buffer);
3264         PyErr_SetString(PyExc_ValueError,
3265                         "embedded null character");
3266         return NULL;
3267     }
3268     return buffer;
3269 }
3270 
3271 #endif /* HAVE_WCHAR_H */
3272 
3273 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3274 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3275 {
3276     wchar_t **p = (wchar_t **)ptr;
3277     if (obj == NULL) {
3278         PyMem_Free(*p);
3279         *p = NULL;
3280         return 1;
3281     }
3282     if (PyUnicode_Check(obj)) {
3283         *p = PyUnicode_AsWideCharString(obj, NULL);
3284         if (*p == NULL) {
3285             return 0;
3286         }
3287         return Py_CLEANUP_SUPPORTED;
3288     }
3289     PyErr_Format(PyExc_TypeError,
3290                  "argument must be str, not %.50s",
3291                  Py_TYPE(obj)->tp_name);
3292     return 0;
3293 }
3294 
3295 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3296 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3297 {
3298     wchar_t **p = (wchar_t **)ptr;
3299     if (obj == NULL) {
3300         PyMem_Free(*p);
3301         *p = NULL;
3302         return 1;
3303     }
3304     if (obj == Py_None) {
3305         *p = NULL;
3306         return 1;
3307     }
3308     if (PyUnicode_Check(obj)) {
3309         *p = PyUnicode_AsWideCharString(obj, NULL);
3310         if (*p == NULL) {
3311             return 0;
3312         }
3313         return Py_CLEANUP_SUPPORTED;
3314     }
3315     PyErr_Format(PyExc_TypeError,
3316                  "argument must be str or None, not %.50s",
3317                  Py_TYPE(obj)->tp_name);
3318     return 0;
3319 }
3320 
3321 PyObject *
PyUnicode_FromOrdinal(int ordinal)3322 PyUnicode_FromOrdinal(int ordinal)
3323 {
3324     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3325         PyErr_SetString(PyExc_ValueError,
3326                         "chr() arg not in range(0x110000)");
3327         return NULL;
3328     }
3329 
3330     return unicode_char((Py_UCS4)ordinal);
3331 }
3332 
3333 PyObject *
PyUnicode_FromObject(PyObject * obj)3334 PyUnicode_FromObject(PyObject *obj)
3335 {
3336     /* XXX Perhaps we should make this API an alias of
3337        PyObject_Str() instead ?! */
3338     if (PyUnicode_CheckExact(obj)) {
3339         return Py_NewRef(obj);
3340     }
3341     if (PyUnicode_Check(obj)) {
3342         /* For a Unicode subtype that's not a Unicode object,
3343            return a true Unicode object with the same data. */
3344         return _PyUnicode_Copy(obj);
3345     }
3346     PyErr_Format(PyExc_TypeError,
3347                  "Can't convert '%.100s' object to str implicitly",
3348                  Py_TYPE(obj)->tp_name);
3349     return NULL;
3350 }
3351 
3352 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3353 PyUnicode_FromEncodedObject(PyObject *obj,
3354                             const char *encoding,
3355                             const char *errors)
3356 {
3357     Py_buffer buffer;
3358     PyObject *v;
3359 
3360     if (obj == NULL) {
3361         PyErr_BadInternalCall();
3362         return NULL;
3363     }
3364 
3365     /* Decoding bytes objects is the most common case and should be fast */
3366     if (PyBytes_Check(obj)) {
3367         if (PyBytes_GET_SIZE(obj) == 0) {
3368             if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369                 return NULL;
3370             }
3371             _Py_RETURN_UNICODE_EMPTY();
3372         }
3373         return PyUnicode_Decode(
3374                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3375                 encoding, errors);
3376     }
3377 
3378     if (PyUnicode_Check(obj)) {
3379         PyErr_SetString(PyExc_TypeError,
3380                         "decoding str is not supported");
3381         return NULL;
3382     }
3383 
3384     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3385     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3386         PyErr_Format(PyExc_TypeError,
3387                      "decoding to str: need a bytes-like object, %.80s found",
3388                      Py_TYPE(obj)->tp_name);
3389         return NULL;
3390     }
3391 
3392     if (buffer.len == 0) {
3393         PyBuffer_Release(&buffer);
3394         if (unicode_check_encoding_errors(encoding, errors) < 0) {
3395             return NULL;
3396         }
3397         _Py_RETURN_UNICODE_EMPTY();
3398     }
3399 
3400     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3401     PyBuffer_Release(&buffer);
3402     return v;
3403 }
3404 
/* Normalize an encoding name into the caller-supplied buffer lower, which
   has room for lower_len bytes including the terminating NUL.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_', except that a run at the
   very start or at the very end of the name produces nothing.  This is
   similar to encodings.normalize_encoding(), plus the lowercasing.

   Return 1 on success, or 0 if the result does not fit in lower_len-1
   characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    /* Set while a run of separator characters is waiting to be emitted. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != '\0'; in++) {
        const char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            pending_sep = 1;
            continue;
        }

        /* Emit the pending separator, unless nothing was written yet. */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3453 
3454 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3455 PyUnicode_Decode(const char *s,
3456                  Py_ssize_t size,
3457                  const char *encoding,
3458                  const char *errors)
3459 {
3460     PyObject *buffer = NULL, *unicode;
3461     Py_buffer info;
3462     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3463 
3464     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3465         return NULL;
3466     }
3467 
3468     if (size == 0) {
3469         _Py_RETURN_UNICODE_EMPTY();
3470     }
3471 
3472     if (encoding == NULL) {
3473         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3474     }
3475 
3476     /* Shortcuts for common default encodings */
3477     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3478         char *lower = buflower;
3479 
3480         /* Fast paths */
3481         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3482             lower += 3;
3483             if (*lower == '_') {
3484                 /* Match "utf8" and "utf_8" */
3485                 lower++;
3486             }
3487 
3488             if (lower[0] == '8' && lower[1] == 0) {
3489                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3490             }
3491             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3492                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3493             }
3494             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3495                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3496             }
3497         }
3498         else {
3499             if (strcmp(lower, "ascii") == 0
3500                 || strcmp(lower, "us_ascii") == 0) {
3501                 return PyUnicode_DecodeASCII(s, size, errors);
3502             }
3503     #ifdef MS_WINDOWS
3504             else if (strcmp(lower, "mbcs") == 0) {
3505                 return PyUnicode_DecodeMBCS(s, size, errors);
3506             }
3507     #endif
3508             else if (strcmp(lower, "latin1") == 0
3509                      || strcmp(lower, "latin_1") == 0
3510                      || strcmp(lower, "iso_8859_1") == 0
3511                      || strcmp(lower, "iso8859_1") == 0) {
3512                 return PyUnicode_DecodeLatin1(s, size, errors);
3513             }
3514         }
3515     }
3516 
3517     /* Decode via the codec registry */
3518     buffer = NULL;
3519     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3520         goto onError;
3521     buffer = PyMemoryView_FromBuffer(&info);
3522     if (buffer == NULL)
3523         goto onError;
3524     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3525     if (unicode == NULL)
3526         goto onError;
3527     if (!PyUnicode_Check(unicode)) {
3528         PyErr_Format(PyExc_TypeError,
3529                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3530                      "use codecs.decode() to decode to arbitrary types",
3531                      encoding,
3532                      Py_TYPE(unicode)->tp_name);
3533         Py_DECREF(unicode);
3534         goto onError;
3535     }
3536     Py_DECREF(buffer);
3537     return unicode_result(unicode);
3538 
3539   onError:
3540     Py_XDECREF(buffer);
3541     return NULL;
3542 }
3543 
3544 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3545 PyUnicode_AsDecodedObject(PyObject *unicode,
3546                           const char *encoding,
3547                           const char *errors)
3548 {
3549     if (!PyUnicode_Check(unicode)) {
3550         PyErr_BadArgument();
3551         return NULL;
3552     }
3553 
3554     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3555                      "PyUnicode_AsDecodedObject() is deprecated; "
3556                      "use PyCodec_Decode() to decode from str", 1) < 0)
3557         return NULL;
3558 
3559     if (encoding == NULL)
3560         encoding = PyUnicode_GetDefaultEncoding();
3561 
3562     /* Decode via the codec registry */
3563     return PyCodec_Decode(unicode, encoding, errors);
3564 }
3565 
3566 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3567 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3568                            const char *encoding,
3569                            const char *errors)
3570 {
3571     PyObject *v;
3572 
3573     if (!PyUnicode_Check(unicode)) {
3574         PyErr_BadArgument();
3575         goto onError;
3576     }
3577 
3578     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3579                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3580                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3581         return NULL;
3582 
3583     if (encoding == NULL)
3584         encoding = PyUnicode_GetDefaultEncoding();
3585 
3586     /* Decode via the codec registry */
3587     v = PyCodec_Decode(unicode, encoding, errors);
3588     if (v == NULL)
3589         goto onError;
3590     if (!PyUnicode_Check(v)) {
3591         PyErr_Format(PyExc_TypeError,
3592                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3593                      "use codecs.decode() to decode to arbitrary types",
3594                      encoding,
3595                      Py_TYPE(unicode)->tp_name);
3596         Py_DECREF(v);
3597         goto onError;
3598     }
3599     return unicode_result(v);
3600 
3601   onError:
3602     return NULL;
3603 }
3604 
3605 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3606 PyUnicode_AsEncodedObject(PyObject *unicode,
3607                           const char *encoding,
3608                           const char *errors)
3609 {
3610     PyObject *v;
3611 
3612     if (!PyUnicode_Check(unicode)) {
3613         PyErr_BadArgument();
3614         goto onError;
3615     }
3616 
3617     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3618                      "PyUnicode_AsEncodedObject() is deprecated; "
3619                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3620                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3621         return NULL;
3622 
3623     if (encoding == NULL)
3624         encoding = PyUnicode_GetDefaultEncoding();
3625 
3626     /* Encode via the codec registry */
3627     v = PyCodec_Encode(unicode, encoding, errors);
3628     if (v == NULL)
3629         goto onError;
3630     return v;
3631 
3632   onError:
3633     return NULL;
3634 }
3635 
3636 
3637 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3638 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3639                       int current_locale)
3640 {
3641     Py_ssize_t wlen;
3642     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3643     if (wstr == NULL) {
3644         return NULL;
3645     }
3646 
3647     if ((size_t)wlen != wcslen(wstr)) {
3648         PyErr_SetString(PyExc_ValueError, "embedded null character");
3649         PyMem_Free(wstr);
3650         return NULL;
3651     }
3652 
3653     char *str;
3654     size_t error_pos;
3655     const char *reason;
3656     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3657                                  current_locale, error_handler);
3658     PyMem_Free(wstr);
3659 
3660     if (res != 0) {
3661         if (res == -2) {
3662             PyObject *exc;
3663             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3664                     "locale", unicode,
3665                     (Py_ssize_t)error_pos,
3666                     (Py_ssize_t)(error_pos+1),
3667                     reason);
3668             if (exc != NULL) {
3669                 PyCodec_StrictErrors(exc);
3670                 Py_DECREF(exc);
3671             }
3672         }
3673         else if (res == -3) {
3674             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3675         }
3676         else {
3677             PyErr_NoMemory();
3678         }
3679         return NULL;
3680     }
3681 
3682     PyObject *bytes = PyBytes_FromString(str);
3683     PyMem_RawFree(str);
3684     return bytes;
3685 }
3686 
3687 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3688 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3689 {
3690     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3691     return unicode_encode_locale(unicode, error_handler, 1);
3692 }
3693 
3694 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3695 PyUnicode_EncodeFSDefault(PyObject *unicode)
3696 {
3697     PyInterpreterState *interp = _PyInterpreterState_GET();
3698     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3699     if (fs_codec->utf8) {
3700         return unicode_encode_utf8(unicode,
3701                                    fs_codec->error_handler,
3702                                    fs_codec->errors);
3703     }
3704 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3705     else if (fs_codec->encoding) {
3706         return PyUnicode_AsEncodedString(unicode,
3707                                          fs_codec->encoding,
3708                                          fs_codec->errors);
3709     }
3710 #endif
3711     else {
3712         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3713            machinery is not ready and so cannot be used:
3714            use wcstombs() in this case. */
3715         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3716         const wchar_t *filesystem_errors = config->filesystem_errors;
3717         assert(filesystem_errors != NULL);
3718         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3719         assert(errors != _Py_ERROR_UNKNOWN);
3720 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3721         return unicode_encode_utf8(unicode, errors, NULL);
3722 #else
3723         return unicode_encode_locale(unicode, errors, 0);
3724 #endif
3725     }
3726 }
3727 
3728 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3729 PyUnicode_AsEncodedString(PyObject *unicode,
3730                           const char *encoding,
3731                           const char *errors)
3732 {
3733     PyObject *v;
3734     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3735 
3736     if (!PyUnicode_Check(unicode)) {
3737         PyErr_BadArgument();
3738         return NULL;
3739     }
3740 
3741     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3742         return NULL;
3743     }
3744 
3745     if (encoding == NULL) {
3746         return _PyUnicode_AsUTF8String(unicode, errors);
3747     }
3748 
3749     /* Shortcuts for common default encodings */
3750     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3751         char *lower = buflower;
3752 
3753         /* Fast paths */
3754         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3755             lower += 3;
3756             if (*lower == '_') {
3757                 /* Match "utf8" and "utf_8" */
3758                 lower++;
3759             }
3760 
3761             if (lower[0] == '8' && lower[1] == 0) {
3762                 return _PyUnicode_AsUTF8String(unicode, errors);
3763             }
3764             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3765                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3766             }
3767             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3768                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3769             }
3770         }
3771         else {
3772             if (strcmp(lower, "ascii") == 0
3773                 || strcmp(lower, "us_ascii") == 0) {
3774                 return _PyUnicode_AsASCIIString(unicode, errors);
3775             }
3776 #ifdef MS_WINDOWS
3777             else if (strcmp(lower, "mbcs") == 0) {
3778                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3779             }
3780 #endif
3781             else if (strcmp(lower, "latin1") == 0 ||
3782                      strcmp(lower, "latin_1") == 0 ||
3783                      strcmp(lower, "iso_8859_1") == 0 ||
3784                      strcmp(lower, "iso8859_1") == 0) {
3785                 return _PyUnicode_AsLatin1String(unicode, errors);
3786             }
3787         }
3788     }
3789 
3790     /* Encode via the codec registry */
3791     v = _PyCodec_EncodeText(unicode, encoding, errors);
3792     if (v == NULL)
3793         return NULL;
3794 
3795     /* The normal path */
3796     if (PyBytes_Check(v))
3797         return v;
3798 
3799     /* If the codec returns a buffer, raise a warning and convert to bytes */
3800     if (PyByteArray_Check(v)) {
3801         int error;
3802         PyObject *b;
3803 
3804         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3805             "encoder %s returned bytearray instead of bytes; "
3806             "use codecs.encode() to encode to arbitrary types",
3807             encoding);
3808         if (error) {
3809             Py_DECREF(v);
3810             return NULL;
3811         }
3812 
3813         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3814                                       PyByteArray_GET_SIZE(v));
3815         Py_DECREF(v);
3816         return b;
3817     }
3818 
3819     PyErr_Format(PyExc_TypeError,
3820                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3821                  "use codecs.encode() to encode to arbitrary types",
3822                  encoding,
3823                  Py_TYPE(v)->tp_name);
3824     Py_DECREF(v);
3825     return NULL;
3826 }
3827 
3828 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3829 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3830                            const char *encoding,
3831                            const char *errors)
3832 {
3833     PyObject *v;
3834 
3835     if (!PyUnicode_Check(unicode)) {
3836         PyErr_BadArgument();
3837         goto onError;
3838     }
3839 
3840     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3841                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3842                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3843         return NULL;
3844 
3845     if (encoding == NULL)
3846         encoding = PyUnicode_GetDefaultEncoding();
3847 
3848     /* Encode via the codec registry */
3849     v = PyCodec_Encode(unicode, encoding, errors);
3850     if (v == NULL)
3851         goto onError;
3852     if (!PyUnicode_Check(v)) {
3853         PyErr_Format(PyExc_TypeError,
3854                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3855                      "use codecs.encode() to encode to arbitrary types",
3856                      encoding,
3857                      Py_TYPE(v)->tp_name);
3858         Py_DECREF(v);
3859         goto onError;
3860     }
3861     return v;
3862 
3863   onError:
3864     return NULL;
3865 }
3866 
3867 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3868 unicode_decode_locale(const char *str, Py_ssize_t len,
3869                       _Py_error_handler errors, int current_locale)
3870 {
3871     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3872         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3873         return NULL;
3874     }
3875 
3876     wchar_t *wstr;
3877     size_t wlen;
3878     const char *reason;
3879     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3880                                  current_locale, errors);
3881     if (res != 0) {
3882         if (res == -2) {
3883             PyObject *exc;
3884             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3885                                         "locale", str, len,
3886                                         (Py_ssize_t)wlen,
3887                                         (Py_ssize_t)(wlen + 1),
3888                                         reason);
3889             if (exc != NULL) {
3890                 PyCodec_StrictErrors(exc);
3891                 Py_DECREF(exc);
3892             }
3893         }
3894         else if (res == -3) {
3895             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3896         }
3897         else {
3898             PyErr_NoMemory();
3899         }
3900         return NULL;
3901     }
3902 
3903     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3904     PyMem_RawFree(wstr);
3905     return unicode;
3906 }
3907 
3908 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3909 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3910                               const char *errors)
3911 {
3912     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3913     return unicode_decode_locale(str, len, error_handler, 1);
3914 }
3915 
3916 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3917 PyUnicode_DecodeLocale(const char *str, const char *errors)
3918 {
3919     Py_ssize_t size = (Py_ssize_t)strlen(str);
3920     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3921     return unicode_decode_locale(str, size, error_handler, 1);
3922 }
3923 
3924 
3925 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3926 PyUnicode_DecodeFSDefault(const char *s) {
3927     Py_ssize_t size = (Py_ssize_t)strlen(s);
3928     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3929 }
3930 
3931 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3932 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3933 {
3934     PyInterpreterState *interp = _PyInterpreterState_GET();
3935     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3936     if (fs_codec->utf8) {
3937         return unicode_decode_utf8(s, size,
3938                                    fs_codec->error_handler,
3939                                    fs_codec->errors,
3940                                    NULL);
3941     }
3942 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3943     else if (fs_codec->encoding) {
3944         return PyUnicode_Decode(s, size,
3945                                 fs_codec->encoding,
3946                                 fs_codec->errors);
3947     }
3948 #endif
3949     else {
3950         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3951            machinery is not ready and so cannot be used:
3952            use mbstowcs() in this case. */
3953         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3954         const wchar_t *filesystem_errors = config->filesystem_errors;
3955         assert(filesystem_errors != NULL);
3956         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3957         assert(errors != _Py_ERROR_UNKNOWN);
3958 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3959         return unicode_decode_utf8(s, size, errors, NULL, NULL);
3960 #else
3961         return unicode_decode_locale(s, size, errors, 0);
3962 #endif
3963     }
3964 }
3965 
3966 
3967 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3968 PyUnicode_FSConverter(PyObject* arg, void* addr)
3969 {
3970     PyObject *path = NULL;
3971     PyObject *output = NULL;
3972     Py_ssize_t size;
3973     const char *data;
3974     if (arg == NULL) {
3975         Py_DECREF(*(PyObject**)addr);
3976         *(PyObject**)addr = NULL;
3977         return 1;
3978     }
3979     path = PyOS_FSPath(arg);
3980     if (path == NULL) {
3981         return 0;
3982     }
3983     if (PyBytes_Check(path)) {
3984         output = path;
3985     }
3986     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3987         output = PyUnicode_EncodeFSDefault(path);
3988         Py_DECREF(path);
3989         if (!output) {
3990             return 0;
3991         }
3992         assert(PyBytes_Check(output));
3993     }
3994 
3995     size = PyBytes_GET_SIZE(output);
3996     data = PyBytes_AS_STRING(output);
3997     if ((size_t)size != strlen(data)) {
3998         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3999         Py_DECREF(output);
4000         return 0;
4001     }
4002     *(PyObject**)addr = output;
4003     return Py_CLEANUP_SUPPORTED;
4004 }
4005 
4006 
4007 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4008 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4009 {
4010     if (arg == NULL) {
4011         Py_DECREF(*(PyObject**)addr);
4012         *(PyObject**)addr = NULL;
4013         return 1;
4014     }
4015 
4016     PyObject *path = PyOS_FSPath(arg);
4017     if (path == NULL) {
4018         return 0;
4019     }
4020 
4021     PyObject *output = NULL;
4022     if (PyUnicode_Check(path)) {
4023         output = path;
4024     }
4025     else if (PyBytes_Check(path)) {
4026         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4027                                                   PyBytes_GET_SIZE(path));
4028         Py_DECREF(path);
4029         if (!output) {
4030             return 0;
4031         }
4032     }
4033     else {
4034         PyErr_Format(PyExc_TypeError,
4035                      "path should be string, bytes, or os.PathLike, not %.200s",
4036                      Py_TYPE(arg)->tp_name);
4037         Py_DECREF(path);
4038         return 0;
4039     }
4040 
4041     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4042                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4043         PyErr_SetString(PyExc_ValueError, "embedded null character");
4044         Py_DECREF(output);
4045         return 0;
4046     }
4047     *(PyObject**)addr = output;
4048     return Py_CLEANUP_SUPPORTED;
4049 }
4050 
4051 
4052 static int unicode_fill_utf8(PyObject *unicode);
4053 
4054 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4055 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4056 {
4057     if (!PyUnicode_Check(unicode)) {
4058         PyErr_BadArgument();
4059         if (psize) {
4060             *psize = -1;
4061         }
4062         return NULL;
4063     }
4064 
4065     if (PyUnicode_UTF8(unicode) == NULL) {
4066         if (unicode_fill_utf8(unicode) == -1) {
4067             if (psize) {
4068                 *psize = -1;
4069             }
4070             return NULL;
4071         }
4072     }
4073 
4074     if (psize) {
4075         *psize = PyUnicode_UTF8_LENGTH(unicode);
4076     }
4077     return PyUnicode_UTF8(unicode);
4078 }
4079 
4080 const char *
PyUnicode_AsUTF8(PyObject * unicode)4081 PyUnicode_AsUTF8(PyObject *unicode)
4082 {
4083     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4084 }
4085 
4086 const char *
_PyUnicode_AsUTF8NoNUL(PyObject * unicode)4087 _PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4088 {
4089     Py_ssize_t size;
4090     const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4091     if (s && strlen(s) != (size_t)size) {
4092         PyErr_SetString(PyExc_ValueError, "embedded null character");
4093         return NULL;
4094     }
4095     return s;
4096 }
4097 
4098 /*
4099 PyUnicode_GetSize() has been deprecated since Python 3.3
4100 because it returned length of Py_UNICODE.
4101 
4102 But this function is part of stable abi, because it doesn't
4103 include Py_UNICODE in signature and it was not excluded from
4104 stable ABI in PEP 384.
4105 */
4106 PyAPI_FUNC(Py_ssize_t)
PyUnicode_GetSize(PyObject * unicode)4107 PyUnicode_GetSize(PyObject *unicode)
4108 {
4109     PyErr_SetString(PyExc_RuntimeError,
4110                     "PyUnicode_GetSize has been removed.");
4111     return -1;
4112 }
4113 
4114 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4115 PyUnicode_GetLength(PyObject *unicode)
4116 {
4117     if (!PyUnicode_Check(unicode)) {
4118         PyErr_BadArgument();
4119         return -1;
4120     }
4121     return PyUnicode_GET_LENGTH(unicode);
4122 }
4123 
4124 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4125 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4126 {
4127     const void *data;
4128     int kind;
4129 
4130     if (!PyUnicode_Check(unicode)) {
4131         PyErr_BadArgument();
4132         return (Py_UCS4)-1;
4133     }
4134     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4135         PyErr_SetString(PyExc_IndexError, "string index out of range");
4136         return (Py_UCS4)-1;
4137     }
4138     data = PyUnicode_DATA(unicode);
4139     kind = PyUnicode_KIND(unicode);
4140     return PyUnicode_READ(kind, data, index);
4141 }
4142 
4143 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4144 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4145 {
4146     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4147         PyErr_BadArgument();
4148         return -1;
4149     }
4150     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4151         PyErr_SetString(PyExc_IndexError, "string index out of range");
4152         return -1;
4153     }
4154     if (unicode_check_modifiable(unicode))
4155         return -1;
4156     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4157         PyErr_SetString(PyExc_ValueError, "character out of range");
4158         return -1;
4159     }
4160     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4161                     index, ch);
4162     return 0;
4163 }
4164 
/* Return the name of the default encoding; this is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4170 
4171 /* create or adjust a UnicodeDecodeError */
4172 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4173 make_decode_exception(PyObject **exceptionObject,
4174                       const char *encoding,
4175                       const char *input, Py_ssize_t length,
4176                       Py_ssize_t startpos, Py_ssize_t endpos,
4177                       const char *reason)
4178 {
4179     if (*exceptionObject == NULL) {
4180         *exceptionObject = PyUnicodeDecodeError_Create(
4181             encoding, input, length, startpos, endpos, reason);
4182     }
4183     else {
4184         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4185             goto onError;
4186         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4187             goto onError;
4188         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4189             goto onError;
4190     }
4191     return;
4192 
4193 onError:
4194     Py_CLEAR(*exceptionObject);
4195 }
4196 
4197 #ifdef MS_WINDOWS
4198 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4199 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4200 {
4201     if (newsize > *size) {
4202         wchar_t *newbuf = *buf;
4203         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4204             PyErr_NoMemory();
4205             return -1;
4206         }
4207         *buf = newbuf;
4208     }
4209     *size = newsize;
4210     return 0;
4211 }
4212 
4213 /* error handling callback helper:
4214    build arguments, call the callback and check the arguments,
4215    if no exception occurred, copy the replacement to the output
4216    and adjust various state variables.
4217    return 0 on success, -1 on error
4218 */
4219 
4220 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4221 unicode_decode_call_errorhandler_wchar(
4222     const char *errors, PyObject **errorHandler,
4223     const char *encoding, const char *reason,
4224     const char **input, const char **inend, Py_ssize_t *startinpos,
4225     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4226     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4227 {
4228     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4229 
4230     PyObject *restuple = NULL;
4231     PyObject *repunicode = NULL;
4232     Py_ssize_t outsize;
4233     Py_ssize_t insize;
4234     Py_ssize_t requiredsize;
4235     Py_ssize_t newpos;
4236     PyObject *inputobj = NULL;
4237     Py_ssize_t repwlen;
4238 
4239     if (*errorHandler == NULL) {
4240         *errorHandler = PyCodec_LookupError(errors);
4241         if (*errorHandler == NULL)
4242             goto onError;
4243     }
4244 
4245     make_decode_exception(exceptionObject,
4246         encoding,
4247         *input, *inend - *input,
4248         *startinpos, *endinpos,
4249         reason);
4250     if (*exceptionObject == NULL)
4251         goto onError;
4252 
4253     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4254     if (restuple == NULL)
4255         goto onError;
4256     if (!PyTuple_Check(restuple)) {
4257         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4258         goto onError;
4259     }
4260     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4261         goto onError;
4262 
4263     /* Copy back the bytes variables, which might have been modified by the
4264        callback */
4265     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4266     if (!inputobj)
4267         goto onError;
4268     *input = PyBytes_AS_STRING(inputobj);
4269     insize = PyBytes_GET_SIZE(inputobj);
4270     *inend = *input + insize;
4271     /* we can DECREF safely, as the exception has another reference,
4272        so the object won't go away. */
4273     Py_DECREF(inputobj);
4274 
4275     if (newpos<0)
4276         newpos = insize+newpos;
4277     if (newpos<0 || newpos>insize) {
4278         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4279         goto onError;
4280     }
4281 
4282     repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4283     if (repwlen < 0)
4284         goto onError;
4285     repwlen--;
4286     /* need more space? (at least enough for what we
4287        have+the replacement+the rest of the string (starting
4288        at the new input position), so we won't have to check space
4289        when there are no errors in the rest of the string) */
4290     requiredsize = *outpos;
4291     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4292         goto overflow;
4293     requiredsize += repwlen;
4294     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4295         goto overflow;
4296     requiredsize += insize - newpos;
4297     outsize = *bufsize;
4298     if (requiredsize > outsize) {
4299         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4300             requiredsize = 2*outsize;
4301         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4302             goto onError;
4303         }
4304     }
4305     PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4306     *outpos += repwlen;
4307     *endinpos = newpos;
4308     *inptr = *input + newpos;
4309 
4310     /* we made it! */
4311     Py_DECREF(restuple);
4312     return 0;
4313 
4314   overflow:
4315     PyErr_SetString(PyExc_OverflowError,
4316                     "decoded result is too long for a Python string");
4317 
4318   onError:
4319     Py_XDECREF(restuple);
4320     return -1;
4321 }
4322 #endif   /* MS_WINDOWS */
4323 
4324 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4325 unicode_decode_call_errorhandler_writer(
4326     const char *errors, PyObject **errorHandler,
4327     const char *encoding, const char *reason,
4328     const char **input, const char **inend, Py_ssize_t *startinpos,
4329     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4330     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4331 {
4332     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4333 
4334     PyObject *restuple = NULL;
4335     PyObject *repunicode = NULL;
4336     Py_ssize_t insize;
4337     Py_ssize_t newpos;
4338     Py_ssize_t replen;
4339     Py_ssize_t remain;
4340     PyObject *inputobj = NULL;
4341     int need_to_grow = 0;
4342     const char *new_inptr;
4343 
4344     if (*errorHandler == NULL) {
4345         *errorHandler = PyCodec_LookupError(errors);
4346         if (*errorHandler == NULL)
4347             goto onError;
4348     }
4349 
4350     make_decode_exception(exceptionObject,
4351         encoding,
4352         *input, *inend - *input,
4353         *startinpos, *endinpos,
4354         reason);
4355     if (*exceptionObject == NULL)
4356         goto onError;
4357 
4358     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4359     if (restuple == NULL)
4360         goto onError;
4361     if (!PyTuple_Check(restuple)) {
4362         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4363         goto onError;
4364     }
4365     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4366         goto onError;
4367 
4368     /* Copy back the bytes variables, which might have been modified by the
4369        callback */
4370     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4371     if (!inputobj)
4372         goto onError;
4373     remain = *inend - *input - *endinpos;
4374     *input = PyBytes_AS_STRING(inputobj);
4375     insize = PyBytes_GET_SIZE(inputobj);
4376     *inend = *input + insize;
4377     /* we can DECREF safely, as the exception has another reference,
4378        so the object won't go away. */
4379     Py_DECREF(inputobj);
4380 
4381     if (newpos<0)
4382         newpos = insize+newpos;
4383     if (newpos<0 || newpos>insize) {
4384         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4385         goto onError;
4386     }
4387 
4388     replen = PyUnicode_GET_LENGTH(repunicode);
4389     if (replen > 1) {
4390         writer->min_length += replen - 1;
4391         need_to_grow = 1;
4392     }
4393     new_inptr = *input + newpos;
4394     if (*inend - new_inptr > remain) {
4395         /* We don't know the decoding algorithm here so we make the worst
4396            assumption that one byte decodes to one unicode character.
4397            If unfortunately one byte could decode to more unicode characters,
4398            the decoder may write out-of-bound then.  Is it possible for the
4399            algorithms using this function? */
4400         writer->min_length += *inend - new_inptr - remain;
4401         need_to_grow = 1;
4402     }
4403     if (need_to_grow) {
4404         writer->overallocate = 1;
4405         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4406                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4407             goto onError;
4408     }
4409     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4410         goto onError;
4411 
4412     *endinpos = newpos;
4413     *inptr = new_inptr;
4414 
4415     /* we made it! */
4416     Py_DECREF(restuple);
4417     return 0;
4418 
4419   onError:
4420     Py_XDECREF(restuple);
4421     return -1;
4422 }
4423 
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* True when c is one of the 64 base-64 characters (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: any character outside the first four groups is
   mapped to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && (c) != '+')
4458 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4492 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the code point to test; NUL and anything >= 128 is never
 *            direct.  Evaluated several times: no side effects allowed.
 * directO  - non-zero if "Set O" characters (category 1) are direct.
 * directWS - non-zero if whitespace (category 2) is direct.
 *
 * The flag arguments are parenthesized so that compound expressions
 * such as `a || b` keep their meaning next to the `&&` below. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4504 
4505 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4506 PyUnicode_DecodeUTF7(const char *s,
4507                      Py_ssize_t size,
4508                      const char *errors)
4509 {
4510     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4511 }
4512 
4513 /* The decoder.  The only state we preserve is our read position,
4514  * i.e. how many characters we have consumed.  So if we end in the
4515  * middle of a shift sequence we have to back off the read position
4516  * and the output to the beginning of the sequence, otherwise we lose
4517  * all the shift state (seen bits, number of bits seen, high
4518  * surrogate). */
4519 
4520 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4521 PyUnicode_DecodeUTF7Stateful(const char *s,
4522                              Py_ssize_t size,
4523                              const char *errors,
4524                              Py_ssize_t *consumed)
4525 {
4526     const char *starts = s;
4527     Py_ssize_t startinpos;
4528     Py_ssize_t endinpos;
4529     const char *e;
4530     _PyUnicodeWriter writer;
4531     const char *errmsg = "";
4532     int inShift = 0;
4533     Py_ssize_t shiftOutStart;
4534     unsigned int base64bits = 0;
4535     unsigned long base64buffer = 0;
4536     Py_UCS4 surrogate = 0;
4537     PyObject *errorHandler = NULL;
4538     PyObject *exc = NULL;
4539 
4540     if (size == 0) {
4541         if (consumed)
4542             *consumed = 0;
4543         _Py_RETURN_UNICODE_EMPTY();
4544     }
4545 
4546     /* Start off assuming it's all ASCII. Widen later as necessary. */
4547     _PyUnicodeWriter_Init(&writer);
4548     writer.min_length = size;
4549 
4550     shiftOutStart = 0;
4551     e = s + size;
4552 
4553     while (s < e) {
4554         Py_UCS4 ch;
4555       restart:
4556         ch = (unsigned char) *s;
4557 
4558         if (inShift) { /* in a base-64 section */
4559             if (IS_BASE64(ch)) { /* consume a base-64 character */
4560                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4561                 base64bits += 6;
4562                 s++;
4563                 if (base64bits >= 16) {
4564                     /* we have enough bits for a UTF-16 value */
4565                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4566                     base64bits -= 16;
4567                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4568                     assert(outCh <= 0xffff);
4569                     if (surrogate) {
4570                         /* expecting a second surrogate */
4571                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4572                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4573                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4574                                 goto onError;
4575                             surrogate = 0;
4576                             continue;
4577                         }
4578                         else {
4579                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4580                                 goto onError;
4581                             surrogate = 0;
4582                         }
4583                     }
4584                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4585                         /* first surrogate */
4586                         surrogate = outCh;
4587                     }
4588                     else {
4589                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4590                             goto onError;
4591                     }
4592                 }
4593             }
4594             else { /* now leaving a base-64 section */
4595                 inShift = 0;
4596                 if (base64bits > 0) { /* left-over bits */
4597                     if (base64bits >= 6) {
4598                         /* We've seen at least one base-64 character */
4599                         s++;
4600                         errmsg = "partial character in shift sequence";
4601                         goto utf7Error;
4602                     }
4603                     else {
4604                         /* Some bits remain; they should be zero */
4605                         if (base64buffer != 0) {
4606                             s++;
4607                             errmsg = "non-zero padding bits in shift sequence";
4608                             goto utf7Error;
4609                         }
4610                     }
4611                 }
4612                 if (surrogate && DECODE_DIRECT(ch)) {
4613                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4614                         goto onError;
4615                 }
4616                 surrogate = 0;
4617                 if (ch == '-') {
4618                     /* '-' is absorbed; other terminating
4619                        characters are preserved */
4620                     s++;
4621                 }
4622             }
4623         }
4624         else if ( ch == '+' ) {
4625             startinpos = s-starts;
4626             s++; /* consume '+' */
4627             if (s < e && *s == '-') { /* '+-' encodes '+' */
4628                 s++;
4629                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4630                     goto onError;
4631             }
4632             else if (s < e && !IS_BASE64(*s)) {
4633                 s++;
4634                 errmsg = "ill-formed sequence";
4635                 goto utf7Error;
4636             }
4637             else { /* begin base64-encoded section */
4638                 inShift = 1;
4639                 surrogate = 0;
4640                 shiftOutStart = writer.pos;
4641                 base64bits = 0;
4642                 base64buffer = 0;
4643             }
4644         }
4645         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4646             s++;
4647             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4648                 goto onError;
4649         }
4650         else {
4651             startinpos = s-starts;
4652             s++;
4653             errmsg = "unexpected special character";
4654             goto utf7Error;
4655         }
4656         continue;
4657 utf7Error:
4658         endinpos = s-starts;
4659         if (unicode_decode_call_errorhandler_writer(
4660                 errors, &errorHandler,
4661                 "utf7", errmsg,
4662                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4663                 &writer))
4664             goto onError;
4665     }
4666 
4667     /* end of string */
4668 
4669     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4670         /* if we're in an inconsistent state, that's an error */
4671         inShift = 0;
4672         if (surrogate ||
4673                 (base64bits >= 6) ||
4674                 (base64bits > 0 && base64buffer != 0)) {
4675             endinpos = size;
4676             if (unicode_decode_call_errorhandler_writer(
4677                     errors, &errorHandler,
4678                     "utf7", "unterminated shift sequence",
4679                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4680                     &writer))
4681                 goto onError;
4682             if (s < e)
4683                 goto restart;
4684         }
4685     }
4686 
4687     /* return state */
4688     if (consumed) {
4689         if (inShift) {
4690             *consumed = startinpos;
4691             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4692                 PyObject *result = PyUnicode_FromKindAndData(
4693                         writer.kind, writer.data, shiftOutStart);
4694                 Py_XDECREF(errorHandler);
4695                 Py_XDECREF(exc);
4696                 _PyUnicodeWriter_Dealloc(&writer);
4697                 return result;
4698             }
4699             writer.pos = shiftOutStart; /* back off output */
4700         }
4701         else {
4702             *consumed = s-starts;
4703         }
4704     }
4705 
4706     Py_XDECREF(errorHandler);
4707     Py_XDECREF(exc);
4708     return _PyUnicodeWriter_Finish(&writer);
4709 
4710   onError:
4711     Py_XDECREF(errorHandler);
4712     Py_XDECREF(exc);
4713     _PyUnicodeWriter_Dealloc(&writer);
4714     return NULL;
4715 }
4716 
4717 
4718 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4719 _PyUnicode_EncodeUTF7(PyObject *str,
4720                       int base64SetO,
4721                       int base64WhiteSpace,
4722                       const char *errors)
4723 {
4724     int kind;
4725     const void *data;
4726     Py_ssize_t len;
4727     PyObject *v;
4728     int inShift = 0;
4729     Py_ssize_t i;
4730     unsigned int base64bits = 0;
4731     unsigned long base64buffer = 0;
4732     char * out;
4733     const char * start;
4734 
4735     kind = PyUnicode_KIND(str);
4736     data = PyUnicode_DATA(str);
4737     len = PyUnicode_GET_LENGTH(str);
4738 
4739     if (len == 0)
4740         return PyBytes_FromStringAndSize(NULL, 0);
4741 
4742     /* It might be possible to tighten this worst case */
4743     if (len > PY_SSIZE_T_MAX / 8)
4744         return PyErr_NoMemory();
4745     v = PyBytes_FromStringAndSize(NULL, len * 8);
4746     if (v == NULL)
4747         return NULL;
4748 
4749     start = out = PyBytes_AS_STRING(v);
4750     for (i = 0; i < len; ++i) {
4751         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4752 
4753         if (inShift) {
4754             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4755                 /* shifting out */
4756                 if (base64bits) { /* output remaining bits */
4757                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4758                     base64buffer = 0;
4759                     base64bits = 0;
4760                 }
4761                 inShift = 0;
4762                 /* Characters not in the BASE64 set implicitly unshift the sequence
4763                    so no '-' is required, except if the character is itself a '-' */
4764                 if (IS_BASE64(ch) || ch == '-') {
4765                     *out++ = '-';
4766                 }
4767                 *out++ = (char) ch;
4768             }
4769             else {
4770                 goto encode_char;
4771             }
4772         }
4773         else { /* not in a shift sequence */
4774             if (ch == '+') {
4775                 *out++ = '+';
4776                         *out++ = '-';
4777             }
4778             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779                 *out++ = (char) ch;
4780             }
4781             else {
4782                 *out++ = '+';
4783                 inShift = 1;
4784                 goto encode_char;
4785             }
4786         }
4787         continue;
4788 encode_char:
4789         if (ch >= 0x10000) {
4790             assert(ch <= MAX_UNICODE);
4791 
4792             /* code first surrogate */
4793             base64bits += 16;
4794             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4795             while (base64bits >= 6) {
4796                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4797                 base64bits -= 6;
4798             }
4799             /* prepare second surrogate */
4800             ch = Py_UNICODE_LOW_SURROGATE(ch);
4801         }
4802         base64bits += 16;
4803         base64buffer = (base64buffer << 16) | ch;
4804         while (base64bits >= 6) {
4805             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4806             base64bits -= 6;
4807         }
4808     }
4809     if (base64bits)
4810         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4811     if (inShift)
4812         *out++ = '-';
4813     if (_PyBytes_Resize(&v, out - start) < 0)
4814         return NULL;
4815     return v;
4816 }
4817 
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
4823 
4824 /* --- UTF-8 Codec -------------------------------------------------------- */
4825 
4826 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4827 PyUnicode_DecodeUTF8(const char *s,
4828                      Py_ssize_t size,
4829                      const char *errors)
4830 {
4831     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4832 }
4833 
4834 #include "stringlib/asciilib.h"
4835 #include "stringlib/codecs.h"
4836 #include "stringlib/undef.h"
4837 
4838 #include "stringlib/ucs1lib.h"
4839 #include "stringlib/codecs.h"
4840 #include "stringlib/undef.h"
4841 
4842 #include "stringlib/ucs2lib.h"
4843 #include "stringlib/codecs.h"
4844 #include "stringlib/undef.h"
4845 
4846 #include "stringlib/ucs4lib.h"
4847 #include "stringlib/codecs.h"
4848 #include "stringlib/undef.h"
4849 
4850 /* Mask to quickly check whether a C 'size_t' contains a
4851    non-ASCII, UTF8-encoded char. */
4852 #if (SIZEOF_SIZE_T == 8)
4853 # define ASCII_CHAR_MASK 0x8080808080808080ULL
4854 #elif (SIZEOF_SIZE_T == 4)
4855 # define ASCII_CHAR_MASK 0x80808080U
4856 #else
4857 # error C 'size_t' size should be either 4 or 8!
4858 #endif
4859 
4860 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4861 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4862 {
4863     const char *p = start;
4864 
4865 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4866     assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
4867     if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4868         /* Fast path, see in STRINGLIB(utf8_decode) for
4869            an explanation. */
4870         /* Help allocation */
4871         const char *_p = p;
4872         Py_UCS1 * q = dest;
4873         while (_p + SIZEOF_SIZE_T <= end) {
4874             size_t value = *(const size_t *) _p;
4875             if (value & ASCII_CHAR_MASK)
4876                 break;
4877             *((size_t *)q) = value;
4878             _p += SIZEOF_SIZE_T;
4879             q += SIZEOF_SIZE_T;
4880         }
4881         p = _p;
4882         while (p < end) {
4883             if ((unsigned char)*p & 0x80)
4884                 break;
4885             *q++ = *p++;
4886         }
4887         return p - start;
4888     }
4889 #endif
4890     while (p < end) {
4891         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4892            for an explanation. */
4893         if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
4894             /* Help allocation */
4895             const char *_p = p;
4896             while (_p + SIZEOF_SIZE_T <= end) {
4897                 size_t value = *(const size_t *) _p;
4898                 if (value & ASCII_CHAR_MASK)
4899                     break;
4900                 _p += SIZEOF_SIZE_T;
4901             }
4902             p = _p;
4903             if (_p == end)
4904                 break;
4905         }
4906         if ((unsigned char)*p & 0x80)
4907             break;
4908         ++p;
4909     }
4910     memcpy(dest, start, p - start);
4911     return p - start;
4912 }
4913 
4914 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)4915 unicode_decode_utf8(const char *s, Py_ssize_t size,
4916                     _Py_error_handler error_handler, const char *errors,
4917                     Py_ssize_t *consumed)
4918 {
4919     if (size == 0) {
4920         if (consumed)
4921             *consumed = 0;
4922         _Py_RETURN_UNICODE_EMPTY();
4923     }
4924 
4925     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4926     if (size == 1 && (unsigned char)s[0] < 128) {
4927         if (consumed) {
4928             *consumed = 1;
4929         }
4930         return get_latin1_char((unsigned char)s[0]);
4931     }
4932 
4933     const char *starts = s;
4934     const char *end = s + size;
4935 
4936     // fast path: try ASCII string.
4937     PyObject *u = PyUnicode_New(size, 127);
4938     if (u == NULL) {
4939         return NULL;
4940     }
4941     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4942     if (s == end) {
4943         if (consumed) {
4944             *consumed = size;
4945         }
4946         return u;
4947     }
4948 
4949     // Use _PyUnicodeWriter after fast path is failed.
4950     _PyUnicodeWriter writer;
4951     _PyUnicodeWriter_InitWithBuffer(&writer, u);
4952     writer.pos = s - starts;
4953 
4954     Py_ssize_t startinpos, endinpos;
4955     const char *errmsg = "";
4956     PyObject *error_handler_obj = NULL;
4957     PyObject *exc = NULL;
4958 
4959     while (s < end) {
4960         Py_UCS4 ch;
4961         int kind = writer.kind;
4962 
4963         if (kind == PyUnicode_1BYTE_KIND) {
4964             if (PyUnicode_IS_ASCII(writer.buffer))
4965                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4966             else
4967                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4968         } else if (kind == PyUnicode_2BYTE_KIND) {
4969             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4970         } else {
4971             assert(kind == PyUnicode_4BYTE_KIND);
4972             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4973         }
4974 
4975         switch (ch) {
4976         case 0:
4977             if (s == end || consumed)
4978                 goto End;
4979             errmsg = "unexpected end of data";
4980             startinpos = s - starts;
4981             endinpos = end - starts;
4982             break;
4983         case 1:
4984             errmsg = "invalid start byte";
4985             startinpos = s - starts;
4986             endinpos = startinpos + 1;
4987             break;
4988         case 2:
4989             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4990                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4991             {
4992                 /* Truncated surrogate code in range D800-DFFF */
4993                 goto End;
4994             }
4995             /* fall through */
4996         case 3:
4997         case 4:
4998             errmsg = "invalid continuation byte";
4999             startinpos = s - starts;
5000             endinpos = startinpos + ch - 1;
5001             break;
5002         default:
5003             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5004                 goto onError;
5005             continue;
5006         }
5007 
5008         if (error_handler == _Py_ERROR_UNKNOWN)
5009             error_handler = _Py_GetErrorHandler(errors);
5010 
5011         switch (error_handler) {
5012         case _Py_ERROR_IGNORE:
5013             s += (endinpos - startinpos);
5014             break;
5015 
5016         case _Py_ERROR_REPLACE:
5017             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5018                 goto onError;
5019             s += (endinpos - startinpos);
5020             break;
5021 
5022         case _Py_ERROR_SURROGATEESCAPE:
5023         {
5024             Py_ssize_t i;
5025 
5026             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5027                 goto onError;
5028             for (i=startinpos; i<endinpos; i++) {
5029                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5030                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5031                                 ch + 0xdc00);
5032                 writer.pos++;
5033             }
5034             s += (endinpos - startinpos);
5035             break;
5036         }
5037 
5038         default:
5039             if (unicode_decode_call_errorhandler_writer(
5040                     errors, &error_handler_obj,
5041                     "utf-8", errmsg,
5042                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5043                     &writer))
5044                 goto onError;
5045         }
5046     }
5047 
5048 End:
5049     if (consumed)
5050         *consumed = s - starts;
5051 
5052     Py_XDECREF(error_handler_obj);
5053     Py_XDECREF(exc);
5054     return _PyUnicodeWriter_Finish(&writer);
5055 
5056 onError:
5057     Py_XDECREF(error_handler_obj);
5058     Py_XDECREF(exc);
5059     _PyUnicodeWriter_Dealloc(&writer);
5060     return NULL;
5061 }
5062 
5063 
5064 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5065 PyUnicode_DecodeUTF8Stateful(const char *s,
5066                              Py_ssize_t size,
5067                              const char *errors,
5068                              Py_ssize_t *consumed)
5069 {
5070     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5071 }
5072 
5073 
5074 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5075    non-zero, use strict error handler otherwise.
5076 
5077    On success, write a pointer to a newly allocated wide character string into
5078    *wstr (use PyMem_RawFree() to free the memory) and write the output length
5079    (in number of wchar_t units) into *wlen (if wlen is set).
5080 
5081    On memory allocation failure, return -1.
5082 
5083    On decoding error (if surrogateescape is zero), return -2. If wlen is
5084    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5085    is not NULL, write the decoding error message into *reason. */
5086 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5087 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5088                  const char **reason, _Py_error_handler errors)
5089 {
5090     const char *orig_s = s;
5091     const char *e;
5092     wchar_t *unicode;
5093     Py_ssize_t outpos;
5094 
5095     int surrogateescape = 0;
5096     int surrogatepass = 0;
5097     switch (errors)
5098     {
5099     case _Py_ERROR_STRICT:
5100         break;
5101     case _Py_ERROR_SURROGATEESCAPE:
5102         surrogateescape = 1;
5103         break;
5104     case _Py_ERROR_SURROGATEPASS:
5105         surrogatepass = 1;
5106         break;
5107     default:
5108         return -3;
5109     }
5110 
5111     /* Note: size will always be longer than the resulting Unicode
5112        character count */
5113     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5114         return -1;
5115     }
5116 
5117     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5118     if (!unicode) {
5119         return -1;
5120     }
5121 
5122     /* Unpack UTF-8 encoded data */
5123     e = s + size;
5124     outpos = 0;
5125     while (s < e) {
5126         Py_UCS4 ch;
5127 #if SIZEOF_WCHAR_T == 4
5128         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5129 #else
5130         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5131 #endif
5132         if (ch > 0xFF) {
5133 #if SIZEOF_WCHAR_T == 4
5134             Py_UNREACHABLE();
5135 #else
5136             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5137             /* write a surrogate pair */
5138             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5139             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5140 #endif
5141         }
5142         else {
5143             if (!ch && s == e) {
5144                 break;
5145             }
5146 
5147             if (surrogateescape) {
5148                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5149             }
5150             else {
5151                 /* Is it a valid three-byte code? */
5152                 if (surrogatepass
5153                     && (e - s) >= 3
5154                     && (s[0] & 0xf0) == 0xe0
5155                     && (s[1] & 0xc0) == 0x80
5156                     && (s[2] & 0xc0) == 0x80)
5157                 {
5158                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5159                     s += 3;
5160                     unicode[outpos++] = ch;
5161                 }
5162                 else {
5163                     PyMem_RawFree(unicode );
5164                     if (reason != NULL) {
5165                         switch (ch) {
5166                         case 0:
5167                             *reason = "unexpected end of data";
5168                             break;
5169                         case 1:
5170                             *reason = "invalid start byte";
5171                             break;
5172                         /* 2, 3, 4 */
5173                         default:
5174                             *reason = "invalid continuation byte";
5175                             break;
5176                         }
5177                     }
5178                     if (wlen != NULL) {
5179                         *wlen = s - orig_s;
5180                     }
5181                     return -2;
5182                 }
5183             }
5184         }
5185     }
5186     unicode[outpos] = L'\0';
5187     if (wlen) {
5188         *wlen = outpos;
5189     }
5190     *wstr = unicode;
5191     return 0;
5192 }
5193 
5194 
5195 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5196 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5197                                size_t *wlen)
5198 {
5199     wchar_t *wstr;
5200     int res = _Py_DecodeUTF8Ex(arg, arglen,
5201                                &wstr, wlen,
5202                                NULL, _Py_ERROR_SURROGATEESCAPE);
5203     if (res != 0) {
5204         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5205         assert(res != -3);
5206         if (wlen) {
5207             *wlen = (size_t)res;
5208         }
5209         return NULL;
5210     }
5211     return wstr;
5212 }
5213 
5214 
5215 /* UTF-8 encoder using the surrogateescape error handler .
5216 
5217    On success, return 0 and write the newly allocated character string (use
5218    PyMem_Free() to free the memory) into *str.
5219 
5220    On encoding failure, return -2 and write the position of the invalid
5221    surrogate character into *error_pos (if error_pos is set) and the decoding
5222    error message into *reason (if reason is set).
5223 
5224    On memory allocation failure, return -1. */
5225 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5226 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5227                  const char **reason, int raw_malloc, _Py_error_handler errors)
5228 {
5229     const Py_ssize_t max_char_size = 4;
5230     Py_ssize_t len = wcslen(text);
5231 
5232     assert(len >= 0);
5233 
5234     int surrogateescape = 0;
5235     int surrogatepass = 0;
5236     switch (errors)
5237     {
5238     case _Py_ERROR_STRICT:
5239         break;
5240     case _Py_ERROR_SURROGATEESCAPE:
5241         surrogateescape = 1;
5242         break;
5243     case _Py_ERROR_SURROGATEPASS:
5244         surrogatepass = 1;
5245         break;
5246     default:
5247         return -3;
5248     }
5249 
5250     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5251         return -1;
5252     }
5253     char *bytes;
5254     if (raw_malloc) {
5255         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5256     }
5257     else {
5258         bytes = PyMem_Malloc((len + 1) * max_char_size);
5259     }
5260     if (bytes == NULL) {
5261         return -1;
5262     }
5263 
5264     char *p = bytes;
5265     Py_ssize_t i;
5266     for (i = 0; i < len; ) {
5267         Py_ssize_t ch_pos = i;
5268         Py_UCS4 ch = text[i];
5269         i++;
5270 #if Py_UNICODE_SIZE == 2
5271         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5272             && i < len
5273             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5274         {
5275             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5276             i++;
5277         }
5278 #endif
5279 
5280         if (ch < 0x80) {
5281             /* Encode ASCII */
5282             *p++ = (char) ch;
5283 
5284         }
5285         else if (ch < 0x0800) {
5286             /* Encode Latin-1 */
5287             *p++ = (char)(0xc0 | (ch >> 6));
5288             *p++ = (char)(0x80 | (ch & 0x3f));
5289         }
5290         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5291             /* surrogateescape error handler */
5292             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5293                 if (error_pos != NULL) {
5294                     *error_pos = (size_t)ch_pos;
5295                 }
5296                 if (reason != NULL) {
5297                     *reason = "encoding error";
5298                 }
5299                 if (raw_malloc) {
5300                     PyMem_RawFree(bytes);
5301                 }
5302                 else {
5303                     PyMem_Free(bytes);
5304                 }
5305                 return -2;
5306             }
5307             *p++ = (char)(ch & 0xff);
5308         }
5309         else if (ch < 0x10000) {
5310             *p++ = (char)(0xe0 | (ch >> 12));
5311             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5312             *p++ = (char)(0x80 | (ch & 0x3f));
5313         }
5314         else {  /* ch >= 0x10000 */
5315             assert(ch <= MAX_UNICODE);
5316             /* Encode UCS4 Unicode ordinals */
5317             *p++ = (char)(0xf0 | (ch >> 18));
5318             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5319             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5320             *p++ = (char)(0x80 | (ch & 0x3f));
5321         }
5322     }
5323     *p++ = '\0';
5324 
5325     size_t final_size = (p - bytes);
5326     char *bytes2;
5327     if (raw_malloc) {
5328         bytes2 = PyMem_RawRealloc(bytes, final_size);
5329     }
5330     else {
5331         bytes2 = PyMem_Realloc(bytes, final_size);
5332     }
5333     if (bytes2 == NULL) {
5334         if (error_pos != NULL) {
5335             *error_pos = (size_t)-1;
5336         }
5337         if (raw_malloc) {
5338             PyMem_RawFree(bytes);
5339         }
5340         else {
5341             PyMem_Free(bytes);
5342         }
5343         return -1;
5344     }
5345     *str = bytes2;
5346     return 0;
5347 }
5348 
5349 
5350 /* Primary internal function which creates utf8 encoded bytes objects.
5351 
5352    Allocation strategy:  if the string is short, convert into a stack buffer
5353    and allocate exactly as much space needed at the end.  Else allocate the
5354    maximum possible needed (4 result bytes per Unicode character), and return
5355    the excess memory at the end.
5356 */
5357 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5358 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5359                     const char *errors)
5360 {
5361     if (!PyUnicode_Check(unicode)) {
5362         PyErr_BadArgument();
5363         return NULL;
5364     }
5365 
5366     if (PyUnicode_UTF8(unicode))
5367         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5368                                          PyUnicode_UTF8_LENGTH(unicode));
5369 
5370     int kind = PyUnicode_KIND(unicode);
5371     const void *data = PyUnicode_DATA(unicode);
5372     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5373 
5374     _PyBytesWriter writer;
5375     char *end;
5376 
5377     switch (kind) {
5378     default:
5379         Py_UNREACHABLE();
5380     case PyUnicode_1BYTE_KIND:
5381         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5382         assert(!PyUnicode_IS_ASCII(unicode));
5383         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5384         break;
5385     case PyUnicode_2BYTE_KIND:
5386         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5387         break;
5388     case PyUnicode_4BYTE_KIND:
5389         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5390         break;
5391     }
5392 
5393     if (end == NULL) {
5394         _PyBytesWriter_Dealloc(&writer);
5395         return NULL;
5396     }
5397     return _PyBytesWriter_Finish(&writer, end);
5398 }
5399 
5400 static int
unicode_fill_utf8(PyObject * unicode)5401 unicode_fill_utf8(PyObject *unicode)
5402 {
5403     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5404     assert(!PyUnicode_IS_ASCII(unicode));
5405 
5406     int kind = PyUnicode_KIND(unicode);
5407     const void *data = PyUnicode_DATA(unicode);
5408     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5409 
5410     _PyBytesWriter writer;
5411     char *end;
5412 
5413     switch (kind) {
5414     default:
5415         Py_UNREACHABLE();
5416     case PyUnicode_1BYTE_KIND:
5417         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5418                                    _Py_ERROR_STRICT, NULL);
5419         break;
5420     case PyUnicode_2BYTE_KIND:
5421         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5422                                    _Py_ERROR_STRICT, NULL);
5423         break;
5424     case PyUnicode_4BYTE_KIND:
5425         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5426                                    _Py_ERROR_STRICT, NULL);
5427         break;
5428     }
5429     if (end == NULL) {
5430         _PyBytesWriter_Dealloc(&writer);
5431         return -1;
5432     }
5433 
5434     const char *start = writer.use_small_buffer ? writer.small_buffer :
5435                     PyBytes_AS_STRING(writer.buffer);
5436     Py_ssize_t len = end - start;
5437 
5438     char *cache = PyMem_Malloc(len + 1);
5439     if (cache == NULL) {
5440         _PyBytesWriter_Dealloc(&writer);
5441         PyErr_NoMemory();
5442         return -1;
5443     }
5444     _PyUnicode_UTF8(unicode) = cache;
5445     _PyUnicode_UTF8_LENGTH(unicode) = len;
5446     memcpy(cache, start, len);
5447     cache[len] = '\0';
5448     _PyBytesWriter_Dealloc(&writer);
5449     return 0;
5450 }
5451 
5452 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5453 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5454 {
5455     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5456 }
5457 
5458 
5459 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5460 PyUnicode_AsUTF8String(PyObject *unicode)
5461 {
5462     return _PyUnicode_AsUTF8String(unicode, NULL);
5463 }
5464 
5465 /* --- UTF-32 Codec ------------------------------------------------------- */
5466 
5467 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5468 PyUnicode_DecodeUTF32(const char *s,
5469                       Py_ssize_t size,
5470                       const char *errors,
5471                       int *byteorder)
5472 {
5473     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5474 }
5475 
5476 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5477 PyUnicode_DecodeUTF32Stateful(const char *s,
5478                               Py_ssize_t size,
5479                               const char *errors,
5480                               int *byteorder,
5481                               Py_ssize_t *consumed)
5482 {
5483     const char *starts = s;
5484     Py_ssize_t startinpos;
5485     Py_ssize_t endinpos;
5486     _PyUnicodeWriter writer;
5487     const unsigned char *q, *e;
5488     int le, bo = 0;       /* assume native ordering by default */
5489     const char *encoding;
5490     const char *errmsg = "";
5491     PyObject *errorHandler = NULL;
5492     PyObject *exc = NULL;
5493 
5494     q = (const unsigned char *)s;
5495     e = q + size;
5496 
5497     if (byteorder)
5498         bo = *byteorder;
5499 
5500     /* Check for BOM marks (U+FEFF) in the input and adjust current
5501        byte order setting accordingly. In native mode, the leading BOM
5502        mark is skipped, in all other modes, it is copied to the output
5503        stream as-is (giving a ZWNBSP character). */
5504     if (bo == 0 && size >= 4) {
5505         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5506         if (bom == 0x0000FEFF) {
5507             bo = -1;
5508             q += 4;
5509         }
5510         else if (bom == 0xFFFE0000) {
5511             bo = 1;
5512             q += 4;
5513         }
5514         if (byteorder)
5515             *byteorder = bo;
5516     }
5517 
5518     if (q == e) {
5519         if (consumed)
5520             *consumed = size;
5521         _Py_RETURN_UNICODE_EMPTY();
5522     }
5523 
5524 #ifdef WORDS_BIGENDIAN
5525     le = bo < 0;
5526 #else
5527     le = bo <= 0;
5528 #endif
5529     encoding = le ? "utf-32-le" : "utf-32-be";
5530 
5531     _PyUnicodeWriter_Init(&writer);
5532     writer.min_length = (e - q + 3) / 4;
5533     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5534         goto onError;
5535 
5536     while (1) {
5537         Py_UCS4 ch = 0;
5538         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5539 
5540         if (e - q >= 4) {
5541             int kind = writer.kind;
5542             void *data = writer.data;
5543             const unsigned char *last = e - 4;
5544             Py_ssize_t pos = writer.pos;
5545             if (le) {
5546                 do {
5547                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5548                     if (ch > maxch)
5549                         break;
5550                     if (kind != PyUnicode_1BYTE_KIND &&
5551                         Py_UNICODE_IS_SURROGATE(ch))
5552                         break;
5553                     PyUnicode_WRITE(kind, data, pos++, ch);
5554                     q += 4;
5555                 } while (q <= last);
5556             }
5557             else {
5558                 do {
5559                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5560                     if (ch > maxch)
5561                         break;
5562                     if (kind != PyUnicode_1BYTE_KIND &&
5563                         Py_UNICODE_IS_SURROGATE(ch))
5564                         break;
5565                     PyUnicode_WRITE(kind, data, pos++, ch);
5566                     q += 4;
5567                 } while (q <= last);
5568             }
5569             writer.pos = pos;
5570         }
5571 
5572         if (Py_UNICODE_IS_SURROGATE(ch)) {
5573             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5574             startinpos = ((const char *)q) - starts;
5575             endinpos = startinpos + 4;
5576         }
5577         else if (ch <= maxch) {
5578             if (q == e || consumed)
5579                 break;
5580             /* remaining bytes at the end? (size should be divisible by 4) */
5581             errmsg = "truncated data";
5582             startinpos = ((const char *)q) - starts;
5583             endinpos = ((const char *)e) - starts;
5584         }
5585         else {
5586             if (ch < 0x110000) {
5587                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5588                     goto onError;
5589                 q += 4;
5590                 continue;
5591             }
5592             errmsg = "code point not in range(0x110000)";
5593             startinpos = ((const char *)q) - starts;
5594             endinpos = startinpos + 4;
5595         }
5596 
5597         /* The remaining input chars are ignored if the callback
5598            chooses to skip the input */
5599         if (unicode_decode_call_errorhandler_writer(
5600                 errors, &errorHandler,
5601                 encoding, errmsg,
5602                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5603                 &writer))
5604             goto onError;
5605     }
5606 
5607     if (consumed)
5608         *consumed = (const char *)q-starts;
5609 
5610     Py_XDECREF(errorHandler);
5611     Py_XDECREF(exc);
5612     return _PyUnicodeWriter_Finish(&writer);
5613 
5614   onError:
5615     _PyUnicodeWriter_Dealloc(&writer);
5616     Py_XDECREF(errorHandler);
5617     Py_XDECREF(exc);
5618     return NULL;
5619 }
5620 
5621 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5622 _PyUnicode_EncodeUTF32(PyObject *str,
5623                        const char *errors,
5624                        int byteorder)
5625 {
5626     int kind;
5627     const void *data;
5628     Py_ssize_t len;
5629     PyObject *v;
5630     uint32_t *out;
5631 #if PY_LITTLE_ENDIAN
5632     int native_ordering = byteorder <= 0;
5633 #else
5634     int native_ordering = byteorder >= 0;
5635 #endif
5636     const char *encoding;
5637     Py_ssize_t nsize, pos;
5638     PyObject *errorHandler = NULL;
5639     PyObject *exc = NULL;
5640     PyObject *rep = NULL;
5641 
5642     if (!PyUnicode_Check(str)) {
5643         PyErr_BadArgument();
5644         return NULL;
5645     }
5646     kind = PyUnicode_KIND(str);
5647     data = PyUnicode_DATA(str);
5648     len = PyUnicode_GET_LENGTH(str);
5649 
5650     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5651         return PyErr_NoMemory();
5652     nsize = len + (byteorder == 0);
5653     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5654     if (v == NULL)
5655         return NULL;
5656 
5657     /* output buffer is 4-bytes aligned */
5658     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5659     out = (uint32_t *)PyBytes_AS_STRING(v);
5660     if (byteorder == 0)
5661         *out++ = 0xFEFF;
5662     if (len == 0)
5663         goto done;
5664 
5665     if (byteorder == -1)
5666         encoding = "utf-32-le";
5667     else if (byteorder == 1)
5668         encoding = "utf-32-be";
5669     else
5670         encoding = "utf-32";
5671 
5672     if (kind == PyUnicode_1BYTE_KIND) {
5673         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5674         goto done;
5675     }
5676 
5677     pos = 0;
5678     while (pos < len) {
5679         Py_ssize_t newpos, repsize, moreunits;
5680 
5681         if (kind == PyUnicode_2BYTE_KIND) {
5682             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5683                                         &out, native_ordering);
5684         }
5685         else {
5686             assert(kind == PyUnicode_4BYTE_KIND);
5687             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5688                                         &out, native_ordering);
5689         }
5690         if (pos == len)
5691             break;
5692 
5693         rep = unicode_encode_call_errorhandler(
5694                 errors, &errorHandler,
5695                 encoding, "surrogates not allowed",
5696                 str, &exc, pos, pos + 1, &newpos);
5697         if (!rep)
5698             goto error;
5699 
5700         if (PyBytes_Check(rep)) {
5701             repsize = PyBytes_GET_SIZE(rep);
5702             if (repsize & 3) {
5703                 raise_encode_exception(&exc, encoding,
5704                                        str, pos, pos + 1,
5705                                        "surrogates not allowed");
5706                 goto error;
5707             }
5708             moreunits = repsize / 4;
5709         }
5710         else {
5711             assert(PyUnicode_Check(rep));
5712             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5713             if (!PyUnicode_IS_ASCII(rep)) {
5714                 raise_encode_exception(&exc, encoding,
5715                                        str, pos, pos + 1,
5716                                        "surrogates not allowed");
5717                 goto error;
5718             }
5719         }
5720         moreunits += pos - newpos;
5721         pos = newpos;
5722 
5723         /* four bytes are reserved for each surrogate */
5724         if (moreunits > 0) {
5725             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5726             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5727                 /* integer overflow */
5728                 PyErr_NoMemory();
5729                 goto error;
5730             }
5731             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
5732                 goto error;
5733             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5734         }
5735 
5736         if (PyBytes_Check(rep)) {
5737             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5738             out += repsize / 4;
5739         } else /* rep is unicode */ {
5740             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5741             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5742                                  &out, native_ordering);
5743         }
5744 
5745         Py_CLEAR(rep);
5746     }
5747 
5748     /* Cut back to size actually needed. This is necessary for, for example,
5749        encoding of a string containing isolated surrogates and the 'ignore'
5750        handler is used. */
5751     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5752     if (nsize != PyBytes_GET_SIZE(v))
5753       _PyBytes_Resize(&v, nsize);
5754     Py_XDECREF(errorHandler);
5755     Py_XDECREF(exc);
5756   done:
5757     return v;
5758   error:
5759     Py_XDECREF(rep);
5760     Py_XDECREF(errorHandler);
5761     Py_XDECREF(exc);
5762     Py_XDECREF(v);
5763     return NULL;
5764 }
5765 
5766 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5767 PyUnicode_AsUTF32String(PyObject *unicode)
5768 {
5769     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5770 }
5771 
5772 /* --- UTF-16 Codec ------------------------------------------------------- */
5773 
5774 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5775 PyUnicode_DecodeUTF16(const char *s,
5776                       Py_ssize_t size,
5777                       const char *errors,
5778                       int *byteorder)
5779 {
5780     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5781 }
5782 
5783 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5784 PyUnicode_DecodeUTF16Stateful(const char *s,
5785                               Py_ssize_t size,
5786                               const char *errors,
5787                               int *byteorder,
5788                               Py_ssize_t *consumed)
5789 {
5790     const char *starts = s;
5791     Py_ssize_t startinpos;
5792     Py_ssize_t endinpos;
5793     _PyUnicodeWriter writer;
5794     const unsigned char *q, *e;
5795     int bo = 0;       /* assume native ordering by default */
5796     int native_ordering;
5797     const char *errmsg = "";
5798     PyObject *errorHandler = NULL;
5799     PyObject *exc = NULL;
5800     const char *encoding;
5801 
5802     q = (const unsigned char *)s;
5803     e = q + size;
5804 
5805     if (byteorder)
5806         bo = *byteorder;
5807 
5808     /* Check for BOM marks (U+FEFF) in the input and adjust current
5809        byte order setting accordingly. In native mode, the leading BOM
5810        mark is skipped, in all other modes, it is copied to the output
5811        stream as-is (giving a ZWNBSP character). */
5812     if (bo == 0 && size >= 2) {
5813         const Py_UCS4 bom = (q[1] << 8) | q[0];
5814         if (bom == 0xFEFF) {
5815             q += 2;
5816             bo = -1;
5817         }
5818         else if (bom == 0xFFFE) {
5819             q += 2;
5820             bo = 1;
5821         }
5822         if (byteorder)
5823             *byteorder = bo;
5824     }
5825 
5826     if (q == e) {
5827         if (consumed)
5828             *consumed = size;
5829         _Py_RETURN_UNICODE_EMPTY();
5830     }
5831 
5832 #if PY_LITTLE_ENDIAN
5833     native_ordering = bo <= 0;
5834     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5835 #else
5836     native_ordering = bo >= 0;
5837     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5838 #endif
5839 
5840     /* Note: size will always be longer than the resulting Unicode
5841        character count normally.  Error handler will take care of
5842        resizing when needed. */
5843     _PyUnicodeWriter_Init(&writer);
5844     writer.min_length = (e - q + 1) / 2;
5845     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5846         goto onError;
5847 
5848     while (1) {
5849         Py_UCS4 ch = 0;
5850         if (e - q >= 2) {
5851             int kind = writer.kind;
5852             if (kind == PyUnicode_1BYTE_KIND) {
5853                 if (PyUnicode_IS_ASCII(writer.buffer))
5854                     ch = asciilib_utf16_decode(&q, e,
5855                             (Py_UCS1*)writer.data, &writer.pos,
5856                             native_ordering);
5857                 else
5858                     ch = ucs1lib_utf16_decode(&q, e,
5859                             (Py_UCS1*)writer.data, &writer.pos,
5860                             native_ordering);
5861             } else if (kind == PyUnicode_2BYTE_KIND) {
5862                 ch = ucs2lib_utf16_decode(&q, e,
5863                         (Py_UCS2*)writer.data, &writer.pos,
5864                         native_ordering);
5865             } else {
5866                 assert(kind == PyUnicode_4BYTE_KIND);
5867                 ch = ucs4lib_utf16_decode(&q, e,
5868                         (Py_UCS4*)writer.data, &writer.pos,
5869                         native_ordering);
5870             }
5871         }
5872 
5873         switch (ch)
5874         {
5875         case 0:
5876             /* remaining byte at the end? (size should be even) */
5877             if (q == e || consumed)
5878                 goto End;
5879             errmsg = "truncated data";
5880             startinpos = ((const char *)q) - starts;
5881             endinpos = ((const char *)e) - starts;
5882             break;
5883             /* The remaining input chars are ignored if the callback
5884                chooses to skip the input */
5885         case 1:
5886             q -= 2;
5887             if (consumed)
5888                 goto End;
5889             errmsg = "unexpected end of data";
5890             startinpos = ((const char *)q) - starts;
5891             endinpos = ((const char *)e) - starts;
5892             break;
5893         case 2:
5894             errmsg = "illegal encoding";
5895             startinpos = ((const char *)q) - 2 - starts;
5896             endinpos = startinpos + 2;
5897             break;
5898         case 3:
5899             errmsg = "illegal UTF-16 surrogate";
5900             startinpos = ((const char *)q) - 4 - starts;
5901             endinpos = startinpos + 2;
5902             break;
5903         default:
5904             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5905                 goto onError;
5906             continue;
5907         }
5908 
5909         if (unicode_decode_call_errorhandler_writer(
5910                 errors,
5911                 &errorHandler,
5912                 encoding, errmsg,
5913                 &starts,
5914                 (const char **)&e,
5915                 &startinpos,
5916                 &endinpos,
5917                 &exc,
5918                 (const char **)&q,
5919                 &writer))
5920             goto onError;
5921     }
5922 
5923 End:
5924     if (consumed)
5925         *consumed = (const char *)q-starts;
5926 
5927     Py_XDECREF(errorHandler);
5928     Py_XDECREF(exc);
5929     return _PyUnicodeWriter_Finish(&writer);
5930 
5931   onError:
5932     _PyUnicodeWriter_Dealloc(&writer);
5933     Py_XDECREF(errorHandler);
5934     Py_XDECREF(exc);
5935     return NULL;
5936 }
5937 
5938 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5939 _PyUnicode_EncodeUTF16(PyObject *str,
5940                        const char *errors,
5941                        int byteorder)
5942 {
5943     int kind;
5944     const void *data;
5945     Py_ssize_t len;
5946     PyObject *v;
5947     unsigned short *out;
5948     Py_ssize_t pairs;
5949 #if PY_BIG_ENDIAN
5950     int native_ordering = byteorder >= 0;
5951 #else
5952     int native_ordering = byteorder <= 0;
5953 #endif
5954     const char *encoding;
5955     Py_ssize_t nsize, pos;
5956     PyObject *errorHandler = NULL;
5957     PyObject *exc = NULL;
5958     PyObject *rep = NULL;
5959 
5960     if (!PyUnicode_Check(str)) {
5961         PyErr_BadArgument();
5962         return NULL;
5963     }
5964     kind = PyUnicode_KIND(str);
5965     data = PyUnicode_DATA(str);
5966     len = PyUnicode_GET_LENGTH(str);
5967 
5968     pairs = 0;
5969     if (kind == PyUnicode_4BYTE_KIND) {
5970         const Py_UCS4 *in = (const Py_UCS4 *)data;
5971         const Py_UCS4 *end = in + len;
5972         while (in < end) {
5973             if (*in++ >= 0x10000) {
5974                 pairs++;
5975             }
5976         }
5977     }
5978     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5979         return PyErr_NoMemory();
5980     }
5981     nsize = len + pairs + (byteorder == 0);
5982     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5983     if (v == NULL) {
5984         return NULL;
5985     }
5986 
5987     /* output buffer is 2-bytes aligned */
5988     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5989     out = (unsigned short *)PyBytes_AS_STRING(v);
5990     if (byteorder == 0) {
5991         *out++ = 0xFEFF;
5992     }
5993     if (len == 0) {
5994         goto done;
5995     }
5996 
5997     if (kind == PyUnicode_1BYTE_KIND) {
5998         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5999         goto done;
6000     }
6001 
6002     if (byteorder < 0) {
6003         encoding = "utf-16-le";
6004     }
6005     else if (byteorder > 0) {
6006         encoding = "utf-16-be";
6007     }
6008     else {
6009         encoding = "utf-16";
6010     }
6011 
6012     pos = 0;
6013     while (pos < len) {
6014         Py_ssize_t newpos, repsize, moreunits;
6015 
6016         if (kind == PyUnicode_2BYTE_KIND) {
6017             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6018                                         &out, native_ordering);
6019         }
6020         else {
6021             assert(kind == PyUnicode_4BYTE_KIND);
6022             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6023                                         &out, native_ordering);
6024         }
6025         if (pos == len)
6026             break;
6027 
6028         rep = unicode_encode_call_errorhandler(
6029                 errors, &errorHandler,
6030                 encoding, "surrogates not allowed",
6031                 str, &exc, pos, pos + 1, &newpos);
6032         if (!rep)
6033             goto error;
6034 
6035         if (PyBytes_Check(rep)) {
6036             repsize = PyBytes_GET_SIZE(rep);
6037             if (repsize & 1) {
6038                 raise_encode_exception(&exc, encoding,
6039                                        str, pos, pos + 1,
6040                                        "surrogates not allowed");
6041                 goto error;
6042             }
6043             moreunits = repsize / 2;
6044         }
6045         else {
6046             assert(PyUnicode_Check(rep));
6047             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6048             if (!PyUnicode_IS_ASCII(rep)) {
6049                 raise_encode_exception(&exc, encoding,
6050                                        str, pos, pos + 1,
6051                                        "surrogates not allowed");
6052                 goto error;
6053             }
6054         }
6055         moreunits += pos - newpos;
6056         pos = newpos;
6057 
6058         /* two bytes are reserved for each surrogate */
6059         if (moreunits > 0) {
6060             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6061             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6062                 /* integer overflow */
6063                 PyErr_NoMemory();
6064                 goto error;
6065             }
6066             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6067                 goto error;
6068             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6069         }
6070 
6071         if (PyBytes_Check(rep)) {
6072             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6073             out += repsize / 2;
6074         } else /* rep is unicode */ {
6075             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6076             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6077                                  &out, native_ordering);
6078         }
6079 
6080         Py_CLEAR(rep);
6081     }
6082 
6083     /* Cut back to size actually needed. This is necessary for, for example,
6084     encoding of a string containing isolated surrogates and the 'ignore' handler
6085     is used. */
6086     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6087     if (nsize != PyBytes_GET_SIZE(v))
6088       _PyBytes_Resize(&v, nsize);
6089     Py_XDECREF(errorHandler);
6090     Py_XDECREF(exc);
6091   done:
6092     return v;
6093   error:
6094     Py_XDECREF(rep);
6095     Py_XDECREF(errorHandler);
6096     Py_XDECREF(exc);
6097     Py_XDECREF(v);
6098     return NULL;
6099 #undef STORECHAR
6100 }
6101 
6102 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6103 PyUnicode_AsUTF16String(PyObject *unicode)
6104 {
6105     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6106 }
6107 
6108 _PyUnicode_Name_CAPI *
_PyUnicode_GetNameCAPI(void)6109 _PyUnicode_GetNameCAPI(void)
6110 {
6111     PyInterpreterState *interp = _PyInterpreterState_GET();
6112     _PyUnicode_Name_CAPI *ucnhash_capi;
6113 
6114     ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6115     if (ucnhash_capi == NULL) {
6116         ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6117                 PyUnicodeData_CAPSULE_NAME, 1);
6118 
6119         // It's fine if we overwite the value here. It's always the same value.
6120         _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6121     }
6122     return ucnhash_capi;
6123 }
6124 
6125 /* --- Unicode Escape Codec ----------------------------------------------- */
6126 
6127 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6128 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6129                                Py_ssize_t size,
6130                                const char *errors,
6131                                Py_ssize_t *consumed,
6132                                const char **first_invalid_escape)
6133 {
6134     const char *starts = s;
6135     _PyUnicodeWriter writer;
6136     const char *end;
6137     PyObject *errorHandler = NULL;
6138     PyObject *exc = NULL;
6139     _PyUnicode_Name_CAPI *ucnhash_capi;
6140 
6141     // so we can remember if we've seen an invalid escape char or not
6142     *first_invalid_escape = NULL;
6143 
6144     if (size == 0) {
6145         if (consumed) {
6146             *consumed = 0;
6147         }
6148         _Py_RETURN_UNICODE_EMPTY();
6149     }
6150     /* Escaped strings will always be longer than the resulting
6151        Unicode string, so we start with size here and then reduce the
6152        length after conversion to the true value.
6153        (but if the error callback returns a long replacement string
6154        we'll have to allocate more space) */
6155     _PyUnicodeWriter_Init(&writer);
6156     writer.min_length = size;
6157     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6158         goto onError;
6159     }
6160 
6161     end = s + size;
6162     while (s < end) {
6163         unsigned char c = (unsigned char) *s++;
6164         Py_UCS4 ch;
6165         int count;
6166         const char *message;
6167 
6168 #define WRITE_ASCII_CHAR(ch)                                                  \
6169             do {                                                              \
6170                 assert(ch <= 127);                                            \
6171                 assert(writer.pos < writer.size);                             \
6172                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6173             } while(0)
6174 
6175 #define WRITE_CHAR(ch)                                                        \
6176             do {                                                              \
6177                 if (ch <= writer.maxchar) {                                   \
6178                     assert(writer.pos < writer.size);                         \
6179                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6180                 }                                                             \
6181                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6182                     goto onError;                                             \
6183                 }                                                             \
6184             } while(0)
6185 
6186         /* Non-escape characters are interpreted as Unicode ordinals */
6187         if (c != '\\') {
6188             WRITE_CHAR(c);
6189             continue;
6190         }
6191 
6192         Py_ssize_t startinpos = s - starts - 1;
6193         /* \ - Escapes */
6194         if (s >= end) {
6195             message = "\\ at end of string";
6196             goto incomplete;
6197         }
6198         c = (unsigned char) *s++;
6199 
6200         assert(writer.pos < writer.size);
6201         switch (c) {
6202 
6203             /* \x escapes */
6204         case '\n': continue;
6205         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6206         case '\'': WRITE_ASCII_CHAR('\''); continue;
6207         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6208         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6209         /* FF */
6210         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6211         case 't': WRITE_ASCII_CHAR('\t'); continue;
6212         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6213         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6214         /* VT */
6215         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6216         /* BEL, not classic C */
6217         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6218 
6219             /* \OOO (octal) escapes */
6220         case '0': case '1': case '2': case '3':
6221         case '4': case '5': case '6': case '7':
6222             ch = c - '0';
6223             if (s < end && '0' <= *s && *s <= '7') {
6224                 ch = (ch<<3) + *s++ - '0';
6225                 if (s < end && '0' <= *s && *s <= '7') {
6226                     ch = (ch<<3) + *s++ - '0';
6227                 }
6228             }
6229             if (ch > 0377) {
6230                 if (*first_invalid_escape == NULL) {
6231                     *first_invalid_escape = s-3; /* Back up 3 chars, since we've
6232                                                     already incremented s. */
6233                 }
6234             }
6235             WRITE_CHAR(ch);
6236             continue;
6237 
6238             /* hex escapes */
6239             /* \xXX */
6240         case 'x':
6241             count = 2;
6242             message = "truncated \\xXX escape";
6243             goto hexescape;
6244 
6245             /* \uXXXX */
6246         case 'u':
6247             count = 4;
6248             message = "truncated \\uXXXX escape";
6249             goto hexescape;
6250 
6251             /* \UXXXXXXXX */
6252         case 'U':
6253             count = 8;
6254             message = "truncated \\UXXXXXXXX escape";
6255         hexescape:
6256             for (ch = 0; count; ++s, --count) {
6257                 if (s >= end) {
6258                     goto incomplete;
6259                 }
6260                 c = (unsigned char)*s;
6261                 ch <<= 4;
6262                 if (c >= '0' && c <= '9') {
6263                     ch += c - '0';
6264                 }
6265                 else if (c >= 'a' && c <= 'f') {
6266                     ch += c - ('a' - 10);
6267                 }
6268                 else if (c >= 'A' && c <= 'F') {
6269                     ch += c - ('A' - 10);
6270                 }
6271                 else {
6272                     goto error;
6273                 }
6274             }
6275 
6276             /* when we get here, ch is a 32-bit unicode character */
6277             if (ch > MAX_UNICODE) {
6278                 message = "illegal Unicode character";
6279                 goto error;
6280             }
6281 
6282             WRITE_CHAR(ch);
6283             continue;
6284 
6285             /* \N{name} */
6286         case 'N':
6287             ucnhash_capi = _PyUnicode_GetNameCAPI();
6288             if (ucnhash_capi == NULL) {
6289                 PyErr_SetString(
6290                         PyExc_UnicodeError,
6291                         "\\N escapes not supported (can't load unicodedata module)"
6292                 );
6293                 goto onError;
6294             }
6295 
6296             message = "malformed \\N character escape";
6297             if (s >= end) {
6298                 goto incomplete;
6299             }
6300             if (*s == '{') {
6301                 const char *start = ++s;
6302                 size_t namelen;
6303                 /* look for the closing brace */
6304                 while (s < end && *s != '}')
6305                     s++;
6306                 if (s >= end) {
6307                     goto incomplete;
6308                 }
6309                 namelen = s - start;
6310                 if (namelen) {
6311                     /* found a name.  look it up in the unicode database */
6312                     s++;
6313                     ch = 0xffffffff; /* in case 'getcode' messes up */
6314                     if (namelen <= INT_MAX &&
6315                         ucnhash_capi->getcode(start, (int)namelen,
6316                                               &ch, 0)) {
6317                         assert(ch <= MAX_UNICODE);
6318                         WRITE_CHAR(ch);
6319                         continue;
6320                     }
6321                     message = "unknown Unicode character name";
6322                 }
6323             }
6324             goto error;
6325 
6326         default:
6327             if (*first_invalid_escape == NULL) {
6328                 *first_invalid_escape = s-1; /* Back up one char, since we've
6329                                                 already incremented s. */
6330             }
6331             WRITE_ASCII_CHAR('\\');
6332             WRITE_CHAR(c);
6333             continue;
6334         }
6335 
6336       incomplete:
6337         if (consumed) {
6338             *consumed = startinpos;
6339             break;
6340         }
6341       error:;
6342         Py_ssize_t endinpos = s-starts;
6343         writer.min_length = end - s + writer.pos;
6344         if (unicode_decode_call_errorhandler_writer(
6345                 errors, &errorHandler,
6346                 "unicodeescape", message,
6347                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6348                 &writer)) {
6349             goto onError;
6350         }
6351         assert(end - s <= writer.size - writer.pos);
6352 
6353 #undef WRITE_ASCII_CHAR
6354 #undef WRITE_CHAR
6355     }
6356 
6357     Py_XDECREF(errorHandler);
6358     Py_XDECREF(exc);
6359     return _PyUnicodeWriter_Finish(&writer);
6360 
6361   onError:
6362     _PyUnicodeWriter_Dealloc(&writer);
6363     Py_XDECREF(errorHandler);
6364     Py_XDECREF(exc);
6365     return NULL;
6366 }
6367 
6368 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6369 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6370                               Py_ssize_t size,
6371                               const char *errors,
6372                               Py_ssize_t *consumed)
6373 {
6374     const char *first_invalid_escape;
6375     PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6376                                                       consumed,
6377                                                       &first_invalid_escape);
6378     if (result == NULL)
6379         return NULL;
6380     if (first_invalid_escape != NULL) {
6381         unsigned char c = *first_invalid_escape;
6382         if ('4' <= c && c <= '7') {
6383             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6384                                  "invalid octal escape sequence '\\%.3s'",
6385                                  first_invalid_escape) < 0)
6386             {
6387                 Py_DECREF(result);
6388                 return NULL;
6389             }
6390         }
6391         else {
6392             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6393                                  "invalid escape sequence '\\%c'",
6394                                  c) < 0)
6395             {
6396                 Py_DECREF(result);
6397                 return NULL;
6398             }
6399         }
6400     }
6401     return result;
6402 }
6403 
6404 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6405 PyUnicode_DecodeUnicodeEscape(const char *s,
6406                               Py_ssize_t size,
6407                               const char *errors)
6408 {
6409     return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6410 }
6411 
6412 /* Return a Unicode-Escape string version of the Unicode object. */
6413 
6414 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6415 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6416 {
6417     Py_ssize_t i, len;
6418     PyObject *repr;
6419     char *p;
6420     int kind;
6421     const void *data;
6422     Py_ssize_t expandsize;
6423 
6424     /* Initial allocation is based on the longest-possible character
6425        escape.
6426 
6427        For UCS1 strings it's '\xxx', 4 bytes per source character.
6428        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6429        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6430     */
6431 
6432     if (!PyUnicode_Check(unicode)) {
6433         PyErr_BadArgument();
6434         return NULL;
6435     }
6436 
6437     len = PyUnicode_GET_LENGTH(unicode);
6438     if (len == 0) {
6439         return PyBytes_FromStringAndSize(NULL, 0);
6440     }
6441 
6442     kind = PyUnicode_KIND(unicode);
6443     data = PyUnicode_DATA(unicode);
6444     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6445        bytes, and 1 byte characters 4. */
6446     expandsize = kind * 2 + 2;
6447     if (len > PY_SSIZE_T_MAX / expandsize) {
6448         return PyErr_NoMemory();
6449     }
6450     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6451     if (repr == NULL) {
6452         return NULL;
6453     }
6454 
6455     p = PyBytes_AS_STRING(repr);
6456     for (i = 0; i < len; i++) {
6457         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6458 
6459         /* U+0000-U+00ff range */
6460         if (ch < 0x100) {
6461             if (ch >= ' ' && ch < 127) {
6462                 if (ch != '\\') {
6463                     /* Copy printable US ASCII as-is */
6464                     *p++ = (char) ch;
6465                 }
6466                 /* Escape backslashes */
6467                 else {
6468                     *p++ = '\\';
6469                     *p++ = '\\';
6470                 }
6471             }
6472 
6473             /* Map special whitespace to '\t', \n', '\r' */
6474             else if (ch == '\t') {
6475                 *p++ = '\\';
6476                 *p++ = 't';
6477             }
6478             else if (ch == '\n') {
6479                 *p++ = '\\';
6480                 *p++ = 'n';
6481             }
6482             else if (ch == '\r') {
6483                 *p++ = '\\';
6484                 *p++ = 'r';
6485             }
6486 
6487             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6488             else {
6489                 *p++ = '\\';
6490                 *p++ = 'x';
6491                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6492                 *p++ = Py_hexdigits[ch & 0x000F];
6493             }
6494         }
6495         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6496         else if (ch < 0x10000) {
6497             *p++ = '\\';
6498             *p++ = 'u';
6499             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6500             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6501             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6502             *p++ = Py_hexdigits[ch & 0x000F];
6503         }
6504         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6505         else {
6506 
6507             /* Make sure that the first two digits are zero */
6508             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6509             *p++ = '\\';
6510             *p++ = 'U';
6511             *p++ = '0';
6512             *p++ = '0';
6513             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6514             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6515             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6516             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6517             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6518             *p++ = Py_hexdigits[ch & 0x0000000F];
6519         }
6520     }
6521 
6522     assert(p - PyBytes_AS_STRING(repr) > 0);
6523     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6524         return NULL;
6525     }
6526     return repr;
6527 }
6528 
6529 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6530 
/* Decode a "raw-unicode-escape" encoded byte string.

   Only '\uXXXX' (4 hex digits) and '\UXXXXXXXX' (8 hex digits) are treated
   as escapes.  Any other byte, including a backslash followed by something
   other than 'u' or 'U', is copied to the output as the code point with
   the same ordinal.

   s, size:   input bytes and their length.
   errors:    error handler name passed to
              unicode_decode_call_errorhandler_writer().
   consumed:  may be NULL.  If non-NULL and the input ends in the middle of
              an escape (or with a lone backslash), *consumed is set to the
              offset of that backslash and decoding stops there without
              raising.  For empty input *consumed is set to 0.  When the
              whole input decodes, *consumed is NOT written by this
              function (presumably the caller pre-initializes it -- TODO
              confirm against callers).

   Returns a new reference to the decoded string, or NULL on error. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    /* `starts` and `end` are passed by address to the error handler helper
       below, so they are re-read after every call to it. */
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Every input byte yields at most one output character, so the
       decoded string is never longer than the escaped input.  We start
       with size here and then reduce the length after conversion to the
       true value. (But decoding error handler might have to resize the
       string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Append one character to the writer.  The fast path stores directly into
   the preallocated buffer when the character fits the writer's current
   maxchar; otherwise _PyUnicodeWriter_WriteCharInline() is used, and a
   failure there jumps to onError. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also copied literally
           when no `consumed` pointer was supplied. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            /* Trailing backslash with `consumed` supplied: report it as
               an incomplete escape. */
            assert(consumed);
            // Set message to silent compiler warning.
            // Actually it is never used.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape: emit the backslash and the following byte
               unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                /* Input ended before all hex digits were read. */
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* Non-hex byte: `message` still holds the "truncated ..."
                   text chosen above. */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* With `consumed`, stop quietly at the start of the unfinished
           escape; without it, fall through and treat it as an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Tell the writer how much room is still needed for the rest of
           the input before the handler's replacement is written. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6666 
6667 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6668 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6669                                  Py_ssize_t size,
6670                                  const char *errors)
6671 {
6672     return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6673 }
6674 
6675 
6676 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6677 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6678 {
6679     PyObject *repr;
6680     char *p;
6681     Py_ssize_t expandsize, pos;
6682     int kind;
6683     const void *data;
6684     Py_ssize_t len;
6685 
6686     if (!PyUnicode_Check(unicode)) {
6687         PyErr_BadArgument();
6688         return NULL;
6689     }
6690     kind = PyUnicode_KIND(unicode);
6691     data = PyUnicode_DATA(unicode);
6692     len = PyUnicode_GET_LENGTH(unicode);
6693     if (kind == PyUnicode_1BYTE_KIND) {
6694         return PyBytes_FromStringAndSize(data, len);
6695     }
6696 
6697     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6698        bytes, and 1 byte characters 4. */
6699     expandsize = kind * 2 + 2;
6700 
6701     if (len > PY_SSIZE_T_MAX / expandsize) {
6702         return PyErr_NoMemory();
6703     }
6704     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6705     if (repr == NULL) {
6706         return NULL;
6707     }
6708     if (len == 0) {
6709         return repr;
6710     }
6711 
6712     p = PyBytes_AS_STRING(repr);
6713     for (pos = 0; pos < len; pos++) {
6714         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6715 
6716         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6717         if (ch < 0x100) {
6718             *p++ = (char) ch;
6719         }
6720         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6721         else if (ch < 0x10000) {
6722             *p++ = '\\';
6723             *p++ = 'u';
6724             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6725             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6726             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6727             *p++ = Py_hexdigits[ch & 15];
6728         }
6729         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6730         else {
6731             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6732             *p++ = '\\';
6733             *p++ = 'U';
6734             *p++ = '0';
6735             *p++ = '0';
6736             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6737             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6738             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6739             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6740             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6741             *p++ = Py_hexdigits[ch & 15];
6742         }
6743     }
6744 
6745     assert(p > PyBytes_AS_STRING(repr));
6746     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6747         return NULL;
6748     }
6749     return repr;
6750 }
6751 
6752 /* --- Latin-1 Codec ------------------------------------------------------ */
6753 
6754 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6755 PyUnicode_DecodeLatin1(const char *s,
6756                        Py_ssize_t size,
6757                        const char *errors)
6758 {
6759     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6760     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6761 }
6762 
6763 /* create or adjust a UnicodeEncodeError */
6764 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6765 make_encode_exception(PyObject **exceptionObject,
6766                       const char *encoding,
6767                       PyObject *unicode,
6768                       Py_ssize_t startpos, Py_ssize_t endpos,
6769                       const char *reason)
6770 {
6771     if (*exceptionObject == NULL) {
6772         *exceptionObject = PyObject_CallFunction(
6773             PyExc_UnicodeEncodeError, "sOnns",
6774             encoding, unicode, startpos, endpos, reason);
6775     }
6776     else {
6777         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6778             goto onError;
6779         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6780             goto onError;
6781         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6782             goto onError;
6783         return;
6784       onError:
6785         Py_CLEAR(*exceptionObject);
6786     }
6787 }
6788 
6789 /* raises a UnicodeEncodeError */
6790 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6791 raise_encode_exception(PyObject **exceptionObject,
6792                        const char *encoding,
6793                        PyObject *unicode,
6794                        Py_ssize_t startpos, Py_ssize_t endpos,
6795                        const char *reason)
6796 {
6797     make_encode_exception(exceptionObject,
6798                           encoding, unicode, startpos, endpos, reason);
6799     if (*exceptionObject != NULL)
6800         PyCodec_StrictErrors(*exceptionObject);
6801 }
6802 
6803 /* error handling callback helper:
6804    build arguments, call the callback and check the arguments,
6805    put the result into newpos and return the replacement string, which
6806    has to be freed by the caller */
6807 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6808 unicode_encode_call_errorhandler(const char *errors,
6809                                  PyObject **errorHandler,
6810                                  const char *encoding, const char *reason,
6811                                  PyObject *unicode, PyObject **exceptionObject,
6812                                  Py_ssize_t startpos, Py_ssize_t endpos,
6813                                  Py_ssize_t *newpos)
6814 {
6815     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6816     Py_ssize_t len;
6817     PyObject *restuple;
6818     PyObject *resunicode;
6819 
6820     if (*errorHandler == NULL) {
6821         *errorHandler = PyCodec_LookupError(errors);
6822         if (*errorHandler == NULL)
6823             return NULL;
6824     }
6825 
6826     len = PyUnicode_GET_LENGTH(unicode);
6827 
6828     make_encode_exception(exceptionObject,
6829                           encoding, unicode, startpos, endpos, reason);
6830     if (*exceptionObject == NULL)
6831         return NULL;
6832 
6833     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6834     if (restuple == NULL)
6835         return NULL;
6836     if (!PyTuple_Check(restuple)) {
6837         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6838         Py_DECREF(restuple);
6839         return NULL;
6840     }
6841     if (!PyArg_ParseTuple(restuple, argparse,
6842                           &resunicode, newpos)) {
6843         Py_DECREF(restuple);
6844         return NULL;
6845     }
6846     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6847         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6848         Py_DECREF(restuple);
6849         return NULL;
6850     }
6851     if (*newpos<0)
6852         *newpos = len + *newpos;
6853     if (*newpos<0 || *newpos>len) {
6854         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6855         Py_DECREF(restuple);
6856         return NULL;
6857     }
6858     Py_INCREF(resunicode);
6859     Py_DECREF(restuple);
6860     return resunicode;
6861 }
6862 
6863 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6864 unicode_encode_ucs1(PyObject *unicode,
6865                     const char *errors,
6866                     const Py_UCS4 limit)
6867 {
6868     /* input state */
6869     Py_ssize_t pos=0, size;
6870     int kind;
6871     const void *data;
6872     /* pointer into the output */
6873     char *str;
6874     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6875     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6876     PyObject *error_handler_obj = NULL;
6877     PyObject *exc = NULL;
6878     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6879     PyObject *rep = NULL;
6880     /* output object */
6881     _PyBytesWriter writer;
6882 
6883     size = PyUnicode_GET_LENGTH(unicode);
6884     kind = PyUnicode_KIND(unicode);
6885     data = PyUnicode_DATA(unicode);
6886     /* allocate enough for a simple encoding without
6887        replacements, if we need more, we'll resize */
6888     if (size == 0)
6889         return PyBytes_FromStringAndSize(NULL, 0);
6890 
6891     _PyBytesWriter_Init(&writer);
6892     str = _PyBytesWriter_Alloc(&writer, size);
6893     if (str == NULL)
6894         return NULL;
6895 
6896     while (pos < size) {
6897         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6898 
6899         /* can we encode this? */
6900         if (ch < limit) {
6901             /* no overflow check, because we know that the space is enough */
6902             *str++ = (char)ch;
6903             ++pos;
6904         }
6905         else {
6906             Py_ssize_t newpos, i;
6907             /* startpos for collecting unencodable chars */
6908             Py_ssize_t collstart = pos;
6909             Py_ssize_t collend = collstart + 1;
6910             /* find all unecodable characters */
6911 
6912             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6913                 ++collend;
6914 
6915             /* Only overallocate the buffer if it's not the last write */
6916             writer.overallocate = (collend < size);
6917 
6918             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6919             if (error_handler == _Py_ERROR_UNKNOWN)
6920                 error_handler = _Py_GetErrorHandler(errors);
6921 
6922             switch (error_handler) {
6923             case _Py_ERROR_STRICT:
6924                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6925                 goto onError;
6926 
6927             case _Py_ERROR_REPLACE:
6928                 memset(str, '?', collend - collstart);
6929                 str += (collend - collstart);
6930                 /* fall through */
6931             case _Py_ERROR_IGNORE:
6932                 pos = collend;
6933                 break;
6934 
6935             case _Py_ERROR_BACKSLASHREPLACE:
6936                 /* subtract preallocated bytes */
6937                 writer.min_size -= (collend - collstart);
6938                 str = backslashreplace(&writer, str,
6939                                        unicode, collstart, collend);
6940                 if (str == NULL)
6941                     goto onError;
6942                 pos = collend;
6943                 break;
6944 
6945             case _Py_ERROR_XMLCHARREFREPLACE:
6946                 /* subtract preallocated bytes */
6947                 writer.min_size -= (collend - collstart);
6948                 str = xmlcharrefreplace(&writer, str,
6949                                         unicode, collstart, collend);
6950                 if (str == NULL)
6951                     goto onError;
6952                 pos = collend;
6953                 break;
6954 
6955             case _Py_ERROR_SURROGATEESCAPE:
6956                 for (i = collstart; i < collend; ++i) {
6957                     ch = PyUnicode_READ(kind, data, i);
6958                     if (ch < 0xdc80 || 0xdcff < ch) {
6959                         /* Not a UTF-8b surrogate */
6960                         break;
6961                     }
6962                     *str++ = (char)(ch - 0xdc00);
6963                     ++pos;
6964                 }
6965                 if (i >= collend)
6966                     break;
6967                 collstart = pos;
6968                 assert(collstart != collend);
6969                 /* fall through */
6970 
6971             default:
6972                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6973                                                        encoding, reason, unicode, &exc,
6974                                                        collstart, collend, &newpos);
6975                 if (rep == NULL)
6976                     goto onError;
6977 
6978                 if (newpos < collstart) {
6979                     writer.overallocate = 1;
6980                     str = _PyBytesWriter_Prepare(&writer, str,
6981                                                  collstart - newpos);
6982                     if (str == NULL)
6983                         goto onError;
6984                 }
6985                 else {
6986                     /* subtract preallocated bytes */
6987                     writer.min_size -= newpos - collstart;
6988                     /* Only overallocate the buffer if it's not the last write */
6989                     writer.overallocate = (newpos < size);
6990                 }
6991 
6992                 if (PyBytes_Check(rep)) {
6993                     /* Directly copy bytes result to output. */
6994                     str = _PyBytesWriter_WriteBytes(&writer, str,
6995                                                     PyBytes_AS_STRING(rep),
6996                                                     PyBytes_GET_SIZE(rep));
6997                 }
6998                 else {
6999                     assert(PyUnicode_Check(rep));
7000 
7001                     if (limit == 256 ?
7002                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7003                         !PyUnicode_IS_ASCII(rep))
7004                     {
7005                         /* Not all characters are smaller than limit */
7006                         raise_encode_exception(&exc, encoding, unicode,
7007                                                collstart, collend, reason);
7008                         goto onError;
7009                     }
7010                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7011                     str = _PyBytesWriter_WriteBytes(&writer, str,
7012                                                     PyUnicode_DATA(rep),
7013                                                     PyUnicode_GET_LENGTH(rep));
7014                 }
7015                 if (str == NULL)
7016                     goto onError;
7017 
7018                 pos = newpos;
7019                 Py_CLEAR(rep);
7020             }
7021 
7022             /* If overallocation was disabled, ensure that it was the last
7023                write. Otherwise, we missed an optimization */
7024             assert(writer.overallocate || pos == size);
7025         }
7026     }
7027 
7028     Py_XDECREF(error_handler_obj);
7029     Py_XDECREF(exc);
7030     return _PyBytesWriter_Finish(&writer, str);
7031 
7032   onError:
7033     Py_XDECREF(rep);
7034     _PyBytesWriter_Dealloc(&writer);
7035     Py_XDECREF(error_handler_obj);
7036     Py_XDECREF(exc);
7037     return NULL;
7038 }
7039 
7040 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7041 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7042 {
7043     if (!PyUnicode_Check(unicode)) {
7044         PyErr_BadArgument();
7045         return NULL;
7046     }
7047     /* Fast path: if it is a one-byte string, construct
7048        bytes object directly. */
7049     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7050         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7051                                          PyUnicode_GET_LENGTH(unicode));
7052     /* Non-Latin-1 characters present. Defer to above function to
7053        raise the exception. */
7054     return unicode_encode_ucs1(unicode, errors, 256);
7055 }
7056 
7057 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7058 PyUnicode_AsLatin1String(PyObject *unicode)
7059 {
7060     return _PyUnicode_AsLatin1String(unicode, NULL);
7061 }
7062 
7063 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7064 
7065 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7066 PyUnicode_DecodeASCII(const char *s,
7067                       Py_ssize_t size,
7068                       const char *errors)
7069 {
7070     const char *starts = s;
7071     const char *e = s + size;
7072     PyObject *error_handler_obj = NULL;
7073     PyObject *exc = NULL;
7074     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7075 
7076     if (size == 0)
7077         _Py_RETURN_UNICODE_EMPTY();
7078 
7079     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7080     if (size == 1 && (unsigned char)s[0] < 128) {
7081         return get_latin1_char((unsigned char)s[0]);
7082     }
7083 
7084     // Shortcut for simple case
7085     PyObject *u = PyUnicode_New(size, 127);
7086     if (u == NULL) {
7087         return NULL;
7088     }
7089     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7090     if (outpos == size) {
7091         return u;
7092     }
7093 
7094     _PyUnicodeWriter writer;
7095     _PyUnicodeWriter_InitWithBuffer(&writer, u);
7096     writer.pos = outpos;
7097 
7098     s += outpos;
7099     int kind = writer.kind;
7100     void *data = writer.data;
7101     Py_ssize_t startinpos, endinpos;
7102 
7103     while (s < e) {
7104         unsigned char c = (unsigned char)*s;
7105         if (c < 128) {
7106             PyUnicode_WRITE(kind, data, writer.pos, c);
7107             writer.pos++;
7108             ++s;
7109             continue;
7110         }
7111 
7112         /* byte outsize range 0x00..0x7f: call the error handler */
7113 
7114         if (error_handler == _Py_ERROR_UNKNOWN)
7115             error_handler = _Py_GetErrorHandler(errors);
7116 
7117         switch (error_handler)
7118         {
7119         case _Py_ERROR_REPLACE:
7120         case _Py_ERROR_SURROGATEESCAPE:
7121             /* Fast-path: the error handler only writes one character,
7122                but we may switch to UCS2 at the first write */
7123             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7124                 goto onError;
7125             kind = writer.kind;
7126             data = writer.data;
7127 
7128             if (error_handler == _Py_ERROR_REPLACE)
7129                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7130             else
7131                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7132             writer.pos++;
7133             ++s;
7134             break;
7135 
7136         case _Py_ERROR_IGNORE:
7137             ++s;
7138             break;
7139 
7140         default:
7141             startinpos = s-starts;
7142             endinpos = startinpos + 1;
7143             if (unicode_decode_call_errorhandler_writer(
7144                     errors, &error_handler_obj,
7145                     "ascii", "ordinal not in range(128)",
7146                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7147                     &writer))
7148                 goto onError;
7149             kind = writer.kind;
7150             data = writer.data;
7151         }
7152     }
7153     Py_XDECREF(error_handler_obj);
7154     Py_XDECREF(exc);
7155     return _PyUnicodeWriter_Finish(&writer);
7156 
7157   onError:
7158     _PyUnicodeWriter_Dealloc(&writer);
7159     Py_XDECREF(error_handler_obj);
7160     Py_XDECREF(exc);
7161     return NULL;
7162 }
7163 
7164 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7165 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7166 {
7167     if (!PyUnicode_Check(unicode)) {
7168         PyErr_BadArgument();
7169         return NULL;
7170     }
7171     /* Fast path: if it is an ASCII-only string, construct bytes object
7172        directly. Else defer to above function to raise the exception. */
7173     if (PyUnicode_IS_ASCII(unicode))
7174         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7175                                          PyUnicode_GET_LENGTH(unicode));
7176     return unicode_encode_ucs1(unicode, errors, 128);
7177 }
7178 
7179 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7180 PyUnicode_AsASCIIString(PyObject *unicode)
7181 {
7182     return _PyUnicode_AsASCIIString(unicode, NULL);
7183 }
7184 
7185 #ifdef MS_WINDOWS
7186 
7187 /* --- MBCS codecs for Windows -------------------------------------------- */
7188 
/* NEED_RETRY is defined when an int is narrower than size_t.  The code page
   codecs below pass lengths to the conversion routines as int, so with
   NEED_RETRY they split the input into chunks of at most
   DECODING_CHUNK_SIZE items and loop. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition, used only when the included headers do not already
   provide WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7202 
7203 static const char*
code_page_name(UINT code_page,PyObject ** obj)7204 code_page_name(UINT code_page, PyObject **obj)
7205 {
7206     *obj = NULL;
7207     if (code_page == CP_ACP)
7208         return "mbcs";
7209     if (code_page == CP_UTF7)
7210         return "CP_UTF7";
7211     if (code_page == CP_UTF8)
7212         return "CP_UTF8";
7213 
7214     *obj = PyBytes_FromFormat("cp%u", code_page);
7215     if (*obj == NULL)
7216         return NULL;
7217     return PyBytes_AS_STRING(*obj);
7218 }
7219 
7220 static DWORD
decode_code_page_flags(UINT code_page)7221 decode_code_page_flags(UINT code_page)
7222 {
7223     if (code_page == CP_UTF7) {
7224         /* The CP_UTF7 decoder only supports flags=0 */
7225         return 0;
7226     }
7227     else
7228         return MB_ERR_INVALID_CHARS;
7229 }
7230 
7231 /*
7232  * Decode a byte string from a Windows code page into unicode object in strict
7233  * mode.
7234  *
7235  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7236  * OSError and returns -1 on other error.
7237  */
7238 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7239 decode_code_page_strict(UINT code_page,
7240                         wchar_t **buf,
7241                         Py_ssize_t *bufsize,
7242                         const char *in,
7243                         int insize)
7244 {
7245     DWORD flags = MB_ERR_INVALID_CHARS;
7246     wchar_t *out;
7247     DWORD outsize;
7248 
7249     /* First get the size of the result */
7250     assert(insize > 0);
7251     while ((outsize = MultiByteToWideChar(code_page, flags,
7252                                           in, insize, NULL, 0)) <= 0)
7253     {
7254         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7255             goto error;
7256         }
7257         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7258         flags = 0;
7259     }
7260 
7261     /* Extend a wchar_t* buffer */
7262     Py_ssize_t n = *bufsize;   /* Get the current length */
7263     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7264         return -1;
7265     }
7266     out = *buf + n;
7267 
7268     /* Do the conversion */
7269     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7270     if (outsize <= 0)
7271         goto error;
7272     return insize;
7273 
7274 error:
7275     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7276         return -2;
7277     PyErr_SetFromWindowsErr(0);
7278     return -1;
7279 }
7280 
/*
 * Decode a byte string from a code page into a wchar_t buffer with an error
 * handler, one character at a time.
 *
 * code_page: Windows code page identifier.
 * buf, bufsize: growable wchar_t buffer and its current length; output is
 *     appended after the first *bufsize elements and *bufsize is updated to
 *     the new length on success.
 * in, size: bytes to decode; size must be > 0.
 * errors: error handler name (NULL is treated like "strict" when final).
 * final: if false, an undecodable sequence at the very end of the input is
 *     left unconsumed instead of being reported as an error.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* Codec name for exception messages; encoding_obj (if any) owns it. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Reserve room for the worst case of one surrogate pair per input byte,
       checking first that n + size * 2 cannot overflow. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* Try 1 byte, then 2, ... until the conversion succeeds or the
           sequence is too long to be a single character. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* `continue` jumps to the loop condition without changing
                   insize, so the same bytes are retried with flags == 0. */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(err);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a one-byte error span; the handler helper may move
               `in` and reallocate *buf, so `out` is recomputed afterwards. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* Successful decode: consume the bytes and append the one or
               two wchar_t units produced. */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared exit: reached on success (ret >= 0) and on failure (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7410 
7411 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7412 decode_code_page_stateful(int code_page,
7413                           const char *s, Py_ssize_t size,
7414                           const char *errors, Py_ssize_t *consumed)
7415 {
7416     wchar_t *buf = NULL;
7417     Py_ssize_t bufsize = 0;
7418     int chunk_size, final, converted, done;
7419 
7420     if (code_page < 0) {
7421         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7422         return NULL;
7423     }
7424     if (size < 0) {
7425         PyErr_BadInternalCall();
7426         return NULL;
7427     }
7428 
7429     if (consumed)
7430         *consumed = 0;
7431 
7432     do
7433     {
7434 #ifdef NEED_RETRY
7435         if (size > DECODING_CHUNK_SIZE) {
7436             chunk_size = DECODING_CHUNK_SIZE;
7437             final = 0;
7438             done = 0;
7439         }
7440         else
7441 #endif
7442         {
7443             chunk_size = (int)size;
7444             final = (consumed == NULL);
7445             done = 1;
7446         }
7447 
7448         if (chunk_size == 0 && done) {
7449             if (buf != NULL)
7450                 break;
7451             _Py_RETURN_UNICODE_EMPTY();
7452         }
7453 
7454         converted = decode_code_page_strict(code_page, &buf, &bufsize,
7455                                             s, chunk_size);
7456         if (converted == -2)
7457             converted = decode_code_page_errors(code_page, &buf, &bufsize,
7458                                                 s, chunk_size,
7459                                                 errors, final);
7460         assert(converted != 0 || done);
7461 
7462         if (converted < 0) {
7463             PyMem_Free(buf);
7464             return NULL;
7465         }
7466 
7467         if (consumed)
7468             *consumed += converted;
7469 
7470         s += converted;
7471         size -= converted;
7472     } while (!done);
7473 
7474     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7475     PyMem_Free(buf);
7476     return v;
7477 }
7478 
7479 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7480 PyUnicode_DecodeCodePageStateful(int code_page,
7481                                  const char *s,
7482                                  Py_ssize_t size,
7483                                  const char *errors,
7484                                  Py_ssize_t *consumed)
7485 {
7486     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7487 }
7488 
7489 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7490 PyUnicode_DecodeMBCSStateful(const char *s,
7491                              Py_ssize_t size,
7492                              const char *errors,
7493                              Py_ssize_t *consumed)
7494 {
7495     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7496 }
7497 
7498 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7499 PyUnicode_DecodeMBCS(const char *s,
7500                      Py_ssize_t size,
7501                      const char *errors)
7502 {
7503     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7504 }
7505 
7506 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7507 encode_code_page_flags(UINT code_page, const char *errors)
7508 {
7509     if (code_page == CP_UTF8) {
7510         return WC_ERR_INVALID_CHARS;
7511     }
7512     else if (code_page == CP_UTF7) {
7513         /* CP_UTF7 only supports flags=0 */
7514         return 0;
7515     }
7516     else {
7517         if (errors != NULL && strcmp(errors, "replace") == 0)
7518             return 0;
7519         else
7520             return WC_NO_BEST_FIT_CHARS;
7521     }
7522 }
7523 
7524 /*
7525  * Encode a Unicode string to a Windows code page into a byte string in strict
7526  * mode.
7527  *
7528  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7529  * an OSError and returns -1 on other error.
7530  */
7531 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7532 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7533                         PyObject *unicode, Py_ssize_t offset, int len,
7534                         const char* errors)
7535 {
7536     BOOL usedDefaultChar = FALSE;
7537     BOOL *pusedDefaultChar = &usedDefaultChar;
7538     int outsize;
7539     wchar_t *p;
7540     Py_ssize_t size;
7541     const DWORD flags = encode_code_page_flags(code_page, NULL);
7542     char *out;
7543     /* Create a substring so that we can get the UTF-16 representation
7544        of just the slice under consideration. */
7545     PyObject *substring;
7546     int ret = -1;
7547 
7548     assert(len > 0);
7549 
7550     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7551         pusedDefaultChar = &usedDefaultChar;
7552     else
7553         pusedDefaultChar = NULL;
7554 
7555     substring = PyUnicode_Substring(unicode, offset, offset+len);
7556     if (substring == NULL)
7557         return -1;
7558     p = PyUnicode_AsWideCharString(substring, &size);
7559     Py_CLEAR(substring);
7560     if (p == NULL) {
7561         return -1;
7562     }
7563     assert(size <= INT_MAX);
7564 
7565     /* First get the size of the result */
7566     outsize = WideCharToMultiByte(code_page, flags,
7567                                   p, (int)size,
7568                                   NULL, 0,
7569                                   NULL, pusedDefaultChar);
7570     if (outsize <= 0)
7571         goto error;
7572     /* If we used a default char, then we failed! */
7573     if (pusedDefaultChar && *pusedDefaultChar) {
7574         ret = -2;
7575         goto done;
7576     }
7577 
7578     if (*outbytes == NULL) {
7579         /* Create string object */
7580         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7581         if (*outbytes == NULL) {
7582             goto done;
7583         }
7584         out = PyBytes_AS_STRING(*outbytes);
7585     }
7586     else {
7587         /* Extend string object */
7588         const Py_ssize_t n = PyBytes_Size(*outbytes);
7589         if (outsize > PY_SSIZE_T_MAX - n) {
7590             PyErr_NoMemory();
7591             goto done;
7592         }
7593         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7594             goto done;
7595         }
7596         out = PyBytes_AS_STRING(*outbytes) + n;
7597     }
7598 
7599     /* Do the conversion */
7600     outsize = WideCharToMultiByte(code_page, flags,
7601                                   p, (int)size,
7602                                   out, outsize,
7603                                   NULL, pusedDefaultChar);
7604     if (outsize <= 0)
7605         goto error;
7606     if (pusedDefaultChar && *pusedDefaultChar) {
7607         ret = -2;
7608         goto done;
7609     }
7610     ret = 0;
7611 
7612 done:
7613     PyMem_Free(p);
7614     return ret;
7615 
7616 error:
7617     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7618         ret = -2;
7619         goto done;
7620     }
7621     PyErr_SetFromWindowsErr(0);
7622     goto done;
7623 }
7624 
7625 /*
7626  * Encode a Unicode string to a Windows code page into a byte string using an
7627  * error handler.
7628  *
7629  * Returns consumed characters if succeed, or raise an OSError and returns
7630  * -1 on other error.
7631  */
7632 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7633 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7634                         PyObject *unicode, Py_ssize_t unicode_offset,
7635                         Py_ssize_t insize, const char* errors)
7636 {
7637     const DWORD flags = encode_code_page_flags(code_page, errors);
7638     Py_ssize_t pos = unicode_offset;
7639     Py_ssize_t endin = unicode_offset + insize;
7640     /* Ideally, we should get reason from FormatMessage. This is the Windows
7641        2000 English version of the message. */
7642     const char *reason = "invalid character";
7643     /* 4=maximum length of a UTF-8 sequence */
7644     char buffer[4];
7645     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7646     Py_ssize_t outsize;
7647     char *out;
7648     PyObject *errorHandler = NULL;
7649     PyObject *exc = NULL;
7650     PyObject *encoding_obj = NULL;
7651     const char *encoding;
7652     Py_ssize_t newpos, newoutsize;
7653     PyObject *rep;
7654     int ret = -1;
7655 
7656     assert(insize > 0);
7657 
7658     encoding = code_page_name(code_page, &encoding_obj);
7659     if (encoding == NULL)
7660         return -1;
7661 
7662     if (errors == NULL || strcmp(errors, "strict") == 0) {
7663         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7664            then we raise a UnicodeEncodeError. */
7665         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7666         if (exc != NULL) {
7667             PyCodec_StrictErrors(exc);
7668             Py_DECREF(exc);
7669         }
7670         Py_XDECREF(encoding_obj);
7671         return -1;
7672     }
7673 
7674     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7675         pusedDefaultChar = &usedDefaultChar;
7676     else
7677         pusedDefaultChar = NULL;
7678 
7679     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7680         PyErr_NoMemory();
7681         goto error;
7682     }
7683     outsize = insize * Py_ARRAY_LENGTH(buffer);
7684 
7685     if (*outbytes == NULL) {
7686         /* Create string object */
7687         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7688         if (*outbytes == NULL)
7689             goto error;
7690         out = PyBytes_AS_STRING(*outbytes);
7691     }
7692     else {
7693         /* Extend string object */
7694         Py_ssize_t n = PyBytes_Size(*outbytes);
7695         if (n > PY_SSIZE_T_MAX - outsize) {
7696             PyErr_NoMemory();
7697             goto error;
7698         }
7699         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7700             goto error;
7701         out = PyBytes_AS_STRING(*outbytes) + n;
7702     }
7703 
7704     /* Encode the string character per character */
7705     while (pos < endin)
7706     {
7707         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7708         wchar_t chars[2];
7709         int charsize;
7710         if (ch < 0x10000) {
7711             chars[0] = (wchar_t)ch;
7712             charsize = 1;
7713         }
7714         else {
7715             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7716             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7717             charsize = 2;
7718         }
7719 
7720         outsize = WideCharToMultiByte(code_page, flags,
7721                                       chars, charsize,
7722                                       buffer, Py_ARRAY_LENGTH(buffer),
7723                                       NULL, pusedDefaultChar);
7724         if (outsize > 0) {
7725             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7726             {
7727                 pos++;
7728                 memcpy(out, buffer, outsize);
7729                 out += outsize;
7730                 continue;
7731             }
7732         }
7733         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7734             PyErr_SetFromWindowsErr(0);
7735             goto error;
7736         }
7737 
7738         rep = unicode_encode_call_errorhandler(
7739                   errors, &errorHandler, encoding, reason,
7740                   unicode, &exc,
7741                   pos, pos + 1, &newpos);
7742         if (rep == NULL)
7743             goto error;
7744 
7745         Py_ssize_t morebytes = pos - newpos;
7746         if (PyBytes_Check(rep)) {
7747             outsize = PyBytes_GET_SIZE(rep);
7748             morebytes += outsize;
7749             if (morebytes > 0) {
7750                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7751                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7752                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7753                     Py_DECREF(rep);
7754                     goto error;
7755                 }
7756                 out = PyBytes_AS_STRING(*outbytes) + offset;
7757             }
7758             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7759             out += outsize;
7760         }
7761         else {
7762             Py_ssize_t i;
7763             int kind;
7764             const void *data;
7765 
7766             outsize = PyUnicode_GET_LENGTH(rep);
7767             morebytes += outsize;
7768             if (morebytes > 0) {
7769                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7770                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7771                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7772                     Py_DECREF(rep);
7773                     goto error;
7774                 }
7775                 out = PyBytes_AS_STRING(*outbytes) + offset;
7776             }
7777             kind = PyUnicode_KIND(rep);
7778             data = PyUnicode_DATA(rep);
7779             for (i=0; i < outsize; i++) {
7780                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7781                 if (ch > 127) {
7782                     raise_encode_exception(&exc,
7783                         encoding, unicode,
7784                         pos, pos + 1,
7785                         "unable to encode error handler result to ASCII");
7786                     Py_DECREF(rep);
7787                     goto error;
7788                 }
7789                 *out = (unsigned char)ch;
7790                 out++;
7791             }
7792         }
7793         pos = newpos;
7794         Py_DECREF(rep);
7795     }
7796     /* write a NUL byte */
7797     *out = 0;
7798     outsize = out - PyBytes_AS_STRING(*outbytes);
7799     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7800     if (_PyBytes_Resize(outbytes, outsize) < 0)
7801         goto error;
7802     ret = 0;
7803 
7804 error:
7805     Py_XDECREF(encoding_obj);
7806     Py_XDECREF(errorHandler);
7807     Py_XDECREF(exc);
7808     return ret;
7809 }
7810 
7811 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7812 encode_code_page(int code_page,
7813                  PyObject *unicode,
7814                  const char *errors)
7815 {
7816     Py_ssize_t len;
7817     PyObject *outbytes = NULL;
7818     Py_ssize_t offset;
7819     int chunk_len, ret, done;
7820 
7821     if (!PyUnicode_Check(unicode)) {
7822         PyErr_BadArgument();
7823         return NULL;
7824     }
7825 
7826     len = PyUnicode_GET_LENGTH(unicode);
7827 
7828     if (code_page < 0) {
7829         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7830         return NULL;
7831     }
7832 
7833     if (len == 0)
7834         return PyBytes_FromStringAndSize(NULL, 0);
7835 
7836     offset = 0;
7837     do
7838     {
7839 #ifdef NEED_RETRY
7840         if (len > DECODING_CHUNK_SIZE) {
7841             chunk_len = DECODING_CHUNK_SIZE;
7842             done = 0;
7843         }
7844         else
7845 #endif
7846         {
7847             chunk_len = (int)len;
7848             done = 1;
7849         }
7850 
7851         ret = encode_code_page_strict(code_page, &outbytes,
7852                                       unicode, offset, chunk_len,
7853                                       errors);
7854         if (ret == -2)
7855             ret = encode_code_page_errors(code_page, &outbytes,
7856                                           unicode, offset,
7857                                           chunk_len, errors);
7858         if (ret < 0) {
7859             Py_XDECREF(outbytes);
7860             return NULL;
7861         }
7862 
7863         offset += chunk_len;
7864         len -= chunk_len;
7865     } while (!done);
7866 
7867     return outbytes;
7868 }
7869 
7870 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7871 PyUnicode_EncodeCodePage(int code_page,
7872                          PyObject *unicode,
7873                          const char *errors)
7874 {
7875     return encode_code_page(code_page, unicode, errors);
7876 }
7877 
7878 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7879 PyUnicode_AsMBCSString(PyObject *unicode)
7880 {
7881     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7882 }
7883 
7884 #undef NEED_RETRY
7885 
7886 #endif /* MS_WINDOWS */
7887 
7888 /* --- Character Mapping Codec -------------------------------------------- */
7889 
7890 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7891 charmap_decode_string(const char *s,
7892                       Py_ssize_t size,
7893                       PyObject *mapping,
7894                       const char *errors,
7895                       _PyUnicodeWriter *writer)
7896 {
7897     const char *starts = s;
7898     const char *e;
7899     Py_ssize_t startinpos, endinpos;
7900     PyObject *errorHandler = NULL, *exc = NULL;
7901     Py_ssize_t maplen;
7902     int mapkind;
7903     const void *mapdata;
7904     Py_UCS4 x;
7905     unsigned char ch;
7906 
7907     maplen = PyUnicode_GET_LENGTH(mapping);
7908     mapdata = PyUnicode_DATA(mapping);
7909     mapkind = PyUnicode_KIND(mapping);
7910 
7911     e = s + size;
7912 
7913     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7914         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7915          * is disabled in encoding aliases, latin1 is preferred because
7916          * its implementation is faster. */
7917         const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
7918         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7919         Py_UCS4 maxchar = writer->maxchar;
7920 
7921         assert (writer->kind == PyUnicode_1BYTE_KIND);
7922         while (s < e) {
7923             ch = *s;
7924             x = mapdata_ucs1[ch];
7925             if (x > maxchar) {
7926                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7927                     goto onError;
7928                 maxchar = writer->maxchar;
7929                 outdata = (Py_UCS1 *)writer->data;
7930             }
7931             outdata[writer->pos] = x;
7932             writer->pos++;
7933             ++s;
7934         }
7935         return 0;
7936     }
7937 
7938     while (s < e) {
7939         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7940             int outkind = writer->kind;
7941             const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
7942             if (outkind == PyUnicode_1BYTE_KIND) {
7943                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7944                 Py_UCS4 maxchar = writer->maxchar;
7945                 while (s < e) {
7946                     ch = *s;
7947                     x = mapdata_ucs2[ch];
7948                     if (x > maxchar)
7949                         goto Error;
7950                     outdata[writer->pos] = x;
7951                     writer->pos++;
7952                     ++s;
7953                 }
7954                 break;
7955             }
7956             else if (outkind == PyUnicode_2BYTE_KIND) {
7957                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7958                 while (s < e) {
7959                     ch = *s;
7960                     x = mapdata_ucs2[ch];
7961                     if (x == 0xFFFE)
7962                         goto Error;
7963                     outdata[writer->pos] = x;
7964                     writer->pos++;
7965                     ++s;
7966                 }
7967                 break;
7968             }
7969         }
7970         ch = *s;
7971 
7972         if (ch < maplen)
7973             x = PyUnicode_READ(mapkind, mapdata, ch);
7974         else
7975             x = 0xfffe; /* invalid value */
7976 Error:
7977         if (x == 0xfffe)
7978         {
7979             /* undefined mapping */
7980             startinpos = s-starts;
7981             endinpos = startinpos+1;
7982             if (unicode_decode_call_errorhandler_writer(
7983                     errors, &errorHandler,
7984                     "charmap", "character maps to <undefined>",
7985                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7986                     writer)) {
7987                 goto onError;
7988             }
7989             continue;
7990         }
7991 
7992         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7993             goto onError;
7994         ++s;
7995     }
7996     Py_XDECREF(errorHandler);
7997     Py_XDECREF(exc);
7998     return 0;
7999 
8000 onError:
8001     Py_XDECREF(errorHandler);
8002     Py_XDECREF(exc);
8003     return -1;
8004 }
8005 
8006 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8007 charmap_decode_mapping(const char *s,
8008                        Py_ssize_t size,
8009                        PyObject *mapping,
8010                        const char *errors,
8011                        _PyUnicodeWriter *writer)
8012 {
8013     const char *starts = s;
8014     const char *e;
8015     Py_ssize_t startinpos, endinpos;
8016     PyObject *errorHandler = NULL, *exc = NULL;
8017     unsigned char ch;
8018     PyObject *key, *item = NULL;
8019 
8020     e = s + size;
8021 
8022     while (s < e) {
8023         ch = *s;
8024 
8025         /* Get mapping (char ordinal -> integer, Unicode char or None) */
8026         key = PyLong_FromLong((long)ch);
8027         if (key == NULL)
8028             goto onError;
8029 
8030         item = PyObject_GetItem(mapping, key);
8031         Py_DECREF(key);
8032         if (item == NULL) {
8033             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8034                 /* No mapping found means: mapping is undefined. */
8035                 PyErr_Clear();
8036                 goto Undefined;
8037             } else
8038                 goto onError;
8039         }
8040 
8041         /* Apply mapping */
8042         if (item == Py_None)
8043             goto Undefined;
8044         if (PyLong_Check(item)) {
8045             long value = PyLong_AS_LONG(item);
8046             if (value == 0xFFFE)
8047                 goto Undefined;
8048             if (value < 0 || value > MAX_UNICODE) {
8049                 PyErr_Format(PyExc_TypeError,
8050                              "character mapping must be in range(0x%x)",
8051                              (unsigned long)MAX_UNICODE + 1);
8052                 goto onError;
8053             }
8054 
8055             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8056                 goto onError;
8057         }
8058         else if (PyUnicode_Check(item)) {
8059             if (PyUnicode_GET_LENGTH(item) == 1) {
8060                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8061                 if (value == 0xFFFE)
8062                     goto Undefined;
8063                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8064                     goto onError;
8065             }
8066             else {
8067                 writer->overallocate = 1;
8068                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8069                     goto onError;
8070             }
8071         }
8072         else {
8073             /* wrong return value */
8074             PyErr_SetString(PyExc_TypeError,
8075                             "character mapping must return integer, None or str");
8076             goto onError;
8077         }
8078         Py_CLEAR(item);
8079         ++s;
8080         continue;
8081 
8082 Undefined:
8083         /* undefined mapping */
8084         Py_CLEAR(item);
8085         startinpos = s-starts;
8086         endinpos = startinpos+1;
8087         if (unicode_decode_call_errorhandler_writer(
8088                 errors, &errorHandler,
8089                 "charmap", "character maps to <undefined>",
8090                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8091                 writer)) {
8092             goto onError;
8093         }
8094     }
8095     Py_XDECREF(errorHandler);
8096     Py_XDECREF(exc);
8097     return 0;
8098 
8099 onError:
8100     Py_XDECREF(item);
8101     Py_XDECREF(errorHandler);
8102     Py_XDECREF(exc);
8103     return -1;
8104 }
8105 
8106 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8107 PyUnicode_DecodeCharmap(const char *s,
8108                         Py_ssize_t size,
8109                         PyObject *mapping,
8110                         const char *errors)
8111 {
8112     _PyUnicodeWriter writer;
8113 
8114     /* Default to Latin-1 */
8115     if (mapping == NULL)
8116         return PyUnicode_DecodeLatin1(s, size, errors);
8117 
8118     if (size == 0)
8119         _Py_RETURN_UNICODE_EMPTY();
8120     _PyUnicodeWriter_Init(&writer);
8121     writer.min_length = size;
8122     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8123         goto onError;
8124 
8125     if (PyUnicode_CheckExact(mapping)) {
8126         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8127             goto onError;
8128     }
8129     else {
8130         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8131             goto onError;
8132     }
8133     return _PyUnicodeWriter_Finish(&writer);
8134 
8135   onError:
8136     _PyUnicodeWriter_Dealloc(&writer);
8137     return NULL;
8138 }
8139 
8140 /* Charmap encoding: the lookup table */
8141 
8142 /*[clinic input]
8143 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8144 [clinic start generated code]*/
8145 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8146 
8147 struct encoding_map {
8148     PyObject_HEAD
8149     unsigned char level1[32];
8150     int count2, count3;
8151     unsigned char level23[1];
8152 };
8153 
8154 /*[clinic input]
8155 EncodingMap.size
8156 
8157 Return the size (in bytes) of this object.
8158 [clinic start generated code]*/
8159 
8160 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8161 EncodingMap_size_impl(struct encoding_map *self)
8162 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8163 {
8164     return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8165                            128*self->count3);
8166 }
8167 
8168 static PyMethodDef encoding_map_methods[] = {
8169     ENCODINGMAP_SIZE_METHODDEF
8170     {NULL, NULL}
8171 };
8172 
8173 static PyTypeObject EncodingMapType = {
8174     PyVarObject_HEAD_INIT(NULL, 0)
8175     .tp_name = "EncodingMap",
8176     .tp_basicsize = sizeof(struct encoding_map),
8177     /* methods */
8178     .tp_flags = Py_TPFLAGS_DEFAULT,
8179     .tp_methods = encoding_map_methods,
8180 };
8181 
8182 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8183 PyUnicode_BuildEncodingMap(PyObject* string)
8184 {
8185     PyObject *result;
8186     struct encoding_map *mresult;
8187     int i;
8188     int need_dict = 0;
8189     unsigned char level1[32];
8190     unsigned char level2[512];
8191     unsigned char *mlevel1, *mlevel2, *mlevel3;
8192     int count2 = 0, count3 = 0;
8193     int kind;
8194     const void *data;
8195     Py_ssize_t length;
8196     Py_UCS4 ch;
8197 
8198     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8199         PyErr_BadArgument();
8200         return NULL;
8201     }
8202     kind = PyUnicode_KIND(string);
8203     data = PyUnicode_DATA(string);
8204     length = PyUnicode_GET_LENGTH(string);
8205     length = Py_MIN(length, 256);
8206     memset(level1, 0xFF, sizeof level1);
8207     memset(level2, 0xFF, sizeof level2);
8208 
8209     /* If there isn't a one-to-one mapping of NULL to \0,
8210        or if there are non-BMP characters, we need to use
8211        a mapping dictionary. */
8212     if (PyUnicode_READ(kind, data, 0) != 0)
8213         need_dict = 1;
8214     for (i = 1; i < length; i++) {
8215         int l1, l2;
8216         ch = PyUnicode_READ(kind, data, i);
8217         if (ch == 0 || ch > 0xFFFF) {
8218             need_dict = 1;
8219             break;
8220         }
8221         if (ch == 0xFFFE)
8222             /* unmapped character */
8223             continue;
8224         l1 = ch >> 11;
8225         l2 = ch >> 7;
8226         if (level1[l1] == 0xFF)
8227             level1[l1] = count2++;
8228         if (level2[l2] == 0xFF)
8229             level2[l2] = count3++;
8230     }
8231 
8232     if (count2 >= 0xFF || count3 >= 0xFF)
8233         need_dict = 1;
8234 
8235     if (need_dict) {
8236         PyObject *result = PyDict_New();
8237         if (!result)
8238             return NULL;
8239         for (i = 0; i < length; i++) {
8240             Py_UCS4 c = PyUnicode_READ(kind, data, i);
8241             PyObject *key = PyLong_FromLong(c);
8242             if (key == NULL) {
8243                 Py_DECREF(result);
8244                 return NULL;
8245             }
8246             PyObject *value = PyLong_FromLong(i);
8247             if (value == NULL) {
8248                 Py_DECREF(key);
8249                 Py_DECREF(result);
8250                 return NULL;
8251             }
8252             int rc = PyDict_SetItem(result, key, value);
8253             Py_DECREF(key);
8254             Py_DECREF(value);
8255             if (rc < 0) {
8256                 Py_DECREF(result);
8257                 return NULL;
8258             }
8259         }
8260         return result;
8261     }
8262 
8263     /* Create a three-level trie */
8264     result = PyObject_Malloc(sizeof(struct encoding_map) +
8265                              16*count2 + 128*count3 - 1);
8266     if (!result) {
8267         return PyErr_NoMemory();
8268     }
8269 
8270     _PyObject_Init(result, &EncodingMapType);
8271     mresult = (struct encoding_map*)result;
8272     mresult->count2 = count2;
8273     mresult->count3 = count3;
8274     mlevel1 = mresult->level1;
8275     mlevel2 = mresult->level23;
8276     mlevel3 = mresult->level23 + 16*count2;
8277     memcpy(mlevel1, level1, 32);
8278     memset(mlevel2, 0xFF, 16*count2);
8279     memset(mlevel3, 0, 128*count3);
8280     count3 = 0;
8281     for (i = 1; i < length; i++) {
8282         int o1, o2, o3, i2, i3;
8283         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8284         if (ch == 0xFFFE)
8285             /* unmapped character */
8286             continue;
8287         o1 = ch>>11;
8288         o2 = (ch>>7) & 0xF;
8289         i2 = 16*mlevel1[o1] + o2;
8290         if (mlevel2[i2] == 0xFF)
8291             mlevel2[i2] = count3++;
8292         o3 = ch & 0x7F;
8293         i3 = 128*mlevel2[i2] + o3;
8294         mlevel3[i3] = i;
8295     }
8296     return result;
8297 }
8298 
8299 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8300 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8301 {
8302     struct encoding_map *map = (struct encoding_map*)mapping;
8303     int l1 = c>>11;
8304     int l2 = (c>>7) & 0xF;
8305     int l3 = c & 0x7F;
8306     int i;
8307 
8308     if (c > 0xFFFF)
8309         return -1;
8310     if (c == 0)
8311         return 0;
8312     /* level 1*/
8313     i = map->level1[l1];
8314     if (i == 0xFF) {
8315         return -1;
8316     }
8317     /* level 2*/
8318     i = map->level23[16*i+l2];
8319     if (i == 0xFF) {
8320         return -1;
8321     }
8322     /* level 3 */
8323     i = map->level23[16*map->count2 + 128*i + l3];
8324     if (i == 0) {
8325         return -1;
8326     }
8327     return i;
8328 }
8329 
8330 /* Lookup the character ch in the mapping. If the character
8331    can't be found, Py_None is returned (or NULL, if another
8332    error occurred). */
8333 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8334 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8335 {
8336     PyObject *w = PyLong_FromLong((long)c);
8337     PyObject *x;
8338 
8339     if (w == NULL)
8340         return NULL;
8341     x = PyObject_GetItem(mapping, w);
8342     Py_DECREF(w);
8343     if (x == NULL) {
8344         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345             /* No mapping found means: mapping is undefined. */
8346             PyErr_Clear();
8347             Py_RETURN_NONE;
8348         } else
8349             return NULL;
8350     }
8351     else if (x == Py_None)
8352         return x;
8353     else if (PyLong_Check(x)) {
8354         long value = PyLong_AS_LONG(x);
8355         if (value < 0 || value > 255) {
8356             PyErr_SetString(PyExc_TypeError,
8357                             "character mapping must be in range(256)");
8358             Py_DECREF(x);
8359             return NULL;
8360         }
8361         return x;
8362     }
8363     else if (PyBytes_Check(x))
8364         return x;
8365     else {
8366         /* wrong return value */
8367         PyErr_Format(PyExc_TypeError,
8368                      "character mapping must return integer, bytes or None, not %.400s",
8369                      Py_TYPE(x)->tp_name);
8370         Py_DECREF(x);
8371         return NULL;
8372     }
8373 }
8374 
8375 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8376 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8377 {
8378     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8379     /* exponentially overallocate to minimize reallocations */
8380     if (requiredsize < 2*outsize)
8381         requiredsize = 2*outsize;
8382     if (_PyBytes_Resize(outobj, requiredsize))
8383         return -1;
8384     return 0;
8385 }
8386 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or a resize failed */
} charmapencode_result;
8390 /* lookup the character, put the result in the output string and adjust
8391    various state variables. Resize the output bytes object if not enough
8392    space is available. Return a new reference to the object that
8393    was put in the output buffer, or Py_None, if the mapping was undefined
8394    (in which case no character was written) or NULL, if a
8395    reallocation error occurred. The caller must decref the result */
8396 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8397 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8398                      PyObject **outobj, Py_ssize_t *outpos)
8399 {
8400     PyObject *rep;
8401     char *outstart;
8402     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8403 
8404     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8405         int res = encoding_map_lookup(c, mapping);
8406         Py_ssize_t requiredsize = *outpos+1;
8407         if (res == -1)
8408             return enc_FAILED;
8409         if (outsize<requiredsize)
8410             if (charmapencode_resize(outobj, outpos, requiredsize))
8411                 return enc_EXCEPTION;
8412         outstart = PyBytes_AS_STRING(*outobj);
8413         outstart[(*outpos)++] = (char)res;
8414         return enc_SUCCESS;
8415     }
8416 
8417     rep = charmapencode_lookup(c, mapping);
8418     if (rep==NULL)
8419         return enc_EXCEPTION;
8420     else if (rep==Py_None) {
8421         Py_DECREF(rep);
8422         return enc_FAILED;
8423     } else {
8424         if (PyLong_Check(rep)) {
8425             Py_ssize_t requiredsize = *outpos+1;
8426             if (outsize<requiredsize)
8427                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8428                     Py_DECREF(rep);
8429                     return enc_EXCEPTION;
8430                 }
8431             outstart = PyBytes_AS_STRING(*outobj);
8432             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8433         }
8434         else {
8435             const char *repchars = PyBytes_AS_STRING(rep);
8436             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8437             Py_ssize_t requiredsize = *outpos+repsize;
8438             if (outsize<requiredsize)
8439                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8440                     Py_DECREF(rep);
8441                     return enc_EXCEPTION;
8442                 }
8443             outstart = PyBytes_AS_STRING(*outobj);
8444             memcpy(outstart + *outpos, repchars, repsize);
8445             *outpos += repsize;
8446         }
8447     }
8448     Py_DECREF(rep);
8449     return enc_SUCCESS;
8450 }
8451 
8452 /* handle an error in PyUnicode_EncodeCharmap
8453    Return 0 on success, -1 on error */
8454 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8455 charmap_encoding_error(
8456     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8457     PyObject **exceptionObject,
8458     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8459     PyObject **res, Py_ssize_t *respos)
8460 {
8461     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8462     Py_ssize_t size, repsize;
8463     Py_ssize_t newpos;
8464     int kind;
8465     const void *data;
8466     Py_ssize_t index;
8467     /* startpos for collecting unencodable chars */
8468     Py_ssize_t collstartpos = *inpos;
8469     Py_ssize_t collendpos = *inpos+1;
8470     Py_ssize_t collpos;
8471     const char *encoding = "charmap";
8472     const char *reason = "character maps to <undefined>";
8473     charmapencode_result x;
8474     Py_UCS4 ch;
8475     int val;
8476 
8477     size = PyUnicode_GET_LENGTH(unicode);
8478     /* find all unencodable characters */
8479     while (collendpos < size) {
8480         PyObject *rep;
8481         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8482             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8483             val = encoding_map_lookup(ch, mapping);
8484             if (val != -1)
8485                 break;
8486             ++collendpos;
8487             continue;
8488         }
8489 
8490         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8491         rep = charmapencode_lookup(ch, mapping);
8492         if (rep==NULL)
8493             return -1;
8494         else if (rep!=Py_None) {
8495             Py_DECREF(rep);
8496             break;
8497         }
8498         Py_DECREF(rep);
8499         ++collendpos;
8500     }
8501     /* cache callback name lookup
8502      * (if not done yet, i.e. it's the first error) */
8503     if (*error_handler == _Py_ERROR_UNKNOWN)
8504         *error_handler = _Py_GetErrorHandler(errors);
8505 
8506     switch (*error_handler) {
8507     case _Py_ERROR_STRICT:
8508         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8509         return -1;
8510 
8511     case _Py_ERROR_REPLACE:
8512         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8513             x = charmapencode_output('?', mapping, res, respos);
8514             if (x==enc_EXCEPTION) {
8515                 return -1;
8516             }
8517             else if (x==enc_FAILED) {
8518                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8519                 return -1;
8520             }
8521         }
8522         /* fall through */
8523     case _Py_ERROR_IGNORE:
8524         *inpos = collendpos;
8525         break;
8526 
8527     case _Py_ERROR_XMLCHARREFREPLACE:
8528         /* generate replacement (temporarily (mis)uses p) */
8529         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8530             char buffer[2+29+1+1];
8531             char *cp;
8532             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8533             for (cp = buffer; *cp; ++cp) {
8534                 x = charmapencode_output(*cp, mapping, res, respos);
8535                 if (x==enc_EXCEPTION)
8536                     return -1;
8537                 else if (x==enc_FAILED) {
8538                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8539                     return -1;
8540                 }
8541             }
8542         }
8543         *inpos = collendpos;
8544         break;
8545 
8546     default:
8547         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8548                                                       encoding, reason, unicode, exceptionObject,
8549                                                       collstartpos, collendpos, &newpos);
8550         if (repunicode == NULL)
8551             return -1;
8552         if (PyBytes_Check(repunicode)) {
8553             /* Directly copy bytes result to output. */
8554             Py_ssize_t outsize = PyBytes_Size(*res);
8555             Py_ssize_t requiredsize;
8556             repsize = PyBytes_Size(repunicode);
8557             requiredsize = *respos + repsize;
8558             if (requiredsize > outsize)
8559                 /* Make room for all additional bytes. */
8560                 if (charmapencode_resize(res, respos, requiredsize)) {
8561                     Py_DECREF(repunicode);
8562                     return -1;
8563                 }
8564             memcpy(PyBytes_AsString(*res) + *respos,
8565                    PyBytes_AsString(repunicode),  repsize);
8566             *respos += repsize;
8567             *inpos = newpos;
8568             Py_DECREF(repunicode);
8569             break;
8570         }
8571         /* generate replacement  */
8572         repsize = PyUnicode_GET_LENGTH(repunicode);
8573         data = PyUnicode_DATA(repunicode);
8574         kind = PyUnicode_KIND(repunicode);
8575         for (index = 0; index < repsize; index++) {
8576             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8577             x = charmapencode_output(repch, mapping, res, respos);
8578             if (x==enc_EXCEPTION) {
8579                 Py_DECREF(repunicode);
8580                 return -1;
8581             }
8582             else if (x==enc_FAILED) {
8583                 Py_DECREF(repunicode);
8584                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8585                 return -1;
8586             }
8587         }
8588         *inpos = newpos;
8589         Py_DECREF(repunicode);
8590     }
8591     return 0;
8592 }
8593 
8594 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8595 _PyUnicode_EncodeCharmap(PyObject *unicode,
8596                          PyObject *mapping,
8597                          const char *errors)
8598 {
8599     /* output object */
8600     PyObject *res = NULL;
8601     /* current input position */
8602     Py_ssize_t inpos = 0;
8603     Py_ssize_t size;
8604     /* current output position */
8605     Py_ssize_t respos = 0;
8606     PyObject *error_handler_obj = NULL;
8607     PyObject *exc = NULL;
8608     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8609     const void *data;
8610     int kind;
8611 
8612     size = PyUnicode_GET_LENGTH(unicode);
8613     data = PyUnicode_DATA(unicode);
8614     kind = PyUnicode_KIND(unicode);
8615 
8616     /* Default to Latin-1 */
8617     if (mapping == NULL)
8618         return unicode_encode_ucs1(unicode, errors, 256);
8619 
8620     /* allocate enough for a simple encoding without
8621        replacements, if we need more, we'll resize */
8622     res = PyBytes_FromStringAndSize(NULL, size);
8623     if (res == NULL)
8624         goto onError;
8625     if (size == 0)
8626         return res;
8627 
8628     while (inpos<size) {
8629         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8630         /* try to encode it */
8631         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8632         if (x==enc_EXCEPTION) /* error */
8633             goto onError;
8634         if (x==enc_FAILED) { /* unencodable character */
8635             if (charmap_encoding_error(unicode, &inpos, mapping,
8636                                        &exc,
8637                                        &error_handler, &error_handler_obj, errors,
8638                                        &res, &respos)) {
8639                 goto onError;
8640             }
8641         }
8642         else
8643             /* done with this character => adjust input position */
8644             ++inpos;
8645     }
8646 
8647     /* Resize if we allocated to much */
8648     if (respos<PyBytes_GET_SIZE(res))
8649         if (_PyBytes_Resize(&res, respos) < 0)
8650             goto onError;
8651 
8652     Py_XDECREF(exc);
8653     Py_XDECREF(error_handler_obj);
8654     return res;
8655 
8656   onError:
8657     Py_XDECREF(res);
8658     Py_XDECREF(exc);
8659     Py_XDECREF(error_handler_obj);
8660     return NULL;
8661 }
8662 
8663 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8664 PyUnicode_AsCharmapString(PyObject *unicode,
8665                           PyObject *mapping)
8666 {
8667     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8668         PyErr_BadArgument();
8669         return NULL;
8670     }
8671     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8672 }
8673 
8674 /* create or adjust a UnicodeTranslateError */
8675 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8676 make_translate_exception(PyObject **exceptionObject,
8677                          PyObject *unicode,
8678                          Py_ssize_t startpos, Py_ssize_t endpos,
8679                          const char *reason)
8680 {
8681     if (*exceptionObject == NULL) {
8682         *exceptionObject = _PyUnicodeTranslateError_Create(
8683             unicode, startpos, endpos, reason);
8684     }
8685     else {
8686         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8687             goto onError;
8688         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8689             goto onError;
8690         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8691             goto onError;
8692         return;
8693       onError:
8694         Py_CLEAR(*exceptionObject);
8695     }
8696 }
8697 
8698 /* error handling callback helper:
8699    build arguments, call the callback and check the arguments,
8700    put the result into newpos and return the replacement string, which
8701    has to be freed by the caller */
8702 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8703 unicode_translate_call_errorhandler(const char *errors,
8704                                     PyObject **errorHandler,
8705                                     const char *reason,
8706                                     PyObject *unicode, PyObject **exceptionObject,
8707                                     Py_ssize_t startpos, Py_ssize_t endpos,
8708                                     Py_ssize_t *newpos)
8709 {
8710     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8711 
8712     Py_ssize_t i_newpos;
8713     PyObject *restuple;
8714     PyObject *resunicode;
8715 
8716     if (*errorHandler == NULL) {
8717         *errorHandler = PyCodec_LookupError(errors);
8718         if (*errorHandler == NULL)
8719             return NULL;
8720     }
8721 
8722     make_translate_exception(exceptionObject,
8723                              unicode, startpos, endpos, reason);
8724     if (*exceptionObject == NULL)
8725         return NULL;
8726 
8727     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8728     if (restuple == NULL)
8729         return NULL;
8730     if (!PyTuple_Check(restuple)) {
8731         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8732         Py_DECREF(restuple);
8733         return NULL;
8734     }
8735     if (!PyArg_ParseTuple(restuple, argparse,
8736                           &resunicode, &i_newpos)) {
8737         Py_DECREF(restuple);
8738         return NULL;
8739     }
8740     if (i_newpos<0)
8741         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8742     else
8743         *newpos = i_newpos;
8744     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8745         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8746         Py_DECREF(restuple);
8747         return NULL;
8748     }
8749     Py_INCREF(resunicode);
8750     Py_DECREF(restuple);
8751     return resunicode;
8752 }
8753 
8754 /* Lookup the character ch in the mapping and put the result in result,
8755    which must be decrefed by the caller.
8756    Return 0 on success, -1 on error */
8757 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8758 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8759 {
8760     PyObject *w = PyLong_FromLong((long)c);
8761     PyObject *x;
8762 
8763     if (w == NULL)
8764         return -1;
8765     x = PyObject_GetItem(mapping, w);
8766     Py_DECREF(w);
8767     if (x == NULL) {
8768         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8769             /* No mapping found means: use 1:1 mapping. */
8770             PyErr_Clear();
8771             *result = NULL;
8772             return 0;
8773         } else
8774             return -1;
8775     }
8776     else if (x == Py_None) {
8777         *result = x;
8778         return 0;
8779     }
8780     else if (PyLong_Check(x)) {
8781         long value = PyLong_AS_LONG(x);
8782         if (value < 0 || value > MAX_UNICODE) {
8783             PyErr_Format(PyExc_ValueError,
8784                          "character mapping must be in range(0x%x)",
8785                          MAX_UNICODE+1);
8786             Py_DECREF(x);
8787             return -1;
8788         }
8789         *result = x;
8790         return 0;
8791     }
8792     else if (PyUnicode_Check(x)) {
8793         *result = x;
8794         return 0;
8795     }
8796     else {
8797         /* wrong return value */
8798         PyErr_SetString(PyExc_TypeError,
8799                         "character mapping must return integer, None or str");
8800         Py_DECREF(x);
8801         return -1;
8802     }
8803 }
8804 
8805 /* lookup the character, write the result into the writer.
8806    Return 1 if the result was written into the writer, return 0 if the mapping
8807    was undefined, raise an exception return -1 on error. */
8808 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8809 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8810                         _PyUnicodeWriter *writer)
8811 {
8812     PyObject *item;
8813 
8814     if (charmaptranslate_lookup(ch, mapping, &item))
8815         return -1;
8816 
8817     if (item == NULL) {
8818         /* not found => default to 1:1 mapping */
8819         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8820             return -1;
8821         }
8822         return 1;
8823     }
8824 
8825     if (item == Py_None) {
8826         Py_DECREF(item);
8827         return 0;
8828     }
8829 
8830     if (PyLong_Check(item)) {
8831         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8832         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8833            used it */
8834         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8835             Py_DECREF(item);
8836             return -1;
8837         }
8838         Py_DECREF(item);
8839         return 1;
8840     }
8841 
8842     if (!PyUnicode_Check(item)) {
8843         Py_DECREF(item);
8844         return -1;
8845     }
8846 
8847     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8848         Py_DECREF(item);
8849         return -1;
8850     }
8851 
8852     Py_DECREF(item);
8853     return 1;
8854 }
8855 
8856 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8857 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8858                               Py_UCS1 *translate)
8859 {
8860     PyObject *item = NULL;
8861     int ret = 0;
8862 
8863     if (charmaptranslate_lookup(ch, mapping, &item)) {
8864         return -1;
8865     }
8866 
8867     if (item == Py_None) {
8868         /* deletion */
8869         translate[ch] = 0xfe;
8870     }
8871     else if (item == NULL) {
8872         /* not found => default to 1:1 mapping */
8873         translate[ch] = ch;
8874         return 1;
8875     }
8876     else if (PyLong_Check(item)) {
8877         long replace = PyLong_AS_LONG(item);
8878         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8879            used it */
8880         if (127 < replace) {
8881             /* invalid character or character outside ASCII:
8882                skip the fast translate */
8883             goto exit;
8884         }
8885         translate[ch] = (Py_UCS1)replace;
8886     }
8887     else if (PyUnicode_Check(item)) {
8888         Py_UCS4 replace;
8889 
8890         if (PyUnicode_GET_LENGTH(item) != 1)
8891             goto exit;
8892 
8893         replace = PyUnicode_READ_CHAR(item, 0);
8894         if (replace > 127)
8895             goto exit;
8896         translate[ch] = (Py_UCS1)replace;
8897     }
8898     else {
8899         /* not None, NULL, long or unicode */
8900         goto exit;
8901     }
8902     ret = 1;
8903 
8904   exit:
8905     Py_DECREF(item);
8906     return ret;
8907 }
8908 
8909 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8910    was translated into writer, return 0 if the input string was partially
8911    translated into writer, raise an exception and return -1 on error. */
8912 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8913 unicode_fast_translate(PyObject *input, PyObject *mapping,
8914                        _PyUnicodeWriter *writer, int ignore,
8915                        Py_ssize_t *input_pos)
8916 {
8917     Py_UCS1 ascii_table[128], ch, ch2;
8918     Py_ssize_t len;
8919     const Py_UCS1 *in, *end;
8920     Py_UCS1 *out;
8921     int res = 0;
8922 
8923     len = PyUnicode_GET_LENGTH(input);
8924 
8925     memset(ascii_table, 0xff, 128);
8926 
8927     in = PyUnicode_1BYTE_DATA(input);
8928     end = in + len;
8929 
8930     assert(PyUnicode_IS_ASCII(writer->buffer));
8931     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8932     out = PyUnicode_1BYTE_DATA(writer->buffer);
8933 
8934     for (; in < end; in++) {
8935         ch = *in;
8936         ch2 = ascii_table[ch];
8937         if (ch2 == 0xff) {
8938             int translate = unicode_fast_translate_lookup(mapping, ch,
8939                                                           ascii_table);
8940             if (translate < 0)
8941                 return -1;
8942             if (translate == 0)
8943                 goto exit;
8944             ch2 = ascii_table[ch];
8945         }
8946         if (ch2 == 0xfe) {
8947             if (ignore)
8948                 continue;
8949             goto exit;
8950         }
8951         assert(ch2 < 128);
8952         *out = ch2;
8953         out++;
8954     }
8955     res = 1;
8956 
8957 exit:
8958     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8959     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8960     return res;
8961 }
8962 
8963 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8964 _PyUnicode_TranslateCharmap(PyObject *input,
8965                             PyObject *mapping,
8966                             const char *errors)
8967 {
8968     /* input object */
8969     const void *data;
8970     Py_ssize_t size, i;
8971     int kind;
8972     /* output buffer */
8973     _PyUnicodeWriter writer;
8974     /* error handler */
8975     const char *reason = "character maps to <undefined>";
8976     PyObject *errorHandler = NULL;
8977     PyObject *exc = NULL;
8978     int ignore;
8979     int res;
8980 
8981     if (mapping == NULL) {
8982         PyErr_BadArgument();
8983         return NULL;
8984     }
8985 
8986     data = PyUnicode_DATA(input);
8987     kind = PyUnicode_KIND(input);
8988     size = PyUnicode_GET_LENGTH(input);
8989 
8990     if (size == 0)
8991         return PyUnicode_FromObject(input);
8992 
8993     /* allocate enough for a simple 1:1 translation without
8994        replacements, if we need more, we'll resize */
8995     _PyUnicodeWriter_Init(&writer);
8996     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8997         goto onError;
8998 
8999     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9000 
9001     if (PyUnicode_IS_ASCII(input)) {
9002         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9003         if (res < 0) {
9004             _PyUnicodeWriter_Dealloc(&writer);
9005             return NULL;
9006         }
9007         if (res == 1)
9008             return _PyUnicodeWriter_Finish(&writer);
9009     }
9010     else {
9011         i = 0;
9012     }
9013 
9014     while (i<size) {
9015         /* try to encode it */
9016         int translate;
9017         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9018         Py_ssize_t newpos;
9019         /* startpos for collecting untranslatable chars */
9020         Py_ssize_t collstart;
9021         Py_ssize_t collend;
9022         Py_UCS4 ch;
9023 
9024         ch = PyUnicode_READ(kind, data, i);
9025         translate = charmaptranslate_output(ch, mapping, &writer);
9026         if (translate < 0)
9027             goto onError;
9028 
9029         if (translate != 0) {
9030             /* it worked => adjust input pointer */
9031             ++i;
9032             continue;
9033         }
9034 
9035         /* untranslatable character */
9036         collstart = i;
9037         collend = i+1;
9038 
9039         /* find all untranslatable characters */
9040         while (collend < size) {
9041             PyObject *x;
9042             ch = PyUnicode_READ(kind, data, collend);
9043             if (charmaptranslate_lookup(ch, mapping, &x))
9044                 goto onError;
9045             Py_XDECREF(x);
9046             if (x != Py_None)
9047                 break;
9048             ++collend;
9049         }
9050 
9051         if (ignore) {
9052             i = collend;
9053         }
9054         else {
9055             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9056                                                              reason, input, &exc,
9057                                                              collstart, collend, &newpos);
9058             if (repunicode == NULL)
9059                 goto onError;
9060             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9061                 Py_DECREF(repunicode);
9062                 goto onError;
9063             }
9064             Py_DECREF(repunicode);
9065             i = newpos;
9066         }
9067     }
9068     Py_XDECREF(exc);
9069     Py_XDECREF(errorHandler);
9070     return _PyUnicodeWriter_Finish(&writer);
9071 
9072   onError:
9073     _PyUnicodeWriter_Dealloc(&writer);
9074     Py_XDECREF(exc);
9075     Py_XDECREF(errorHandler);
9076     return NULL;
9077 }
9078 
9079 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9080 PyUnicode_Translate(PyObject *str,
9081                     PyObject *mapping,
9082                     const char *errors)
9083 {
9084     if (ensure_unicode(str) < 0)
9085         return NULL;
9086     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9087 }
9088 
9089 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9090 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9091 {
9092     if (!PyUnicode_Check(unicode)) {
9093         PyErr_BadInternalCall();
9094         return NULL;
9095     }
9096     if (PyUnicode_IS_ASCII(unicode)) {
9097         /* If the string is already ASCII, just return the same string */
9098         return Py_NewRef(unicode);
9099     }
9100 
9101     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9102     PyObject *result = PyUnicode_New(len, 127);
9103     if (result == NULL) {
9104         return NULL;
9105     }
9106 
9107     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9108     int kind = PyUnicode_KIND(unicode);
9109     const void *data = PyUnicode_DATA(unicode);
9110     Py_ssize_t i;
9111     for (i = 0; i < len; ++i) {
9112         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9113         if (ch < 127) {
9114             out[i] = ch;
9115         }
9116         else if (Py_UNICODE_ISSPACE(ch)) {
9117             out[i] = ' ';
9118         }
9119         else {
9120             int decimal = Py_UNICODE_TODECIMAL(ch);
9121             if (decimal < 0) {
9122                 out[i] = '?';
9123                 out[i+1] = '\0';
9124                 _PyUnicode_LENGTH(result) = i + 1;
9125                 break;
9126             }
9127             out[i] = '0' + decimal;
9128         }
9129     }
9130 
9131     assert(_PyUnicode_CheckConsistency(result, 1));
9132     return result;
9133 }
9134 
9135 /* --- Helpers ------------------------------------------------------------ */
9136 
/* Helper macro to fix up start/end slice values against a length:
     - end greater than len is clamped to len;
     - a negative end or start has len added to it and is then clamped
       to 0 if it is still negative.
   `start` and `end` must be modifiable lvalues and are updated in place.
   `len` may be evaluated more than once, so it must be free of side
   effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and is safe inside an unbraced if/else; invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9151 
9152 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9153 any_find_slice(PyObject* s1, PyObject* s2,
9154                Py_ssize_t start,
9155                Py_ssize_t end,
9156                int direction)
9157 {
9158     int kind1, kind2;
9159     const void *buf1, *buf2;
9160     Py_ssize_t len1, len2, result;
9161 
9162     kind1 = PyUnicode_KIND(s1);
9163     kind2 = PyUnicode_KIND(s2);
9164     if (kind1 < kind2)
9165         return -1;
9166 
9167     len1 = PyUnicode_GET_LENGTH(s1);
9168     len2 = PyUnicode_GET_LENGTH(s2);
9169     ADJUST_INDICES(start, end, len1);
9170     if (end - start < len2)
9171         return -1;
9172 
9173     buf1 = PyUnicode_DATA(s1);
9174     buf2 = PyUnicode_DATA(s2);
9175     if (len2 == 1) {
9176         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9177         result = findchar((const char *)buf1 + kind1*start,
9178                           kind1, end - start, ch, direction);
9179         if (result == -1)
9180             return -1;
9181         else
9182             return start + result;
9183     }
9184 
9185     if (kind2 != kind1) {
9186         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9187         if (!buf2)
9188             return -2;
9189     }
9190 
9191     if (direction > 0) {
9192         switch (kind1) {
9193         case PyUnicode_1BYTE_KIND:
9194             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9195                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9196             else
9197                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9198             break;
9199         case PyUnicode_2BYTE_KIND:
9200             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9201             break;
9202         case PyUnicode_4BYTE_KIND:
9203             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9204             break;
9205         default:
9206             Py_UNREACHABLE();
9207         }
9208     }
9209     else {
9210         switch (kind1) {
9211         case PyUnicode_1BYTE_KIND:
9212             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9213                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9214             else
9215                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9216             break;
9217         case PyUnicode_2BYTE_KIND:
9218             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9219             break;
9220         case PyUnicode_4BYTE_KIND:
9221             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9222             break;
9223         default:
9224             Py_UNREACHABLE();
9225         }
9226     }
9227 
9228     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9229     if (kind2 != kind1)
9230         PyMem_Free((void *)buf2);
9231 
9232     return result;
9233 }
9234 
9235 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9236 #include "stringlib/localeutil.h"
9237 
9238 /**
9239  * InsertThousandsGrouping:
9240  * @writer: Unicode writer.
9241  * @n_buffer: Number of characters in @buffer.
9242  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9243  * @d_pos: Start of digits string.
9244  * @n_digits: The number of digits in the string, in which we want
9245  *            to put the grouping chars.
9246  * @min_width: The minimum width of the digits in the output string.
9247  *             Output will be zero-padded on the left to fill.
9248  * @grouping: see definition in localeconv().
9249  * @thousands_sep: see definition in localeconv().
9250  *
9251  * There are 2 modes: counting and filling. If @writer is NULL,
9252  *  we are in counting mode, else filling mode.
9253  * If counting, the required buffer size is returned.
9254  * If filling, we know the buffer will be large enough, so we don't
9255  *  need to pass in the buffer size.
9256  * Inserts thousand grouping characters (as defined by grouping and
9257  *  thousands_sep) into @writer.
9258  *
9259  * Return value: -1 on error, number of characters otherwise.
9260  **/
9261 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9262 _PyUnicode_InsertThousandsGrouping(
9263     _PyUnicodeWriter *writer,
9264     Py_ssize_t n_buffer,
9265     PyObject *digits,
9266     Py_ssize_t d_pos,
9267     Py_ssize_t n_digits,
9268     Py_ssize_t min_width,
9269     const char *grouping,
9270     PyObject *thousands_sep,
9271     Py_UCS4 *maxchar)
9272 {
9273     min_width = Py_MAX(0, min_width);
9274     if (writer) {
9275         assert(digits != NULL);
9276         assert(maxchar == NULL);
9277     }
9278     else {
9279         assert(digits == NULL);
9280         assert(maxchar != NULL);
9281     }
9282     assert(0 <= d_pos);
9283     assert(0 <= n_digits);
9284     assert(grouping != NULL);
9285 
9286     Py_ssize_t count = 0;
9287     Py_ssize_t n_zeros;
9288     int loop_broken = 0;
9289     int use_separator = 0; /* First time through, don't append the
9290                               separator. They only go between
9291                               groups. */
9292     Py_ssize_t buffer_pos;
9293     Py_ssize_t digits_pos;
9294     Py_ssize_t len;
9295     Py_ssize_t n_chars;
9296     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9297                                         be looked at */
9298     /* A generator that returns all of the grouping widths, until it
9299        returns 0. */
9300     GroupGenerator groupgen;
9301     GroupGenerator_init(&groupgen, grouping);
9302     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9303 
9304     /* if digits are not grouped, thousands separator
9305        should be an empty string */
9306     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9307 
9308     digits_pos = d_pos + n_digits;
9309     if (writer) {
9310         buffer_pos = writer->pos + n_buffer;
9311         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9312         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9313     }
9314     else {
9315         buffer_pos = n_buffer;
9316     }
9317 
9318     if (!writer) {
9319         *maxchar = 127;
9320     }
9321 
9322     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9323         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9324         n_zeros = Py_MAX(0, len - remaining);
9325         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9326 
9327         /* Use n_zero zero's and n_chars chars */
9328 
9329         /* Count only, don't do anything. */
9330         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9331 
9332         /* Copy into the writer. */
9333         InsertThousandsGrouping_fill(writer, &buffer_pos,
9334                                      digits, &digits_pos,
9335                                      n_chars, n_zeros,
9336                                      use_separator ? thousands_sep : NULL,
9337                                      thousands_sep_len, maxchar);
9338 
9339         /* Use a separator next time. */
9340         use_separator = 1;
9341 
9342         remaining -= n_chars;
9343         min_width -= len;
9344 
9345         if (remaining <= 0 && min_width <= 0) {
9346             loop_broken = 1;
9347             break;
9348         }
9349         min_width -= thousands_sep_len;
9350     }
9351     if (!loop_broken) {
9352         /* We left the loop without using a break statement. */
9353 
9354         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9355         n_zeros = Py_MAX(0, len - remaining);
9356         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9357 
9358         /* Use n_zero zero's and n_chars chars */
9359         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9360 
9361         /* Copy into the writer. */
9362         InsertThousandsGrouping_fill(writer, &buffer_pos,
9363                                      digits, &digits_pos,
9364                                      n_chars, n_zeros,
9365                                      use_separator ? thousands_sep : NULL,
9366                                      thousands_sep_len, maxchar);
9367     }
9368     return count;
9369 }
9370 
9371 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9372 PyUnicode_Count(PyObject *str,
9373                 PyObject *substr,
9374                 Py_ssize_t start,
9375                 Py_ssize_t end)
9376 {
9377     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9378         return -1;
9379 
9380     return unicode_count_impl(str, substr, start, end);
9381 }
9382 
9383 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9384 PyUnicode_Find(PyObject *str,
9385                PyObject *substr,
9386                Py_ssize_t start,
9387                Py_ssize_t end,
9388                int direction)
9389 {
9390     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9391         return -2;
9392 
9393     return any_find_slice(str, substr, start, end, direction);
9394 }
9395 
9396 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9397 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9398                    Py_ssize_t start, Py_ssize_t end,
9399                    int direction)
9400 {
9401     int kind;
9402     Py_ssize_t len, result;
9403     len = PyUnicode_GET_LENGTH(str);
9404     ADJUST_INDICES(start, end, len);
9405     if (end - start < 1)
9406         return -1;
9407     kind = PyUnicode_KIND(str);
9408     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9409                       kind, end-start, ch, direction);
9410     if (result == -1)
9411         return -1;
9412     else
9413         return start + result;
9414 }
9415 
9416 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9417 tailmatch(PyObject *self,
9418           PyObject *substring,
9419           Py_ssize_t start,
9420           Py_ssize_t end,
9421           int direction)
9422 {
9423     int kind_self;
9424     int kind_sub;
9425     const void *data_self;
9426     const void *data_sub;
9427     Py_ssize_t offset;
9428     Py_ssize_t i;
9429     Py_ssize_t end_sub;
9430 
9431     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9432     end -= PyUnicode_GET_LENGTH(substring);
9433     if (end < start)
9434         return 0;
9435 
9436     if (PyUnicode_GET_LENGTH(substring) == 0)
9437         return 1;
9438 
9439     kind_self = PyUnicode_KIND(self);
9440     data_self = PyUnicode_DATA(self);
9441     kind_sub = PyUnicode_KIND(substring);
9442     data_sub = PyUnicode_DATA(substring);
9443     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9444 
9445     if (direction > 0)
9446         offset = end;
9447     else
9448         offset = start;
9449 
9450     if (PyUnicode_READ(kind_self, data_self, offset) ==
9451         PyUnicode_READ(kind_sub, data_sub, 0) &&
9452         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9453         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9454         /* If both are of the same kind, memcmp is sufficient */
9455         if (kind_self == kind_sub) {
9456             return ! memcmp((char *)data_self +
9457                                 (offset * PyUnicode_KIND(substring)),
9458                             data_sub,
9459                             PyUnicode_GET_LENGTH(substring) *
9460                                 PyUnicode_KIND(substring));
9461         }
9462         /* otherwise we have to compare each character by first accessing it */
9463         else {
9464             /* We do not need to compare 0 and len(substring)-1 because
9465                the if statement above ensured already that they are equal
9466                when we end up here. */
9467             for (i = 1; i < end_sub; ++i) {
9468                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9469                     PyUnicode_READ(kind_sub, data_sub, i))
9470                     return 0;
9471             }
9472             return 1;
9473         }
9474     }
9475 
9476     return 0;
9477 }
9478 
9479 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9480 PyUnicode_Tailmatch(PyObject *str,
9481                     PyObject *substr,
9482                     Py_ssize_t start,
9483                     Py_ssize_t end,
9484                     int direction)
9485 {
9486     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9487         return -1;
9488 
9489     return tailmatch(str, substr, start, end, direction);
9490 }
9491 
9492 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9493 ascii_upper_or_lower(PyObject *self, int lower)
9494 {
9495     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9496     const char *data = PyUnicode_DATA(self);
9497     char *resdata;
9498     PyObject *res;
9499 
9500     res = PyUnicode_New(len, 127);
9501     if (res == NULL)
9502         return NULL;
9503     resdata = PyUnicode_DATA(res);
9504     if (lower)
9505         _Py_bytes_lower(resdata, data, len);
9506     else
9507         _Py_bytes_upper(resdata, data, len);
9508     return res;
9509 }
9510 
9511 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9512 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9513 {
9514     Py_ssize_t j;
9515     int final_sigma;
9516     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9517     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9518 
9519      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9520 
9521     where ! is a negation and \p{xxx} is a character with property xxx.
9522     */
9523     for (j = i - 1; j >= 0; j--) {
9524         c = PyUnicode_READ(kind, data, j);
9525         if (!_PyUnicode_IsCaseIgnorable(c))
9526             break;
9527     }
9528     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9529     if (final_sigma) {
9530         for (j = i + 1; j < length; j++) {
9531             c = PyUnicode_READ(kind, data, j);
9532             if (!_PyUnicode_IsCaseIgnorable(c))
9533                 break;
9534         }
9535         final_sigma = j == length || !_PyUnicode_IsCased(c);
9536     }
9537     return (final_sigma) ? 0x3C2 : 0x3C3;
9538 }
9539 
9540 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9541 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9542            Py_UCS4 c, Py_UCS4 *mapped)
9543 {
9544     /* Obscure special case. */
9545     if (c == 0x3A3) {
9546         mapped[0] = handle_capital_sigma(kind, data, length, i);
9547         return 1;
9548     }
9549     return _PyUnicode_ToLowerFull(c, mapped);
9550 }
9551 
9552 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9553 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9554 {
9555     Py_ssize_t i, k = 0;
9556     int n_res, j;
9557     Py_UCS4 c, mapped[3];
9558 
9559     c = PyUnicode_READ(kind, data, 0);
9560     n_res = _PyUnicode_ToTitleFull(c, mapped);
9561     for (j = 0; j < n_res; j++) {
9562         *maxchar = Py_MAX(*maxchar, mapped[j]);
9563         res[k++] = mapped[j];
9564     }
9565     for (i = 1; i < length; i++) {
9566         c = PyUnicode_READ(kind, data, i);
9567         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9568         for (j = 0; j < n_res; j++) {
9569             *maxchar = Py_MAX(*maxchar, mapped[j]);
9570             res[k++] = mapped[j];
9571         }
9572     }
9573     return k;
9574 }
9575 
9576 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9577 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9578     Py_ssize_t i, k = 0;
9579 
9580     for (i = 0; i < length; i++) {
9581         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9582         int n_res, j;
9583         if (Py_UNICODE_ISUPPER(c)) {
9584             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9585         }
9586         else if (Py_UNICODE_ISLOWER(c)) {
9587             n_res = _PyUnicode_ToUpperFull(c, mapped);
9588         }
9589         else {
9590             n_res = 1;
9591             mapped[0] = c;
9592         }
9593         for (j = 0; j < n_res; j++) {
9594             *maxchar = Py_MAX(*maxchar, mapped[j]);
9595             res[k++] = mapped[j];
9596         }
9597     }
9598     return k;
9599 }
9600 
9601 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9602 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9603                   Py_UCS4 *maxchar, int lower)
9604 {
9605     Py_ssize_t i, k = 0;
9606 
9607     for (i = 0; i < length; i++) {
9608         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9609         int n_res, j;
9610         if (lower)
9611             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9612         else
9613             n_res = _PyUnicode_ToUpperFull(c, mapped);
9614         for (j = 0; j < n_res; j++) {
9615             *maxchar = Py_MAX(*maxchar, mapped[j]);
9616             res[k++] = mapped[j];
9617         }
9618     }
9619     return k;
9620 }
9621 
9622 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9623 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9624 {
9625     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9626 }
9627 
9628 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9629 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9630 {
9631     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9632 }
9633 
9634 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9635 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9636 {
9637     Py_ssize_t i, k = 0;
9638 
9639     for (i = 0; i < length; i++) {
9640         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9641         Py_UCS4 mapped[3];
9642         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9643         for (j = 0; j < n_res; j++) {
9644             *maxchar = Py_MAX(*maxchar, mapped[j]);
9645             res[k++] = mapped[j];
9646         }
9647     }
9648     return k;
9649 }
9650 
9651 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9652 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9653 {
9654     Py_ssize_t i, k = 0;
9655     int previous_is_cased;
9656 
9657     previous_is_cased = 0;
9658     for (i = 0; i < length; i++) {
9659         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9660         Py_UCS4 mapped[3];
9661         int n_res, j;
9662 
9663         if (previous_is_cased)
9664             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9665         else
9666             n_res = _PyUnicode_ToTitleFull(c, mapped);
9667 
9668         for (j = 0; j < n_res; j++) {
9669             *maxchar = Py_MAX(*maxchar, mapped[j]);
9670             res[k++] = mapped[j];
9671         }
9672 
9673         previous_is_cased = _PyUnicode_IsCased(c);
9674     }
9675     return k;
9676 }
9677 
9678 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9679 case_operation(PyObject *self,
9680                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9681 {
9682     PyObject *res = NULL;
9683     Py_ssize_t length, newlength = 0;
9684     int kind, outkind;
9685     const void *data;
9686     void *outdata;
9687     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9688 
9689     kind = PyUnicode_KIND(self);
9690     data = PyUnicode_DATA(self);
9691     length = PyUnicode_GET_LENGTH(self);
9692     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9693         PyErr_SetString(PyExc_OverflowError, "string is too long");
9694         return NULL;
9695     }
9696     tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9697     if (tmp == NULL)
9698         return PyErr_NoMemory();
9699     newlength = perform(kind, data, length, tmp, &maxchar);
9700     res = PyUnicode_New(newlength, maxchar);
9701     if (res == NULL)
9702         goto leave;
9703     tmpend = tmp + newlength;
9704     outdata = PyUnicode_DATA(res);
9705     outkind = PyUnicode_KIND(res);
9706     switch (outkind) {
9707     case PyUnicode_1BYTE_KIND:
9708         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9709         break;
9710     case PyUnicode_2BYTE_KIND:
9711         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9712         break;
9713     case PyUnicode_4BYTE_KIND:
9714         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9715         break;
9716     default:
9717         Py_UNREACHABLE();
9718     }
9719   leave:
9720     PyMem_Free(tmp);
9721     return res;
9722 }
9723 
9724 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9725 PyUnicode_Join(PyObject *separator, PyObject *seq)
9726 {
9727     PyObject *res;
9728     PyObject *fseq;
9729     Py_ssize_t seqlen;
9730     PyObject **items;
9731 
9732     fseq = PySequence_Fast(seq, "can only join an iterable");
9733     if (fseq == NULL) {
9734         return NULL;
9735     }
9736 
9737     Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
9738 
9739     items = PySequence_Fast_ITEMS(fseq);
9740     seqlen = PySequence_Fast_GET_SIZE(fseq);
9741     res = _PyUnicode_JoinArray(separator, items, seqlen);
9742 
9743     Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
9744 
9745     Py_DECREF(fseq);
9746     return res;
9747 }
9748 
9749 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9750 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9751 {
9752     PyObject *res = NULL; /* the result */
9753     PyObject *sep = NULL;
9754     Py_ssize_t seplen;
9755     PyObject *item;
9756     Py_ssize_t sz, i, res_offset;
9757     Py_UCS4 maxchar;
9758     Py_UCS4 item_maxchar;
9759     int use_memcpy;
9760     unsigned char *res_data = NULL, *sep_data = NULL;
9761     PyObject *last_obj;
9762     int kind = 0;
9763 
9764     /* If empty sequence, return u"". */
9765     if (seqlen == 0) {
9766         _Py_RETURN_UNICODE_EMPTY();
9767     }
9768 
9769     /* If singleton sequence with an exact Unicode, return that. */
9770     last_obj = NULL;
9771     if (seqlen == 1) {
9772         if (PyUnicode_CheckExact(items[0])) {
9773             res = items[0];
9774             return Py_NewRef(res);
9775         }
9776         seplen = 0;
9777         maxchar = 0;
9778     }
9779     else {
9780         /* Set up sep and seplen */
9781         if (separator == NULL) {
9782             /* fall back to a blank space separator */
9783             sep = PyUnicode_FromOrdinal(' ');
9784             if (!sep)
9785                 goto onError;
9786             seplen = 1;
9787             maxchar = 32;
9788         }
9789         else {
9790             if (!PyUnicode_Check(separator)) {
9791                 PyErr_Format(PyExc_TypeError,
9792                              "separator: expected str instance,"
9793                              " %.80s found",
9794                              Py_TYPE(separator)->tp_name);
9795                 goto onError;
9796             }
9797             sep = separator;
9798             seplen = PyUnicode_GET_LENGTH(separator);
9799             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9800             /* inc refcount to keep this code path symmetric with the
9801                above case of a blank separator */
9802             Py_INCREF(sep);
9803         }
9804         last_obj = sep;
9805     }
9806 
9807     /* There are at least two things to join, or else we have a subclass
9808      * of str in the sequence.
9809      * Do a pre-pass to figure out the total amount of space we'll
9810      * need (sz), and see whether all argument are strings.
9811      */
9812     sz = 0;
9813 #ifdef Py_DEBUG
9814     use_memcpy = 0;
9815 #else
9816     use_memcpy = 1;
9817 #endif
9818     for (i = 0; i < seqlen; i++) {
9819         size_t add_sz;
9820         item = items[i];
9821         if (!PyUnicode_Check(item)) {
9822             PyErr_Format(PyExc_TypeError,
9823                          "sequence item %zd: expected str instance,"
9824                          " %.80s found",
9825                          i, Py_TYPE(item)->tp_name);
9826             goto onError;
9827         }
9828         add_sz = PyUnicode_GET_LENGTH(item);
9829         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9830         maxchar = Py_MAX(maxchar, item_maxchar);
9831         if (i != 0) {
9832             add_sz += seplen;
9833         }
9834         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
9835             PyErr_SetString(PyExc_OverflowError,
9836                             "join() result is too long for a Python string");
9837             goto onError;
9838         }
9839         sz += add_sz;
9840         if (use_memcpy && last_obj != NULL) {
9841             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9842                 use_memcpy = 0;
9843         }
9844         last_obj = item;
9845     }
9846 
9847     res = PyUnicode_New(sz, maxchar);
9848     if (res == NULL)
9849         goto onError;
9850 
9851     /* Catenate everything. */
9852 #ifdef Py_DEBUG
9853     use_memcpy = 0;
9854 #else
9855     if (use_memcpy) {
9856         res_data = PyUnicode_1BYTE_DATA(res);
9857         kind = PyUnicode_KIND(res);
9858         if (seplen != 0)
9859             sep_data = PyUnicode_1BYTE_DATA(sep);
9860     }
9861 #endif
9862     if (use_memcpy) {
9863         for (i = 0; i < seqlen; ++i) {
9864             Py_ssize_t itemlen;
9865             item = items[i];
9866 
9867             /* Copy item, and maybe the separator. */
9868             if (i && seplen != 0) {
9869                 memcpy(res_data,
9870                           sep_data,
9871                           kind * seplen);
9872                 res_data += kind * seplen;
9873             }
9874 
9875             itemlen = PyUnicode_GET_LENGTH(item);
9876             if (itemlen != 0) {
9877                 memcpy(res_data,
9878                           PyUnicode_DATA(item),
9879                           kind * itemlen);
9880                 res_data += kind * itemlen;
9881             }
9882         }
9883         assert(res_data == PyUnicode_1BYTE_DATA(res)
9884                            + kind * PyUnicode_GET_LENGTH(res));
9885     }
9886     else {
9887         for (i = 0, res_offset = 0; i < seqlen; ++i) {
9888             Py_ssize_t itemlen;
9889             item = items[i];
9890 
9891             /* Copy item, and maybe the separator. */
9892             if (i && seplen != 0) {
9893                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9894                 res_offset += seplen;
9895             }
9896 
9897             itemlen = PyUnicode_GET_LENGTH(item);
9898             if (itemlen != 0) {
9899                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9900                 res_offset += itemlen;
9901             }
9902         }
9903         assert(res_offset == PyUnicode_GET_LENGTH(res));
9904     }
9905 
9906     Py_XDECREF(sep);
9907     assert(_PyUnicode_CheckConsistency(res, 1));
9908     return res;
9909 
9910   onError:
9911     Py_XDECREF(sep);
9912     Py_XDECREF(res);
9913     return NULL;
9914 }
9915 
9916 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)9917 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9918                     Py_UCS4 fill_char)
9919 {
9920     const int kind = PyUnicode_KIND(unicode);
9921     void *data = PyUnicode_DATA(unicode);
9922     assert(unicode_modifiable(unicode));
9923     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9924     assert(start >= 0);
9925     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9926     unicode_fill(kind, data, fill_char, start, length);
9927 }
9928 
9929 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)9930 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9931                Py_UCS4 fill_char)
9932 {
9933     Py_ssize_t maxlen;
9934 
9935     if (!PyUnicode_Check(unicode)) {
9936         PyErr_BadInternalCall();
9937         return -1;
9938     }
9939     if (unicode_check_modifiable(unicode))
9940         return -1;
9941 
9942     if (start < 0) {
9943         PyErr_SetString(PyExc_IndexError, "string index out of range");
9944         return -1;
9945     }
9946     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9947         PyErr_SetString(PyExc_ValueError,
9948                          "fill character is bigger than "
9949                          "the string maximum character");
9950         return -1;
9951     }
9952 
9953     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9954     length = Py_MIN(maxlen, length);
9955     if (length <= 0)
9956         return 0;
9957 
9958     _PyUnicode_FastFill(unicode, start, length, fill_char);
9959     return length;
9960 }
9961 
9962 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)9963 pad(PyObject *self,
9964     Py_ssize_t left,
9965     Py_ssize_t right,
9966     Py_UCS4 fill)
9967 {
9968     PyObject *u;
9969     Py_UCS4 maxchar;
9970     int kind;
9971     void *data;
9972 
9973     if (left < 0)
9974         left = 0;
9975     if (right < 0)
9976         right = 0;
9977 
9978     if (left == 0 && right == 0)
9979         return unicode_result_unchanged(self);
9980 
9981     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9982         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9983         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9984         return NULL;
9985     }
9986     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9987     maxchar = Py_MAX(maxchar, fill);
9988     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9989     if (!u)
9990         return NULL;
9991 
9992     kind = PyUnicode_KIND(u);
9993     data = PyUnicode_DATA(u);
9994     if (left)
9995         unicode_fill(kind, data, fill, 0, left);
9996     if (right)
9997         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9998     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9999     assert(_PyUnicode_CheckConsistency(u, 1));
10000     return u;
10001 }
10002 
10003 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10004 PyUnicode_Splitlines(PyObject *string, int keepends)
10005 {
10006     PyObject *list;
10007 
10008     if (ensure_unicode(string) < 0)
10009         return NULL;
10010 
10011     switch (PyUnicode_KIND(string)) {
10012     case PyUnicode_1BYTE_KIND:
10013         if (PyUnicode_IS_ASCII(string))
10014             list = asciilib_splitlines(
10015                 string, PyUnicode_1BYTE_DATA(string),
10016                 PyUnicode_GET_LENGTH(string), keepends);
10017         else
10018             list = ucs1lib_splitlines(
10019                 string, PyUnicode_1BYTE_DATA(string),
10020                 PyUnicode_GET_LENGTH(string), keepends);
10021         break;
10022     case PyUnicode_2BYTE_KIND:
10023         list = ucs2lib_splitlines(
10024             string, PyUnicode_2BYTE_DATA(string),
10025             PyUnicode_GET_LENGTH(string), keepends);
10026         break;
10027     case PyUnicode_4BYTE_KIND:
10028         list = ucs4lib_splitlines(
10029             string, PyUnicode_4BYTE_DATA(string),
10030             PyUnicode_GET_LENGTH(string), keepends);
10031         break;
10032     default:
10033         Py_UNREACHABLE();
10034     }
10035     return list;
10036 }
10037 
10038 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10039 split(PyObject *self,
10040       PyObject *substring,
10041       Py_ssize_t maxcount)
10042 {
10043     int kind1, kind2;
10044     const void *buf1, *buf2;
10045     Py_ssize_t len1, len2;
10046     PyObject* out;
10047     len1 = PyUnicode_GET_LENGTH(self);
10048     kind1 = PyUnicode_KIND(self);
10049 
10050     if (substring == NULL) {
10051         if (maxcount < 0) {
10052             maxcount = (len1 - 1) / 2 + 1;
10053         }
10054         switch (kind1) {
10055         case PyUnicode_1BYTE_KIND:
10056             if (PyUnicode_IS_ASCII(self))
10057                 return asciilib_split_whitespace(
10058                     self,  PyUnicode_1BYTE_DATA(self),
10059                     len1, maxcount
10060                     );
10061             else
10062                 return ucs1lib_split_whitespace(
10063                     self,  PyUnicode_1BYTE_DATA(self),
10064                     len1, maxcount
10065                     );
10066         case PyUnicode_2BYTE_KIND:
10067             return ucs2lib_split_whitespace(
10068                 self,  PyUnicode_2BYTE_DATA(self),
10069                 len1, maxcount
10070                 );
10071         case PyUnicode_4BYTE_KIND:
10072             return ucs4lib_split_whitespace(
10073                 self,  PyUnicode_4BYTE_DATA(self),
10074                 len1, maxcount
10075                 );
10076         default:
10077             Py_UNREACHABLE();
10078         }
10079     }
10080 
10081     kind2 = PyUnicode_KIND(substring);
10082     len2 = PyUnicode_GET_LENGTH(substring);
10083     if (maxcount < 0) {
10084         // if len2 == 0, it will raise ValueError.
10085         maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10086         // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10087         maxcount = maxcount < 0 ? len1 : maxcount;
10088     }
10089     if (kind1 < kind2 || len1 < len2) {
10090         out = PyList_New(1);
10091         if (out == NULL)
10092             return NULL;
10093         PyList_SET_ITEM(out, 0, Py_NewRef(self));
10094         return out;
10095     }
10096     buf1 = PyUnicode_DATA(self);
10097     buf2 = PyUnicode_DATA(substring);
10098     if (kind2 != kind1) {
10099         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10100         if (!buf2)
10101             return NULL;
10102     }
10103 
10104     switch (kind1) {
10105     case PyUnicode_1BYTE_KIND:
10106         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10107             out = asciilib_split(
10108                 self,  buf1, len1, buf2, len2, maxcount);
10109         else
10110             out = ucs1lib_split(
10111                 self,  buf1, len1, buf2, len2, maxcount);
10112         break;
10113     case PyUnicode_2BYTE_KIND:
10114         out = ucs2lib_split(
10115             self,  buf1, len1, buf2, len2, maxcount);
10116         break;
10117     case PyUnicode_4BYTE_KIND:
10118         out = ucs4lib_split(
10119             self,  buf1, len1, buf2, len2, maxcount);
10120         break;
10121     default:
10122         out = NULL;
10123     }
10124     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10125     if (kind2 != kind1)
10126         PyMem_Free((void *)buf2);
10127     return out;
10128 }
10129 
10130 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10131 rsplit(PyObject *self,
10132        PyObject *substring,
10133        Py_ssize_t maxcount)
10134 {
10135     int kind1, kind2;
10136     const void *buf1, *buf2;
10137     Py_ssize_t len1, len2;
10138     PyObject* out;
10139 
10140     len1 = PyUnicode_GET_LENGTH(self);
10141     kind1 = PyUnicode_KIND(self);
10142 
10143     if (substring == NULL) {
10144         if (maxcount < 0) {
10145             maxcount = (len1 - 1) / 2 + 1;
10146         }
10147         switch (kind1) {
10148         case PyUnicode_1BYTE_KIND:
10149             if (PyUnicode_IS_ASCII(self))
10150                 return asciilib_rsplit_whitespace(
10151                     self,  PyUnicode_1BYTE_DATA(self),
10152                     len1, maxcount
10153                     );
10154             else
10155                 return ucs1lib_rsplit_whitespace(
10156                     self,  PyUnicode_1BYTE_DATA(self),
10157                     len1, maxcount
10158                     );
10159         case PyUnicode_2BYTE_KIND:
10160             return ucs2lib_rsplit_whitespace(
10161                 self,  PyUnicode_2BYTE_DATA(self),
10162                 len1, maxcount
10163                 );
10164         case PyUnicode_4BYTE_KIND:
10165             return ucs4lib_rsplit_whitespace(
10166                 self,  PyUnicode_4BYTE_DATA(self),
10167                 len1, maxcount
10168                 );
10169         default:
10170             Py_UNREACHABLE();
10171         }
10172     }
10173     kind2 = PyUnicode_KIND(substring);
10174     len2 = PyUnicode_GET_LENGTH(substring);
10175     if (maxcount < 0) {
10176         // if len2 == 0, it will raise ValueError.
10177         maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10178         // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10179         maxcount = maxcount < 0 ? len1 : maxcount;
10180     }
10181     if (kind1 < kind2 || len1 < len2) {
10182         out = PyList_New(1);
10183         if (out == NULL)
10184             return NULL;
10185         PyList_SET_ITEM(out, 0, Py_NewRef(self));
10186         return out;
10187     }
10188     buf1 = PyUnicode_DATA(self);
10189     buf2 = PyUnicode_DATA(substring);
10190     if (kind2 != kind1) {
10191         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10192         if (!buf2)
10193             return NULL;
10194     }
10195 
10196     switch (kind1) {
10197     case PyUnicode_1BYTE_KIND:
10198         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10199             out = asciilib_rsplit(
10200                 self,  buf1, len1, buf2, len2, maxcount);
10201         else
10202             out = ucs1lib_rsplit(
10203                 self,  buf1, len1, buf2, len2, maxcount);
10204         break;
10205     case PyUnicode_2BYTE_KIND:
10206         out = ucs2lib_rsplit(
10207             self,  buf1, len1, buf2, len2, maxcount);
10208         break;
10209     case PyUnicode_4BYTE_KIND:
10210         out = ucs4lib_rsplit(
10211             self,  buf1, len1, buf2, len2, maxcount);
10212         break;
10213     default:
10214         out = NULL;
10215     }
10216     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10217     if (kind2 != kind1)
10218         PyMem_Free((void *)buf2);
10219     return out;
10220 }
10221 
10222 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10223 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10224             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10225 {
10226     switch (kind) {
10227     case PyUnicode_1BYTE_KIND:
10228         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10229             return asciilib_find(buf1, len1, buf2, len2, offset);
10230         else
10231             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10232     case PyUnicode_2BYTE_KIND:
10233         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10234     case PyUnicode_4BYTE_KIND:
10235         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10236     }
10237     Py_UNREACHABLE();
10238 }
10239 
10240 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10241 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10242              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10243 {
10244     switch (kind) {
10245     case PyUnicode_1BYTE_KIND:
10246         return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10247     case PyUnicode_2BYTE_KIND:
10248         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10249     case PyUnicode_4BYTE_KIND:
10250         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10251     }
10252     Py_UNREACHABLE();
10253 }
10254 
10255 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10256 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10257                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10258 {
10259     int kind = PyUnicode_KIND(u);
10260     void *data = PyUnicode_DATA(u);
10261     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10262     if (kind == PyUnicode_1BYTE_KIND) {
10263         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10264                                       (Py_UCS1 *)data + len,
10265                                       u1, u2, maxcount);
10266     }
10267     else if (kind == PyUnicode_2BYTE_KIND) {
10268         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10269                                       (Py_UCS2 *)data + len,
10270                                       u1, u2, maxcount);
10271     }
10272     else {
10273         assert(kind == PyUnicode_4BYTE_KIND);
10274         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10275                                       (Py_UCS4 *)data + len,
10276                                       u1, u2, maxcount);
10277     }
10278 }
10279 
10280 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10281 replace(PyObject *self, PyObject *str1,
10282         PyObject *str2, Py_ssize_t maxcount)
10283 {
10284     PyObject *u;
10285     const char *sbuf = PyUnicode_DATA(self);
10286     const void *buf1 = PyUnicode_DATA(str1);
10287     const void *buf2 = PyUnicode_DATA(str2);
10288     int srelease = 0, release1 = 0, release2 = 0;
10289     int skind = PyUnicode_KIND(self);
10290     int kind1 = PyUnicode_KIND(str1);
10291     int kind2 = PyUnicode_KIND(str2);
10292     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10293     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10294     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10295     int mayshrink;
10296     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10297 
10298     if (slen < len1)
10299         goto nothing;
10300 
10301     if (maxcount < 0)
10302         maxcount = PY_SSIZE_T_MAX;
10303     else if (maxcount == 0)
10304         goto nothing;
10305 
10306     if (str1 == str2)
10307         goto nothing;
10308 
10309     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10310     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10311     if (maxchar < maxchar_str1)
10312         /* substring too wide to be present */
10313         goto nothing;
10314     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10315     /* Replacing str1 with str2 may cause a maxchar reduction in the
10316        result string. */
10317     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10318     maxchar = Py_MAX(maxchar, maxchar_str2);
10319 
10320     if (len1 == len2) {
10321         /* same length */
10322         if (len1 == 0)
10323             goto nothing;
10324         if (len1 == 1) {
10325             /* replace characters */
10326             Py_UCS4 u1, u2;
10327             Py_ssize_t pos;
10328 
10329             u1 = PyUnicode_READ(kind1, buf1, 0);
10330             pos = findchar(sbuf, skind, slen, u1, 1);
10331             if (pos < 0)
10332                 goto nothing;
10333             u2 = PyUnicode_READ(kind2, buf2, 0);
10334             u = PyUnicode_New(slen, maxchar);
10335             if (!u)
10336                 goto error;
10337 
10338             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10339             replace_1char_inplace(u, pos, u1, u2, maxcount);
10340         }
10341         else {
10342             int rkind = skind;
10343             char *res;
10344             Py_ssize_t i;
10345 
10346             if (kind1 < rkind) {
10347                 /* widen substring */
10348                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10349                 if (!buf1) goto error;
10350                 release1 = 1;
10351             }
10352             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10353             if (i < 0)
10354                 goto nothing;
10355             if (rkind > kind2) {
10356                 /* widen replacement */
10357                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10358                 if (!buf2) goto error;
10359                 release2 = 1;
10360             }
10361             else if (rkind < kind2) {
10362                 /* widen self and buf1 */
10363                 rkind = kind2;
10364                 if (release1) {
10365                     assert(buf1 != PyUnicode_DATA(str1));
10366                     PyMem_Free((void *)buf1);
10367                     buf1 = PyUnicode_DATA(str1);
10368                     release1 = 0;
10369                 }
10370                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10371                 if (!sbuf) goto error;
10372                 srelease = 1;
10373                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10374                 if (!buf1) goto error;
10375                 release1 = 1;
10376             }
10377             u = PyUnicode_New(slen, maxchar);
10378             if (!u)
10379                 goto error;
10380             assert(PyUnicode_KIND(u) == rkind);
10381             res = PyUnicode_DATA(u);
10382 
10383             memcpy(res, sbuf, rkind * slen);
10384             /* change everything in-place, starting with this one */
10385             memcpy(res + rkind * i,
10386                    buf2,
10387                    rkind * len2);
10388             i += len1;
10389 
10390             while ( --maxcount > 0) {
10391                 i = anylib_find(rkind, self,
10392                                 sbuf+rkind*i, slen-i,
10393                                 str1, buf1, len1, i);
10394                 if (i == -1)
10395                     break;
10396                 memcpy(res + rkind * i,
10397                        buf2,
10398                        rkind * len2);
10399                 i += len1;
10400             }
10401         }
10402     }
10403     else {
10404         Py_ssize_t n, i, j, ires;
10405         Py_ssize_t new_size;
10406         int rkind = skind;
10407         char *res;
10408 
10409         if (kind1 < rkind) {
10410             /* widen substring */
10411             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10412             if (!buf1) goto error;
10413             release1 = 1;
10414         }
10415         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10416         if (n == 0)
10417             goto nothing;
10418         if (kind2 < rkind) {
10419             /* widen replacement */
10420             buf2 = unicode_askind(kind2, buf2, len2, rkind);
10421             if (!buf2) goto error;
10422             release2 = 1;
10423         }
10424         else if (kind2 > rkind) {
10425             /* widen self and buf1 */
10426             rkind = kind2;
10427             sbuf = unicode_askind(skind, sbuf, slen, rkind);
10428             if (!sbuf) goto error;
10429             srelease = 1;
10430             if (release1) {
10431                 assert(buf1 != PyUnicode_DATA(str1));
10432                 PyMem_Free((void *)buf1);
10433                 buf1 = PyUnicode_DATA(str1);
10434                 release1 = 0;
10435             }
10436             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10437             if (!buf1) goto error;
10438             release1 = 1;
10439         }
10440         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10441            PyUnicode_GET_LENGTH(str1)); */
10442         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10443                 PyErr_SetString(PyExc_OverflowError,
10444                                 "replace string is too long");
10445                 goto error;
10446         }
10447         new_size = slen + n * (len2 - len1);
10448         if (new_size == 0) {
10449             u = unicode_get_empty();
10450             goto done;
10451         }
10452         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10453             PyErr_SetString(PyExc_OverflowError,
10454                             "replace string is too long");
10455             goto error;
10456         }
10457         u = PyUnicode_New(new_size, maxchar);
10458         if (!u)
10459             goto error;
10460         assert(PyUnicode_KIND(u) == rkind);
10461         res = PyUnicode_DATA(u);
10462         ires = i = 0;
10463         if (len1 > 0) {
10464             while (n-- > 0) {
10465                 /* look for next match */
10466                 j = anylib_find(rkind, self,
10467                                 sbuf + rkind * i, slen-i,
10468                                 str1, buf1, len1, i);
10469                 if (j == -1)
10470                     break;
10471                 else if (j > i) {
10472                     /* copy unchanged part [i:j] */
10473                     memcpy(res + rkind * ires,
10474                            sbuf + rkind * i,
10475                            rkind * (j-i));
10476                     ires += j - i;
10477                 }
10478                 /* copy substitution string */
10479                 if (len2 > 0) {
10480                     memcpy(res + rkind * ires,
10481                            buf2,
10482                            rkind * len2);
10483                     ires += len2;
10484                 }
10485                 i = j + len1;
10486             }
10487             if (i < slen)
10488                 /* copy tail [i:] */
10489                 memcpy(res + rkind * ires,
10490                        sbuf + rkind * i,
10491                        rkind * (slen-i));
10492         }
10493         else {
10494             /* interleave */
10495             while (n > 0) {
10496                 memcpy(res + rkind * ires,
10497                        buf2,
10498                        rkind * len2);
10499                 ires += len2;
10500                 if (--n <= 0)
10501                     break;
10502                 memcpy(res + rkind * ires,
10503                        sbuf + rkind * i,
10504                        rkind);
10505                 ires++;
10506                 i++;
10507             }
10508             memcpy(res + rkind * ires,
10509                    sbuf + rkind * i,
10510                    rkind * (slen-i));
10511         }
10512     }
10513 
10514     if (mayshrink) {
10515         unicode_adjust_maxchar(&u);
10516         if (u == NULL)
10517             goto error;
10518     }
10519 
10520   done:
10521     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10522     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10523     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10524     if (srelease)
10525         PyMem_Free((void *)sbuf);
10526     if (release1)
10527         PyMem_Free((void *)buf1);
10528     if (release2)
10529         PyMem_Free((void *)buf2);
10530     assert(_PyUnicode_CheckConsistency(u, 1));
10531     return u;
10532 
10533   nothing:
10534     /* nothing to replace; return original string (when possible) */
10535     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10536     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10537     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10538     if (srelease)
10539         PyMem_Free((void *)sbuf);
10540     if (release1)
10541         PyMem_Free((void *)buf1);
10542     if (release2)
10543         PyMem_Free((void *)buf2);
10544     return unicode_result_unchanged(self);
10545 
10546   error:
10547     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10548     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10549     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10550     if (srelease)
10551         PyMem_Free((void *)sbuf);
10552     if (release1)
10553         PyMem_Free((void *)buf1);
10554     if (release2)
10555         PyMem_Free((void *)buf2);
10556     return NULL;
10557 }
10558 
10559 /* --- Unicode Object Methods --------------------------------------------- */
10560 
10561 /*[clinic input]
10562 str.title as unicode_title
10563 
10564 Return a version of the string where each word is titlecased.
10565 
10566 More specifically, words start with uppercased characters and all remaining
10567 cased characters have lower case.
10568 [clinic start generated code]*/
10569 
10570 static PyObject *
unicode_title_impl(PyObject * self)10571 unicode_title_impl(PyObject *self)
10572 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10573 {
10574     return case_operation(self, do_title);
10575 }
10576 
10577 /*[clinic input]
10578 str.capitalize as unicode_capitalize
10579 
10580 Return a capitalized version of the string.
10581 
10582 More specifically, make the first character have upper case and the rest lower
10583 case.
10584 [clinic start generated code]*/
10585 
10586 static PyObject *
unicode_capitalize_impl(PyObject * self)10587 unicode_capitalize_impl(PyObject *self)
10588 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10589 {
10590     if (PyUnicode_GET_LENGTH(self) == 0)
10591         return unicode_result_unchanged(self);
10592     return case_operation(self, do_capitalize);
10593 }
10594 
10595 /*[clinic input]
10596 str.casefold as unicode_casefold
10597 
10598 Return a version of the string suitable for caseless comparisons.
10599 [clinic start generated code]*/
10600 
10601 static PyObject *
unicode_casefold_impl(PyObject * self)10602 unicode_casefold_impl(PyObject *self)
10603 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10604 {
10605     if (PyUnicode_IS_ASCII(self))
10606         return ascii_upper_or_lower(self, 1);
10607     return case_operation(self, do_casefold);
10608 }
10609 
10610 
10611 /* Argument converter. Accepts a single Unicode character. */
10612 
10613 static int
convert_uc(PyObject * obj,void * addr)10614 convert_uc(PyObject *obj, void *addr)
10615 {
10616     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10617 
10618     if (!PyUnicode_Check(obj)) {
10619         PyErr_Format(PyExc_TypeError,
10620                      "The fill character must be a unicode character, "
10621                      "not %.100s", Py_TYPE(obj)->tp_name);
10622         return 0;
10623     }
10624     if (PyUnicode_GET_LENGTH(obj) != 1) {
10625         PyErr_SetString(PyExc_TypeError,
10626                         "The fill character must be exactly one character long");
10627         return 0;
10628     }
10629     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10630     return 1;
10631 }
10632 
10633 /*[clinic input]
10634 str.center as unicode_center
10635 
10636     width: Py_ssize_t
10637     fillchar: Py_UCS4 = ' '
10638     /
10639 
10640 Return a centered string of length width.
10641 
10642 Padding is done using the specified fill character (default is a space).
10643 [clinic start generated code]*/
10644 
10645 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10646 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10647 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10648 {
10649     Py_ssize_t marg, left;
10650 
10651     if (PyUnicode_GET_LENGTH(self) >= width)
10652         return unicode_result_unchanged(self);
10653 
10654     marg = width - PyUnicode_GET_LENGTH(self);
10655     left = marg / 2 + (marg & width & 1);
10656 
10657     return pad(self, left, marg - left, fillchar);
10658 }
10659 
10660 /* This function assumes that str1 and str2 are readied by the caller. */
10661 
10662 static int
unicode_compare(PyObject * str1,PyObject * str2)10663 unicode_compare(PyObject *str1, PyObject *str2)
10664 {
10665 #define COMPARE(TYPE1, TYPE2) \
10666     do { \
10667         TYPE1* p1 = (TYPE1 *)data1; \
10668         TYPE2* p2 = (TYPE2 *)data2; \
10669         TYPE1* end = p1 + len; \
10670         Py_UCS4 c1, c2; \
10671         for (; p1 != end; p1++, p2++) { \
10672             c1 = *p1; \
10673             c2 = *p2; \
10674             if (c1 != c2) \
10675                 return (c1 < c2) ? -1 : 1; \
10676         } \
10677     } \
10678     while (0)
10679 
10680     int kind1, kind2;
10681     const void *data1, *data2;
10682     Py_ssize_t len1, len2, len;
10683 
10684     kind1 = PyUnicode_KIND(str1);
10685     kind2 = PyUnicode_KIND(str2);
10686     data1 = PyUnicode_DATA(str1);
10687     data2 = PyUnicode_DATA(str2);
10688     len1 = PyUnicode_GET_LENGTH(str1);
10689     len2 = PyUnicode_GET_LENGTH(str2);
10690     len = Py_MIN(len1, len2);
10691 
10692     switch(kind1) {
10693     case PyUnicode_1BYTE_KIND:
10694     {
10695         switch(kind2) {
10696         case PyUnicode_1BYTE_KIND:
10697         {
10698             int cmp = memcmp(data1, data2, len);
10699             /* normalize result of memcmp() into the range [-1; 1] */
10700             if (cmp < 0)
10701                 return -1;
10702             if (cmp > 0)
10703                 return 1;
10704             break;
10705         }
10706         case PyUnicode_2BYTE_KIND:
10707             COMPARE(Py_UCS1, Py_UCS2);
10708             break;
10709         case PyUnicode_4BYTE_KIND:
10710             COMPARE(Py_UCS1, Py_UCS4);
10711             break;
10712         default:
10713             Py_UNREACHABLE();
10714         }
10715         break;
10716     }
10717     case PyUnicode_2BYTE_KIND:
10718     {
10719         switch(kind2) {
10720         case PyUnicode_1BYTE_KIND:
10721             COMPARE(Py_UCS2, Py_UCS1);
10722             break;
10723         case PyUnicode_2BYTE_KIND:
10724         {
10725             COMPARE(Py_UCS2, Py_UCS2);
10726             break;
10727         }
10728         case PyUnicode_4BYTE_KIND:
10729             COMPARE(Py_UCS2, Py_UCS4);
10730             break;
10731         default:
10732             Py_UNREACHABLE();
10733         }
10734         break;
10735     }
10736     case PyUnicode_4BYTE_KIND:
10737     {
10738         switch(kind2) {
10739         case PyUnicode_1BYTE_KIND:
10740             COMPARE(Py_UCS4, Py_UCS1);
10741             break;
10742         case PyUnicode_2BYTE_KIND:
10743             COMPARE(Py_UCS4, Py_UCS2);
10744             break;
10745         case PyUnicode_4BYTE_KIND:
10746         {
10747 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10748             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10749             /* normalize result of wmemcmp() into the range [-1; 1] */
10750             if (cmp < 0)
10751                 return -1;
10752             if (cmp > 0)
10753                 return 1;
10754 #else
10755             COMPARE(Py_UCS4, Py_UCS4);
10756 #endif
10757             break;
10758         }
10759         default:
10760             Py_UNREACHABLE();
10761         }
10762         break;
10763     }
10764     default:
10765         Py_UNREACHABLE();
10766     }
10767 
10768     if (len1 == len2)
10769         return 0;
10770     if (len1 < len2)
10771         return -1;
10772     else
10773         return 1;
10774 
10775 #undef COMPARE
10776 }
10777 
10778 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10779 unicode_compare_eq(PyObject *str1, PyObject *str2)
10780 {
10781     int kind;
10782     const void *data1, *data2;
10783     Py_ssize_t len;
10784     int cmp;
10785 
10786     len = PyUnicode_GET_LENGTH(str1);
10787     if (PyUnicode_GET_LENGTH(str2) != len)
10788         return 0;
10789     kind = PyUnicode_KIND(str1);
10790     if (PyUnicode_KIND(str2) != kind)
10791         return 0;
10792     data1 = PyUnicode_DATA(str1);
10793     data2 = PyUnicode_DATA(str2);
10794 
10795     cmp = memcmp(data1, data2, len * kind);
10796     return (cmp == 0);
10797 }
10798 
10799 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)10800 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
10801 {
10802     assert(PyUnicode_Check(str1));
10803     assert(PyUnicode_Check(str2));
10804     if (str1 == str2) {
10805         return 1;
10806     }
10807     return unicode_compare_eq(str1, str2);
10808 }
10809 
10810 
10811 int
PyUnicode_Compare(PyObject * left,PyObject * right)10812 PyUnicode_Compare(PyObject *left, PyObject *right)
10813 {
10814     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10815         /* a string is equal to itself */
10816         if (left == right)
10817             return 0;
10818 
10819         return unicode_compare(left, right);
10820     }
10821     PyErr_Format(PyExc_TypeError,
10822                  "Can't compare %.100s and %.100s",
10823                  Py_TYPE(left)->tp_name,
10824                  Py_TYPE(right)->tp_name);
10825     return -1;
10826 }
10827 
10828 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)10829 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10830 {
10831     Py_ssize_t i;
10832     int kind;
10833     Py_UCS4 chr;
10834 
10835     assert(_PyUnicode_CHECK(uni));
10836     kind = PyUnicode_KIND(uni);
10837     if (kind == PyUnicode_1BYTE_KIND) {
10838         const void *data = PyUnicode_1BYTE_DATA(uni);
10839         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10840         size_t len, len2 = strlen(str);
10841         int cmp;
10842 
10843         len = Py_MIN(len1, len2);
10844         cmp = memcmp(data, str, len);
10845         if (cmp != 0) {
10846             if (cmp < 0)
10847                 return -1;
10848             else
10849                 return 1;
10850         }
10851         if (len1 > len2)
10852             return 1; /* uni is longer */
10853         if (len1 < len2)
10854             return -1; /* str is longer */
10855         return 0;
10856     }
10857     else {
10858         const void *data = PyUnicode_DATA(uni);
10859         /* Compare Unicode string and source character set string */
10860         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10861             if (chr != (unsigned char)str[i])
10862                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10863         /* This check keeps Python strings that end in '\0' from comparing equal
10864          to C strings identical up to that point. */
10865         if (PyUnicode_GET_LENGTH(uni) != i || chr)
10866             return 1; /* uni is longer */
10867         if (str[i])
10868             return -1; /* str is longer */
10869         return 0;
10870     }
10871 }
10872 
10873 int
PyUnicode_EqualToUTF8(PyObject * unicode,const char * str)10874 PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
10875 {
10876     return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
10877 }
10878 
10879 int
PyUnicode_EqualToUTF8AndSize(PyObject * unicode,const char * str,Py_ssize_t size)10880 PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
10881 {
10882     assert(_PyUnicode_CHECK(unicode));
10883     assert(str);
10884 
10885     if (PyUnicode_IS_ASCII(unicode)) {
10886         Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10887         return size == len &&
10888             memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10889     }
10890     if (PyUnicode_UTF8(unicode) != NULL) {
10891         Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
10892         return size == len &&
10893             memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
10894     }
10895 
10896     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
10897     if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
10898         return 0;
10899     }
10900     const unsigned char *s = (const unsigned char *)str;
10901     const unsigned char *ends = s + (size_t)size;
10902     int kind = PyUnicode_KIND(unicode);
10903     const void *data = PyUnicode_DATA(unicode);
10904     /* Compare Unicode string and UTF-8 string */
10905     for (Py_ssize_t i = 0; i < len; i++) {
10906         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10907         if (ch < 0x80) {
10908             if (ends == s || s[0] != ch) {
10909                 return 0;
10910             }
10911             s += 1;
10912         }
10913         else if (ch < 0x800) {
10914             if ((ends - s) < 2 ||
10915                 s[0] != (0xc0 | (ch >> 6)) ||
10916                 s[1] != (0x80 | (ch & 0x3f)))
10917             {
10918                 return 0;
10919             }
10920             s += 2;
10921         }
10922         else if (ch < 0x10000) {
10923             if (Py_UNICODE_IS_SURROGATE(ch) ||
10924                 (ends - s) < 3 ||
10925                 s[0] != (0xe0 | (ch >> 12)) ||
10926                 s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
10927                 s[2] != (0x80 | (ch & 0x3f)))
10928             {
10929                 return 0;
10930             }
10931             s += 3;
10932         }
10933         else {
10934             assert(ch <= MAX_UNICODE);
10935             if ((ends - s) < 4 ||
10936                 s[0] != (0xf0 | (ch >> 18)) ||
10937                 s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
10938                 s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
10939                 s[3] != (0x80 | (ch & 0x3f)))
10940             {
10941                 return 0;
10942             }
10943             s += 4;
10944         }
10945     }
10946     return s == ends;
10947 }
10948 
10949 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)10950 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10951 {
10952     size_t len;
10953     assert(_PyUnicode_CHECK(unicode));
10954     assert(str);
10955 #ifndef NDEBUG
10956     for (const char *p = str; *p; p++) {
10957         assert((unsigned char)*p < 128);
10958     }
10959 #endif
10960     if (!PyUnicode_IS_ASCII(unicode))
10961         return 0;
10962     len = (size_t)PyUnicode_GET_LENGTH(unicode);
10963     return strlen(str) == len &&
10964            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10965 }
10966 
10967 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)10968 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
10969 {
10970     PyObject *right_uni;
10971 
10972     assert(_PyUnicode_CHECK(left));
10973     assert(right->string);
10974 #ifndef NDEBUG
10975     for (const char *p = right->string; *p; p++) {
10976         assert((unsigned char)*p < 128);
10977     }
10978 #endif
10979 
10980     if (!PyUnicode_IS_ASCII(left))
10981         return 0;
10982 
10983     right_uni = _PyUnicode_FromId(right);       /* borrowed */
10984     if (right_uni == NULL) {
10985         /* memory error or bad data */
10986         PyErr_Clear();
10987         return _PyUnicode_EqualToASCIIString(left, right->string);
10988     }
10989 
10990     if (left == right_uni)
10991         return 1;
10992 
10993     assert(PyUnicode_CHECK_INTERNED(right_uni));
10994     if (PyUnicode_CHECK_INTERNED(left)) {
10995         return 0;
10996     }
10997 
10998     Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni));
10999     assert(right_hash != -1);
11000     Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left));
11001     if (hash != -1 && hash != right_hash) {
11002         return 0;
11003     }
11004 
11005     return unicode_compare_eq(left, right_uni);
11006 }
11007 
11008 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11009 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11010 {
11011     int result;
11012 
11013     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11014         Py_RETURN_NOTIMPLEMENTED;
11015 
11016     if (left == right) {
11017         switch (op) {
11018         case Py_EQ:
11019         case Py_LE:
11020         case Py_GE:
11021             /* a string is equal to itself */
11022             Py_RETURN_TRUE;
11023         case Py_NE:
11024         case Py_LT:
11025         case Py_GT:
11026             Py_RETURN_FALSE;
11027         default:
11028             PyErr_BadArgument();
11029             return NULL;
11030         }
11031     }
11032     else if (op == Py_EQ || op == Py_NE) {
11033         result = unicode_compare_eq(left, right);
11034         result ^= (op == Py_NE);
11035         return PyBool_FromLong(result);
11036     }
11037     else {
11038         result = unicode_compare(left, right);
11039         Py_RETURN_RICHCOMPARE(result, 0, op);
11040     }
11041 }
11042 
11043 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11044 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11045 {
11046     return unicode_eq(aa, bb);
11047 }
11048 
11049 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11050 PyUnicode_Contains(PyObject *str, PyObject *substr)
11051 {
11052     int kind1, kind2;
11053     const void *buf1, *buf2;
11054     Py_ssize_t len1, len2;
11055     int result;
11056 
11057     if (!PyUnicode_Check(substr)) {
11058         PyErr_Format(PyExc_TypeError,
11059                      "'in <string>' requires string as left operand, not %.100s",
11060                      Py_TYPE(substr)->tp_name);
11061         return -1;
11062     }
11063     if (ensure_unicode(str) < 0)
11064         return -1;
11065 
11066     kind1 = PyUnicode_KIND(str);
11067     kind2 = PyUnicode_KIND(substr);
11068     if (kind1 < kind2)
11069         return 0;
11070     len1 = PyUnicode_GET_LENGTH(str);
11071     len2 = PyUnicode_GET_LENGTH(substr);
11072     if (len1 < len2)
11073         return 0;
11074     buf1 = PyUnicode_DATA(str);
11075     buf2 = PyUnicode_DATA(substr);
11076     if (len2 == 1) {
11077         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11078         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11079         return result;
11080     }
11081     if (kind2 != kind1) {
11082         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11083         if (!buf2)
11084             return -1;
11085     }
11086 
11087     switch (kind1) {
11088     case PyUnicode_1BYTE_KIND:
11089         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11090         break;
11091     case PyUnicode_2BYTE_KIND:
11092         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11093         break;
11094     case PyUnicode_4BYTE_KIND:
11095         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11096         break;
11097     default:
11098         Py_UNREACHABLE();
11099     }
11100 
11101     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11102     if (kind2 != kind1)
11103         PyMem_Free((void *)buf2);
11104 
11105     return result;
11106 }
11107 
11108 /* Concat to string or Unicode object giving a new Unicode object. */
11109 
11110 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11111 PyUnicode_Concat(PyObject *left, PyObject *right)
11112 {
11113     PyObject *result;
11114     Py_UCS4 maxchar, maxchar2;
11115     Py_ssize_t left_len, right_len, new_len;
11116 
11117     if (ensure_unicode(left) < 0)
11118         return NULL;
11119 
11120     if (!PyUnicode_Check(right)) {
11121         PyErr_Format(PyExc_TypeError,
11122                      "can only concatenate str (not \"%.200s\") to str",
11123                      Py_TYPE(right)->tp_name);
11124         return NULL;
11125     }
11126 
11127     /* Shortcuts */
11128     PyObject *empty = unicode_get_empty();  // Borrowed reference
11129     if (left == empty) {
11130         return PyUnicode_FromObject(right);
11131     }
11132     if (right == empty) {
11133         return PyUnicode_FromObject(left);
11134     }
11135 
11136     left_len = PyUnicode_GET_LENGTH(left);
11137     right_len = PyUnicode_GET_LENGTH(right);
11138     if (left_len > PY_SSIZE_T_MAX - right_len) {
11139         PyErr_SetString(PyExc_OverflowError,
11140                         "strings are too large to concat");
11141         return NULL;
11142     }
11143     new_len = left_len + right_len;
11144 
11145     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11146     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11147     maxchar = Py_MAX(maxchar, maxchar2);
11148 
11149     /* Concat the two Unicode strings */
11150     result = PyUnicode_New(new_len, maxchar);
11151     if (result == NULL)
11152         return NULL;
11153     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11154     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11155     assert(_PyUnicode_CheckConsistency(result, 1));
11156     return result;
11157 }
11158 
11159 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11160 PyUnicode_Append(PyObject **p_left, PyObject *right)
11161 {
11162     PyObject *left, *res;
11163     Py_UCS4 maxchar, maxchar2;
11164     Py_ssize_t left_len, right_len, new_len;
11165 
11166     if (p_left == NULL) {
11167         if (!PyErr_Occurred())
11168             PyErr_BadInternalCall();
11169         return;
11170     }
11171     left = *p_left;
11172     if (right == NULL || left == NULL
11173         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11174         if (!PyErr_Occurred())
11175             PyErr_BadInternalCall();
11176         goto error;
11177     }
11178 
11179     /* Shortcuts */
11180     PyObject *empty = unicode_get_empty();  // Borrowed reference
11181     if (left == empty) {
11182         Py_DECREF(left);
11183         *p_left = Py_NewRef(right);
11184         return;
11185     }
11186     if (right == empty) {
11187         return;
11188     }
11189 
11190     left_len = PyUnicode_GET_LENGTH(left);
11191     right_len = PyUnicode_GET_LENGTH(right);
11192     if (left_len > PY_SSIZE_T_MAX - right_len) {
11193         PyErr_SetString(PyExc_OverflowError,
11194                         "strings are too large to concat");
11195         goto error;
11196     }
11197     new_len = left_len + right_len;
11198 
11199     if (unicode_modifiable(left)
11200         && PyUnicode_CheckExact(right)
11201         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11202         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11203            to change the structure size, but characters are stored just after
11204            the structure, and so it requires to move all characters which is
11205            not so different than duplicating the string. */
11206         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11207     {
11208         /* append inplace */
11209         if (unicode_resize(p_left, new_len) != 0)
11210             goto error;
11211 
11212         /* copy 'right' into the newly allocated area of 'left' */
11213         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11214     }
11215     else {
11216         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11217         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11218         maxchar = Py_MAX(maxchar, maxchar2);
11219 
11220         /* Concat the two Unicode strings */
11221         res = PyUnicode_New(new_len, maxchar);
11222         if (res == NULL)
11223             goto error;
11224         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11225         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11226         Py_DECREF(left);
11227         *p_left = res;
11228     }
11229     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11230     return;
11231 
11232 error:
11233     Py_CLEAR(*p_left);
11234 }
11235 
11236 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11237 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11238 {
11239     PyUnicode_Append(pleft, right);
11240     Py_XDECREF(right);
11241 }
11242 
11243 /*[clinic input]
11244 @text_signature "($self, sub[, start[, end]], /)"
11245 str.count as unicode_count -> Py_ssize_t
11246 
11247     self as str: self
11248     sub as substr: unicode
11249     start: slice_index(accept={int, NoneType}, c_default='0') = None
11250     end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11251     /
11252 
11253 Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11254 
11255 Optional arguments start and end are interpreted as in slice notation.
11256 [clinic start generated code]*/
11257 
11258 static Py_ssize_t
unicode_count_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11259 unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11260                    Py_ssize_t end)
11261 /*[clinic end generated code: output=8fcc3aef0b18edbf input=6f168ffd94be8785]*/
11262 {
11263     assert(PyUnicode_Check(str));
11264     assert(PyUnicode_Check(substr));
11265 
11266     Py_ssize_t result;
11267     int kind1, kind2;
11268     const void *buf1 = NULL, *buf2 = NULL;
11269     Py_ssize_t len1, len2;
11270 
11271     kind1 = PyUnicode_KIND(str);
11272     kind2 = PyUnicode_KIND(substr);
11273     if (kind1 < kind2)
11274         return 0;
11275 
11276     len1 = PyUnicode_GET_LENGTH(str);
11277     len2 = PyUnicode_GET_LENGTH(substr);
11278     ADJUST_INDICES(start, end, len1);
11279     if (end - start < len2)
11280         return 0;
11281 
11282     buf1 = PyUnicode_DATA(str);
11283     buf2 = PyUnicode_DATA(substr);
11284     if (kind2 != kind1) {
11285         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11286         if (!buf2)
11287             goto onError;
11288     }
11289 
11290     // We don't reuse `anylib_count` here because of the explicit casts.
11291     switch (kind1) {
11292     case PyUnicode_1BYTE_KIND:
11293         result = ucs1lib_count(
11294             ((const Py_UCS1*)buf1) + start, end - start,
11295             buf2, len2, PY_SSIZE_T_MAX
11296             );
11297         break;
11298     case PyUnicode_2BYTE_KIND:
11299         result = ucs2lib_count(
11300             ((const Py_UCS2*)buf1) + start, end - start,
11301             buf2, len2, PY_SSIZE_T_MAX
11302             );
11303         break;
11304     case PyUnicode_4BYTE_KIND:
11305         result = ucs4lib_count(
11306             ((const Py_UCS4*)buf1) + start, end - start,
11307             buf2, len2, PY_SSIZE_T_MAX
11308             );
11309         break;
11310     default:
11311         Py_UNREACHABLE();
11312     }
11313 
11314     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11315     if (kind2 != kind1)
11316         PyMem_Free((void *)buf2);
11317 
11318     return result;
11319   onError:
11320     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11321     if (kind2 != kind1)
11322         PyMem_Free((void *)buf2);
11323     return -1;
11324 }
11325 
11326 /*[clinic input]
11327 str.encode as unicode_encode
11328 
11329     encoding: str(c_default="NULL") = 'utf-8'
11330         The encoding in which to encode the string.
11331     errors: str(c_default="NULL") = 'strict'
11332         The error handling scheme to use for encoding errors.
11333         The default is 'strict' meaning that encoding errors raise a
11334         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11335         'xmlcharrefreplace' as well as any other name registered with
11336         codecs.register_error that can handle UnicodeEncodeErrors.
11337 
11338 Encode the string using the codec registered for encoding.
11339 [clinic start generated code]*/
11340 
11341 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11342 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11343 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11344 {
11345     return PyUnicode_AsEncodedString(self, encoding, errors);
11346 }
11347 
11348 /*[clinic input]
11349 str.expandtabs as unicode_expandtabs
11350 
11351     tabsize: int = 8
11352 
11353 Return a copy where all tab characters are expanded using spaces.
11354 
11355 If tabsize is not given, a tab size of 8 characters is assumed.
11356 [clinic start generated code]*/
11357 
11358 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11359 unicode_expandtabs_impl(PyObject *self, int tabsize)
11360 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11361 {
11362     Py_ssize_t i, j, line_pos, src_len, incr;
11363     Py_UCS4 ch;
11364     PyObject *u;
11365     const void *src_data;
11366     void *dest_data;
11367     int kind;
11368     int found;
11369 
11370     /* First pass: determine size of output string */
11371     src_len = PyUnicode_GET_LENGTH(self);
11372     i = j = line_pos = 0;
11373     kind = PyUnicode_KIND(self);
11374     src_data = PyUnicode_DATA(self);
11375     found = 0;
11376     for (; i < src_len; i++) {
11377         ch = PyUnicode_READ(kind, src_data, i);
11378         if (ch == '\t') {
11379             found = 1;
11380             if (tabsize > 0) {
11381                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11382                 if (j > PY_SSIZE_T_MAX - incr)
11383                     goto overflow;
11384                 line_pos += incr;
11385                 j += incr;
11386             }
11387         }
11388         else {
11389             if (j > PY_SSIZE_T_MAX - 1)
11390                 goto overflow;
11391             line_pos++;
11392             j++;
11393             if (ch == '\n' || ch == '\r')
11394                 line_pos = 0;
11395         }
11396     }
11397     if (!found)
11398         return unicode_result_unchanged(self);
11399 
11400     /* Second pass: create output string and fill it */
11401     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11402     if (!u)
11403         return NULL;
11404     dest_data = PyUnicode_DATA(u);
11405 
11406     i = j = line_pos = 0;
11407 
11408     for (; i < src_len; i++) {
11409         ch = PyUnicode_READ(kind, src_data, i);
11410         if (ch == '\t') {
11411             if (tabsize > 0) {
11412                 incr = tabsize - (line_pos % tabsize);
11413                 line_pos += incr;
11414                 unicode_fill(kind, dest_data, ' ', j, incr);
11415                 j += incr;
11416             }
11417         }
11418         else {
11419             line_pos++;
11420             PyUnicode_WRITE(kind, dest_data, j, ch);
11421             j++;
11422             if (ch == '\n' || ch == '\r')
11423                 line_pos = 0;
11424         }
11425     }
11426     assert (j == PyUnicode_GET_LENGTH(u));
11427     return unicode_result(u);
11428 
11429   overflow:
11430     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11431     return NULL;
11432 }
11433 
11434 /*[clinic input]
11435 str.find as unicode_find = str.count
11436 
11437 Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11438 
11439 Optional arguments start and end are interpreted as in slice notation.
11440 Return -1 on failure.
11441 [clinic start generated code]*/
11442 
11443 static Py_ssize_t
unicode_find_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11444 unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11445                   Py_ssize_t end)
11446 /*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/
11447 {
11448     Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11449     if (result < 0) {
11450         return -1;
11451     }
11452     return result;
11453 }
11454 
11455 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11456 unicode_getitem(PyObject *self, Py_ssize_t index)
11457 {
11458     const void *data;
11459     int kind;
11460     Py_UCS4 ch;
11461 
11462     if (!PyUnicode_Check(self)) {
11463         PyErr_BadArgument();
11464         return NULL;
11465     }
11466     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11467         PyErr_SetString(PyExc_IndexError, "string index out of range");
11468         return NULL;
11469     }
11470     kind = PyUnicode_KIND(self);
11471     data = PyUnicode_DATA(self);
11472     ch = PyUnicode_READ(kind, data, index);
11473     return unicode_char(ch);
11474 }
11475 
11476 /* Believe it or not, this produces the same value for ASCII strings
11477    as bytes_hash(). */
11478 static Py_hash_t
unicode_hash(PyObject * self)11479 unicode_hash(PyObject *self)
11480 {
11481     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11482 
11483 #ifdef Py_DEBUG
11484     assert(_Py_HashSecret_Initialized);
11485 #endif
11486     Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self));
11487     if (hash != -1) {
11488         return hash;
11489     }
11490     x = _Py_HashBytes(PyUnicode_DATA(self),
11491                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11492 
11493     FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x);
11494     return x;
11495 }
11496 
11497 /*[clinic input]
11498 str.index as unicode_index = str.count
11499 
11500 Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11501 
11502 Optional arguments start and end are interpreted as in slice notation.
11503 Raises ValueError when the substring is not found.
11504 [clinic start generated code]*/
11505 
11506 static Py_ssize_t
unicode_index_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)11507 unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11508                    Py_ssize_t end)
11509 /*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/
11510 {
11511     Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11512     if (result == -1) {
11513         PyErr_SetString(PyExc_ValueError, "substring not found");
11514     }
11515     else if (result < 0) {
11516         return -1;
11517     }
11518     return result;
11519 }
11520 
11521 /*[clinic input]
11522 str.isascii as unicode_isascii
11523 
11524 Return True if all characters in the string are ASCII, False otherwise.
11525 
11526 ASCII characters have code points in the range U+0000-U+007F.
11527 Empty string is ASCII too.
11528 [clinic start generated code]*/
11529 
11530 static PyObject *
unicode_isascii_impl(PyObject * self)11531 unicode_isascii_impl(PyObject *self)
11532 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11533 {
11534     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11535 }
11536 
11537 /*[clinic input]
11538 str.islower as unicode_islower
11539 
11540 Return True if the string is a lowercase string, False otherwise.
11541 
11542 A string is lowercase if all cased characters in the string are lowercase and
11543 there is at least one cased character in the string.
11544 [clinic start generated code]*/
11545 
11546 static PyObject *
unicode_islower_impl(PyObject * self)11547 unicode_islower_impl(PyObject *self)
11548 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11549 {
11550     Py_ssize_t i, length;
11551     int kind;
11552     const void *data;
11553     int cased;
11554 
11555     length = PyUnicode_GET_LENGTH(self);
11556     kind = PyUnicode_KIND(self);
11557     data = PyUnicode_DATA(self);
11558 
11559     /* Shortcut for single character strings */
11560     if (length == 1)
11561         return PyBool_FromLong(
11562             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11563 
11564     /* Special case for empty strings */
11565     if (length == 0)
11566         Py_RETURN_FALSE;
11567 
11568     cased = 0;
11569     for (i = 0; i < length; i++) {
11570         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11571 
11572         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11573             Py_RETURN_FALSE;
11574         else if (!cased && Py_UNICODE_ISLOWER(ch))
11575             cased = 1;
11576     }
11577     return PyBool_FromLong(cased);
11578 }
11579 
11580 /*[clinic input]
11581 str.isupper as unicode_isupper
11582 
11583 Return True if the string is an uppercase string, False otherwise.
11584 
11585 A string is uppercase if all cased characters in the string are uppercase and
11586 there is at least one cased character in the string.
11587 [clinic start generated code]*/
11588 
11589 static PyObject *
unicode_isupper_impl(PyObject * self)11590 unicode_isupper_impl(PyObject *self)
11591 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11592 {
11593     Py_ssize_t i, length;
11594     int kind;
11595     const void *data;
11596     int cased;
11597 
11598     length = PyUnicode_GET_LENGTH(self);
11599     kind = PyUnicode_KIND(self);
11600     data = PyUnicode_DATA(self);
11601 
11602     /* Shortcut for single character strings */
11603     if (length == 1)
11604         return PyBool_FromLong(
11605             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11606 
11607     /* Special case for empty strings */
11608     if (length == 0)
11609         Py_RETURN_FALSE;
11610 
11611     cased = 0;
11612     for (i = 0; i < length; i++) {
11613         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11614 
11615         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11616             Py_RETURN_FALSE;
11617         else if (!cased && Py_UNICODE_ISUPPER(ch))
11618             cased = 1;
11619     }
11620     return PyBool_FromLong(cased);
11621 }
11622 
11623 /*[clinic input]
11624 str.istitle as unicode_istitle
11625 
11626 Return True if the string is a title-cased string, False otherwise.
11627 
11628 In a title-cased string, upper- and title-case characters may only
11629 follow uncased characters and lowercase characters only cased ones.
11630 [clinic start generated code]*/
11631 
11632 static PyObject *
unicode_istitle_impl(PyObject * self)11633 unicode_istitle_impl(PyObject *self)
11634 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11635 {
11636     Py_ssize_t i, length;
11637     int kind;
11638     const void *data;
11639     int cased, previous_is_cased;
11640 
11641     length = PyUnicode_GET_LENGTH(self);
11642     kind = PyUnicode_KIND(self);
11643     data = PyUnicode_DATA(self);
11644 
11645     /* Shortcut for single character strings */
11646     if (length == 1) {
11647         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11648         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11649                                (Py_UNICODE_ISUPPER(ch) != 0));
11650     }
11651 
11652     /* Special case for empty strings */
11653     if (length == 0)
11654         Py_RETURN_FALSE;
11655 
11656     cased = 0;
11657     previous_is_cased = 0;
11658     for (i = 0; i < length; i++) {
11659         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11660 
11661         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11662             if (previous_is_cased)
11663                 Py_RETURN_FALSE;
11664             previous_is_cased = 1;
11665             cased = 1;
11666         }
11667         else if (Py_UNICODE_ISLOWER(ch)) {
11668             if (!previous_is_cased)
11669                 Py_RETURN_FALSE;
11670             previous_is_cased = 1;
11671             cased = 1;
11672         }
11673         else
11674             previous_is_cased = 0;
11675     }
11676     return PyBool_FromLong(cased);
11677 }
11678 
11679 /*[clinic input]
11680 str.isspace as unicode_isspace
11681 
11682 Return True if the string is a whitespace string, False otherwise.
11683 
11684 A string is whitespace if all characters in the string are whitespace and there
11685 is at least one character in the string.
11686 [clinic start generated code]*/
11687 
11688 static PyObject *
unicode_isspace_impl(PyObject * self)11689 unicode_isspace_impl(PyObject *self)
11690 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11691 {
11692     Py_ssize_t i, length;
11693     int kind;
11694     const void *data;
11695 
11696     length = PyUnicode_GET_LENGTH(self);
11697     kind = PyUnicode_KIND(self);
11698     data = PyUnicode_DATA(self);
11699 
11700     /* Shortcut for single character strings */
11701     if (length == 1)
11702         return PyBool_FromLong(
11703             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11704 
11705     /* Special case for empty strings */
11706     if (length == 0)
11707         Py_RETURN_FALSE;
11708 
11709     for (i = 0; i < length; i++) {
11710         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11711         if (!Py_UNICODE_ISSPACE(ch))
11712             Py_RETURN_FALSE;
11713     }
11714     Py_RETURN_TRUE;
11715 }
11716 
11717 /*[clinic input]
11718 str.isalpha as unicode_isalpha
11719 
11720 Return True if the string is an alphabetic string, False otherwise.
11721 
11722 A string is alphabetic if all characters in the string are alphabetic and there
11723 is at least one character in the string.
11724 [clinic start generated code]*/
11725 
11726 static PyObject *
unicode_isalpha_impl(PyObject * self)11727 unicode_isalpha_impl(PyObject *self)
11728 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11729 {
11730     Py_ssize_t i, length;
11731     int kind;
11732     const void *data;
11733 
11734     length = PyUnicode_GET_LENGTH(self);
11735     kind = PyUnicode_KIND(self);
11736     data = PyUnicode_DATA(self);
11737 
11738     /* Shortcut for single character strings */
11739     if (length == 1)
11740         return PyBool_FromLong(
11741             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11742 
11743     /* Special case for empty strings */
11744     if (length == 0)
11745         Py_RETURN_FALSE;
11746 
11747     for (i = 0; i < length; i++) {
11748         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11749             Py_RETURN_FALSE;
11750     }
11751     Py_RETURN_TRUE;
11752 }
11753 
11754 /*[clinic input]
11755 str.isalnum as unicode_isalnum
11756 
11757 Return True if the string is an alpha-numeric string, False otherwise.
11758 
11759 A string is alpha-numeric if all characters in the string are alpha-numeric and
11760 there is at least one character in the string.
11761 [clinic start generated code]*/
11762 
11763 static PyObject *
unicode_isalnum_impl(PyObject * self)11764 unicode_isalnum_impl(PyObject *self)
11765 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11766 {
11767     int kind;
11768     const void *data;
11769     Py_ssize_t len, i;
11770 
11771     kind = PyUnicode_KIND(self);
11772     data = PyUnicode_DATA(self);
11773     len = PyUnicode_GET_LENGTH(self);
11774 
11775     /* Shortcut for single character strings */
11776     if (len == 1) {
11777         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11778         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11779     }
11780 
11781     /* Special case for empty strings */
11782     if (len == 0)
11783         Py_RETURN_FALSE;
11784 
11785     for (i = 0; i < len; i++) {
11786         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11787         if (!Py_UNICODE_ISALNUM(ch))
11788             Py_RETURN_FALSE;
11789     }
11790     Py_RETURN_TRUE;
11791 }
11792 
11793 /*[clinic input]
11794 str.isdecimal as unicode_isdecimal
11795 
11796 Return True if the string is a decimal string, False otherwise.
11797 
11798 A string is a decimal string if all characters in the string are decimal and
11799 there is at least one character in the string.
11800 [clinic start generated code]*/
11801 
11802 static PyObject *
unicode_isdecimal_impl(PyObject * self)11803 unicode_isdecimal_impl(PyObject *self)
11804 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
11805 {
11806     Py_ssize_t i, length;
11807     int kind;
11808     const void *data;
11809 
11810     length = PyUnicode_GET_LENGTH(self);
11811     kind = PyUnicode_KIND(self);
11812     data = PyUnicode_DATA(self);
11813 
11814     /* Shortcut for single character strings */
11815     if (length == 1)
11816         return PyBool_FromLong(
11817             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11818 
11819     /* Special case for empty strings */
11820     if (length == 0)
11821         Py_RETURN_FALSE;
11822 
11823     for (i = 0; i < length; i++) {
11824         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11825             Py_RETURN_FALSE;
11826     }
11827     Py_RETURN_TRUE;
11828 }
11829 
11830 /*[clinic input]
11831 str.isdigit as unicode_isdigit
11832 
11833 Return True if the string is a digit string, False otherwise.
11834 
11835 A string is a digit string if all characters in the string are digits and there
11836 is at least one character in the string.
11837 [clinic start generated code]*/
11838 
11839 static PyObject *
unicode_isdigit_impl(PyObject * self)11840 unicode_isdigit_impl(PyObject *self)
11841 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
11842 {
11843     Py_ssize_t i, length;
11844     int kind;
11845     const void *data;
11846 
11847     length = PyUnicode_GET_LENGTH(self);
11848     kind = PyUnicode_KIND(self);
11849     data = PyUnicode_DATA(self);
11850 
11851     /* Shortcut for single character strings */
11852     if (length == 1) {
11853         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11854         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11855     }
11856 
11857     /* Special case for empty strings */
11858     if (length == 0)
11859         Py_RETURN_FALSE;
11860 
11861     for (i = 0; i < length; i++) {
11862         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11863             Py_RETURN_FALSE;
11864     }
11865     Py_RETURN_TRUE;
11866 }
11867 
11868 /*[clinic input]
11869 str.isnumeric as unicode_isnumeric
11870 
11871 Return True if the string is a numeric string, False otherwise.
11872 
11873 A string is numeric if all characters in the string are numeric and there is at
11874 least one character in the string.
11875 [clinic start generated code]*/
11876 
11877 static PyObject *
unicode_isnumeric_impl(PyObject * self)11878 unicode_isnumeric_impl(PyObject *self)
11879 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
11880 {
11881     Py_ssize_t i, length;
11882     int kind;
11883     const void *data;
11884 
11885     length = PyUnicode_GET_LENGTH(self);
11886     kind = PyUnicode_KIND(self);
11887     data = PyUnicode_DATA(self);
11888 
11889     /* Shortcut for single character strings */
11890     if (length == 1)
11891         return PyBool_FromLong(
11892             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11893 
11894     /* Special case for empty strings */
11895     if (length == 0)
11896         Py_RETURN_FALSE;
11897 
11898     for (i = 0; i < length; i++) {
11899         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11900             Py_RETURN_FALSE;
11901     }
11902     Py_RETURN_TRUE;
11903 }
11904 
11905 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)11906 _PyUnicode_ScanIdentifier(PyObject *self)
11907 {
11908     Py_ssize_t i;
11909     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11910     if (len == 0) {
11911         /* an empty string is not a valid identifier */
11912         return 0;
11913     }
11914 
11915     int kind = PyUnicode_KIND(self);
11916     const void *data = PyUnicode_DATA(self);
11917     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11918     /* PEP 3131 says that the first character must be in
11919        XID_Start and subsequent characters in XID_Continue,
11920        and for the ASCII range, the 2.x rules apply (i.e
11921        start with letters and underscore, continue with
11922        letters, digits, underscore). However, given the current
11923        definition of XID_Start and XID_Continue, it is sufficient
11924        to check just for these, except that _ must be allowed
11925        as starting an identifier.  */
11926     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
11927         return 0;
11928     }
11929 
11930     for (i = 1; i < len; i++) {
11931         ch = PyUnicode_READ(kind, data, i);
11932         if (!_PyUnicode_IsXidContinue(ch)) {
11933             return i;
11934         }
11935     }
11936     return i;
11937 }
11938 
11939 int
PyUnicode_IsIdentifier(PyObject * self)11940 PyUnicode_IsIdentifier(PyObject *self)
11941 {
11942     Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
11943     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
11944     /* an empty string is not a valid identifier */
11945     return len && i == len;
11946 }
11947 
11948 /*[clinic input]
11949 str.isidentifier as unicode_isidentifier
11950 
11951 Return True if the string is a valid Python identifier, False otherwise.
11952 
11953 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
11954 such as "def" or "class".
11955 [clinic start generated code]*/
11956 
11957 static PyObject *
unicode_isidentifier_impl(PyObject * self)11958 unicode_isidentifier_impl(PyObject *self)
11959 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
11960 {
11961     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11962 }
11963 
11964 /*[clinic input]
11965 str.isprintable as unicode_isprintable
11966 
11967 Return True if the string is printable, False otherwise.
11968 
11969 A string is printable if all of its characters are considered printable in
11970 repr() or if it is empty.
11971 [clinic start generated code]*/
11972 
11973 static PyObject *
unicode_isprintable_impl(PyObject * self)11974 unicode_isprintable_impl(PyObject *self)
11975 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
11976 {
11977     Py_ssize_t i, length;
11978     int kind;
11979     const void *data;
11980 
11981     length = PyUnicode_GET_LENGTH(self);
11982     kind = PyUnicode_KIND(self);
11983     data = PyUnicode_DATA(self);
11984 
11985     /* Shortcut for single character strings */
11986     if (length == 1)
11987         return PyBool_FromLong(
11988             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11989 
11990     for (i = 0; i < length; i++) {
11991         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11992             Py_RETURN_FALSE;
11993         }
11994     }
11995     Py_RETURN_TRUE;
11996 }
11997 
11998 /*[clinic input]
11999 str.join as unicode_join
12000 
12001     iterable: object
12002     /
12003 
12004 Concatenate any number of strings.
12005 
12006 The string whose method is called is inserted in between each given string.
12007 The result is returned as a new string.
12008 
12009 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12010 [clinic start generated code]*/
12011 
12012 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12013 unicode_join(PyObject *self, PyObject *iterable)
12014 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12015 {
12016     return PyUnicode_Join(self, iterable);
12017 }
12018 
12019 static Py_ssize_t
unicode_length(PyObject * self)12020 unicode_length(PyObject *self)
12021 {
12022     return PyUnicode_GET_LENGTH(self);
12023 }
12024 
12025 /*[clinic input]
12026 str.ljust as unicode_ljust
12027 
12028     width: Py_ssize_t
12029     fillchar: Py_UCS4 = ' '
12030     /
12031 
12032 Return a left-justified string of length width.
12033 
12034 Padding is done using the specified fill character (default is a space).
12035 [clinic start generated code]*/
12036 
12037 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12038 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12039 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12040 {
12041     if (PyUnicode_GET_LENGTH(self) >= width)
12042         return unicode_result_unchanged(self);
12043 
12044     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12045 }
12046 
12047 /*[clinic input]
12048 str.lower as unicode_lower
12049 
12050 Return a copy of the string converted to lowercase.
12051 [clinic start generated code]*/
12052 
12053 static PyObject *
unicode_lower_impl(PyObject * self)12054 unicode_lower_impl(PyObject *self)
12055 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12056 {
12057     if (PyUnicode_IS_ASCII(self))
12058         return ascii_upper_or_lower(self, 1);
12059     return case_operation(self, do_lower);
12060 }
12061 
/* Selectors telling the strip helpers which end(s) of the string to trim.
   The values double as indexes into stripfuncnames[], so the order of the
   two must be kept in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  Both the strings and the
   table slots are const, so the table cannot be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a selector to its method name; `i` must be one of the three
   selectors above (no bounds check is performed). */
#define STRIPNAME(i) (stripfuncnames[i])
12070 
12071 /* externally visible for str.strip(unicode) */
12072 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12073 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12074 {
12075     const void *data;
12076     int kind;
12077     Py_ssize_t i, j, len;
12078     BLOOM_MASK sepmask;
12079     Py_ssize_t seplen;
12080 
12081     kind = PyUnicode_KIND(self);
12082     data = PyUnicode_DATA(self);
12083     len = PyUnicode_GET_LENGTH(self);
12084     seplen = PyUnicode_GET_LENGTH(sepobj);
12085     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12086                               PyUnicode_DATA(sepobj),
12087                               seplen);
12088 
12089     i = 0;
12090     if (striptype != RIGHTSTRIP) {
12091         while (i < len) {
12092             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12093             if (!BLOOM(sepmask, ch))
12094                 break;
12095             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12096                 break;
12097             i++;
12098         }
12099     }
12100 
12101     j = len;
12102     if (striptype != LEFTSTRIP) {
12103         j--;
12104         while (j >= i) {
12105             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12106             if (!BLOOM(sepmask, ch))
12107                 break;
12108             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12109                 break;
12110             j--;
12111         }
12112 
12113         j++;
12114     }
12115 
12116     return PyUnicode_Substring(self, i, j);
12117 }
12118 
12119 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12120 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12121 {
12122     const unsigned char *data;
12123     int kind;
12124     Py_ssize_t length;
12125 
12126     length = PyUnicode_GET_LENGTH(self);
12127     end = Py_MIN(end, length);
12128 
12129     if (start == 0 && end == length)
12130         return unicode_result_unchanged(self);
12131 
12132     if (start < 0 || end < 0) {
12133         PyErr_SetString(PyExc_IndexError, "string index out of range");
12134         return NULL;
12135     }
12136     if (start >= length || end < start)
12137         _Py_RETURN_UNICODE_EMPTY();
12138 
12139     length = end - start;
12140     if (PyUnicode_IS_ASCII(self)) {
12141         data = PyUnicode_1BYTE_DATA(self);
12142         return _PyUnicode_FromASCII((const char*)(data + start), length);
12143     }
12144     else {
12145         kind = PyUnicode_KIND(self);
12146         data = PyUnicode_1BYTE_DATA(self);
12147         return PyUnicode_FromKindAndData(kind,
12148                                          data + kind * start,
12149                                          length);
12150     }
12151 }
12152 
12153 static PyObject *
do_strip(PyObject * self,int striptype)12154 do_strip(PyObject *self, int striptype)
12155 {
12156     Py_ssize_t len, i, j;
12157 
12158     len = PyUnicode_GET_LENGTH(self);
12159 
12160     if (PyUnicode_IS_ASCII(self)) {
12161         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12162 
12163         i = 0;
12164         if (striptype != RIGHTSTRIP) {
12165             while (i < len) {
12166                 Py_UCS1 ch = data[i];
12167                 if (!_Py_ascii_whitespace[ch])
12168                     break;
12169                 i++;
12170             }
12171         }
12172 
12173         j = len;
12174         if (striptype != LEFTSTRIP) {
12175             j--;
12176             while (j >= i) {
12177                 Py_UCS1 ch = data[j];
12178                 if (!_Py_ascii_whitespace[ch])
12179                     break;
12180                 j--;
12181             }
12182             j++;
12183         }
12184     }
12185     else {
12186         int kind = PyUnicode_KIND(self);
12187         const void *data = PyUnicode_DATA(self);
12188 
12189         i = 0;
12190         if (striptype != RIGHTSTRIP) {
12191             while (i < len) {
12192                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12193                 if (!Py_UNICODE_ISSPACE(ch))
12194                     break;
12195                 i++;
12196             }
12197         }
12198 
12199         j = len;
12200         if (striptype != LEFTSTRIP) {
12201             j--;
12202             while (j >= i) {
12203                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12204                 if (!Py_UNICODE_ISSPACE(ch))
12205                     break;
12206                 j--;
12207             }
12208             j++;
12209         }
12210     }
12211 
12212     return PyUnicode_Substring(self, i, j);
12213 }
12214 
12215 
12216 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12217 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12218 {
12219     if (sep != Py_None) {
12220         if (PyUnicode_Check(sep))
12221             return _PyUnicode_XStrip(self, striptype, sep);
12222         else {
12223             PyErr_Format(PyExc_TypeError,
12224                          "%s arg must be None or str",
12225                          STRIPNAME(striptype));
12226             return NULL;
12227         }
12228     }
12229 
12230     return do_strip(self, striptype);
12231 }
12232 
12233 
12234 /*[clinic input]
12235 str.strip as unicode_strip
12236 
12237     chars: object = None
12238     /
12239 
12240 Return a copy of the string with leading and trailing whitespace removed.
12241 
12242 If chars is given and not None, remove characters in chars instead.
12243 [clinic start generated code]*/
12244 
12245 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12246 unicode_strip_impl(PyObject *self, PyObject *chars)
12247 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12248 {
12249     return do_argstrip(self, BOTHSTRIP, chars);
12250 }
12251 
12252 
12253 /*[clinic input]
12254 str.lstrip as unicode_lstrip
12255 
12256     chars: object = None
12257     /
12258 
12259 Return a copy of the string with leading whitespace removed.
12260 
12261 If chars is given and not None, remove characters in chars instead.
12262 [clinic start generated code]*/
12263 
12264 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12265 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12266 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12267 {
12268     return do_argstrip(self, LEFTSTRIP, chars);
12269 }
12270 
12271 
12272 /*[clinic input]
12273 str.rstrip as unicode_rstrip
12274 
12275     chars: object = None
12276     /
12277 
12278 Return a copy of the string with trailing whitespace removed.
12279 
12280 If chars is given and not None, remove characters in chars instead.
12281 [clinic start generated code]*/
12282 
12283 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12284 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12285 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12286 {
12287     return do_argstrip(self, RIGHTSTRIP, chars);
12288 }
12289 
12290 
12291 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12292 unicode_repeat(PyObject *str, Py_ssize_t len)
12293 {
12294     PyObject *u;
12295     Py_ssize_t nchars, n;
12296 
12297     if (len < 1)
12298         _Py_RETURN_UNICODE_EMPTY();
12299 
12300     /* no repeat, return original string */
12301     if (len == 1)
12302         return unicode_result_unchanged(str);
12303 
12304     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12305         PyErr_SetString(PyExc_OverflowError,
12306                         "repeated string is too long");
12307         return NULL;
12308     }
12309     nchars = len * PyUnicode_GET_LENGTH(str);
12310 
12311     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12312     if (!u)
12313         return NULL;
12314     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12315 
12316     if (PyUnicode_GET_LENGTH(str) == 1) {
12317         int kind = PyUnicode_KIND(str);
12318         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12319         if (kind == PyUnicode_1BYTE_KIND) {
12320             void *to = PyUnicode_DATA(u);
12321             memset(to, (unsigned char)fill_char, len);
12322         }
12323         else if (kind == PyUnicode_2BYTE_KIND) {
12324             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12325             for (n = 0; n < len; ++n)
12326                 ucs2[n] = fill_char;
12327         } else {
12328             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12329             assert(kind == PyUnicode_4BYTE_KIND);
12330             for (n = 0; n < len; ++n)
12331                 ucs4[n] = fill_char;
12332         }
12333     }
12334     else {
12335         Py_ssize_t char_size = PyUnicode_KIND(str);
12336         char *to = (char *) PyUnicode_DATA(u);
12337         _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12338             PyUnicode_GET_LENGTH(str) * char_size);
12339     }
12340 
12341     assert(_PyUnicode_CheckConsistency(u, 1));
12342     return u;
12343 }
12344 
12345 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12346 PyUnicode_Replace(PyObject *str,
12347                   PyObject *substr,
12348                   PyObject *replstr,
12349                   Py_ssize_t maxcount)
12350 {
12351     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12352             ensure_unicode(replstr) < 0)
12353         return NULL;
12354     return replace(str, substr, replstr, maxcount);
12355 }
12356 
12357 /*[clinic input]
12358 str.replace as unicode_replace
12359 
12360     old: unicode
12361     new: unicode
12362     /
12363     count: Py_ssize_t = -1
12364         Maximum number of occurrences to replace.
12365         -1 (the default value) means replace all occurrences.
12366 
12367 Return a copy with all occurrences of substring old replaced by new.
12368 
12369 If the optional argument count is given, only the first count occurrences are
12370 replaced.
12371 [clinic start generated code]*/
12372 
12373 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12374 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12375                      Py_ssize_t count)
12376 /*[clinic end generated code: output=b63f1a8b5eebf448 input=3345c455d60a5499]*/
12377 {
12378     return replace(self, old, new, count);
12379 }
12380 
12381 /*[clinic input]
12382 str.removeprefix as unicode_removeprefix
12383 
12384     prefix: unicode
12385     /
12386 
12387 Return a str with the given prefix string removed if present.
12388 
12389 If the string starts with the prefix string, return string[len(prefix):].
12390 Otherwise, return a copy of the original string.
12391 [clinic start generated code]*/
12392 
12393 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12394 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12395 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12396 {
12397     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12398     if (match == -1) {
12399         return NULL;
12400     }
12401     if (match) {
12402         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12403                                    PyUnicode_GET_LENGTH(self));
12404     }
12405     return unicode_result_unchanged(self);
12406 }
12407 
12408 /*[clinic input]
12409 str.removesuffix as unicode_removesuffix
12410 
12411     suffix: unicode
12412     /
12413 
12414 Return a str with the given suffix string removed if present.
12415 
12416 If the string ends with the suffix string and that suffix is not empty,
12417 return string[:-len(suffix)]. Otherwise, return a copy of the original
12418 string.
12419 [clinic start generated code]*/
12420 
12421 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12422 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12423 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12424 {
12425     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12426     if (match == -1) {
12427         return NULL;
12428     }
12429     if (match) {
12430         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12431                                             - PyUnicode_GET_LENGTH(suffix));
12432     }
12433     return unicode_result_unchanged(self);
12434 }
12435 
12436 static PyObject *
unicode_repr(PyObject * unicode)12437 unicode_repr(PyObject *unicode)
12438 {
12439     PyObject *repr;
12440     Py_ssize_t isize;
12441     Py_ssize_t osize, squote, dquote, i, o;
12442     Py_UCS4 max, quote;
12443     int ikind, okind, unchanged;
12444     const void *idata;
12445     void *odata;
12446 
12447     isize = PyUnicode_GET_LENGTH(unicode);
12448     idata = PyUnicode_DATA(unicode);
12449 
12450     /* Compute length of output, quote characters, and
12451        maximum character */
12452     osize = 0;
12453     max = 127;
12454     squote = dquote = 0;
12455     ikind = PyUnicode_KIND(unicode);
12456     for (i = 0; i < isize; i++) {
12457         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12458         Py_ssize_t incr = 1;
12459         switch (ch) {
12460         case '\'': squote++; break;
12461         case '"':  dquote++; break;
12462         case '\\': case '\t': case '\r': case '\n':
12463             incr = 2;
12464             break;
12465         default:
12466             /* Fast-path ASCII */
12467             if (ch < ' ' || ch == 0x7f)
12468                 incr = 4; /* \xHH */
12469             else if (ch < 0x7f)
12470                 ;
12471             else if (Py_UNICODE_ISPRINTABLE(ch))
12472                 max = ch > max ? ch : max;
12473             else if (ch < 0x100)
12474                 incr = 4; /* \xHH */
12475             else if (ch < 0x10000)
12476                 incr = 6; /* \uHHHH */
12477             else
12478                 incr = 10; /* \uHHHHHHHH */
12479         }
12480         if (osize > PY_SSIZE_T_MAX - incr) {
12481             PyErr_SetString(PyExc_OverflowError,
12482                             "string is too long to generate repr");
12483             return NULL;
12484         }
12485         osize += incr;
12486     }
12487 
12488     quote = '\'';
12489     unchanged = (osize == isize);
12490     if (squote) {
12491         unchanged = 0;
12492         if (dquote)
12493             /* Both squote and dquote present. Use squote,
12494                and escape them */
12495             osize += squote;
12496         else
12497             quote = '"';
12498     }
12499     osize += 2;   /* quotes */
12500 
12501     repr = PyUnicode_New(osize, max);
12502     if (repr == NULL)
12503         return NULL;
12504     okind = PyUnicode_KIND(repr);
12505     odata = PyUnicode_DATA(repr);
12506 
12507     PyUnicode_WRITE(okind, odata, 0, quote);
12508     PyUnicode_WRITE(okind, odata, osize-1, quote);
12509     if (unchanged) {
12510         _PyUnicode_FastCopyCharacters(repr, 1,
12511                                       unicode, 0,
12512                                       isize);
12513     }
12514     else {
12515         for (i = 0, o = 1; i < isize; i++) {
12516             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12517 
12518             /* Escape quotes and backslashes */
12519             if ((ch == quote) || (ch == '\\')) {
12520                 PyUnicode_WRITE(okind, odata, o++, '\\');
12521                 PyUnicode_WRITE(okind, odata, o++, ch);
12522                 continue;
12523             }
12524 
12525             /* Map special whitespace to '\t', \n', '\r' */
12526             if (ch == '\t') {
12527                 PyUnicode_WRITE(okind, odata, o++, '\\');
12528                 PyUnicode_WRITE(okind, odata, o++, 't');
12529             }
12530             else if (ch == '\n') {
12531                 PyUnicode_WRITE(okind, odata, o++, '\\');
12532                 PyUnicode_WRITE(okind, odata, o++, 'n');
12533             }
12534             else if (ch == '\r') {
12535                 PyUnicode_WRITE(okind, odata, o++, '\\');
12536                 PyUnicode_WRITE(okind, odata, o++, 'r');
12537             }
12538 
12539             /* Map non-printable US ASCII to '\xhh' */
12540             else if (ch < ' ' || ch == 0x7F) {
12541                 PyUnicode_WRITE(okind, odata, o++, '\\');
12542                 PyUnicode_WRITE(okind, odata, o++, 'x');
12543                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12544                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12545             }
12546 
12547             /* Copy ASCII characters as-is */
12548             else if (ch < 0x7F) {
12549                 PyUnicode_WRITE(okind, odata, o++, ch);
12550             }
12551 
12552             /* Non-ASCII characters */
12553             else {
12554                 /* Map Unicode whitespace and control characters
12555                    (categories Z* and C* except ASCII space)
12556                 */
12557                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12558                     PyUnicode_WRITE(okind, odata, o++, '\\');
12559                     /* Map 8-bit characters to '\xhh' */
12560                     if (ch <= 0xff) {
12561                         PyUnicode_WRITE(okind, odata, o++, 'x');
12562                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12563                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12564                     }
12565                     /* Map 16-bit characters to '\uxxxx' */
12566                     else if (ch <= 0xffff) {
12567                         PyUnicode_WRITE(okind, odata, o++, 'u');
12568                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12569                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12570                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12571                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12572                     }
12573                     /* Map 21-bit characters to '\U00xxxxxx' */
12574                     else {
12575                         PyUnicode_WRITE(okind, odata, o++, 'U');
12576                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12577                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12578                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12579                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12580                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12581                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12582                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12583                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12584                     }
12585                 }
12586                 /* Copy characters as-is */
12587                 else {
12588                     PyUnicode_WRITE(okind, odata, o++, ch);
12589                 }
12590             }
12591         }
12592     }
12593     /* Closing quote already added at the beginning */
12594     assert(_PyUnicode_CheckConsistency(repr, 1));
12595     return repr;
12596 }
12597 
12598 /*[clinic input]
12599 str.rfind as unicode_rfind = str.count
12600 
12601 Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12602 
12603 Optional arguments start and end are interpreted as in slice notation.
12604 Return -1 on failure.
12605 [clinic start generated code]*/
12606 
12607 static Py_ssize_t
unicode_rfind_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)12608 unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12609                    Py_ssize_t end)
12610 /*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/
12611 {
12612     Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12613     if (result < 0) {
12614         return -1;
12615     }
12616     return result;
12617 }
12618 
12619 /*[clinic input]
12620 str.rindex as unicode_rindex = str.count
12621 
12622 Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12623 
12624 Optional arguments start and end are interpreted as in slice notation.
12625 Raises ValueError when the substring is not found.
12626 [clinic start generated code]*/
12627 
12628 static Py_ssize_t
unicode_rindex_impl(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)12629 unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12630                     Py_ssize_t end)
12631 /*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/
12632 {
12633     Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12634     if (result == -1) {
12635         PyErr_SetString(PyExc_ValueError, "substring not found");
12636     }
12637     else if (result < 0) {
12638         return -1;
12639     }
12640     return result;
12641 }
12642 
12643 /*[clinic input]
12644 str.rjust as unicode_rjust
12645 
12646     width: Py_ssize_t
12647     fillchar: Py_UCS4 = ' '
12648     /
12649 
12650 Return a right-justified string of length width.
12651 
12652 Padding is done using the specified fill character (default is a space).
12653 [clinic start generated code]*/
12654 
12655 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12656 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12657 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12658 {
12659     if (PyUnicode_GET_LENGTH(self) >= width)
12660         return unicode_result_unchanged(self);
12661 
12662     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12663 }
12664 
12665 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12666 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12667 {
12668     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12669         return NULL;
12670 
12671     return split(s, sep, maxsplit);
12672 }
12673 
12674 /*[clinic input]
12675 str.split as unicode_split
12676 
12677     sep: object = None
12678         The separator used to split the string.
12679 
12680         When set to None (the default value), will split on any whitespace
12681         character (including \n \r \t \f and spaces) and will discard
12682         empty strings from the result.
12683     maxsplit: Py_ssize_t = -1
12684         Maximum number of splits.
12685         -1 (the default value) means no limit.
12686 
12687 Return a list of the substrings in the string, using sep as the separator string.
12688 
12689 Splitting starts at the front of the string and works to the end.
12690 
12691 Note, str.split() is mainly useful for data that has been intentionally
12692 delimited.  With natural text that includes punctuation, consider using
12693 the regular expression module.
12694 
12695 [clinic start generated code]*/
12696 
12697 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12698 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12699 /*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
12700 {
12701     if (sep == Py_None)
12702         return split(self, NULL, maxsplit);
12703     if (PyUnicode_Check(sep))
12704         return split(self, sep, maxsplit);
12705 
12706     PyErr_Format(PyExc_TypeError,
12707                  "must be str or None, not %.100s",
12708                  Py_TYPE(sep)->tp_name);
12709     return NULL;
12710 }
12711 
12712 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12713 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12714 {
12715     PyObject* out;
12716     int kind1, kind2;
12717     const void *buf1, *buf2;
12718     Py_ssize_t len1, len2;
12719 
12720     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12721         return NULL;
12722 
12723     kind1 = PyUnicode_KIND(str_obj);
12724     kind2 = PyUnicode_KIND(sep_obj);
12725     len1 = PyUnicode_GET_LENGTH(str_obj);
12726     len2 = PyUnicode_GET_LENGTH(sep_obj);
12727     if (kind1 < kind2 || len1 < len2) {
12728         PyObject *empty = unicode_get_empty();  // Borrowed reference
12729         return PyTuple_Pack(3, str_obj, empty, empty);
12730     }
12731     buf1 = PyUnicode_DATA(str_obj);
12732     buf2 = PyUnicode_DATA(sep_obj);
12733     if (kind2 != kind1) {
12734         buf2 = unicode_askind(kind2, buf2, len2, kind1);
12735         if (!buf2)
12736             return NULL;
12737     }
12738 
12739     switch (kind1) {
12740     case PyUnicode_1BYTE_KIND:
12741         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12742             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12743         else
12744             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12745         break;
12746     case PyUnicode_2BYTE_KIND:
12747         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12748         break;
12749     case PyUnicode_4BYTE_KIND:
12750         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12751         break;
12752     default:
12753         Py_UNREACHABLE();
12754     }
12755 
12756     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12757     if (kind2 != kind1)
12758         PyMem_Free((void *)buf2);
12759 
12760     return out;
12761 }
12762 
12763 
12764 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12765 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12766 {
12767     PyObject* out;
12768     int kind1, kind2;
12769     const void *buf1, *buf2;
12770     Py_ssize_t len1, len2;
12771 
12772     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12773         return NULL;
12774 
12775     kind1 = PyUnicode_KIND(str_obj);
12776     kind2 = PyUnicode_KIND(sep_obj);
12777     len1 = PyUnicode_GET_LENGTH(str_obj);
12778     len2 = PyUnicode_GET_LENGTH(sep_obj);
12779     if (kind1 < kind2 || len1 < len2) {
12780         PyObject *empty = unicode_get_empty();  // Borrowed reference
12781         return PyTuple_Pack(3, empty, empty, str_obj);
12782     }
12783     buf1 = PyUnicode_DATA(str_obj);
12784     buf2 = PyUnicode_DATA(sep_obj);
12785     if (kind2 != kind1) {
12786         buf2 = unicode_askind(kind2, buf2, len2, kind1);
12787         if (!buf2)
12788             return NULL;
12789     }
12790 
12791     switch (kind1) {
12792     case PyUnicode_1BYTE_KIND:
12793         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12794             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12795         else
12796             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12797         break;
12798     case PyUnicode_2BYTE_KIND:
12799         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12800         break;
12801     case PyUnicode_4BYTE_KIND:
12802         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12803         break;
12804     default:
12805         Py_UNREACHABLE();
12806     }
12807 
12808     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12809     if (kind2 != kind1)
12810         PyMem_Free((void *)buf2);
12811 
12812     return out;
12813 }
12814 
12815 /*[clinic input]
12816 str.partition as unicode_partition
12817 
12818     sep: object
12819     /
12820 
12821 Partition the string into three parts using the given separator.
12822 
12823 This will search for the separator in the string.  If the separator is found,
12824 returns a 3-tuple containing the part before the separator, the separator
12825 itself, and the part after it.
12826 
12827 If the separator is not found, returns a 3-tuple containing the original string
12828 and two empty strings.
12829 [clinic start generated code]*/
12830 
12831 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)12832 unicode_partition(PyObject *self, PyObject *sep)
12833 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
12834 {
12835     return PyUnicode_Partition(self, sep);
12836 }
12837 
12838 /*[clinic input]
12839 str.rpartition as unicode_rpartition = str.partition
12840 
12841 Partition the string into three parts using the given separator.
12842 
12843 This will search for the separator in the string, starting at the end. If
12844 the separator is found, returns a 3-tuple containing the part before the
12845 separator, the separator itself, and the part after it.
12846 
12847 If the separator is not found, returns a 3-tuple containing two empty strings
12848 and the original string.
12849 [clinic start generated code]*/
12850 
12851 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)12852 unicode_rpartition(PyObject *self, PyObject *sep)
12853 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
12854 {
12855     return PyUnicode_RPartition(self, sep);
12856 }
12857 
12858 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12859 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12860 {
12861     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12862         return NULL;
12863 
12864     return rsplit(s, sep, maxsplit);
12865 }
12866 
12867 /*[clinic input]
12868 str.rsplit as unicode_rsplit = str.split
12869 
12870 Return a list of the substrings in the string, using sep as the separator string.
12871 
12872 Splitting starts at the end of the string and works to the front.
12873 [clinic start generated code]*/
12874 
12875 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12876 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12877 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
12878 {
12879     if (sep == Py_None)
12880         return rsplit(self, NULL, maxsplit);
12881     if (PyUnicode_Check(sep))
12882         return rsplit(self, sep, maxsplit);
12883 
12884     PyErr_Format(PyExc_TypeError,
12885                  "must be str or None, not %.100s",
12886                  Py_TYPE(sep)->tp_name);
12887     return NULL;
12888 }
12889 
12890 /*[clinic input]
12891 str.splitlines as unicode_splitlines
12892 
12893     keepends: bool = False
12894 
12895 Return a list of the lines in the string, breaking at line boundaries.
12896 
12897 Line breaks are not included in the resulting list unless keepends is given and
12898 true.
12899 [clinic start generated code]*/
12900 
12901 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)12902 unicode_splitlines_impl(PyObject *self, int keepends)
12903 /*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
12904 {
12905     return PyUnicode_Splitlines(self, keepends);
12906 }
12907 
12908 static
unicode_str(PyObject * self)12909 PyObject *unicode_str(PyObject *self)
12910 {
12911     return unicode_result_unchanged(self);
12912 }
12913 
12914 /*[clinic input]
12915 str.swapcase as unicode_swapcase
12916 
12917 Convert uppercase characters to lowercase and lowercase characters to uppercase.
12918 [clinic start generated code]*/
12919 
12920 static PyObject *
unicode_swapcase_impl(PyObject * self)12921 unicode_swapcase_impl(PyObject *self)
12922 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
12923 {
12924     return case_operation(self, do_swapcase);
12925 }
12926 
12927 /*[clinic input]
12928 
12929 @staticmethod
12930 str.maketrans as unicode_maketrans
12931 
12932   x: object
12933 
12934   y: unicode=NULL
12935 
12936   z: unicode=NULL
12937 
12938   /
12939 
12940 Return a translation table usable for str.translate().
12941 
12942 If there is only one argument, it must be a dictionary mapping Unicode
12943 ordinals (integers) or characters to Unicode ordinals, strings or None.
12944 Character keys will be then converted to ordinals.
12945 If there are two arguments, they must be strings of equal length, and
12946 in the resulting dictionary, each character in x will be mapped to the
12947 character at the same position in y. If there is a third argument, it
12948 must be a string, whose characters will be mapped to None in the result.
12949 [clinic start generated code]*/
12950 
12951 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)12952 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12953 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12954 {
12955     PyObject *new = NULL, *key, *value;
12956     Py_ssize_t i = 0;
12957     int res;
12958 
12959     new = PyDict_New();
12960     if (!new)
12961         return NULL;
12962     if (y != NULL) {
12963         int x_kind, y_kind, z_kind;
12964         const void *x_data, *y_data, *z_data;
12965 
12966         /* x must be a string too, of equal length */
12967         if (!PyUnicode_Check(x)) {
12968             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12969                             "be a string if there is a second argument");
12970             goto err;
12971         }
12972         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12973             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12974                             "arguments must have equal length");
12975             goto err;
12976         }
12977         /* create entries for translating chars in x to those in y */
12978         x_kind = PyUnicode_KIND(x);
12979         y_kind = PyUnicode_KIND(y);
12980         x_data = PyUnicode_DATA(x);
12981         y_data = PyUnicode_DATA(y);
12982         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12983             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12984             if (!key)
12985                 goto err;
12986             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12987             if (!value) {
12988                 Py_DECREF(key);
12989                 goto err;
12990             }
12991             res = PyDict_SetItem(new, key, value);
12992             Py_DECREF(key);
12993             Py_DECREF(value);
12994             if (res < 0)
12995                 goto err;
12996         }
12997         /* create entries for deleting chars in z */
12998         if (z != NULL) {
12999             z_kind = PyUnicode_KIND(z);
13000             z_data = PyUnicode_DATA(z);
13001             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13002                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13003                 if (!key)
13004                     goto err;
13005                 res = PyDict_SetItem(new, key, Py_None);
13006                 Py_DECREF(key);
13007                 if (res < 0)
13008                     goto err;
13009             }
13010         }
13011     } else {
13012         int kind;
13013         const void *data;
13014 
13015         /* x must be a dict */
13016         if (!PyDict_CheckExact(x)) {
13017             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13018                             "to maketrans it must be a dict");
13019             goto err;
13020         }
13021         /* copy entries into the new dict, converting string keys to int keys */
13022         while (PyDict_Next(x, &i, &key, &value)) {
13023             if (PyUnicode_Check(key)) {
13024                 /* convert string keys to integer keys */
13025                 PyObject *newkey;
13026                 if (PyUnicode_GET_LENGTH(key) != 1) {
13027                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13028                                     "table must be of length 1");
13029                     goto err;
13030                 }
13031                 kind = PyUnicode_KIND(key);
13032                 data = PyUnicode_DATA(key);
13033                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13034                 if (!newkey)
13035                     goto err;
13036                 res = PyDict_SetItem(new, newkey, value);
13037                 Py_DECREF(newkey);
13038                 if (res < 0)
13039                     goto err;
13040             } else if (PyLong_Check(key)) {
13041                 /* just keep integer keys */
13042                 if (PyDict_SetItem(new, key, value) < 0)
13043                     goto err;
13044             } else {
13045                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13046                                 "be strings or integers");
13047                 goto err;
13048             }
13049         }
13050     }
13051     return new;
13052   err:
13053     Py_DECREF(new);
13054     return NULL;
13055 }
13056 
13057 /*[clinic input]
13058 str.translate as unicode_translate
13059 
13060     table: object
13061         Translation table, which must be a mapping of Unicode ordinals to
13062         Unicode ordinals, strings, or None.
13063     /
13064 
13065 Replace each character in the string using the given translation table.
13066 
13067 The table must implement lookup/indexing via __getitem__, for instance a
13068 dictionary or list.  If this operation raises LookupError, the character is
13069 left untouched.  Characters mapped to None are deleted.
13070 [clinic start generated code]*/
13071 
13072 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13073 unicode_translate(PyObject *self, PyObject *table)
13074 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13075 {
13076     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13077 }
13078 
13079 /*[clinic input]
13080 str.upper as unicode_upper
13081 
13082 Return a copy of the string converted to uppercase.
13083 [clinic start generated code]*/
13084 
13085 static PyObject *
unicode_upper_impl(PyObject * self)13086 unicode_upper_impl(PyObject *self)
13087 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13088 {
13089     if (PyUnicode_IS_ASCII(self))
13090         return ascii_upper_or_lower(self, 0);
13091     return case_operation(self, do_upper);
13092 }
13093 
13094 /*[clinic input]
13095 str.zfill as unicode_zfill
13096 
13097     width: Py_ssize_t
13098     /
13099 
13100 Pad a numeric string with zeros on the left, to fill a field of the given width.
13101 
13102 The string is never truncated.
13103 [clinic start generated code]*/
13104 
13105 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13106 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13107 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13108 {
13109     Py_ssize_t fill;
13110     PyObject *u;
13111     int kind;
13112     const void *data;
13113     Py_UCS4 chr;
13114 
13115     if (PyUnicode_GET_LENGTH(self) >= width)
13116         return unicode_result_unchanged(self);
13117 
13118     fill = width - PyUnicode_GET_LENGTH(self);
13119 
13120     u = pad(self, fill, 0, '0');
13121 
13122     if (u == NULL)
13123         return NULL;
13124 
13125     kind = PyUnicode_KIND(u);
13126     data = PyUnicode_DATA(u);
13127     chr = PyUnicode_READ(kind, data, fill);
13128 
13129     if (chr == '+' || chr == '-') {
13130         /* move sign to beginning of string */
13131         PyUnicode_WRITE(kind, data, 0, chr);
13132         PyUnicode_WRITE(kind, data, fill, '0');
13133     }
13134 
13135     assert(_PyUnicode_CheckConsistency(u, 1));
13136     return u;
13137 }
13138 
13139 /*[clinic input]
13140 @text_signature "($self, prefix[, start[, end]], /)"
13141 str.startswith as unicode_startswith
13142 
13143     prefix as subobj: object
13144         A string or a tuple of strings to try.
13145     start: slice_index(accept={int, NoneType}, c_default='0') = None
13146         Optional start position. Default: start of the string.
13147     end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13148         Optional stop position. Default: end of the string.
13149     /
13150 
13151 Return True if the string starts with the specified prefix, False otherwise.
13152 [clinic start generated code]*/
13153 
13154 static PyObject *
unicode_startswith_impl(PyObject * self,PyObject * subobj,Py_ssize_t start,Py_ssize_t end)13155 unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13156                         Py_ssize_t end)
13157 /*[clinic end generated code: output=4bd7cfd0803051d4 input=5f918b5f5f89d856]*/
13158 {
13159     if (PyTuple_Check(subobj)) {
13160         Py_ssize_t i;
13161         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13162             PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13163             if (!PyUnicode_Check(substring)) {
13164                 PyErr_Format(PyExc_TypeError,
13165                              "tuple for startswith must only contain str, "
13166                              "not %.100s",
13167                              Py_TYPE(substring)->tp_name);
13168                 return NULL;
13169             }
13170             int result = tailmatch(self, substring, start, end, -1);
13171             if (result < 0) {
13172                 return NULL;
13173             }
13174             if (result) {
13175                 Py_RETURN_TRUE;
13176             }
13177         }
13178         /* nothing matched */
13179         Py_RETURN_FALSE;
13180     }
13181     if (!PyUnicode_Check(subobj)) {
13182         PyErr_Format(PyExc_TypeError,
13183                      "startswith first arg must be str or "
13184                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13185         return NULL;
13186     }
13187     int result = tailmatch(self, subobj, start, end, -1);
13188     if (result < 0) {
13189         return NULL;
13190     }
13191     return PyBool_FromLong(result);
13192 }
13193 
13194 
13195 /*[clinic input]
13196 @text_signature "($self, suffix[, start[, end]], /)"
13197 str.endswith as unicode_endswith
13198 
13199     suffix as subobj: object
13200         A string or a tuple of strings to try.
13201     start: slice_index(accept={int, NoneType}, c_default='0') = None
13202         Optional start position. Default: start of the string.
13203     end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13204         Optional stop position. Default: end of the string.
13205     /
13206 
13207 Return True if the string ends with the specified suffix, False otherwise.
13208 [clinic start generated code]*/
13209 
13210 static PyObject *
unicode_endswith_impl(PyObject * self,PyObject * subobj,Py_ssize_t start,Py_ssize_t end)13211 unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13212                       Py_ssize_t end)
13213 /*[clinic end generated code: output=cce6f8ceb0102ca9 input=00fbdc774a7d4d71]*/
13214 {
13215     if (PyTuple_Check(subobj)) {
13216         Py_ssize_t i;
13217         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13218             PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13219             if (!PyUnicode_Check(substring)) {
13220                 PyErr_Format(PyExc_TypeError,
13221                              "tuple for endswith must only contain str, "
13222                              "not %.100s",
13223                              Py_TYPE(substring)->tp_name);
13224                 return NULL;
13225             }
13226             int result = tailmatch(self, substring, start, end, +1);
13227             if (result < 0) {
13228                 return NULL;
13229             }
13230             if (result) {
13231                 Py_RETURN_TRUE;
13232             }
13233         }
13234         Py_RETURN_FALSE;
13235     }
13236     if (!PyUnicode_Check(subobj)) {
13237         PyErr_Format(PyExc_TypeError,
13238                      "endswith first arg must be str or "
13239                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13240         return NULL;
13241     }
13242     int result = tailmatch(self, subobj, start, end, +1);
13243     if (result < 0) {
13244         return NULL;
13245     }
13246     return PyBool_FromLong(result);
13247 }
13248 
13249 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13250 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13251 {
13252     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13253     writer->data = PyUnicode_DATA(writer->buffer);
13254 
13255     if (!writer->readonly) {
13256         writer->kind = PyUnicode_KIND(writer->buffer);
13257         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13258     }
13259     else {
13260         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13261            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13262         writer->kind = 0;
13263         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13264 
13265         /* Copy-on-write mode: set buffer size to 0 so
13266          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13267          * next write. */
13268         writer->size = 0;
13269     }
13270 }
13271 
13272 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13273 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13274 {
13275     memset(writer, 0, sizeof(*writer));
13276 
13277     /* ASCII is the bare minimum */
13278     writer->min_char = 127;
13279 
13280     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13281        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13282     writer->kind = 0;
13283     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13284 }
13285 
13286 // Initialize _PyUnicodeWriter with initial buffer
13287 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13288 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13289 {
13290     memset(writer, 0, sizeof(*writer));
13291     writer->buffer = buffer;
13292     _PyUnicodeWriter_Update(writer);
13293     writer->min_length = writer->size;
13294 }
13295 
/* Make room in the writer for `length` more characters whose code points
   do not exceed `maxchar`, allocating, growing and/or widening
   writer->buffer as needed, then refresh the cached fields through
   _PyUnicodeWriter_Update().

   Return 0 on success.  Return -1 if pos + length would exceed
   PY_SSIZE_T_MAX (PyErr_NoMemory() is raised here) or if PyUnicode_New()
   or resize_compact() returns NULL. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard the addition below against Py_ssize_t overflow. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate for a narrower range than the writer's min_char. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet -- allocate a fresh one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the existing buffer is too short. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A read-only buffer is never modified in place: it is copied
               into a new object and the writer leaves read-only mode. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character range and writable: resize the existing
               object. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: long enough, but the character range is too narrow --
           copy what has been written so far into a wider buffer of the
           same size. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Re-read maxchar/data/kind/size from whatever buffer is now current. */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13372 
13373 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,int kind)13374 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13375                                      int kind)
13376 {
13377     Py_UCS4 maxchar;
13378 
13379     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13380     assert(writer->kind < kind);
13381 
13382     switch (kind)
13383     {
13384     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13385     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13386     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13387     default:
13388         Py_UNREACHABLE();
13389     }
13390 
13391     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13392 }
13393 
13394 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13395 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13396 {
13397     assert(ch <= MAX_UNICODE);
13398     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13399         return -1;
13400     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13401     writer->pos++;
13402     return 0;
13403 }
13404 
13405 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13406 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13407 {
13408     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13409 }
13410 
13411 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13412 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13413 {
13414     Py_UCS4 maxchar;
13415     Py_ssize_t len;
13416 
13417     len = PyUnicode_GET_LENGTH(str);
13418     if (len == 0)
13419         return 0;
13420     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13421     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13422         if (writer->buffer == NULL && !writer->overallocate) {
13423             assert(_PyUnicode_CheckConsistency(str, 1));
13424             writer->readonly = 1;
13425             writer->buffer = Py_NewRef(str);
13426             _PyUnicodeWriter_Update(writer);
13427             writer->pos += len;
13428             return 0;
13429         }
13430         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13431             return -1;
13432     }
13433     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13434                                   str, 0, len);
13435     writer->pos += len;
13436     return 0;
13437 }
13438 
13439 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13440 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13441                                 Py_ssize_t start, Py_ssize_t end)
13442 {
13443     Py_UCS4 maxchar;
13444     Py_ssize_t len;
13445 
13446     assert(0 <= start);
13447     assert(end <= PyUnicode_GET_LENGTH(str));
13448     assert(start <= end);
13449 
13450     if (end == 0)
13451         return 0;
13452 
13453     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13454         return _PyUnicodeWriter_WriteStr(writer, str);
13455 
13456     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13457         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13458     else
13459         maxchar = writer->maxchar;
13460     len = end - start;
13461 
13462     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13463         return -1;
13464 
13465     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13466                                   str, start, len);
13467     writer->pos += len;
13468     return 0;
13469 }
13470 
13471 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13472 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13473                                   const char *ascii, Py_ssize_t len)
13474 {
13475     if (len == -1)
13476         len = strlen(ascii);
13477 
13478     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13479 
13480     if (writer->buffer == NULL && !writer->overallocate) {
13481         PyObject *str;
13482 
13483         str = _PyUnicode_FromASCII(ascii, len);
13484         if (str == NULL)
13485             return -1;
13486 
13487         writer->readonly = 1;
13488         writer->buffer = str;
13489         _PyUnicodeWriter_Update(writer);
13490         writer->pos += len;
13491         return 0;
13492     }
13493 
13494     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13495         return -1;
13496 
13497     switch (writer->kind)
13498     {
13499     case PyUnicode_1BYTE_KIND:
13500     {
13501         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13502         Py_UCS1 *data = writer->data;
13503 
13504         memcpy(data + writer->pos, str, len);
13505         break;
13506     }
13507     case PyUnicode_2BYTE_KIND:
13508     {
13509         _PyUnicode_CONVERT_BYTES(
13510             Py_UCS1, Py_UCS2,
13511             ascii, ascii + len,
13512             (Py_UCS2 *)writer->data + writer->pos);
13513         break;
13514     }
13515     case PyUnicode_4BYTE_KIND:
13516     {
13517         _PyUnicode_CONVERT_BYTES(
13518             Py_UCS1, Py_UCS4,
13519             ascii, ascii + len,
13520             (Py_UCS4 *)writer->data + writer->pos);
13521         break;
13522     }
13523     default:
13524         Py_UNREACHABLE();
13525     }
13526 
13527     writer->pos += len;
13528     return 0;
13529 }
13530 
13531 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13532 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13533                                    const char *str, Py_ssize_t len)
13534 {
13535     Py_UCS4 maxchar;
13536 
13537     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13538     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13539         return -1;
13540     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13541     writer->pos += len;
13542     return 0;
13543 }
13544 
13545 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13546 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13547 {
13548     PyObject *str;
13549 
13550     if (writer->pos == 0) {
13551         Py_CLEAR(writer->buffer);
13552         _Py_RETURN_UNICODE_EMPTY();
13553     }
13554 
13555     str = writer->buffer;
13556     writer->buffer = NULL;
13557 
13558     if (writer->readonly) {
13559         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13560         return str;
13561     }
13562 
13563     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13564         PyObject *str2;
13565         str2 = resize_compact(str, writer->pos);
13566         if (str2 == NULL) {
13567             Py_DECREF(str);
13568             return NULL;
13569         }
13570         str = str2;
13571     }
13572 
13573     assert(_PyUnicode_CheckConsistency(str, 1));
13574     return unicode_result(str);
13575 }
13576 
13577 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13578 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13579 {
13580     Py_CLEAR(writer->buffer);
13581 }
13582 
13583 #include "stringlib/unicode_format.h"
13584 
/* Docstring used by the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "format($self, /, *args, **kwargs)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
13591 
/* Docstring used by the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "format_map($self, mapping, /)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13598 
13599 /*[clinic input]
13600 str.__format__ as unicode___format__
13601 
13602     format_spec: unicode
13603     /
13604 
13605 Return a formatted version of the string as described by format_spec.
13606 [clinic start generated code]*/
13607 
13608 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13609 unicode___format___impl(PyObject *self, PyObject *format_spec)
13610 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13611 {
13612     _PyUnicodeWriter writer;
13613     int ret;
13614 
13615     _PyUnicodeWriter_Init(&writer);
13616     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13617                                           self, format_spec, 0,
13618                                           PyUnicode_GET_LENGTH(format_spec));
13619     if (ret == -1) {
13620         _PyUnicodeWriter_Dealloc(&writer);
13621         return NULL;
13622     }
13623     return _PyUnicodeWriter_Finish(&writer);
13624 }
13625 
13626 /*[clinic input]
13627 str.__sizeof__ as unicode_sizeof
13628 
13629 Return the size of the string in memory, in bytes.
13630 [clinic start generated code]*/
13631 
13632 static PyObject *
unicode_sizeof_impl(PyObject * self)13633 unicode_sizeof_impl(PyObject *self)
13634 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13635 {
13636     Py_ssize_t size;
13637 
13638     /* If it's a compact object, account for base structure +
13639        character data. */
13640     if (PyUnicode_IS_COMPACT_ASCII(self)) {
13641         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13642     }
13643     else if (PyUnicode_IS_COMPACT(self)) {
13644         size = sizeof(PyCompactUnicodeObject) +
13645             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13646     }
13647     else {
13648         /* If it is a two-block object, account for base object, and
13649            for character block if present. */
13650         size = sizeof(PyUnicodeObject);
13651         if (_PyUnicode_DATA_ANY(self))
13652             size += (PyUnicode_GET_LENGTH(self) + 1) *
13653                 PyUnicode_KIND(self);
13654     }
13655     if (_PyUnicode_HAS_UTF8_MEMORY(self))
13656         size += PyUnicode_UTF8_LENGTH(self) + 1;
13657 
13658     return PyLong_FromSsize_t(size);
13659 }
13660 
13661 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13662 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13663 {
13664     PyObject *copy = _PyUnicode_Copy(v);
13665     if (!copy)
13666         return NULL;
13667     return Py_BuildValue("(N)", copy);
13668 }
13669 
/* Method table: Argument Clinic *_METHODDEF entries plus hand-written
   entries for format, format_map and __getnewargs__, terminated by a
   {NULL, NULL} sentinel.
   NOTE(review): presumably installed as the str type's tp_methods --
   the type object is not visible from here. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    UNICODE_COUNT_METHODDEF
    UNICODE_EXPANDTABS_METHODDEF
    UNICODE_FIND_METHODDEF
    UNICODE_PARTITION_METHODDEF
    UNICODE_INDEX_METHODDEF
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    UNICODE_RFIND_METHODDEF
    UNICODE_RINDEX_METHODDEF
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    UNICODE_STARTSWITH_METHODDEF
    UNICODE_ENDSWITH_METHODDEF
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format and format_map are not clinic-generated; their docstrings are
       the PyDoc_STRVAR definitions format__doc__ and format_map__doc__. */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13723 
13724 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13725 unicode_mod(PyObject *v, PyObject *w)
13726 {
13727     if (!PyUnicode_Check(v))
13728         Py_RETURN_NOTIMPLEMENTED;
13729     return PyUnicode_Format(v, w);
13730 }
13731 
/* Number protocol slots: only nb_remainder is set (to unicode_mod).  The
   three slots before it are explicitly 0 and every slot after it is left
   to zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13738 
/* Sequence protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13749 
13750 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13751 unicode_subscript(PyObject* self, PyObject* item)
13752 {
13753     if (_PyIndex_Check(item)) {
13754         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13755         if (i == -1 && PyErr_Occurred())
13756             return NULL;
13757         if (i < 0)
13758             i += PyUnicode_GET_LENGTH(self);
13759         return unicode_getitem(self, i);
13760     } else if (PySlice_Check(item)) {
13761         Py_ssize_t start, stop, step, slicelength, i;
13762         size_t cur;
13763         PyObject *result;
13764         const void *src_data;
13765         void *dest_data;
13766         int src_kind, dest_kind;
13767         Py_UCS4 ch, max_char, kind_limit;
13768 
13769         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13770             return NULL;
13771         }
13772         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13773                                             &start, &stop, step);
13774 
13775         if (slicelength <= 0) {
13776             _Py_RETURN_UNICODE_EMPTY();
13777         } else if (start == 0 && step == 1 &&
13778                    slicelength == PyUnicode_GET_LENGTH(self)) {
13779             return unicode_result_unchanged(self);
13780         } else if (step == 1) {
13781             return PyUnicode_Substring(self,
13782                                        start, start + slicelength);
13783         }
13784         /* General case */
13785         src_kind = PyUnicode_KIND(self);
13786         src_data = PyUnicode_DATA(self);
13787         if (!PyUnicode_IS_ASCII(self)) {
13788             kind_limit = kind_maxchar_limit(src_kind);
13789             max_char = 0;
13790             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13791                 ch = PyUnicode_READ(src_kind, src_data, cur);
13792                 if (ch > max_char) {
13793                     max_char = ch;
13794                     if (max_char >= kind_limit)
13795                         break;
13796                 }
13797             }
13798         }
13799         else
13800             max_char = 127;
13801         result = PyUnicode_New(slicelength, max_char);
13802         if (result == NULL)
13803             return NULL;
13804         dest_kind = PyUnicode_KIND(result);
13805         dest_data = PyUnicode_DATA(result);
13806 
13807         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13808             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13809             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13810         }
13811         assert(_PyUnicode_CheckConsistency(result, 1));
13812         return result;
13813     } else {
13814         PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
13815                      Py_TYPE(item)->tp_name);
13816         return NULL;
13817     }
13818 }
13819 
/* Mapping protocol slots: length and subscription are provided; item
   assignment is not (the slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13825 
13826 
13827 /* Helpers for PyUnicode_Format() */
13828 
/* Context passed to the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument object.  unicode_format_getnextarg() returns it as-is when
       arglen < 0 and otherwise indexes it with PyTuple_GetItem(). */
    PyObject *args;
    /* NOTE(review): presumably non-zero when this struct holds its own
       reference to args -- confirm against the code that sets it. */
    int args_owned;
    /* arglen: upper bound for argidx; a negative value marks args as a
       single non-tuple argument.  argidx: index of the next argument to
       hand out, incremented by unicode_format_getnextarg(). */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for "%(name)s" style
       lookups -- not used by the code visible here. */
    PyObject *dict;

    /* NOTE(review): fmtkind/fmtdata/fmtstr presumably describe the format
       string, and fmtcnt/fmtpos the scan position within it -- confirm. */
    int fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    const void *fmtdata;
    PyObject *fmtstr;

    /* Writer embedded in the context. */
    _PyUnicodeWriter writer;
};
13842 
/* One format specifier, as consumed by helpers such as formatfloat(). */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it to
       PyOS_double_to_string() as the format code. */
    Py_UCS4 ch;
    /* Bitmask of F_* flags; formatfloat() tests F_ALT. */
    int flags;
    /* NOTE(review): presumably the minimum field width -- not used by the
       code visible here. */
    Py_ssize_t width;
    /* Precision; formatfloat() treats a negative value as "use 6". */
    int prec;
    /* NOTE(review): presumably sign-handling state -- not used by the code
       visible here. */
    int sign;
};
13850 
13851 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)13852 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13853 {
13854     Py_ssize_t argidx = ctx->argidx;
13855 
13856     if (argidx < ctx->arglen) {
13857         ctx->argidx++;
13858         if (ctx->arglen < 0)
13859             return ctx->args;
13860         else
13861             return PyTuple_GetItem(ctx->args, argidx);
13862     }
13863     PyErr_SetString(PyExc_TypeError,
13864                     "not enough arguments for format string");
13865     return NULL;
13866 }
13867 
13868 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
13869 
13870 /* Format a float into the writer if the writer is not NULL, or into *p_output
13871    otherwise.
13872 
13873    Return 0 on success, raise an exception and return -1 on error. */
13874 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)13875 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13876             PyObject **p_output,
13877             _PyUnicodeWriter *writer)
13878 {
13879     char *p;
13880     double x;
13881     Py_ssize_t len;
13882     int prec;
13883     int dtoa_flags = 0;
13884 
13885     x = PyFloat_AsDouble(v);
13886     if (x == -1.0 && PyErr_Occurred())
13887         return -1;
13888 
13889     prec = arg->prec;
13890     if (prec < 0)
13891         prec = 6;
13892 
13893     if (arg->flags & F_ALT)
13894         dtoa_flags |= Py_DTSF_ALT;
13895     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13896     if (p == NULL)
13897         return -1;
13898     len = strlen(p);
13899     if (writer) {
13900         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13901             PyMem_Free(p);
13902             return -1;
13903         }
13904     }
13905     else
13906         *p_output = _PyUnicode_FromASCII(p, len);
13907     PyMem_Free(p);
13908     return 0;
13909 }
13910 
13911 /* formatlong() emulates the format codes d, u, o, x and X, and
13912  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13913  * Python's regular ints.
13914  * Return value:  a new PyUnicodeObject*, or NULL if error.
13915  *     The output string is of the form
13916  *         "-"? ("0x" | "0X")? digit+
13917  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13918  *         set in flags.  The case of hex digits will be correct,
13919  *     There will be at least prec digits, zero-filled on the left if
13920  *         necessary to get that many.
13921  * val          object to be converted
13922  * flags        bitmask of format flags; only F_ALT is looked at
13923  * prec         minimum number of digits; 0-fill on left if needed
13924  * type         a character in [duoxX]; u acts the same as d
13925  *
13926  * CAUTION:  o, x and X conversions on regular ints can never
13927  * produce a '-' sign, but can for Python's unbounded ints.
13928  */
13929 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)13930 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
13931 {
13932     PyObject *result = NULL;
13933     char *buf;
13934     Py_ssize_t i;
13935     int sign;           /* 1 if '-', else 0 */
13936     int len;            /* number of characters */
13937     Py_ssize_t llen;
13938     int numdigits;      /* len == numnondigits + numdigits */
13939     int numnondigits = 0;
13940 
13941     /* Avoid exceeding SSIZE_T_MAX */
13942     if (prec > INT_MAX-3) {
13943         PyErr_SetString(PyExc_OverflowError,
13944                         "precision too large");
13945         return NULL;
13946     }
13947 
13948     assert(PyLong_Check(val));
13949 
13950     switch (type) {
13951     default:
13952         Py_UNREACHABLE();
13953     case 'd':
13954     case 'i':
13955     case 'u':
13956         /* int and int subclasses should print numerically when a numeric */
13957         /* format code is used (see issue18780) */
13958         result = PyNumber_ToBase(val, 10);
13959         break;
13960     case 'o':
13961         numnondigits = 2;
13962         result = PyNumber_ToBase(val, 8);
13963         break;
13964     case 'x':
13965     case 'X':
13966         numnondigits = 2;
13967         result = PyNumber_ToBase(val, 16);
13968         break;
13969     }
13970     if (!result)
13971         return NULL;
13972 
13973     assert(unicode_modifiable(result));
13974     assert(PyUnicode_IS_ASCII(result));
13975 
13976     /* To modify the string in-place, there can only be one reference. */
13977     if (Py_REFCNT(result) != 1) {
13978         Py_DECREF(result);
13979         PyErr_BadInternalCall();
13980         return NULL;
13981     }
13982     buf = PyUnicode_DATA(result);
13983     llen = PyUnicode_GET_LENGTH(result);
13984     if (llen > INT_MAX) {
13985         Py_DECREF(result);
13986         PyErr_SetString(PyExc_ValueError,
13987                         "string too large in _PyUnicode_FormatLong");
13988         return NULL;
13989     }
13990     len = (int)llen;
13991     sign = buf[0] == '-';
13992     numnondigits += sign;
13993     numdigits = len - numnondigits;
13994     assert(numdigits > 0);
13995 
13996     /* Get rid of base marker unless F_ALT */
13997     if (((alt) == 0 &&
13998         (type == 'o' || type == 'x' || type == 'X'))) {
13999         assert(buf[sign] == '0');
14000         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14001                buf[sign+1] == 'o');
14002         numnondigits -= 2;
14003         buf += 2;
14004         len -= 2;
14005         if (sign)
14006             buf[0] = '-';
14007         assert(len == numnondigits + numdigits);
14008         assert(numdigits > 0);
14009     }
14010 
14011     /* Fill with leading zeroes to meet minimum width. */
14012     if (prec > numdigits) {
14013         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14014                                 numnondigits + prec);
14015         char *b1;
14016         if (!r1) {
14017             Py_DECREF(result);
14018             return NULL;
14019         }
14020         b1 = PyBytes_AS_STRING(r1);
14021         for (i = 0; i < numnondigits; ++i)
14022             *b1++ = *buf++;
14023         for (i = 0; i < prec - numdigits; i++)
14024             *b1++ = '0';
14025         for (i = 0; i < numdigits; i++)
14026             *b1++ = *buf++;
14027         *b1 = '\0';
14028         Py_SETREF(result, r1);
14029         buf = PyBytes_AS_STRING(result);
14030         len = numnondigits + prec;
14031     }
14032 
14033     /* Fix up case for hex conversions. */
14034     if (type == 'X') {
14035         /* Need to convert all lower case letters to upper case.
14036            and need to convert 0x to 0X (and -0x to -0X). */
14037         for (i = 0; i < len; i++)
14038             if (buf[i] >= 'a' && buf[i] <= 'x')
14039                 buf[i] -= 'a'-'A';
14040     }
14041     if (!PyUnicode_Check(result)
14042         || buf != PyUnicode_DATA(result)) {
14043         PyObject *unicode;
14044         unicode = _PyUnicode_FromASCII(buf, len);
14045         Py_SETREF(result, unicode);
14046     }
14047     else if (len != PyUnicode_GET_LENGTH(result)) {
14048         if (PyUnicode_Resize(&result, len) < 0)
14049             Py_CLEAR(result);
14050     }
14051     return result;
14052 }
14053 
14054 /* Format an integer or a float as an integer.
14055  * Return 1 if the number has been formatted into the writer,
14056  *        0 if the number has been formatted into *p_output
14057  *       -1 and raise an exception on error */
14058 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14059 mainformatlong(PyObject *v,
14060                struct unicode_format_arg_t *arg,
14061                PyObject **p_output,
14062                _PyUnicodeWriter *writer)
14063 {
14064     PyObject *iobj, *res;
14065     char type = (char)arg->ch;
14066 
14067     if (!PyNumber_Check(v))
14068         goto wrongtype;
14069 
14070     /* make sure number is a type of integer for o, x, and X */
14071     if (!PyLong_Check(v)) {
14072         if (type == 'o' || type == 'x' || type == 'X') {
14073             iobj = _PyNumber_Index(v);
14074         }
14075         else {
14076             iobj = PyNumber_Long(v);
14077         }
14078         if (iobj == NULL ) {
14079             if (PyErr_ExceptionMatches(PyExc_TypeError))
14080                 goto wrongtype;
14081             return -1;
14082         }
14083         assert(PyLong_Check(iobj));
14084     }
14085     else {
14086         iobj = Py_NewRef(v);
14087     }
14088 
14089     if (PyLong_CheckExact(v)
14090         && arg->width == -1 && arg->prec == -1
14091         && !(arg->flags & (F_SIGN | F_BLANK))
14092         && type != 'X')
14093     {
14094         /* Fast path */
14095         int alternate = arg->flags & F_ALT;
14096         int base;
14097 
14098         switch(type)
14099         {
14100             default:
14101                 Py_UNREACHABLE();
14102             case 'd':
14103             case 'i':
14104             case 'u':
14105                 base = 10;
14106                 break;
14107             case 'o':
14108                 base = 8;
14109                 break;
14110             case 'x':
14111             case 'X':
14112                 base = 16;
14113                 break;
14114         }
14115 
14116         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14117             Py_DECREF(iobj);
14118             return -1;
14119         }
14120         Py_DECREF(iobj);
14121         return 1;
14122     }
14123 
14124     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14125     Py_DECREF(iobj);
14126     if (res == NULL)
14127         return -1;
14128     *p_output = res;
14129     return 0;
14130 
14131 wrongtype:
14132     switch(type)
14133     {
14134         case 'o':
14135         case 'x':
14136         case 'X':
14137             PyErr_Format(PyExc_TypeError,
14138                     "%%%c format: an integer is required, "
14139                     "not %.200s",
14140                     type, Py_TYPE(v)->tp_name);
14141             break;
14142         default:
14143             PyErr_Format(PyExc_TypeError,
14144                     "%%%c format: a real number is required, "
14145                     "not %.200s",
14146                     type, Py_TYPE(v)->tp_name);
14147             break;
14148     }
14149     return -1;
14150 }
14151 
14152 static Py_UCS4
formatchar(PyObject * v)14153 formatchar(PyObject *v)
14154 {
14155     /* presume that the buffer is at least 3 characters long */
14156     if (PyUnicode_Check(v)) {
14157         if (PyUnicode_GET_LENGTH(v) == 1) {
14158             return PyUnicode_READ_CHAR(v, 0);
14159         }
14160         goto onError;
14161     }
14162     else {
14163         int overflow;
14164         long x = PyLong_AsLongAndOverflow(v, &overflow);
14165         if (x == -1 && PyErr_Occurred()) {
14166             if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14167                 goto onError;
14168             }
14169             return (Py_UCS4) -1;
14170         }
14171 
14172         if (x < 0 || x > MAX_UNICODE) {
14173             /* this includes an overflow in converting to C long */
14174             PyErr_SetString(PyExc_OverflowError,
14175                             "%c arg not in range(0x110000)");
14176             return (Py_UCS4) -1;
14177         }
14178 
14179         return (Py_UCS4) x;
14180     }
14181 
14182   onError:
14183     PyErr_SetString(PyExc_TypeError,
14184                     "%c requires int or char");
14185     return (Py_UCS4) -1;
14186 }
14187 
14188 /* Parse options of an argument: flags, width, precision.
14189    Handle also "%(name)" syntax.
14190 
14191    Return 0 if the argument has been formatted into arg->str.
14192    Return 1 if the argument has been written into ctx->writer,
14193    Raise an exception and return -1 on error. */
14194 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14195 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14196                          struct unicode_format_arg_t *arg)
14197 {
14198 #define FORMAT_READ(ctx) \
14199         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14200 
14201     PyObject *v;
14202 
14203     if (arg->ch == '(') {
14204         /* Get argument value from a dictionary. Example: "%(name)s". */
14205         Py_ssize_t keystart;
14206         Py_ssize_t keylen;
14207         PyObject *key;
14208         int pcount = 1;
14209 
14210         if (ctx->dict == NULL) {
14211             PyErr_SetString(PyExc_TypeError,
14212                             "format requires a mapping");
14213             return -1;
14214         }
14215         ++ctx->fmtpos;
14216         --ctx->fmtcnt;
14217         keystart = ctx->fmtpos;
14218         /* Skip over balanced parentheses */
14219         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14220             arg->ch = FORMAT_READ(ctx);
14221             if (arg->ch == ')')
14222                 --pcount;
14223             else if (arg->ch == '(')
14224                 ++pcount;
14225             ctx->fmtpos++;
14226         }
14227         keylen = ctx->fmtpos - keystart - 1;
14228         if (ctx->fmtcnt < 0 || pcount > 0) {
14229             PyErr_SetString(PyExc_ValueError,
14230                             "incomplete format key");
14231             return -1;
14232         }
14233         key = PyUnicode_Substring(ctx->fmtstr,
14234                                   keystart, keystart + keylen);
14235         if (key == NULL)
14236             return -1;
14237         if (ctx->args_owned) {
14238             ctx->args_owned = 0;
14239             Py_DECREF(ctx->args);
14240         }
14241         ctx->args = PyObject_GetItem(ctx->dict, key);
14242         Py_DECREF(key);
14243         if (ctx->args == NULL)
14244             return -1;
14245         ctx->args_owned = 1;
14246         ctx->arglen = -1;
14247         ctx->argidx = -2;
14248     }
14249 
14250     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14251     while (--ctx->fmtcnt >= 0) {
14252         arg->ch = FORMAT_READ(ctx);
14253         ctx->fmtpos++;
14254         switch (arg->ch) {
14255         case '-': arg->flags |= F_LJUST; continue;
14256         case '+': arg->flags |= F_SIGN; continue;
14257         case ' ': arg->flags |= F_BLANK; continue;
14258         case '#': arg->flags |= F_ALT; continue;
14259         case '0': arg->flags |= F_ZERO; continue;
14260         }
14261         break;
14262     }
14263 
14264     /* Parse width. Example: "%10s" => width=10 */
14265     if (arg->ch == '*') {
14266         v = unicode_format_getnextarg(ctx);
14267         if (v == NULL)
14268             return -1;
14269         if (!PyLong_Check(v)) {
14270             PyErr_SetString(PyExc_TypeError,
14271                             "* wants int");
14272             return -1;
14273         }
14274         arg->width = PyLong_AsSsize_t(v);
14275         if (arg->width == -1 && PyErr_Occurred())
14276             return -1;
14277         if (arg->width < 0) {
14278             arg->flags |= F_LJUST;
14279             arg->width = -arg->width;
14280         }
14281         if (--ctx->fmtcnt >= 0) {
14282             arg->ch = FORMAT_READ(ctx);
14283             ctx->fmtpos++;
14284         }
14285     }
14286     else if (arg->ch >= '0' && arg->ch <= '9') {
14287         arg->width = arg->ch - '0';
14288         while (--ctx->fmtcnt >= 0) {
14289             arg->ch = FORMAT_READ(ctx);
14290             ctx->fmtpos++;
14291             if (arg->ch < '0' || arg->ch > '9')
14292                 break;
14293             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14294                mixing signed and unsigned comparison. Since arg->ch is between
14295                '0' and '9', casting to int is safe. */
14296             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14297                 PyErr_SetString(PyExc_ValueError,
14298                                 "width too big");
14299                 return -1;
14300             }
14301             arg->width = arg->width*10 + (arg->ch - '0');
14302         }
14303     }
14304 
14305     /* Parse precision. Example: "%.3f" => prec=3 */
14306     if (arg->ch == '.') {
14307         arg->prec = 0;
14308         if (--ctx->fmtcnt >= 0) {
14309             arg->ch = FORMAT_READ(ctx);
14310             ctx->fmtpos++;
14311         }
14312         if (arg->ch == '*') {
14313             v = unicode_format_getnextarg(ctx);
14314             if (v == NULL)
14315                 return -1;
14316             if (!PyLong_Check(v)) {
14317                 PyErr_SetString(PyExc_TypeError,
14318                                 "* wants int");
14319                 return -1;
14320             }
14321             arg->prec = PyLong_AsInt(v);
14322             if (arg->prec == -1 && PyErr_Occurred())
14323                 return -1;
14324             if (arg->prec < 0)
14325                 arg->prec = 0;
14326             if (--ctx->fmtcnt >= 0) {
14327                 arg->ch = FORMAT_READ(ctx);
14328                 ctx->fmtpos++;
14329             }
14330         }
14331         else if (arg->ch >= '0' && arg->ch <= '9') {
14332             arg->prec = arg->ch - '0';
14333             while (--ctx->fmtcnt >= 0) {
14334                 arg->ch = FORMAT_READ(ctx);
14335                 ctx->fmtpos++;
14336                 if (arg->ch < '0' || arg->ch > '9')
14337                     break;
14338                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14339                     PyErr_SetString(PyExc_ValueError,
14340                                     "precision too big");
14341                     return -1;
14342                 }
14343                 arg->prec = arg->prec*10 + (arg->ch - '0');
14344             }
14345         }
14346     }
14347 
14348     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14349     if (ctx->fmtcnt >= 0) {
14350         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14351             if (--ctx->fmtcnt >= 0) {
14352                 arg->ch = FORMAT_READ(ctx);
14353                 ctx->fmtpos++;
14354             }
14355         }
14356     }
14357     if (ctx->fmtcnt < 0) {
14358         PyErr_SetString(PyExc_ValueError,
14359                         "incomplete format");
14360         return -1;
14361     }
14362     return 0;
14363 
14364 #undef FORMAT_READ
14365 }
14366 
14367 /* Format one argument. Supported conversion specifiers:
14368 
14369    - "s", "r", "a": any type
14370    - "i", "d", "u": int or float
14371    - "o", "x", "X": int
14372    - "e", "E", "f", "F", "g", "G": float
14373    - "c": int or str (1 character)
14374 
14375    When possible, the output is written directly into the Unicode writer
14376    (ctx->writer). A string is created when padding is required.
14377 
14378    Return 0 if the argument has been formatted into *p_str,
14379           1 if the argument has been written into ctx->writer,
14380          -1 on error. */
14381 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14382 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14383                           struct unicode_format_arg_t *arg,
14384                           PyObject **p_str)
14385 {
14386     PyObject *v;
14387     _PyUnicodeWriter *writer = &ctx->writer;
14388 
14389     if (ctx->fmtcnt == 0)
14390         ctx->writer.overallocate = 0;
14391 
14392     v = unicode_format_getnextarg(ctx);
14393     if (v == NULL)
14394         return -1;
14395 
14396 
14397     switch (arg->ch) {
14398     case 's':
14399     case 'r':
14400     case 'a':
14401         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14402             /* Fast path */
14403             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14404                 return -1;
14405             return 1;
14406         }
14407 
14408         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14409             *p_str = Py_NewRef(v);
14410         }
14411         else {
14412             if (arg->ch == 's')
14413                 *p_str = PyObject_Str(v);
14414             else if (arg->ch == 'r')
14415                 *p_str = PyObject_Repr(v);
14416             else
14417                 *p_str = PyObject_ASCII(v);
14418         }
14419         break;
14420 
14421     case 'i':
14422     case 'd':
14423     case 'u':
14424     case 'o':
14425     case 'x':
14426     case 'X':
14427     {
14428         int ret = mainformatlong(v, arg, p_str, writer);
14429         if (ret != 0)
14430             return ret;
14431         arg->sign = 1;
14432         break;
14433     }
14434 
14435     case 'e':
14436     case 'E':
14437     case 'f':
14438     case 'F':
14439     case 'g':
14440     case 'G':
14441         if (arg->width == -1 && arg->prec == -1
14442             && !(arg->flags & (F_SIGN | F_BLANK)))
14443         {
14444             /* Fast path */
14445             if (formatfloat(v, arg, NULL, writer) == -1)
14446                 return -1;
14447             return 1;
14448         }
14449 
14450         arg->sign = 1;
14451         if (formatfloat(v, arg, p_str, NULL) == -1)
14452             return -1;
14453         break;
14454 
14455     case 'c':
14456     {
14457         Py_UCS4 ch = formatchar(v);
14458         if (ch == (Py_UCS4) -1)
14459             return -1;
14460         if (arg->width == -1 && arg->prec == -1) {
14461             /* Fast path */
14462             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14463                 return -1;
14464             return 1;
14465         }
14466         *p_str = PyUnicode_FromOrdinal(ch);
14467         break;
14468     }
14469 
14470     default:
14471         PyErr_Format(PyExc_ValueError,
14472                      "unsupported format character '%c' (0x%x) "
14473                      "at index %zd",
14474                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14475                      (int)arg->ch,
14476                      ctx->fmtpos - 1);
14477         return -1;
14478     }
14479     if (*p_str == NULL)
14480         return -1;
14481     assert (PyUnicode_Check(*p_str));
14482     return 0;
14483 }
14484 
/* Write the already-formatted string str into ctx->writer, applying the
   options held in arg: precision truncation for "s", "r" and "a", sign
   handling for conversions with arg->sign set, the "0x"/"0X"/"0o" prefix
   of the alternate form, and padding up to arg->width.

   arg->width and arg->sign are modified along the way.

   Return 0 on success, -1 if a writer call fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    int kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation, no forced sign */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: take it out of the body */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        /* Account for the fill character only if left padding is written */
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* len does not count the sign character: make room for it */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            /* With zero fill the sign goes in front of the zeros */
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes up one column of the field */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            /* With zero fill the prefix goes in front of the zeros */
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is written on its own: remove it from width and len */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left for the right padding below */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        /* Right padding always uses spaces, even when fill is '0' */
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14636 
14637 /* Helper of PyUnicode_Format(): format one arg.
14638    Return 0 on success, raise an exception and return -1 on error. */
14639 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14640 unicode_format_arg(struct unicode_formatter_t *ctx)
14641 {
14642     struct unicode_format_arg_t arg;
14643     PyObject *str;
14644     int ret;
14645 
14646     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14647     if (arg.ch == '%') {
14648         ctx->fmtpos++;
14649         ctx->fmtcnt--;
14650         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14651             return -1;
14652         return 0;
14653     }
14654     arg.flags = 0;
14655     arg.width = -1;
14656     arg.prec = -1;
14657     arg.sign = 0;
14658     str = NULL;
14659 
14660     ret = unicode_format_arg_parse(ctx, &arg);
14661     if (ret == -1)
14662         return -1;
14663 
14664     ret = unicode_format_arg_format(ctx, &arg, &str);
14665     if (ret == -1)
14666         return -1;
14667 
14668     if (ret != 1) {
14669         ret = unicode_format_arg_output(ctx, &arg, str);
14670         Py_DECREF(str);
14671         if (ret == -1)
14672             return -1;
14673     }
14674 
14675     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14676         PyErr_SetString(PyExc_TypeError,
14677                         "not all arguments converted during string formatting");
14678         return -1;
14679     }
14680     return 0;
14681 }
14682 
14683 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14684 PyUnicode_Format(PyObject *format, PyObject *args)
14685 {
14686     struct unicode_formatter_t ctx;
14687 
14688     if (format == NULL || args == NULL) {
14689         PyErr_BadInternalCall();
14690         return NULL;
14691     }
14692 
14693     if (ensure_unicode(format) < 0)
14694         return NULL;
14695 
14696     ctx.fmtstr = format;
14697     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14698     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14699     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14700     ctx.fmtpos = 0;
14701 
14702     _PyUnicodeWriter_Init(&ctx.writer);
14703     ctx.writer.min_length = ctx.fmtcnt + 100;
14704     ctx.writer.overallocate = 1;
14705 
14706     if (PyTuple_Check(args)) {
14707         ctx.arglen = PyTuple_Size(args);
14708         ctx.argidx = 0;
14709     }
14710     else {
14711         ctx.arglen = -1;
14712         ctx.argidx = -2;
14713     }
14714     ctx.args_owned = 0;
14715     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14716         ctx.dict = args;
14717     else
14718         ctx.dict = NULL;
14719     ctx.args = args;
14720 
14721     while (--ctx.fmtcnt >= 0) {
14722         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14723             Py_ssize_t nonfmtpos;
14724 
14725             nonfmtpos = ctx.fmtpos++;
14726             while (ctx.fmtcnt >= 0 &&
14727                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14728                 ctx.fmtpos++;
14729                 ctx.fmtcnt--;
14730             }
14731             if (ctx.fmtcnt < 0) {
14732                 ctx.fmtpos--;
14733                 ctx.writer.overallocate = 0;
14734             }
14735 
14736             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14737                                                 nonfmtpos, ctx.fmtpos) < 0)
14738                 goto onError;
14739         }
14740         else {
14741             ctx.fmtpos++;
14742             if (unicode_format_arg(&ctx) == -1)
14743                 goto onError;
14744         }
14745     }
14746 
14747     if (ctx.argidx < ctx.arglen && !ctx.dict) {
14748         PyErr_SetString(PyExc_TypeError,
14749                         "not all arguments converted during string formatting");
14750         goto onError;
14751     }
14752 
14753     if (ctx.args_owned) {
14754         Py_DECREF(ctx.args);
14755     }
14756     return _PyUnicodeWriter_Finish(&ctx.writer);
14757 
14758   onError:
14759     _PyUnicodeWriter_Dealloc(&ctx.writer);
14760     if (ctx.args_owned) {
14761         Py_DECREF(ctx.args);
14762     }
14763     return NULL;
14764 }
14765 
14766 static PyObject *
14767 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
14768 
14769 /*[clinic input]
14770 @classmethod
14771 str.__new__ as unicode_new
14772 
14773     object as x: object = NULL
14774     encoding: str = NULL
14775     errors: str = NULL
14776 
14777 [clinic start generated code]*/
14778 
14779 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)14780 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
14781                  const char *errors)
14782 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
14783 {
14784     PyObject *unicode;
14785     if (x == NULL) {
14786         unicode = unicode_get_empty();
14787     }
14788     else if (encoding == NULL && errors == NULL) {
14789         unicode = PyObject_Str(x);
14790     }
14791     else {
14792         unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
14793     }
14794 
14795     if (unicode != NULL && type != &PyUnicode_Type) {
14796         Py_SETREF(unicode, unicode_subtype_new(type, unicode));
14797     }
14798     return unicode;
14799 }
14800 
14801 static const char *
arg_as_utf8(PyObject * obj,const char * name)14802 arg_as_utf8(PyObject *obj, const char *name)
14803 {
14804     if (!PyUnicode_Check(obj)) {
14805         PyErr_Format(PyExc_TypeError,
14806                      "str() argument '%s' must be str, not %T",
14807                      name, obj);
14808         return NULL;
14809     }
14810     return _PyUnicode_AsUTF8NoNUL(obj);
14811 }
14812 
14813 static PyObject *
unicode_vectorcall(PyObject * type,PyObject * const * args,size_t nargsf,PyObject * kwnames)14814 unicode_vectorcall(PyObject *type, PyObject *const *args,
14815                    size_t nargsf, PyObject *kwnames)
14816 {
14817     assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
14818 
14819     Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
14820     if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
14821         // Fallback to unicode_new()
14822         PyObject *tuple = _PyTuple_FromArray(args, nargs);
14823         if (tuple == NULL) {
14824             return NULL;
14825         }
14826         PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
14827         if (dict == NULL) {
14828             Py_DECREF(tuple);
14829             return NULL;
14830         }
14831         PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
14832         Py_DECREF(tuple);
14833         Py_DECREF(dict);
14834         return ret;
14835     }
14836     if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
14837         return NULL;
14838     }
14839     if (nargs == 0) {
14840         return unicode_get_empty();
14841     }
14842     PyObject *object = args[0];
14843     if (nargs == 1) {
14844         return PyObject_Str(object);
14845     }
14846     const char *encoding = arg_as_utf8(args[1], "encoding");
14847     if (encoding == NULL) {
14848         return NULL;
14849     }
14850     const char *errors = NULL;
14851     if (nargs == 3) {
14852         errors = arg_as_utf8(args[2], "errors");
14853         if (errors == NULL) {
14854             return NULL;
14855         }
14856     }
14857     return PyUnicode_FromEncodedObject(object, encoding, errors);
14858 }
14859 
14860 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)14861 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
14862 {
14863     PyObject *self;
14864     Py_ssize_t length, char_size;
14865     int share_utf8;
14866     int kind;
14867     void *data;
14868 
14869     assert(PyType_IsSubtype(type, &PyUnicode_Type));
14870     assert(_PyUnicode_CHECK(unicode));
14871 
14872     self = type->tp_alloc(type, 0);
14873     if (self == NULL) {
14874         return NULL;
14875     }
14876     kind = PyUnicode_KIND(unicode);
14877     length = PyUnicode_GET_LENGTH(unicode);
14878 
14879     _PyUnicode_LENGTH(self) = length;
14880 #ifdef Py_DEBUG
14881     _PyUnicode_HASH(self) = -1;
14882 #else
14883     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14884 #endif
14885     _PyUnicode_STATE(self).interned = 0;
14886     _PyUnicode_STATE(self).kind = kind;
14887     _PyUnicode_STATE(self).compact = 0;
14888     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14889     _PyUnicode_STATE(self).statically_allocated = 0;
14890     _PyUnicode_UTF8_LENGTH(self) = 0;
14891     _PyUnicode_UTF8(self) = NULL;
14892     _PyUnicode_DATA_ANY(self) = NULL;
14893 
14894     share_utf8 = 0;
14895     if (kind == PyUnicode_1BYTE_KIND) {
14896         char_size = 1;
14897         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14898             share_utf8 = 1;
14899     }
14900     else if (kind == PyUnicode_2BYTE_KIND) {
14901         char_size = 2;
14902     }
14903     else {
14904         assert(kind == PyUnicode_4BYTE_KIND);
14905         char_size = 4;
14906     }
14907 
14908     /* Ensure we won't overflow the length. */
14909     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14910         PyErr_NoMemory();
14911         goto onError;
14912     }
14913     data = PyMem_Malloc((length + 1) * char_size);
14914     if (data == NULL) {
14915         PyErr_NoMemory();
14916         goto onError;
14917     }
14918 
14919     _PyUnicode_DATA_ANY(self) = data;
14920     if (share_utf8) {
14921         _PyUnicode_UTF8_LENGTH(self) = length;
14922         _PyUnicode_UTF8(self) = data;
14923     }
14924 
14925     memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
14926     assert(_PyUnicode_CheckConsistency(self, 1));
14927 #ifdef Py_DEBUG
14928     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14929 #endif
14930     return self;
14931 
14932 onError:
14933     Py_DECREF(self);
14934     return NULL;
14935 }
14936 
/* Deallocate `op` through unicode_dealloc().
 * The caller must pass an exact str instance (asserted, so only checked in
 * builds where assert() is active). */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
14943 
14944 PyDoc_STRVAR(unicode_doc,
14945 "str(object='') -> str\n\
14946 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14947 \n\
14948 Create a new string object from the given object. If encoding or\n\
14949 errors is specified, then the object must expose a data buffer\n\
14950 that will be decoded using the given encoding and error handler.\n\
14951 Otherwise, returns the result of object.__str__() (if defined)\n\
14952 or repr(object).\n\
14953 encoding defaults to 'utf-8'.\n\
14954 errors defaults to 'strict'.");
14955 
14956 static PyObject *unicode_iter(PyObject *seq);
14957 
14958 PyTypeObject PyUnicode_Type = {
14959     PyVarObject_HEAD_INIT(&PyType_Type, 0)
14960     "str",                        /* tp_name */
14961     sizeof(PyUnicodeObject),      /* tp_basicsize */
14962     0,                            /* tp_itemsize */
14963     /* Slots */
14964     (destructor)unicode_dealloc,  /* tp_dealloc */
14965     0,                            /* tp_vectorcall_offset */
14966     0,                            /* tp_getattr */
14967     0,                            /* tp_setattr */
14968     0,                            /* tp_as_async */
14969     unicode_repr,                 /* tp_repr */
14970     &unicode_as_number,           /* tp_as_number */
14971     &unicode_as_sequence,         /* tp_as_sequence */
14972     &unicode_as_mapping,          /* tp_as_mapping */
14973     (hashfunc) unicode_hash,      /* tp_hash*/
14974     0,                            /* tp_call*/
14975     (reprfunc) unicode_str,       /* tp_str */
14976     PyObject_GenericGetAttr,      /* tp_getattro */
14977     0,                            /* tp_setattro */
14978     0,                            /* tp_as_buffer */
14979     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14980         Py_TPFLAGS_UNICODE_SUBCLASS |
14981         _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14982     unicode_doc,                  /* tp_doc */
14983     0,                            /* tp_traverse */
14984     0,                            /* tp_clear */
14985     PyUnicode_RichCompare,        /* tp_richcompare */
14986     0,                            /* tp_weaklistoffset */
14987     unicode_iter,                 /* tp_iter */
14988     0,                            /* tp_iternext */
14989     unicode_methods,              /* tp_methods */
14990     0,                            /* tp_members */
14991     0,                            /* tp_getset */
14992     0,                            /* tp_base */
14993     0,                            /* tp_dict */
14994     0,                            /* tp_descr_get */
14995     0,                            /* tp_descr_set */
14996     0,                            /* tp_dictoffset */
14997     0,                            /* tp_init */
14998     0,                            /* tp_alloc */
14999     unicode_new,                  /* tp_new */
15000     PyObject_Del,                 /* tp_free */
15001     .tp_vectorcall = unicode_vectorcall,
15002 };
15003 
15004 /* Initialize the Unicode implementation */
15005 
15006 static void
_init_global_state(void)15007 _init_global_state(void)
15008 {
15009     static int initialized = 0;
15010     if (initialized) {
15011         return;
15012     }
15013     initialized = 1;
15014 
15015     /* initialize the linebreak bloom filter */
15016     const Py_UCS2 linebreak[] = {
15017         0x000A, /* LINE FEED */
15018         0x000D, /* CARRIAGE RETURN */
15019         0x001C, /* FILE SEPARATOR */
15020         0x001D, /* GROUP SEPARATOR */
15021         0x001E, /* RECORD SEPARATOR */
15022         0x0085, /* NEXT LINE */
15023         0x2028, /* LINE SEPARATOR */
15024         0x2029, /* PARAGRAPH SEPARATOR */
15025     };
15026     bloom_linebreak = make_bloom_mask(
15027         PyUnicode_2BYTE_KIND, linebreak,
15028         Py_ARRAY_LENGTH(linebreak));
15029 }
15030 
15031 void
_PyUnicode_InitState(PyInterpreterState * interp)15032 _PyUnicode_InitState(PyInterpreterState *interp)
15033 {
15034     if (!_Py_IsMainInterpreter(interp)) {
15035         return;
15036     }
15037     _init_global_state();
15038 }
15039 
15040 
15041 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15042 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15043 {
15044     if (_Py_IsMainInterpreter(interp)) {
15045         PyStatus status = init_global_interned_strings(interp);
15046         if (_PyStatus_EXCEPTION(status)) {
15047             return status;
15048         }
15049     }
15050     assert(INTERNED_STRINGS);
15051 
15052     if (init_interned_dict(interp)) {
15053         PyErr_Clear();
15054         return _PyStatus_ERR("failed to create interned dict");
15055     }
15056 
15057     return _PyStatus_OK();
15058 }
15059 
15060 
15061 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15062 _PyUnicode_InitTypes(PyInterpreterState *interp)
15063 {
15064     if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15065         goto error;
15066     }
15067     if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15068         goto error;
15069     }
15070     if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15071         goto error;
15072     }
15073     return _PyStatus_OK();
15074 
15075 error:
15076     return _PyStatus_ERR("Can't initialize unicode types");
15077 }
15078 
15079 static /* non-null */ PyObject*
intern_static(PyInterpreterState * interp,PyObject * s)15080 intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15081 {
15082     // Note that this steals a reference to `s`, but in many cases that
15083     // stolen ref is returned, requiring no decref/incref.
15084 
15085     assert(s != NULL);
15086     assert(_PyUnicode_CHECK(s));
15087     assert(_PyUnicode_STATE(s).statically_allocated);
15088     assert(!PyUnicode_CHECK_INTERNED(s));
15089 
15090 #ifdef Py_DEBUG
15091     /* We must not add process-global interned string if there's already a
15092      * per-interpreter interned_dict, which might contain duplicates.
15093      */
15094     PyObject *interned = get_interned_dict(interp);
15095     assert(interned == NULL);
15096 #endif
15097 
15098     /* Look in the global cache first. */
15099     PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15100     /* We should only init each string once */
15101     assert(r == NULL);
15102     /* but just in case (for the non-debug build), handle this */
15103     if (r != NULL && r != s) {
15104         assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15105         assert(_PyUnicode_CHECK(r));
15106         Py_DECREF(s);
15107         return Py_NewRef(r);
15108     }
15109 
15110     if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15111         Py_FatalError("failed to intern static string");
15112     }
15113 
15114     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15115     return s;
15116 }
15117 
15118 void
_PyUnicode_InternStatic(PyInterpreterState * interp,PyObject ** p)15119 _PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15120 {
15121     // This should only be called as part of runtime initialization
15122     assert(!Py_IsInitialized());
15123 
15124     *p = intern_static(interp, *p);
15125     assert(*p);
15126 }
15127 
15128 static void
immortalize_interned(PyObject * s)15129 immortalize_interned(PyObject *s)
15130 {
15131     assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15132     assert(!_Py_IsImmortal(s));
15133 #ifdef Py_REF_DEBUG
15134     /* The reference count value should be excluded from the RefTotal.
15135        The decrements to these objects will not be registered so they
15136        need to be accounted for in here. */
15137     for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15138         _Py_DecRefTotal(_PyThreadState_GET());
15139     }
15140 #endif
15141     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL;
15142     _Py_SetImmortal(s);
15143 }
15144 
/* Intern `s` in the interned dict of `interp`.
 *
 * Steals a reference to `s` and returns a reference to the interned string,
 * which is either `s` itself or an equal string that was already interned.
 * If `immortalize` is true the resulting string is also made immortal.
 *
 * `s` is returned unchanged when:
 *   - it is NULL or not a str (release builds only; debug builds assert),
 *   - it is an instance of a str subclass,
 *   - it is already interned (a mortal interned string is first made
 *     immortal if `immortalize` is set),
 *   - inserting it into the interned dict fails (the error is cleared and
 *     `s` stays un-interned).
 *
 * One-character 1-byte strings resolve to the LATIN1() singleton, and
 * strings found in INTERNED_STRINGS resolve to that global entry; in both
 * cases the reference to `s` is released.
 */
static /* non-null */ PyObject*
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
              bool immortalize)
{
    // Note that this steals a reference to `s`, but in many cases that
    // stolen ref is returned, requiring no decref/incref.

#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s)) {
        return s;
    }
#endif

    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s)) {
        return s;
    }

    /* Is it already interned? */
    switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            // no, go on
            break;
        case SSTATE_INTERNED_MORTAL:
            // yes but we might need to make it immortal
            if (immortalize) {
                immortalize_interned(s);
            }
            return s;
        default:
            // all done
            return s;
    }

    /* Statically allocated strings must be already interned. */
    assert(!_PyUnicode_STATE(s).statically_allocated);

#if Py_GIL_DISABLED
    /* In the free-threaded build, all interned strings are immortal */
    immortalize = 1;
#endif

    /* If it's already immortal, intern it as such */
    if (_Py_IsImmortal(s)) {
        immortalize = 1;
    }

    /* if it's a short string, get the singleton */
    if (PyUnicode_GET_LENGTH(s) == 1 &&
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
        assert(PyUnicode_CHECK_INTERNED(r));
        Py_DECREF(s);
        return r;
    }
#ifdef Py_DEBUG
    assert(!unicode_is_singleton(s));
#endif

    /* Look in the global cache now. */
    {
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
        if (r != NULL) {
            assert(_PyUnicode_STATE(r).statically_allocated);
            assert(r != s);  // r must be statically_allocated; s is not
            Py_DECREF(s);
            return Py_NewRef(r);
        }
    }

    /* Do a setdefault on the per-interpreter cache. */
    PyObject *interned = get_interned_dict(interp);
    assert(interned != NULL);

    PyObject *t;
    {
        /* `s` is used as both key and value; `t` receives the value that
           ends up stored under that key. */
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
        if (res < 0) {
            PyErr_Clear();
            return s;
        }
        else if (res == 1) {
            // value was already present (not inserted)
            Py_DECREF(s);
            if (immortalize &&
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
                immortalize_interned(t);
            }
            return t;
        }
        else {
            // value was newly inserted
            assert (s == t);
            Py_DECREF(t);
        }
    }

    /* NOT_INTERNED -> INTERNED_MORTAL */

    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);

    if (!_Py_IsImmortal(s)) {
        /* The two references in interned dict (key and value) are not counted.
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
        Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
#ifdef Py_REF_DEBUG
        /* let's be pedantic with the ref total */
        _Py_DecRefTotal(_PyThreadState_GET());
        _Py_DecRefTotal(_PyThreadState_GET());
#endif
    }
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;

    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */

#ifdef Py_DEBUG
    if (_Py_IsImmortal(s)) {
        assert(immortalize);
    }
#endif
    if (immortalize) {
        immortalize_interned(s);
    }

    return s;
}
15275 
15276 void
_PyUnicode_InternImmortal(PyInterpreterState * interp,PyObject ** p)15277 _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
15278 {
15279     *p = intern_common(interp, *p, 1);
15280     assert(*p);
15281 }
15282 
15283 void
_PyUnicode_InternMortal(PyInterpreterState * interp,PyObject ** p)15284 _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
15285 {
15286     *p = intern_common(interp, *p, 0);
15287     assert(*p);
15288 }
15289 
15290 
15291 void
_PyUnicode_InternInPlace(PyInterpreterState * interp,PyObject ** p)15292 _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
15293 {
15294     _PyUnicode_InternImmortal(interp, p);
15295     return;
15296 }
15297 
15298 void
PyUnicode_InternInPlace(PyObject ** p)15299 PyUnicode_InternInPlace(PyObject **p)
15300 {
15301     PyInterpreterState *interp = _PyInterpreterState_GET();
15302     _PyUnicode_InternMortal(interp, p);
15303 }
15304 
15305 // Public-looking name kept for the stable ABI; user should not call this:
15306 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
15307 void
PyUnicode_InternImmortal(PyObject ** p)15308 PyUnicode_InternImmortal(PyObject **p)
15309 {
15310     PyInterpreterState *interp = _PyInterpreterState_GET();
15311     _PyUnicode_InternImmortal(interp, p);
15312 }
15313 
15314 PyObject *
PyUnicode_InternFromString(const char * cp)15315 PyUnicode_InternFromString(const char *cp)
15316 {
15317     PyObject *s = PyUnicode_FromString(cp);
15318     if (s == NULL) {
15319         return NULL;
15320     }
15321     PyInterpreterState *interp = _PyInterpreterState_GET();
15322     _PyUnicode_InternMortal(interp, &s);
15323     return s;
15324 }
15325 
15326 
/* Un-intern every string in the interned dict of `interp` and release the
 * dict.
 *
 * Does nothing if the interpreter has no interned dict.  If the dict is
 * shared (has_shared_intern_dict()), only the interpreter's pointer to it
 * is cleared.  Otherwise each key is visited, its reference count is
 * adjusted according to its interned state, and its state is reset to
 * SSTATE_NOT_INTERNED (static strings are reset only by the main
 * interpreter).  Finally the dict is cleared and, for the main interpreter,
 * the global interned strings are cleared as well.
 */
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
    PyObject *interned = get_interned_dict(interp);
    if (interned == NULL) {
        return;
    }
    assert(PyDict_CheckExact(interned));

    if (has_shared_intern_dict(interp)) {
        // the dict doesn't belong to this interpreter, skip the debug
        // checks on it and just clear the pointer to it
        clear_interned_dict(interp);
        return;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n",
            PyDict_GET_SIZE(interned));

    Py_ssize_t total_length = 0;
#endif
    Py_ssize_t pos = 0;
    PyObject *s, *ignored_value;
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
        assert(PyUnicode_IS_READY(s));
        /* Set when the string's interned state must be left untouched. */
        int shared = 0;
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Make immortal interned strings mortal again.
             *
             * Currently, the runtime is not able to guarantee that it can exit
             * without allocations that carry over to a future initialization
             * of Python within the same process. i.e:
             *   ./python -X showrefcount -c 'import itertools'
             *   [237 refs, 237 blocks]
             *
             * This should remain disabled (`Py_DEBUG` only) until there is a
             * strict guarantee that no memory will be left after
             * `Py_Finalize`.
             */
#ifdef Py_DEBUG
            // Skip the Immortal Instance check and restore
            // the two references (key and value) ignored
            // by PyUnicode_InternInPlace().
            _Py_SetMortal(s, 2);
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_IncRefTotal(_PyThreadState_GET());
            _Py_IncRefTotal(_PyThreadState_GET());
#endif
#ifdef INTERNED_STATS
            total_length += PyUnicode_GET_LENGTH(s);
#endif
#endif // Py_DEBUG
            break;
        case SSTATE_INTERNED_IMMORTAL_STATIC:
            /* It is shared between interpreters, so we should unmark it
               only when this is the last interpreter in which it's
               interned.  We immortalize all the statically initialized
               strings during startup, so we can rely on the
               main interpreter to be the last one. */
            if (!_Py_IsMainInterpreter(interp)) {
                shared = 1;
            }
            break;
        case SSTATE_INTERNED_MORTAL:
            // Restore 2 references held by the interned dict; these will
            // be decref'd by clear_interned_dict's PyDict_Clear.
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_IncRefTotal(_PyThreadState_GET());
            _Py_IncRefTotal(_PyThreadState_GET());
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        if (!shared) {
            _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
        }
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total length of all interned strings: %zd characters\n",
            total_length);
#endif

    /* Take an extra reference to every non-NULL entry of the interpreter's
       unicode ids array before the dict is cleared.
       NOTE(review): the matching release is not visible in this function;
       confirm where these references are dropped. */
    struct _Py_unicode_state *state = &interp->unicode;
    struct _Py_unicode_ids *ids = &state->ids;
    for (Py_ssize_t i=0; i < ids->size; i++) {
        Py_XINCREF(ids->array[i]);
    }
    clear_interned_dict(interp);
    if (_Py_IsMainInterpreter(interp)) {
        clear_global_interned_strings();
    }
}
15428 
15429 
15430 /********************* Unicode Iterator **************************/
15431 
/* Instance layout used by both str_iterator and str_ascii_iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15437 
/* tp_dealloc for the string iterator types: untrack the iterator from the
 * GC, drop its reference to the iterated string (if any), and free it. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15445 
/* tp_traverse for the string iterator types: the iterated string is the
 * only object reference held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15452 
15453 static PyObject *
unicodeiter_next(unicodeiterobject * it)15454 unicodeiter_next(unicodeiterobject *it)
15455 {
15456     PyObject *seq;
15457 
15458     assert(it != NULL);
15459     seq = it->it_seq;
15460     if (seq == NULL)
15461         return NULL;
15462     assert(_PyUnicode_CHECK(seq));
15463 
15464     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15465         int kind = PyUnicode_KIND(seq);
15466         const void *data = PyUnicode_DATA(seq);
15467         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15468         it->it_index++;
15469         return unicode_char(chr);
15470     }
15471 
15472     it->it_seq = NULL;
15473     Py_DECREF(seq);
15474     return NULL;
15475 }
15476 
15477 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15478 unicode_ascii_iter_next(unicodeiterobject *it)
15479 {
15480     assert(it != NULL);
15481     PyObject *seq = it->it_seq;
15482     if (seq == NULL) {
15483         return NULL;
15484     }
15485     assert(_PyUnicode_CHECK(seq));
15486     assert(PyUnicode_IS_COMPACT_ASCII(seq));
15487     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15488         const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15489         Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15490                                               data, it->it_index);
15491         it->it_index++;
15492         return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15493     }
15494     it->it_seq = NULL;
15495     Py_DECREF(seq);
15496     return NULL;
15497 }
15498 
15499 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15500 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15501 {
15502     Py_ssize_t len = 0;
15503     if (it->it_seq)
15504         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15505     return PyLong_FromSsize_t(len);
15506 }
15507 
15508 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15509 
15510 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15511 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15512 {
15513     PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15514 
15515     /* _PyEval_GetBuiltin can invoke arbitrary code,
15516      * call must be before access of iterator pointers.
15517      * see issue #101765 */
15518 
15519     if (it->it_seq != NULL) {
15520         return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15521     } else {
15522         PyObject *u = unicode_get_empty();
15523         if (u == NULL) {
15524             Py_XDECREF(iter);
15525             return NULL;
15526         }
15527         return Py_BuildValue("N(N)", iter, u);
15528     }
15529 }
15530 
15531 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15532 
15533 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15534 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15535 {
15536     Py_ssize_t index = PyLong_AsSsize_t(state);
15537     if (index == -1 && PyErr_Occurred())
15538         return NULL;
15539     if (it->it_seq != NULL) {
15540         if (index < 0)
15541             index = 0;
15542         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15543             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15544         it->it_index = index;
15545     }
15546     Py_RETURN_NONE;
15547 }
15548 
15549 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15550 
15551 static PyMethodDef unicodeiter_methods[] = {
15552     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15553      length_hint_doc},
15554     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15555      reduce_doc},
15556     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15557      setstate_doc},
15558     {NULL,      NULL}       /* sentinel */
15559 };
15560 
15561 PyTypeObject PyUnicodeIter_Type = {
15562     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15563     "str_iterator",         /* tp_name */
15564     sizeof(unicodeiterobject),      /* tp_basicsize */
15565     0,                  /* tp_itemsize */
15566     /* methods */
15567     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15568     0,                  /* tp_vectorcall_offset */
15569     0,                  /* tp_getattr */
15570     0,                  /* tp_setattr */
15571     0,                  /* tp_as_async */
15572     0,                  /* tp_repr */
15573     0,                  /* tp_as_number */
15574     0,                  /* tp_as_sequence */
15575     0,                  /* tp_as_mapping */
15576     0,                  /* tp_hash */
15577     0,                  /* tp_call */
15578     0,                  /* tp_str */
15579     PyObject_GenericGetAttr,        /* tp_getattro */
15580     0,                  /* tp_setattro */
15581     0,                  /* tp_as_buffer */
15582     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15583     0,                  /* tp_doc */
15584     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15585     0,                  /* tp_clear */
15586     0,                  /* tp_richcompare */
15587     0,                  /* tp_weaklistoffset */
15588     PyObject_SelfIter,          /* tp_iter */
15589     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15590     unicodeiter_methods,            /* tp_methods */
15591     0,
15592 };
15593 
/* Type object of the iterator used for compact ASCII strings
 * ("str_ascii_iterator").  It has the same layout, dealloc, traverse and
 * methods as str_iterator; only tp_iternext differs. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
15606 
15607 static PyObject *
unicode_iter(PyObject * seq)15608 unicode_iter(PyObject *seq)
15609 {
15610     unicodeiterobject *it;
15611 
15612     if (!PyUnicode_Check(seq)) {
15613         PyErr_BadInternalCall();
15614         return NULL;
15615     }
15616     if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15617         it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15618     }
15619     else {
15620         it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15621     }
15622     if (it == NULL)
15623         return NULL;
15624     it->it_index = 0;
15625     it->it_seq = Py_NewRef(seq);
15626     _PyObject_GC_TRACK(it);
15627     return (PyObject *)it;
15628 }
15629 
15630 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15631 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15632 {
15633     int res;
15634     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15635     if (res == -2) {
15636         PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
15637         return -1;
15638     }
15639     if (res < 0) {
15640         PyErr_NoMemory();
15641         return -1;
15642     }
15643     return 0;
15644 }
15645 
15646 
15647 static int
config_get_codec_name(wchar_t ** config_encoding)15648 config_get_codec_name(wchar_t **config_encoding)
15649 {
15650     char *encoding;
15651     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15652         return -1;
15653     }
15654 
15655     PyObject *name_obj = NULL;
15656     PyObject *codec = _PyCodec_Lookup(encoding);
15657     PyMem_RawFree(encoding);
15658 
15659     if (!codec)
15660         goto error;
15661 
15662     name_obj = PyObject_GetAttrString(codec, "name");
15663     Py_CLEAR(codec);
15664     if (!name_obj) {
15665         goto error;
15666     }
15667 
15668     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15669     Py_DECREF(name_obj);
15670     if (wname == NULL) {
15671         goto error;
15672     }
15673 
15674     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15675     if (raw_wname == NULL) {
15676         PyMem_Free(wname);
15677         PyErr_NoMemory();
15678         goto error;
15679     }
15680 
15681     PyMem_RawFree(*config_encoding);
15682     *config_encoding = raw_wname;
15683 
15684     PyMem_Free(wname);
15685     return 0;
15686 
15687 error:
15688     Py_XDECREF(codec);
15689     Py_XDECREF(name_obj);
15690     return -1;
15691 }
15692 
15693 
15694 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15695 init_stdio_encoding(PyInterpreterState *interp)
15696 {
15697     /* Update the stdio encoding to the normalized Python codec name. */
15698     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15699     if (config_get_codec_name(&config->stdio_encoding) < 0) {
15700         return _PyStatus_ERR("failed to get the Python codec name "
15701                              "of the stdio encoding");
15702     }
15703     return _PyStatus_OK();
15704 }
15705 
15706 
15707 static int
init_fs_codec(PyInterpreterState * interp)15708 init_fs_codec(PyInterpreterState *interp)
15709 {
15710     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15711 
15712     _Py_error_handler error_handler;
15713     error_handler = get_error_handler_wide(config->filesystem_errors);
15714     if (error_handler == _Py_ERROR_UNKNOWN) {
15715         PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15716         return -1;
15717     }
15718 
15719     char *encoding, *errors;
15720     if (encode_wstr_utf8(config->filesystem_encoding,
15721                          &encoding,
15722                          "filesystem_encoding") < 0) {
15723         return -1;
15724     }
15725 
15726     if (encode_wstr_utf8(config->filesystem_errors,
15727                          &errors,
15728                          "filesystem_errors") < 0) {
15729         PyMem_RawFree(encoding);
15730         return -1;
15731     }
15732 
15733     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15734     PyMem_RawFree(fs_codec->encoding);
15735     fs_codec->encoding = encoding;
15736     /* encoding has been normalized by init_fs_encoding() */
15737     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15738     PyMem_RawFree(fs_codec->errors);
15739     fs_codec->errors = errors;
15740     fs_codec->error_handler = error_handler;
15741 
15742 #ifdef _Py_FORCE_UTF8_FS_ENCODING
15743     assert(fs_codec->utf8 == 1);
15744 #endif
15745 
15746     /* At this point, PyUnicode_EncodeFSDefault() and
15747        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15748        the C implementation of the filesystem encoding. */
15749 
15750     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15751        global configuration variables. */
15752     if (_Py_IsMainInterpreter(interp)) {
15753 
15754         if (_Py_SetFileSystemEncoding(fs_codec->encoding,
15755                                       fs_codec->errors) < 0) {
15756             PyErr_NoMemory();
15757             return -1;
15758         }
15759     }
15760     return 0;
15761 }
15762 
15763 
15764 static PyStatus
init_fs_encoding(PyThreadState * tstate)15765 init_fs_encoding(PyThreadState *tstate)
15766 {
15767     PyInterpreterState *interp = tstate->interp;
15768 
15769     /* Update the filesystem encoding to the normalized Python codec name.
15770        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15771        (Python codec name). */
15772     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15773     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15774         _Py_DumpPathConfig(tstate);
15775         return _PyStatus_ERR("failed to get the Python codec "
15776                              "of the filesystem encoding");
15777     }
15778 
15779     if (init_fs_codec(interp) < 0) {
15780         return _PyStatus_ERR("cannot initialize filesystem codec");
15781     }
15782     return _PyStatus_OK();
15783 }
15784 
15785 
15786 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15787 _PyUnicode_InitEncodings(PyThreadState *tstate)
15788 {
15789     PyStatus status = _PyCodec_InitRegistry(tstate->interp);
15790     if (_PyStatus_EXCEPTION(status)) {
15791         return status;
15792     }
15793     status = init_fs_encoding(tstate);
15794     if (_PyStatus_EXCEPTION(status)) {
15795         return status;
15796     }
15797 
15798     return init_stdio_encoding(tstate->interp);
15799 }
15800 
15801 
15802 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)15803 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
15804 {
15805     PyMem_RawFree(fs_codec->encoding);
15806     fs_codec->encoding = NULL;
15807     fs_codec->utf8 = 0;
15808     PyMem_RawFree(fs_codec->errors);
15809     fs_codec->errors = NULL;
15810     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
15811 }
15812 
15813 
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-run init_fs_codec().

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the replacement strings cannot be allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements before touching the config so that a
       failure leaves the configuration unchanged. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(cfg->filesystem_encoding);
        cfg->filesystem_encoding = new_encoding;
        PyMem_RawFree(cfg->filesystem_errors);
        cfg->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15839 
15840 
#ifdef Py_DEBUG
/* Debug-build helper: return non-zero when the main interpreter no longer
   has an interned-strings dict (get_interned_dict() returns NULL). */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
15848 
15849 
15850 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)15851 _PyUnicode_FiniTypes(PyInterpreterState *interp)
15852 {
15853     _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
15854     _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
15855     _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
15856 }
15857 
15858 
15859 void
_PyUnicode_Fini(PyInterpreterState * interp)15860 _PyUnicode_Fini(PyInterpreterState *interp)
15861 {
15862     struct _Py_unicode_state *state = &interp->unicode;
15863 
15864     if (!has_shared_intern_dict(interp)) {
15865         // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
15866         assert(get_interned_dict(interp) == NULL);
15867     }
15868 
15869     _PyUnicode_FiniEncodings(&state->fs_codec);
15870 
15871     // bpo-47182: force a unicodedata CAPI capsule re-import on
15872     // subsequent initialization of interpreter.
15873     interp->unicode.ucnhash_capi = NULL;
15874 
15875     unicode_clear_identifiers(state);
15876 }
15877 
15878 /* A _string module, to export formatter_parser and formatter_field_name_split
15879    to the string.Formatter class implemented in Python. */
15880 
15881 static PyMethodDef _string_methods[] = {
15882     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15883      METH_O, PyDoc_STR("split the argument as a field name")},
15884     {"formatter_parser", (PyCFunction) formatter_parser,
15885      METH_O, PyDoc_STR("parse the argument as a format string")},
15886     {NULL, NULL}
15887 };
15888 
15889 static PyModuleDef_Slot module_slots[] = {
15890     {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
15891     {Py_mod_gil, Py_MOD_GIL_NOT_USED},
15892     {0, NULL}
15893 };
15894 
15895 static struct PyModuleDef _string_module = {
15896     PyModuleDef_HEAD_INIT,
15897     .m_name = "_string",
15898     .m_doc = PyDoc_STR("string helper module"),
15899     .m_size = 0,
15900     .m_methods = _string_methods,
15901     .m_slots = module_slots,
15902 };
15903 
15904 PyMODINIT_FUNC
PyInit__string(void)15905 PyInit__string(void)
15906 {
15907     return PyModuleDef_Init(&_string_module);
15908 }
15909