• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h"      // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45 #include "pycore_bytesobject.h"   // _PyBytes_Repeat()
46 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47 #include "pycore_format.h"        // F_LJUST
48 #include "pycore_initconfig.h"    // _PyStatus_OK()
49 #include "pycore_interp.h"        // PyInterpreterState.fs_codec
50 #include "pycore_long.h"          // _PyLong_FormatWriter()
51 #include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52 #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
53 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
54 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
55 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57 #include "stringlib/eq.h"         // unicode_eq()
58 
59 #ifdef MS_WINDOWS
60 #include <windows.h>
61 #endif
62 
63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64 #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
65 #endif
66 
67 /* Uncomment to display statistics on interned strings at exit
68    in _PyUnicode_ClearInterned(). */
69 /* #define INTERNED_STATS 1 */
70 
71 
72 /*[clinic input]
73 class str "PyObject *" "&PyUnicode_Type"
74 [clinic start generated code]*/
75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76 
77 /*[python input]
78 class Py_UCS4_converter(CConverter):
79     type = 'Py_UCS4'
80     converter = 'convert_uc'
81 
82     def converter_init(self):
83         if self.default is not unspecified:
84             self.c_default = ascii(self.default)
85             if len(self.c_default) > 4 or self.c_default[0] != "'":
86                 self.c_default = hex(ord(self.default))
87 
88 [python start generated code]*/
89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90 
91 /* --- Globals ------------------------------------------------------------
92 
93 NOTE: In the interpreter's initialization phase, some globals are currently
94       initialized dynamically as needed. In the process Unicode objects may
95       be created before the Unicode type is ready.
96 
97 */
98 
99 
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103 
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107 
/* Guard used by the accessor macros of this file.  Debug builds run
   _PyUnicode_CheckConsistency() with the content check disabled; other
   builds only test the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Direct access to the 'utf8' member (no checks). */
#define _PyUnicode_UTF8(op) \
    (_PyCompactUnicodeObject_CAST(op)->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; otherwise it is the
   'utf8' member. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
        ? ((char*)(_PyASCIIObject_CAST(op) + 1))                \
        : _PyUnicode_UTF8(op))

/* Direct access to the 'utf8_length' member (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)

/* Length that goes with PyUnicode_UTF8(): 'length' for a compact ASCII
   string, 'utf8_length' otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op)                             \
        ? _PyASCIIObject_CAST(op)->length                       \
        : _PyUnicode_UTF8_LENGTH(op))

/* Direct access to the 'wstr' member (no checks). */
#define _PyUnicode_WSTR(op) \
    (_PyASCIIObject_CAST(op)->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                               \
    (PyUnicode_IS_COMPACT_ASCII(op)                             \
        ? _PyASCIIObject_CAST(op)->length                       \
        : _PyCompactUnicodeObject_CAST(op)->wstr_length)

/* Direct, unchecked access to individual members. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (_PyCompactUnicodeObject_CAST(op)->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \
    (_PyASCIIObject_CAST(op)->hash)

/* Same members as above, but guarded by _PyUnicode_CHECK() in an assert. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     _PyASCIIObject_CAST(op)->length)

/* Direct access to the 'data.any' pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (_PyUnicodeObject_CAST(op)->data.any)
155 
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))

/* True if the 'utf8' member points at the buffer returned by
   PyUnicode_DATA().  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the 'wstr' member points at the buffer returned by
   PyUnicode_DATA().  The argument is evaluated more than once. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170 
/* True if the Unicode object owns a separate UTF-8 memory block, i.e. the
   string is not compact ASCII, 'utf8' is set, and 'utf8' is not the data
   buffer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (!PyUnicode_IS_COMPACT_ASCII(op)                            \
     && _PyUnicode_UTF8(op) != NULL                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separate wstr memory block, i.e. 'wstr'
   is set and it is not (for a ready string) the data buffer itself.
   PyUnicode_DATA() is only evaluated when the string is ready. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op) != NULL                                \
     && !(PyUnicode_IS_READY(op)                                \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
184 
/* Copy the characters in [begin, end) to the buffer 'to', converting each
   one from from_type to to_type with a plain cast.

   from_type and to_type must be valid type names; begin and end are
   pointers to "from_type" characters and 'to' points to a "to_type" buffer
   large enough for the result.  The bulk of the input is processed four
   characters per iteration, the remainder one character at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
208 
/* Pointer to the statically allocated one-character string for code point
   'ch': taken from the 'ascii' singleton array for ch < 128 and from the
   'latin1' array (indexed by ch - 128) otherwise.

   'ch' is parenthesized so that an expression argument such as `c & 0xff`
   is handled correctly; note that it is still evaluated twice. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213 
214 #ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
216 #  define OVERALLOCATE_FACTOR 2
217 #else
   /* On Linux, overallocating by 25% is the best factor */
219 #  define OVERALLOCATE_FACTOR 4
220 #endif
221 
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
230 static PyObject *interned = NULL;
231 
232 /* Forward declaration */
233 static inline int
234 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235 static inline void
236 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237 static PyObject *
238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239                     const char *errors);
240 static PyObject *
241 unicode_decode_utf8(const char *s, Py_ssize_t size,
242                     _Py_error_handler error_handler, const char *errors,
243                     Py_ssize_t *consumed);
244 #ifdef Py_DEBUG
245 static inline int unicode_is_finalizing(void);
246 static int unicode_is_singleton(PyObject *unicode);
247 #endif
248 
249 
250 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)251 static inline PyObject* unicode_get_empty(void)
252 {
253     _Py_DECLARE_STR(empty, "");
254     return &_Py_STR(empty);
255 }
256 
257 
258 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)259 static inline PyObject* unicode_new_empty(void)
260 {
261     PyObject *empty = unicode_get_empty();
262     Py_INCREF(empty);
263     return empty;
264 }
265 
266 #define _Py_RETURN_UNICODE_EMPTY()   \
267     do {                             \
268         return unicode_new_empty();  \
269     } while (0)
270 
271 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273              Py_ssize_t start, Py_ssize_t length)
274 {
275     assert(0 <= start);
276     assert(kind != PyUnicode_WCHAR_KIND);
277     switch (kind) {
278     case PyUnicode_1BYTE_KIND: {
279         assert(value <= 0xff);
280         Py_UCS1 ch = (unsigned char)value;
281         Py_UCS1 *to = (Py_UCS1 *)data + start;
282         memset(to, ch, length);
283         break;
284     }
285     case PyUnicode_2BYTE_KIND: {
286         assert(value <= 0xffff);
287         Py_UCS2 ch = (Py_UCS2)value;
288         Py_UCS2 *to = (Py_UCS2 *)data + start;
289         const Py_UCS2 *end = to + length;
290         for (; to < end; ++to) *to = ch;
291         break;
292     }
293     case PyUnicode_4BYTE_KIND: {
294         assert(value <= MAX_UNICODE);
295         Py_UCS4 ch = value;
296         Py_UCS4 * to = (Py_UCS4 *)data + start;
297         const Py_UCS4 *end = to + length;
298         for (; to < end; ++to) *to = ch;
299         break;
300     }
301     default: Py_UNREACHABLE();
302     }
303 }
304 
305 
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by an ASCII code: the entry is 1 for the
   characters listed below and 0 for every other index. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
336 
337 /* forward */
338 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339 static PyObject* get_latin1_char(unsigned char ch);
340 static int unicode_modifiable(PyObject *unicode);
341 
342 
343 static PyObject *
344 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345 static PyObject *
346 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347 static PyObject *
348 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349 
350 static PyObject *
351 unicode_encode_call_errorhandler(const char *errors,
352        PyObject **errorHandler,const char *encoding, const char *reason,
353        PyObject *unicode, PyObject **exceptionObject,
354        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355 
356 static void
357 raise_encode_exception(PyObject **exceptionObject,
358                        const char *encoding,
359                        PyObject *unicode,
360                        Py_ssize_t startpos, Py_ssize_t endpos,
361                        const char *reason);
362 
/* Fast detection of the ASCII line break characters.

   128-entry table indexed by an ASCII code: the entry is 1 for the
   characters listed below and 0 for every other index. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
390 
391 static int convert_uc(PyObject *obj, void *addr);
392 
393 struct encoding_map;
394 #include "clinic/unicodeobject.c.h"
395 
396 _Py_error_handler
_Py_GetErrorHandler(const char * errors)397 _Py_GetErrorHandler(const char *errors)
398 {
399     if (errors == NULL || strcmp(errors, "strict") == 0) {
400         return _Py_ERROR_STRICT;
401     }
402     if (strcmp(errors, "surrogateescape") == 0) {
403         return _Py_ERROR_SURROGATEESCAPE;
404     }
405     if (strcmp(errors, "replace") == 0) {
406         return _Py_ERROR_REPLACE;
407     }
408     if (strcmp(errors, "ignore") == 0) {
409         return _Py_ERROR_IGNORE;
410     }
411     if (strcmp(errors, "backslashreplace") == 0) {
412         return _Py_ERROR_BACKSLASHREPLACE;
413     }
414     if (strcmp(errors, "surrogatepass") == 0) {
415         return _Py_ERROR_SURROGATEPASS;
416     }
417     if (strcmp(errors, "xmlcharrefreplace") == 0) {
418         return _Py_ERROR_XMLCHARREFREPLACE;
419     }
420     return _Py_ERROR_OTHER;
421 }
422 
423 
424 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)425 get_error_handler_wide(const wchar_t *errors)
426 {
427     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428         return _Py_ERROR_STRICT;
429     }
430     if (wcscmp(errors, L"surrogateescape") == 0) {
431         return _Py_ERROR_SURROGATEESCAPE;
432     }
433     if (wcscmp(errors, L"replace") == 0) {
434         return _Py_ERROR_REPLACE;
435     }
436     if (wcscmp(errors, L"ignore") == 0) {
437         return _Py_ERROR_IGNORE;
438     }
439     if (wcscmp(errors, L"backslashreplace") == 0) {
440         return _Py_ERROR_BACKSLASHREPLACE;
441     }
442     if (wcscmp(errors, L"surrogatepass") == 0) {
443         return _Py_ERROR_SURROGATEPASS;
444     }
445     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446         return _Py_ERROR_XMLCHARREFREPLACE;
447     }
448     return _Py_ERROR_OTHER;
449 }
450 
451 
452 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)453 unicode_check_encoding_errors(const char *encoding, const char *errors)
454 {
455     if (encoding == NULL && errors == NULL) {
456         return 0;
457     }
458 
459     PyInterpreterState *interp = _PyInterpreterState_GET();
460 #ifndef Py_DEBUG
461     /* In release mode, only check in development mode (-X dev) */
462     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463         return 0;
464     }
465 #else
466     /* Always check in debug mode */
467 #endif
468 
469     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470        codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471     if (!interp->unicode.fs_codec.encoding) {
472         return 0;
473     }
474 
475     /* Disable checks during Python finalization. For example, it allows to
476        call _PyObject_Dump() during finalization for debugging purpose. */
477     if (interp->finalizing) {
478         return 0;
479     }
480 
481     if (encoding != NULL) {
482         PyObject *handler = _PyCodec_Lookup(encoding);
483         if (handler == NULL) {
484             return -1;
485         }
486         Py_DECREF(handler);
487     }
488 
489     if (errors != NULL) {
490         PyObject *handler = PyCodec_LookupError(errors);
491         if (handler == NULL) {
492             return -1;
493         }
494         Py_DECREF(handler);
495     }
496     return 0;
497 }
498 
499 
500 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)501 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
502 {
503 #define CHECK(expr) \
504     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505 
506     assert(op != NULL);
507     CHECK(PyUnicode_Check(op));
508 
509     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
510     unsigned int kind = ascii->state.kind;
511 
512     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
513         CHECK(kind == PyUnicode_1BYTE_KIND);
514         CHECK(ascii->state.ready == 1);
515     }
516     else {
517         PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
518         void *data;
519 
520         if (ascii->state.compact == 1) {
521             data = compact + 1;
522             CHECK(kind == PyUnicode_1BYTE_KIND
523                                  || kind == PyUnicode_2BYTE_KIND
524                                  || kind == PyUnicode_4BYTE_KIND);
525             CHECK(ascii->state.ascii == 0);
526             CHECK(ascii->state.ready == 1);
527             CHECK(compact->utf8 != data);
528         }
529         else {
530             PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
531 
532             data = unicode->data.any;
533             if (kind == PyUnicode_WCHAR_KIND) {
534                 CHECK(ascii->length == 0);
535                 CHECK(ascii->hash == -1);
536                 CHECK(ascii->state.compact == 0);
537                 CHECK(ascii->state.ascii == 0);
538                 CHECK(ascii->state.ready == 0);
539                 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
540                 CHECK(ascii->wstr != NULL);
541                 CHECK(data == NULL);
542                 CHECK(compact->utf8 == NULL);
543             }
544             else {
545                 CHECK(kind == PyUnicode_1BYTE_KIND
546                                      || kind == PyUnicode_2BYTE_KIND
547                                      || kind == PyUnicode_4BYTE_KIND);
548                 CHECK(ascii->state.compact == 0);
549                 CHECK(ascii->state.ready == 1);
550                 CHECK(data != NULL);
551                 if (ascii->state.ascii) {
552                     CHECK(compact->utf8 == data);
553                     CHECK(compact->utf8_length == ascii->length);
554                 }
555                 else
556                     CHECK(compact->utf8 != data);
557             }
558         }
559         if (kind != PyUnicode_WCHAR_KIND) {
560             if (
561 #if SIZEOF_WCHAR_T == 2
562                 kind == PyUnicode_2BYTE_KIND
563 #else
564                 kind == PyUnicode_4BYTE_KIND
565 #endif
566                )
567             {
568                 CHECK(ascii->wstr == data);
569                 CHECK(compact->wstr_length == ascii->length);
570             } else
571                 CHECK(ascii->wstr != data);
572         }
573 
574         if (compact->utf8 == NULL)
575             CHECK(compact->utf8_length == 0);
576         if (ascii->wstr == NULL)
577             CHECK(compact->wstr_length == 0);
578     }
579 
580     /* check that the best kind is used: O(n) operation */
581     if (check_content && kind != PyUnicode_WCHAR_KIND) {
582         Py_ssize_t i;
583         Py_UCS4 maxchar = 0;
584         const void *data;
585         Py_UCS4 ch;
586 
587         data = PyUnicode_DATA(ascii);
588         for (i=0; i < ascii->length; i++)
589         {
590             ch = PyUnicode_READ(kind, data, i);
591             if (ch > maxchar)
592                 maxchar = ch;
593         }
594         if (kind == PyUnicode_1BYTE_KIND) {
595             if (ascii->state.ascii == 0) {
596                 CHECK(maxchar >= 128);
597                 CHECK(maxchar <= 255);
598             }
599             else
600                 CHECK(maxchar < 128);
601         }
602         else if (kind == PyUnicode_2BYTE_KIND) {
603             CHECK(maxchar >= 0x100);
604             CHECK(maxchar <= 0xFFFF);
605         }
606         else {
607             CHECK(maxchar >= 0x10000);
608             CHECK(maxchar <= MAX_UNICODE);
609         }
610         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
611     }
612     return 1;
613 
614 #undef CHECK
615 }
616 
617 
618 static PyObject*
unicode_result_wchar(PyObject * unicode)619 unicode_result_wchar(PyObject *unicode)
620 {
621 #ifndef Py_DEBUG
622     Py_ssize_t len;
623 
624     len = _PyUnicode_WSTR_LENGTH(unicode);
625     if (len == 0) {
626         Py_DECREF(unicode);
627         _Py_RETURN_UNICODE_EMPTY();
628     }
629 
630     if (len == 1) {
631         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632         if ((Py_UCS4)ch < 256) {
633             Py_DECREF(unicode);
634             return get_latin1_char((unsigned char)ch);
635         }
636     }
637 
638     if (_PyUnicode_Ready(unicode) < 0) {
639         Py_DECREF(unicode);
640         return NULL;
641     }
642 #else
643     assert(Py_REFCNT(unicode) == 1);
644 
645     /* don't make the result ready in debug mode to ensure that the caller
646        makes the string ready before using it */
647     assert(_PyUnicode_CheckConsistency(unicode, 1));
648 #endif
649     return unicode;
650 }
651 
652 static PyObject*
unicode_result_ready(PyObject * unicode)653 unicode_result_ready(PyObject *unicode)
654 {
655     Py_ssize_t length;
656 
657     length = PyUnicode_GET_LENGTH(unicode);
658     if (length == 0) {
659         PyObject *empty = unicode_get_empty();
660         if (unicode != empty) {
661             Py_DECREF(unicode);
662             Py_INCREF(empty);
663         }
664         return empty;
665     }
666 
667     if (length == 1) {
668         int kind = PyUnicode_KIND(unicode);
669         if (kind == PyUnicode_1BYTE_KIND) {
670             const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671             Py_UCS1 ch = data[0];
672             PyObject *latin1_char = LATIN1(ch);
673             if (unicode != latin1_char) {
674                 Py_INCREF(latin1_char);
675                 Py_DECREF(unicode);
676             }
677             return latin1_char;
678         }
679     }
680 
681     assert(_PyUnicode_CheckConsistency(unicode, 1));
682     return unicode;
683 }
684 
685 static PyObject*
unicode_result(PyObject * unicode)686 unicode_result(PyObject *unicode)
687 {
688     assert(_PyUnicode_CHECK(unicode));
689     if (PyUnicode_IS_READY(unicode))
690         return unicode_result_ready(unicode);
691     else
692         return unicode_result_wchar(unicode);
693 }
694 
695 static PyObject*
unicode_result_unchanged(PyObject * unicode)696 unicode_result_unchanged(PyObject *unicode)
697 {
698     if (PyUnicode_CheckExact(unicode)) {
699         if (PyUnicode_READY(unicode) == -1)
700             return NULL;
701         Py_INCREF(unicode);
702         return unicode;
703     }
704     else
705         /* Subtype -- return genuine unicode string with the same value. */
706         return _PyUnicode_Copy(unicode);
707 }
708 
709 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710    ASCII, Latin1, UTF-8, etc. */
711 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712 backslashreplace(_PyBytesWriter *writer, char *str,
713                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714 {
715     Py_ssize_t size, i;
716     Py_UCS4 ch;
717     enum PyUnicode_Kind kind;
718     const void *data;
719 
720     assert(PyUnicode_IS_READY(unicode));
721     kind = PyUnicode_KIND(unicode);
722     data = PyUnicode_DATA(unicode);
723 
724     size = 0;
725     /* determine replacement size */
726     for (i = collstart; i < collend; ++i) {
727         Py_ssize_t incr;
728 
729         ch = PyUnicode_READ(kind, data, i);
730         if (ch < 0x100)
731             incr = 2+2;
732         else if (ch < 0x10000)
733             incr = 2+4;
734         else {
735             assert(ch <= MAX_UNICODE);
736             incr = 2+8;
737         }
738         if (size > PY_SSIZE_T_MAX - incr) {
739             PyErr_SetString(PyExc_OverflowError,
740                             "encoded result is too long for a Python string");
741             return NULL;
742         }
743         size += incr;
744     }
745 
746     str = _PyBytesWriter_Prepare(writer, str, size);
747     if (str == NULL)
748         return NULL;
749 
750     /* generate replacement */
751     for (i = collstart; i < collend; ++i) {
752         ch = PyUnicode_READ(kind, data, i);
753         *str++ = '\\';
754         if (ch >= 0x00010000) {
755             *str++ = 'U';
756             *str++ = Py_hexdigits[(ch>>28)&0xf];
757             *str++ = Py_hexdigits[(ch>>24)&0xf];
758             *str++ = Py_hexdigits[(ch>>20)&0xf];
759             *str++ = Py_hexdigits[(ch>>16)&0xf];
760             *str++ = Py_hexdigits[(ch>>12)&0xf];
761             *str++ = Py_hexdigits[(ch>>8)&0xf];
762         }
763         else if (ch >= 0x100) {
764             *str++ = 'u';
765             *str++ = Py_hexdigits[(ch>>12)&0xf];
766             *str++ = Py_hexdigits[(ch>>8)&0xf];
767         }
768         else
769             *str++ = 'x';
770         *str++ = Py_hexdigits[(ch>>4)&0xf];
771         *str++ = Py_hexdigits[ch&0xf];
772     }
773     return str;
774 }
775 
776 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777    ASCII, Latin1, UTF-8, etc. */
778 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781 {
782     Py_ssize_t size, i;
783     Py_UCS4 ch;
784     enum PyUnicode_Kind kind;
785     const void *data;
786 
787     assert(PyUnicode_IS_READY(unicode));
788     kind = PyUnicode_KIND(unicode);
789     data = PyUnicode_DATA(unicode);
790 
791     size = 0;
792     /* determine replacement size */
793     for (i = collstart; i < collend; ++i) {
794         Py_ssize_t incr;
795 
796         ch = PyUnicode_READ(kind, data, i);
797         if (ch < 10)
798             incr = 2+1+1;
799         else if (ch < 100)
800             incr = 2+2+1;
801         else if (ch < 1000)
802             incr = 2+3+1;
803         else if (ch < 10000)
804             incr = 2+4+1;
805         else if (ch < 100000)
806             incr = 2+5+1;
807         else if (ch < 1000000)
808             incr = 2+6+1;
809         else {
810             assert(ch <= MAX_UNICODE);
811             incr = 2+7+1;
812         }
813         if (size > PY_SSIZE_T_MAX - incr) {
814             PyErr_SetString(PyExc_OverflowError,
815                             "encoded result is too long for a Python string");
816             return NULL;
817         }
818         size += incr;
819     }
820 
821     str = _PyBytesWriter_Prepare(writer, str, size);
822     if (str == NULL)
823         return NULL;
824 
825     /* generate replacement */
826     for (i = collstart; i < collend; ++i) {
827         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828         if (size < 0) {
829             return NULL;
830         }
831         str += size;
832     }
833     return str;
834 }
835 
836 /* --- Bloom Filters ----------------------------------------------------- */
837 
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) are used as the bit index. */
841 
842 /* the linebreak mask is set up by _PyUnicode_Init() below */
843 
844 #if LONG_BIT >= 128
845 #define BLOOM_WIDTH 128
846 #elif LONG_BIT >= 64
847 #define BLOOM_WIDTH 64
848 #elif LONG_BIT >= 32
849 #define BLOOM_WIDTH 32
850 #else
851 #error "LONG_BIT is smaller than 32"
852 #endif
853 
854 #define BLOOM_MASK unsigned long
855 
856 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
857 
/* Non-zero if the bit selected by the low bits of 'ch' is set in 'mask'.
   Both arguments are parenthesized so that expression arguments such as
   `a | b` are tested as a whole. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if 'ch' is a line break: a table lookup for ch < 128, otherwise
   the bloom mask is tested first and Py_UNICODE_ISLINEBREAK() is only
   called when the mask bit is set.  'ch' is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863 
864 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866 {
867 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
868     do {                                               \
869         TYPE *data = (TYPE *)PTR;                      \
870         TYPE *end = data + LEN;                        \
871         Py_UCS4 ch;                                    \
872         for (; data != end; data++) {                  \
873             ch = *data;                                \
874             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875         }                                              \
876         break;                                         \
877     } while (0)
878 
879     /* calculate simple bloom-style bitmask for a given unicode string */
880 
881     BLOOM_MASK mask;
882 
883     mask = 0;
884     switch (kind) {
885     case PyUnicode_1BYTE_KIND:
886         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887         break;
888     case PyUnicode_2BYTE_KIND:
889         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890         break;
891     case PyUnicode_4BYTE_KIND:
892         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893         break;
894     default:
895         Py_UNREACHABLE();
896     }
897     return mask;
898 
899 #undef BLOOM_UPDATE
900 }
901 
902 static int
ensure_unicode(PyObject * obj)903 ensure_unicode(PyObject *obj)
904 {
905     if (!PyUnicode_Check(obj)) {
906         PyErr_Format(PyExc_TypeError,
907                      "must be str, not %.100s",
908                      Py_TYPE(obj)->tp_name);
909         return -1;
910     }
911     return PyUnicode_READY(obj);
912 }
913 
914 /* Compilation of templated routines */
915 
916 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
917 
918 #include "stringlib/asciilib.h"
919 #include "stringlib/fastsearch.h"
920 #include "stringlib/partition.h"
921 #include "stringlib/split.h"
922 #include "stringlib/count.h"
923 #include "stringlib/find.h"
924 #include "stringlib/find_max_char.h"
925 #include "stringlib/undef.h"
926 
927 #include "stringlib/ucs1lib.h"
928 #include "stringlib/fastsearch.h"
929 #include "stringlib/partition.h"
930 #include "stringlib/split.h"
931 #include "stringlib/count.h"
932 #include "stringlib/find.h"
933 #include "stringlib/replace.h"
934 #include "stringlib/find_max_char.h"
935 #include "stringlib/undef.h"
936 
937 #include "stringlib/ucs2lib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/replace.h"
944 #include "stringlib/find_max_char.h"
945 #include "stringlib/undef.h"
946 
947 #include "stringlib/ucs4lib.h"
948 #include "stringlib/fastsearch.h"
949 #include "stringlib/partition.h"
950 #include "stringlib/split.h"
951 #include "stringlib/count.h"
952 #include "stringlib/find.h"
953 #include "stringlib/replace.h"
954 #include "stringlib/find_max_char.h"
955 #include "stringlib/undef.h"
956 
957 _Py_COMP_DIAG_PUSH
958 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
959 #include "stringlib/unicodedefs.h"
960 #include "stringlib/fastsearch.h"
961 #include "stringlib/count.h"
962 #include "stringlib/find.h"
963 #include "stringlib/undef.h"
964 _Py_COMP_DIAG_POP
965 
966 #undef STRINGLIB_GET_EMPTY
967 
968 /* --- Unicode Object ----------------------------------------------------- */
969 
970 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971 findchar(const void *s, int kind,
972          Py_ssize_t size, Py_UCS4 ch,
973          int direction)
974 {
975     switch (kind) {
976     case PyUnicode_1BYTE_KIND:
977         if ((Py_UCS1) ch != ch)
978             return -1;
979         if (direction > 0)
980             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981         else
982             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983     case PyUnicode_2BYTE_KIND:
984         if ((Py_UCS2) ch != ch)
985             return -1;
986         if (direction > 0)
987             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988         else
989             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990     case PyUnicode_4BYTE_KIND:
991         if (direction > 0)
992             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993         else
994             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995     default:
996         Py_UNREACHABLE();
997     }
998 }
999 
1000 #ifdef Py_DEBUG
1001 /* Fill the data of a Unicode string with invalid characters to detect bugs
1002    earlier.
1003 
1004    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1005    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1006    invalid character in Unicode 6.0. */
1007 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1008 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1009 {
1010     int kind = PyUnicode_KIND(unicode);
1011     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1012     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1013     if (length <= old_length)
1014         return;
1015     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1016 }
1017 #endif
1018 
1019 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1020 resize_compact(PyObject *unicode, Py_ssize_t length)
1021 {
1022     Py_ssize_t char_size;
1023     Py_ssize_t struct_size;
1024     Py_ssize_t new_size;
1025     int share_wstr;
1026     PyObject *new_unicode;
1027 #ifdef Py_DEBUG
1028     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1029 #endif
1030 
1031     assert(unicode_modifiable(unicode));
1032     assert(PyUnicode_IS_READY(unicode));
1033     assert(PyUnicode_IS_COMPACT(unicode));
1034 
1035     char_size = PyUnicode_KIND(unicode);
1036     if (PyUnicode_IS_ASCII(unicode))
1037         struct_size = sizeof(PyASCIIObject);
1038     else
1039         struct_size = sizeof(PyCompactUnicodeObject);
1040     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 
1042     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1043         PyErr_NoMemory();
1044         return NULL;
1045     }
1046     new_size = (struct_size + (length + 1) * char_size);
1047 
1048     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1049         PyObject_Free(_PyUnicode_UTF8(unicode));
1050         _PyUnicode_UTF8(unicode) = NULL;
1051         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1052     }
1053 #ifdef Py_REF_DEBUG
1054     _Py_RefTotal--;
1055 #endif
1056 #ifdef Py_TRACE_REFS
1057     _Py_ForgetReference(unicode);
1058 #endif
1059 
1060     new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1061     if (new_unicode == NULL) {
1062         _Py_NewReference(unicode);
1063         PyErr_NoMemory();
1064         return NULL;
1065     }
1066     unicode = new_unicode;
1067     _Py_NewReference(unicode);
1068 
1069     _PyUnicode_LENGTH(unicode) = length;
1070     if (share_wstr) {
1071         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1072         if (!PyUnicode_IS_ASCII(unicode))
1073             _PyUnicode_WSTR_LENGTH(unicode) = length;
1074     }
1075     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1076         PyObject_Free(_PyUnicode_WSTR(unicode));
1077         _PyUnicode_WSTR(unicode) = NULL;
1078         if (!PyUnicode_IS_ASCII(unicode))
1079             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1080     }
1081 #ifdef Py_DEBUG
1082     unicode_fill_invalid(unicode, old_length);
1083 #endif
1084     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1085                     length, 0);
1086     assert(_PyUnicode_CheckConsistency(unicode, 0));
1087     return unicode;
1088 }
1089 
1090 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1091 resize_inplace(PyObject *unicode, Py_ssize_t length)
1092 {
1093     wchar_t *wstr;
1094     Py_ssize_t new_size;
1095     assert(!PyUnicode_IS_COMPACT(unicode));
1096     assert(Py_REFCNT(unicode) == 1);
1097 
1098     if (PyUnicode_IS_READY(unicode)) {
1099         Py_ssize_t char_size;
1100         int share_wstr, share_utf8;
1101         void *data;
1102 #ifdef Py_DEBUG
1103         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1104 #endif
1105 
1106         data = _PyUnicode_DATA_ANY(unicode);
1107         char_size = PyUnicode_KIND(unicode);
1108         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1109         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1110 
1111         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1112             PyErr_NoMemory();
1113             return -1;
1114         }
1115         new_size = (length + 1) * char_size;
1116 
1117         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1118         {
1119             PyObject_Free(_PyUnicode_UTF8(unicode));
1120             _PyUnicode_UTF8(unicode) = NULL;
1121             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1122         }
1123 
1124         data = (PyObject *)PyObject_Realloc(data, new_size);
1125         if (data == NULL) {
1126             PyErr_NoMemory();
1127             return -1;
1128         }
1129         _PyUnicode_DATA_ANY(unicode) = data;
1130         if (share_wstr) {
1131             _PyUnicode_WSTR(unicode) = data;
1132             _PyUnicode_WSTR_LENGTH(unicode) = length;
1133         }
1134         if (share_utf8) {
1135             _PyUnicode_UTF8(unicode) = data;
1136             _PyUnicode_UTF8_LENGTH(unicode) = length;
1137         }
1138         _PyUnicode_LENGTH(unicode) = length;
1139         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1140 #ifdef Py_DEBUG
1141         unicode_fill_invalid(unicode, old_length);
1142 #endif
1143         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1144             assert(_PyUnicode_CheckConsistency(unicode, 0));
1145             return 0;
1146         }
1147     }
1148     assert(_PyUnicode_WSTR(unicode) != NULL);
1149 
1150     /* check for integer overflow */
1151     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1152         PyErr_NoMemory();
1153         return -1;
1154     }
1155     new_size = sizeof(wchar_t) * (length + 1);
1156     wstr =  _PyUnicode_WSTR(unicode);
1157     wstr = PyObject_Realloc(wstr, new_size);
1158     if (!wstr) {
1159         PyErr_NoMemory();
1160         return -1;
1161     }
1162     _PyUnicode_WSTR(unicode) = wstr;
1163     _PyUnicode_WSTR(unicode)[length] = 0;
1164     _PyUnicode_WSTR_LENGTH(unicode) = length;
1165     assert(_PyUnicode_CheckConsistency(unicode, 0));
1166     return 0;
1167 }
1168 
1169 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1170 resize_copy(PyObject *unicode, Py_ssize_t length)
1171 {
1172     Py_ssize_t copy_length;
1173     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174         PyObject *copy;
1175 
1176         assert(PyUnicode_IS_READY(unicode));
1177 
1178         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179         if (copy == NULL)
1180             return NULL;
1181 
1182         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184         return copy;
1185     }
1186     else {
1187         PyObject *w;
1188 
1189         w = (PyObject*)_PyUnicode_New(length);
1190         if (w == NULL)
1191             return NULL;
1192         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193         copy_length = Py_MIN(copy_length, length);
1194         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195                   copy_length * sizeof(wchar_t));
1196         return w;
1197     }
1198 }
1199 
/* We allocate room for one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
1203 
1204    XXX This allocator could further be enhanced by assuring that the
1205    free list never reduces its size below 1.
1206 
1207 */
1208 
1209 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1210 _PyUnicode_New(Py_ssize_t length)
1211 {
1212     PyUnicodeObject *unicode;
1213     size_t new_size;
1214 
1215     /* Optimization for empty strings */
1216     if (length == 0) {
1217         return (PyUnicodeObject *)unicode_new_empty();
1218     }
1219 
1220     /* Ensure we won't overflow the size. */
1221     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1222         return (PyUnicodeObject *)PyErr_NoMemory();
1223     }
1224     if (length < 0) {
1225         PyErr_SetString(PyExc_SystemError,
1226                         "Negative size passed to _PyUnicode_New");
1227         return NULL;
1228     }
1229 
1230     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231     if (unicode == NULL)
1232         return NULL;
1233     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1234 
1235     _PyUnicode_WSTR_LENGTH(unicode) = length;
1236     _PyUnicode_HASH(unicode) = -1;
1237     _PyUnicode_STATE(unicode).interned = 0;
1238     _PyUnicode_STATE(unicode).kind = 0;
1239     _PyUnicode_STATE(unicode).compact = 0;
1240     _PyUnicode_STATE(unicode).ready = 0;
1241     _PyUnicode_STATE(unicode).ascii = 0;
1242     _PyUnicode_DATA_ANY(unicode) = NULL;
1243     _PyUnicode_LENGTH(unicode) = 0;
1244     _PyUnicode_UTF8(unicode) = NULL;
1245     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1246 
1247     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248     if (!_PyUnicode_WSTR(unicode)) {
1249         Py_DECREF(unicode);
1250         PyErr_NoMemory();
1251         return NULL;
1252     }
1253 
1254     /* Initialize the first element to guard against cases where
1255      * the caller fails before initializing str -- unicode_resize()
1256      * reads str[0], and the Keep-Alive optimization can keep memory
1257      * allocated for str alive across a call to unicode_dealloc(unicode).
1258      * We don't want unicode_resize to read uninitialized memory in
1259      * that case.
1260      */
1261     _PyUnicode_WSTR(unicode)[0] = 0;
1262     _PyUnicode_WSTR(unicode)[length] = 0;
1263 
1264     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265     return unicode;
1266 }
1267 
1268 static const char*
unicode_kind_name(PyObject * unicode)1269 unicode_kind_name(PyObject *unicode)
1270 {
1271     /* don't check consistency: unicode_kind_name() is called from
1272        _PyUnicode_Dump() */
1273     if (!PyUnicode_IS_COMPACT(unicode))
1274     {
1275         if (!PyUnicode_IS_READY(unicode))
1276             return "wstr";
1277         switch (PyUnicode_KIND(unicode))
1278         {
1279         case PyUnicode_1BYTE_KIND:
1280             if (PyUnicode_IS_ASCII(unicode))
1281                 return "legacy ascii";
1282             else
1283                 return "legacy latin1";
1284         case PyUnicode_2BYTE_KIND:
1285             return "legacy UCS2";
1286         case PyUnicode_4BYTE_KIND:
1287             return "legacy UCS4";
1288         default:
1289             return "<legacy invalid kind>";
1290         }
1291     }
1292     assert(PyUnicode_IS_READY(unicode));
1293     switch (PyUnicode_KIND(unicode)) {
1294     case PyUnicode_1BYTE_KIND:
1295         if (PyUnicode_IS_ASCII(unicode))
1296             return "ascii";
1297         else
1298             return "latin1";
1299     case PyUnicode_2BYTE_KIND:
1300         return "UCS2";
1301     case PyUnicode_4BYTE_KIND:
1302         return "UCS4";
1303     default:
1304         return "<invalid compact kind>";
1305     }
1306 }
1307 
1308 #ifdef Py_DEBUG
1309 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1310 const char *_PyUnicode_utf8(void *unicode_raw){
1311     PyObject *unicode = _PyObject_CAST(unicode_raw);
1312     return PyUnicode_UTF8(unicode);
1313 }
1314 
_PyUnicode_compact_data(void * unicode_raw)1315 const void *_PyUnicode_compact_data(void *unicode_raw) {
1316     PyObject *unicode = _PyObject_CAST(unicode_raw);
1317     return _PyUnicode_COMPACT_DATA(unicode);
1318 }
_PyUnicode_data(void * unicode_raw)1319 const void *_PyUnicode_data(void *unicode_raw) {
1320     PyObject *unicode = _PyObject_CAST(unicode_raw);
1321     printf("obj %p\n", (void*)unicode);
1322     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324     printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325     printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327     return PyUnicode_DATA(unicode);
1328 }
1329 
1330 void
_PyUnicode_Dump(PyObject * op)1331 _PyUnicode_Dump(PyObject *op)
1332 {
1333     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334     PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336     const void *data;
1337 
1338     if (ascii->state.compact)
1339     {
1340         if (ascii->state.ascii)
1341             data = (ascii + 1);
1342         else
1343             data = (compact + 1);
1344     }
1345     else
1346         data = unicode->data.any;
1347     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348 
1349     if (ascii->wstr == data)
1350         printf("shared ");
1351     printf("wstr=%p", (void *)ascii->wstr);
1352 
1353     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354         printf(" (%zu), ", compact->wstr_length);
1355         if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356             printf("shared ");
1357         }
1358         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359     }
1360     printf(", data=%p\n", data);
1361 }
1362 #endif
1363 
1364 
1365 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1366 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1367 {
1368     /* Optimization for empty strings */
1369     if (size == 0) {
1370         return unicode_new_empty();
1371     }
1372 
1373     PyObject *obj;
1374     PyCompactUnicodeObject *unicode;
1375     void *data;
1376     enum PyUnicode_Kind kind;
1377     int is_sharing, is_ascii;
1378     Py_ssize_t char_size;
1379     Py_ssize_t struct_size;
1380 
1381     is_ascii = 0;
1382     is_sharing = 0;
1383     struct_size = sizeof(PyCompactUnicodeObject);
1384     if (maxchar < 128) {
1385         kind = PyUnicode_1BYTE_KIND;
1386         char_size = 1;
1387         is_ascii = 1;
1388         struct_size = sizeof(PyASCIIObject);
1389     }
1390     else if (maxchar < 256) {
1391         kind = PyUnicode_1BYTE_KIND;
1392         char_size = 1;
1393     }
1394     else if (maxchar < 65536) {
1395         kind = PyUnicode_2BYTE_KIND;
1396         char_size = 2;
1397         if (sizeof(wchar_t) == 2)
1398             is_sharing = 1;
1399     }
1400     else {
1401         if (maxchar > MAX_UNICODE) {
1402             PyErr_SetString(PyExc_SystemError,
1403                             "invalid maximum character passed to PyUnicode_New");
1404             return NULL;
1405         }
1406         kind = PyUnicode_4BYTE_KIND;
1407         char_size = 4;
1408         if (sizeof(wchar_t) == 4)
1409             is_sharing = 1;
1410     }
1411 
1412     /* Ensure we won't overflow the size. */
1413     if (size < 0) {
1414         PyErr_SetString(PyExc_SystemError,
1415                         "Negative size passed to PyUnicode_New");
1416         return NULL;
1417     }
1418     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1419         return PyErr_NoMemory();
1420 
1421     /* Duplicated allocation code from _PyObject_New() instead of a call to
1422      * PyObject_New() so we are able to allocate space for the object and
1423      * it's data buffer.
1424      */
1425     obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1426     if (obj == NULL) {
1427         return PyErr_NoMemory();
1428     }
1429     _PyObject_Init(obj, &PyUnicode_Type);
1430 
1431     unicode = (PyCompactUnicodeObject *)obj;
1432     if (is_ascii)
1433         data = ((PyASCIIObject*)obj) + 1;
1434     else
1435         data = unicode + 1;
1436     _PyUnicode_LENGTH(unicode) = size;
1437     _PyUnicode_HASH(unicode) = -1;
1438     _PyUnicode_STATE(unicode).interned = 0;
1439     _PyUnicode_STATE(unicode).kind = kind;
1440     _PyUnicode_STATE(unicode).compact = 1;
1441     _PyUnicode_STATE(unicode).ready = 1;
1442     _PyUnicode_STATE(unicode).ascii = is_ascii;
1443     if (is_ascii) {
1444         ((char*)data)[size] = 0;
1445         _PyUnicode_WSTR(unicode) = NULL;
1446     }
1447     else if (kind == PyUnicode_1BYTE_KIND) {
1448         ((char*)data)[size] = 0;
1449         _PyUnicode_WSTR(unicode) = NULL;
1450         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451         unicode->utf8 = NULL;
1452         unicode->utf8_length = 0;
1453     }
1454     else {
1455         unicode->utf8 = NULL;
1456         unicode->utf8_length = 0;
1457         if (kind == PyUnicode_2BYTE_KIND)
1458             ((Py_UCS2*)data)[size] = 0;
1459         else /* kind == PyUnicode_4BYTE_KIND */
1460             ((Py_UCS4*)data)[size] = 0;
1461         if (is_sharing) {
1462             _PyUnicode_WSTR_LENGTH(unicode) = size;
1463             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1464         }
1465         else {
1466             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467             _PyUnicode_WSTR(unicode) = NULL;
1468         }
1469     }
1470 #ifdef Py_DEBUG
1471     unicode_fill_invalid((PyObject*)unicode, 0);
1472 #endif
1473     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1474     return obj;
1475 }
1476 
1477 #if SIZEOF_WCHAR_T == 2
1478 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1479    will decode surrogate pairs, the other conversions are implemented as macros
1480    for efficiency.
1481 
1482    This function assumes that unicode can hold one more code point than wstr
1483    characters for a terminating null character. */
1484 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1485 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1486                               PyObject *unicode)
1487 {
1488     const wchar_t *iter;
1489     Py_UCS4 *ucs4_out;
1490 
1491     assert(unicode != NULL);
1492     assert(_PyUnicode_CHECK(unicode));
1493     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1495 
1496     for (iter = begin; iter < end; ) {
1497         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498                            _PyUnicode_GET_LENGTH(unicode)));
1499         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1500             && (iter+1) < end
1501             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1502         {
1503             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1504             iter += 2;
1505         }
1506         else {
1507             *ucs4_out++ = *iter;
1508             iter++;
1509         }
1510     }
1511     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512                         _PyUnicode_GET_LENGTH(unicode)));
1513 
1514 }
1515 #endif
1516 
1517 static int
unicode_check_modifiable(PyObject * unicode)1518 unicode_check_modifiable(PyObject *unicode)
1519 {
1520     if (!unicode_modifiable(unicode)) {
1521         PyErr_SetString(PyExc_SystemError,
1522                         "Cannot modify a string currently used");
1523         return -1;
1524     }
1525     return 0;
1526 }
1527 
1528 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529 _copy_characters(PyObject *to, Py_ssize_t to_start,
1530                  PyObject *from, Py_ssize_t from_start,
1531                  Py_ssize_t how_many, int check_maxchar)
1532 {
1533     unsigned int from_kind, to_kind;
1534     const void *from_data;
1535     void *to_data;
1536 
1537     assert(0 <= how_many);
1538     assert(0 <= from_start);
1539     assert(0 <= to_start);
1540     assert(PyUnicode_Check(from));
1541     assert(PyUnicode_IS_READY(from));
1542     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543 
1544     assert(PyUnicode_Check(to));
1545     assert(PyUnicode_IS_READY(to));
1546     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547 
1548     if (how_many == 0)
1549         return 0;
1550 
1551     from_kind = PyUnicode_KIND(from);
1552     from_data = PyUnicode_DATA(from);
1553     to_kind = PyUnicode_KIND(to);
1554     to_data = PyUnicode_DATA(to);
1555 
1556 #ifdef Py_DEBUG
1557     if (!check_maxchar
1558         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559     {
1560         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561         Py_UCS4 ch;
1562         Py_ssize_t i;
1563         for (i=0; i < how_many; i++) {
1564             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565             assert(ch <= to_maxchar);
1566         }
1567     }
1568 #endif
1569 
1570     if (from_kind == to_kind) {
1571         if (check_maxchar
1572             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573         {
1574             /* Writing Latin-1 characters into an ASCII string requires to
1575                check that all written characters are pure ASCII */
1576             Py_UCS4 max_char;
1577             max_char = ucs1lib_find_max_char(from_data,
1578                                              (const Py_UCS1*)from_data + how_many);
1579             if (max_char >= 128)
1580                 return -1;
1581         }
1582         memcpy((char*)to_data + to_kind * to_start,
1583                   (const char*)from_data + from_kind * from_start,
1584                   to_kind * how_many);
1585     }
1586     else if (from_kind == PyUnicode_1BYTE_KIND
1587              && to_kind == PyUnicode_2BYTE_KIND)
1588     {
1589         _PyUnicode_CONVERT_BYTES(
1590             Py_UCS1, Py_UCS2,
1591             PyUnicode_1BYTE_DATA(from) + from_start,
1592             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593             PyUnicode_2BYTE_DATA(to) + to_start
1594             );
1595     }
1596     else if (from_kind == PyUnicode_1BYTE_KIND
1597              && to_kind == PyUnicode_4BYTE_KIND)
1598     {
1599         _PyUnicode_CONVERT_BYTES(
1600             Py_UCS1, Py_UCS4,
1601             PyUnicode_1BYTE_DATA(from) + from_start,
1602             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603             PyUnicode_4BYTE_DATA(to) + to_start
1604             );
1605     }
1606     else if (from_kind == PyUnicode_2BYTE_KIND
1607              && to_kind == PyUnicode_4BYTE_KIND)
1608     {
1609         _PyUnicode_CONVERT_BYTES(
1610             Py_UCS2, Py_UCS4,
1611             PyUnicode_2BYTE_DATA(from) + from_start,
1612             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613             PyUnicode_4BYTE_DATA(to) + to_start
1614             );
1615     }
1616     else {
1617         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618 
1619         if (!check_maxchar) {
1620             if (from_kind == PyUnicode_2BYTE_KIND
1621                 && to_kind == PyUnicode_1BYTE_KIND)
1622             {
1623                 _PyUnicode_CONVERT_BYTES(
1624                     Py_UCS2, Py_UCS1,
1625                     PyUnicode_2BYTE_DATA(from) + from_start,
1626                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627                     PyUnicode_1BYTE_DATA(to) + to_start
1628                     );
1629             }
1630             else if (from_kind == PyUnicode_4BYTE_KIND
1631                      && to_kind == PyUnicode_1BYTE_KIND)
1632             {
1633                 _PyUnicode_CONVERT_BYTES(
1634                     Py_UCS4, Py_UCS1,
1635                     PyUnicode_4BYTE_DATA(from) + from_start,
1636                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637                     PyUnicode_1BYTE_DATA(to) + to_start
1638                     );
1639             }
1640             else if (from_kind == PyUnicode_4BYTE_KIND
1641                      && to_kind == PyUnicode_2BYTE_KIND)
1642             {
1643                 _PyUnicode_CONVERT_BYTES(
1644                     Py_UCS4, Py_UCS2,
1645                     PyUnicode_4BYTE_DATA(from) + from_start,
1646                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647                     PyUnicode_2BYTE_DATA(to) + to_start
1648                     );
1649             }
1650             else {
1651                 Py_UNREACHABLE();
1652             }
1653         }
1654         else {
1655             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656             Py_UCS4 ch;
1657             Py_ssize_t i;
1658 
1659             for (i=0; i < how_many; i++) {
1660                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661                 if (ch > to_maxchar)
1662                     return -1;
1663                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664             }
1665         }
1666     }
1667     return 0;
1668 }
1669 
1670 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671 _PyUnicode_FastCopyCharacters(
1672     PyObject *to, Py_ssize_t to_start,
1673     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674 {
1675     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676 }
1677 
1678 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680                          PyObject *from, Py_ssize_t from_start,
1681                          Py_ssize_t how_many)
1682 {
1683     int err;
1684 
1685     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686         PyErr_BadInternalCall();
1687         return -1;
1688     }
1689 
1690     if (PyUnicode_READY(from) == -1)
1691         return -1;
1692     if (PyUnicode_READY(to) == -1)
1693         return -1;
1694 
1695     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696         PyErr_SetString(PyExc_IndexError, "string index out of range");
1697         return -1;
1698     }
1699     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700         PyErr_SetString(PyExc_IndexError, "string index out of range");
1701         return -1;
1702     }
1703     if (how_many < 0) {
1704         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705         return -1;
1706     }
1707     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709         PyErr_Format(PyExc_SystemError,
1710                      "Cannot write %zi characters at %zi "
1711                      "in a string of %zi characters",
1712                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1713         return -1;
1714     }
1715 
1716     if (how_many == 0)
1717         return 0;
1718 
1719     if (unicode_check_modifiable(to))
1720         return -1;
1721 
1722     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723     if (err) {
1724         PyErr_Format(PyExc_SystemError,
1725                      "Cannot copy %s characters "
1726                      "into a string of %s characters",
1727                      unicode_kind_name(from),
1728                      unicode_kind_name(to));
1729         return -1;
1730     }
1731     return how_many;
1732 }
1733 
1734 /* Find the maximum code point and count the number of surrogate pairs so a
1735    correct string length can be computed before converting a string to UCS4.
1736    This function counts single surrogates as a character and not as a pair.
1737 
1738    Return 0 on success, or -1 on error. */
1739 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742 {
1743     const wchar_t *iter;
1744     Py_UCS4 ch;
1745 
1746     assert(num_surrogates != NULL && maxchar != NULL);
1747     *num_surrogates = 0;
1748     *maxchar = 0;
1749 
1750     for (iter = begin; iter < end; ) {
1751 #if SIZEOF_WCHAR_T == 2
1752         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753             && (iter+1) < end
1754             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755         {
1756             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757             ++(*num_surrogates);
1758             iter += 2;
1759         }
1760         else
1761 #endif
1762         {
1763             ch = *iter;
1764             iter++;
1765         }
1766         if (ch > *maxchar) {
1767             *maxchar = ch;
1768             if (*maxchar > MAX_UNICODE) {
1769                 PyErr_Format(PyExc_ValueError,
1770                              "character U+%x is not in range [U+0000; U+%x]",
1771                              ch, MAX_UNICODE);
1772                 return -1;
1773             }
1774         }
1775     }
1776     return 0;
1777 }
1778 
/* Convert a legacy, not-yet-ready string (one that only has a wchar_t
   buffer) into its canonical representation.

   The wchar_t buffer is scanned once to find the largest code point; the
   string is then stored as 1-byte, 2-byte or 4-byte data.  Depending on
   sizeof(wchar_t) the wchar_t buffer is either shared as the canonical
   buffer or converted into a new allocation and freed.

   Return 0 on success, or -1 with an exception set (ValueError for an
   out-of-range character, MemoryError on allocation failure). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wchar_t data yields both the maximum code point
       and the number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Every character fits in one byte: narrow into a new buffer (with room
       for the terminating null) and release the wchar_t buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the 1-byte data is also used as the UTF-8
               buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow each 4-byte wchar_t into a new 2-byte buffer, then free
           the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair becomes a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) allocation size against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the wchar_t buffer is used
           directly as the canonical data. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Null-terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1907 
/* Deallocator for string objects.

   Removes a mortal interned string from the `interned` dictionary, frees
   the wstr, UTF-8 and (for non-compact strings) data buffers that the
   object owns, and finally releases the object through its type's
   tp_free slot. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    /* Debug builds treat deallocating a singleton as fatal, unless
       unicode_is_finalizing() reports true. */
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif

    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;
    case SSTATE_INTERNED_MORTAL:
    {
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* A destructor cannot propagate the error; report it as
               unraisable instead. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        Py_SET_REFCNT(unicode, 0);
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Free each auxiliary buffer only when the corresponding
       HAS_*_MEMORY check says the object owns it. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    /* Only non-compact strings have a separately allocated data buffer. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1958 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the shared single-character Latin-1 strings, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    /* Only a string of exactly one character with a canonical (non-wchar)
       representation can be a Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1977 
1978 static int
unicode_modifiable(PyObject * unicode)1979 unicode_modifiable(PyObject *unicode)
1980 {
1981     assert(_PyUnicode_CHECK(unicode));
1982     if (Py_REFCNT(unicode) != 1)
1983         return 0;
1984     if (_PyUnicode_HASH(unicode) != -1)
1985         return 0;
1986     if (PyUnicode_CHECK_INTERNED(unicode))
1987         return 0;
1988     if (!PyUnicode_CheckExact(unicode))
1989         return 0;
1990 #ifdef Py_DEBUG
1991     /* singleton refcount is greater than 1 */
1992     assert(!unicode_is_singleton(unicode));
1993 #endif
1994     return 1;
1995 }
1996 
1997 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999 {
2000     PyObject *unicode;
2001     Py_ssize_t old_length;
2002 
2003     assert(p_unicode != NULL);
2004     unicode = *p_unicode;
2005 
2006     assert(unicode != NULL);
2007     assert(PyUnicode_Check(unicode));
2008     assert(0 <= length);
2009 
2010     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011         old_length = PyUnicode_WSTR_LENGTH(unicode);
2012     else
2013         old_length = PyUnicode_GET_LENGTH(unicode);
2014     if (old_length == length)
2015         return 0;
2016 
2017     if (length == 0) {
2018         PyObject *empty = unicode_new_empty();
2019         Py_SETREF(*p_unicode, empty);
2020         return 0;
2021     }
2022 
2023     if (!unicode_modifiable(unicode)) {
2024         PyObject *copy = resize_copy(unicode, length);
2025         if (copy == NULL)
2026             return -1;
2027         Py_SETREF(*p_unicode, copy);
2028         return 0;
2029     }
2030 
2031     if (PyUnicode_IS_COMPACT(unicode)) {
2032         PyObject *new_unicode = resize_compact(unicode, length);
2033         if (new_unicode == NULL)
2034             return -1;
2035         *p_unicode = new_unicode;
2036         return 0;
2037     }
2038     return resize_inplace(unicode, length);
2039 }
2040 
2041 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043 {
2044     PyObject *unicode;
2045     if (p_unicode == NULL) {
2046         PyErr_BadInternalCall();
2047         return -1;
2048     }
2049     unicode = *p_unicode;
2050     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051     {
2052         PyErr_BadInternalCall();
2053         return -1;
2054     }
2055     return unicode_resize(p_unicode, length);
2056 }
2057 
2058 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059 
2060    WARNING: The function doesn't copy the terminating null character and
2061    doesn't check the maximum character (may write a latin1 character in an
2062    ASCII string). */
2063 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065                    const char *str, Py_ssize_t len)
2066 {
2067     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068     const void *data = PyUnicode_DATA(unicode);
2069     const char *end = str + len;
2070 
2071     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072     switch (kind) {
2073     case PyUnicode_1BYTE_KIND: {
2074 #ifdef Py_DEBUG
2075         if (PyUnicode_IS_ASCII(unicode)) {
2076             Py_UCS4 maxchar = ucs1lib_find_max_char(
2077                 (const Py_UCS1*)str,
2078                 (const Py_UCS1*)str + len);
2079             assert(maxchar < 128);
2080         }
2081 #endif
2082         memcpy((char *) data + index, str, len);
2083         break;
2084     }
2085     case PyUnicode_2BYTE_KIND: {
2086         Py_UCS2 *start = (Py_UCS2 *)data + index;
2087         Py_UCS2 *ucs2 = start;
2088 
2089         for (; str < end; ++ucs2, ++str)
2090             *ucs2 = (Py_UCS2)*str;
2091 
2092         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093         break;
2094     }
2095     case PyUnicode_4BYTE_KIND: {
2096         Py_UCS4 *start = (Py_UCS4 *)data + index;
2097         Py_UCS4 *ucs4 = start;
2098 
2099         for (; str < end; ++ucs4, ++str)
2100             *ucs4 = (Py_UCS4)*str;
2101 
2102         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103         break;
2104     }
2105     default:
2106         Py_UNREACHABLE();
2107     }
2108 }
2109 
2110 static PyObject*
get_latin1_char(Py_UCS1 ch)2111 get_latin1_char(Py_UCS1 ch)
2112 {
2113     return Py_NewRef(LATIN1(ch));
2114 }
2115 
2116 static PyObject*
unicode_char(Py_UCS4 ch)2117 unicode_char(Py_UCS4 ch)
2118 {
2119     PyObject *unicode;
2120 
2121     assert(ch <= MAX_UNICODE);
2122 
2123     if (ch < 256) {
2124         return get_latin1_char(ch);
2125     }
2126 
2127     unicode = PyUnicode_New(1, ch);
2128     if (unicode == NULL)
2129         return NULL;
2130 
2131     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134     } else {
2135         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137     }
2138     assert(_PyUnicode_CheckConsistency(unicode, 1));
2139     return unicode;
2140 }
2141 
2142 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144 {
2145     if (u == NULL) {
2146         if (size > 0) {
2147             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148                     "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149                     "use PyUnicode_New() instead", 1) < 0) {
2150                 return NULL;
2151             }
2152         }
2153         return (PyObject*)_PyUnicode_New(size);
2154     }
2155 
2156     if (size < 0) {
2157         PyErr_BadInternalCall();
2158         return NULL;
2159     }
2160 
2161     return PyUnicode_FromWideChar(u, size);
2162 }
2163 
2164 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2165 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2166 {
2167     PyObject *unicode;
2168     Py_UCS4 maxchar = 0;
2169     Py_ssize_t num_surrogates;
2170 
2171     if (u == NULL && size != 0) {
2172         PyErr_BadInternalCall();
2173         return NULL;
2174     }
2175 
2176     if (size == -1) {
2177         size = wcslen(u);
2178     }
2179 
2180     /* If the Unicode data is known at construction time, we can apply
2181        some optimizations which share commonly used objects. */
2182 
2183     /* Optimization for empty strings */
2184     if (size == 0)
2185         _Py_RETURN_UNICODE_EMPTY();
2186 
2187 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2188     /* Oracle Solaris uses non-Unicode internal wchar_t form for
2189        non-Unicode locales and hence needs conversion to UCS-4 first. */
2190     if (_Py_LocaleUsesNonUnicodeWchar()) {
2191         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2192         if (!converted) {
2193             return NULL;
2194         }
2195         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2196         PyMem_Free(converted);
2197         return unicode;
2198     }
2199 #endif
2200 
2201     /* Single character Unicode objects in the Latin-1 range are
2202        shared when using this constructor */
2203     if (size == 1 && (Py_UCS4)*u < 256)
2204         return get_latin1_char((unsigned char)*u);
2205 
2206     /* If not empty and not single character, copy the Unicode data
2207        into the new object */
2208     if (find_maxchar_surrogates(u, u + size,
2209                                 &maxchar, &num_surrogates) == -1)
2210         return NULL;
2211 
2212     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213     if (!unicode)
2214         return NULL;
2215 
2216     switch (PyUnicode_KIND(unicode)) {
2217     case PyUnicode_1BYTE_KIND:
2218         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2219                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2220         break;
2221     case PyUnicode_2BYTE_KIND:
2222 #if Py_UNICODE_SIZE == 2
2223         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2224 #else
2225         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2226                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2227 #endif
2228         break;
2229     case PyUnicode_4BYTE_KIND:
2230 #if SIZEOF_WCHAR_T == 2
2231         /* This is the only case which has to process surrogates, thus
2232            a simple copy loop is not enough and we need a function. */
2233         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2234 #else
2235         assert(num_surrogates == 0);
2236         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2237 #endif
2238         break;
2239     default:
2240         Py_UNREACHABLE();
2241     }
2242 
2243     return unicode_result(unicode);
2244 }
2245 
2246 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248 {
2249     if (size < 0) {
2250         PyErr_SetString(PyExc_SystemError,
2251                         "Negative size passed to PyUnicode_FromStringAndSize");
2252         return NULL;
2253     }
2254     if (u != NULL) {
2255         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256     }
2257     else {
2258         if (size > 0) {
2259             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260                     "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261                     "use PyUnicode_New() instead", 1) < 0) {
2262                 return NULL;
2263             }
2264         }
2265         return (PyObject *)_PyUnicode_New(size);
2266     }
2267 }
2268 
2269 PyObject *
PyUnicode_FromString(const char * u)2270 PyUnicode_FromString(const char *u)
2271 {
2272     size_t size = strlen(u);
2273     if (size > PY_SSIZE_T_MAX) {
2274         PyErr_SetString(PyExc_OverflowError, "input too long");
2275         return NULL;
2276     }
2277     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278 }
2279 
2280 
2281 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2282 _PyUnicode_FromId(_Py_Identifier *id)
2283 {
2284     PyInterpreterState *interp = _PyInterpreterState_GET();
2285     struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286 
2287     Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288     if (index < 0) {
2289         struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290 
2291         PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292         // Check again to detect concurrent access. Another thread can have
2293         // initialized the index while this thread waited for the lock.
2294         index = _Py_atomic_size_get(&id->index);
2295         if (index < 0) {
2296             assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297             index = rt_ids->next_index;
2298             rt_ids->next_index++;
2299             _Py_atomic_size_set(&id->index, index);
2300         }
2301         PyThread_release_lock(rt_ids->lock);
2302     }
2303     assert(index >= 0);
2304 
2305     PyObject *obj;
2306     if (index < ids->size) {
2307         obj = ids->array[index];
2308         if (obj) {
2309             // Return a borrowed reference
2310             return obj;
2311         }
2312     }
2313 
2314     obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315                                        NULL, NULL);
2316     if (!obj) {
2317         return NULL;
2318     }
2319     PyUnicode_InternInPlace(&obj);
2320 
2321     if (index >= ids->size) {
2322         // Overallocate to reduce the number of realloc
2323         Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324         Py_ssize_t item_size = sizeof(ids->array[0]);
2325         PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326         if (new_array == NULL) {
2327             PyErr_NoMemory();
2328             return NULL;
2329         }
2330         memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331         ids->array = new_array;
2332         ids->size = new_size;
2333     }
2334 
2335     // The array stores a strong reference
2336     ids->array[index] = obj;
2337 
2338     // Return a borrowed reference
2339     return obj;
2340 }
2341 
2342 
2343 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2344 unicode_clear_identifiers(struct _Py_unicode_state *state)
2345 {
2346     struct _Py_unicode_ids *ids = &state->ids;
2347     for (Py_ssize_t i=0; i < ids->size; i++) {
2348         Py_XDECREF(ids->array[i]);
2349     }
2350     ids->size = 0;
2351     PyMem_Free(ids->array);
2352     ids->array = NULL;
2353     // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354     // after Py_Finalize().
2355 }
2356 
2357 
2358 /* Internal function, doesn't check maximum character */
2359 
2360 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362 {
2363     const unsigned char *s = (const unsigned char *)buffer;
2364     PyObject *unicode;
2365     if (size == 1) {
2366 #ifdef Py_DEBUG
2367         assert((unsigned char)s[0] < 128);
2368 #endif
2369         return get_latin1_char(s[0]);
2370     }
2371     unicode = PyUnicode_New(size, 127);
2372     if (!unicode)
2373         return NULL;
2374     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375     assert(_PyUnicode_CheckConsistency(unicode, 1));
2376     return unicode;
2377 }
2378 
2379 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2380 kind_maxchar_limit(unsigned int kind)
2381 {
2382     switch (kind) {
2383     case PyUnicode_1BYTE_KIND:
2384         return 0x80;
2385     case PyUnicode_2BYTE_KIND:
2386         return 0x100;
2387     case PyUnicode_4BYTE_KIND:
2388         return 0x10000;
2389     default:
2390         Py_UNREACHABLE();
2391     }
2392 }
2393 
2394 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396 {
2397     PyObject *res;
2398     unsigned char max_char;
2399 
2400     if (size == 0) {
2401         _Py_RETURN_UNICODE_EMPTY();
2402     }
2403     assert(size > 0);
2404     if (size == 1) {
2405         return get_latin1_char(u[0]);
2406     }
2407 
2408     max_char = ucs1lib_find_max_char(u, u + size);
2409     res = PyUnicode_New(size, max_char);
2410     if (!res)
2411         return NULL;
2412     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413     assert(_PyUnicode_CheckConsistency(res, 1));
2414     return res;
2415 }
2416 
2417 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419 {
2420     PyObject *res;
2421     Py_UCS2 max_char;
2422 
2423     if (size == 0)
2424         _Py_RETURN_UNICODE_EMPTY();
2425     assert(size > 0);
2426     if (size == 1)
2427         return unicode_char(u[0]);
2428 
2429     max_char = ucs2lib_find_max_char(u, u + size);
2430     res = PyUnicode_New(size, max_char);
2431     if (!res)
2432         return NULL;
2433     if (max_char >= 256)
2434         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435     else {
2436         _PyUnicode_CONVERT_BYTES(
2437             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438     }
2439     assert(_PyUnicode_CheckConsistency(res, 1));
2440     return res;
2441 }
2442 
2443 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445 {
2446     PyObject *res;
2447     Py_UCS4 max_char;
2448 
2449     if (size == 0)
2450         _Py_RETURN_UNICODE_EMPTY();
2451     assert(size > 0);
2452     if (size == 1)
2453         return unicode_char(u[0]);
2454 
2455     max_char = ucs4lib_find_max_char(u, u + size);
2456     res = PyUnicode_New(size, max_char);
2457     if (!res)
2458         return NULL;
2459     if (max_char < 256)
2460         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461                                  PyUnicode_1BYTE_DATA(res));
2462     else if (max_char < 0x10000)
2463         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464                                  PyUnicode_2BYTE_DATA(res));
2465     else
2466         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467     assert(_PyUnicode_CheckConsistency(res, 1));
2468     return res;
2469 }
2470 
2471 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473 {
2474     if (size < 0) {
2475         PyErr_SetString(PyExc_ValueError, "size must be positive");
2476         return NULL;
2477     }
2478     switch (kind) {
2479     case PyUnicode_1BYTE_KIND:
2480         return _PyUnicode_FromUCS1(buffer, size);
2481     case PyUnicode_2BYTE_KIND:
2482         return _PyUnicode_FromUCS2(buffer, size);
2483     case PyUnicode_4BYTE_KIND:
2484         return _PyUnicode_FromUCS4(buffer, size);
2485     default:
2486         PyErr_SetString(PyExc_SystemError, "invalid kind");
2487         return NULL;
2488     }
2489 }
2490 
2491 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493 {
2494     enum PyUnicode_Kind kind;
2495     const void *startptr, *endptr;
2496 
2497     assert(PyUnicode_IS_READY(unicode));
2498     assert(0 <= start);
2499     assert(end <= PyUnicode_GET_LENGTH(unicode));
2500     assert(start <= end);
2501 
2502     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503         return PyUnicode_MAX_CHAR_VALUE(unicode);
2504 
2505     if (start == end)
2506         return 127;
2507 
2508     if (PyUnicode_IS_ASCII(unicode))
2509         return 127;
2510 
2511     kind = PyUnicode_KIND(unicode);
2512     startptr = PyUnicode_DATA(unicode);
2513     endptr = (char *)startptr + end * kind;
2514     startptr = (char *)startptr + start * kind;
2515     switch(kind) {
2516     case PyUnicode_1BYTE_KIND:
2517         return ucs1lib_find_max_char(startptr, endptr);
2518     case PyUnicode_2BYTE_KIND:
2519         return ucs2lib_find_max_char(startptr, endptr);
2520     case PyUnicode_4BYTE_KIND:
2521         return ucs4lib_find_max_char(startptr, endptr);
2522     default:
2523         Py_UNREACHABLE();
2524     }
2525 }
2526 
2527 /* Ensure that a string uses the most efficient storage, if it is not the
2528    case: create a new string with of the right kind. Write NULL into *p_unicode
2529    on error. */
2530 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2531 unicode_adjust_maxchar(PyObject **p_unicode)
2532 {
2533     PyObject *unicode, *copy;
2534     Py_UCS4 max_char;
2535     Py_ssize_t len;
2536     unsigned int kind;
2537 
2538     assert(p_unicode != NULL);
2539     unicode = *p_unicode;
2540     assert(PyUnicode_IS_READY(unicode));
2541     if (PyUnicode_IS_ASCII(unicode))
2542         return;
2543 
2544     len = PyUnicode_GET_LENGTH(unicode);
2545     kind = PyUnicode_KIND(unicode);
2546     if (kind == PyUnicode_1BYTE_KIND) {
2547         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548         max_char = ucs1lib_find_max_char(u, u + len);
2549         if (max_char >= 128)
2550             return;
2551     }
2552     else if (kind == PyUnicode_2BYTE_KIND) {
2553         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554         max_char = ucs2lib_find_max_char(u, u + len);
2555         if (max_char >= 256)
2556             return;
2557     }
2558     else if (kind == PyUnicode_4BYTE_KIND) {
2559         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560         max_char = ucs4lib_find_max_char(u, u + len);
2561         if (max_char >= 0x10000)
2562             return;
2563     }
2564     else
2565         Py_UNREACHABLE();
2566 
2567     copy = PyUnicode_New(len, max_char);
2568     if (copy != NULL)
2569         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570     Py_DECREF(unicode);
2571     *p_unicode = copy;
2572 }
2573 
2574 PyObject*
_PyUnicode_Copy(PyObject * unicode)2575 _PyUnicode_Copy(PyObject *unicode)
2576 {
2577     Py_ssize_t length;
2578     PyObject *copy;
2579 
2580     if (!PyUnicode_Check(unicode)) {
2581         PyErr_BadInternalCall();
2582         return NULL;
2583     }
2584     if (PyUnicode_READY(unicode) == -1)
2585         return NULL;
2586 
2587     length = PyUnicode_GET_LENGTH(unicode);
2588     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589     if (!copy)
2590         return NULL;
2591     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592 
2593     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594               length * PyUnicode_KIND(unicode));
2595     assert(_PyUnicode_CheckConsistency(copy, 1));
2596     return copy;
2597 }
2598 
2599 
2600 /* Widen Unicode objects to larger buffers. Don't write terminating null
2601    character. Return NULL on error. */
2602 
2603 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605 {
2606     void *result;
2607 
2608     assert(skind < kind);
2609     switch (kind) {
2610     case PyUnicode_2BYTE_KIND:
2611         result = PyMem_New(Py_UCS2, len);
2612         if (!result)
2613             return PyErr_NoMemory();
2614         assert(skind == PyUnicode_1BYTE_KIND);
2615         _PyUnicode_CONVERT_BYTES(
2616             Py_UCS1, Py_UCS2,
2617             (const Py_UCS1 *)data,
2618             ((const Py_UCS1 *)data) + len,
2619             result);
2620         return result;
2621     case PyUnicode_4BYTE_KIND:
2622         result = PyMem_New(Py_UCS4, len);
2623         if (!result)
2624             return PyErr_NoMemory();
2625         if (skind == PyUnicode_2BYTE_KIND) {
2626             _PyUnicode_CONVERT_BYTES(
2627                 Py_UCS2, Py_UCS4,
2628                 (const Py_UCS2 *)data,
2629                 ((const Py_UCS2 *)data) + len,
2630                 result);
2631         }
2632         else {
2633             assert(skind == PyUnicode_1BYTE_KIND);
2634             _PyUnicode_CONVERT_BYTES(
2635                 Py_UCS1, Py_UCS4,
2636                 (const Py_UCS1 *)data,
2637                 ((const Py_UCS1 *)data) + len,
2638                 result);
2639         }
2640         return result;
2641     default:
2642         Py_UNREACHABLE();
2643         return NULL;
2644     }
2645 }
2646 
2647 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649         int copy_null)
2650 {
2651     int kind;
2652     const void *data;
2653     Py_ssize_t len, targetlen;
2654     if (PyUnicode_READY(string) == -1)
2655         return NULL;
2656     kind = PyUnicode_KIND(string);
2657     data = PyUnicode_DATA(string);
2658     len = PyUnicode_GET_LENGTH(string);
2659     targetlen = len;
2660     if (copy_null)
2661         targetlen++;
2662     if (!target) {
2663         target = PyMem_New(Py_UCS4, targetlen);
2664         if (!target) {
2665             PyErr_NoMemory();
2666             return NULL;
2667         }
2668     }
2669     else {
2670         if (targetsize < targetlen) {
2671             PyErr_Format(PyExc_SystemError,
2672                          "string is longer than the buffer");
2673             if (copy_null && 0 < targetsize)
2674                 target[0] = 0;
2675             return NULL;
2676         }
2677     }
2678     if (kind == PyUnicode_1BYTE_KIND) {
2679         const Py_UCS1 *start = (const Py_UCS1 *) data;
2680         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681     }
2682     else if (kind == PyUnicode_2BYTE_KIND) {
2683         const Py_UCS2 *start = (const Py_UCS2 *) data;
2684         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685     }
2686     else if (kind == PyUnicode_4BYTE_KIND) {
2687         memcpy(target, data, len * sizeof(Py_UCS4));
2688     }
2689     else {
2690         Py_UNREACHABLE();
2691     }
2692     if (copy_null)
2693         target[len] = 0;
2694     return target;
2695 }
2696 
2697 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699                  int copy_null)
2700 {
2701     if (target == NULL || targetsize < 0) {
2702         PyErr_BadInternalCall();
2703         return NULL;
2704     }
2705     return as_ucs4(string, target, targetsize, copy_null);
2706 }
2707 
2708 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2709 PyUnicode_AsUCS4Copy(PyObject *string)
2710 {
2711     return as_ucs4(string, NULL, 0, 1);
2712 }
2713 
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   In integer arithmetic (a - 1) / 22 + 1 equals ceil(a / 22) for a > 0, so
   the expression below is ceil(SIZEOF_LONG_LONG * 53 / 22) + 1: the digit
   bound plus one for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718 
2719 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721                              Py_ssize_t width, Py_ssize_t precision)
2722 {
2723     Py_ssize_t length, fill, arglen;
2724     Py_UCS4 maxchar;
2725 
2726     if (PyUnicode_READY(str) == -1)
2727         return -1;
2728 
2729     length = PyUnicode_GET_LENGTH(str);
2730     if ((precision == -1 || precision >= length)
2731         && width <= length)
2732         return _PyUnicodeWriter_WriteStr(writer, str);
2733 
2734     if (precision != -1)
2735         length = Py_MIN(precision, length);
2736 
2737     arglen = Py_MAX(length, width);
2738     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740     else
2741         maxchar = writer->maxchar;
2742 
2743     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744         return -1;
2745 
2746     if (width > length) {
2747         fill = width - length;
2748         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749             return -1;
2750         writer->pos += fill;
2751     }
2752 
2753     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754                                   str, 0, length);
2755     writer->pos += length;
2756     return 0;
2757 }
2758 
2759 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761                               Py_ssize_t width, Py_ssize_t precision)
2762 {
2763     /* UTF-8 */
2764     Py_ssize_t length;
2765     PyObject *unicode;
2766     int res;
2767 
2768     if (precision == -1) {
2769         length = strlen(str);
2770     }
2771     else {
2772         length = 0;
2773         while (length < precision && str[length]) {
2774             length++;
2775         }
2776     }
2777     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778     if (unicode == NULL)
2779         return -1;
2780 
2781     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782     Py_DECREF(unicode);
2783     return res;
2784 }
2785 
/* Process a single '%' directive of a PyUnicode_FromFormatV() format string.

   writer: output writer the formatted text is appended to.
   f:      points at the '%' that starts the directive.
   vargs:  pointer to the argument list; the arguments needed by the
           directive are consumed from it with va_arg().

   Returns a pointer to the first character after the directive. For an
   unknown conversion the remainder of the format string (starting at the
   '%') is copied verbatim and a pointer to its terminating null byte is
   returned. Returns NULL on error; the failures detected directly in this
   function set an exception (helper failures presumably do as well). */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts (needed by the default case) and
       step over the '%'. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before multiplying so that width * 10 + digit cannot
               exceed PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* Note: a length modifier is only recognised when it is followed by
       'd', 'u' or 'i'. Otherwise f is left on the modifier character, which
       is not a case label below, so the directive ends up in the default
       branch. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format string:
       nothing follows this directive, so turn overallocation off. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* Single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with sprintf(); width, precision and zero
           padding are applied separately below. */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* The argument is always read as a plain int here. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* From here on, precision is the number of characters occupied by
           the number including its leading '0' fill; it is never smaller
           than the sprintf() output. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Left padding up to the field width: '0' if the '0' flag was
           given, otherwise spaces. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Zero fill up to the requested precision. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        /* Pointer value; width and precision are not applied here. */
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the text (including its null terminator) right by two
               characters to make room for the "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* String object argument, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a string object that may be
           NULL and a C string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3083 
3084 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3085 PyUnicode_FromFormatV(const char *format, va_list vargs)
3086 {
3087     va_list vargs2;
3088     const char *f;
3089     _PyUnicodeWriter writer;
3090 
3091     _PyUnicodeWriter_Init(&writer);
3092     writer.min_length = strlen(format) + 100;
3093     writer.overallocate = 1;
3094 
3095     // Copy varags to be able to pass a reference to a subfunction.
3096     va_copy(vargs2, vargs);
3097 
3098     for (f = format; *f; ) {
3099         if (*f == '%') {
3100             f = unicode_fromformat_arg(&writer, f, &vargs2);
3101             if (f == NULL)
3102                 goto fail;
3103         }
3104         else {
3105             const char *p;
3106             Py_ssize_t len;
3107 
3108             p = f;
3109             do
3110             {
3111                 if ((unsigned char)*p > 127) {
3112                     PyErr_Format(PyExc_ValueError,
3113                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114                         "string, got a non-ASCII byte: 0x%02x",
3115                         (unsigned char)*p);
3116                     goto fail;
3117                 }
3118                 p++;
3119             }
3120             while (*p != '\0' && *p != '%');
3121             len = p - f;
3122 
3123             if (*p == '\0')
3124                 writer.overallocate = 0;
3125 
3126             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127                 goto fail;
3128 
3129             f = p;
3130         }
3131     }
3132     va_end(vargs2);
3133     return _PyUnicodeWriter_Finish(&writer);
3134 
3135   fail:
3136     va_end(vargs2);
3137     _PyUnicodeWriter_Dealloc(&writer);
3138     return NULL;
3139 }
3140 
3141 PyObject *
PyUnicode_FromFormat(const char * format,...)3142 PyUnicode_FromFormat(const char *format, ...)
3143 {
3144     PyObject* ret;
3145     va_list vargs;
3146 
3147 #ifdef HAVE_STDARG_PROTOTYPES
3148     va_start(vargs, format);
3149 #else
3150     va_start(vargs);
3151 #endif
3152     ret = PyUnicode_FromFormatV(format, vargs);
3153     va_end(vargs);
3154     return ret;
3155 }
3156 
3157 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3158 unicode_get_widechar_size(PyObject *unicode)
3159 {
3160     Py_ssize_t res;
3161 
3162     assert(unicode != NULL);
3163     assert(_PyUnicode_CHECK(unicode));
3164 
3165 #if USE_UNICODE_WCHAR_CACHE
3166     if (_PyUnicode_WSTR(unicode) != NULL) {
3167         return PyUnicode_WSTR_LENGTH(unicode);
3168     }
3169 #endif /* USE_UNICODE_WCHAR_CACHE */
3170     assert(PyUnicode_IS_READY(unicode));
3171 
3172     res = _PyUnicode_LENGTH(unicode);
3173 #if SIZEOF_WCHAR_T == 2
3174     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176         const Py_UCS4 *end = s + res;
3177         for (; s < end; ++s) {
3178             if (*s > 0xFFFF) {
3179                 ++res;
3180             }
3181         }
3182     }
3183 #endif
3184     return res;
3185 }
3186 
3187 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189 {
3190     assert(unicode != NULL);
3191     assert(_PyUnicode_CHECK(unicode));
3192 
3193 #if USE_UNICODE_WCHAR_CACHE
3194     const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195     if (wstr != NULL) {
3196         memcpy(w, wstr, size * sizeof(wchar_t));
3197         return;
3198     }
3199 #else /* USE_UNICODE_WCHAR_CACHE */
3200     if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201         memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202         return;
3203     }
3204 #endif /* USE_UNICODE_WCHAR_CACHE */
3205     assert(PyUnicode_IS_READY(unicode));
3206 
3207     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209         for (; size--; ++s, ++w) {
3210             *w = *s;
3211         }
3212     }
3213     else {
3214 #if SIZEOF_WCHAR_T == 4
3215         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217         for (; size--; ++s, ++w) {
3218             *w = *s;
3219         }
3220 #else
3221         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223         for (; size--; ++s, ++w) {
3224             Py_UCS4 ch = *s;
3225             if (ch > 0xFFFF) {
3226                 assert(ch <= MAX_UNICODE);
3227                 /* encode surrogate pair in this case */
3228                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229                 if (!size--)
3230                     break;
3231                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3232             }
3233             else {
3234                 *w = ch;
3235             }
3236         }
3237 #endif
3238     }
3239 }
3240 
3241 #ifdef HAVE_WCHAR_H
3242 
3243 /* Convert a Unicode object to a wide character string.
3244 
3245    - If w is NULL: return the number of wide characters (including the null
3246      character) required to convert the unicode object. Ignore size argument.
3247 
3248    - Otherwise: return the number of wide characters (excluding the null
3249      character) written into w. Write at most size wide characters (including
3250      the null character). */
3251 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252 PyUnicode_AsWideChar(PyObject *unicode,
3253                      wchar_t *w,
3254                      Py_ssize_t size)
3255 {
3256     Py_ssize_t res;
3257 
3258     if (unicode == NULL) {
3259         PyErr_BadInternalCall();
3260         return -1;
3261     }
3262     if (!PyUnicode_Check(unicode)) {
3263         PyErr_BadArgument();
3264         return -1;
3265     }
3266 
3267     res = unicode_get_widechar_size(unicode);
3268     if (w == NULL) {
3269         return res + 1;
3270     }
3271 
3272     if (size > res) {
3273         size = res + 1;
3274     }
3275     else {
3276         res = size;
3277     }
3278     unicode_copy_as_widechar(unicode, w, size);
3279 
3280 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282        non-Unicode locales and hence needs conversion first. */
3283     if (_Py_LocaleUsesNonUnicodeWchar()) {
3284         if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285             return -1;
3286         }
3287     }
3288 #endif
3289 
3290     return res;
3291 }
3292 
3293 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294 PyUnicode_AsWideCharString(PyObject *unicode,
3295                            Py_ssize_t *size)
3296 {
3297     wchar_t *buffer;
3298     Py_ssize_t buflen;
3299 
3300     if (unicode == NULL) {
3301         PyErr_BadInternalCall();
3302         return NULL;
3303     }
3304     if (!PyUnicode_Check(unicode)) {
3305         PyErr_BadArgument();
3306         return NULL;
3307     }
3308 
3309     buflen = unicode_get_widechar_size(unicode);
3310     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311     if (buffer == NULL) {
3312         PyErr_NoMemory();
3313         return NULL;
3314     }
3315     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316 
3317 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319        non-Unicode locales and hence needs conversion first. */
3320     if (_Py_LocaleUsesNonUnicodeWchar()) {
3321         if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322             return NULL;
3323         }
3324     }
3325 #endif
3326 
3327     if (size != NULL) {
3328         *size = buflen;
3329     }
3330     else if (wcslen(buffer) != (size_t)buflen) {
3331         PyMem_Free(buffer);
3332         PyErr_SetString(PyExc_ValueError,
3333                         "embedded null character");
3334         return NULL;
3335     }
3336     return buffer;
3337 }
3338 
3339 #endif /* HAVE_WCHAR_H */
3340 
3341 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343 {
3344     wchar_t **p = (wchar_t **)ptr;
3345     if (obj == NULL) {
3346 #if !USE_UNICODE_WCHAR_CACHE
3347         PyMem_Free(*p);
3348 #endif /* USE_UNICODE_WCHAR_CACHE */
3349         *p = NULL;
3350         return 1;
3351     }
3352     if (PyUnicode_Check(obj)) {
3353 #if USE_UNICODE_WCHAR_CACHE
3354         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355         if (*p == NULL) {
3356             return 0;
3357         }
3358         return 1;
3359 #else /* USE_UNICODE_WCHAR_CACHE */
3360         *p = PyUnicode_AsWideCharString(obj, NULL);
3361         if (*p == NULL) {
3362             return 0;
3363         }
3364         return Py_CLEANUP_SUPPORTED;
3365 #endif /* USE_UNICODE_WCHAR_CACHE */
3366     }
3367     PyErr_Format(PyExc_TypeError,
3368                  "argument must be str, not %.50s",
3369                  Py_TYPE(obj)->tp_name);
3370     return 0;
3371 }
3372 
3373 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375 {
3376     wchar_t **p = (wchar_t **)ptr;
3377     if (obj == NULL) {
3378 #if !USE_UNICODE_WCHAR_CACHE
3379         PyMem_Free(*p);
3380 #endif /* USE_UNICODE_WCHAR_CACHE */
3381         *p = NULL;
3382         return 1;
3383     }
3384     if (obj == Py_None) {
3385         *p = NULL;
3386         return 1;
3387     }
3388     if (PyUnicode_Check(obj)) {
3389 #if USE_UNICODE_WCHAR_CACHE
3390         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391         if (*p == NULL) {
3392             return 0;
3393         }
3394         return 1;
3395 #else /* USE_UNICODE_WCHAR_CACHE */
3396         *p = PyUnicode_AsWideCharString(obj, NULL);
3397         if (*p == NULL) {
3398             return 0;
3399         }
3400         return Py_CLEANUP_SUPPORTED;
3401 #endif /* USE_UNICODE_WCHAR_CACHE */
3402     }
3403     PyErr_Format(PyExc_TypeError,
3404                  "argument must be str or None, not %.50s",
3405                  Py_TYPE(obj)->tp_name);
3406     return 0;
3407 }
3408 
3409 PyObject *
PyUnicode_FromOrdinal(int ordinal)3410 PyUnicode_FromOrdinal(int ordinal)
3411 {
3412     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413         PyErr_SetString(PyExc_ValueError,
3414                         "chr() arg not in range(0x110000)");
3415         return NULL;
3416     }
3417 
3418     return unicode_char((Py_UCS4)ordinal);
3419 }
3420 
3421 PyObject *
PyUnicode_FromObject(PyObject * obj)3422 PyUnicode_FromObject(PyObject *obj)
3423 {
3424     /* XXX Perhaps we should make this API an alias of
3425        PyObject_Str() instead ?! */
3426     if (PyUnicode_CheckExact(obj)) {
3427         if (PyUnicode_READY(obj) == -1)
3428             return NULL;
3429         Py_INCREF(obj);
3430         return obj;
3431     }
3432     if (PyUnicode_Check(obj)) {
3433         /* For a Unicode subtype that's not a Unicode object,
3434            return a true Unicode object with the same data. */
3435         return _PyUnicode_Copy(obj);
3436     }
3437     PyErr_Format(PyExc_TypeError,
3438                  "Can't convert '%.100s' object to str implicitly",
3439                  Py_TYPE(obj)->tp_name);
3440     return NULL;
3441 }
3442 
3443 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444 PyUnicode_FromEncodedObject(PyObject *obj,
3445                             const char *encoding,
3446                             const char *errors)
3447 {
3448     Py_buffer buffer;
3449     PyObject *v;
3450 
3451     if (obj == NULL) {
3452         PyErr_BadInternalCall();
3453         return NULL;
3454     }
3455 
3456     /* Decoding bytes objects is the most common case and should be fast */
3457     if (PyBytes_Check(obj)) {
3458         if (PyBytes_GET_SIZE(obj) == 0) {
3459             if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460                 return NULL;
3461             }
3462             _Py_RETURN_UNICODE_EMPTY();
3463         }
3464         return PyUnicode_Decode(
3465                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466                 encoding, errors);
3467     }
3468 
3469     if (PyUnicode_Check(obj)) {
3470         PyErr_SetString(PyExc_TypeError,
3471                         "decoding str is not supported");
3472         return NULL;
3473     }
3474 
3475     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477         PyErr_Format(PyExc_TypeError,
3478                      "decoding to str: need a bytes-like object, %.80s found",
3479                      Py_TYPE(obj)->tp_name);
3480         return NULL;
3481     }
3482 
3483     if (buffer.len == 0) {
3484         PyBuffer_Release(&buffer);
3485         if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486             return NULL;
3487         }
3488         _Py_RETURN_UNICODE_EMPTY();
3489     }
3490 
3491     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492     PyBuffer_Release(&buffer);
3493     return v;
3494 }
3495 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase. Every run of
   other characters between two copied characters becomes a single '_';
   such runs at the start or the end of the name are dropped.

   encoding:  null-terminated input name (must not be NULL).
   lower:     output buffer; null-terminated on success.
   lower_len: size of `lower` in bytes, including room for the null byte.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the null terminator; bail out
       before computing &lower[lower_len - 1], which would be out of range. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable position: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the preceding separator run, except at the
               very beginning of the output. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3544 
3545 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546 PyUnicode_Decode(const char *s,
3547                  Py_ssize_t size,
3548                  const char *encoding,
3549                  const char *errors)
3550 {
3551     PyObject *buffer = NULL, *unicode;
3552     Py_buffer info;
3553     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554 
3555     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556         return NULL;
3557     }
3558 
3559     if (size == 0) {
3560         _Py_RETURN_UNICODE_EMPTY();
3561     }
3562 
3563     if (encoding == NULL) {
3564         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565     }
3566 
3567     /* Shortcuts for common default encodings */
3568     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569         char *lower = buflower;
3570 
3571         /* Fast paths */
3572         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573             lower += 3;
3574             if (*lower == '_') {
3575                 /* Match "utf8" and "utf_8" */
3576                 lower++;
3577             }
3578 
3579             if (lower[0] == '8' && lower[1] == 0) {
3580                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581             }
3582             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584             }
3585             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587             }
3588         }
3589         else {
3590             if (strcmp(lower, "ascii") == 0
3591                 || strcmp(lower, "us_ascii") == 0) {
3592                 return PyUnicode_DecodeASCII(s, size, errors);
3593             }
3594     #ifdef MS_WINDOWS
3595             else if (strcmp(lower, "mbcs") == 0) {
3596                 return PyUnicode_DecodeMBCS(s, size, errors);
3597             }
3598     #endif
3599             else if (strcmp(lower, "latin1") == 0
3600                      || strcmp(lower, "latin_1") == 0
3601                      || strcmp(lower, "iso_8859_1") == 0
3602                      || strcmp(lower, "iso8859_1") == 0) {
3603                 return PyUnicode_DecodeLatin1(s, size, errors);
3604             }
3605         }
3606     }
3607 
3608     /* Decode via the codec registry */
3609     buffer = NULL;
3610     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611         goto onError;
3612     buffer = PyMemoryView_FromBuffer(&info);
3613     if (buffer == NULL)
3614         goto onError;
3615     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616     if (unicode == NULL)
3617         goto onError;
3618     if (!PyUnicode_Check(unicode)) {
3619         PyErr_Format(PyExc_TypeError,
3620                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621                      "use codecs.decode() to decode to arbitrary types",
3622                      encoding,
3623                      Py_TYPE(unicode)->tp_name);
3624         Py_DECREF(unicode);
3625         goto onError;
3626     }
3627     Py_DECREF(buffer);
3628     return unicode_result(unicode);
3629 
3630   onError:
3631     Py_XDECREF(buffer);
3632     return NULL;
3633 }
3634 
3635 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636 PyUnicode_AsDecodedObject(PyObject *unicode,
3637                           const char *encoding,
3638                           const char *errors)
3639 {
3640     if (!PyUnicode_Check(unicode)) {
3641         PyErr_BadArgument();
3642         return NULL;
3643     }
3644 
3645     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646                      "PyUnicode_AsDecodedObject() is deprecated; "
3647                      "use PyCodec_Decode() to decode from str", 1) < 0)
3648         return NULL;
3649 
3650     if (encoding == NULL)
3651         encoding = PyUnicode_GetDefaultEncoding();
3652 
3653     /* Decode via the codec registry */
3654     return PyCodec_Decode(unicode, encoding, errors);
3655 }
3656 
3657 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659                            const char *encoding,
3660                            const char *errors)
3661 {
3662     PyObject *v;
3663 
3664     if (!PyUnicode_Check(unicode)) {
3665         PyErr_BadArgument();
3666         goto onError;
3667     }
3668 
3669     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3671                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672         return NULL;
3673 
3674     if (encoding == NULL)
3675         encoding = PyUnicode_GetDefaultEncoding();
3676 
3677     /* Decode via the codec registry */
3678     v = PyCodec_Decode(unicode, encoding, errors);
3679     if (v == NULL)
3680         goto onError;
3681     if (!PyUnicode_Check(v)) {
3682         PyErr_Format(PyExc_TypeError,
3683                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684                      "use codecs.decode() to decode to arbitrary types",
3685                      encoding,
3686                      Py_TYPE(unicode)->tp_name);
3687         Py_DECREF(v);
3688         goto onError;
3689     }
3690     return unicode_result(v);
3691 
3692   onError:
3693     return NULL;
3694 }
3695 
3696 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697 PyUnicode_AsEncodedObject(PyObject *unicode,
3698                           const char *encoding,
3699                           const char *errors)
3700 {
3701     PyObject *v;
3702 
3703     if (!PyUnicode_Check(unicode)) {
3704         PyErr_BadArgument();
3705         goto onError;
3706     }
3707 
3708     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709                      "PyUnicode_AsEncodedObject() is deprecated; "
3710                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3712         return NULL;
3713 
3714     if (encoding == NULL)
3715         encoding = PyUnicode_GetDefaultEncoding();
3716 
3717     /* Encode via the codec registry */
3718     v = PyCodec_Encode(unicode, encoding, errors);
3719     if (v == NULL)
3720         goto onError;
3721     return v;
3722 
3723   onError:
3724     return NULL;
3725 }
3726 
3727 
3728 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730                       int current_locale)
3731 {
3732     Py_ssize_t wlen;
3733     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734     if (wstr == NULL) {
3735         return NULL;
3736     }
3737 
3738     if ((size_t)wlen != wcslen(wstr)) {
3739         PyErr_SetString(PyExc_ValueError, "embedded null character");
3740         PyMem_Free(wstr);
3741         return NULL;
3742     }
3743 
3744     char *str;
3745     size_t error_pos;
3746     const char *reason;
3747     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748                                  current_locale, error_handler);
3749     PyMem_Free(wstr);
3750 
3751     if (res != 0) {
3752         if (res == -2) {
3753             PyObject *exc;
3754             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755                     "locale", unicode,
3756                     (Py_ssize_t)error_pos,
3757                     (Py_ssize_t)(error_pos+1),
3758                     reason);
3759             if (exc != NULL) {
3760                 PyCodec_StrictErrors(exc);
3761                 Py_DECREF(exc);
3762             }
3763         }
3764         else if (res == -3) {
3765             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766         }
3767         else {
3768             PyErr_NoMemory();
3769         }
3770         return NULL;
3771     }
3772 
3773     PyObject *bytes = PyBytes_FromString(str);
3774     PyMem_RawFree(str);
3775     return bytes;
3776 }
3777 
3778 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780 {
3781     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782     return unicode_encode_locale(unicode, error_handler, 1);
3783 }
3784 
3785 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3786 PyUnicode_EncodeFSDefault(PyObject *unicode)
3787 {
3788     PyInterpreterState *interp = _PyInterpreterState_GET();
3789     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790     if (fs_codec->utf8) {
3791         return unicode_encode_utf8(unicode,
3792                                    fs_codec->error_handler,
3793                                    fs_codec->errors);
3794     }
3795 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3796     else if (fs_codec->encoding) {
3797         return PyUnicode_AsEncodedString(unicode,
3798                                          fs_codec->encoding,
3799                                          fs_codec->errors);
3800     }
3801 #endif
3802     else {
3803         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804            machinery is not ready and so cannot be used:
3805            use wcstombs() in this case. */
3806         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807         const wchar_t *filesystem_errors = config->filesystem_errors;
3808         assert(filesystem_errors != NULL);
3809         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810         assert(errors != _Py_ERROR_UNKNOWN);
3811 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3812         return unicode_encode_utf8(unicode, errors, NULL);
3813 #else
3814         return unicode_encode_locale(unicode, errors, 0);
3815 #endif
3816     }
3817 }
3818 
3819 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820 PyUnicode_AsEncodedString(PyObject *unicode,
3821                           const char *encoding,
3822                           const char *errors)
3823 {
3824     PyObject *v;
3825     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826 
3827     if (!PyUnicode_Check(unicode)) {
3828         PyErr_BadArgument();
3829         return NULL;
3830     }
3831 
3832     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833         return NULL;
3834     }
3835 
3836     if (encoding == NULL) {
3837         return _PyUnicode_AsUTF8String(unicode, errors);
3838     }
3839 
3840     /* Shortcuts for common default encodings */
3841     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842         char *lower = buflower;
3843 
3844         /* Fast paths */
3845         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846             lower += 3;
3847             if (*lower == '_') {
3848                 /* Match "utf8" and "utf_8" */
3849                 lower++;
3850             }
3851 
3852             if (lower[0] == '8' && lower[1] == 0) {
3853                 return _PyUnicode_AsUTF8String(unicode, errors);
3854             }
3855             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857             }
3858             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860             }
3861         }
3862         else {
3863             if (strcmp(lower, "ascii") == 0
3864                 || strcmp(lower, "us_ascii") == 0) {
3865                 return _PyUnicode_AsASCIIString(unicode, errors);
3866             }
3867 #ifdef MS_WINDOWS
3868             else if (strcmp(lower, "mbcs") == 0) {
3869                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870             }
3871 #endif
3872             else if (strcmp(lower, "latin1") == 0 ||
3873                      strcmp(lower, "latin_1") == 0 ||
3874                      strcmp(lower, "iso_8859_1") == 0 ||
3875                      strcmp(lower, "iso8859_1") == 0) {
3876                 return _PyUnicode_AsLatin1String(unicode, errors);
3877             }
3878         }
3879     }
3880 
3881     /* Encode via the codec registry */
3882     v = _PyCodec_EncodeText(unicode, encoding, errors);
3883     if (v == NULL)
3884         return NULL;
3885 
3886     /* The normal path */
3887     if (PyBytes_Check(v))
3888         return v;
3889 
3890     /* If the codec returns a buffer, raise a warning and convert to bytes */
3891     if (PyByteArray_Check(v)) {
3892         int error;
3893         PyObject *b;
3894 
3895         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896             "encoder %s returned bytearray instead of bytes; "
3897             "use codecs.encode() to encode to arbitrary types",
3898             encoding);
3899         if (error) {
3900             Py_DECREF(v);
3901             return NULL;
3902         }
3903 
3904         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905                                       PyByteArray_GET_SIZE(v));
3906         Py_DECREF(v);
3907         return b;
3908     }
3909 
3910     PyErr_Format(PyExc_TypeError,
3911                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912                  "use codecs.encode() to encode to arbitrary types",
3913                  encoding,
3914                  Py_TYPE(v)->tp_name);
3915     Py_DECREF(v);
3916     return NULL;
3917 }
3918 
3919 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921                            const char *encoding,
3922                            const char *errors)
3923 {
3924     PyObject *v;
3925 
3926     if (!PyUnicode_Check(unicode)) {
3927         PyErr_BadArgument();
3928         goto onError;
3929     }
3930 
3931     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3933                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934         return NULL;
3935 
3936     if (encoding == NULL)
3937         encoding = PyUnicode_GetDefaultEncoding();
3938 
3939     /* Encode via the codec registry */
3940     v = PyCodec_Encode(unicode, encoding, errors);
3941     if (v == NULL)
3942         goto onError;
3943     if (!PyUnicode_Check(v)) {
3944         PyErr_Format(PyExc_TypeError,
3945                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946                      "use codecs.encode() to encode to arbitrary types",
3947                      encoding,
3948                      Py_TYPE(v)->tp_name);
3949         Py_DECREF(v);
3950         goto onError;
3951     }
3952     return v;
3953 
3954   onError:
3955     return NULL;
3956 }
3957 
3958 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959 unicode_decode_locale(const char *str, Py_ssize_t len,
3960                       _Py_error_handler errors, int current_locale)
3961 {
3962     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3963         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964         return NULL;
3965     }
3966 
3967     wchar_t *wstr;
3968     size_t wlen;
3969     const char *reason;
3970     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971                                  current_locale, errors);
3972     if (res != 0) {
3973         if (res == -2) {
3974             PyObject *exc;
3975             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976                                         "locale", str, len,
3977                                         (Py_ssize_t)wlen,
3978                                         (Py_ssize_t)(wlen + 1),
3979                                         reason);
3980             if (exc != NULL) {
3981                 PyCodec_StrictErrors(exc);
3982                 Py_DECREF(exc);
3983             }
3984         }
3985         else if (res == -3) {
3986             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987         }
3988         else {
3989             PyErr_NoMemory();
3990         }
3991         return NULL;
3992     }
3993 
3994     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995     PyMem_RawFree(wstr);
3996     return unicode;
3997 }
3998 
3999 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001                               const char *errors)
4002 {
4003     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004     return unicode_decode_locale(str, len, error_handler, 1);
4005 }
4006 
4007 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4008 PyUnicode_DecodeLocale(const char *str, const char *errors)
4009 {
4010     Py_ssize_t size = (Py_ssize_t)strlen(str);
4011     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012     return unicode_decode_locale(str, size, error_handler, 1);
4013 }
4014 
4015 
4016 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4017 PyUnicode_DecodeFSDefault(const char *s) {
4018     Py_ssize_t size = (Py_ssize_t)strlen(s);
4019     return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020 }
4021 
4022 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024 {
4025     PyInterpreterState *interp = _PyInterpreterState_GET();
4026     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027     if (fs_codec->utf8) {
4028         return unicode_decode_utf8(s, size,
4029                                    fs_codec->error_handler,
4030                                    fs_codec->errors,
4031                                    NULL);
4032     }
4033 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4034     else if (fs_codec->encoding) {
4035         return PyUnicode_Decode(s, size,
4036                                 fs_codec->encoding,
4037                                 fs_codec->errors);
4038     }
4039 #endif
4040     else {
4041         /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042            machinery is not ready and so cannot be used:
4043            use mbstowcs() in this case. */
4044         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045         const wchar_t *filesystem_errors = config->filesystem_errors;
4046         assert(filesystem_errors != NULL);
4047         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048         assert(errors != _Py_ERROR_UNKNOWN);
4049 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4050         return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051 #else
4052         return unicode_decode_locale(s, size, errors, 0);
4053 #endif
4054     }
4055 }
4056 
4057 
4058 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4059 PyUnicode_FSConverter(PyObject* arg, void* addr)
4060 {
4061     PyObject *path = NULL;
4062     PyObject *output = NULL;
4063     Py_ssize_t size;
4064     const char *data;
4065     if (arg == NULL) {
4066         Py_DECREF(*(PyObject**)addr);
4067         *(PyObject**)addr = NULL;
4068         return 1;
4069     }
4070     path = PyOS_FSPath(arg);
4071     if (path == NULL) {
4072         return 0;
4073     }
4074     if (PyBytes_Check(path)) {
4075         output = path;
4076     }
4077     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4078         output = PyUnicode_EncodeFSDefault(path);
4079         Py_DECREF(path);
4080         if (!output) {
4081             return 0;
4082         }
4083         assert(PyBytes_Check(output));
4084     }
4085 
4086     size = PyBytes_GET_SIZE(output);
4087     data = PyBytes_AS_STRING(output);
4088     if ((size_t)size != strlen(data)) {
4089         PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090         Py_DECREF(output);
4091         return 0;
4092     }
4093     *(PyObject**)addr = output;
4094     return Py_CLEANUP_SUPPORTED;
4095 }
4096 
4097 
4098 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4099 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100 {
4101     int is_buffer = 0;
4102     PyObject *path = NULL;
4103     PyObject *output = NULL;
4104     if (arg == NULL) {
4105         Py_DECREF(*(PyObject**)addr);
4106         *(PyObject**)addr = NULL;
4107         return 1;
4108     }
4109 
4110     is_buffer = PyObject_CheckBuffer(arg);
4111     if (!is_buffer) {
4112         path = PyOS_FSPath(arg);
4113         if (path == NULL) {
4114             return 0;
4115         }
4116     }
4117     else {
4118         path = arg;
4119         Py_INCREF(arg);
4120     }
4121 
4122     if (PyUnicode_Check(path)) {
4123         output = path;
4124     }
4125     else if (PyBytes_Check(path) || is_buffer) {
4126         PyObject *path_bytes = NULL;
4127 
4128         if (!PyBytes_Check(path) &&
4129             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130             "path should be string, bytes, or os.PathLike, not %.200s",
4131             Py_TYPE(arg)->tp_name)) {
4132                 Py_DECREF(path);
4133             return 0;
4134         }
4135         path_bytes = PyBytes_FromObject(path);
4136         Py_DECREF(path);
4137         if (!path_bytes) {
4138             return 0;
4139         }
4140         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141                                                   PyBytes_GET_SIZE(path_bytes));
4142         Py_DECREF(path_bytes);
4143         if (!output) {
4144             return 0;
4145         }
4146     }
4147     else {
4148         PyErr_Format(PyExc_TypeError,
4149                      "path should be string, bytes, or os.PathLike, not %.200s",
4150                      Py_TYPE(arg)->tp_name);
4151         Py_DECREF(path);
4152         return 0;
4153     }
4154     if (PyUnicode_READY(output) == -1) {
4155         Py_DECREF(output);
4156         return 0;
4157     }
4158     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160         PyErr_SetString(PyExc_ValueError, "embedded null character");
4161         Py_DECREF(output);
4162         return 0;
4163     }
4164     *(PyObject**)addr = output;
4165     return Py_CLEANUP_SUPPORTED;
4166 }
4167 
4168 
4169 static int unicode_fill_utf8(PyObject *unicode);
4170 
4171 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173 {
4174     if (!PyUnicode_Check(unicode)) {
4175         PyErr_BadArgument();
4176         return NULL;
4177     }
4178     if (PyUnicode_READY(unicode) == -1)
4179         return NULL;
4180 
4181     if (PyUnicode_UTF8(unicode) == NULL) {
4182         if (unicode_fill_utf8(unicode) == -1) {
4183             return NULL;
4184         }
4185     }
4186 
4187     if (psize)
4188         *psize = PyUnicode_UTF8_LENGTH(unicode);
4189     return PyUnicode_UTF8(unicode);
4190 }
4191 
4192 const char *
PyUnicode_AsUTF8(PyObject * unicode)4193 PyUnicode_AsUTF8(PyObject *unicode)
4194 {
4195     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196 }
4197 
4198 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200 {
4201     if (!PyUnicode_Check(unicode)) {
4202         PyErr_BadArgument();
4203         return NULL;
4204     }
4205     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206     if (w == NULL) {
4207         /* Non-ASCII compact unicode object */
4208         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209         assert(PyUnicode_IS_READY(unicode));
4210 
4211         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213             PyErr_NoMemory();
4214             return NULL;
4215         }
4216         w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217         if (w == NULL) {
4218             PyErr_NoMemory();
4219             return NULL;
4220         }
4221         unicode_copy_as_widechar(unicode, w, wlen + 1);
4222         _PyUnicode_WSTR(unicode) = w;
4223         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225         }
4226     }
4227     if (size != NULL)
4228         *size = PyUnicode_WSTR_LENGTH(unicode);
4229     return w;
4230 }
4231 
4232 /* Deprecated APIs */
4233 
4234 _Py_COMP_DIAG_PUSH
4235 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236 
4237 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4238 PyUnicode_AsUnicode(PyObject *unicode)
4239 {
4240     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241 }
4242 
4243 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4244 _PyUnicode_AsUnicode(PyObject *unicode)
4245 {
4246     Py_ssize_t size;
4247     const Py_UNICODE *wstr;
4248 
4249     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250     if (wstr && wcslen(wstr) != (size_t)size) {
4251         PyErr_SetString(PyExc_ValueError, "embedded null character");
4252         return NULL;
4253     }
4254     return wstr;
4255 }
4256 
4257 
4258 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4259 PyUnicode_GetSize(PyObject *unicode)
4260 {
4261     if (!PyUnicode_Check(unicode)) {
4262         PyErr_BadArgument();
4263         goto onError;
4264     }
4265     if (_PyUnicode_WSTR(unicode) == NULL) {
4266         if (PyUnicode_AsUnicode(unicode) == NULL)
4267             goto onError;
4268     }
4269     return PyUnicode_WSTR_LENGTH(unicode);
4270 
4271   onError:
4272     return -1;
4273 }
4274 
4275 _Py_COMP_DIAG_POP
4276 
4277 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4278 PyUnicode_GetLength(PyObject *unicode)
4279 {
4280     if (!PyUnicode_Check(unicode)) {
4281         PyErr_BadArgument();
4282         return -1;
4283     }
4284     if (PyUnicode_READY(unicode) == -1)
4285         return -1;
4286     return PyUnicode_GET_LENGTH(unicode);
4287 }
4288 
4289 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291 {
4292     const void *data;
4293     int kind;
4294 
4295     if (!PyUnicode_Check(unicode)) {
4296         PyErr_BadArgument();
4297         return (Py_UCS4)-1;
4298     }
4299     if (PyUnicode_READY(unicode) == -1) {
4300         return (Py_UCS4)-1;
4301     }
4302     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303         PyErr_SetString(PyExc_IndexError, "string index out of range");
4304         return (Py_UCS4)-1;
4305     }
4306     data = PyUnicode_DATA(unicode);
4307     kind = PyUnicode_KIND(unicode);
4308     return PyUnicode_READ(kind, data, index);
4309 }
4310 
4311 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313 {
4314     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315         PyErr_BadArgument();
4316         return -1;
4317     }
4318     assert(PyUnicode_IS_READY(unicode));
4319     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320         PyErr_SetString(PyExc_IndexError, "string index out of range");
4321         return -1;
4322     }
4323     if (unicode_check_modifiable(unicode))
4324         return -1;
4325     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326         PyErr_SetString(PyExc_ValueError, "character out of range");
4327         return -1;
4328     }
4329     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330                     index, ch);
4331     return 0;
4332 }
4333 
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";
    return default_encoding;
}
4339 
4340 /* create or adjust a UnicodeDecodeError */
4341 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342 make_decode_exception(PyObject **exceptionObject,
4343                       const char *encoding,
4344                       const char *input, Py_ssize_t length,
4345                       Py_ssize_t startpos, Py_ssize_t endpos,
4346                       const char *reason)
4347 {
4348     if (*exceptionObject == NULL) {
4349         *exceptionObject = PyUnicodeDecodeError_Create(
4350             encoding, input, length, startpos, endpos, reason);
4351     }
4352     else {
4353         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354             goto onError;
4355         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356             goto onError;
4357         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358             goto onError;
4359     }
4360     return;
4361 
4362 onError:
4363     Py_CLEAR(*exceptionObject);
4364 }
4365 
4366 #ifdef MS_WINDOWS
4367 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369 {
4370     if (newsize > *size) {
4371         wchar_t *newbuf = *buf;
4372         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373             PyErr_NoMemory();
4374             return -1;
4375         }
4376         *buf = newbuf;
4377     }
4378     *size = newsize;
4379     return 0;
4380 }
4381 
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   State updated on success:
   - *errorHandler is looked up from `errors` on first use and kept;
   - *exceptionObject is created or updated for this error;
   - *input / *inend are re-pointed at the bytes object held by the
     exception;
   - *endinpos and *inptr are moved to the position returned by the
     handler (negative positions count from the end of the input);
   - the replacement is written at *buf + *outpos and *outpos is advanced
     past it; *buf / *bufsize are grown if needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple format; the text after "Un;" doubles as the error
       message (see &argparse[3] below). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up once and cache it in *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* make_decode_exception() leaves *exceptionObject NULL on failure. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to reuse the message part of argparse. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Determine the replacement length in wchar_t units. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* Size query only (NULL buffer); per the C API documentation the
       reported size includes the trailing null, hence the decrement. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    /* Each addition is guarded against exceeding PY_SSIZE_T_MAX. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least double the current size when doubling cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the shared cleanup below */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4500 #endif   /* MS_WINDOWS */
4501 
4502 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4503 unicode_decode_call_errorhandler_writer(
4504     const char *errors, PyObject **errorHandler,
4505     const char *encoding, const char *reason,
4506     const char **input, const char **inend, Py_ssize_t *startinpos,
4507     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4508     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4509 {
4510     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4511 
4512     PyObject *restuple = NULL;
4513     PyObject *repunicode = NULL;
4514     Py_ssize_t insize;
4515     Py_ssize_t newpos;
4516     Py_ssize_t replen;
4517     Py_ssize_t remain;
4518     PyObject *inputobj = NULL;
4519     int need_to_grow = 0;
4520     const char *new_inptr;
4521 
4522     if (*errorHandler == NULL) {
4523         *errorHandler = PyCodec_LookupError(errors);
4524         if (*errorHandler == NULL)
4525             goto onError;
4526     }
4527 
4528     make_decode_exception(exceptionObject,
4529         encoding,
4530         *input, *inend - *input,
4531         *startinpos, *endinpos,
4532         reason);
4533     if (*exceptionObject == NULL)
4534         goto onError;
4535 
4536     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4537     if (restuple == NULL)
4538         goto onError;
4539     if (!PyTuple_Check(restuple)) {
4540         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4541         goto onError;
4542     }
4543     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4544         goto onError;
4545 
4546     /* Copy back the bytes variables, which might have been modified by the
4547        callback */
4548     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4549     if (!inputobj)
4550         goto onError;
4551     remain = *inend - *input - *endinpos;
4552     *input = PyBytes_AS_STRING(inputobj);
4553     insize = PyBytes_GET_SIZE(inputobj);
4554     *inend = *input + insize;
4555     /* we can DECREF safely, as the exception has another reference,
4556        so the object won't go away. */
4557     Py_DECREF(inputobj);
4558 
4559     if (newpos<0)
4560         newpos = insize+newpos;
4561     if (newpos<0 || newpos>insize) {
4562         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4563         goto onError;
4564     }
4565 
4566     replen = PyUnicode_GET_LENGTH(repunicode);
4567     if (replen > 1) {
4568         writer->min_length += replen - 1;
4569         need_to_grow = 1;
4570     }
4571     new_inptr = *input + newpos;
4572     if (*inend - new_inptr > remain) {
4573         /* We don't know the decoding algorithm here so we make the worst
4574            assumption that one byte decodes to one unicode character.
4575            If unfortunately one byte could decode to more unicode characters,
4576            the decoder may write out-of-bound then.  Is it possible for the
4577            algorithms using this function? */
4578         writer->min_length += *inend - new_inptr - remain;
4579         need_to_grow = 1;
4580     }
4581     if (need_to_grow) {
4582         writer->overallocate = 1;
4583         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4584                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4585             goto onError;
4586     }
4587     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4588         goto onError;
4589 
4590     *endinpos = newpos;
4591     *inptr = new_inptr;
4592 
4593     /* we made it! */
4594     Py_DECREF(restuple);
4595     return 0;
4596 
4597   onError:
4598     Py_XDECREF(restuple);
4599     return -1;
4600 }
4601 
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?  (Evaluates c several times.) */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
   Any character that is not a letter, digit or '+' maps to 63, so callers
   must check IS_BASE64(c) first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup and is therefore declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so utf7_category is never indexed out of
 * bounds; NUL (0) is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4682 
4683 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684 PyUnicode_DecodeUTF7(const char *s,
4685                      Py_ssize_t size,
4686                      const char *errors)
4687 {
4688     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689 }
4690 
4691 /* The decoder.  The only state we preserve is our read position,
4692  * i.e. how many characters we have consumed.  So if we end in the
4693  * middle of a shift sequence we have to back off the read position
4694  * and the output to the beginning of the sequence, otherwise we lose
4695  * all the shift state (seen bits, number of bits seen, high
4696  * surrogate). */
4697 
4698 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4699 PyUnicode_DecodeUTF7Stateful(const char *s,
4700                              Py_ssize_t size,
4701                              const char *errors,
4702                              Py_ssize_t *consumed)
4703 {
4704     const char *starts = s;
4705     Py_ssize_t startinpos;
4706     Py_ssize_t endinpos;
4707     const char *e;
4708     _PyUnicodeWriter writer;
4709     const char *errmsg = "";
4710     int inShift = 0;
4711     Py_ssize_t shiftOutStart;
4712     unsigned int base64bits = 0;
4713     unsigned long base64buffer = 0;
4714     Py_UCS4 surrogate = 0;
4715     PyObject *errorHandler = NULL;
4716     PyObject *exc = NULL;
4717 
4718     if (size == 0) {
4719         if (consumed)
4720             *consumed = 0;
4721         _Py_RETURN_UNICODE_EMPTY();
4722     }
4723 
4724     /* Start off assuming it's all ASCII. Widen later as necessary. */
4725     _PyUnicodeWriter_Init(&writer);
4726     writer.min_length = size;
4727 
4728     shiftOutStart = 0;
4729     e = s + size;
4730 
4731     while (s < e) {
4732         Py_UCS4 ch;
4733       restart:
4734         ch = (unsigned char) *s;
4735 
4736         if (inShift) { /* in a base-64 section */
4737             if (IS_BASE64(ch)) { /* consume a base-64 character */
4738                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4739                 base64bits += 6;
4740                 s++;
4741                 if (base64bits >= 16) {
4742                     /* we have enough bits for a UTF-16 value */
4743                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4744                     base64bits -= 16;
4745                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4746                     assert(outCh <= 0xffff);
4747                     if (surrogate) {
4748                         /* expecting a second surrogate */
4749                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4750                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4751                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4752                                 goto onError;
4753                             surrogate = 0;
4754                             continue;
4755                         }
4756                         else {
4757                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4758                                 goto onError;
4759                             surrogate = 0;
4760                         }
4761                     }
4762                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4763                         /* first surrogate */
4764                         surrogate = outCh;
4765                     }
4766                     else {
4767                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4768                             goto onError;
4769                     }
4770                 }
4771             }
4772             else { /* now leaving a base-64 section */
4773                 inShift = 0;
4774                 if (base64bits > 0) { /* left-over bits */
4775                     if (base64bits >= 6) {
4776                         /* We've seen at least one base-64 character */
4777                         s++;
4778                         errmsg = "partial character in shift sequence";
4779                         goto utf7Error;
4780                     }
4781                     else {
4782                         /* Some bits remain; they should be zero */
4783                         if (base64buffer != 0) {
4784                             s++;
4785                             errmsg = "non-zero padding bits in shift sequence";
4786                             goto utf7Error;
4787                         }
4788                     }
4789                 }
4790                 if (surrogate && DECODE_DIRECT(ch)) {
4791                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4792                         goto onError;
4793                 }
4794                 surrogate = 0;
4795                 if (ch == '-') {
4796                     /* '-' is absorbed; other terminating
4797                        characters are preserved */
4798                     s++;
4799                 }
4800             }
4801         }
4802         else if ( ch == '+' ) {
4803             startinpos = s-starts;
4804             s++; /* consume '+' */
4805             if (s < e && *s == '-') { /* '+-' encodes '+' */
4806                 s++;
4807                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4808                     goto onError;
4809             }
4810             else if (s < e && !IS_BASE64(*s)) {
4811                 s++;
4812                 errmsg = "ill-formed sequence";
4813                 goto utf7Error;
4814             }
4815             else { /* begin base64-encoded section */
4816                 inShift = 1;
4817                 surrogate = 0;
4818                 shiftOutStart = writer.pos;
4819                 base64bits = 0;
4820                 base64buffer = 0;
4821             }
4822         }
4823         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4824             s++;
4825             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4826                 goto onError;
4827         }
4828         else {
4829             startinpos = s-starts;
4830             s++;
4831             errmsg = "unexpected special character";
4832             goto utf7Error;
4833         }
4834         continue;
4835 utf7Error:
4836         endinpos = s-starts;
4837         if (unicode_decode_call_errorhandler_writer(
4838                 errors, &errorHandler,
4839                 "utf7", errmsg,
4840                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4841                 &writer))
4842             goto onError;
4843     }
4844 
4845     /* end of string */
4846 
4847     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4848         /* if we're in an inconsistent state, that's an error */
4849         inShift = 0;
4850         if (surrogate ||
4851                 (base64bits >= 6) ||
4852                 (base64bits > 0 && base64buffer != 0)) {
4853             endinpos = size;
4854             if (unicode_decode_call_errorhandler_writer(
4855                     errors, &errorHandler,
4856                     "utf7", "unterminated shift sequence",
4857                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4858                     &writer))
4859                 goto onError;
4860             if (s < e)
4861                 goto restart;
4862         }
4863     }
4864 
4865     /* return state */
4866     if (consumed) {
4867         if (inShift) {
4868             *consumed = startinpos;
4869             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4870                 PyObject *result = PyUnicode_FromKindAndData(
4871                         writer.kind, writer.data, shiftOutStart);
4872                 Py_XDECREF(errorHandler);
4873                 Py_XDECREF(exc);
4874                 _PyUnicodeWriter_Dealloc(&writer);
4875                 return result;
4876             }
4877             writer.pos = shiftOutStart; /* back off output */
4878         }
4879         else {
4880             *consumed = s-starts;
4881         }
4882     }
4883 
4884     Py_XDECREF(errorHandler);
4885     Py_XDECREF(exc);
4886     return _PyUnicodeWriter_Finish(&writer);
4887 
4888   onError:
4889     Py_XDECREF(errorHandler);
4890     Py_XDECREF(exc);
4891     _PyUnicodeWriter_Dealloc(&writer);
4892     return NULL;
4893 }
4894 
4895 
4896 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4897 _PyUnicode_EncodeUTF7(PyObject *str,
4898                       int base64SetO,
4899                       int base64WhiteSpace,
4900                       const char *errors)
4901 {
4902     int kind;
4903     const void *data;
4904     Py_ssize_t len;
4905     PyObject *v;
4906     int inShift = 0;
4907     Py_ssize_t i;
4908     unsigned int base64bits = 0;
4909     unsigned long base64buffer = 0;
4910     char * out;
4911     const char * start;
4912 
4913     if (PyUnicode_READY(str) == -1)
4914         return NULL;
4915     kind = PyUnicode_KIND(str);
4916     data = PyUnicode_DATA(str);
4917     len = PyUnicode_GET_LENGTH(str);
4918 
4919     if (len == 0)
4920         return PyBytes_FromStringAndSize(NULL, 0);
4921 
4922     /* It might be possible to tighten this worst case */
4923     if (len > PY_SSIZE_T_MAX / 8)
4924         return PyErr_NoMemory();
4925     v = PyBytes_FromStringAndSize(NULL, len * 8);
4926     if (v == NULL)
4927         return NULL;
4928 
4929     start = out = PyBytes_AS_STRING(v);
4930     for (i = 0; i < len; ++i) {
4931         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4932 
4933         if (inShift) {
4934             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4935                 /* shifting out */
4936                 if (base64bits) { /* output remaining bits */
4937                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4938                     base64buffer = 0;
4939                     base64bits = 0;
4940                 }
4941                 inShift = 0;
4942                 /* Characters not in the BASE64 set implicitly unshift the sequence
4943                    so no '-' is required, except if the character is itself a '-' */
4944                 if (IS_BASE64(ch) || ch == '-') {
4945                     *out++ = '-';
4946                 }
4947                 *out++ = (char) ch;
4948             }
4949             else {
4950                 goto encode_char;
4951             }
4952         }
4953         else { /* not in a shift sequence */
4954             if (ch == '+') {
4955                 *out++ = '+';
4956                         *out++ = '-';
4957             }
4958             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4959                 *out++ = (char) ch;
4960             }
4961             else {
4962                 *out++ = '+';
4963                 inShift = 1;
4964                 goto encode_char;
4965             }
4966         }
4967         continue;
4968 encode_char:
4969         if (ch >= 0x10000) {
4970             assert(ch <= MAX_UNICODE);
4971 
4972             /* code first surrogate */
4973             base64bits += 16;
4974             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4975             while (base64bits >= 6) {
4976                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4977                 base64bits -= 6;
4978             }
4979             /* prepare second surrogate */
4980             ch = Py_UNICODE_LOW_SURROGATE(ch);
4981         }
4982         base64bits += 16;
4983         base64buffer = (base64buffer << 16) | ch;
4984         while (base64bits >= 6) {
4985             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4986             base64bits -= 6;
4987         }
4988     }
4989     if (base64bits)
4990         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4991     if (inShift)
4992         *out++ = '-';
4993     if (_PyBytes_Resize(&v, out - start) < 0)
4994         return NULL;
4995     return v;
4996 }
4997 
4998 #undef IS_BASE64
4999 #undef FROM_BASE64
5000 #undef TO_BASE64
5001 #undef DECODE_DIRECT
5002 #undef ENCODE_DIRECT
5003 
5004 /* --- UTF-8 Codec -------------------------------------------------------- */
5005 
5006 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007 PyUnicode_DecodeUTF8(const char *s,
5008                      Py_ssize_t size,
5009                      const char *errors)
5010 {
5011     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012 }
5013 
5014 #include "stringlib/asciilib.h"
5015 #include "stringlib/codecs.h"
5016 #include "stringlib/undef.h"
5017 
5018 #include "stringlib/ucs1lib.h"
5019 #include "stringlib/codecs.h"
5020 #include "stringlib/undef.h"
5021 
5022 #include "stringlib/ucs2lib.h"
5023 #include "stringlib/codecs.h"
5024 #include "stringlib/undef.h"
5025 
5026 #include "stringlib/ucs4lib.h"
5027 #include "stringlib/codecs.h"
5028 #include "stringlib/undef.h"
5029 
5030 /* Mask to quickly check whether a C 'size_t' contains a
5031    non-ASCII, UTF8-encoded char. */
5032 #if (SIZEOF_SIZE_T == 8)
5033 # define ASCII_CHAR_MASK 0x8080808080808080ULL
5034 #elif (SIZEOF_SIZE_T == 4)
5035 # define ASCII_CHAR_MASK 0x80808080U
5036 #else
5037 # error C 'size_t' size should be either 4 or 8!
5038 #endif
5039 
5040 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)5041 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5042 {
5043     const char *p = start;
5044 
5045 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5046     assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5047     if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5048         /* Fast path, see in STRINGLIB(utf8_decode) for
5049            an explanation. */
5050         /* Help allocation */
5051         const char *_p = p;
5052         Py_UCS1 * q = dest;
5053         while (_p + SIZEOF_SIZE_T <= end) {
5054             size_t value = *(const size_t *) _p;
5055             if (value & ASCII_CHAR_MASK)
5056                 break;
5057             *((size_t *)q) = value;
5058             _p += SIZEOF_SIZE_T;
5059             q += SIZEOF_SIZE_T;
5060         }
5061         p = _p;
5062         while (p < end) {
5063             if ((unsigned char)*p & 0x80)
5064                 break;
5065             *q++ = *p++;
5066         }
5067         return p - start;
5068     }
5069 #endif
5070     while (p < end) {
5071         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5072            for an explanation. */
5073         if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5074             /* Help allocation */
5075             const char *_p = p;
5076             while (_p + SIZEOF_SIZE_T <= end) {
5077                 size_t value = *(const size_t *) _p;
5078                 if (value & ASCII_CHAR_MASK)
5079                     break;
5080                 _p += SIZEOF_SIZE_T;
5081             }
5082             p = _p;
5083             if (_p == end)
5084                 break;
5085         }
5086         if ((unsigned char)*p & 0x80)
5087             break;
5088         ++p;
5089     }
5090     memcpy(dest, start, p - start);
5091     return p - start;
5092 }
5093 
5094 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095 unicode_decode_utf8(const char *s, Py_ssize_t size,
5096                     _Py_error_handler error_handler, const char *errors,
5097                     Py_ssize_t *consumed)
5098 {
5099     if (size == 0) {
5100         if (consumed)
5101             *consumed = 0;
5102         _Py_RETURN_UNICODE_EMPTY();
5103     }
5104 
5105     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106     if (size == 1 && (unsigned char)s[0] < 128) {
5107         if (consumed) {
5108             *consumed = 1;
5109         }
5110         return get_latin1_char((unsigned char)s[0]);
5111     }
5112 
5113     const char *starts = s;
5114     const char *end = s + size;
5115 
5116     // fast path: try ASCII string.
5117     PyObject *u = PyUnicode_New(size, 127);
5118     if (u == NULL) {
5119         return NULL;
5120     }
5121     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122     if (s == end) {
5123         return u;
5124     }
5125 
5126     // Use _PyUnicodeWriter after fast path is failed.
5127     _PyUnicodeWriter writer;
5128     _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129     writer.pos = s - starts;
5130 
5131     Py_ssize_t startinpos, endinpos;
5132     const char *errmsg = "";
5133     PyObject *error_handler_obj = NULL;
5134     PyObject *exc = NULL;
5135 
5136     while (s < end) {
5137         Py_UCS4 ch;
5138         int kind = writer.kind;
5139 
5140         if (kind == PyUnicode_1BYTE_KIND) {
5141             if (PyUnicode_IS_ASCII(writer.buffer))
5142                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143             else
5144                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145         } else if (kind == PyUnicode_2BYTE_KIND) {
5146             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147         } else {
5148             assert(kind == PyUnicode_4BYTE_KIND);
5149             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150         }
5151 
5152         switch (ch) {
5153         case 0:
5154             if (s == end || consumed)
5155                 goto End;
5156             errmsg = "unexpected end of data";
5157             startinpos = s - starts;
5158             endinpos = end - starts;
5159             break;
5160         case 1:
5161             errmsg = "invalid start byte";
5162             startinpos = s - starts;
5163             endinpos = startinpos + 1;
5164             break;
5165         case 2:
5166             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168             {
5169                 /* Truncated surrogate code in range D800-DFFF */
5170                 goto End;
5171             }
5172             /* fall through */
5173         case 3:
5174         case 4:
5175             errmsg = "invalid continuation byte";
5176             startinpos = s - starts;
5177             endinpos = startinpos + ch - 1;
5178             break;
5179         default:
5180             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181                 goto onError;
5182             continue;
5183         }
5184 
5185         if (error_handler == _Py_ERROR_UNKNOWN)
5186             error_handler = _Py_GetErrorHandler(errors);
5187 
5188         switch (error_handler) {
5189         case _Py_ERROR_IGNORE:
5190             s += (endinpos - startinpos);
5191             break;
5192 
5193         case _Py_ERROR_REPLACE:
5194             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195                 goto onError;
5196             s += (endinpos - startinpos);
5197             break;
5198 
5199         case _Py_ERROR_SURROGATEESCAPE:
5200         {
5201             Py_ssize_t i;
5202 
5203             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204                 goto onError;
5205             for (i=startinpos; i<endinpos; i++) {
5206                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5207                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208                                 ch + 0xdc00);
5209                 writer.pos++;
5210             }
5211             s += (endinpos - startinpos);
5212             break;
5213         }
5214 
5215         default:
5216             if (unicode_decode_call_errorhandler_writer(
5217                     errors, &error_handler_obj,
5218                     "utf-8", errmsg,
5219                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5220                     &writer))
5221                 goto onError;
5222         }
5223     }
5224 
5225 End:
5226     if (consumed)
5227         *consumed = s - starts;
5228 
5229     Py_XDECREF(error_handler_obj);
5230     Py_XDECREF(exc);
5231     return _PyUnicodeWriter_Finish(&writer);
5232 
5233 onError:
5234     Py_XDECREF(error_handler_obj);
5235     Py_XDECREF(exc);
5236     _PyUnicodeWriter_Dealloc(&writer);
5237     return NULL;
5238 }
5239 
5240 
5241 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5242 PyUnicode_DecodeUTF8Stateful(const char *s,
5243                              Py_ssize_t size,
5244                              const char *errors,
5245                              Py_ssize_t *consumed)
5246 {
5247     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5248 }
5249 
5250 
5251 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5252    non-zero, use strict error handler otherwise.
5253 
5254    On success, write a pointer to a newly allocated wide character string into
5255    *wstr (use PyMem_RawFree() to free the memory) and write the output length
5256    (in number of wchar_t units) into *wlen (if wlen is set).
5257 
5258    On memory allocation failure, return -1.
5259 
5260    On decoding error (if surrogateescape is zero), return -2. If wlen is
5261    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5262    is not NULL, write the decoding error message into *reason. */
5263 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5264 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5265                  const char **reason, _Py_error_handler errors)
5266 {
5267     const char *orig_s = s;
5268     const char *e;
5269     wchar_t *unicode;
5270     Py_ssize_t outpos;
5271 
5272     int surrogateescape = 0;
5273     int surrogatepass = 0;
5274     switch (errors)
5275     {
5276     case _Py_ERROR_STRICT:
5277         break;
5278     case _Py_ERROR_SURROGATEESCAPE:
5279         surrogateescape = 1;
5280         break;
5281     case _Py_ERROR_SURROGATEPASS:
5282         surrogatepass = 1;
5283         break;
5284     default:
5285         return -3;
5286     }
5287 
5288     /* Note: size will always be longer than the resulting Unicode
5289        character count */
5290     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5291         return -1;
5292     }
5293 
5294     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295     if (!unicode) {
5296         return -1;
5297     }
5298 
5299     /* Unpack UTF-8 encoded data */
5300     e = s + size;
5301     outpos = 0;
5302     while (s < e) {
5303         Py_UCS4 ch;
5304 #if SIZEOF_WCHAR_T == 4
5305         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5306 #else
5307         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5308 #endif
5309         if (ch > 0xFF) {
5310 #if SIZEOF_WCHAR_T == 4
5311             Py_UNREACHABLE();
5312 #else
5313             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5314             /* write a surrogate pair */
5315             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5317 #endif
5318         }
5319         else {
5320             if (!ch && s == e) {
5321                 break;
5322             }
5323 
5324             if (surrogateescape) {
5325                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5326             }
5327             else {
5328                 /* Is it a valid three-byte code? */
5329                 if (surrogatepass
5330                     && (e - s) >= 3
5331                     && (s[0] & 0xf0) == 0xe0
5332                     && (s[1] & 0xc0) == 0x80
5333                     && (s[2] & 0xc0) == 0x80)
5334                 {
5335                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5336                     s += 3;
5337                     unicode[outpos++] = ch;
5338                 }
5339                 else {
5340                     PyMem_RawFree(unicode );
5341                     if (reason != NULL) {
5342                         switch (ch) {
5343                         case 0:
5344                             *reason = "unexpected end of data";
5345                             break;
5346                         case 1:
5347                             *reason = "invalid start byte";
5348                             break;
5349                         /* 2, 3, 4 */
5350                         default:
5351                             *reason = "invalid continuation byte";
5352                             break;
5353                         }
5354                     }
5355                     if (wlen != NULL) {
5356                         *wlen = s - orig_s;
5357                     }
5358                     return -2;
5359                 }
5360             }
5361         }
5362     }
5363     unicode[outpos] = L'\0';
5364     if (wlen) {
5365         *wlen = outpos;
5366     }
5367     *wstr = unicode;
5368     return 0;
5369 }
5370 
5371 
5372 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374                                size_t *wlen)
5375 {
5376     wchar_t *wstr;
5377     int res = _Py_DecodeUTF8Ex(arg, arglen,
5378                                &wstr, wlen,
5379                                NULL, _Py_ERROR_SURROGATEESCAPE);
5380     if (res != 0) {
5381         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382         assert(res != -3);
5383         if (wlen) {
5384             *wlen = (size_t)res;
5385         }
5386         return NULL;
5387     }
5388     return wstr;
5389 }
5390 
5391 
5392 /* UTF-8 encoder using the surrogateescape error handler .
5393 
5394    On success, return 0 and write the newly allocated character string (use
5395    PyMem_Free() to free the memory) into *str.
5396 
5397    On encoding failure, return -2 and write the position of the invalid
5398    surrogate character into *error_pos (if error_pos is set) and the decoding
5399    error message into *reason (if reason is set).
5400 
5401    On memory allocation failure, return -1. */
5402 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404                  const char **reason, int raw_malloc, _Py_error_handler errors)
5405 {
5406     const Py_ssize_t max_char_size = 4;
5407     Py_ssize_t len = wcslen(text);
5408 
5409     assert(len >= 0);
5410 
5411     int surrogateescape = 0;
5412     int surrogatepass = 0;
5413     switch (errors)
5414     {
5415     case _Py_ERROR_STRICT:
5416         break;
5417     case _Py_ERROR_SURROGATEESCAPE:
5418         surrogateescape = 1;
5419         break;
5420     case _Py_ERROR_SURROGATEPASS:
5421         surrogatepass = 1;
5422         break;
5423     default:
5424         return -3;
5425     }
5426 
5427     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428         return -1;
5429     }
5430     char *bytes;
5431     if (raw_malloc) {
5432         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433     }
5434     else {
5435         bytes = PyMem_Malloc((len + 1) * max_char_size);
5436     }
5437     if (bytes == NULL) {
5438         return -1;
5439     }
5440 
5441     char *p = bytes;
5442     Py_ssize_t i;
5443     for (i = 0; i < len; ) {
5444         Py_ssize_t ch_pos = i;
5445         Py_UCS4 ch = text[i];
5446         i++;
5447 #if Py_UNICODE_SIZE == 2
5448         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449             && i < len
5450             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451         {
5452             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453             i++;
5454         }
5455 #endif
5456 
5457         if (ch < 0x80) {
5458             /* Encode ASCII */
5459             *p++ = (char) ch;
5460 
5461         }
5462         else if (ch < 0x0800) {
5463             /* Encode Latin-1 */
5464             *p++ = (char)(0xc0 | (ch >> 6));
5465             *p++ = (char)(0x80 | (ch & 0x3f));
5466         }
5467         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468             /* surrogateescape error handler */
5469             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470                 if (error_pos != NULL) {
5471                     *error_pos = (size_t)ch_pos;
5472                 }
5473                 if (reason != NULL) {
5474                     *reason = "encoding error";
5475                 }
5476                 if (raw_malloc) {
5477                     PyMem_RawFree(bytes);
5478                 }
5479                 else {
5480                     PyMem_Free(bytes);
5481                 }
5482                 return -2;
5483             }
5484             *p++ = (char)(ch & 0xff);
5485         }
5486         else if (ch < 0x10000) {
5487             *p++ = (char)(0xe0 | (ch >> 12));
5488             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489             *p++ = (char)(0x80 | (ch & 0x3f));
5490         }
5491         else {  /* ch >= 0x10000 */
5492             assert(ch <= MAX_UNICODE);
5493             /* Encode UCS4 Unicode ordinals */
5494             *p++ = (char)(0xf0 | (ch >> 18));
5495             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497             *p++ = (char)(0x80 | (ch & 0x3f));
5498         }
5499     }
5500     *p++ = '\0';
5501 
5502     size_t final_size = (p - bytes);
5503     char *bytes2;
5504     if (raw_malloc) {
5505         bytes2 = PyMem_RawRealloc(bytes, final_size);
5506     }
5507     else {
5508         bytes2 = PyMem_Realloc(bytes, final_size);
5509     }
5510     if (bytes2 == NULL) {
5511         if (error_pos != NULL) {
5512             *error_pos = (size_t)-1;
5513         }
5514         if (raw_malloc) {
5515             PyMem_RawFree(bytes);
5516         }
5517         else {
5518             PyMem_Free(bytes);
5519         }
5520         return -1;
5521     }
5522     *str = bytes2;
5523     return 0;
5524 }
5525 
5526 
5527 /* Primary internal function which creates utf8 encoded bytes objects.
5528 
5529    Allocation strategy:  if the string is short, convert into a stack buffer
5530    and allocate exactly as much space needed at the end.  Else allocate the
5531    maximum possible needed (4 result bytes per Unicode character), and return
5532    the excess memory at the end.
5533 */
5534 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536                     const char *errors)
5537 {
5538     if (!PyUnicode_Check(unicode)) {
5539         PyErr_BadArgument();
5540         return NULL;
5541     }
5542 
5543     if (PyUnicode_READY(unicode) == -1)
5544         return NULL;
5545 
5546     if (PyUnicode_UTF8(unicode))
5547         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548                                          PyUnicode_UTF8_LENGTH(unicode));
5549 
5550     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551     const void *data = PyUnicode_DATA(unicode);
5552     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553 
5554     _PyBytesWriter writer;
5555     char *end;
5556 
5557     switch (kind) {
5558     default:
5559         Py_UNREACHABLE();
5560     case PyUnicode_1BYTE_KIND:
5561         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562         assert(!PyUnicode_IS_ASCII(unicode));
5563         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564         break;
5565     case PyUnicode_2BYTE_KIND:
5566         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567         break;
5568     case PyUnicode_4BYTE_KIND:
5569         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570         break;
5571     }
5572 
5573     if (end == NULL) {
5574         _PyBytesWriter_Dealloc(&writer);
5575         return NULL;
5576     }
5577     return _PyBytesWriter_Finish(&writer, end);
5578 }
5579 
5580 static int
unicode_fill_utf8(PyObject * unicode)5581 unicode_fill_utf8(PyObject *unicode)
5582 {
5583     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584     assert(!PyUnicode_IS_ASCII(unicode));
5585 
5586     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587     const void *data = PyUnicode_DATA(unicode);
5588     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589 
5590     _PyBytesWriter writer;
5591     char *end;
5592 
5593     switch (kind) {
5594     default:
5595         Py_UNREACHABLE();
5596     case PyUnicode_1BYTE_KIND:
5597         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598                                    _Py_ERROR_STRICT, NULL);
5599         break;
5600     case PyUnicode_2BYTE_KIND:
5601         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602                                    _Py_ERROR_STRICT, NULL);
5603         break;
5604     case PyUnicode_4BYTE_KIND:
5605         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606                                    _Py_ERROR_STRICT, NULL);
5607         break;
5608     }
5609     if (end == NULL) {
5610         _PyBytesWriter_Dealloc(&writer);
5611         return -1;
5612     }
5613 
5614     const char *start = writer.use_small_buffer ? writer.small_buffer :
5615                     PyBytes_AS_STRING(writer.buffer);
5616     Py_ssize_t len = end - start;
5617 
5618     char *cache = PyObject_Malloc(len + 1);
5619     if (cache == NULL) {
5620         _PyBytesWriter_Dealloc(&writer);
5621         PyErr_NoMemory();
5622         return -1;
5623     }
5624     _PyUnicode_UTF8(unicode) = cache;
5625     _PyUnicode_UTF8_LENGTH(unicode) = len;
5626     memcpy(cache, start, len);
5627     cache[len] = '\0';
5628     _PyBytesWriter_Dealloc(&writer);
5629     return 0;
5630 }
5631 
5632 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5634 {
5635     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5636 }
5637 
5638 
5639 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5640 PyUnicode_AsUTF8String(PyObject *unicode)
5641 {
5642     return _PyUnicode_AsUTF8String(unicode, NULL);
5643 }
5644 
5645 /* --- UTF-32 Codec ------------------------------------------------------- */
5646 
5647 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5648 PyUnicode_DecodeUTF32(const char *s,
5649                       Py_ssize_t size,
5650                       const char *errors,
5651                       int *byteorder)
5652 {
5653     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5654 }
5655 
5656 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5657 PyUnicode_DecodeUTF32Stateful(const char *s,
5658                               Py_ssize_t size,
5659                               const char *errors,
5660                               int *byteorder,
5661                               Py_ssize_t *consumed)
5662 {
5663     const char *starts = s;
5664     Py_ssize_t startinpos;
5665     Py_ssize_t endinpos;
5666     _PyUnicodeWriter writer;
5667     const unsigned char *q, *e;
5668     int le, bo = 0;       /* assume native ordering by default */
5669     const char *encoding;
5670     const char *errmsg = "";
5671     PyObject *errorHandler = NULL;
5672     PyObject *exc = NULL;
5673 
5674     q = (const unsigned char *)s;
5675     e = q + size;
5676 
5677     if (byteorder)
5678         bo = *byteorder;
5679 
5680     /* Check for BOM marks (U+FEFF) in the input and adjust current
5681        byte order setting accordingly. In native mode, the leading BOM
5682        mark is skipped, in all other modes, it is copied to the output
5683        stream as-is (giving a ZWNBSP character). */
5684     if (bo == 0 && size >= 4) {
5685         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5686         if (bom == 0x0000FEFF) {
5687             bo = -1;
5688             q += 4;
5689         }
5690         else if (bom == 0xFFFE0000) {
5691             bo = 1;
5692             q += 4;
5693         }
5694         if (byteorder)
5695             *byteorder = bo;
5696     }
5697 
5698     if (q == e) {
5699         if (consumed)
5700             *consumed = size;
5701         _Py_RETURN_UNICODE_EMPTY();
5702     }
5703 
5704 #ifdef WORDS_BIGENDIAN
5705     le = bo < 0;
5706 #else
5707     le = bo <= 0;
5708 #endif
5709     encoding = le ? "utf-32-le" : "utf-32-be";
5710 
5711     _PyUnicodeWriter_Init(&writer);
5712     writer.min_length = (e - q + 3) / 4;
5713     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5714         goto onError;
5715 
5716     while (1) {
5717         Py_UCS4 ch = 0;
5718         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5719 
5720         if (e - q >= 4) {
5721             enum PyUnicode_Kind kind = writer.kind;
5722             void *data = writer.data;
5723             const unsigned char *last = e - 4;
5724             Py_ssize_t pos = writer.pos;
5725             if (le) {
5726                 do {
5727                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5728                     if (ch > maxch)
5729                         break;
5730                     if (kind != PyUnicode_1BYTE_KIND &&
5731                         Py_UNICODE_IS_SURROGATE(ch))
5732                         break;
5733                     PyUnicode_WRITE(kind, data, pos++, ch);
5734                     q += 4;
5735                 } while (q <= last);
5736             }
5737             else {
5738                 do {
5739                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5740                     if (ch > maxch)
5741                         break;
5742                     if (kind != PyUnicode_1BYTE_KIND &&
5743                         Py_UNICODE_IS_SURROGATE(ch))
5744                         break;
5745                     PyUnicode_WRITE(kind, data, pos++, ch);
5746                     q += 4;
5747                 } while (q <= last);
5748             }
5749             writer.pos = pos;
5750         }
5751 
5752         if (Py_UNICODE_IS_SURROGATE(ch)) {
5753             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5754             startinpos = ((const char *)q) - starts;
5755             endinpos = startinpos + 4;
5756         }
5757         else if (ch <= maxch) {
5758             if (q == e || consumed)
5759                 break;
5760             /* remaining bytes at the end? (size should be divisible by 4) */
5761             errmsg = "truncated data";
5762             startinpos = ((const char *)q) - starts;
5763             endinpos = ((const char *)e) - starts;
5764         }
5765         else {
5766             if (ch < 0x110000) {
5767                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5768                     goto onError;
5769                 q += 4;
5770                 continue;
5771             }
5772             errmsg = "code point not in range(0x110000)";
5773             startinpos = ((const char *)q) - starts;
5774             endinpos = startinpos + 4;
5775         }
5776 
5777         /* The remaining input chars are ignored if the callback
5778            chooses to skip the input */
5779         if (unicode_decode_call_errorhandler_writer(
5780                 errors, &errorHandler,
5781                 encoding, errmsg,
5782                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5783                 &writer))
5784             goto onError;
5785     }
5786 
5787     if (consumed)
5788         *consumed = (const char *)q-starts;
5789 
5790     Py_XDECREF(errorHandler);
5791     Py_XDECREF(exc);
5792     return _PyUnicodeWriter_Finish(&writer);
5793 
5794   onError:
5795     _PyUnicodeWriter_Dealloc(&writer);
5796     Py_XDECREF(errorHandler);
5797     Py_XDECREF(exc);
5798     return NULL;
5799 }
5800 
5801 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5802 _PyUnicode_EncodeUTF32(PyObject *str,
5803                        const char *errors,
5804                        int byteorder)
5805 {
5806     enum PyUnicode_Kind kind;
5807     const void *data;
5808     Py_ssize_t len;
5809     PyObject *v;
5810     uint32_t *out;
5811 #if PY_LITTLE_ENDIAN
5812     int native_ordering = byteorder <= 0;
5813 #else
5814     int native_ordering = byteorder >= 0;
5815 #endif
5816     const char *encoding;
5817     Py_ssize_t nsize, pos;
5818     PyObject *errorHandler = NULL;
5819     PyObject *exc = NULL;
5820     PyObject *rep = NULL;
5821 
5822     if (!PyUnicode_Check(str)) {
5823         PyErr_BadArgument();
5824         return NULL;
5825     }
5826     if (PyUnicode_READY(str) == -1)
5827         return NULL;
5828     kind = PyUnicode_KIND(str);
5829     data = PyUnicode_DATA(str);
5830     len = PyUnicode_GET_LENGTH(str);
5831 
5832     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5833         return PyErr_NoMemory();
5834     nsize = len + (byteorder == 0);
5835     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5836     if (v == NULL)
5837         return NULL;
5838 
5839     /* output buffer is 4-bytes aligned */
5840     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5841     out = (uint32_t *)PyBytes_AS_STRING(v);
5842     if (byteorder == 0)
5843         *out++ = 0xFEFF;
5844     if (len == 0)
5845         goto done;
5846 
5847     if (byteorder == -1)
5848         encoding = "utf-32-le";
5849     else if (byteorder == 1)
5850         encoding = "utf-32-be";
5851     else
5852         encoding = "utf-32";
5853 
5854     if (kind == PyUnicode_1BYTE_KIND) {
5855         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5856         goto done;
5857     }
5858 
5859     pos = 0;
5860     while (pos < len) {
5861         Py_ssize_t newpos, repsize, moreunits;
5862 
5863         if (kind == PyUnicode_2BYTE_KIND) {
5864             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5865                                         &out, native_ordering);
5866         }
5867         else {
5868             assert(kind == PyUnicode_4BYTE_KIND);
5869             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5870                                         &out, native_ordering);
5871         }
5872         if (pos == len)
5873             break;
5874 
5875         rep = unicode_encode_call_errorhandler(
5876                 errors, &errorHandler,
5877                 encoding, "surrogates not allowed",
5878                 str, &exc, pos, pos + 1, &newpos);
5879         if (!rep)
5880             goto error;
5881 
5882         if (PyBytes_Check(rep)) {
5883             repsize = PyBytes_GET_SIZE(rep);
5884             if (repsize & 3) {
5885                 raise_encode_exception(&exc, encoding,
5886                                        str, pos, pos + 1,
5887                                        "surrogates not allowed");
5888                 goto error;
5889             }
5890             moreunits = repsize / 4;
5891         }
5892         else {
5893             assert(PyUnicode_Check(rep));
5894             if (PyUnicode_READY(rep) < 0)
5895                 goto error;
5896             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5897             if (!PyUnicode_IS_ASCII(rep)) {
5898                 raise_encode_exception(&exc, encoding,
5899                                        str, pos, pos + 1,
5900                                        "surrogates not allowed");
5901                 goto error;
5902             }
5903         }
5904         moreunits += pos - newpos;
5905         pos = newpos;
5906 
5907         /* four bytes are reserved for each surrogate */
5908         if (moreunits > 0) {
5909             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5910             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5911                 /* integer overflow */
5912                 PyErr_NoMemory();
5913                 goto error;
5914             }
5915             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
5916                 goto error;
5917             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5918         }
5919 
5920         if (PyBytes_Check(rep)) {
5921             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5922             out += repsize / 4;
5923         } else /* rep is unicode */ {
5924             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5925             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5926                                  &out, native_ordering);
5927         }
5928 
5929         Py_CLEAR(rep);
5930     }
5931 
5932     /* Cut back to size actually needed. This is necessary for, for example,
5933        encoding of a string containing isolated surrogates and the 'ignore'
5934        handler is used. */
5935     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5936     if (nsize != PyBytes_GET_SIZE(v))
5937       _PyBytes_Resize(&v, nsize);
5938     Py_XDECREF(errorHandler);
5939     Py_XDECREF(exc);
5940   done:
5941     return v;
5942   error:
5943     Py_XDECREF(rep);
5944     Py_XDECREF(errorHandler);
5945     Py_XDECREF(exc);
5946     Py_XDECREF(v);
5947     return NULL;
5948 }
5949 
5950 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5951 PyUnicode_AsUTF32String(PyObject *unicode)
5952 {
5953     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5954 }
5955 
5956 /* --- UTF-16 Codec ------------------------------------------------------- */
5957 
5958 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5959 PyUnicode_DecodeUTF16(const char *s,
5960                       Py_ssize_t size,
5961                       const char *errors,
5962                       int *byteorder)
5963 {
5964     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5965 }
5966 
5967 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5968 PyUnicode_DecodeUTF16Stateful(const char *s,
5969                               Py_ssize_t size,
5970                               const char *errors,
5971                               int *byteorder,
5972                               Py_ssize_t *consumed)
5973 {
5974     const char *starts = s;
5975     Py_ssize_t startinpos;
5976     Py_ssize_t endinpos;
5977     _PyUnicodeWriter writer;
5978     const unsigned char *q, *e;
5979     int bo = 0;       /* assume native ordering by default */
5980     int native_ordering;
5981     const char *errmsg = "";
5982     PyObject *errorHandler = NULL;
5983     PyObject *exc = NULL;
5984     const char *encoding;
5985 
5986     q = (const unsigned char *)s;
5987     e = q + size;
5988 
5989     if (byteorder)
5990         bo = *byteorder;
5991 
5992     /* Check for BOM marks (U+FEFF) in the input and adjust current
5993        byte order setting accordingly. In native mode, the leading BOM
5994        mark is skipped, in all other modes, it is copied to the output
5995        stream as-is (giving a ZWNBSP character). */
5996     if (bo == 0 && size >= 2) {
5997         const Py_UCS4 bom = (q[1] << 8) | q[0];
5998         if (bom == 0xFEFF) {
5999             q += 2;
6000             bo = -1;
6001         }
6002         else if (bom == 0xFFFE) {
6003             q += 2;
6004             bo = 1;
6005         }
6006         if (byteorder)
6007             *byteorder = bo;
6008     }
6009 
6010     if (q == e) {
6011         if (consumed)
6012             *consumed = size;
6013         _Py_RETURN_UNICODE_EMPTY();
6014     }
6015 
6016 #if PY_LITTLE_ENDIAN
6017     native_ordering = bo <= 0;
6018     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6019 #else
6020     native_ordering = bo >= 0;
6021     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6022 #endif
6023 
6024     /* Note: size will always be longer than the resulting Unicode
6025        character count normally.  Error handler will take care of
6026        resizing when needed. */
6027     _PyUnicodeWriter_Init(&writer);
6028     writer.min_length = (e - q + 1) / 2;
6029     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6030         goto onError;
6031 
6032     while (1) {
6033         Py_UCS4 ch = 0;
6034         if (e - q >= 2) {
6035             int kind = writer.kind;
6036             if (kind == PyUnicode_1BYTE_KIND) {
6037                 if (PyUnicode_IS_ASCII(writer.buffer))
6038                     ch = asciilib_utf16_decode(&q, e,
6039                             (Py_UCS1*)writer.data, &writer.pos,
6040                             native_ordering);
6041                 else
6042                     ch = ucs1lib_utf16_decode(&q, e,
6043                             (Py_UCS1*)writer.data, &writer.pos,
6044                             native_ordering);
6045             } else if (kind == PyUnicode_2BYTE_KIND) {
6046                 ch = ucs2lib_utf16_decode(&q, e,
6047                         (Py_UCS2*)writer.data, &writer.pos,
6048                         native_ordering);
6049             } else {
6050                 assert(kind == PyUnicode_4BYTE_KIND);
6051                 ch = ucs4lib_utf16_decode(&q, e,
6052                         (Py_UCS4*)writer.data, &writer.pos,
6053                         native_ordering);
6054             }
6055         }
6056 
6057         switch (ch)
6058         {
6059         case 0:
6060             /* remaining byte at the end? (size should be even) */
6061             if (q == e || consumed)
6062                 goto End;
6063             errmsg = "truncated data";
6064             startinpos = ((const char *)q) - starts;
6065             endinpos = ((const char *)e) - starts;
6066             break;
6067             /* The remaining input chars are ignored if the callback
6068                chooses to skip the input */
6069         case 1:
6070             q -= 2;
6071             if (consumed)
6072                 goto End;
6073             errmsg = "unexpected end of data";
6074             startinpos = ((const char *)q) - starts;
6075             endinpos = ((const char *)e) - starts;
6076             break;
6077         case 2:
6078             errmsg = "illegal encoding";
6079             startinpos = ((const char *)q) - 2 - starts;
6080             endinpos = startinpos + 2;
6081             break;
6082         case 3:
6083             errmsg = "illegal UTF-16 surrogate";
6084             startinpos = ((const char *)q) - 4 - starts;
6085             endinpos = startinpos + 2;
6086             break;
6087         default:
6088             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6089                 goto onError;
6090             continue;
6091         }
6092 
6093         if (unicode_decode_call_errorhandler_writer(
6094                 errors,
6095                 &errorHandler,
6096                 encoding, errmsg,
6097                 &starts,
6098                 (const char **)&e,
6099                 &startinpos,
6100                 &endinpos,
6101                 &exc,
6102                 (const char **)&q,
6103                 &writer))
6104             goto onError;
6105     }
6106 
6107 End:
6108     if (consumed)
6109         *consumed = (const char *)q-starts;
6110 
6111     Py_XDECREF(errorHandler);
6112     Py_XDECREF(exc);
6113     return _PyUnicodeWriter_Finish(&writer);
6114 
6115   onError:
6116     _PyUnicodeWriter_Dealloc(&writer);
6117     Py_XDECREF(errorHandler);
6118     Py_XDECREF(exc);
6119     return NULL;
6120 }
6121 
6122 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6123 _PyUnicode_EncodeUTF16(PyObject *str,
6124                        const char *errors,
6125                        int byteorder)
6126 {
6127     enum PyUnicode_Kind kind;
6128     const void *data;
6129     Py_ssize_t len;
6130     PyObject *v;
6131     unsigned short *out;
6132     Py_ssize_t pairs;
6133 #if PY_BIG_ENDIAN
6134     int native_ordering = byteorder >= 0;
6135 #else
6136     int native_ordering = byteorder <= 0;
6137 #endif
6138     const char *encoding;
6139     Py_ssize_t nsize, pos;
6140     PyObject *errorHandler = NULL;
6141     PyObject *exc = NULL;
6142     PyObject *rep = NULL;
6143 
6144     if (!PyUnicode_Check(str)) {
6145         PyErr_BadArgument();
6146         return NULL;
6147     }
6148     if (PyUnicode_READY(str) == -1)
6149         return NULL;
6150     kind = PyUnicode_KIND(str);
6151     data = PyUnicode_DATA(str);
6152     len = PyUnicode_GET_LENGTH(str);
6153 
6154     pairs = 0;
6155     if (kind == PyUnicode_4BYTE_KIND) {
6156         const Py_UCS4 *in = (const Py_UCS4 *)data;
6157         const Py_UCS4 *end = in + len;
6158         while (in < end) {
6159             if (*in++ >= 0x10000) {
6160                 pairs++;
6161             }
6162         }
6163     }
6164     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6165         return PyErr_NoMemory();
6166     }
6167     nsize = len + pairs + (byteorder == 0);
6168     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6169     if (v == NULL) {
6170         return NULL;
6171     }
6172 
6173     /* output buffer is 2-bytes aligned */
6174     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6175     out = (unsigned short *)PyBytes_AS_STRING(v);
6176     if (byteorder == 0) {
6177         *out++ = 0xFEFF;
6178     }
6179     if (len == 0) {
6180         goto done;
6181     }
6182 
6183     if (kind == PyUnicode_1BYTE_KIND) {
6184         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6185         goto done;
6186     }
6187 
6188     if (byteorder < 0) {
6189         encoding = "utf-16-le";
6190     }
6191     else if (byteorder > 0) {
6192         encoding = "utf-16-be";
6193     }
6194     else {
6195         encoding = "utf-16";
6196     }
6197 
6198     pos = 0;
6199     while (pos < len) {
6200         Py_ssize_t newpos, repsize, moreunits;
6201 
6202         if (kind == PyUnicode_2BYTE_KIND) {
6203             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6204                                         &out, native_ordering);
6205         }
6206         else {
6207             assert(kind == PyUnicode_4BYTE_KIND);
6208             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6209                                         &out, native_ordering);
6210         }
6211         if (pos == len)
6212             break;
6213 
6214         rep = unicode_encode_call_errorhandler(
6215                 errors, &errorHandler,
6216                 encoding, "surrogates not allowed",
6217                 str, &exc, pos, pos + 1, &newpos);
6218         if (!rep)
6219             goto error;
6220 
6221         if (PyBytes_Check(rep)) {
6222             repsize = PyBytes_GET_SIZE(rep);
6223             if (repsize & 1) {
6224                 raise_encode_exception(&exc, encoding,
6225                                        str, pos, pos + 1,
6226                                        "surrogates not allowed");
6227                 goto error;
6228             }
6229             moreunits = repsize / 2;
6230         }
6231         else {
6232             assert(PyUnicode_Check(rep));
6233             if (PyUnicode_READY(rep) < 0)
6234                 goto error;
6235             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6236             if (!PyUnicode_IS_ASCII(rep)) {
6237                 raise_encode_exception(&exc, encoding,
6238                                        str, pos, pos + 1,
6239                                        "surrogates not allowed");
6240                 goto error;
6241             }
6242         }
6243         moreunits += pos - newpos;
6244         pos = newpos;
6245 
6246         /* two bytes are reserved for each surrogate */
6247         if (moreunits > 0) {
6248             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6249             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6250                 /* integer overflow */
6251                 PyErr_NoMemory();
6252                 goto error;
6253             }
6254             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6255                 goto error;
6256             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6257         }
6258 
6259         if (PyBytes_Check(rep)) {
6260             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6261             out += repsize / 2;
6262         } else /* rep is unicode */ {
6263             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6264             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6265                                  &out, native_ordering);
6266         }
6267 
6268         Py_CLEAR(rep);
6269     }
6270 
6271     /* Cut back to size actually needed. This is necessary for, for example,
6272     encoding of a string containing isolated surrogates and the 'ignore' handler
6273     is used. */
6274     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6275     if (nsize != PyBytes_GET_SIZE(v))
6276       _PyBytes_Resize(&v, nsize);
6277     Py_XDECREF(errorHandler);
6278     Py_XDECREF(exc);
6279   done:
6280     return v;
6281   error:
6282     Py_XDECREF(rep);
6283     Py_XDECREF(errorHandler);
6284     Py_XDECREF(exc);
6285     Py_XDECREF(v);
6286     return NULL;
6287 #undef STORECHAR
6288 }
6289 
6290 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6291 PyUnicode_AsUTF16String(PyObject *unicode)
6292 {
6293     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294 }
6295 
6296 /* --- Unicode Escape Codec ----------------------------------------------- */
6297 
6298 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299 
6300 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal2(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,int * first_invalid_escape_char,const char ** first_invalid_escape_ptr)6301 _PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6302                                Py_ssize_t size,
6303                                const char *errors,
6304                                Py_ssize_t *consumed,
6305                                int *first_invalid_escape_char,
6306                                const char **first_invalid_escape_ptr)
6307 {
6308     const char *starts = s;
6309     const char *initial_starts = starts;
6310     _PyUnicodeWriter writer;
6311     const char *end;
6312     PyObject *errorHandler = NULL;
6313     PyObject *exc = NULL;
6314 
6315     // so we can remember if we've seen an invalid escape char or not
6316     *first_invalid_escape_char = -1;
6317     *first_invalid_escape_ptr = NULL;
6318 
6319     if (size == 0) {
6320         if (consumed) {
6321             *consumed = 0;
6322         }
6323         _Py_RETURN_UNICODE_EMPTY();
6324     }
6325     /* Escaped strings will always be longer than the resulting
6326        Unicode string, so we start with size here and then reduce the
6327        length after conversion to the true value.
6328        (but if the error callback returns a long replacement string
6329        we'll have to allocate more space) */
6330     _PyUnicodeWriter_Init(&writer);
6331     writer.min_length = size;
6332     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6333         goto onError;
6334     }
6335 
6336     end = s + size;
6337     while (s < end) {
6338         unsigned char c = (unsigned char) *s++;
6339         Py_UCS4 ch;
6340         int count;
6341         const char *message;
6342 
6343 #define WRITE_ASCII_CHAR(ch)                                                  \
6344             do {                                                              \
6345                 assert(ch <= 127);                                            \
6346                 assert(writer.pos < writer.size);                             \
6347                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6348             } while(0)
6349 
6350 #define WRITE_CHAR(ch)                                                        \
6351             do {                                                              \
6352                 if (ch <= writer.maxchar) {                                   \
6353                     assert(writer.pos < writer.size);                         \
6354                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6355                 }                                                             \
6356                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6357                     goto onError;                                             \
6358                 }                                                             \
6359             } while(0)
6360 
6361         /* Non-escape characters are interpreted as Unicode ordinals */
6362         if (c != '\\') {
6363             WRITE_CHAR(c);
6364             continue;
6365         }
6366 
6367         Py_ssize_t startinpos = s - starts - 1;
6368         /* \ - Escapes */
6369         if (s >= end) {
6370             message = "\\ at end of string";
6371             goto incomplete;
6372         }
6373         c = (unsigned char) *s++;
6374 
6375         assert(writer.pos < writer.size);
6376         switch (c) {
6377 
6378             /* \x escapes */
6379         case '\n': continue;
6380         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6381         case '\'': WRITE_ASCII_CHAR('\''); continue;
6382         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6383         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6384         /* FF */
6385         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6386         case 't': WRITE_ASCII_CHAR('\t'); continue;
6387         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6388         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6389         /* VT */
6390         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6391         /* BEL, not classic C */
6392         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6393 
6394             /* \OOO (octal) escapes */
6395         case '0': case '1': case '2': case '3':
6396         case '4': case '5': case '6': case '7':
6397             ch = c - '0';
6398             if (s < end && '0' <= *s && *s <= '7') {
6399                 ch = (ch<<3) + *s++ - '0';
6400                 if (s < end && '0' <= *s && *s <= '7') {
6401                     ch = (ch<<3) + *s++ - '0';
6402                 }
6403             }
6404             if (ch > 0377) {
6405                 if (*first_invalid_escape_char == -1) {
6406                     *first_invalid_escape_char = ch;
6407                     if (starts == initial_starts) {
6408                         /* Back up 3 chars, since we've already incremented s. */
6409                         *first_invalid_escape_ptr = s - 3;
6410                     }
6411                 }
6412             }
6413             WRITE_CHAR(ch);
6414             continue;
6415 
6416             /* hex escapes */
6417             /* \xXX */
6418         case 'x':
6419             count = 2;
6420             message = "truncated \\xXX escape";
6421             goto hexescape;
6422 
6423             /* \uXXXX */
6424         case 'u':
6425             count = 4;
6426             message = "truncated \\uXXXX escape";
6427             goto hexescape;
6428 
6429             /* \UXXXXXXXX */
6430         case 'U':
6431             count = 8;
6432             message = "truncated \\UXXXXXXXX escape";
6433         hexescape:
6434             for (ch = 0; count; ++s, --count) {
6435                 if (s >= end) {
6436                     goto incomplete;
6437                 }
6438                 c = (unsigned char)*s;
6439                 ch <<= 4;
6440                 if (c >= '0' && c <= '9') {
6441                     ch += c - '0';
6442                 }
6443                 else if (c >= 'a' && c <= 'f') {
6444                     ch += c - ('a' - 10);
6445                 }
6446                 else if (c >= 'A' && c <= 'F') {
6447                     ch += c - ('A' - 10);
6448                 }
6449                 else {
6450                     goto error;
6451                 }
6452             }
6453 
6454             /* when we get here, ch is a 32-bit unicode character */
6455             if (ch > MAX_UNICODE) {
6456                 message = "illegal Unicode character";
6457                 goto error;
6458             }
6459 
6460             WRITE_CHAR(ch);
6461             continue;
6462 
6463             /* \N{name} */
6464         case 'N':
6465             if (ucnhash_capi == NULL) {
6466                 /* load the unicode data module */
6467                 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6468                                                 PyUnicodeData_CAPSULE_NAME, 1);
6469                 if (ucnhash_capi == NULL) {
6470                     PyErr_SetString(
6471                         PyExc_UnicodeError,
6472                         "\\N escapes not supported (can't load unicodedata module)"
6473                         );
6474                     goto onError;
6475                 }
6476             }
6477 
6478             message = "malformed \\N character escape";
6479             if (s >= end) {
6480                 goto incomplete;
6481             }
6482             if (*s == '{') {
6483                 const char *start = ++s;
6484                 size_t namelen;
6485                 /* look for the closing brace */
6486                 while (s < end && *s != '}')
6487                     s++;
6488                 if (s >= end) {
6489                     goto incomplete;
6490                 }
6491                 namelen = s - start;
6492                 if (namelen) {
6493                     /* found a name.  look it up in the unicode database */
6494                     s++;
6495                     ch = 0xffffffff; /* in case 'getcode' messes up */
6496                     if (namelen <= INT_MAX &&
6497                         ucnhash_capi->getcode(start, (int)namelen,
6498                                               &ch, 0)) {
6499                         assert(ch <= MAX_UNICODE);
6500                         WRITE_CHAR(ch);
6501                         continue;
6502                     }
6503                     message = "unknown Unicode character name";
6504                 }
6505             }
6506             goto error;
6507 
6508         default:
6509             if (*first_invalid_escape_char == -1) {
6510                 *first_invalid_escape_char = c;
6511                 if (starts == initial_starts) {
6512                     /* Back up one char, since we've already incremented s. */
6513                     *first_invalid_escape_ptr = s - 1;
6514                 }
6515             }
6516             WRITE_ASCII_CHAR('\\');
6517             WRITE_CHAR(c);
6518             continue;
6519         }
6520 
6521       incomplete:
6522         if (consumed) {
6523             *consumed = startinpos;
6524             break;
6525         }
6526       error:;
6527         Py_ssize_t endinpos = s-starts;
6528         writer.min_length = end - s + writer.pos;
6529         if (unicode_decode_call_errorhandler_writer(
6530                 errors, &errorHandler,
6531                 "unicodeescape", message,
6532                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6533                 &writer)) {
6534             goto onError;
6535         }
6536         assert(end - s <= writer.size - writer.pos);
6537 
6538 #undef WRITE_ASCII_CHAR
6539 #undef WRITE_CHAR
6540     }
6541 
6542     Py_XDECREF(errorHandler);
6543     Py_XDECREF(exc);
6544     return _PyUnicodeWriter_Finish(&writer);
6545 
6546   onError:
6547     _PyUnicodeWriter_Dealloc(&writer);
6548     Py_XDECREF(errorHandler);
6549     Py_XDECREF(exc);
6550     return NULL;
6551 }
6552 
6553 // Export for binary compatibility.
6554 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6555 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6556                                Py_ssize_t size,
6557                                const char *errors,
6558                                Py_ssize_t *consumed,
6559                                const char **first_invalid_escape)
6560 {
6561     int first_invalid_escape_char;
6562     return _PyUnicode_DecodeUnicodeEscapeInternal2(
6563             s, size, errors, consumed,
6564             &first_invalid_escape_char,
6565             first_invalid_escape);
6566 }
6567 
6568 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6569 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6570                               Py_ssize_t size,
6571                               const char *errors,
6572                               Py_ssize_t *consumed)
6573 {
6574     int first_invalid_escape_char;
6575     const char *first_invalid_escape_ptr;
6576     PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6577                                                       consumed,
6578                                                       &first_invalid_escape_char,
6579                                                       &first_invalid_escape_ptr);
6580     if (result == NULL)
6581         return NULL;
6582     if (first_invalid_escape_char != -1) {
6583         if (first_invalid_escape_char > 0xff) {
6584             char buf[12] = "";
6585             snprintf(buf, sizeof buf, "%o", first_invalid_escape_char);
6586             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6587                                  "invalid octal escape sequence '\\%s'",
6588                                  buf) < 0)
6589             {
6590                 Py_DECREF(result);
6591                 return NULL;
6592             }
6593         }
6594         else {
6595             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6596                                  "invalid escape sequence '\\%c'",
6597                                  first_invalid_escape_char) < 0)
6598             {
6599                 Py_DECREF(result);
6600                 return NULL;
6601             }
6602         }
6603     }
6604     return result;
6605 }
6606 
6607 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6608 PyUnicode_DecodeUnicodeEscape(const char *s,
6609                               Py_ssize_t size,
6610                               const char *errors)
6611 {
6612     return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6613 }
6614 
6615 /* Return a Unicode-Escape string version of the Unicode object. */
6616 
6617 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6618 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6619 {
6620     Py_ssize_t i, len;
6621     PyObject *repr;
6622     char *p;
6623     enum PyUnicode_Kind kind;
6624     const void *data;
6625     Py_ssize_t expandsize;
6626 
6627     /* Initial allocation is based on the longest-possible character
6628        escape.
6629 
6630        For UCS1 strings it's '\xxx', 4 bytes per source character.
6631        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6632        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6633     */
6634 
6635     if (!PyUnicode_Check(unicode)) {
6636         PyErr_BadArgument();
6637         return NULL;
6638     }
6639     if (PyUnicode_READY(unicode) == -1) {
6640         return NULL;
6641     }
6642 
6643     len = PyUnicode_GET_LENGTH(unicode);
6644     if (len == 0) {
6645         return PyBytes_FromStringAndSize(NULL, 0);
6646     }
6647 
6648     kind = PyUnicode_KIND(unicode);
6649     data = PyUnicode_DATA(unicode);
6650     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6651        bytes, and 1 byte characters 4. */
6652     expandsize = kind * 2 + 2;
6653     if (len > PY_SSIZE_T_MAX / expandsize) {
6654         return PyErr_NoMemory();
6655     }
6656     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6657     if (repr == NULL) {
6658         return NULL;
6659     }
6660 
6661     p = PyBytes_AS_STRING(repr);
6662     for (i = 0; i < len; i++) {
6663         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6664 
6665         /* U+0000-U+00ff range */
6666         if (ch < 0x100) {
6667             if (ch >= ' ' && ch < 127) {
6668                 if (ch != '\\') {
6669                     /* Copy printable US ASCII as-is */
6670                     *p++ = (char) ch;
6671                 }
6672                 /* Escape backslashes */
6673                 else {
6674                     *p++ = '\\';
6675                     *p++ = '\\';
6676                 }
6677             }
6678 
6679             /* Map special whitespace to '\t', \n', '\r' */
6680             else if (ch == '\t') {
6681                 *p++ = '\\';
6682                 *p++ = 't';
6683             }
6684             else if (ch == '\n') {
6685                 *p++ = '\\';
6686                 *p++ = 'n';
6687             }
6688             else if (ch == '\r') {
6689                 *p++ = '\\';
6690                 *p++ = 'r';
6691             }
6692 
6693             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6694             else {
6695                 *p++ = '\\';
6696                 *p++ = 'x';
6697                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6698                 *p++ = Py_hexdigits[ch & 0x000F];
6699             }
6700         }
6701         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6702         else if (ch < 0x10000) {
6703             *p++ = '\\';
6704             *p++ = 'u';
6705             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6706             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6707             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6708             *p++ = Py_hexdigits[ch & 0x000F];
6709         }
6710         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6711         else {
6712 
6713             /* Make sure that the first two digits are zero */
6714             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6715             *p++ = '\\';
6716             *p++ = 'U';
6717             *p++ = '0';
6718             *p++ = '0';
6719             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6720             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6721             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6722             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6723             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6724             *p++ = Py_hexdigits[ch & 0x0000000F];
6725         }
6726     }
6727 
6728     assert(p - PyBytes_AS_STRING(repr) > 0);
6729     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6730         return NULL;
6731     }
6732     return repr;
6733 }
6734 
6735 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6736 
6737 PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6738 _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6739                                           Py_ssize_t size,
6740                                           const char *errors,
6741                                           Py_ssize_t *consumed)
6742 {
6743     const char *starts = s;
6744     _PyUnicodeWriter writer;
6745     const char *end;
6746     PyObject *errorHandler = NULL;
6747     PyObject *exc = NULL;
6748 
6749     if (size == 0) {
6750         if (consumed) {
6751             *consumed = 0;
6752         }
6753         _Py_RETURN_UNICODE_EMPTY();
6754     }
6755 
6756     /* Escaped strings will always be longer than the resulting
6757        Unicode string, so we start with size here and then reduce the
6758        length after conversion to the true value. (But decoding error
6759        handler might have to resize the string) */
6760     _PyUnicodeWriter_Init(&writer);
6761     writer.min_length = size;
6762     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6763         goto onError;
6764     }
6765 
6766     end = s + size;
6767     while (s < end) {
6768         unsigned char c = (unsigned char) *s++;
6769         Py_UCS4 ch;
6770         int count;
6771         const char *message;
6772 
6773 #define WRITE_CHAR(ch)                                                        \
6774             do {                                                              \
6775                 if (ch <= writer.maxchar) {                                   \
6776                     assert(writer.pos < writer.size);                         \
6777                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6778                 }                                                             \
6779                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6780                     goto onError;                                             \
6781                 }                                                             \
6782             } while(0)
6783 
6784         /* Non-escape characters are interpreted as Unicode ordinals */
6785         if (c != '\\' || (s >= end && !consumed)) {
6786             WRITE_CHAR(c);
6787             continue;
6788         }
6789 
6790         Py_ssize_t startinpos = s - starts - 1;
6791         /* \ - Escapes */
6792         if (s >= end) {
6793             assert(consumed);
6794             // Set message to silent compiler warning.
6795             // Actually it is never used.
6796             message = "\\ at end of string";
6797             goto incomplete;
6798         }
6799 
6800         c = (unsigned char) *s++;
6801         if (c == 'u') {
6802             count = 4;
6803             message = "truncated \\uXXXX escape";
6804         }
6805         else if (c == 'U') {
6806             count = 8;
6807             message = "truncated \\UXXXXXXXX escape";
6808         }
6809         else {
6810             assert(writer.pos < writer.size);
6811             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6812             WRITE_CHAR(c);
6813             continue;
6814         }
6815 
6816         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6817         for (ch = 0; count; ++s, --count) {
6818             if (s >= end) {
6819                 goto incomplete;
6820             }
6821             c = (unsigned char)*s;
6822             ch <<= 4;
6823             if (c >= '0' && c <= '9') {
6824                 ch += c - '0';
6825             }
6826             else if (c >= 'a' && c <= 'f') {
6827                 ch += c - ('a' - 10);
6828             }
6829             else if (c >= 'A' && c <= 'F') {
6830                 ch += c - ('A' - 10);
6831             }
6832             else {
6833                 goto error;
6834             }
6835         }
6836         if (ch > MAX_UNICODE) {
6837             message = "\\Uxxxxxxxx out of range";
6838             goto error;
6839         }
6840         WRITE_CHAR(ch);
6841         continue;
6842 
6843       incomplete:
6844         if (consumed) {
6845             *consumed = startinpos;
6846             break;
6847         }
6848       error:;
6849         Py_ssize_t endinpos = s-starts;
6850         writer.min_length = end - s + writer.pos;
6851         if (unicode_decode_call_errorhandler_writer(
6852                 errors, &errorHandler,
6853                 "rawunicodeescape", message,
6854                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6855                 &writer)) {
6856             goto onError;
6857         }
6858         assert(end - s <= writer.size - writer.pos);
6859 
6860 #undef WRITE_CHAR
6861     }
6862     Py_XDECREF(errorHandler);
6863     Py_XDECREF(exc);
6864     return _PyUnicodeWriter_Finish(&writer);
6865 
6866   onError:
6867     _PyUnicodeWriter_Dealloc(&writer);
6868     Py_XDECREF(errorHandler);
6869     Py_XDECREF(exc);
6870     return NULL;
6871 }
6872 
6873 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6874 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6875                                  Py_ssize_t size,
6876                                  const char *errors)
6877 {
6878     return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6879 }
6880 
6881 
6882 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6883 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6884 {
6885     PyObject *repr;
6886     char *p;
6887     Py_ssize_t expandsize, pos;
6888     int kind;
6889     const void *data;
6890     Py_ssize_t len;
6891 
6892     if (!PyUnicode_Check(unicode)) {
6893         PyErr_BadArgument();
6894         return NULL;
6895     }
6896     if (PyUnicode_READY(unicode) == -1) {
6897         return NULL;
6898     }
6899     kind = PyUnicode_KIND(unicode);
6900     data = PyUnicode_DATA(unicode);
6901     len = PyUnicode_GET_LENGTH(unicode);
6902     if (kind == PyUnicode_1BYTE_KIND) {
6903         return PyBytes_FromStringAndSize(data, len);
6904     }
6905 
6906     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6907        bytes, and 1 byte characters 4. */
6908     expandsize = kind * 2 + 2;
6909 
6910     if (len > PY_SSIZE_T_MAX / expandsize) {
6911         return PyErr_NoMemory();
6912     }
6913     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6914     if (repr == NULL) {
6915         return NULL;
6916     }
6917     if (len == 0) {
6918         return repr;
6919     }
6920 
6921     p = PyBytes_AS_STRING(repr);
6922     for (pos = 0; pos < len; pos++) {
6923         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6924 
6925         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6926         if (ch < 0x100) {
6927             *p++ = (char) ch;
6928         }
6929         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6930         else if (ch < 0x10000) {
6931             *p++ = '\\';
6932             *p++ = 'u';
6933             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6934             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6935             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6936             *p++ = Py_hexdigits[ch & 15];
6937         }
6938         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6939         else {
6940             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6941             *p++ = '\\';
6942             *p++ = 'U';
6943             *p++ = '0';
6944             *p++ = '0';
6945             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6946             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6947             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6948             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6949             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6950             *p++ = Py_hexdigits[ch & 15];
6951         }
6952     }
6953 
6954     assert(p > PyBytes_AS_STRING(repr));
6955     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6956         return NULL;
6957     }
6958     return repr;
6959 }
6960 
6961 /* --- Latin-1 Codec ------------------------------------------------------ */
6962 
6963 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6964 PyUnicode_DecodeLatin1(const char *s,
6965                        Py_ssize_t size,
6966                        const char *errors)
6967 {
6968     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6969     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6970 }
6971 
6972 /* create or adjust a UnicodeEncodeError */
6973 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6974 make_encode_exception(PyObject **exceptionObject,
6975                       const char *encoding,
6976                       PyObject *unicode,
6977                       Py_ssize_t startpos, Py_ssize_t endpos,
6978                       const char *reason)
6979 {
6980     if (*exceptionObject == NULL) {
6981         *exceptionObject = PyObject_CallFunction(
6982             PyExc_UnicodeEncodeError, "sOnns",
6983             encoding, unicode, startpos, endpos, reason);
6984     }
6985     else {
6986         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6987             goto onError;
6988         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6989             goto onError;
6990         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6991             goto onError;
6992         return;
6993       onError:
6994         Py_CLEAR(*exceptionObject);
6995     }
6996 }
6997 
6998 /* raises a UnicodeEncodeError */
6999 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7000 raise_encode_exception(PyObject **exceptionObject,
7001                        const char *encoding,
7002                        PyObject *unicode,
7003                        Py_ssize_t startpos, Py_ssize_t endpos,
7004                        const char *reason)
7005 {
7006     make_encode_exception(exceptionObject,
7007                           encoding, unicode, startpos, endpos, reason);
7008     if (*exceptionObject != NULL)
7009         PyCodec_StrictErrors(*exceptionObject);
7010 }
7011 
7012 /* error handling callback helper:
7013    build arguments, call the callback and check the arguments,
7014    put the result into newpos and return the replacement string, which
7015    has to be freed by the caller */
7016 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)7017 unicode_encode_call_errorhandler(const char *errors,
7018                                  PyObject **errorHandler,
7019                                  const char *encoding, const char *reason,
7020                                  PyObject *unicode, PyObject **exceptionObject,
7021                                  Py_ssize_t startpos, Py_ssize_t endpos,
7022                                  Py_ssize_t *newpos)
7023 {
7024     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7025     Py_ssize_t len;
7026     PyObject *restuple;
7027     PyObject *resunicode;
7028 
7029     if (*errorHandler == NULL) {
7030         *errorHandler = PyCodec_LookupError(errors);
7031         if (*errorHandler == NULL)
7032             return NULL;
7033     }
7034 
7035     if (PyUnicode_READY(unicode) == -1)
7036         return NULL;
7037     len = PyUnicode_GET_LENGTH(unicode);
7038 
7039     make_encode_exception(exceptionObject,
7040                           encoding, unicode, startpos, endpos, reason);
7041     if (*exceptionObject == NULL)
7042         return NULL;
7043 
7044     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7045     if (restuple == NULL)
7046         return NULL;
7047     if (!PyTuple_Check(restuple)) {
7048         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7049         Py_DECREF(restuple);
7050         return NULL;
7051     }
7052     if (!PyArg_ParseTuple(restuple, argparse,
7053                           &resunicode, newpos)) {
7054         Py_DECREF(restuple);
7055         return NULL;
7056     }
7057     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7058         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7059         Py_DECREF(restuple);
7060         return NULL;
7061     }
7062     if (*newpos<0)
7063         *newpos = len + *newpos;
7064     if (*newpos<0 || *newpos>len) {
7065         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7066         Py_DECREF(restuple);
7067         return NULL;
7068     }
7069     Py_INCREF(resunicode);
7070     Py_DECREF(restuple);
7071     return resunicode;
7072 }
7073 
7074 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)7075 unicode_encode_ucs1(PyObject *unicode,
7076                     const char *errors,
7077                     const Py_UCS4 limit)
7078 {
7079     /* input state */
7080     Py_ssize_t pos=0, size;
7081     int kind;
7082     const void *data;
7083     /* pointer into the output */
7084     char *str;
7085     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7086     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7087     PyObject *error_handler_obj = NULL;
7088     PyObject *exc = NULL;
7089     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7090     PyObject *rep = NULL;
7091     /* output object */
7092     _PyBytesWriter writer;
7093 
7094     if (PyUnicode_READY(unicode) == -1)
7095         return NULL;
7096     size = PyUnicode_GET_LENGTH(unicode);
7097     kind = PyUnicode_KIND(unicode);
7098     data = PyUnicode_DATA(unicode);
7099     /* allocate enough for a simple encoding without
7100        replacements, if we need more, we'll resize */
7101     if (size == 0)
7102         return PyBytes_FromStringAndSize(NULL, 0);
7103 
7104     _PyBytesWriter_Init(&writer);
7105     str = _PyBytesWriter_Alloc(&writer, size);
7106     if (str == NULL)
7107         return NULL;
7108 
7109     while (pos < size) {
7110         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7111 
7112         /* can we encode this? */
7113         if (ch < limit) {
7114             /* no overflow check, because we know that the space is enough */
7115             *str++ = (char)ch;
7116             ++pos;
7117         }
7118         else {
7119             Py_ssize_t newpos, i;
7120             /* startpos for collecting unencodable chars */
7121             Py_ssize_t collstart = pos;
7122             Py_ssize_t collend = collstart + 1;
7123             /* find all unecodable characters */
7124 
7125             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7126                 ++collend;
7127 
7128             /* Only overallocate the buffer if it's not the last write */
7129             writer.overallocate = (collend < size);
7130 
7131             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7132             if (error_handler == _Py_ERROR_UNKNOWN)
7133                 error_handler = _Py_GetErrorHandler(errors);
7134 
7135             switch (error_handler) {
7136             case _Py_ERROR_STRICT:
7137                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7138                 goto onError;
7139 
7140             case _Py_ERROR_REPLACE:
7141                 memset(str, '?', collend - collstart);
7142                 str += (collend - collstart);
7143                 /* fall through */
7144             case _Py_ERROR_IGNORE:
7145                 pos = collend;
7146                 break;
7147 
7148             case _Py_ERROR_BACKSLASHREPLACE:
7149                 /* subtract preallocated bytes */
7150                 writer.min_size -= (collend - collstart);
7151                 str = backslashreplace(&writer, str,
7152                                        unicode, collstart, collend);
7153                 if (str == NULL)
7154                     goto onError;
7155                 pos = collend;
7156                 break;
7157 
7158             case _Py_ERROR_XMLCHARREFREPLACE:
7159                 /* subtract preallocated bytes */
7160                 writer.min_size -= (collend - collstart);
7161                 str = xmlcharrefreplace(&writer, str,
7162                                         unicode, collstart, collend);
7163                 if (str == NULL)
7164                     goto onError;
7165                 pos = collend;
7166                 break;
7167 
7168             case _Py_ERROR_SURROGATEESCAPE:
7169                 for (i = collstart; i < collend; ++i) {
7170                     ch = PyUnicode_READ(kind, data, i);
7171                     if (ch < 0xdc80 || 0xdcff < ch) {
7172                         /* Not a UTF-8b surrogate */
7173                         break;
7174                     }
7175                     *str++ = (char)(ch - 0xdc00);
7176                     ++pos;
7177                 }
7178                 if (i >= collend)
7179                     break;
7180                 collstart = pos;
7181                 assert(collstart != collend);
7182                 /* fall through */
7183 
7184             default:
7185                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7186                                                        encoding, reason, unicode, &exc,
7187                                                        collstart, collend, &newpos);
7188                 if (rep == NULL)
7189                     goto onError;
7190 
7191                 if (newpos < collstart) {
7192                     writer.overallocate = 1;
7193                     str = _PyBytesWriter_Prepare(&writer, str,
7194                                                  collstart - newpos);
7195                     if (str == NULL)
7196                         goto onError;
7197                 }
7198                 else {
7199                     /* subtract preallocated bytes */
7200                     writer.min_size -= newpos - collstart;
7201                     /* Only overallocate the buffer if it's not the last write */
7202                     writer.overallocate = (newpos < size);
7203                 }
7204 
7205                 if (PyBytes_Check(rep)) {
7206                     /* Directly copy bytes result to output. */
7207                     str = _PyBytesWriter_WriteBytes(&writer, str,
7208                                                     PyBytes_AS_STRING(rep),
7209                                                     PyBytes_GET_SIZE(rep));
7210                 }
7211                 else {
7212                     assert(PyUnicode_Check(rep));
7213 
7214                     if (PyUnicode_READY(rep) < 0)
7215                         goto onError;
7216 
7217                     if (limit == 256 ?
7218                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7219                         !PyUnicode_IS_ASCII(rep))
7220                     {
7221                         /* Not all characters are smaller than limit */
7222                         raise_encode_exception(&exc, encoding, unicode,
7223                                                collstart, collend, reason);
7224                         goto onError;
7225                     }
7226                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7227                     str = _PyBytesWriter_WriteBytes(&writer, str,
7228                                                     PyUnicode_DATA(rep),
7229                                                     PyUnicode_GET_LENGTH(rep));
7230                 }
7231                 if (str == NULL)
7232                     goto onError;
7233 
7234                 pos = newpos;
7235                 Py_CLEAR(rep);
7236             }
7237 
7238             /* If overallocation was disabled, ensure that it was the last
7239                write. Otherwise, we missed an optimization */
7240             assert(writer.overallocate || pos == size);
7241         }
7242     }
7243 
7244     Py_XDECREF(error_handler_obj);
7245     Py_XDECREF(exc);
7246     return _PyBytesWriter_Finish(&writer, str);
7247 
7248   onError:
7249     Py_XDECREF(rep);
7250     _PyBytesWriter_Dealloc(&writer);
7251     Py_XDECREF(error_handler_obj);
7252     Py_XDECREF(exc);
7253     return NULL;
7254 }
7255 
7256 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7257 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7258 {
7259     if (!PyUnicode_Check(unicode)) {
7260         PyErr_BadArgument();
7261         return NULL;
7262     }
7263     if (PyUnicode_READY(unicode) == -1)
7264         return NULL;
7265     /* Fast path: if it is a one-byte string, construct
7266        bytes object directly. */
7267     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7268         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7269                                          PyUnicode_GET_LENGTH(unicode));
7270     /* Non-Latin-1 characters present. Defer to above function to
7271        raise the exception. */
7272     return unicode_encode_ucs1(unicode, errors, 256);
7273 }
7274 
7275 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7276 PyUnicode_AsLatin1String(PyObject *unicode)
7277 {
7278     return _PyUnicode_AsLatin1String(unicode, NULL);
7279 }
7280 
7281 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7282 
7283 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7284 PyUnicode_DecodeASCII(const char *s,
7285                       Py_ssize_t size,
7286                       const char *errors)
7287 {
7288     const char *starts = s;
7289     const char *e = s + size;
7290     PyObject *error_handler_obj = NULL;
7291     PyObject *exc = NULL;
7292     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7293 
7294     if (size == 0)
7295         _Py_RETURN_UNICODE_EMPTY();
7296 
7297     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7298     if (size == 1 && (unsigned char)s[0] < 128) {
7299         return get_latin1_char((unsigned char)s[0]);
7300     }
7301 
7302     // Shortcut for simple case
7303     PyObject *u = PyUnicode_New(size, 127);
7304     if (u == NULL) {
7305         return NULL;
7306     }
7307     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7308     if (outpos == size) {
7309         return u;
7310     }
7311 
7312     _PyUnicodeWriter writer;
7313     _PyUnicodeWriter_InitWithBuffer(&writer, u);
7314     writer.pos = outpos;
7315 
7316     s += outpos;
7317     int kind = writer.kind;
7318     void *data = writer.data;
7319     Py_ssize_t startinpos, endinpos;
7320 
7321     while (s < e) {
7322         unsigned char c = (unsigned char)*s;
7323         if (c < 128) {
7324             PyUnicode_WRITE(kind, data, writer.pos, c);
7325             writer.pos++;
7326             ++s;
7327             continue;
7328         }
7329 
7330         /* byte outsize range 0x00..0x7f: call the error handler */
7331 
7332         if (error_handler == _Py_ERROR_UNKNOWN)
7333             error_handler = _Py_GetErrorHandler(errors);
7334 
7335         switch (error_handler)
7336         {
7337         case _Py_ERROR_REPLACE:
7338         case _Py_ERROR_SURROGATEESCAPE:
7339             /* Fast-path: the error handler only writes one character,
7340                but we may switch to UCS2 at the first write */
7341             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7342                 goto onError;
7343             kind = writer.kind;
7344             data = writer.data;
7345 
7346             if (error_handler == _Py_ERROR_REPLACE)
7347                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7348             else
7349                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7350             writer.pos++;
7351             ++s;
7352             break;
7353 
7354         case _Py_ERROR_IGNORE:
7355             ++s;
7356             break;
7357 
7358         default:
7359             startinpos = s-starts;
7360             endinpos = startinpos + 1;
7361             if (unicode_decode_call_errorhandler_writer(
7362                     errors, &error_handler_obj,
7363                     "ascii", "ordinal not in range(128)",
7364                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7365                     &writer))
7366                 goto onError;
7367             kind = writer.kind;
7368             data = writer.data;
7369         }
7370     }
7371     Py_XDECREF(error_handler_obj);
7372     Py_XDECREF(exc);
7373     return _PyUnicodeWriter_Finish(&writer);
7374 
7375   onError:
7376     _PyUnicodeWriter_Dealloc(&writer);
7377     Py_XDECREF(error_handler_obj);
7378     Py_XDECREF(exc);
7379     return NULL;
7380 }
7381 
7382 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7383 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7384 {
7385     if (!PyUnicode_Check(unicode)) {
7386         PyErr_BadArgument();
7387         return NULL;
7388     }
7389     if (PyUnicode_READY(unicode) == -1)
7390         return NULL;
7391     /* Fast path: if it is an ASCII-only string, construct bytes object
7392        directly. Else defer to above function to raise the exception. */
7393     if (PyUnicode_IS_ASCII(unicode))
7394         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7395                                          PyUnicode_GET_LENGTH(unicode));
7396     return unicode_encode_ucs1(unicode, errors, 128);
7397 }
7398 
7399 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7400 PyUnicode_AsASCIIString(PyObject *unicode)
7401 {
7402     return _PyUnicode_AsASCIIString(unicode, NULL);
7403 }
7404 
7405 #ifdef MS_WINDOWS
7406 
7407 /* --- MBCS codecs for Windows -------------------------------------------- */
7408 
7409 #if SIZEOF_INT < SIZEOF_SIZE_T
7410 #define NEED_RETRY
7411 #endif
7412 
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7417 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7418 
7419 #ifndef WC_ERR_INVALID_CHARS
7420 #  define WC_ERR_INVALID_CHARS 0x0080
7421 #endif
7422 
7423 static const char*
code_page_name(UINT code_page,PyObject ** obj)7424 code_page_name(UINT code_page, PyObject **obj)
7425 {
7426     *obj = NULL;
7427     if (code_page == CP_ACP)
7428         return "mbcs";
7429     if (code_page == CP_UTF7)
7430         return "CP_UTF7";
7431     if (code_page == CP_UTF8)
7432         return "CP_UTF8";
7433 
7434     *obj = PyBytes_FromFormat("cp%u", code_page);
7435     if (*obj == NULL)
7436         return NULL;
7437     return PyBytes_AS_STRING(*obj);
7438 }
7439 
7440 static DWORD
decode_code_page_flags(UINT code_page)7441 decode_code_page_flags(UINT code_page)
7442 {
7443     if (code_page == CP_UTF7) {
7444         /* The CP_UTF7 decoder only supports flags=0 */
7445         return 0;
7446     }
7447     else
7448         return MB_ERR_INVALID_CHARS;
7449 }
7450 
7451 /*
7452  * Decode a byte string from a Windows code page into unicode object in strict
7453  * mode.
7454  *
7455  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7456  * OSError and returns -1 on other error.
7457  */
7458 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7459 decode_code_page_strict(UINT code_page,
7460                         wchar_t **buf,
7461                         Py_ssize_t *bufsize,
7462                         const char *in,
7463                         int insize)
7464 {
7465     DWORD flags = MB_ERR_INVALID_CHARS;
7466     wchar_t *out;
7467     DWORD outsize;
7468 
7469     /* First get the size of the result */
7470     assert(insize > 0);
7471     while ((outsize = MultiByteToWideChar(code_page, flags,
7472                                           in, insize, NULL, 0)) <= 0)
7473     {
7474         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7475             goto error;
7476         }
7477         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7478         flags = 0;
7479     }
7480 
7481     /* Extend a wchar_t* buffer */
7482     Py_ssize_t n = *bufsize;   /* Get the current length */
7483     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7484         return -1;
7485     }
7486     out = *buf + n;
7487 
7488     /* Do the conversion */
7489     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7490     if (outsize <= 0)
7491         goto error;
7492     return insize;
7493 
7494 error:
7495     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7496         return -2;
7497     PyErr_SetFromWindowsErr(0);
7498     return -1;
7499 }
7500 
7501 /*
7502  * Decode a byte string from a code page into unicode object with an error
7503  * handler.
7504  *
7505  * Returns consumed size if succeed, or raise an OSError or
7506  * UnicodeDecodeError exception and returns -1 on error.
7507  */
7508 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7509 decode_code_page_errors(UINT code_page,
7510                         wchar_t **buf,
7511                         Py_ssize_t *bufsize,
7512                         const char *in, const int size,
7513                         const char *errors, int final)
7514 {
7515     const char *startin = in;
7516     const char *endin = in + size;
7517     DWORD flags = MB_ERR_INVALID_CHARS;
7518     /* Ideally, we should get reason from FormatMessage. This is the Windows
7519        2000 English version of the message. */
7520     const char *reason = "No mapping for the Unicode character exists "
7521                          "in the target code page.";
7522     /* each step cannot decode more than 1 character, but a character can be
7523        represented as a surrogate pair */
7524     wchar_t buffer[2], *out;
7525     int insize;
7526     Py_ssize_t outsize;
7527     PyObject *errorHandler = NULL;
7528     PyObject *exc = NULL;
7529     PyObject *encoding_obj = NULL;
7530     const char *encoding;
7531     DWORD err;
7532     int ret = -1;
7533 
7534     assert(size > 0);
7535 
7536     encoding = code_page_name(code_page, &encoding_obj);
7537     if (encoding == NULL)
7538         return -1;
7539 
7540     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7541         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7542            UnicodeDecodeError. */
7543         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7544         if (exc != NULL) {
7545             PyCodec_StrictErrors(exc);
7546             Py_CLEAR(exc);
7547         }
7548         goto error;
7549     }
7550 
7551     /* Extend a wchar_t* buffer */
7552     Py_ssize_t n = *bufsize;   /* Get the current length */
7553     if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7554         PyErr_NoMemory();
7555         goto error;
7556     }
7557     if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7558         goto error;
7559     }
7560     out = *buf + n;
7561 
7562     /* Decode the byte string character per character */
7563     while (in < endin)
7564     {
7565         /* Decode a character */
7566         insize = 1;
7567         do
7568         {
7569             outsize = MultiByteToWideChar(code_page, flags,
7570                                           in, insize,
7571                                           buffer, Py_ARRAY_LENGTH(buffer));
7572             if (outsize > 0)
7573                 break;
7574             err = GetLastError();
7575             if (err == ERROR_INVALID_FLAGS && flags) {
7576                 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7577                 flags = 0;
7578                 continue;
7579             }
7580             if (err != ERROR_NO_UNICODE_TRANSLATION
7581                 && err != ERROR_INSUFFICIENT_BUFFER)
7582             {
7583                 PyErr_SetFromWindowsErr(0);
7584                 goto error;
7585             }
7586             insize++;
7587         }
7588         /* 4=maximum length of a UTF-8 sequence */
7589         while (insize <= 4 && (in + insize) <= endin);
7590 
7591         if (outsize <= 0) {
7592             Py_ssize_t startinpos, endinpos, outpos;
7593 
7594             /* last character in partial decode? */
7595             if (in + insize >= endin && !final)
7596                 break;
7597 
7598             startinpos = in - startin;
7599             endinpos = startinpos + 1;
7600             outpos = out - *buf;
7601             if (unicode_decode_call_errorhandler_wchar(
7602                     errors, &errorHandler,
7603                     encoding, reason,
7604                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
7605                     buf, bufsize, &outpos))
7606             {
7607                 goto error;
7608             }
7609             out = *buf + outpos;
7610         }
7611         else {
7612             in += insize;
7613             memcpy(out, buffer, outsize * sizeof(wchar_t));
7614             out += outsize;
7615         }
7616     }
7617 
7618     /* Shrink the buffer */
7619     assert(out - *buf <= *bufsize);
7620     *bufsize = out - *buf;
7621     /* (in - startin) <= size and size is an int */
7622     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7623 
7624 error:
7625     Py_XDECREF(encoding_obj);
7626     Py_XDECREF(errorHandler);
7627     Py_XDECREF(exc);
7628     return ret;
7629 }
7630 
7631 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7632 decode_code_page_stateful(int code_page,
7633                           const char *s, Py_ssize_t size,
7634                           const char *errors, Py_ssize_t *consumed)
7635 {
7636     wchar_t *buf = NULL;
7637     Py_ssize_t bufsize = 0;
7638     int chunk_size, final, converted, done;
7639 
7640     if (code_page < 0) {
7641         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7642         return NULL;
7643     }
7644     if (size < 0) {
7645         PyErr_BadInternalCall();
7646         return NULL;
7647     }
7648 
7649     if (consumed)
7650         *consumed = 0;
7651 
7652     do
7653     {
7654 #ifdef NEED_RETRY
7655         if (size > DECODING_CHUNK_SIZE) {
7656             chunk_size = DECODING_CHUNK_SIZE;
7657             final = 0;
7658             done = 0;
7659         }
7660         else
7661 #endif
7662         {
7663             chunk_size = (int)size;
7664             final = (consumed == NULL);
7665             done = 1;
7666         }
7667 
7668         if (chunk_size == 0 && done) {
7669             if (buf != NULL)
7670                 break;
7671             _Py_RETURN_UNICODE_EMPTY();
7672         }
7673 
7674         converted = decode_code_page_strict(code_page, &buf, &bufsize,
7675                                             s, chunk_size);
7676         if (converted == -2)
7677             converted = decode_code_page_errors(code_page, &buf, &bufsize,
7678                                                 s, chunk_size,
7679                                                 errors, final);
7680         assert(converted != 0 || done);
7681 
7682         if (converted < 0) {
7683             PyMem_Free(buf);
7684             return NULL;
7685         }
7686 
7687         if (consumed)
7688             *consumed += converted;
7689 
7690         s += converted;
7691         size -= converted;
7692     } while (!done);
7693 
7694     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7695     PyMem_Free(buf);
7696     return v;
7697 }
7698 
7699 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7700 PyUnicode_DecodeCodePageStateful(int code_page,
7701                                  const char *s,
7702                                  Py_ssize_t size,
7703                                  const char *errors,
7704                                  Py_ssize_t *consumed)
7705 {
7706     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7707 }
7708 
7709 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7710 PyUnicode_DecodeMBCSStateful(const char *s,
7711                              Py_ssize_t size,
7712                              const char *errors,
7713                              Py_ssize_t *consumed)
7714 {
7715     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7716 }
7717 
7718 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7719 PyUnicode_DecodeMBCS(const char *s,
7720                      Py_ssize_t size,
7721                      const char *errors)
7722 {
7723     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7724 }
7725 
7726 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7727 encode_code_page_flags(UINT code_page, const char *errors)
7728 {
7729     if (code_page == CP_UTF8) {
7730         return WC_ERR_INVALID_CHARS;
7731     }
7732     else if (code_page == CP_UTF7) {
7733         /* CP_UTF7 only supports flags=0 */
7734         return 0;
7735     }
7736     else {
7737         if (errors != NULL && strcmp(errors, "replace") == 0)
7738             return 0;
7739         else
7740             return WC_NO_BEST_FIT_CHARS;
7741     }
7742 }
7743 
7744 /*
7745  * Encode a Unicode string to a Windows code page into a byte string in strict
7746  * mode.
7747  *
7748  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7749  * an OSError and returns -1 on other error.
7750  */
7751 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7752 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7753                         PyObject *unicode, Py_ssize_t offset, int len,
7754                         const char* errors)
7755 {
7756     BOOL usedDefaultChar = FALSE;
7757     BOOL *pusedDefaultChar = &usedDefaultChar;
7758     int outsize;
7759     wchar_t *p;
7760     Py_ssize_t size;
7761     const DWORD flags = encode_code_page_flags(code_page, NULL);
7762     char *out;
7763     /* Create a substring so that we can get the UTF-16 representation
7764        of just the slice under consideration. */
7765     PyObject *substring;
7766     int ret = -1;
7767 
7768     assert(len > 0);
7769 
7770     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7771         pusedDefaultChar = &usedDefaultChar;
7772     else
7773         pusedDefaultChar = NULL;
7774 
7775     substring = PyUnicode_Substring(unicode, offset, offset+len);
7776     if (substring == NULL)
7777         return -1;
7778 #if USE_UNICODE_WCHAR_CACHE
7779 _Py_COMP_DIAG_PUSH
7780 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7781     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7782     if (p == NULL) {
7783         Py_DECREF(substring);
7784         return -1;
7785     }
7786 _Py_COMP_DIAG_POP
7787 #else /* USE_UNICODE_WCHAR_CACHE */
7788     p = PyUnicode_AsWideCharString(substring, &size);
7789     Py_CLEAR(substring);
7790     if (p == NULL) {
7791         return -1;
7792     }
7793 #endif /* USE_UNICODE_WCHAR_CACHE */
7794     assert(size <= INT_MAX);
7795 
7796     /* First get the size of the result */
7797     outsize = WideCharToMultiByte(code_page, flags,
7798                                   p, (int)size,
7799                                   NULL, 0,
7800                                   NULL, pusedDefaultChar);
7801     if (outsize <= 0)
7802         goto error;
7803     /* If we used a default char, then we failed! */
7804     if (pusedDefaultChar && *pusedDefaultChar) {
7805         ret = -2;
7806         goto done;
7807     }
7808 
7809     if (*outbytes == NULL) {
7810         /* Create string object */
7811         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7812         if (*outbytes == NULL) {
7813             goto done;
7814         }
7815         out = PyBytes_AS_STRING(*outbytes);
7816     }
7817     else {
7818         /* Extend string object */
7819         const Py_ssize_t n = PyBytes_Size(*outbytes);
7820         if (outsize > PY_SSIZE_T_MAX - n) {
7821             PyErr_NoMemory();
7822             goto done;
7823         }
7824         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7825             goto done;
7826         }
7827         out = PyBytes_AS_STRING(*outbytes) + n;
7828     }
7829 
7830     /* Do the conversion */
7831     outsize = WideCharToMultiByte(code_page, flags,
7832                                   p, (int)size,
7833                                   out, outsize,
7834                                   NULL, pusedDefaultChar);
7835     if (outsize <= 0)
7836         goto error;
7837     if (pusedDefaultChar && *pusedDefaultChar) {
7838         ret = -2;
7839         goto done;
7840     }
7841     ret = 0;
7842 
7843 done:
7844 #if USE_UNICODE_WCHAR_CACHE
7845     Py_DECREF(substring);
7846 #else /* USE_UNICODE_WCHAR_CACHE */
7847     PyMem_Free(p);
7848 #endif /* USE_UNICODE_WCHAR_CACHE */
7849     return ret;
7850 
7851 error:
7852     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7853         ret = -2;
7854         goto done;
7855     }
7856     PyErr_SetFromWindowsErr(0);
7857     goto done;
7858 }
7859 
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Encodes the insize characters of unicode that start at unicode_offset, one
 * character at a time, and appends the bytes to *outbytes.  When *outbytes is
 * NULL on entry a new bytes object is created, otherwise the existing one is
 * extended.  Characters the code page cannot represent are passed to the
 * error handler named by errors.
 *
 * Returns 0 on success, or raises an exception (an OSError for an unexpected
 * Windows API failure) and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        /* NOTE(review): the exception is built with start=0, end=0 rather
           than the position of the offending character. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        /* In strict mode this function always fails. */
        return -1;
    }

    /* The "used default char" output is not requested for CP_UTF8/CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) bytes per input character; check that the
       multiplication below cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        /* Append after the n bytes that are already present. */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Non-BMP characters are handed to Windows as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the bytes only if no default character was used;
               otherwise fall through to the error handler below. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            /* Any failure other than "cannot translate" is reported as is. */
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at pos cannot be encoded: ask the error handler for
           a replacement and for the position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* Extra room needed: the replacement length adjusted by how far the
           handler moved the input position.
           NOTE(review): the initial buffer budgets Py_ARRAY_LENGTH(buffer)
           bytes per input character, while a handler that moves newpos
           backwards is only credited (pos - newpos) extra bytes here;
           confirm this is enough when the re-encoded characters need more
           than one byte each. */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied to the output verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Resizing may move the buffer, so remember the write
                   offset and recompute out afterwards. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: only ASCII characters are accepted, each is
               written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Same resize-and-rebase dance as in the bytes branch. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        /* Resume where the error handler told us to. */
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated buffer to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared cleanup: reached on success (ret == 0) and failure (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8050 
8051 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8052 encode_code_page(int code_page,
8053                  PyObject *unicode,
8054                  const char *errors)
8055 {
8056     Py_ssize_t len;
8057     PyObject *outbytes = NULL;
8058     Py_ssize_t offset;
8059     int chunk_len, ret, done;
8060 
8061     if (!PyUnicode_Check(unicode)) {
8062         PyErr_BadArgument();
8063         return NULL;
8064     }
8065 
8066     if (PyUnicode_READY(unicode) == -1)
8067         return NULL;
8068     len = PyUnicode_GET_LENGTH(unicode);
8069 
8070     if (code_page < 0) {
8071         PyErr_SetString(PyExc_ValueError, "invalid code page number");
8072         return NULL;
8073     }
8074 
8075     if (len == 0)
8076         return PyBytes_FromStringAndSize(NULL, 0);
8077 
8078     offset = 0;
8079     do
8080     {
8081 #ifdef NEED_RETRY
8082         if (len > DECODING_CHUNK_SIZE) {
8083             chunk_len = DECODING_CHUNK_SIZE;
8084             done = 0;
8085         }
8086         else
8087 #endif
8088         {
8089             chunk_len = (int)len;
8090             done = 1;
8091         }
8092 
8093         ret = encode_code_page_strict(code_page, &outbytes,
8094                                       unicode, offset, chunk_len,
8095                                       errors);
8096         if (ret == -2)
8097             ret = encode_code_page_errors(code_page, &outbytes,
8098                                           unicode, offset,
8099                                           chunk_len, errors);
8100         if (ret < 0) {
8101             Py_XDECREF(outbytes);
8102             return NULL;
8103         }
8104 
8105         offset += chunk_len;
8106         len -= chunk_len;
8107     } while (!done);
8108 
8109     return outbytes;
8110 }
8111 
8112 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8113 PyUnicode_EncodeCodePage(int code_page,
8114                          PyObject *unicode,
8115                          const char *errors)
8116 {
8117     return encode_code_page(code_page, unicode, errors);
8118 }
8119 
8120 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8121 PyUnicode_AsMBCSString(PyObject *unicode)
8122 {
8123     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8124 }
8125 
8126 #undef NEED_RETRY
8127 
8128 #endif /* MS_WINDOWS */
8129 
8130 /* --- Character Mapping Codec -------------------------------------------- */
8131 
/* Decode the `size` bytes at `s` through a str lookup table.
 *
 * `mapping` is a str indexed by byte value; the character found there is
 * appended to `writer`.  A byte that lies beyond the end of the table, or
 * whose table entry is U+FFFE, is treated as undefined and passed to the
 * error handler named by `errors`.
 *
 * Returns 0 on success, -1 with an exception set on error.
 */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* Every byte has a table entry and a 1-byte entry cannot be U+FFFE,
         * so this path never reaches the error handler.  It writes straight
         * into the writer's buffer without a per-character capacity check
         * (presumably the caller has prepared room for `size` characters --
         * verify against caller). */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then refresh the
                   cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a complete 2-byte table.  They leave through
               `goto Error` as soon as a character needs the generic handling
               below; the outer loop then re-enters here with the writer's
               current kind. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Too wide for the current buffer (this also catches
                       U+FFFE): let the generic code deal with it. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Only the "undefined" marker needs the slow path. */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: bounds-checked lookup of a single byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Entry point for the fast loops above; x and s are already set. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            /* The handler is given the addresses of starts, e and s, so it
               may change them. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8250 
8251 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8252 charmap_decode_mapping(const char *s,
8253                        Py_ssize_t size,
8254                        PyObject *mapping,
8255                        const char *errors,
8256                        _PyUnicodeWriter *writer)
8257 {
8258     const char *starts = s;
8259     const char *e;
8260     Py_ssize_t startinpos, endinpos;
8261     PyObject *errorHandler = NULL, *exc = NULL;
8262     unsigned char ch;
8263     PyObject *key, *item = NULL;
8264 
8265     e = s + size;
8266 
8267     while (s < e) {
8268         ch = *s;
8269 
8270         /* Get mapping (char ordinal -> integer, Unicode char or None) */
8271         key = PyLong_FromLong((long)ch);
8272         if (key == NULL)
8273             goto onError;
8274 
8275         item = PyObject_GetItem(mapping, key);
8276         Py_DECREF(key);
8277         if (item == NULL) {
8278             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8279                 /* No mapping found means: mapping is undefined. */
8280                 PyErr_Clear();
8281                 goto Undefined;
8282             } else
8283                 goto onError;
8284         }
8285 
8286         /* Apply mapping */
8287         if (item == Py_None)
8288             goto Undefined;
8289         if (PyLong_Check(item)) {
8290             long value = PyLong_AS_LONG(item);
8291             if (value == 0xFFFE)
8292                 goto Undefined;
8293             if (value < 0 || value > MAX_UNICODE) {
8294                 PyErr_Format(PyExc_TypeError,
8295                              "character mapping must be in range(0x%x)",
8296                              (unsigned long)MAX_UNICODE + 1);
8297                 goto onError;
8298             }
8299 
8300             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8301                 goto onError;
8302         }
8303         else if (PyUnicode_Check(item)) {
8304             if (PyUnicode_READY(item) == -1)
8305                 goto onError;
8306             if (PyUnicode_GET_LENGTH(item) == 1) {
8307                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8308                 if (value == 0xFFFE)
8309                     goto Undefined;
8310                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8311                     goto onError;
8312             }
8313             else {
8314                 writer->overallocate = 1;
8315                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8316                     goto onError;
8317             }
8318         }
8319         else {
8320             /* wrong return value */
8321             PyErr_SetString(PyExc_TypeError,
8322                             "character mapping must return integer, None or str");
8323             goto onError;
8324         }
8325         Py_CLEAR(item);
8326         ++s;
8327         continue;
8328 
8329 Undefined:
8330         /* undefined mapping */
8331         Py_CLEAR(item);
8332         startinpos = s-starts;
8333         endinpos = startinpos+1;
8334         if (unicode_decode_call_errorhandler_writer(
8335                 errors, &errorHandler,
8336                 "charmap", "character maps to <undefined>",
8337                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8338                 writer)) {
8339             goto onError;
8340         }
8341     }
8342     Py_XDECREF(errorHandler);
8343     Py_XDECREF(exc);
8344     return 0;
8345 
8346 onError:
8347     Py_XDECREF(item);
8348     Py_XDECREF(errorHandler);
8349     Py_XDECREF(exc);
8350     return -1;
8351 }
8352 
8353 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8354 PyUnicode_DecodeCharmap(const char *s,
8355                         Py_ssize_t size,
8356                         PyObject *mapping,
8357                         const char *errors)
8358 {
8359     _PyUnicodeWriter writer;
8360 
8361     /* Default to Latin-1 */
8362     if (mapping == NULL)
8363         return PyUnicode_DecodeLatin1(s, size, errors);
8364 
8365     if (size == 0)
8366         _Py_RETURN_UNICODE_EMPTY();
8367     _PyUnicodeWriter_Init(&writer);
8368     writer.min_length = size;
8369     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8370         goto onError;
8371 
8372     if (PyUnicode_CheckExact(mapping)) {
8373         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8374             goto onError;
8375     }
8376     else {
8377         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8378             goto onError;
8379     }
8380     return _PyUnicodeWriter_Finish(&writer);
8381 
8382   onError:
8383     _PyUnicodeWriter_Dealloc(&writer);
8384     return NULL;
8385 }
8386 
8387 /* Charmap encoding: the lookup table */
8388 
8389 /*[clinic input]
8390 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8391 [clinic start generated code]*/
8392 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8393 
8394 struct encoding_map {
8395     PyObject_HEAD
8396     unsigned char level1[32];
8397     int count2, count3;
8398     unsigned char level23[1];
8399 };
8400 
8401 /*[clinic input]
8402 EncodingMap.size
8403 
8404 Return the size (in bytes) of this object.
8405 [clinic start generated code]*/
8406 
8407 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8408 EncodingMap_size_impl(struct encoding_map *self)
8409 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8410 {
8411     return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8412                            128*self->count3);
8413 }
8414 
8415 static PyMethodDef encoding_map_methods[] = {
8416     ENCODINGMAP_SIZE_METHODDEF
8417     {NULL, NULL}
8418 };
8419 
/* Type object for struct encoding_map, the lookup table created by
   PyUnicode_BuildEncodingMap().  Its only methods are those listed in
   encoding_map_methods. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "EncodingMap",
    /* Size of the fixed part only; the level23 tail is sized by the
       allocating code. */
    .tp_basicsize = sizeof(struct encoding_map),
    /* flags and method table */
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_methods = encoding_map_methods,
};
8428 
8429 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8430 PyUnicode_BuildEncodingMap(PyObject* string)
8431 {
8432     PyObject *result;
8433     struct encoding_map *mresult;
8434     int i;
8435     int need_dict = 0;
8436     unsigned char level1[32];
8437     unsigned char level2[512];
8438     unsigned char *mlevel1, *mlevel2, *mlevel3;
8439     int count2 = 0, count3 = 0;
8440     int kind;
8441     const void *data;
8442     Py_ssize_t length;
8443     Py_UCS4 ch;
8444 
8445     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8446         PyErr_BadArgument();
8447         return NULL;
8448     }
8449     kind = PyUnicode_KIND(string);
8450     data = PyUnicode_DATA(string);
8451     length = PyUnicode_GET_LENGTH(string);
8452     length = Py_MIN(length, 256);
8453     memset(level1, 0xFF, sizeof level1);
8454     memset(level2, 0xFF, sizeof level2);
8455 
8456     /* If there isn't a one-to-one mapping of NULL to \0,
8457        or if there are non-BMP characters, we need to use
8458        a mapping dictionary. */
8459     if (PyUnicode_READ(kind, data, 0) != 0)
8460         need_dict = 1;
8461     for (i = 1; i < length; i++) {
8462         int l1, l2;
8463         ch = PyUnicode_READ(kind, data, i);
8464         if (ch == 0 || ch > 0xFFFF) {
8465             need_dict = 1;
8466             break;
8467         }
8468         if (ch == 0xFFFE)
8469             /* unmapped character */
8470             continue;
8471         l1 = ch >> 11;
8472         l2 = ch >> 7;
8473         if (level1[l1] == 0xFF)
8474             level1[l1] = count2++;
8475         if (level2[l2] == 0xFF)
8476             level2[l2] = count3++;
8477     }
8478 
8479     if (count2 >= 0xFF || count3 >= 0xFF)
8480         need_dict = 1;
8481 
8482     if (need_dict) {
8483         PyObject *result = PyDict_New();
8484         PyObject *key, *value;
8485         if (!result)
8486             return NULL;
8487         for (i = 0; i < length; i++) {
8488             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8489             value = PyLong_FromLong(i);
8490             if (!key || !value)
8491                 goto failed1;
8492             if (PyDict_SetItem(result, key, value) == -1)
8493                 goto failed1;
8494             Py_DECREF(key);
8495             Py_DECREF(value);
8496         }
8497         return result;
8498       failed1:
8499         Py_XDECREF(key);
8500         Py_XDECREF(value);
8501         Py_DECREF(result);
8502         return NULL;
8503     }
8504 
8505     /* Create a three-level trie */
8506     result = PyObject_Malloc(sizeof(struct encoding_map) +
8507                              16*count2 + 128*count3 - 1);
8508     if (!result) {
8509         return PyErr_NoMemory();
8510     }
8511 
8512     _PyObject_Init(result, &EncodingMapType);
8513     mresult = (struct encoding_map*)result;
8514     mresult->count2 = count2;
8515     mresult->count3 = count3;
8516     mlevel1 = mresult->level1;
8517     mlevel2 = mresult->level23;
8518     mlevel3 = mresult->level23 + 16*count2;
8519     memcpy(mlevel1, level1, 32);
8520     memset(mlevel2, 0xFF, 16*count2);
8521     memset(mlevel3, 0, 128*count3);
8522     count3 = 0;
8523     for (i = 1; i < length; i++) {
8524         int o1, o2, o3, i2, i3;
8525         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8526         if (ch == 0xFFFE)
8527             /* unmapped character */
8528             continue;
8529         o1 = ch>>11;
8530         o2 = (ch>>7) & 0xF;
8531         i2 = 16*mlevel1[o1] + o2;
8532         if (mlevel2[i2] == 0xFF)
8533             mlevel2[i2] = count3++;
8534         o3 = ch & 0x7F;
8535         i3 = 128*mlevel2[i2] + o3;
8536         mlevel3[i3] = i;
8537     }
8538     return result;
8539 }
8540 
8541 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8542 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8543 {
8544     struct encoding_map *map = (struct encoding_map*)mapping;
8545     int l1 = c>>11;
8546     int l2 = (c>>7) & 0xF;
8547     int l3 = c & 0x7F;
8548     int i;
8549 
8550     if (c > 0xFFFF)
8551         return -1;
8552     if (c == 0)
8553         return 0;
8554     /* level 1*/
8555     i = map->level1[l1];
8556     if (i == 0xFF) {
8557         return -1;
8558     }
8559     /* level 2*/
8560     i = map->level23[16*i+l2];
8561     if (i == 0xFF) {
8562         return -1;
8563     }
8564     /* level 3 */
8565     i = map->level23[16*map->count2 + 128*i + l3];
8566     if (i == 0) {
8567         return -1;
8568     }
8569     return i;
8570 }
8571 
8572 /* Lookup the character ch in the mapping. If the character
8573    can't be found, Py_None is returned (or NULL, if another
8574    error occurred). */
8575 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8576 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8577 {
8578     PyObject *w = PyLong_FromLong((long)c);
8579     PyObject *x;
8580 
8581     if (w == NULL)
8582         return NULL;
8583     x = PyObject_GetItem(mapping, w);
8584     Py_DECREF(w);
8585     if (x == NULL) {
8586         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8587             /* No mapping found means: mapping is undefined. */
8588             PyErr_Clear();
8589             Py_RETURN_NONE;
8590         } else
8591             return NULL;
8592     }
8593     else if (x == Py_None)
8594         return x;
8595     else if (PyLong_Check(x)) {
8596         long value = PyLong_AS_LONG(x);
8597         if (value < 0 || value > 255) {
8598             PyErr_SetString(PyExc_TypeError,
8599                             "character mapping must be in range(256)");
8600             Py_DECREF(x);
8601             return NULL;
8602         }
8603         return x;
8604     }
8605     else if (PyBytes_Check(x))
8606         return x;
8607     else {
8608         /* wrong return value */
8609         PyErr_Format(PyExc_TypeError,
8610                      "character mapping must return integer, bytes or None, not %.400s",
8611                      Py_TYPE(x)->tp_name);
8612         Py_DECREF(x);
8613         return NULL;
8614     }
8615 }
8616 
8617 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8618 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8619 {
8620     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8621     /* exponentially overallocate to minimize reallocations */
8622     if (requiredsize < 2*outsize)
8623         requiredsize = 2*outsize;
8624     if (_PyBytes_Resize(outobj, requiredsize))
8625         return -1;
8626     return 0;
8627 }
8628 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
8632 /* lookup the character, put the result in the output string and adjust
8633    various state variables. Resize the output bytes object if not enough
8634    space is available. Return a new reference to the object that
8635    was put in the output buffer, or Py_None, if the mapping was undefined
8636    (in which case no character was written) or NULL, if a
8637    reallocation error occurred. The caller must decref the result */
8638 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8639 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8640                      PyObject **outobj, Py_ssize_t *outpos)
8641 {
8642     PyObject *rep;
8643     char *outstart;
8644     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8645 
8646     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8647         int res = encoding_map_lookup(c, mapping);
8648         Py_ssize_t requiredsize = *outpos+1;
8649         if (res == -1)
8650             return enc_FAILED;
8651         if (outsize<requiredsize)
8652             if (charmapencode_resize(outobj, outpos, requiredsize))
8653                 return enc_EXCEPTION;
8654         outstart = PyBytes_AS_STRING(*outobj);
8655         outstart[(*outpos)++] = (char)res;
8656         return enc_SUCCESS;
8657     }
8658 
8659     rep = charmapencode_lookup(c, mapping);
8660     if (rep==NULL)
8661         return enc_EXCEPTION;
8662     else if (rep==Py_None) {
8663         Py_DECREF(rep);
8664         return enc_FAILED;
8665     } else {
8666         if (PyLong_Check(rep)) {
8667             Py_ssize_t requiredsize = *outpos+1;
8668             if (outsize<requiredsize)
8669                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8670                     Py_DECREF(rep);
8671                     return enc_EXCEPTION;
8672                 }
8673             outstart = PyBytes_AS_STRING(*outobj);
8674             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8675         }
8676         else {
8677             const char *repchars = PyBytes_AS_STRING(rep);
8678             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8679             Py_ssize_t requiredsize = *outpos+repsize;
8680             if (outsize<requiredsize)
8681                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8682                     Py_DECREF(rep);
8683                     return enc_EXCEPTION;
8684                 }
8685             outstart = PyBytes_AS_STRING(*outobj);
8686             memcpy(outstart + *outpos, repchars, repsize);
8687             *outpos += repsize;
8688         }
8689     }
8690     Py_DECREF(rep);
8691     return enc_SUCCESS;
8692 }
8693 
8694 /* handle an error in PyUnicode_EncodeCharmap
8695    Return 0 on success, -1 on error */
8696 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8697 charmap_encoding_error(
8698     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8699     PyObject **exceptionObject,
8700     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8701     PyObject **res, Py_ssize_t *respos)
8702 {
8703     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8704     Py_ssize_t size, repsize;
8705     Py_ssize_t newpos;
8706     enum PyUnicode_Kind kind;
8707     const void *data;
8708     Py_ssize_t index;
8709     /* startpos for collecting unencodable chars */
8710     Py_ssize_t collstartpos = *inpos;
8711     Py_ssize_t collendpos = *inpos+1;
8712     Py_ssize_t collpos;
8713     const char *encoding = "charmap";
8714     const char *reason = "character maps to <undefined>";
8715     charmapencode_result x;
8716     Py_UCS4 ch;
8717     int val;
8718 
8719     if (PyUnicode_READY(unicode) == -1)
8720         return -1;
8721     size = PyUnicode_GET_LENGTH(unicode);
8722     /* find all unencodable characters */
8723     while (collendpos < size) {
8724         PyObject *rep;
8725         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8726             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8727             val = encoding_map_lookup(ch, mapping);
8728             if (val != -1)
8729                 break;
8730             ++collendpos;
8731             continue;
8732         }
8733 
8734         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8735         rep = charmapencode_lookup(ch, mapping);
8736         if (rep==NULL)
8737             return -1;
8738         else if (rep!=Py_None) {
8739             Py_DECREF(rep);
8740             break;
8741         }
8742         Py_DECREF(rep);
8743         ++collendpos;
8744     }
8745     /* cache callback name lookup
8746      * (if not done yet, i.e. it's the first error) */
8747     if (*error_handler == _Py_ERROR_UNKNOWN)
8748         *error_handler = _Py_GetErrorHandler(errors);
8749 
8750     switch (*error_handler) {
8751     case _Py_ERROR_STRICT:
8752         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8753         return -1;
8754 
8755     case _Py_ERROR_REPLACE:
8756         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8757             x = charmapencode_output('?', mapping, res, respos);
8758             if (x==enc_EXCEPTION) {
8759                 return -1;
8760             }
8761             else if (x==enc_FAILED) {
8762                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8763                 return -1;
8764             }
8765         }
8766         /* fall through */
8767     case _Py_ERROR_IGNORE:
8768         *inpos = collendpos;
8769         break;
8770 
8771     case _Py_ERROR_XMLCHARREFREPLACE:
8772         /* generate replacement (temporarily (mis)uses p) */
8773         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8774             char buffer[2+29+1+1];
8775             char *cp;
8776             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8777             for (cp = buffer; *cp; ++cp) {
8778                 x = charmapencode_output(*cp, mapping, res, respos);
8779                 if (x==enc_EXCEPTION)
8780                     return -1;
8781                 else if (x==enc_FAILED) {
8782                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8783                     return -1;
8784                 }
8785             }
8786         }
8787         *inpos = collendpos;
8788         break;
8789 
8790     default:
8791         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8792                                                       encoding, reason, unicode, exceptionObject,
8793                                                       collstartpos, collendpos, &newpos);
8794         if (repunicode == NULL)
8795             return -1;
8796         if (PyBytes_Check(repunicode)) {
8797             /* Directly copy bytes result to output. */
8798             Py_ssize_t outsize = PyBytes_Size(*res);
8799             Py_ssize_t requiredsize;
8800             repsize = PyBytes_Size(repunicode);
8801             requiredsize = *respos + repsize;
8802             if (requiredsize > outsize)
8803                 /* Make room for all additional bytes. */
8804                 if (charmapencode_resize(res, respos, requiredsize)) {
8805                     Py_DECREF(repunicode);
8806                     return -1;
8807                 }
8808             memcpy(PyBytes_AsString(*res) + *respos,
8809                    PyBytes_AsString(repunicode),  repsize);
8810             *respos += repsize;
8811             *inpos = newpos;
8812             Py_DECREF(repunicode);
8813             break;
8814         }
8815         /* generate replacement  */
8816         if (PyUnicode_READY(repunicode) == -1) {
8817             Py_DECREF(repunicode);
8818             return -1;
8819         }
8820         repsize = PyUnicode_GET_LENGTH(repunicode);
8821         data = PyUnicode_DATA(repunicode);
8822         kind = PyUnicode_KIND(repunicode);
8823         for (index = 0; index < repsize; index++) {
8824             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8825             x = charmapencode_output(repch, mapping, res, respos);
8826             if (x==enc_EXCEPTION) {
8827                 Py_DECREF(repunicode);
8828                 return -1;
8829             }
8830             else if (x==enc_FAILED) {
8831                 Py_DECREF(repunicode);
8832                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8833                 return -1;
8834             }
8835         }
8836         *inpos = newpos;
8837         Py_DECREF(repunicode);
8838     }
8839     return 0;
8840 }
8841 
8842 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8843 _PyUnicode_EncodeCharmap(PyObject *unicode,
8844                          PyObject *mapping,
8845                          const char *errors)
8846 {
8847     /* output object */
8848     PyObject *res = NULL;
8849     /* current input position */
8850     Py_ssize_t inpos = 0;
8851     Py_ssize_t size;
8852     /* current output position */
8853     Py_ssize_t respos = 0;
8854     PyObject *error_handler_obj = NULL;
8855     PyObject *exc = NULL;
8856     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8857     const void *data;
8858     int kind;
8859 
8860     if (PyUnicode_READY(unicode) == -1)
8861         return NULL;
8862     size = PyUnicode_GET_LENGTH(unicode);
8863     data = PyUnicode_DATA(unicode);
8864     kind = PyUnicode_KIND(unicode);
8865 
8866     /* Default to Latin-1 */
8867     if (mapping == NULL)
8868         return unicode_encode_ucs1(unicode, errors, 256);
8869 
8870     /* allocate enough for a simple encoding without
8871        replacements, if we need more, we'll resize */
8872     res = PyBytes_FromStringAndSize(NULL, size);
8873     if (res == NULL)
8874         goto onError;
8875     if (size == 0)
8876         return res;
8877 
8878     while (inpos<size) {
8879         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8880         /* try to encode it */
8881         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8882         if (x==enc_EXCEPTION) /* error */
8883             goto onError;
8884         if (x==enc_FAILED) { /* unencodable character */
8885             if (charmap_encoding_error(unicode, &inpos, mapping,
8886                                        &exc,
8887                                        &error_handler, &error_handler_obj, errors,
8888                                        &res, &respos)) {
8889                 goto onError;
8890             }
8891         }
8892         else
8893             /* done with this character => adjust input position */
8894             ++inpos;
8895     }
8896 
8897     /* Resize if we allocated to much */
8898     if (respos<PyBytes_GET_SIZE(res))
8899         if (_PyBytes_Resize(&res, respos) < 0)
8900             goto onError;
8901 
8902     Py_XDECREF(exc);
8903     Py_XDECREF(error_handler_obj);
8904     return res;
8905 
8906   onError:
8907     Py_XDECREF(res);
8908     Py_XDECREF(exc);
8909     Py_XDECREF(error_handler_obj);
8910     return NULL;
8911 }
8912 
8913 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8914 PyUnicode_AsCharmapString(PyObject *unicode,
8915                           PyObject *mapping)
8916 {
8917     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8918         PyErr_BadArgument();
8919         return NULL;
8920     }
8921     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8922 }
8923 
8924 /* create or adjust a UnicodeTranslateError */
8925 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8926 make_translate_exception(PyObject **exceptionObject,
8927                          PyObject *unicode,
8928                          Py_ssize_t startpos, Py_ssize_t endpos,
8929                          const char *reason)
8930 {
8931     if (*exceptionObject == NULL) {
8932         *exceptionObject = _PyUnicodeTranslateError_Create(
8933             unicode, startpos, endpos, reason);
8934     }
8935     else {
8936         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8937             goto onError;
8938         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8939             goto onError;
8940         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8941             goto onError;
8942         return;
8943       onError:
8944         Py_CLEAR(*exceptionObject);
8945     }
8946 }
8947 
8948 /* error handling callback helper:
8949    build arguments, call the callback and check the arguments,
8950    put the result into newpos and return the replacement string, which
8951    has to be freed by the caller */
8952 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8953 unicode_translate_call_errorhandler(const char *errors,
8954                                     PyObject **errorHandler,
8955                                     const char *reason,
8956                                     PyObject *unicode, PyObject **exceptionObject,
8957                                     Py_ssize_t startpos, Py_ssize_t endpos,
8958                                     Py_ssize_t *newpos)
8959 {
8960     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8961 
8962     Py_ssize_t i_newpos;
8963     PyObject *restuple;
8964     PyObject *resunicode;
8965 
8966     if (*errorHandler == NULL) {
8967         *errorHandler = PyCodec_LookupError(errors);
8968         if (*errorHandler == NULL)
8969             return NULL;
8970     }
8971 
8972     make_translate_exception(exceptionObject,
8973                              unicode, startpos, endpos, reason);
8974     if (*exceptionObject == NULL)
8975         return NULL;
8976 
8977     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8978     if (restuple == NULL)
8979         return NULL;
8980     if (!PyTuple_Check(restuple)) {
8981         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8982         Py_DECREF(restuple);
8983         return NULL;
8984     }
8985     if (!PyArg_ParseTuple(restuple, argparse,
8986                           &resunicode, &i_newpos)) {
8987         Py_DECREF(restuple);
8988         return NULL;
8989     }
8990     if (i_newpos<0)
8991         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8992     else
8993         *newpos = i_newpos;
8994     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8995         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8996         Py_DECREF(restuple);
8997         return NULL;
8998     }
8999     Py_INCREF(resunicode);
9000     Py_DECREF(restuple);
9001     return resunicode;
9002 }
9003 
9004 /* Lookup the character ch in the mapping and put the result in result,
9005    which must be decrefed by the caller.
9006    Return 0 on success, -1 on error */
9007 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)9008 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9009 {
9010     PyObject *w = PyLong_FromLong((long)c);
9011     PyObject *x;
9012 
9013     if (w == NULL)
9014         return -1;
9015     x = PyObject_GetItem(mapping, w);
9016     Py_DECREF(w);
9017     if (x == NULL) {
9018         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9019             /* No mapping found means: use 1:1 mapping. */
9020             PyErr_Clear();
9021             *result = NULL;
9022             return 0;
9023         } else
9024             return -1;
9025     }
9026     else if (x == Py_None) {
9027         *result = x;
9028         return 0;
9029     }
9030     else if (PyLong_Check(x)) {
9031         long value = PyLong_AS_LONG(x);
9032         if (value < 0 || value > MAX_UNICODE) {
9033             PyErr_Format(PyExc_ValueError,
9034                          "character mapping must be in range(0x%x)",
9035                          MAX_UNICODE+1);
9036             Py_DECREF(x);
9037             return -1;
9038         }
9039         *result = x;
9040         return 0;
9041     }
9042     else if (PyUnicode_Check(x)) {
9043         *result = x;
9044         return 0;
9045     }
9046     else {
9047         /* wrong return value */
9048         PyErr_SetString(PyExc_TypeError,
9049                         "character mapping must return integer, None or str");
9050         Py_DECREF(x);
9051         return -1;
9052     }
9053 }
9054 
9055 /* lookup the character, write the result into the writer.
9056    Return 1 if the result was written into the writer, return 0 if the mapping
9057    was undefined, raise an exception return -1 on error. */
9058 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9059 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9060                         _PyUnicodeWriter *writer)
9061 {
9062     PyObject *item;
9063 
9064     if (charmaptranslate_lookup(ch, mapping, &item))
9065         return -1;
9066 
9067     if (item == NULL) {
9068         /* not found => default to 1:1 mapping */
9069         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9070             return -1;
9071         }
9072         return 1;
9073     }
9074 
9075     if (item == Py_None) {
9076         Py_DECREF(item);
9077         return 0;
9078     }
9079 
9080     if (PyLong_Check(item)) {
9081         long ch = (Py_UCS4)PyLong_AS_LONG(item);
9082         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9083            used it */
9084         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9085             Py_DECREF(item);
9086             return -1;
9087         }
9088         Py_DECREF(item);
9089         return 1;
9090     }
9091 
9092     if (!PyUnicode_Check(item)) {
9093         Py_DECREF(item);
9094         return -1;
9095     }
9096 
9097     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9098         Py_DECREF(item);
9099         return -1;
9100     }
9101 
9102     Py_DECREF(item);
9103     return 1;
9104 }
9105 
9106 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9107 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9108                               Py_UCS1 *translate)
9109 {
9110     PyObject *item = NULL;
9111     int ret = 0;
9112 
9113     if (charmaptranslate_lookup(ch, mapping, &item)) {
9114         return -1;
9115     }
9116 
9117     if (item == Py_None) {
9118         /* deletion */
9119         translate[ch] = 0xfe;
9120     }
9121     else if (item == NULL) {
9122         /* not found => default to 1:1 mapping */
9123         translate[ch] = ch;
9124         return 1;
9125     }
9126     else if (PyLong_Check(item)) {
9127         long replace = PyLong_AS_LONG(item);
9128         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9129            used it */
9130         if (127 < replace) {
9131             /* invalid character or character outside ASCII:
9132                skip the fast translate */
9133             goto exit;
9134         }
9135         translate[ch] = (Py_UCS1)replace;
9136     }
9137     else if (PyUnicode_Check(item)) {
9138         Py_UCS4 replace;
9139 
9140         if (PyUnicode_READY(item) == -1) {
9141             Py_DECREF(item);
9142             return -1;
9143         }
9144         if (PyUnicode_GET_LENGTH(item) != 1)
9145             goto exit;
9146 
9147         replace = PyUnicode_READ_CHAR(item, 0);
9148         if (replace > 127)
9149             goto exit;
9150         translate[ch] = (Py_UCS1)replace;
9151     }
9152     else {
9153         /* not None, NULL, long or unicode */
9154         goto exit;
9155     }
9156     ret = 1;
9157 
9158   exit:
9159     Py_DECREF(item);
9160     return ret;
9161 }
9162 
9163 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9164    was translated into writer, return 0 if the input string was partially
9165    translated into writer, raise an exception and return -1 on error. */
9166 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9167 unicode_fast_translate(PyObject *input, PyObject *mapping,
9168                        _PyUnicodeWriter *writer, int ignore,
9169                        Py_ssize_t *input_pos)
9170 {
9171     Py_UCS1 ascii_table[128], ch, ch2;
9172     Py_ssize_t len;
9173     const Py_UCS1 *in, *end;
9174     Py_UCS1 *out;
9175     int res = 0;
9176 
9177     len = PyUnicode_GET_LENGTH(input);
9178 
9179     memset(ascii_table, 0xff, 128);
9180 
9181     in = PyUnicode_1BYTE_DATA(input);
9182     end = in + len;
9183 
9184     assert(PyUnicode_IS_ASCII(writer->buffer));
9185     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9186     out = PyUnicode_1BYTE_DATA(writer->buffer);
9187 
9188     for (; in < end; in++) {
9189         ch = *in;
9190         ch2 = ascii_table[ch];
9191         if (ch2 == 0xff) {
9192             int translate = unicode_fast_translate_lookup(mapping, ch,
9193                                                           ascii_table);
9194             if (translate < 0)
9195                 return -1;
9196             if (translate == 0)
9197                 goto exit;
9198             ch2 = ascii_table[ch];
9199         }
9200         if (ch2 == 0xfe) {
9201             if (ignore)
9202                 continue;
9203             goto exit;
9204         }
9205         assert(ch2 < 128);
9206         *out = ch2;
9207         out++;
9208     }
9209     res = 1;
9210 
9211 exit:
9212     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9213     *input_pos = in - PyUnicode_1BYTE_DATA(input);
9214     return res;
9215 }
9216 
9217 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9218 _PyUnicode_TranslateCharmap(PyObject *input,
9219                             PyObject *mapping,
9220                             const char *errors)
9221 {
9222     /* input object */
9223     const void *data;
9224     Py_ssize_t size, i;
9225     int kind;
9226     /* output buffer */
9227     _PyUnicodeWriter writer;
9228     /* error handler */
9229     const char *reason = "character maps to <undefined>";
9230     PyObject *errorHandler = NULL;
9231     PyObject *exc = NULL;
9232     int ignore;
9233     int res;
9234 
9235     if (mapping == NULL) {
9236         PyErr_BadArgument();
9237         return NULL;
9238     }
9239 
9240     if (PyUnicode_READY(input) == -1)
9241         return NULL;
9242     data = PyUnicode_DATA(input);
9243     kind = PyUnicode_KIND(input);
9244     size = PyUnicode_GET_LENGTH(input);
9245 
9246     if (size == 0)
9247         return PyUnicode_FromObject(input);
9248 
9249     /* allocate enough for a simple 1:1 translation without
9250        replacements, if we need more, we'll resize */
9251     _PyUnicodeWriter_Init(&writer);
9252     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9253         goto onError;
9254 
9255     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9256 
9257     if (PyUnicode_READY(input) == -1)
9258         return NULL;
9259     if (PyUnicode_IS_ASCII(input)) {
9260         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9261         if (res < 0) {
9262             _PyUnicodeWriter_Dealloc(&writer);
9263             return NULL;
9264         }
9265         if (res == 1)
9266             return _PyUnicodeWriter_Finish(&writer);
9267     }
9268     else {
9269         i = 0;
9270     }
9271 
9272     while (i<size) {
9273         /* try to encode it */
9274         int translate;
9275         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9276         Py_ssize_t newpos;
9277         /* startpos for collecting untranslatable chars */
9278         Py_ssize_t collstart;
9279         Py_ssize_t collend;
9280         Py_UCS4 ch;
9281 
9282         ch = PyUnicode_READ(kind, data, i);
9283         translate = charmaptranslate_output(ch, mapping, &writer);
9284         if (translate < 0)
9285             goto onError;
9286 
9287         if (translate != 0) {
9288             /* it worked => adjust input pointer */
9289             ++i;
9290             continue;
9291         }
9292 
9293         /* untranslatable character */
9294         collstart = i;
9295         collend = i+1;
9296 
9297         /* find all untranslatable characters */
9298         while (collend < size) {
9299             PyObject *x;
9300             ch = PyUnicode_READ(kind, data, collend);
9301             if (charmaptranslate_lookup(ch, mapping, &x))
9302                 goto onError;
9303             Py_XDECREF(x);
9304             if (x != Py_None)
9305                 break;
9306             ++collend;
9307         }
9308 
9309         if (ignore) {
9310             i = collend;
9311         }
9312         else {
9313             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9314                                                              reason, input, &exc,
9315                                                              collstart, collend, &newpos);
9316             if (repunicode == NULL)
9317                 goto onError;
9318             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9319                 Py_DECREF(repunicode);
9320                 goto onError;
9321             }
9322             Py_DECREF(repunicode);
9323             i = newpos;
9324         }
9325     }
9326     Py_XDECREF(exc);
9327     Py_XDECREF(errorHandler);
9328     return _PyUnicodeWriter_Finish(&writer);
9329 
9330   onError:
9331     _PyUnicodeWriter_Dealloc(&writer);
9332     Py_XDECREF(exc);
9333     Py_XDECREF(errorHandler);
9334     return NULL;
9335 }
9336 
9337 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9338 PyUnicode_Translate(PyObject *str,
9339                     PyObject *mapping,
9340                     const char *errors)
9341 {
9342     if (ensure_unicode(str) < 0)
9343         return NULL;
9344     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9345 }
9346 
9347 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9348 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9349 {
9350     if (!PyUnicode_Check(unicode)) {
9351         PyErr_BadInternalCall();
9352         return NULL;
9353     }
9354     if (PyUnicode_READY(unicode) == -1)
9355         return NULL;
9356     if (PyUnicode_IS_ASCII(unicode)) {
9357         /* If the string is already ASCII, just return the same string */
9358         Py_INCREF(unicode);
9359         return unicode;
9360     }
9361 
9362     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9363     PyObject *result = PyUnicode_New(len, 127);
9364     if (result == NULL) {
9365         return NULL;
9366     }
9367 
9368     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9369     int kind = PyUnicode_KIND(unicode);
9370     const void *data = PyUnicode_DATA(unicode);
9371     Py_ssize_t i;
9372     for (i = 0; i < len; ++i) {
9373         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9374         if (ch < 127) {
9375             out[i] = ch;
9376         }
9377         else if (Py_UNICODE_ISSPACE(ch)) {
9378             out[i] = ' ';
9379         }
9380         else {
9381             int decimal = Py_UNICODE_TODECIMAL(ch);
9382             if (decimal < 0) {
9383                 out[i] = '?';
9384                 out[i+1] = '\0';
9385                 _PyUnicode_LENGTH(result) = i + 1;
9386                 break;
9387             }
9388             out[i] = '0' + decimal;
9389         }
9390     }
9391 
9392     assert(_PyUnicode_CheckConsistency(result, 1));
9393     return result;
9394 }
9395 
9396 /* --- Helpers ------------------------------------------------------------ */
9397 
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len; a negative end or start is counted from len and
   clamped at 0 if it is still negative.  start and end must be modifiable
   lvalues, and every argument may be evaluated more than once, so pass
   expressions without side effects.  The body is wrapped in
   do { } while (0) so the expansion is one statement that is safe inside
   if/else: invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9412 
9413 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9414 any_find_slice(PyObject* s1, PyObject* s2,
9415                Py_ssize_t start,
9416                Py_ssize_t end,
9417                int direction)
9418 {
9419     int kind1, kind2;
9420     const void *buf1, *buf2;
9421     Py_ssize_t len1, len2, result;
9422 
9423     kind1 = PyUnicode_KIND(s1);
9424     kind2 = PyUnicode_KIND(s2);
9425     if (kind1 < kind2)
9426         return -1;
9427 
9428     len1 = PyUnicode_GET_LENGTH(s1);
9429     len2 = PyUnicode_GET_LENGTH(s2);
9430     ADJUST_INDICES(start, end, len1);
9431     if (end - start < len2)
9432         return -1;
9433 
9434     buf1 = PyUnicode_DATA(s1);
9435     buf2 = PyUnicode_DATA(s2);
9436     if (len2 == 1) {
9437         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9438         result = findchar((const char *)buf1 + kind1*start,
9439                           kind1, end - start, ch, direction);
9440         if (result == -1)
9441             return -1;
9442         else
9443             return start + result;
9444     }
9445 
9446     if (kind2 != kind1) {
9447         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9448         if (!buf2)
9449             return -2;
9450     }
9451 
9452     if (direction > 0) {
9453         switch (kind1) {
9454         case PyUnicode_1BYTE_KIND:
9455             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9456                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9457             else
9458                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9459             break;
9460         case PyUnicode_2BYTE_KIND:
9461             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9462             break;
9463         case PyUnicode_4BYTE_KIND:
9464             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9465             break;
9466         default:
9467             Py_UNREACHABLE();
9468         }
9469     }
9470     else {
9471         switch (kind1) {
9472         case PyUnicode_1BYTE_KIND:
9473             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9474                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9475             else
9476                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9477             break;
9478         case PyUnicode_2BYTE_KIND:
9479             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9480             break;
9481         case PyUnicode_4BYTE_KIND:
9482             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9483             break;
9484         default:
9485             Py_UNREACHABLE();
9486         }
9487     }
9488 
9489     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9490     if (kind2 != kind1)
9491         PyMem_Free((void *)buf2);
9492 
9493     return result;
9494 }
9495 
9496 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9497 #include "stringlib/localeutil.h"
9498 
9499 /**
9500  * InsertThousandsGrouping:
9501  * @writer: Unicode writer.
9502  * @n_buffer: Number of characters in @buffer.
9503  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9504  * @d_pos: Start of digits string.
9505  * @n_digits: The number of digits in the string, in which we want
9506  *            to put the grouping chars.
9507  * @min_width: The minimum width of the digits in the output string.
9508  *             Output will be zero-padded on the left to fill.
9509  * @grouping: see definition in localeconv().
9510  * @thousands_sep: see definition in localeconv().
9511  *
9512  * There are 2 modes: counting and filling. If @writer is NULL,
9513  *  we are in counting mode, else filling mode.
9514  * If counting, the required buffer size is returned.
9515  * If filling, we know the buffer will be large enough, so we don't
9516  *  need to pass in the buffer size.
9517  * Inserts thousand grouping characters (as defined by grouping and
9518  *  thousands_sep) into @writer.
9519  *
9520  * Return value: -1 on error, number of characters otherwise.
9521  **/
9522 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9523 _PyUnicode_InsertThousandsGrouping(
9524     _PyUnicodeWriter *writer,
9525     Py_ssize_t n_buffer,
9526     PyObject *digits,
9527     Py_ssize_t d_pos,
9528     Py_ssize_t n_digits,
9529     Py_ssize_t min_width,
9530     const char *grouping,
9531     PyObject *thousands_sep,
9532     Py_UCS4 *maxchar)
9533 {
9534     min_width = Py_MAX(0, min_width);
9535     if (writer) {
9536         assert(digits != NULL);
9537         assert(maxchar == NULL);
9538     }
9539     else {
9540         assert(digits == NULL);
9541         assert(maxchar != NULL);
9542     }
9543     assert(0 <= d_pos);
9544     assert(0 <= n_digits);
9545     assert(grouping != NULL);
9546 
9547     if (digits != NULL) {
9548         if (PyUnicode_READY(digits) == -1) {
9549             return -1;
9550         }
9551     }
9552     if (PyUnicode_READY(thousands_sep) == -1) {
9553         return -1;
9554     }
9555 
9556     Py_ssize_t count = 0;
9557     Py_ssize_t n_zeros;
9558     int loop_broken = 0;
9559     int use_separator = 0; /* First time through, don't append the
9560                               separator. They only go between
9561                               groups. */
9562     Py_ssize_t buffer_pos;
9563     Py_ssize_t digits_pos;
9564     Py_ssize_t len;
9565     Py_ssize_t n_chars;
9566     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9567                                         be looked at */
9568     /* A generator that returns all of the grouping widths, until it
9569        returns 0. */
9570     GroupGenerator groupgen;
9571     GroupGenerator_init(&groupgen, grouping);
9572     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9573 
9574     /* if digits are not grouped, thousands separator
9575        should be an empty string */
9576     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9577 
9578     digits_pos = d_pos + n_digits;
9579     if (writer) {
9580         buffer_pos = writer->pos + n_buffer;
9581         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9582         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9583     }
9584     else {
9585         buffer_pos = n_buffer;
9586     }
9587 
9588     if (!writer) {
9589         *maxchar = 127;
9590     }
9591 
9592     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9593         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9594         n_zeros = Py_MAX(0, len - remaining);
9595         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9596 
9597         /* Use n_zero zero's and n_chars chars */
9598 
9599         /* Count only, don't do anything. */
9600         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9601 
9602         /* Copy into the writer. */
9603         InsertThousandsGrouping_fill(writer, &buffer_pos,
9604                                      digits, &digits_pos,
9605                                      n_chars, n_zeros,
9606                                      use_separator ? thousands_sep : NULL,
9607                                      thousands_sep_len, maxchar);
9608 
9609         /* Use a separator next time. */
9610         use_separator = 1;
9611 
9612         remaining -= n_chars;
9613         min_width -= len;
9614 
9615         if (remaining <= 0 && min_width <= 0) {
9616             loop_broken = 1;
9617             break;
9618         }
9619         min_width -= thousands_sep_len;
9620     }
9621     if (!loop_broken) {
9622         /* We left the loop without using a break statement. */
9623 
9624         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9625         n_zeros = Py_MAX(0, len - remaining);
9626         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9627 
9628         /* Use n_zero zero's and n_chars chars */
9629         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9630 
9631         /* Copy into the writer. */
9632         InsertThousandsGrouping_fill(writer, &buffer_pos,
9633                                      digits, &digits_pos,
9634                                      n_chars, n_zeros,
9635                                      use_separator ? thousands_sep : NULL,
9636                                      thousands_sep_len, maxchar);
9637     }
9638     return count;
9639 }
9640 
9641 
9642 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9643 PyUnicode_Count(PyObject *str,
9644                 PyObject *substr,
9645                 Py_ssize_t start,
9646                 Py_ssize_t end)
9647 {
9648     Py_ssize_t result;
9649     int kind1, kind2;
9650     const void *buf1 = NULL, *buf2 = NULL;
9651     Py_ssize_t len1, len2;
9652 
9653     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9654         return -1;
9655 
9656     kind1 = PyUnicode_KIND(str);
9657     kind2 = PyUnicode_KIND(substr);
9658     if (kind1 < kind2)
9659         return 0;
9660 
9661     len1 = PyUnicode_GET_LENGTH(str);
9662     len2 = PyUnicode_GET_LENGTH(substr);
9663     ADJUST_INDICES(start, end, len1);
9664     if (end - start < len2)
9665         return 0;
9666 
9667     buf1 = PyUnicode_DATA(str);
9668     buf2 = PyUnicode_DATA(substr);
9669     if (kind2 != kind1) {
9670         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9671         if (!buf2)
9672             goto onError;
9673     }
9674 
9675     switch (kind1) {
9676     case PyUnicode_1BYTE_KIND:
9677         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9678             result = asciilib_count(
9679                 ((const Py_UCS1*)buf1) + start, end - start,
9680                 buf2, len2, PY_SSIZE_T_MAX
9681                 );
9682         else
9683             result = ucs1lib_count(
9684                 ((const Py_UCS1*)buf1) + start, end - start,
9685                 buf2, len2, PY_SSIZE_T_MAX
9686                 );
9687         break;
9688     case PyUnicode_2BYTE_KIND:
9689         result = ucs2lib_count(
9690             ((const Py_UCS2*)buf1) + start, end - start,
9691             buf2, len2, PY_SSIZE_T_MAX
9692             );
9693         break;
9694     case PyUnicode_4BYTE_KIND:
9695         result = ucs4lib_count(
9696             ((const Py_UCS4*)buf1) + start, end - start,
9697             buf2, len2, PY_SSIZE_T_MAX
9698             );
9699         break;
9700     default:
9701         Py_UNREACHABLE();
9702     }
9703 
9704     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9705     if (kind2 != kind1)
9706         PyMem_Free((void *)buf2);
9707 
9708     return result;
9709   onError:
9710     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9711     if (kind2 != kind1)
9712         PyMem_Free((void *)buf2);
9713     return -1;
9714 }
9715 
9716 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9717 PyUnicode_Find(PyObject *str,
9718                PyObject *substr,
9719                Py_ssize_t start,
9720                Py_ssize_t end,
9721                int direction)
9722 {
9723     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9724         return -2;
9725 
9726     return any_find_slice(str, substr, start, end, direction);
9727 }
9728 
9729 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9730 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9731                    Py_ssize_t start, Py_ssize_t end,
9732                    int direction)
9733 {
9734     int kind;
9735     Py_ssize_t len, result;
9736     if (PyUnicode_READY(str) == -1)
9737         return -2;
9738     len = PyUnicode_GET_LENGTH(str);
9739     ADJUST_INDICES(start, end, len);
9740     if (end - start < 1)
9741         return -1;
9742     kind = PyUnicode_KIND(str);
9743     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9744                       kind, end-start, ch, direction);
9745     if (result == -1)
9746         return -1;
9747     else
9748         return start + result;
9749 }
9750 
9751 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9752 tailmatch(PyObject *self,
9753           PyObject *substring,
9754           Py_ssize_t start,
9755           Py_ssize_t end,
9756           int direction)
9757 {
9758     int kind_self;
9759     int kind_sub;
9760     const void *data_self;
9761     const void *data_sub;
9762     Py_ssize_t offset;
9763     Py_ssize_t i;
9764     Py_ssize_t end_sub;
9765 
9766     if (PyUnicode_READY(self) == -1 ||
9767         PyUnicode_READY(substring) == -1)
9768         return -1;
9769 
9770     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9771     end -= PyUnicode_GET_LENGTH(substring);
9772     if (end < start)
9773         return 0;
9774 
9775     if (PyUnicode_GET_LENGTH(substring) == 0)
9776         return 1;
9777 
9778     kind_self = PyUnicode_KIND(self);
9779     data_self = PyUnicode_DATA(self);
9780     kind_sub = PyUnicode_KIND(substring);
9781     data_sub = PyUnicode_DATA(substring);
9782     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9783 
9784     if (direction > 0)
9785         offset = end;
9786     else
9787         offset = start;
9788 
9789     if (PyUnicode_READ(kind_self, data_self, offset) ==
9790         PyUnicode_READ(kind_sub, data_sub, 0) &&
9791         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9792         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9793         /* If both are of the same kind, memcmp is sufficient */
9794         if (kind_self == kind_sub) {
9795             return ! memcmp((char *)data_self +
9796                                 (offset * PyUnicode_KIND(substring)),
9797                             data_sub,
9798                             PyUnicode_GET_LENGTH(substring) *
9799                                 PyUnicode_KIND(substring));
9800         }
9801         /* otherwise we have to compare each character by first accessing it */
9802         else {
9803             /* We do not need to compare 0 and len(substring)-1 because
9804                the if statement above ensured already that they are equal
9805                when we end up here. */
9806             for (i = 1; i < end_sub; ++i) {
9807                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9808                     PyUnicode_READ(kind_sub, data_sub, i))
9809                     return 0;
9810             }
9811             return 1;
9812         }
9813     }
9814 
9815     return 0;
9816 }
9817 
9818 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9819 PyUnicode_Tailmatch(PyObject *str,
9820                     PyObject *substr,
9821                     Py_ssize_t start,
9822                     Py_ssize_t end,
9823                     int direction)
9824 {
9825     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9826         return -1;
9827 
9828     return tailmatch(str, substr, start, end, direction);
9829 }
9830 
9831 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9832 ascii_upper_or_lower(PyObject *self, int lower)
9833 {
9834     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9835     const char *data = PyUnicode_DATA(self);
9836     char *resdata;
9837     PyObject *res;
9838 
9839     res = PyUnicode_New(len, 127);
9840     if (res == NULL)
9841         return NULL;
9842     resdata = PyUnicode_DATA(res);
9843     if (lower)
9844         _Py_bytes_lower(resdata, data, len);
9845     else
9846         _Py_bytes_upper(resdata, data, len);
9847     return res;
9848 }
9849 
9850 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9851 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9852 {
9853     Py_ssize_t j;
9854     int final_sigma;
9855     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9856     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9857 
9858      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9859 
9860     where ! is a negation and \p{xxx} is a character with property xxx.
9861     */
9862     for (j = i - 1; j >= 0; j--) {
9863         c = PyUnicode_READ(kind, data, j);
9864         if (!_PyUnicode_IsCaseIgnorable(c))
9865             break;
9866     }
9867     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9868     if (final_sigma) {
9869         for (j = i + 1; j < length; j++) {
9870             c = PyUnicode_READ(kind, data, j);
9871             if (!_PyUnicode_IsCaseIgnorable(c))
9872                 break;
9873         }
9874         final_sigma = j == length || !_PyUnicode_IsCased(c);
9875     }
9876     return (final_sigma) ? 0x3C2 : 0x3C3;
9877 }
9878 
9879 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9880 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9881            Py_UCS4 c, Py_UCS4 *mapped)
9882 {
9883     /* Obscure special case. */
9884     if (c == 0x3A3) {
9885         mapped[0] = handle_capital_sigma(kind, data, length, i);
9886         return 1;
9887     }
9888     return _PyUnicode_ToLowerFull(c, mapped);
9889 }
9890 
9891 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9892 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9893 {
9894     Py_ssize_t i, k = 0;
9895     int n_res, j;
9896     Py_UCS4 c, mapped[3];
9897 
9898     c = PyUnicode_READ(kind, data, 0);
9899     n_res = _PyUnicode_ToTitleFull(c, mapped);
9900     for (j = 0; j < n_res; j++) {
9901         *maxchar = Py_MAX(*maxchar, mapped[j]);
9902         res[k++] = mapped[j];
9903     }
9904     for (i = 1; i < length; i++) {
9905         c = PyUnicode_READ(kind, data, i);
9906         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9907         for (j = 0; j < n_res; j++) {
9908             *maxchar = Py_MAX(*maxchar, mapped[j]);
9909             res[k++] = mapped[j];
9910         }
9911     }
9912     return k;
9913 }
9914 
9915 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9916 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9917     Py_ssize_t i, k = 0;
9918 
9919     for (i = 0; i < length; i++) {
9920         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921         int n_res, j;
9922         if (Py_UNICODE_ISUPPER(c)) {
9923             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924         }
9925         else if (Py_UNICODE_ISLOWER(c)) {
9926             n_res = _PyUnicode_ToUpperFull(c, mapped);
9927         }
9928         else {
9929             n_res = 1;
9930             mapped[0] = c;
9931         }
9932         for (j = 0; j < n_res; j++) {
9933             *maxchar = Py_MAX(*maxchar, mapped[j]);
9934             res[k++] = mapped[j];
9935         }
9936     }
9937     return k;
9938 }
9939 
9940 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9941 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9942                   Py_UCS4 *maxchar, int lower)
9943 {
9944     Py_ssize_t i, k = 0;
9945 
9946     for (i = 0; i < length; i++) {
9947         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9948         int n_res, j;
9949         if (lower)
9950             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9951         else
9952             n_res = _PyUnicode_ToUpperFull(c, mapped);
9953         for (j = 0; j < n_res; j++) {
9954             *maxchar = Py_MAX(*maxchar, mapped[j]);
9955             res[k++] = mapped[j];
9956         }
9957     }
9958     return k;
9959 }
9960 
9961 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9962 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9963 {
9964     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9965 }
9966 
9967 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9968 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9969 {
9970     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9971 }
9972 
9973 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9974 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9975 {
9976     Py_ssize_t i, k = 0;
9977 
9978     for (i = 0; i < length; i++) {
9979         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9980         Py_UCS4 mapped[3];
9981         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9982         for (j = 0; j < n_res; j++) {
9983             *maxchar = Py_MAX(*maxchar, mapped[j]);
9984             res[k++] = mapped[j];
9985         }
9986     }
9987     return k;
9988 }
9989 
9990 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9991 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9992 {
9993     Py_ssize_t i, k = 0;
9994     int previous_is_cased;
9995 
9996     previous_is_cased = 0;
9997     for (i = 0; i < length; i++) {
9998         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9999         Py_UCS4 mapped[3];
10000         int n_res, j;
10001 
10002         if (previous_is_cased)
10003             n_res = lower_ucs4(kind, data, length, i, c, mapped);
10004         else
10005             n_res = _PyUnicode_ToTitleFull(c, mapped);
10006 
10007         for (j = 0; j < n_res; j++) {
10008             *maxchar = Py_MAX(*maxchar, mapped[j]);
10009             res[k++] = mapped[j];
10010         }
10011 
10012         previous_is_cased = _PyUnicode_IsCased(c);
10013     }
10014     return k;
10015 }
10016 
10017 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10018 case_operation(PyObject *self,
10019                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10020 {
10021     PyObject *res = NULL;
10022     Py_ssize_t length, newlength = 0;
10023     int kind, outkind;
10024     const void *data;
10025     void *outdata;
10026     Py_UCS4 maxchar = 0, *tmp, *tmpend;
10027 
10028     assert(PyUnicode_IS_READY(self));
10029 
10030     kind = PyUnicode_KIND(self);
10031     data = PyUnicode_DATA(self);
10032     length = PyUnicode_GET_LENGTH(self);
10033     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10034         PyErr_SetString(PyExc_OverflowError, "string is too long");
10035         return NULL;
10036     }
10037     tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10038     if (tmp == NULL)
10039         return PyErr_NoMemory();
10040     newlength = perform(kind, data, length, tmp, &maxchar);
10041     res = PyUnicode_New(newlength, maxchar);
10042     if (res == NULL)
10043         goto leave;
10044     tmpend = tmp + newlength;
10045     outdata = PyUnicode_DATA(res);
10046     outkind = PyUnicode_KIND(res);
10047     switch (outkind) {
10048     case PyUnicode_1BYTE_KIND:
10049         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10050         break;
10051     case PyUnicode_2BYTE_KIND:
10052         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10053         break;
10054     case PyUnicode_4BYTE_KIND:
10055         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10056         break;
10057     default:
10058         Py_UNREACHABLE();
10059     }
10060   leave:
10061     PyMem_Free(tmp);
10062     return res;
10063 }
10064 
10065 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10066 PyUnicode_Join(PyObject *separator, PyObject *seq)
10067 {
10068     PyObject *res;
10069     PyObject *fseq;
10070     Py_ssize_t seqlen;
10071     PyObject **items;
10072 
10073     fseq = PySequence_Fast(seq, "can only join an iterable");
10074     if (fseq == NULL) {
10075         return NULL;
10076     }
10077 
10078     /* NOTE: the following code can't call back into Python code,
10079      * so we are sure that fseq won't be mutated.
10080      */
10081 
10082     items = PySequence_Fast_ITEMS(fseq);
10083     seqlen = PySequence_Fast_GET_SIZE(fseq);
10084     res = _PyUnicode_JoinArray(separator, items, seqlen);
10085     Py_DECREF(fseq);
10086     return res;
10087 }
10088 
10089 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10090 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10091 {
10092     PyObject *res = NULL; /* the result */
10093     PyObject *sep = NULL;
10094     Py_ssize_t seplen;
10095     PyObject *item;
10096     Py_ssize_t sz, i, res_offset;
10097     Py_UCS4 maxchar;
10098     Py_UCS4 item_maxchar;
10099     int use_memcpy;
10100     unsigned char *res_data = NULL, *sep_data = NULL;
10101     PyObject *last_obj;
10102     unsigned int kind = 0;
10103 
10104     /* If empty sequence, return u"". */
10105     if (seqlen == 0) {
10106         _Py_RETURN_UNICODE_EMPTY();
10107     }
10108 
10109     /* If singleton sequence with an exact Unicode, return that. */
10110     last_obj = NULL;
10111     if (seqlen == 1) {
10112         if (PyUnicode_CheckExact(items[0])) {
10113             res = items[0];
10114             Py_INCREF(res);
10115             return res;
10116         }
10117         seplen = 0;
10118         maxchar = 0;
10119     }
10120     else {
10121         /* Set up sep and seplen */
10122         if (separator == NULL) {
10123             /* fall back to a blank space separator */
10124             sep = PyUnicode_FromOrdinal(' ');
10125             if (!sep)
10126                 goto onError;
10127             seplen = 1;
10128             maxchar = 32;
10129         }
10130         else {
10131             if (!PyUnicode_Check(separator)) {
10132                 PyErr_Format(PyExc_TypeError,
10133                              "separator: expected str instance,"
10134                              " %.80s found",
10135                              Py_TYPE(separator)->tp_name);
10136                 goto onError;
10137             }
10138             if (PyUnicode_READY(separator))
10139                 goto onError;
10140             sep = separator;
10141             seplen = PyUnicode_GET_LENGTH(separator);
10142             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10143             /* inc refcount to keep this code path symmetric with the
10144                above case of a blank separator */
10145             Py_INCREF(sep);
10146         }
10147         last_obj = sep;
10148     }
10149 
10150     /* There are at least two things to join, or else we have a subclass
10151      * of str in the sequence.
10152      * Do a pre-pass to figure out the total amount of space we'll
10153      * need (sz), and see whether all argument are strings.
10154      */
10155     sz = 0;
10156 #ifdef Py_DEBUG
10157     use_memcpy = 0;
10158 #else
10159     use_memcpy = 1;
10160 #endif
10161     for (i = 0; i < seqlen; i++) {
10162         size_t add_sz;
10163         item = items[i];
10164         if (!PyUnicode_Check(item)) {
10165             PyErr_Format(PyExc_TypeError,
10166                          "sequence item %zd: expected str instance,"
10167                          " %.80s found",
10168                          i, Py_TYPE(item)->tp_name);
10169             goto onError;
10170         }
10171         if (PyUnicode_READY(item) == -1)
10172             goto onError;
10173         add_sz = PyUnicode_GET_LENGTH(item);
10174         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10175         maxchar = Py_MAX(maxchar, item_maxchar);
10176         if (i != 0) {
10177             add_sz += seplen;
10178         }
10179         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10180             PyErr_SetString(PyExc_OverflowError,
10181                             "join() result is too long for a Python string");
10182             goto onError;
10183         }
10184         sz += add_sz;
10185         if (use_memcpy && last_obj != NULL) {
10186             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10187                 use_memcpy = 0;
10188         }
10189         last_obj = item;
10190     }
10191 
10192     res = PyUnicode_New(sz, maxchar);
10193     if (res == NULL)
10194         goto onError;
10195 
10196     /* Catenate everything. */
10197 #ifdef Py_DEBUG
10198     use_memcpy = 0;
10199 #else
10200     if (use_memcpy) {
10201         res_data = PyUnicode_1BYTE_DATA(res);
10202         kind = PyUnicode_KIND(res);
10203         if (seplen != 0)
10204             sep_data = PyUnicode_1BYTE_DATA(sep);
10205     }
10206 #endif
10207     if (use_memcpy) {
10208         for (i = 0; i < seqlen; ++i) {
10209             Py_ssize_t itemlen;
10210             item = items[i];
10211 
10212             /* Copy item, and maybe the separator. */
10213             if (i && seplen != 0) {
10214                 memcpy(res_data,
10215                           sep_data,
10216                           kind * seplen);
10217                 res_data += kind * seplen;
10218             }
10219 
10220             itemlen = PyUnicode_GET_LENGTH(item);
10221             if (itemlen != 0) {
10222                 memcpy(res_data,
10223                           PyUnicode_DATA(item),
10224                           kind * itemlen);
10225                 res_data += kind * itemlen;
10226             }
10227         }
10228         assert(res_data == PyUnicode_1BYTE_DATA(res)
10229                            + kind * PyUnicode_GET_LENGTH(res));
10230     }
10231     else {
10232         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10233             Py_ssize_t itemlen;
10234             item = items[i];
10235 
10236             /* Copy item, and maybe the separator. */
10237             if (i && seplen != 0) {
10238                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10239                 res_offset += seplen;
10240             }
10241 
10242             itemlen = PyUnicode_GET_LENGTH(item);
10243             if (itemlen != 0) {
10244                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10245                 res_offset += itemlen;
10246             }
10247         }
10248         assert(res_offset == PyUnicode_GET_LENGTH(res));
10249     }
10250 
10251     Py_XDECREF(sep);
10252     assert(_PyUnicode_CheckConsistency(res, 1));
10253     return res;
10254 
10255   onError:
10256     Py_XDECREF(sep);
10257     Py_XDECREF(res);
10258     return NULL;
10259 }
10260 
10261 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10262 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10263                     Py_UCS4 fill_char)
10264 {
10265     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10266     void *data = PyUnicode_DATA(unicode);
10267     assert(PyUnicode_IS_READY(unicode));
10268     assert(unicode_modifiable(unicode));
10269     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10270     assert(start >= 0);
10271     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10272     unicode_fill(kind, data, fill_char, start, length);
10273 }
10274 
10275 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10276 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10277                Py_UCS4 fill_char)
10278 {
10279     Py_ssize_t maxlen;
10280 
10281     if (!PyUnicode_Check(unicode)) {
10282         PyErr_BadInternalCall();
10283         return -1;
10284     }
10285     if (PyUnicode_READY(unicode) == -1)
10286         return -1;
10287     if (unicode_check_modifiable(unicode))
10288         return -1;
10289 
10290     if (start < 0) {
10291         PyErr_SetString(PyExc_IndexError, "string index out of range");
10292         return -1;
10293     }
10294     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10295         PyErr_SetString(PyExc_ValueError,
10296                          "fill character is bigger than "
10297                          "the string maximum character");
10298         return -1;
10299     }
10300 
10301     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10302     length = Py_MIN(maxlen, length);
10303     if (length <= 0)
10304         return 0;
10305 
10306     _PyUnicode_FastFill(unicode, start, length, fill_char);
10307     return length;
10308 }
10309 
10310 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10311 pad(PyObject *self,
10312     Py_ssize_t left,
10313     Py_ssize_t right,
10314     Py_UCS4 fill)
10315 {
10316     PyObject *u;
10317     Py_UCS4 maxchar;
10318     int kind;
10319     void *data;
10320 
10321     if (left < 0)
10322         left = 0;
10323     if (right < 0)
10324         right = 0;
10325 
10326     if (left == 0 && right == 0)
10327         return unicode_result_unchanged(self);
10328 
10329     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10330         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10331         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10332         return NULL;
10333     }
10334     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10335     maxchar = Py_MAX(maxchar, fill);
10336     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10337     if (!u)
10338         return NULL;
10339 
10340     kind = PyUnicode_KIND(u);
10341     data = PyUnicode_DATA(u);
10342     if (left)
10343         unicode_fill(kind, data, fill, 0, left);
10344     if (right)
10345         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10346     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10347     assert(_PyUnicode_CheckConsistency(u, 1));
10348     return u;
10349 }
10350 
10351 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10352 PyUnicode_Splitlines(PyObject *string, int keepends)
10353 {
10354     PyObject *list;
10355 
10356     if (ensure_unicode(string) < 0)
10357         return NULL;
10358 
10359     switch (PyUnicode_KIND(string)) {
10360     case PyUnicode_1BYTE_KIND:
10361         if (PyUnicode_IS_ASCII(string))
10362             list = asciilib_splitlines(
10363                 string, PyUnicode_1BYTE_DATA(string),
10364                 PyUnicode_GET_LENGTH(string), keepends);
10365         else
10366             list = ucs1lib_splitlines(
10367                 string, PyUnicode_1BYTE_DATA(string),
10368                 PyUnicode_GET_LENGTH(string), keepends);
10369         break;
10370     case PyUnicode_2BYTE_KIND:
10371         list = ucs2lib_splitlines(
10372             string, PyUnicode_2BYTE_DATA(string),
10373             PyUnicode_GET_LENGTH(string), keepends);
10374         break;
10375     case PyUnicode_4BYTE_KIND:
10376         list = ucs4lib_splitlines(
10377             string, PyUnicode_4BYTE_DATA(string),
10378             PyUnicode_GET_LENGTH(string), keepends);
10379         break;
10380     default:
10381         Py_UNREACHABLE();
10382     }
10383     return list;
10384 }
10385 
10386 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10387 split(PyObject *self,
10388       PyObject *substring,
10389       Py_ssize_t maxcount)
10390 {
10391     int kind1, kind2;
10392     const void *buf1, *buf2;
10393     Py_ssize_t len1, len2;
10394     PyObject* out;
10395 
10396     if (maxcount < 0)
10397         maxcount = PY_SSIZE_T_MAX;
10398 
10399     if (PyUnicode_READY(self) == -1)
10400         return NULL;
10401 
10402     if (substring == NULL)
10403         switch (PyUnicode_KIND(self)) {
10404         case PyUnicode_1BYTE_KIND:
10405             if (PyUnicode_IS_ASCII(self))
10406                 return asciilib_split_whitespace(
10407                     self,  PyUnicode_1BYTE_DATA(self),
10408                     PyUnicode_GET_LENGTH(self), maxcount
10409                     );
10410             else
10411                 return ucs1lib_split_whitespace(
10412                     self,  PyUnicode_1BYTE_DATA(self),
10413                     PyUnicode_GET_LENGTH(self), maxcount
10414                     );
10415         case PyUnicode_2BYTE_KIND:
10416             return ucs2lib_split_whitespace(
10417                 self,  PyUnicode_2BYTE_DATA(self),
10418                 PyUnicode_GET_LENGTH(self), maxcount
10419                 );
10420         case PyUnicode_4BYTE_KIND:
10421             return ucs4lib_split_whitespace(
10422                 self,  PyUnicode_4BYTE_DATA(self),
10423                 PyUnicode_GET_LENGTH(self), maxcount
10424                 );
10425         default:
10426             Py_UNREACHABLE();
10427         }
10428 
10429     if (PyUnicode_READY(substring) == -1)
10430         return NULL;
10431 
10432     kind1 = PyUnicode_KIND(self);
10433     kind2 = PyUnicode_KIND(substring);
10434     len1 = PyUnicode_GET_LENGTH(self);
10435     len2 = PyUnicode_GET_LENGTH(substring);
10436     if (kind1 < kind2 || len1 < len2) {
10437         out = PyList_New(1);
10438         if (out == NULL)
10439             return NULL;
10440         Py_INCREF(self);
10441         PyList_SET_ITEM(out, 0, self);
10442         return out;
10443     }
10444     buf1 = PyUnicode_DATA(self);
10445     buf2 = PyUnicode_DATA(substring);
10446     if (kind2 != kind1) {
10447         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10448         if (!buf2)
10449             return NULL;
10450     }
10451 
10452     switch (kind1) {
10453     case PyUnicode_1BYTE_KIND:
10454         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10455             out = asciilib_split(
10456                 self,  buf1, len1, buf2, len2, maxcount);
10457         else
10458             out = ucs1lib_split(
10459                 self,  buf1, len1, buf2, len2, maxcount);
10460         break;
10461     case PyUnicode_2BYTE_KIND:
10462         out = ucs2lib_split(
10463             self,  buf1, len1, buf2, len2, maxcount);
10464         break;
10465     case PyUnicode_4BYTE_KIND:
10466         out = ucs4lib_split(
10467             self,  buf1, len1, buf2, len2, maxcount);
10468         break;
10469     default:
10470         out = NULL;
10471     }
10472     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10473     if (kind2 != kind1)
10474         PyMem_Free((void *)buf2);
10475     return out;
10476 }
10477 
10478 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10479 rsplit(PyObject *self,
10480        PyObject *substring,
10481        Py_ssize_t maxcount)
10482 {
10483     int kind1, kind2;
10484     const void *buf1, *buf2;
10485     Py_ssize_t len1, len2;
10486     PyObject* out;
10487 
10488     if (maxcount < 0)
10489         maxcount = PY_SSIZE_T_MAX;
10490 
10491     if (PyUnicode_READY(self) == -1)
10492         return NULL;
10493 
10494     if (substring == NULL)
10495         switch (PyUnicode_KIND(self)) {
10496         case PyUnicode_1BYTE_KIND:
10497             if (PyUnicode_IS_ASCII(self))
10498                 return asciilib_rsplit_whitespace(
10499                     self,  PyUnicode_1BYTE_DATA(self),
10500                     PyUnicode_GET_LENGTH(self), maxcount
10501                     );
10502             else
10503                 return ucs1lib_rsplit_whitespace(
10504                     self,  PyUnicode_1BYTE_DATA(self),
10505                     PyUnicode_GET_LENGTH(self), maxcount
10506                     );
10507         case PyUnicode_2BYTE_KIND:
10508             return ucs2lib_rsplit_whitespace(
10509                 self,  PyUnicode_2BYTE_DATA(self),
10510                 PyUnicode_GET_LENGTH(self), maxcount
10511                 );
10512         case PyUnicode_4BYTE_KIND:
10513             return ucs4lib_rsplit_whitespace(
10514                 self,  PyUnicode_4BYTE_DATA(self),
10515                 PyUnicode_GET_LENGTH(self), maxcount
10516                 );
10517         default:
10518             Py_UNREACHABLE();
10519         }
10520 
10521     if (PyUnicode_READY(substring) == -1)
10522         return NULL;
10523 
10524     kind1 = PyUnicode_KIND(self);
10525     kind2 = PyUnicode_KIND(substring);
10526     len1 = PyUnicode_GET_LENGTH(self);
10527     len2 = PyUnicode_GET_LENGTH(substring);
10528     if (kind1 < kind2 || len1 < len2) {
10529         out = PyList_New(1);
10530         if (out == NULL)
10531             return NULL;
10532         Py_INCREF(self);
10533         PyList_SET_ITEM(out, 0, self);
10534         return out;
10535     }
10536     buf1 = PyUnicode_DATA(self);
10537     buf2 = PyUnicode_DATA(substring);
10538     if (kind2 != kind1) {
10539         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10540         if (!buf2)
10541             return NULL;
10542     }
10543 
10544     switch (kind1) {
10545     case PyUnicode_1BYTE_KIND:
10546         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10547             out = asciilib_rsplit(
10548                 self,  buf1, len1, buf2, len2, maxcount);
10549         else
10550             out = ucs1lib_rsplit(
10551                 self,  buf1, len1, buf2, len2, maxcount);
10552         break;
10553     case PyUnicode_2BYTE_KIND:
10554         out = ucs2lib_rsplit(
10555             self,  buf1, len1, buf2, len2, maxcount);
10556         break;
10557     case PyUnicode_4BYTE_KIND:
10558         out = ucs4lib_rsplit(
10559             self,  buf1, len1, buf2, len2, maxcount);
10560         break;
10561     default:
10562         out = NULL;
10563     }
10564     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10565     if (kind2 != kind1)
10566         PyMem_Free((void *)buf2);
10567     return out;
10568 }
10569 
10570 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10571 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10572             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10573 {
10574     switch (kind) {
10575     case PyUnicode_1BYTE_KIND:
10576         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10577             return asciilib_find(buf1, len1, buf2, len2, offset);
10578         else
10579             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10580     case PyUnicode_2BYTE_KIND:
10581         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10582     case PyUnicode_4BYTE_KIND:
10583         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10584     }
10585     Py_UNREACHABLE();
10586 }
10587 
10588 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10589 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10590              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10591 {
10592     switch (kind) {
10593     case PyUnicode_1BYTE_KIND:
10594         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10595             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10596         else
10597             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10598     case PyUnicode_2BYTE_KIND:
10599         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10600     case PyUnicode_4BYTE_KIND:
10601         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10602     }
10603     Py_UNREACHABLE();
10604 }
10605 
10606 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10607 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10608                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10609 {
10610     int kind = PyUnicode_KIND(u);
10611     void *data = PyUnicode_DATA(u);
10612     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10613     if (kind == PyUnicode_1BYTE_KIND) {
10614         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10615                                       (Py_UCS1 *)data + len,
10616                                       u1, u2, maxcount);
10617     }
10618     else if (kind == PyUnicode_2BYTE_KIND) {
10619         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10620                                       (Py_UCS2 *)data + len,
10621                                       u1, u2, maxcount);
10622     }
10623     else {
10624         assert(kind == PyUnicode_4BYTE_KIND);
10625         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10626                                       (Py_UCS4 *)data + len,
10627                                       u1, u2, maxcount);
10628     }
10629 }
10630 
10631 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10632 replace(PyObject *self, PyObject *str1,
10633         PyObject *str2, Py_ssize_t maxcount)
10634 {
10635     PyObject *u;
10636     const char *sbuf = PyUnicode_DATA(self);
10637     const void *buf1 = PyUnicode_DATA(str1);
10638     const void *buf2 = PyUnicode_DATA(str2);
10639     int srelease = 0, release1 = 0, release2 = 0;
10640     int skind = PyUnicode_KIND(self);
10641     int kind1 = PyUnicode_KIND(str1);
10642     int kind2 = PyUnicode_KIND(str2);
10643     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10644     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10645     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10646     int mayshrink;
10647     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10648 
10649     if (slen < len1)
10650         goto nothing;
10651 
10652     if (maxcount < 0)
10653         maxcount = PY_SSIZE_T_MAX;
10654     else if (maxcount == 0)
10655         goto nothing;
10656 
10657     if (str1 == str2)
10658         goto nothing;
10659 
10660     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10661     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10662     if (maxchar < maxchar_str1)
10663         /* substring too wide to be present */
10664         goto nothing;
10665     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10666     /* Replacing str1 with str2 may cause a maxchar reduction in the
10667        result string. */
10668     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10669     maxchar = Py_MAX(maxchar, maxchar_str2);
10670 
10671     if (len1 == len2) {
10672         /* same length */
10673         if (len1 == 0)
10674             goto nothing;
10675         if (len1 == 1) {
10676             /* replace characters */
10677             Py_UCS4 u1, u2;
10678             Py_ssize_t pos;
10679 
10680             u1 = PyUnicode_READ(kind1, buf1, 0);
10681             pos = findchar(sbuf, skind, slen, u1, 1);
10682             if (pos < 0)
10683                 goto nothing;
10684             u2 = PyUnicode_READ(kind2, buf2, 0);
10685             u = PyUnicode_New(slen, maxchar);
10686             if (!u)
10687                 goto error;
10688 
10689             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10690             replace_1char_inplace(u, pos, u1, u2, maxcount);
10691         }
10692         else {
10693             int rkind = skind;
10694             char *res;
10695             Py_ssize_t i;
10696 
10697             if (kind1 < rkind) {
10698                 /* widen substring */
10699                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10700                 if (!buf1) goto error;
10701                 release1 = 1;
10702             }
10703             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10704             if (i < 0)
10705                 goto nothing;
10706             if (rkind > kind2) {
10707                 /* widen replacement */
10708                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10709                 if (!buf2) goto error;
10710                 release2 = 1;
10711             }
10712             else if (rkind < kind2) {
10713                 /* widen self and buf1 */
10714                 rkind = kind2;
10715                 if (release1) {
10716                     assert(buf1 != PyUnicode_DATA(str1));
10717                     PyMem_Free((void *)buf1);
10718                     buf1 = PyUnicode_DATA(str1);
10719                     release1 = 0;
10720                 }
10721                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10722                 if (!sbuf) goto error;
10723                 srelease = 1;
10724                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10725                 if (!buf1) goto error;
10726                 release1 = 1;
10727             }
10728             u = PyUnicode_New(slen, maxchar);
10729             if (!u)
10730                 goto error;
10731             assert(PyUnicode_KIND(u) == rkind);
10732             res = PyUnicode_DATA(u);
10733 
10734             memcpy(res, sbuf, rkind * slen);
10735             /* change everything in-place, starting with this one */
10736             memcpy(res + rkind * i,
10737                    buf2,
10738                    rkind * len2);
10739             i += len1;
10740 
10741             while ( --maxcount > 0) {
10742                 i = anylib_find(rkind, self,
10743                                 sbuf+rkind*i, slen-i,
10744                                 str1, buf1, len1, i);
10745                 if (i == -1)
10746                     break;
10747                 memcpy(res + rkind * i,
10748                        buf2,
10749                        rkind * len2);
10750                 i += len1;
10751             }
10752         }
10753     }
10754     else {
10755         Py_ssize_t n, i, j, ires;
10756         Py_ssize_t new_size;
10757         int rkind = skind;
10758         char *res;
10759 
10760         if (kind1 < rkind) {
10761             /* widen substring */
10762             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10763             if (!buf1) goto error;
10764             release1 = 1;
10765         }
10766         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10767         if (n == 0)
10768             goto nothing;
10769         if (kind2 < rkind) {
10770             /* widen replacement */
10771             buf2 = unicode_askind(kind2, buf2, len2, rkind);
10772             if (!buf2) goto error;
10773             release2 = 1;
10774         }
10775         else if (kind2 > rkind) {
10776             /* widen self and buf1 */
10777             rkind = kind2;
10778             sbuf = unicode_askind(skind, sbuf, slen, rkind);
10779             if (!sbuf) goto error;
10780             srelease = 1;
10781             if (release1) {
10782                 assert(buf1 != PyUnicode_DATA(str1));
10783                 PyMem_Free((void *)buf1);
10784                 buf1 = PyUnicode_DATA(str1);
10785                 release1 = 0;
10786             }
10787             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10788             if (!buf1) goto error;
10789             release1 = 1;
10790         }
10791         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10792            PyUnicode_GET_LENGTH(str1)); */
10793         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10794                 PyErr_SetString(PyExc_OverflowError,
10795                                 "replace string is too long");
10796                 goto error;
10797         }
10798         new_size = slen + n * (len2 - len1);
10799         if (new_size == 0) {
10800             u = unicode_new_empty();
10801             goto done;
10802         }
10803         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10804             PyErr_SetString(PyExc_OverflowError,
10805                             "replace string is too long");
10806             goto error;
10807         }
10808         u = PyUnicode_New(new_size, maxchar);
10809         if (!u)
10810             goto error;
10811         assert(PyUnicode_KIND(u) == rkind);
10812         res = PyUnicode_DATA(u);
10813         ires = i = 0;
10814         if (len1 > 0) {
10815             while (n-- > 0) {
10816                 /* look for next match */
10817                 j = anylib_find(rkind, self,
10818                                 sbuf + rkind * i, slen-i,
10819                                 str1, buf1, len1, i);
10820                 if (j == -1)
10821                     break;
10822                 else if (j > i) {
10823                     /* copy unchanged part [i:j] */
10824                     memcpy(res + rkind * ires,
10825                            sbuf + rkind * i,
10826                            rkind * (j-i));
10827                     ires += j - i;
10828                 }
10829                 /* copy substitution string */
10830                 if (len2 > 0) {
10831                     memcpy(res + rkind * ires,
10832                            buf2,
10833                            rkind * len2);
10834                     ires += len2;
10835                 }
10836                 i = j + len1;
10837             }
10838             if (i < slen)
10839                 /* copy tail [i:] */
10840                 memcpy(res + rkind * ires,
10841                        sbuf + rkind * i,
10842                        rkind * (slen-i));
10843         }
10844         else {
10845             /* interleave */
10846             while (n > 0) {
10847                 memcpy(res + rkind * ires,
10848                        buf2,
10849                        rkind * len2);
10850                 ires += len2;
10851                 if (--n <= 0)
10852                     break;
10853                 memcpy(res + rkind * ires,
10854                        sbuf + rkind * i,
10855                        rkind);
10856                 ires++;
10857                 i++;
10858             }
10859             memcpy(res + rkind * ires,
10860                    sbuf + rkind * i,
10861                    rkind * (slen-i));
10862         }
10863     }
10864 
10865     if (mayshrink) {
10866         unicode_adjust_maxchar(&u);
10867         if (u == NULL)
10868             goto error;
10869     }
10870 
10871   done:
10872     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10873     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10874     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10875     if (srelease)
10876         PyMem_Free((void *)sbuf);
10877     if (release1)
10878         PyMem_Free((void *)buf1);
10879     if (release2)
10880         PyMem_Free((void *)buf2);
10881     assert(_PyUnicode_CheckConsistency(u, 1));
10882     return u;
10883 
10884   nothing:
10885     /* nothing to replace; return original string (when possible) */
10886     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10887     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10888     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10889     if (srelease)
10890         PyMem_Free((void *)sbuf);
10891     if (release1)
10892         PyMem_Free((void *)buf1);
10893     if (release2)
10894         PyMem_Free((void *)buf2);
10895     return unicode_result_unchanged(self);
10896 
10897   error:
10898     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10899     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10900     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10901     if (srelease)
10902         PyMem_Free((void *)sbuf);
10903     if (release1)
10904         PyMem_Free((void *)buf1);
10905     if (release2)
10906         PyMem_Free((void *)buf2);
10907     return NULL;
10908 }
10909 
10910 /* --- Unicode Object Methods --------------------------------------------- */
10911 
10912 /*[clinic input]
10913 str.title as unicode_title
10914 
10915 Return a version of the string where each word is titlecased.
10916 
10917 More specifically, words start with uppercased characters and all remaining
10918 cased characters have lower case.
10919 [clinic start generated code]*/
10920 
10921 static PyObject *
unicode_title_impl(PyObject * self)10922 unicode_title_impl(PyObject *self)
10923 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10924 {
10925     if (PyUnicode_READY(self) == -1)
10926         return NULL;
10927     return case_operation(self, do_title);
10928 }
10929 
10930 /*[clinic input]
10931 str.capitalize as unicode_capitalize
10932 
10933 Return a capitalized version of the string.
10934 
10935 More specifically, make the first character have upper case and the rest lower
10936 case.
10937 [clinic start generated code]*/
10938 
10939 static PyObject *
unicode_capitalize_impl(PyObject * self)10940 unicode_capitalize_impl(PyObject *self)
10941 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10942 {
10943     if (PyUnicode_READY(self) == -1)
10944         return NULL;
10945     if (PyUnicode_GET_LENGTH(self) == 0)
10946         return unicode_result_unchanged(self);
10947     return case_operation(self, do_capitalize);
10948 }
10949 
10950 /*[clinic input]
10951 str.casefold as unicode_casefold
10952 
10953 Return a version of the string suitable for caseless comparisons.
10954 [clinic start generated code]*/
10955 
10956 static PyObject *
unicode_casefold_impl(PyObject * self)10957 unicode_casefold_impl(PyObject *self)
10958 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10959 {
10960     if (PyUnicode_READY(self) == -1)
10961         return NULL;
10962     if (PyUnicode_IS_ASCII(self))
10963         return ascii_upper_or_lower(self, 1);
10964     return case_operation(self, do_casefold);
10965 }
10966 
10967 
10968 /* Argument converter. Accepts a single Unicode character. */
10969 
10970 static int
convert_uc(PyObject * obj,void * addr)10971 convert_uc(PyObject *obj, void *addr)
10972 {
10973     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10974 
10975     if (!PyUnicode_Check(obj)) {
10976         PyErr_Format(PyExc_TypeError,
10977                      "The fill character must be a unicode character, "
10978                      "not %.100s", Py_TYPE(obj)->tp_name);
10979         return 0;
10980     }
10981     if (PyUnicode_READY(obj) < 0)
10982         return 0;
10983     if (PyUnicode_GET_LENGTH(obj) != 1) {
10984         PyErr_SetString(PyExc_TypeError,
10985                         "The fill character must be exactly one character long");
10986         return 0;
10987     }
10988     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10989     return 1;
10990 }
10991 
10992 /*[clinic input]
10993 str.center as unicode_center
10994 
10995     width: Py_ssize_t
10996     fillchar: Py_UCS4 = ' '
10997     /
10998 
10999 Return a centered string of length width.
11000 
11001 Padding is done using the specified fill character (default is a space).
11002 [clinic start generated code]*/
11003 
11004 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11005 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11006 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11007 {
11008     Py_ssize_t marg, left;
11009 
11010     if (PyUnicode_READY(self) == -1)
11011         return NULL;
11012 
11013     if (PyUnicode_GET_LENGTH(self) >= width)
11014         return unicode_result_unchanged(self);
11015 
11016     marg = width - PyUnicode_GET_LENGTH(self);
11017     left = marg / 2 + (marg & width & 1);
11018 
11019     return pad(self, left, marg - left, fillchar);
11020 }
11021 
11022 /* This function assumes that str1 and str2 are readied by the caller. */
11023 
11024 static int
unicode_compare(PyObject * str1,PyObject * str2)11025 unicode_compare(PyObject *str1, PyObject *str2)
11026 {
11027 #define COMPARE(TYPE1, TYPE2) \
11028     do { \
11029         TYPE1* p1 = (TYPE1 *)data1; \
11030         TYPE2* p2 = (TYPE2 *)data2; \
11031         TYPE1* end = p1 + len; \
11032         Py_UCS4 c1, c2; \
11033         for (; p1 != end; p1++, p2++) { \
11034             c1 = *p1; \
11035             c2 = *p2; \
11036             if (c1 != c2) \
11037                 return (c1 < c2) ? -1 : 1; \
11038         } \
11039     } \
11040     while (0)
11041 
11042     int kind1, kind2;
11043     const void *data1, *data2;
11044     Py_ssize_t len1, len2, len;
11045 
11046     kind1 = PyUnicode_KIND(str1);
11047     kind2 = PyUnicode_KIND(str2);
11048     data1 = PyUnicode_DATA(str1);
11049     data2 = PyUnicode_DATA(str2);
11050     len1 = PyUnicode_GET_LENGTH(str1);
11051     len2 = PyUnicode_GET_LENGTH(str2);
11052     len = Py_MIN(len1, len2);
11053 
11054     switch(kind1) {
11055     case PyUnicode_1BYTE_KIND:
11056     {
11057         switch(kind2) {
11058         case PyUnicode_1BYTE_KIND:
11059         {
11060             int cmp = memcmp(data1, data2, len);
11061             /* normalize result of memcmp() into the range [-1; 1] */
11062             if (cmp < 0)
11063                 return -1;
11064             if (cmp > 0)
11065                 return 1;
11066             break;
11067         }
11068         case PyUnicode_2BYTE_KIND:
11069             COMPARE(Py_UCS1, Py_UCS2);
11070             break;
11071         case PyUnicode_4BYTE_KIND:
11072             COMPARE(Py_UCS1, Py_UCS4);
11073             break;
11074         default:
11075             Py_UNREACHABLE();
11076         }
11077         break;
11078     }
11079     case PyUnicode_2BYTE_KIND:
11080     {
11081         switch(kind2) {
11082         case PyUnicode_1BYTE_KIND:
11083             COMPARE(Py_UCS2, Py_UCS1);
11084             break;
11085         case PyUnicode_2BYTE_KIND:
11086         {
11087             COMPARE(Py_UCS2, Py_UCS2);
11088             break;
11089         }
11090         case PyUnicode_4BYTE_KIND:
11091             COMPARE(Py_UCS2, Py_UCS4);
11092             break;
11093         default:
11094             Py_UNREACHABLE();
11095         }
11096         break;
11097     }
11098     case PyUnicode_4BYTE_KIND:
11099     {
11100         switch(kind2) {
11101         case PyUnicode_1BYTE_KIND:
11102             COMPARE(Py_UCS4, Py_UCS1);
11103             break;
11104         case PyUnicode_2BYTE_KIND:
11105             COMPARE(Py_UCS4, Py_UCS2);
11106             break;
11107         case PyUnicode_4BYTE_KIND:
11108         {
11109 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11110             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11111             /* normalize result of wmemcmp() into the range [-1; 1] */
11112             if (cmp < 0)
11113                 return -1;
11114             if (cmp > 0)
11115                 return 1;
11116 #else
11117             COMPARE(Py_UCS4, Py_UCS4);
11118 #endif
11119             break;
11120         }
11121         default:
11122             Py_UNREACHABLE();
11123         }
11124         break;
11125     }
11126     default:
11127         Py_UNREACHABLE();
11128     }
11129 
11130     if (len1 == len2)
11131         return 0;
11132     if (len1 < len2)
11133         return -1;
11134     else
11135         return 1;
11136 
11137 #undef COMPARE
11138 }
11139 
11140 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11141 unicode_compare_eq(PyObject *str1, PyObject *str2)
11142 {
11143     int kind;
11144     const void *data1, *data2;
11145     Py_ssize_t len;
11146     int cmp;
11147 
11148     len = PyUnicode_GET_LENGTH(str1);
11149     if (PyUnicode_GET_LENGTH(str2) != len)
11150         return 0;
11151     kind = PyUnicode_KIND(str1);
11152     if (PyUnicode_KIND(str2) != kind)
11153         return 0;
11154     data1 = PyUnicode_DATA(str1);
11155     data2 = PyUnicode_DATA(str2);
11156 
11157     cmp = memcmp(data1, data2, len * kind);
11158     return (cmp == 0);
11159 }
11160 
11161 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)11162 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
11163 {
11164     assert(PyUnicode_Check(str1));
11165     assert(PyUnicode_Check(str2));
11166     if (str1 == str2) {
11167         return 1;
11168     }
11169     if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11170         return -1;
11171     }
11172     return unicode_compare_eq(str1, str2);
11173 }
11174 
11175 
11176 int
PyUnicode_Compare(PyObject * left,PyObject * right)11177 PyUnicode_Compare(PyObject *left, PyObject *right)
11178 {
11179     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11180         if (PyUnicode_READY(left) == -1 ||
11181             PyUnicode_READY(right) == -1)
11182             return -1;
11183 
11184         /* a string is equal to itself */
11185         if (left == right)
11186             return 0;
11187 
11188         return unicode_compare(left, right);
11189     }
11190     PyErr_Format(PyExc_TypeError,
11191                  "Can't compare %.100s and %.100s",
11192                  Py_TYPE(left)->tp_name,
11193                  Py_TYPE(right)->tp_name);
11194     return -1;
11195 }
11196 
11197 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11198 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11199 {
11200     Py_ssize_t i;
11201     int kind;
11202     Py_UCS4 chr;
11203     const unsigned char *ustr = (const unsigned char *)str;
11204 
11205     assert(_PyUnicode_CHECK(uni));
11206     if (!PyUnicode_IS_READY(uni)) {
11207         const wchar_t *ws = _PyUnicode_WSTR(uni);
11208         /* Compare Unicode string and source character set string */
11209         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11210             if (chr != ustr[i])
11211                 return (chr < ustr[i]) ? -1 : 1;
11212         }
11213         /* This check keeps Python strings that end in '\0' from comparing equal
11214          to C strings identical up to that point. */
11215         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11216             return 1; /* uni is longer */
11217         if (ustr[i])
11218             return -1; /* str is longer */
11219         return 0;
11220     }
11221     kind = PyUnicode_KIND(uni);
11222     if (kind == PyUnicode_1BYTE_KIND) {
11223         const void *data = PyUnicode_1BYTE_DATA(uni);
11224         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11225         size_t len, len2 = strlen(str);
11226         int cmp;
11227 
11228         len = Py_MIN(len1, len2);
11229         cmp = memcmp(data, str, len);
11230         if (cmp != 0) {
11231             if (cmp < 0)
11232                 return -1;
11233             else
11234                 return 1;
11235         }
11236         if (len1 > len2)
11237             return 1; /* uni is longer */
11238         if (len1 < len2)
11239             return -1; /* str is longer */
11240         return 0;
11241     }
11242     else {
11243         const void *data = PyUnicode_DATA(uni);
11244         /* Compare Unicode string and source character set string */
11245         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11246             if (chr != (unsigned char)str[i])
11247                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11248         /* This check keeps Python strings that end in '\0' from comparing equal
11249          to C strings identical up to that point. */
11250         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11251             return 1; /* uni is longer */
11252         if (str[i])
11253             return -1; /* str is longer */
11254         return 0;
11255     }
11256 }
11257 
11258 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11259 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11260 {
11261     size_t i, len;
11262     const wchar_t *p;
11263     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11264     if (strlen(str) != len)
11265         return 0;
11266     p = _PyUnicode_WSTR(unicode);
11267     assert(p);
11268     for (i = 0; i < len; i++) {
11269         unsigned char c = (unsigned char)str[i];
11270         if (c >= 128 || p[i] != (wchar_t)c)
11271             return 0;
11272     }
11273     return 1;
11274 }
11275 
11276 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11277 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11278 {
11279     size_t len;
11280     assert(_PyUnicode_CHECK(unicode));
11281     assert(str);
11282 #ifndef NDEBUG
11283     for (const char *p = str; *p; p++) {
11284         assert((unsigned char)*p < 128);
11285     }
11286 #endif
11287     if (PyUnicode_READY(unicode) == -1) {
11288         /* Memory error or bad data */
11289         PyErr_Clear();
11290         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11291     }
11292     if (!PyUnicode_IS_ASCII(unicode))
11293         return 0;
11294     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11295     return strlen(str) == len &&
11296            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11297 }
11298 
11299 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11300 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11301 {
11302     PyObject *right_uni;
11303 
11304     assert(_PyUnicode_CHECK(left));
11305     assert(right->string);
11306 #ifndef NDEBUG
11307     for (const char *p = right->string; *p; p++) {
11308         assert((unsigned char)*p < 128);
11309     }
11310 #endif
11311 
11312     if (PyUnicode_READY(left) == -1) {
11313         /* memory error or bad data */
11314         PyErr_Clear();
11315         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11316     }
11317 
11318     if (!PyUnicode_IS_ASCII(left))
11319         return 0;
11320 
11321     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11322     if (right_uni == NULL) {
11323         /* memory error or bad data */
11324         PyErr_Clear();
11325         return _PyUnicode_EqualToASCIIString(left, right->string);
11326     }
11327 
11328     if (left == right_uni)
11329         return 1;
11330 
11331     if (PyUnicode_CHECK_INTERNED(left))
11332         return 0;
11333 
11334     assert(_PyUnicode_HASH(right_uni) != -1);
11335     Py_hash_t hash = _PyUnicode_HASH(left);
11336     if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11337         return 0;
11338     }
11339 
11340     return unicode_compare_eq(left, right_uni);
11341 }
11342 
11343 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11344 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11345 {
11346     int result;
11347 
11348     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11349         Py_RETURN_NOTIMPLEMENTED;
11350 
11351     if (PyUnicode_READY(left) == -1 ||
11352         PyUnicode_READY(right) == -1)
11353         return NULL;
11354 
11355     if (left == right) {
11356         switch (op) {
11357         case Py_EQ:
11358         case Py_LE:
11359         case Py_GE:
11360             /* a string is equal to itself */
11361             Py_RETURN_TRUE;
11362         case Py_NE:
11363         case Py_LT:
11364         case Py_GT:
11365             Py_RETURN_FALSE;
11366         default:
11367             PyErr_BadArgument();
11368             return NULL;
11369         }
11370     }
11371     else if (op == Py_EQ || op == Py_NE) {
11372         result = unicode_compare_eq(left, right);
11373         result ^= (op == Py_NE);
11374         return PyBool_FromLong(result);
11375     }
11376     else {
11377         result = unicode_compare(left, right);
11378         Py_RETURN_RICHCOMPARE(result, 0, op);
11379     }
11380 }
11381 
11382 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11383 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11384 {
11385     return unicode_eq(aa, bb);
11386 }
11387 
11388 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11389 PyUnicode_Contains(PyObject *str, PyObject *substr)
11390 {
11391     int kind1, kind2;
11392     const void *buf1, *buf2;
11393     Py_ssize_t len1, len2;
11394     int result;
11395 
11396     if (!PyUnicode_Check(substr)) {
11397         PyErr_Format(PyExc_TypeError,
11398                      "'in <string>' requires string as left operand, not %.100s",
11399                      Py_TYPE(substr)->tp_name);
11400         return -1;
11401     }
11402     if (PyUnicode_READY(substr) == -1)
11403         return -1;
11404     if (ensure_unicode(str) < 0)
11405         return -1;
11406 
11407     kind1 = PyUnicode_KIND(str);
11408     kind2 = PyUnicode_KIND(substr);
11409     if (kind1 < kind2)
11410         return 0;
11411     len1 = PyUnicode_GET_LENGTH(str);
11412     len2 = PyUnicode_GET_LENGTH(substr);
11413     if (len1 < len2)
11414         return 0;
11415     buf1 = PyUnicode_DATA(str);
11416     buf2 = PyUnicode_DATA(substr);
11417     if (len2 == 1) {
11418         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11419         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11420         return result;
11421     }
11422     if (kind2 != kind1) {
11423         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11424         if (!buf2)
11425             return -1;
11426     }
11427 
11428     switch (kind1) {
11429     case PyUnicode_1BYTE_KIND:
11430         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11431         break;
11432     case PyUnicode_2BYTE_KIND:
11433         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11434         break;
11435     case PyUnicode_4BYTE_KIND:
11436         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11437         break;
11438     default:
11439         Py_UNREACHABLE();
11440     }
11441 
11442     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11443     if (kind2 != kind1)
11444         PyMem_Free((void *)buf2);
11445 
11446     return result;
11447 }
11448 
11449 /* Concat to string or Unicode object giving a new Unicode object. */
11450 
11451 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11452 PyUnicode_Concat(PyObject *left, PyObject *right)
11453 {
11454     PyObject *result;
11455     Py_UCS4 maxchar, maxchar2;
11456     Py_ssize_t left_len, right_len, new_len;
11457 
11458     if (ensure_unicode(left) < 0)
11459         return NULL;
11460 
11461     if (!PyUnicode_Check(right)) {
11462         PyErr_Format(PyExc_TypeError,
11463                      "can only concatenate str (not \"%.200s\") to str",
11464                      Py_TYPE(right)->tp_name);
11465         return NULL;
11466     }
11467     if (PyUnicode_READY(right) < 0)
11468         return NULL;
11469 
11470     /* Shortcuts */
11471     PyObject *empty = unicode_get_empty();  // Borrowed reference
11472     if (left == empty) {
11473         return PyUnicode_FromObject(right);
11474     }
11475     if (right == empty) {
11476         return PyUnicode_FromObject(left);
11477     }
11478 
11479     left_len = PyUnicode_GET_LENGTH(left);
11480     right_len = PyUnicode_GET_LENGTH(right);
11481     if (left_len > PY_SSIZE_T_MAX - right_len) {
11482         PyErr_SetString(PyExc_OverflowError,
11483                         "strings are too large to concat");
11484         return NULL;
11485     }
11486     new_len = left_len + right_len;
11487 
11488     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11489     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11490     maxchar = Py_MAX(maxchar, maxchar2);
11491 
11492     /* Concat the two Unicode strings */
11493     result = PyUnicode_New(new_len, maxchar);
11494     if (result == NULL)
11495         return NULL;
11496     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11497     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11498     assert(_PyUnicode_CheckConsistency(result, 1));
11499     return result;
11500 }
11501 
11502 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11503 PyUnicode_Append(PyObject **p_left, PyObject *right)
11504 {
11505     PyObject *left, *res;
11506     Py_UCS4 maxchar, maxchar2;
11507     Py_ssize_t left_len, right_len, new_len;
11508 
11509     if (p_left == NULL) {
11510         if (!PyErr_Occurred())
11511             PyErr_BadInternalCall();
11512         return;
11513     }
11514     left = *p_left;
11515     if (right == NULL || left == NULL
11516         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11517         if (!PyErr_Occurred())
11518             PyErr_BadInternalCall();
11519         goto error;
11520     }
11521 
11522     if (PyUnicode_READY(left) == -1)
11523         goto error;
11524     if (PyUnicode_READY(right) == -1)
11525         goto error;
11526 
11527     /* Shortcuts */
11528     PyObject *empty = unicode_get_empty();  // Borrowed reference
11529     if (left == empty) {
11530         Py_DECREF(left);
11531         Py_INCREF(right);
11532         *p_left = right;
11533         return;
11534     }
11535     if (right == empty) {
11536         return;
11537     }
11538 
11539     left_len = PyUnicode_GET_LENGTH(left);
11540     right_len = PyUnicode_GET_LENGTH(right);
11541     if (left_len > PY_SSIZE_T_MAX - right_len) {
11542         PyErr_SetString(PyExc_OverflowError,
11543                         "strings are too large to concat");
11544         goto error;
11545     }
11546     new_len = left_len + right_len;
11547 
11548     if (unicode_modifiable(left)
11549         && PyUnicode_CheckExact(right)
11550         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11551         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11552            to change the structure size, but characters are stored just after
11553            the structure, and so it requires to move all characters which is
11554            not so different than duplicating the string. */
11555         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11556     {
11557         /* append inplace */
11558         if (unicode_resize(p_left, new_len) != 0)
11559             goto error;
11560 
11561         /* copy 'right' into the newly allocated area of 'left' */
11562         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11563     }
11564     else {
11565         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11566         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11567         maxchar = Py_MAX(maxchar, maxchar2);
11568 
11569         /* Concat the two Unicode strings */
11570         res = PyUnicode_New(new_len, maxchar);
11571         if (res == NULL)
11572             goto error;
11573         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11574         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11575         Py_DECREF(left);
11576         *p_left = res;
11577     }
11578     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11579     return;
11580 
11581 error:
11582     Py_CLEAR(*p_left);
11583 }
11584 
11585 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11586 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11587 {
11588     PyUnicode_Append(pleft, right);
11589     Py_XDECREF(right);
11590 }
11591 
11592 /*
11593 Wraps stringlib_parse_args_finds() and additionally ensures that the
11594 first argument is a unicode object.
11595 */
11596 
11597 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11598 parse_args_finds_unicode(const char * function_name, PyObject *args,
11599                          PyObject **substring,
11600                          Py_ssize_t *start, Py_ssize_t *end)
11601 {
11602     if(stringlib_parse_args_finds(function_name, args, substring,
11603                                   start, end)) {
11604         if (ensure_unicode(*substring) < 0)
11605             return 0;
11606         return 1;
11607     }
11608     return 0;
11609 }
11610 
/* Docstring text for the S.count() string method, defined as count__doc__. */
PyDoc_STRVAR(count__doc__,
             "S.count(sub[, start[, end]]) -> int\n\
\n\
Return the number of non-overlapping occurrences of substring sub in\n\
string S[start:end].  Optional arguments start and end are\n\
interpreted as in slice notation.");
11617 
11618 static PyObject *
unicode_count(PyObject * self,PyObject * args)11619 unicode_count(PyObject *self, PyObject *args)
11620 {
11621     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11622     Py_ssize_t start = 0;
11623     Py_ssize_t end = PY_SSIZE_T_MAX;
11624     PyObject *result;
11625     int kind1, kind2;
11626     const void *buf1, *buf2;
11627     Py_ssize_t len1, len2, iresult;
11628 
11629     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11630         return NULL;
11631 
11632     kind1 = PyUnicode_KIND(self);
11633     kind2 = PyUnicode_KIND(substring);
11634     if (kind1 < kind2)
11635         return PyLong_FromLong(0);
11636 
11637     len1 = PyUnicode_GET_LENGTH(self);
11638     len2 = PyUnicode_GET_LENGTH(substring);
11639     ADJUST_INDICES(start, end, len1);
11640     if (end - start < len2)
11641         return PyLong_FromLong(0);
11642 
11643     buf1 = PyUnicode_DATA(self);
11644     buf2 = PyUnicode_DATA(substring);
11645     if (kind2 != kind1) {
11646         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11647         if (!buf2)
11648             return NULL;
11649     }
11650     switch (kind1) {
11651     case PyUnicode_1BYTE_KIND:
11652         iresult = ucs1lib_count(
11653             ((const Py_UCS1*)buf1) + start, end - start,
11654             buf2, len2, PY_SSIZE_T_MAX
11655             );
11656         break;
11657     case PyUnicode_2BYTE_KIND:
11658         iresult = ucs2lib_count(
11659             ((const Py_UCS2*)buf1) + start, end - start,
11660             buf2, len2, PY_SSIZE_T_MAX
11661             );
11662         break;
11663     case PyUnicode_4BYTE_KIND:
11664         iresult = ucs4lib_count(
11665             ((const Py_UCS4*)buf1) + start, end - start,
11666             buf2, len2, PY_SSIZE_T_MAX
11667             );
11668         break;
11669     default:
11670         Py_UNREACHABLE();
11671     }
11672 
11673     result = PyLong_FromSsize_t(iresult);
11674 
11675     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11676     if (kind2 != kind1)
11677         PyMem_Free((void *)buf2);
11678 
11679     return result;
11680 }
11681 
11682 /*[clinic input]
11683 str.encode as unicode_encode
11684 
11685     encoding: str(c_default="NULL") = 'utf-8'
11686         The encoding in which to encode the string.
11687     errors: str(c_default="NULL") = 'strict'
11688         The error handling scheme to use for encoding errors.
11689         The default is 'strict' meaning that encoding errors raise a
11690         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11691         'xmlcharrefreplace' as well as any other name registered with
11692         codecs.register_error that can handle UnicodeEncodeErrors.
11693 
11694 Encode the string using the codec registered for encoding.
11695 [clinic start generated code]*/
11696 
11697 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11698 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11699 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11700 {
11701     return PyUnicode_AsEncodedString(self, encoding, errors);
11702 }
11703 
11704 /*[clinic input]
11705 str.expandtabs as unicode_expandtabs
11706 
11707     tabsize: int = 8
11708 
11709 Return a copy where all tab characters are expanded using spaces.
11710 
11711 If tabsize is not given, a tab size of 8 characters is assumed.
11712 [clinic start generated code]*/
11713 
11714 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11715 unicode_expandtabs_impl(PyObject *self, int tabsize)
11716 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11717 {
11718     Py_ssize_t i, j, line_pos, src_len, incr;
11719     Py_UCS4 ch;
11720     PyObject *u;
11721     const void *src_data;
11722     void *dest_data;
11723     int kind;
11724     int found;
11725 
11726     if (PyUnicode_READY(self) == -1)
11727         return NULL;
11728 
11729     /* First pass: determine size of output string */
11730     src_len = PyUnicode_GET_LENGTH(self);
11731     i = j = line_pos = 0;
11732     kind = PyUnicode_KIND(self);
11733     src_data = PyUnicode_DATA(self);
11734     found = 0;
11735     for (; i < src_len; i++) {
11736         ch = PyUnicode_READ(kind, src_data, i);
11737         if (ch == '\t') {
11738             found = 1;
11739             if (tabsize > 0) {
11740                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11741                 if (j > PY_SSIZE_T_MAX - incr)
11742                     goto overflow;
11743                 line_pos += incr;
11744                 j += incr;
11745             }
11746         }
11747         else {
11748             if (j > PY_SSIZE_T_MAX - 1)
11749                 goto overflow;
11750             line_pos++;
11751             j++;
11752             if (ch == '\n' || ch == '\r')
11753                 line_pos = 0;
11754         }
11755     }
11756     if (!found)
11757         return unicode_result_unchanged(self);
11758 
11759     /* Second pass: create output string and fill it */
11760     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11761     if (!u)
11762         return NULL;
11763     dest_data = PyUnicode_DATA(u);
11764 
11765     i = j = line_pos = 0;
11766 
11767     for (; i < src_len; i++) {
11768         ch = PyUnicode_READ(kind, src_data, i);
11769         if (ch == '\t') {
11770             if (tabsize > 0) {
11771                 incr = tabsize - (line_pos % tabsize);
11772                 line_pos += incr;
11773                 unicode_fill(kind, dest_data, ' ', j, incr);
11774                 j += incr;
11775             }
11776         }
11777         else {
11778             line_pos++;
11779             PyUnicode_WRITE(kind, dest_data, j, ch);
11780             j++;
11781             if (ch == '\n' || ch == '\r')
11782                 line_pos = 0;
11783         }
11784     }
11785     assert (j == PyUnicode_GET_LENGTH(u));
11786     return unicode_result(u);
11787 
11788   overflow:
11789     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11790     return NULL;
11791 }
11792 
/* Docstring text for the S.find() string method, defined as find__doc__. */
PyDoc_STRVAR(find__doc__,
             "S.find(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end].  Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
11801 
11802 static PyObject *
unicode_find(PyObject * self,PyObject * args)11803 unicode_find(PyObject *self, PyObject *args)
11804 {
11805     /* initialize variables to prevent gcc warning */
11806     PyObject *substring = NULL;
11807     Py_ssize_t start = 0;
11808     Py_ssize_t end = 0;
11809     Py_ssize_t result;
11810 
11811     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11812         return NULL;
11813 
11814     if (PyUnicode_READY(self) == -1)
11815         return NULL;
11816 
11817     result = any_find_slice(self, substring, start, end, 1);
11818 
11819     if (result == -2)
11820         return NULL;
11821 
11822     return PyLong_FromSsize_t(result);
11823 }
11824 
11825 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11826 unicode_getitem(PyObject *self, Py_ssize_t index)
11827 {
11828     const void *data;
11829     enum PyUnicode_Kind kind;
11830     Py_UCS4 ch;
11831 
11832     if (!PyUnicode_Check(self)) {
11833         PyErr_BadArgument();
11834         return NULL;
11835     }
11836     if (PyUnicode_READY(self) == -1) {
11837         return NULL;
11838     }
11839     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11840         PyErr_SetString(PyExc_IndexError, "string index out of range");
11841         return NULL;
11842     }
11843     kind = PyUnicode_KIND(self);
11844     data = PyUnicode_DATA(self);
11845     ch = PyUnicode_READ(kind, data, index);
11846     return unicode_char(ch);
11847 }
11848 
11849 /* Believe it or not, this produces the same value for ASCII strings
11850    as bytes_hash(). */
11851 static Py_hash_t
unicode_hash(PyObject * self)11852 unicode_hash(PyObject *self)
11853 {
11854     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11855 
11856 #ifdef Py_DEBUG
11857     assert(_Py_HashSecret_Initialized);
11858 #endif
11859     if (_PyUnicode_HASH(self) != -1)
11860         return _PyUnicode_HASH(self);
11861     if (PyUnicode_READY(self) == -1)
11862         return -1;
11863 
11864     x = _Py_HashBytes(PyUnicode_DATA(self),
11865                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11866     _PyUnicode_HASH(self) = x;
11867     return x;
11868 }
11869 
/* Docstring text for the S.index() string method, defined as index__doc__. */
PyDoc_STRVAR(index__doc__,
             "S.index(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end].  Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
11878 
11879 static PyObject *
unicode_index(PyObject * self,PyObject * args)11880 unicode_index(PyObject *self, PyObject *args)
11881 {
11882     /* initialize variables to prevent gcc warning */
11883     Py_ssize_t result;
11884     PyObject *substring = NULL;
11885     Py_ssize_t start = 0;
11886     Py_ssize_t end = 0;
11887 
11888     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11889         return NULL;
11890 
11891     if (PyUnicode_READY(self) == -1)
11892         return NULL;
11893 
11894     result = any_find_slice(self, substring, start, end, 1);
11895 
11896     if (result == -2)
11897         return NULL;
11898 
11899     if (result < 0) {
11900         PyErr_SetString(PyExc_ValueError, "substring not found");
11901         return NULL;
11902     }
11903 
11904     return PyLong_FromSsize_t(result);
11905 }
11906 
11907 /*[clinic input]
11908 str.isascii as unicode_isascii
11909 
11910 Return True if all characters in the string are ASCII, False otherwise.
11911 
11912 ASCII characters have code points in the range U+0000-U+007F.
11913 Empty string is ASCII too.
11914 [clinic start generated code]*/
11915 
11916 static PyObject *
unicode_isascii_impl(PyObject * self)11917 unicode_isascii_impl(PyObject *self)
11918 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11919 {
11920     if (PyUnicode_READY(self) == -1) {
11921         return NULL;
11922     }
11923     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11924 }
11925 
11926 /*[clinic input]
11927 str.islower as unicode_islower
11928 
11929 Return True if the string is a lowercase string, False otherwise.
11930 
11931 A string is lowercase if all cased characters in the string are lowercase and
11932 there is at least one cased character in the string.
11933 [clinic start generated code]*/
11934 
11935 static PyObject *
unicode_islower_impl(PyObject * self)11936 unicode_islower_impl(PyObject *self)
11937 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11938 {
11939     Py_ssize_t i, length;
11940     int kind;
11941     const void *data;
11942     int cased;
11943 
11944     if (PyUnicode_READY(self) == -1)
11945         return NULL;
11946     length = PyUnicode_GET_LENGTH(self);
11947     kind = PyUnicode_KIND(self);
11948     data = PyUnicode_DATA(self);
11949 
11950     /* Shortcut for single character strings */
11951     if (length == 1)
11952         return PyBool_FromLong(
11953             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11954 
11955     /* Special case for empty strings */
11956     if (length == 0)
11957         Py_RETURN_FALSE;
11958 
11959     cased = 0;
11960     for (i = 0; i < length; i++) {
11961         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11962 
11963         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11964             Py_RETURN_FALSE;
11965         else if (!cased && Py_UNICODE_ISLOWER(ch))
11966             cased = 1;
11967     }
11968     return PyBool_FromLong(cased);
11969 }
11970 
11971 /*[clinic input]
11972 str.isupper as unicode_isupper
11973 
11974 Return True if the string is an uppercase string, False otherwise.
11975 
11976 A string is uppercase if all cased characters in the string are uppercase and
11977 there is at least one cased character in the string.
11978 [clinic start generated code]*/
11979 
11980 static PyObject *
unicode_isupper_impl(PyObject * self)11981 unicode_isupper_impl(PyObject *self)
11982 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11983 {
11984     Py_ssize_t i, length;
11985     int kind;
11986     const void *data;
11987     int cased;
11988 
11989     if (PyUnicode_READY(self) == -1)
11990         return NULL;
11991     length = PyUnicode_GET_LENGTH(self);
11992     kind = PyUnicode_KIND(self);
11993     data = PyUnicode_DATA(self);
11994 
11995     /* Shortcut for single character strings */
11996     if (length == 1)
11997         return PyBool_FromLong(
11998             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11999 
12000     /* Special case for empty strings */
12001     if (length == 0)
12002         Py_RETURN_FALSE;
12003 
12004     cased = 0;
12005     for (i = 0; i < length; i++) {
12006         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12007 
12008         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12009             Py_RETURN_FALSE;
12010         else if (!cased && Py_UNICODE_ISUPPER(ch))
12011             cased = 1;
12012     }
12013     return PyBool_FromLong(cased);
12014 }
12015 
12016 /*[clinic input]
12017 str.istitle as unicode_istitle
12018 
12019 Return True if the string is a title-cased string, False otherwise.
12020 
12021 In a title-cased string, upper- and title-case characters may only
12022 follow uncased characters and lowercase characters only cased ones.
12023 [clinic start generated code]*/
12024 
12025 static PyObject *
unicode_istitle_impl(PyObject * self)12026 unicode_istitle_impl(PyObject *self)
12027 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12028 {
12029     Py_ssize_t i, length;
12030     int kind;
12031     const void *data;
12032     int cased, previous_is_cased;
12033 
12034     if (PyUnicode_READY(self) == -1)
12035         return NULL;
12036     length = PyUnicode_GET_LENGTH(self);
12037     kind = PyUnicode_KIND(self);
12038     data = PyUnicode_DATA(self);
12039 
12040     /* Shortcut for single character strings */
12041     if (length == 1) {
12042         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12043         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12044                                (Py_UNICODE_ISUPPER(ch) != 0));
12045     }
12046 
12047     /* Special case for empty strings */
12048     if (length == 0)
12049         Py_RETURN_FALSE;
12050 
12051     cased = 0;
12052     previous_is_cased = 0;
12053     for (i = 0; i < length; i++) {
12054         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12055 
12056         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12057             if (previous_is_cased)
12058                 Py_RETURN_FALSE;
12059             previous_is_cased = 1;
12060             cased = 1;
12061         }
12062         else if (Py_UNICODE_ISLOWER(ch)) {
12063             if (!previous_is_cased)
12064                 Py_RETURN_FALSE;
12065             previous_is_cased = 1;
12066             cased = 1;
12067         }
12068         else
12069             previous_is_cased = 0;
12070     }
12071     return PyBool_FromLong(cased);
12072 }
12073 
12074 /*[clinic input]
12075 str.isspace as unicode_isspace
12076 
12077 Return True if the string is a whitespace string, False otherwise.
12078 
12079 A string is whitespace if all characters in the string are whitespace and there
12080 is at least one character in the string.
12081 [clinic start generated code]*/
12082 
12083 static PyObject *
unicode_isspace_impl(PyObject * self)12084 unicode_isspace_impl(PyObject *self)
12085 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12086 {
12087     Py_ssize_t i, length;
12088     int kind;
12089     const void *data;
12090 
12091     if (PyUnicode_READY(self) == -1)
12092         return NULL;
12093     length = PyUnicode_GET_LENGTH(self);
12094     kind = PyUnicode_KIND(self);
12095     data = PyUnicode_DATA(self);
12096 
12097     /* Shortcut for single character strings */
12098     if (length == 1)
12099         return PyBool_FromLong(
12100             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12101 
12102     /* Special case for empty strings */
12103     if (length == 0)
12104         Py_RETURN_FALSE;
12105 
12106     for (i = 0; i < length; i++) {
12107         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12108         if (!Py_UNICODE_ISSPACE(ch))
12109             Py_RETURN_FALSE;
12110     }
12111     Py_RETURN_TRUE;
12112 }
12113 
12114 /*[clinic input]
12115 str.isalpha as unicode_isalpha
12116 
12117 Return True if the string is an alphabetic string, False otherwise.
12118 
12119 A string is alphabetic if all characters in the string are alphabetic and there
12120 is at least one character in the string.
12121 [clinic start generated code]*/
12122 
12123 static PyObject *
unicode_isalpha_impl(PyObject * self)12124 unicode_isalpha_impl(PyObject *self)
12125 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12126 {
12127     Py_ssize_t i, length;
12128     int kind;
12129     const void *data;
12130 
12131     if (PyUnicode_READY(self) == -1)
12132         return NULL;
12133     length = PyUnicode_GET_LENGTH(self);
12134     kind = PyUnicode_KIND(self);
12135     data = PyUnicode_DATA(self);
12136 
12137     /* Shortcut for single character strings */
12138     if (length == 1)
12139         return PyBool_FromLong(
12140             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12141 
12142     /* Special case for empty strings */
12143     if (length == 0)
12144         Py_RETURN_FALSE;
12145 
12146     for (i = 0; i < length; i++) {
12147         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12148             Py_RETURN_FALSE;
12149     }
12150     Py_RETURN_TRUE;
12151 }
12152 
12153 /*[clinic input]
12154 str.isalnum as unicode_isalnum
12155 
12156 Return True if the string is an alpha-numeric string, False otherwise.
12157 
12158 A string is alpha-numeric if all characters in the string are alpha-numeric and
12159 there is at least one character in the string.
12160 [clinic start generated code]*/
12161 
12162 static PyObject *
unicode_isalnum_impl(PyObject * self)12163 unicode_isalnum_impl(PyObject *self)
12164 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12165 {
12166     int kind;
12167     const void *data;
12168     Py_ssize_t len, i;
12169 
12170     if (PyUnicode_READY(self) == -1)
12171         return NULL;
12172 
12173     kind = PyUnicode_KIND(self);
12174     data = PyUnicode_DATA(self);
12175     len = PyUnicode_GET_LENGTH(self);
12176 
12177     /* Shortcut for single character strings */
12178     if (len == 1) {
12179         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12180         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12181     }
12182 
12183     /* Special case for empty strings */
12184     if (len == 0)
12185         Py_RETURN_FALSE;
12186 
12187     for (i = 0; i < len; i++) {
12188         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12189         if (!Py_UNICODE_ISALNUM(ch))
12190             Py_RETURN_FALSE;
12191     }
12192     Py_RETURN_TRUE;
12193 }
12194 
12195 /*[clinic input]
12196 str.isdecimal as unicode_isdecimal
12197 
12198 Return True if the string is a decimal string, False otherwise.
12199 
12200 A string is a decimal string if all characters in the string are decimal and
12201 there is at least one character in the string.
12202 [clinic start generated code]*/
12203 
12204 static PyObject *
unicode_isdecimal_impl(PyObject * self)12205 unicode_isdecimal_impl(PyObject *self)
12206 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12207 {
12208     Py_ssize_t i, length;
12209     int kind;
12210     const void *data;
12211 
12212     if (PyUnicode_READY(self) == -1)
12213         return NULL;
12214     length = PyUnicode_GET_LENGTH(self);
12215     kind = PyUnicode_KIND(self);
12216     data = PyUnicode_DATA(self);
12217 
12218     /* Shortcut for single character strings */
12219     if (length == 1)
12220         return PyBool_FromLong(
12221             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12222 
12223     /* Special case for empty strings */
12224     if (length == 0)
12225         Py_RETURN_FALSE;
12226 
12227     for (i = 0; i < length; i++) {
12228         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12229             Py_RETURN_FALSE;
12230     }
12231     Py_RETURN_TRUE;
12232 }
12233 
12234 /*[clinic input]
12235 str.isdigit as unicode_isdigit
12236 
12237 Return True if the string is a digit string, False otherwise.
12238 
12239 A string is a digit string if all characters in the string are digits and there
12240 is at least one character in the string.
12241 [clinic start generated code]*/
12242 
12243 static PyObject *
unicode_isdigit_impl(PyObject * self)12244 unicode_isdigit_impl(PyObject *self)
12245 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12246 {
12247     Py_ssize_t i, length;
12248     int kind;
12249     const void *data;
12250 
12251     if (PyUnicode_READY(self) == -1)
12252         return NULL;
12253     length = PyUnicode_GET_LENGTH(self);
12254     kind = PyUnicode_KIND(self);
12255     data = PyUnicode_DATA(self);
12256 
12257     /* Shortcut for single character strings */
12258     if (length == 1) {
12259         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12260         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12261     }
12262 
12263     /* Special case for empty strings */
12264     if (length == 0)
12265         Py_RETURN_FALSE;
12266 
12267     for (i = 0; i < length; i++) {
12268         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12269             Py_RETURN_FALSE;
12270     }
12271     Py_RETURN_TRUE;
12272 }
12273 
12274 /*[clinic input]
12275 str.isnumeric as unicode_isnumeric
12276 
12277 Return True if the string is a numeric string, False otherwise.
12278 
12279 A string is numeric if all characters in the string are numeric and there is at
12280 least one character in the string.
12281 [clinic start generated code]*/
12282 
12283 static PyObject *
unicode_isnumeric_impl(PyObject * self)12284 unicode_isnumeric_impl(PyObject *self)
12285 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12286 {
12287     Py_ssize_t i, length;
12288     int kind;
12289     const void *data;
12290 
12291     if (PyUnicode_READY(self) == -1)
12292         return NULL;
12293     length = PyUnicode_GET_LENGTH(self);
12294     kind = PyUnicode_KIND(self);
12295     data = PyUnicode_DATA(self);
12296 
12297     /* Shortcut for single character strings */
12298     if (length == 1)
12299         return PyBool_FromLong(
12300             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12301 
12302     /* Special case for empty strings */
12303     if (length == 0)
12304         Py_RETURN_FALSE;
12305 
12306     for (i = 0; i < length; i++) {
12307         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12308             Py_RETURN_FALSE;
12309     }
12310     Py_RETURN_TRUE;
12311 }
12312 
12313 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12314 _PyUnicode_ScanIdentifier(PyObject *self)
12315 {
12316     Py_ssize_t i;
12317     if (PyUnicode_READY(self) == -1)
12318         return -1;
12319 
12320     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12321     if (len == 0) {
12322         /* an empty string is not a valid identifier */
12323         return 0;
12324     }
12325 
12326     int kind = PyUnicode_KIND(self);
12327     const void *data = PyUnicode_DATA(self);
12328     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12329     /* PEP 3131 says that the first character must be in
12330        XID_Start and subsequent characters in XID_Continue,
12331        and for the ASCII range, the 2.x rules apply (i.e
12332        start with letters and underscore, continue with
12333        letters, digits, underscore). However, given the current
12334        definition of XID_Start and XID_Continue, it is sufficient
12335        to check just for these, except that _ must be allowed
12336        as starting an identifier.  */
12337     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12338         return 0;
12339     }
12340 
12341     for (i = 1; i < len; i++) {
12342         ch = PyUnicode_READ(kind, data, i);
12343         if (!_PyUnicode_IsXidContinue(ch)) {
12344             return i;
12345         }
12346     }
12347     return i;
12348 }
12349 
12350 int
PyUnicode_IsIdentifier(PyObject * self)12351 PyUnicode_IsIdentifier(PyObject *self)
12352 {
12353     if (PyUnicode_IS_READY(self)) {
12354         Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12355         Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12356         /* an empty string is not a valid identifier */
12357         return len && i == len;
12358     }
12359     else {
12360 _Py_COMP_DIAG_PUSH
12361 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12362         Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12363         if (len == 0) {
12364             /* an empty string is not a valid identifier */
12365             return 0;
12366         }
12367 
12368         const wchar_t *wstr = _PyUnicode_WSTR(self);
12369         Py_UCS4 ch = wstr[i++];
12370 #if SIZEOF_WCHAR_T == 2
12371         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12372             && i < len
12373             && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12374         {
12375             ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12376             i++;
12377         }
12378 #endif
12379         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12380             return 0;
12381         }
12382 
12383         while (i < len) {
12384             ch = wstr[i++];
12385 #if SIZEOF_WCHAR_T == 2
12386             if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12387                 && i < len
12388                 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12389             {
12390                 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12391                 i++;
12392             }
12393 #endif
12394             if (!_PyUnicode_IsXidContinue(ch)) {
12395                 return 0;
12396             }
12397         }
12398         return 1;
12399 _Py_COMP_DIAG_POP
12400     }
12401 }
12402 
12403 /*[clinic input]
12404 str.isidentifier as unicode_isidentifier
12405 
12406 Return True if the string is a valid Python identifier, False otherwise.
12407 
12408 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12409 such as "def" or "class".
12410 [clinic start generated code]*/
12411 
12412 static PyObject *
unicode_isidentifier_impl(PyObject * self)12413 unicode_isidentifier_impl(PyObject *self)
12414 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12415 {
12416     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12417 }
12418 
12419 /*[clinic input]
12420 str.isprintable as unicode_isprintable
12421 
12422 Return True if the string is printable, False otherwise.
12423 
12424 A string is printable if all of its characters are considered printable in
12425 repr() or if it is empty.
12426 [clinic start generated code]*/
12427 
12428 static PyObject *
unicode_isprintable_impl(PyObject * self)12429 unicode_isprintable_impl(PyObject *self)
12430 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12431 {
12432     Py_ssize_t i, length;
12433     int kind;
12434     const void *data;
12435 
12436     if (PyUnicode_READY(self) == -1)
12437         return NULL;
12438     length = PyUnicode_GET_LENGTH(self);
12439     kind = PyUnicode_KIND(self);
12440     data = PyUnicode_DATA(self);
12441 
12442     /* Shortcut for single character strings */
12443     if (length == 1)
12444         return PyBool_FromLong(
12445             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12446 
12447     for (i = 0; i < length; i++) {
12448         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12449             Py_RETURN_FALSE;
12450         }
12451     }
12452     Py_RETURN_TRUE;
12453 }
12454 
12455 /*[clinic input]
12456 str.join as unicode_join
12457 
12458     iterable: object
12459     /
12460 
12461 Concatenate any number of strings.
12462 
12463 The string whose method is called is inserted in between each given string.
12464 The result is returned as a new string.
12465 
12466 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12467 [clinic start generated code]*/
12468 
12469 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12470 unicode_join(PyObject *self, PyObject *iterable)
12471 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12472 {
12473     return PyUnicode_Join(self, iterable);
12474 }
12475 
12476 static Py_ssize_t
unicode_length(PyObject * self)12477 unicode_length(PyObject *self)
12478 {
12479     if (PyUnicode_READY(self) == -1)
12480         return -1;
12481     return PyUnicode_GET_LENGTH(self);
12482 }
12483 
12484 /*[clinic input]
12485 str.ljust as unicode_ljust
12486 
12487     width: Py_ssize_t
12488     fillchar: Py_UCS4 = ' '
12489     /
12490 
12491 Return a left-justified string of length width.
12492 
12493 Padding is done using the specified fill character (default is a space).
12494 [clinic start generated code]*/
12495 
12496 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12497 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12498 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12499 {
12500     if (PyUnicode_READY(self) == -1)
12501         return NULL;
12502 
12503     if (PyUnicode_GET_LENGTH(self) >= width)
12504         return unicode_result_unchanged(self);
12505 
12506     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12507 }
12508 
12509 /*[clinic input]
12510 str.lower as unicode_lower
12511 
12512 Return a copy of the string converted to lowercase.
12513 [clinic start generated code]*/
12514 
12515 static PyObject *
unicode_lower_impl(PyObject * self)12516 unicode_lower_impl(PyObject *self)
12517 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12518 {
12519     if (PyUnicode_READY(self) == -1)
12520         return NULL;
12521     if (PyUnicode_IS_ASCII(self))
12522         return ascii_upper_or_lower(self, 1);
12523     return case_operation(self, do_lower);
12524 }
12525 
/* Selector values for the strip family.  They double as indices into
   stripfuncnames below, so the numbering must stay in sync with it. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above (used when formatting the
   "arg must be None or str" error).  Both the strings and the table slots
   are read-only, so the array itself is declared const as well. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip selector `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
12534 
12535 /* externally visible for str.strip(unicode) */
12536 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12537 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12538 {
12539     const void *data;
12540     int kind;
12541     Py_ssize_t i, j, len;
12542     BLOOM_MASK sepmask;
12543     Py_ssize_t seplen;
12544 
12545     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12546         return NULL;
12547 
12548     kind = PyUnicode_KIND(self);
12549     data = PyUnicode_DATA(self);
12550     len = PyUnicode_GET_LENGTH(self);
12551     seplen = PyUnicode_GET_LENGTH(sepobj);
12552     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12553                               PyUnicode_DATA(sepobj),
12554                               seplen);
12555 
12556     i = 0;
12557     if (striptype != RIGHTSTRIP) {
12558         while (i < len) {
12559             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12560             if (!BLOOM(sepmask, ch))
12561                 break;
12562             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12563                 break;
12564             i++;
12565         }
12566     }
12567 
12568     j = len;
12569     if (striptype != LEFTSTRIP) {
12570         j--;
12571         while (j >= i) {
12572             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12573             if (!BLOOM(sepmask, ch))
12574                 break;
12575             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12576                 break;
12577             j--;
12578         }
12579 
12580         j++;
12581     }
12582 
12583     return PyUnicode_Substring(self, i, j);
12584 }
12585 
12586 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12587 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12588 {
12589     const unsigned char *data;
12590     int kind;
12591     Py_ssize_t length;
12592 
12593     if (PyUnicode_READY(self) == -1)
12594         return NULL;
12595 
12596     length = PyUnicode_GET_LENGTH(self);
12597     end = Py_MIN(end, length);
12598 
12599     if (start == 0 && end == length)
12600         return unicode_result_unchanged(self);
12601 
12602     if (start < 0 || end < 0) {
12603         PyErr_SetString(PyExc_IndexError, "string index out of range");
12604         return NULL;
12605     }
12606     if (start >= length || end < start)
12607         _Py_RETURN_UNICODE_EMPTY();
12608 
12609     length = end - start;
12610     if (PyUnicode_IS_ASCII(self)) {
12611         data = PyUnicode_1BYTE_DATA(self);
12612         return _PyUnicode_FromASCII((const char*)(data + start), length);
12613     }
12614     else {
12615         kind = PyUnicode_KIND(self);
12616         data = PyUnicode_1BYTE_DATA(self);
12617         return PyUnicode_FromKindAndData(kind,
12618                                          data + kind * start,
12619                                          length);
12620     }
12621 }
12622 
12623 static PyObject *
do_strip(PyObject * self,int striptype)12624 do_strip(PyObject *self, int striptype)
12625 {
12626     Py_ssize_t len, i, j;
12627 
12628     if (PyUnicode_READY(self) == -1)
12629         return NULL;
12630 
12631     len = PyUnicode_GET_LENGTH(self);
12632 
12633     if (PyUnicode_IS_ASCII(self)) {
12634         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12635 
12636         i = 0;
12637         if (striptype != RIGHTSTRIP) {
12638             while (i < len) {
12639                 Py_UCS1 ch = data[i];
12640                 if (!_Py_ascii_whitespace[ch])
12641                     break;
12642                 i++;
12643             }
12644         }
12645 
12646         j = len;
12647         if (striptype != LEFTSTRIP) {
12648             j--;
12649             while (j >= i) {
12650                 Py_UCS1 ch = data[j];
12651                 if (!_Py_ascii_whitespace[ch])
12652                     break;
12653                 j--;
12654             }
12655             j++;
12656         }
12657     }
12658     else {
12659         int kind = PyUnicode_KIND(self);
12660         const void *data = PyUnicode_DATA(self);
12661 
12662         i = 0;
12663         if (striptype != RIGHTSTRIP) {
12664             while (i < len) {
12665                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12666                 if (!Py_UNICODE_ISSPACE(ch))
12667                     break;
12668                 i++;
12669             }
12670         }
12671 
12672         j = len;
12673         if (striptype != LEFTSTRIP) {
12674             j--;
12675             while (j >= i) {
12676                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12677                 if (!Py_UNICODE_ISSPACE(ch))
12678                     break;
12679                 j--;
12680             }
12681             j++;
12682         }
12683     }
12684 
12685     return PyUnicode_Substring(self, i, j);
12686 }
12687 
12688 
12689 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12690 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12691 {
12692     if (sep != Py_None) {
12693         if (PyUnicode_Check(sep))
12694             return _PyUnicode_XStrip(self, striptype, sep);
12695         else {
12696             PyErr_Format(PyExc_TypeError,
12697                          "%s arg must be None or str",
12698                          STRIPNAME(striptype));
12699             return NULL;
12700         }
12701     }
12702 
12703     return do_strip(self, striptype);
12704 }
12705 
12706 
12707 /*[clinic input]
12708 str.strip as unicode_strip
12709 
12710     chars: object = None
12711     /
12712 
12713 Return a copy of the string with leading and trailing whitespace removed.
12714 
12715 If chars is given and not None, remove characters in chars instead.
12716 [clinic start generated code]*/
12717 
12718 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12719 unicode_strip_impl(PyObject *self, PyObject *chars)
12720 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12721 {
12722     return do_argstrip(self, BOTHSTRIP, chars);
12723 }
12724 
12725 
12726 /*[clinic input]
12727 str.lstrip as unicode_lstrip
12728 
12729     chars: object = None
12730     /
12731 
12732 Return a copy of the string with leading whitespace removed.
12733 
12734 If chars is given and not None, remove characters in chars instead.
12735 [clinic start generated code]*/
12736 
12737 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12738 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12739 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12740 {
12741     return do_argstrip(self, LEFTSTRIP, chars);
12742 }
12743 
12744 
12745 /*[clinic input]
12746 str.rstrip as unicode_rstrip
12747 
12748     chars: object = None
12749     /
12750 
12751 Return a copy of the string with trailing whitespace removed.
12752 
12753 If chars is given and not None, remove characters in chars instead.
12754 [clinic start generated code]*/
12755 
12756 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12757 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12758 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12759 {
12760     return do_argstrip(self, RIGHTSTRIP, chars);
12761 }
12762 
12763 
12764 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12765 unicode_repeat(PyObject *str, Py_ssize_t len)
12766 {
12767     PyObject *u;
12768     Py_ssize_t nchars, n;
12769 
12770     if (len < 1)
12771         _Py_RETURN_UNICODE_EMPTY();
12772 
12773     /* no repeat, return original string */
12774     if (len == 1)
12775         return unicode_result_unchanged(str);
12776 
12777     if (PyUnicode_READY(str) == -1)
12778         return NULL;
12779 
12780     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12781         PyErr_SetString(PyExc_OverflowError,
12782                         "repeated string is too long");
12783         return NULL;
12784     }
12785     nchars = len * PyUnicode_GET_LENGTH(str);
12786 
12787     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12788     if (!u)
12789         return NULL;
12790     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12791 
12792     if (PyUnicode_GET_LENGTH(str) == 1) {
12793         int kind = PyUnicode_KIND(str);
12794         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12795         if (kind == PyUnicode_1BYTE_KIND) {
12796             void *to = PyUnicode_DATA(u);
12797             memset(to, (unsigned char)fill_char, len);
12798         }
12799         else if (kind == PyUnicode_2BYTE_KIND) {
12800             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12801             for (n = 0; n < len; ++n)
12802                 ucs2[n] = fill_char;
12803         } else {
12804             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12805             assert(kind == PyUnicode_4BYTE_KIND);
12806             for (n = 0; n < len; ++n)
12807                 ucs4[n] = fill_char;
12808         }
12809     }
12810     else {
12811         Py_ssize_t char_size = PyUnicode_KIND(str);
12812         char *to = (char *) PyUnicode_DATA(u);
12813         _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12814             PyUnicode_GET_LENGTH(str) * char_size);
12815     }
12816 
12817     assert(_PyUnicode_CheckConsistency(u, 1));
12818     return u;
12819 }
12820 
12821 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12822 PyUnicode_Replace(PyObject *str,
12823                   PyObject *substr,
12824                   PyObject *replstr,
12825                   Py_ssize_t maxcount)
12826 {
12827     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12828             ensure_unicode(replstr) < 0)
12829         return NULL;
12830     return replace(str, substr, replstr, maxcount);
12831 }
12832 
12833 /*[clinic input]
12834 str.replace as unicode_replace
12835 
12836     old: unicode
12837     new: unicode
12838     count: Py_ssize_t = -1
12839         Maximum number of occurrences to replace.
12840         -1 (the default value) means replace all occurrences.
12841     /
12842 
12843 Return a copy with all occurrences of substring old replaced by new.
12844 
12845 If the optional argument count is given, only the first count occurrences are
12846 replaced.
12847 [clinic start generated code]*/
12848 
12849 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12850 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12851                      Py_ssize_t count)
12852 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12853 {
12854     if (PyUnicode_READY(self) == -1)
12855         return NULL;
12856     return replace(self, old, new, count);
12857 }
12858 
12859 /*[clinic input]
12860 str.removeprefix as unicode_removeprefix
12861 
12862     prefix: unicode
12863     /
12864 
12865 Return a str with the given prefix string removed if present.
12866 
12867 If the string starts with the prefix string, return string[len(prefix):].
12868 Otherwise, return a copy of the original string.
12869 [clinic start generated code]*/
12870 
12871 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12872 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12873 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12874 {
12875     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12876     if (match == -1) {
12877         return NULL;
12878     }
12879     if (match) {
12880         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12881                                    PyUnicode_GET_LENGTH(self));
12882     }
12883     return unicode_result_unchanged(self);
12884 }
12885 
12886 /*[clinic input]
12887 str.removesuffix as unicode_removesuffix
12888 
12889     suffix: unicode
12890     /
12891 
12892 Return a str with the given suffix string removed if present.
12893 
12894 If the string ends with the suffix string and that suffix is not empty,
12895 return string[:-len(suffix)]. Otherwise, return a copy of the original
12896 string.
12897 [clinic start generated code]*/
12898 
12899 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12900 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12901 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12902 {
12903     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12904     if (match == -1) {
12905         return NULL;
12906     }
12907     if (match) {
12908         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12909                                             - PyUnicode_GET_LENGTH(suffix));
12910     }
12911     return unicode_result_unchanged(self);
12912 }
12913 
12914 static PyObject *
unicode_repr(PyObject * unicode)12915 unicode_repr(PyObject *unicode)
12916 {
12917     PyObject *repr;
12918     Py_ssize_t isize;
12919     Py_ssize_t osize, squote, dquote, i, o;
12920     Py_UCS4 max, quote;
12921     int ikind, okind, unchanged;
12922     const void *idata;
12923     void *odata;
12924 
12925     if (PyUnicode_READY(unicode) == -1)
12926         return NULL;
12927 
12928     isize = PyUnicode_GET_LENGTH(unicode);
12929     idata = PyUnicode_DATA(unicode);
12930 
12931     /* Compute length of output, quote characters, and
12932        maximum character */
12933     osize = 0;
12934     max = 127;
12935     squote = dquote = 0;
12936     ikind = PyUnicode_KIND(unicode);
12937     for (i = 0; i < isize; i++) {
12938         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12939         Py_ssize_t incr = 1;
12940         switch (ch) {
12941         case '\'': squote++; break;
12942         case '"':  dquote++; break;
12943         case '\\': case '\t': case '\r': case '\n':
12944             incr = 2;
12945             break;
12946         default:
12947             /* Fast-path ASCII */
12948             if (ch < ' ' || ch == 0x7f)
12949                 incr = 4; /* \xHH */
12950             else if (ch < 0x7f)
12951                 ;
12952             else if (Py_UNICODE_ISPRINTABLE(ch))
12953                 max = ch > max ? ch : max;
12954             else if (ch < 0x100)
12955                 incr = 4; /* \xHH */
12956             else if (ch < 0x10000)
12957                 incr = 6; /* \uHHHH */
12958             else
12959                 incr = 10; /* \uHHHHHHHH */
12960         }
12961         if (osize > PY_SSIZE_T_MAX - incr) {
12962             PyErr_SetString(PyExc_OverflowError,
12963                             "string is too long to generate repr");
12964             return NULL;
12965         }
12966         osize += incr;
12967     }
12968 
12969     quote = '\'';
12970     unchanged = (osize == isize);
12971     if (squote) {
12972         unchanged = 0;
12973         if (dquote)
12974             /* Both squote and dquote present. Use squote,
12975                and escape them */
12976             osize += squote;
12977         else
12978             quote = '"';
12979     }
12980     osize += 2;   /* quotes */
12981 
12982     repr = PyUnicode_New(osize, max);
12983     if (repr == NULL)
12984         return NULL;
12985     okind = PyUnicode_KIND(repr);
12986     odata = PyUnicode_DATA(repr);
12987 
12988     PyUnicode_WRITE(okind, odata, 0, quote);
12989     PyUnicode_WRITE(okind, odata, osize-1, quote);
12990     if (unchanged) {
12991         _PyUnicode_FastCopyCharacters(repr, 1,
12992                                       unicode, 0,
12993                                       isize);
12994     }
12995     else {
12996         for (i = 0, o = 1; i < isize; i++) {
12997             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12998 
12999             /* Escape quotes and backslashes */
13000             if ((ch == quote) || (ch == '\\')) {
13001                 PyUnicode_WRITE(okind, odata, o++, '\\');
13002                 PyUnicode_WRITE(okind, odata, o++, ch);
13003                 continue;
13004             }
13005 
13006             /* Map special whitespace to '\t', \n', '\r' */
13007             if (ch == '\t') {
13008                 PyUnicode_WRITE(okind, odata, o++, '\\');
13009                 PyUnicode_WRITE(okind, odata, o++, 't');
13010             }
13011             else if (ch == '\n') {
13012                 PyUnicode_WRITE(okind, odata, o++, '\\');
13013                 PyUnicode_WRITE(okind, odata, o++, 'n');
13014             }
13015             else if (ch == '\r') {
13016                 PyUnicode_WRITE(okind, odata, o++, '\\');
13017                 PyUnicode_WRITE(okind, odata, o++, 'r');
13018             }
13019 
13020             /* Map non-printable US ASCII to '\xhh' */
13021             else if (ch < ' ' || ch == 0x7F) {
13022                 PyUnicode_WRITE(okind, odata, o++, '\\');
13023                 PyUnicode_WRITE(okind, odata, o++, 'x');
13024                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13025                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13026             }
13027 
13028             /* Copy ASCII characters as-is */
13029             else if (ch < 0x7F) {
13030                 PyUnicode_WRITE(okind, odata, o++, ch);
13031             }
13032 
13033             /* Non-ASCII characters */
13034             else {
13035                 /* Map Unicode whitespace and control characters
13036                    (categories Z* and C* except ASCII space)
13037                 */
13038                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13039                     PyUnicode_WRITE(okind, odata, o++, '\\');
13040                     /* Map 8-bit characters to '\xhh' */
13041                     if (ch <= 0xff) {
13042                         PyUnicode_WRITE(okind, odata, o++, 'x');
13043                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13044                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13045                     }
13046                     /* Map 16-bit characters to '\uxxxx' */
13047                     else if (ch <= 0xffff) {
13048                         PyUnicode_WRITE(okind, odata, o++, 'u');
13049                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13050                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13051                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13052                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13053                     }
13054                     /* Map 21-bit characters to '\U00xxxxxx' */
13055                     else {
13056                         PyUnicode_WRITE(okind, odata, o++, 'U');
13057                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13058                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13059                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13060                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13061                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13062                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13063                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13064                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13065                     }
13066                 }
13067                 /* Copy characters as-is */
13068                 else {
13069                     PyUnicode_WRITE(okind, odata, o++, ch);
13070                 }
13071             }
13072         }
13073     }
13074     /* Closing quote already added at the beginning */
13075     assert(_PyUnicode_CheckConsistency(repr, 1));
13076     return repr;
13077 }
13078 
13079 PyDoc_STRVAR(rfind__doc__,
13080              "S.rfind(sub[, start[, end]]) -> int\n\
13081 \n\
13082 Return the highest index in S where substring sub is found,\n\
13083 such that sub is contained within S[start:end].  Optional\n\
13084 arguments start and end are interpreted as in slice notation.\n\
13085 \n\
13086 Return -1 on failure.");
13087 
13088 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13089 unicode_rfind(PyObject *self, PyObject *args)
13090 {
13091     /* initialize variables to prevent gcc warning */
13092     PyObject *substring = NULL;
13093     Py_ssize_t start = 0;
13094     Py_ssize_t end = 0;
13095     Py_ssize_t result;
13096 
13097     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13098         return NULL;
13099 
13100     if (PyUnicode_READY(self) == -1)
13101         return NULL;
13102 
13103     result = any_find_slice(self, substring, start, end, -1);
13104 
13105     if (result == -2)
13106         return NULL;
13107 
13108     return PyLong_FromSsize_t(result);
13109 }
13110 
13111 PyDoc_STRVAR(rindex__doc__,
13112              "S.rindex(sub[, start[, end]]) -> int\n\
13113 \n\
13114 Return the highest index in S where substring sub is found,\n\
13115 such that sub is contained within S[start:end].  Optional\n\
13116 arguments start and end are interpreted as in slice notation.\n\
13117 \n\
13118 Raises ValueError when the substring is not found.");
13119 
13120 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13121 unicode_rindex(PyObject *self, PyObject *args)
13122 {
13123     /* initialize variables to prevent gcc warning */
13124     PyObject *substring = NULL;
13125     Py_ssize_t start = 0;
13126     Py_ssize_t end = 0;
13127     Py_ssize_t result;
13128 
13129     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13130         return NULL;
13131 
13132     if (PyUnicode_READY(self) == -1)
13133         return NULL;
13134 
13135     result = any_find_slice(self, substring, start, end, -1);
13136 
13137     if (result == -2)
13138         return NULL;
13139 
13140     if (result < 0) {
13141         PyErr_SetString(PyExc_ValueError, "substring not found");
13142         return NULL;
13143     }
13144 
13145     return PyLong_FromSsize_t(result);
13146 }
13147 
13148 /*[clinic input]
13149 str.rjust as unicode_rjust
13150 
13151     width: Py_ssize_t
13152     fillchar: Py_UCS4 = ' '
13153     /
13154 
13155 Return a right-justified string of length width.
13156 
13157 Padding is done using the specified fill character (default is a space).
13158 [clinic start generated code]*/
13159 
13160 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13161 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13162 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13163 {
13164     if (PyUnicode_READY(self) == -1)
13165         return NULL;
13166 
13167     if (PyUnicode_GET_LENGTH(self) >= width)
13168         return unicode_result_unchanged(self);
13169 
13170     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13171 }
13172 
13173 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13174 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13175 {
13176     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13177         return NULL;
13178 
13179     return split(s, sep, maxsplit);
13180 }
13181 
13182 /*[clinic input]
13183 str.split as unicode_split
13184 
13185     sep: object = None
13186         The separator used to split the string.
13187 
13188         When set to None (the default value), will split on any whitespace
13189         character (including \\n \\r \\t \\f and spaces) and will discard
13190         empty strings from the result.
13191     maxsplit: Py_ssize_t = -1
13192         Maximum number of splits (starting from the left).
13193         -1 (the default value) means no limit.
13194 
13195 Return a list of the substrings in the string, using sep as the separator string.
13196 
13197 Note, str.split() is mainly useful for data that has been intentionally
13198 delimited.  With natural text that includes punctuation, consider using
13199 the regular expression module.
13200 
13201 [clinic start generated code]*/
13202 
13203 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13204 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13205 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13206 {
13207     if (sep == Py_None)
13208         return split(self, NULL, maxsplit);
13209     if (PyUnicode_Check(sep))
13210         return split(self, sep, maxsplit);
13211 
13212     PyErr_Format(PyExc_TypeError,
13213                  "must be str or None, not %.100s",
13214                  Py_TYPE(sep)->tp_name);
13215     return NULL;
13216 }
13217 
13218 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13219 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13220 {
13221     PyObject* out;
13222     int kind1, kind2;
13223     const void *buf1, *buf2;
13224     Py_ssize_t len1, len2;
13225 
13226     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13227         return NULL;
13228 
13229     kind1 = PyUnicode_KIND(str_obj);
13230     kind2 = PyUnicode_KIND(sep_obj);
13231     len1 = PyUnicode_GET_LENGTH(str_obj);
13232     len2 = PyUnicode_GET_LENGTH(sep_obj);
13233     if (kind1 < kind2 || len1 < len2) {
13234         PyObject *empty = unicode_get_empty();  // Borrowed reference
13235         return PyTuple_Pack(3, str_obj, empty, empty);
13236     }
13237     buf1 = PyUnicode_DATA(str_obj);
13238     buf2 = PyUnicode_DATA(sep_obj);
13239     if (kind2 != kind1) {
13240         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13241         if (!buf2)
13242             return NULL;
13243     }
13244 
13245     switch (kind1) {
13246     case PyUnicode_1BYTE_KIND:
13247         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13248             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13249         else
13250             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13251         break;
13252     case PyUnicode_2BYTE_KIND:
13253         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13254         break;
13255     case PyUnicode_4BYTE_KIND:
13256         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13257         break;
13258     default:
13259         Py_UNREACHABLE();
13260     }
13261 
13262     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13263     if (kind2 != kind1)
13264         PyMem_Free((void *)buf2);
13265 
13266     return out;
13267 }
13268 
13269 
13270 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13271 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13272 {
13273     PyObject* out;
13274     int kind1, kind2;
13275     const void *buf1, *buf2;
13276     Py_ssize_t len1, len2;
13277 
13278     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13279         return NULL;
13280 
13281     kind1 = PyUnicode_KIND(str_obj);
13282     kind2 = PyUnicode_KIND(sep_obj);
13283     len1 = PyUnicode_GET_LENGTH(str_obj);
13284     len2 = PyUnicode_GET_LENGTH(sep_obj);
13285     if (kind1 < kind2 || len1 < len2) {
13286         PyObject *empty = unicode_get_empty();  // Borrowed reference
13287         return PyTuple_Pack(3, empty, empty, str_obj);
13288     }
13289     buf1 = PyUnicode_DATA(str_obj);
13290     buf2 = PyUnicode_DATA(sep_obj);
13291     if (kind2 != kind1) {
13292         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13293         if (!buf2)
13294             return NULL;
13295     }
13296 
13297     switch (kind1) {
13298     case PyUnicode_1BYTE_KIND:
13299         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13300             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13301         else
13302             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13303         break;
13304     case PyUnicode_2BYTE_KIND:
13305         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13306         break;
13307     case PyUnicode_4BYTE_KIND:
13308         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13309         break;
13310     default:
13311         Py_UNREACHABLE();
13312     }
13313 
13314     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13315     if (kind2 != kind1)
13316         PyMem_Free((void *)buf2);
13317 
13318     return out;
13319 }
13320 
13321 /*[clinic input]
13322 str.partition as unicode_partition
13323 
13324     sep: object
13325     /
13326 
13327 Partition the string into three parts using the given separator.
13328 
13329 This will search for the separator in the string.  If the separator is found,
13330 returns a 3-tuple containing the part before the separator, the separator
13331 itself, and the part after it.
13332 
13333 If the separator is not found, returns a 3-tuple containing the original string
13334 and two empty strings.
13335 [clinic start generated code]*/
13336 
13337 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13338 unicode_partition(PyObject *self, PyObject *sep)
13339 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13340 {
13341     return PyUnicode_Partition(self, sep);
13342 }
13343 
13344 /*[clinic input]
13345 str.rpartition as unicode_rpartition = str.partition
13346 
13347 Partition the string into three parts using the given separator.
13348 
13349 This will search for the separator in the string, starting at the end. If
13350 the separator is found, returns a 3-tuple containing the part before the
13351 separator, the separator itself, and the part after it.
13352 
13353 If the separator is not found, returns a 3-tuple containing two empty strings
13354 and the original string.
13355 [clinic start generated code]*/
13356 
13357 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13358 unicode_rpartition(PyObject *self, PyObject *sep)
13359 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13360 {
13361     return PyUnicode_RPartition(self, sep);
13362 }
13363 
13364 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13365 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13366 {
13367     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13368         return NULL;
13369 
13370     return rsplit(s, sep, maxsplit);
13371 }
13372 
13373 /*[clinic input]
13374 str.rsplit as unicode_rsplit = str.split
13375 
13376 Return a list of the substrings in the string, using sep as the separator string.
13377 
13378 Splitting starts at the end of the string and works to the front.
13379 [clinic start generated code]*/
13380 
13381 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13382 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13383 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13384 {
13385     if (sep == Py_None)
13386         return rsplit(self, NULL, maxsplit);
13387     if (PyUnicode_Check(sep))
13388         return rsplit(self, sep, maxsplit);
13389 
13390     PyErr_Format(PyExc_TypeError,
13391                  "must be str or None, not %.100s",
13392                  Py_TYPE(sep)->tp_name);
13393     return NULL;
13394 }
13395 
13396 /*[clinic input]
13397 str.splitlines as unicode_splitlines
13398 
13399     keepends: bool(accept={int}) = False
13400 
13401 Return a list of the lines in the string, breaking at line boundaries.
13402 
13403 Line breaks are not included in the resulting list unless keepends is given and
13404 true.
13405 [clinic start generated code]*/
13406 
13407 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13408 unicode_splitlines_impl(PyObject *self, int keepends)
13409 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13410 {
13411     return PyUnicode_Splitlines(self, keepends);
13412 }
13413 
13414 static
unicode_str(PyObject * self)13415 PyObject *unicode_str(PyObject *self)
13416 {
13417     return unicode_result_unchanged(self);
13418 }
13419 
13420 /*[clinic input]
13421 str.swapcase as unicode_swapcase
13422 
13423 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13424 [clinic start generated code]*/
13425 
13426 static PyObject *
unicode_swapcase_impl(PyObject * self)13427 unicode_swapcase_impl(PyObject *self)
13428 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13429 {
13430     if (PyUnicode_READY(self) == -1)
13431         return NULL;
13432     return case_operation(self, do_swapcase);
13433 }
13434 
13435 /*[clinic input]
13436 
13437 @staticmethod
13438 str.maketrans as unicode_maketrans
13439 
13440   x: object
13441 
13442   y: unicode=NULL
13443 
13444   z: unicode=NULL
13445 
13446   /
13447 
13448 Return a translation table usable for str.translate().
13449 
13450 If there is only one argument, it must be a dictionary mapping Unicode
13451 ordinals (integers) or characters to Unicode ordinals, strings or None.
13452 Character keys will be then converted to ordinals.
13453 If there are two arguments, they must be strings of equal length, and
13454 in the resulting dictionary, each character in x will be mapped to the
13455 character at the same position in y. If there is a third argument, it
13456 must be a string, whose characters will be mapped to None in the result.
13457 [clinic start generated code]*/
13458 
13459 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13460 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13461 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13462 {
13463     PyObject *new = NULL, *key, *value;
13464     Py_ssize_t i = 0;
13465     int res;
13466 
13467     new = PyDict_New();
13468     if (!new)
13469         return NULL;
13470     if (y != NULL) {
13471         int x_kind, y_kind, z_kind;
13472         const void *x_data, *y_data, *z_data;
13473 
13474         /* x must be a string too, of equal length */
13475         if (!PyUnicode_Check(x)) {
13476             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13477                             "be a string if there is a second argument");
13478             goto err;
13479         }
13480         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13481             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13482                             "arguments must have equal length");
13483             goto err;
13484         }
13485         /* create entries for translating chars in x to those in y */
13486         x_kind = PyUnicode_KIND(x);
13487         y_kind = PyUnicode_KIND(y);
13488         x_data = PyUnicode_DATA(x);
13489         y_data = PyUnicode_DATA(y);
13490         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13491             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13492             if (!key)
13493                 goto err;
13494             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13495             if (!value) {
13496                 Py_DECREF(key);
13497                 goto err;
13498             }
13499             res = PyDict_SetItem(new, key, value);
13500             Py_DECREF(key);
13501             Py_DECREF(value);
13502             if (res < 0)
13503                 goto err;
13504         }
13505         /* create entries for deleting chars in z */
13506         if (z != NULL) {
13507             z_kind = PyUnicode_KIND(z);
13508             z_data = PyUnicode_DATA(z);
13509             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13510                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13511                 if (!key)
13512                     goto err;
13513                 res = PyDict_SetItem(new, key, Py_None);
13514                 Py_DECREF(key);
13515                 if (res < 0)
13516                     goto err;
13517             }
13518         }
13519     } else {
13520         int kind;
13521         const void *data;
13522 
13523         /* x must be a dict */
13524         if (!PyDict_CheckExact(x)) {
13525             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13526                             "to maketrans it must be a dict");
13527             goto err;
13528         }
13529         /* copy entries into the new dict, converting string keys to int keys */
13530         while (PyDict_Next(x, &i, &key, &value)) {
13531             if (PyUnicode_Check(key)) {
13532                 /* convert string keys to integer keys */
13533                 PyObject *newkey;
13534                 if (PyUnicode_GET_LENGTH(key) != 1) {
13535                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13536                                     "table must be of length 1");
13537                     goto err;
13538                 }
13539                 kind = PyUnicode_KIND(key);
13540                 data = PyUnicode_DATA(key);
13541                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13542                 if (!newkey)
13543                     goto err;
13544                 res = PyDict_SetItem(new, newkey, value);
13545                 Py_DECREF(newkey);
13546                 if (res < 0)
13547                     goto err;
13548             } else if (PyLong_Check(key)) {
13549                 /* just keep integer keys */
13550                 if (PyDict_SetItem(new, key, value) < 0)
13551                     goto err;
13552             } else {
13553                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13554                                 "be strings or integers");
13555                 goto err;
13556             }
13557         }
13558     }
13559     return new;
13560   err:
13561     Py_DECREF(new);
13562     return NULL;
13563 }
13564 
13565 /*[clinic input]
13566 str.translate as unicode_translate
13567 
13568     table: object
13569         Translation table, which must be a mapping of Unicode ordinals to
13570         Unicode ordinals, strings, or None.
13571     /
13572 
13573 Replace each character in the string using the given translation table.
13574 
13575 The table must implement lookup/indexing via __getitem__, for instance a
13576 dictionary or list.  If this operation raises LookupError, the character is
13577 left untouched.  Characters mapped to None are deleted.
13578 [clinic start generated code]*/
13579 
13580 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13581 unicode_translate(PyObject *self, PyObject *table)
13582 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13583 {
13584     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13585 }
13586 
13587 /*[clinic input]
13588 str.upper as unicode_upper
13589 
13590 Return a copy of the string converted to uppercase.
13591 [clinic start generated code]*/
13592 
13593 static PyObject *
unicode_upper_impl(PyObject * self)13594 unicode_upper_impl(PyObject *self)
13595 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13596 {
13597     if (PyUnicode_READY(self) == -1)
13598         return NULL;
13599     if (PyUnicode_IS_ASCII(self))
13600         return ascii_upper_or_lower(self, 0);
13601     return case_operation(self, do_upper);
13602 }
13603 
13604 /*[clinic input]
13605 str.zfill as unicode_zfill
13606 
13607     width: Py_ssize_t
13608     /
13609 
13610 Pad a numeric string with zeros on the left, to fill a field of the given width.
13611 
13612 The string is never truncated.
13613 [clinic start generated code]*/
13614 
13615 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13616 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13617 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13618 {
13619     Py_ssize_t fill;
13620     PyObject *u;
13621     int kind;
13622     const void *data;
13623     Py_UCS4 chr;
13624 
13625     if (PyUnicode_READY(self) == -1)
13626         return NULL;
13627 
13628     if (PyUnicode_GET_LENGTH(self) >= width)
13629         return unicode_result_unchanged(self);
13630 
13631     fill = width - PyUnicode_GET_LENGTH(self);
13632 
13633     u = pad(self, fill, 0, '0');
13634 
13635     if (u == NULL)
13636         return NULL;
13637 
13638     kind = PyUnicode_KIND(u);
13639     data = PyUnicode_DATA(u);
13640     chr = PyUnicode_READ(kind, data, fill);
13641 
13642     if (chr == '+' || chr == '-') {
13643         /* move sign to beginning of string */
13644         PyUnicode_WRITE(kind, data, 0, chr);
13645         PyUnicode_WRITE(kind, data, fill, '0');
13646     }
13647 
13648     assert(_PyUnicode_CheckConsistency(u, 1));
13649     return u;
13650 }
13651 
13652 PyDoc_STRVAR(startswith__doc__,
13653              "S.startswith(prefix[, start[, end]]) -> bool\n\
13654 \n\
13655 Return True if S starts with the specified prefix, False otherwise.\n\
13656 With optional start, test S beginning at that position.\n\
13657 With optional end, stop comparing S at that position.\n\
13658 prefix can also be a tuple of strings to try.");
13659 
13660 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13661 unicode_startswith(PyObject *self,
13662                    PyObject *args)
13663 {
13664     PyObject *subobj;
13665     PyObject *substring;
13666     Py_ssize_t start = 0;
13667     Py_ssize_t end = PY_SSIZE_T_MAX;
13668     int result;
13669 
13670     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13671         return NULL;
13672     if (PyTuple_Check(subobj)) {
13673         Py_ssize_t i;
13674         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13675             substring = PyTuple_GET_ITEM(subobj, i);
13676             if (!PyUnicode_Check(substring)) {
13677                 PyErr_Format(PyExc_TypeError,
13678                              "tuple for startswith must only contain str, "
13679                              "not %.100s",
13680                              Py_TYPE(substring)->tp_name);
13681                 return NULL;
13682             }
13683             result = tailmatch(self, substring, start, end, -1);
13684             if (result == -1)
13685                 return NULL;
13686             if (result) {
13687                 Py_RETURN_TRUE;
13688             }
13689         }
13690         /* nothing matched */
13691         Py_RETURN_FALSE;
13692     }
13693     if (!PyUnicode_Check(subobj)) {
13694         PyErr_Format(PyExc_TypeError,
13695                      "startswith first arg must be str or "
13696                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13697         return NULL;
13698     }
13699     result = tailmatch(self, subobj, start, end, -1);
13700     if (result == -1)
13701         return NULL;
13702     return PyBool_FromLong(result);
13703 }
13704 
13705 
13706 PyDoc_STRVAR(endswith__doc__,
13707              "S.endswith(suffix[, start[, end]]) -> bool\n\
13708 \n\
13709 Return True if S ends with the specified suffix, False otherwise.\n\
13710 With optional start, test S beginning at that position.\n\
13711 With optional end, stop comparing S at that position.\n\
13712 suffix can also be a tuple of strings to try.");
13713 
13714 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13715 unicode_endswith(PyObject *self,
13716                  PyObject *args)
13717 {
13718     PyObject *subobj;
13719     PyObject *substring;
13720     Py_ssize_t start = 0;
13721     Py_ssize_t end = PY_SSIZE_T_MAX;
13722     int result;
13723 
13724     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13725         return NULL;
13726     if (PyTuple_Check(subobj)) {
13727         Py_ssize_t i;
13728         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13729             substring = PyTuple_GET_ITEM(subobj, i);
13730             if (!PyUnicode_Check(substring)) {
13731                 PyErr_Format(PyExc_TypeError,
13732                              "tuple for endswith must only contain str, "
13733                              "not %.100s",
13734                              Py_TYPE(substring)->tp_name);
13735                 return NULL;
13736             }
13737             result = tailmatch(self, substring, start, end, +1);
13738             if (result == -1)
13739                 return NULL;
13740             if (result) {
13741                 Py_RETURN_TRUE;
13742             }
13743         }
13744         Py_RETURN_FALSE;
13745     }
13746     if (!PyUnicode_Check(subobj)) {
13747         PyErr_Format(PyExc_TypeError,
13748                      "endswith first arg must be str or "
13749                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13750         return NULL;
13751     }
13752     result = tailmatch(self, subobj, start, end, +1);
13753     if (result == -1)
13754         return NULL;
13755     return PyBool_FromLong(result);
13756 }
13757 
13758 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13759 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13760 {
13761     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13762     writer->data = PyUnicode_DATA(writer->buffer);
13763 
13764     if (!writer->readonly) {
13765         writer->kind = PyUnicode_KIND(writer->buffer);
13766         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13767     }
13768     else {
13769         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13770            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13771         writer->kind = PyUnicode_WCHAR_KIND;
13772         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13773 
13774         /* Copy-on-write mode: set buffer size to 0 so
13775          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13776          * next write. */
13777         writer->size = 0;
13778     }
13779 }
13780 
13781 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13782 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13783 {
13784     memset(writer, 0, sizeof(*writer));
13785 
13786     /* ASCII is the bare minimum */
13787     writer->min_char = 127;
13788 
13789     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13790        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13791     writer->kind = PyUnicode_WCHAR_KIND;
13792     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13793 }
13794 
13795 // Initialize _PyUnicodeWriter with initial buffer
13796 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13797 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13798 {
13799     memset(writer, 0, sizeof(*writer));
13800     writer->buffer = buffer;
13801     _PyUnicodeWriter_Update(writer);
13802     writer->min_length = writer->size;
13803 }
13804 
13805 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13806 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13807                                  Py_ssize_t length, Py_UCS4 maxchar)
13808 {
13809     Py_ssize_t newlen;
13810     PyObject *newbuffer;
13811 
13812     assert(maxchar <= MAX_UNICODE);
13813 
13814     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13815     assert((maxchar > writer->maxchar && length >= 0)
13816            || length > 0);
13817 
13818     if (length > PY_SSIZE_T_MAX - writer->pos) {
13819         PyErr_NoMemory();
13820         return -1;
13821     }
13822     newlen = writer->pos + length;
13823 
13824     maxchar = Py_MAX(maxchar, writer->min_char);
13825 
13826     if (writer->buffer == NULL) {
13827         assert(!writer->readonly);
13828         if (writer->overallocate
13829             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13830             /* overallocate to limit the number of realloc() */
13831             newlen += newlen / OVERALLOCATE_FACTOR;
13832         }
13833         if (newlen < writer->min_length)
13834             newlen = writer->min_length;
13835 
13836         writer->buffer = PyUnicode_New(newlen, maxchar);
13837         if (writer->buffer == NULL)
13838             return -1;
13839     }
13840     else if (newlen > writer->size) {
13841         if (writer->overallocate
13842             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13843             /* overallocate to limit the number of realloc() */
13844             newlen += newlen / OVERALLOCATE_FACTOR;
13845         }
13846         if (newlen < writer->min_length)
13847             newlen = writer->min_length;
13848 
13849         if (maxchar > writer->maxchar || writer->readonly) {
13850             /* resize + widen */
13851             maxchar = Py_MAX(maxchar, writer->maxchar);
13852             newbuffer = PyUnicode_New(newlen, maxchar);
13853             if (newbuffer == NULL)
13854                 return -1;
13855             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13856                                           writer->buffer, 0, writer->pos);
13857             Py_DECREF(writer->buffer);
13858             writer->readonly = 0;
13859         }
13860         else {
13861             newbuffer = resize_compact(writer->buffer, newlen);
13862             if (newbuffer == NULL)
13863                 return -1;
13864         }
13865         writer->buffer = newbuffer;
13866     }
13867     else if (maxchar > writer->maxchar) {
13868         assert(!writer->readonly);
13869         newbuffer = PyUnicode_New(writer->size, maxchar);
13870         if (newbuffer == NULL)
13871             return -1;
13872         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13873                                       writer->buffer, 0, writer->pos);
13874         Py_SETREF(writer->buffer, newbuffer);
13875     }
13876     _PyUnicodeWriter_Update(writer);
13877     return 0;
13878 
13879 #undef OVERALLOCATE_FACTOR
13880 }
13881 
13882 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13883 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13884                                      enum PyUnicode_Kind kind)
13885 {
13886     Py_UCS4 maxchar;
13887 
13888     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13889     assert(writer->kind < kind);
13890 
13891     switch (kind)
13892     {
13893     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13894     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13895     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13896     default:
13897         Py_UNREACHABLE();
13898     }
13899 
13900     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13901 }
13902 
13903 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13904 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13905 {
13906     assert(ch <= MAX_UNICODE);
13907     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13908         return -1;
13909     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13910     writer->pos++;
13911     return 0;
13912 }
13913 
13914 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13915 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13916 {
13917     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13918 }
13919 
13920 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13921 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13922 {
13923     Py_UCS4 maxchar;
13924     Py_ssize_t len;
13925 
13926     if (PyUnicode_READY(str) == -1)
13927         return -1;
13928     len = PyUnicode_GET_LENGTH(str);
13929     if (len == 0)
13930         return 0;
13931     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13932     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13933         if (writer->buffer == NULL && !writer->overallocate) {
13934             assert(_PyUnicode_CheckConsistency(str, 1));
13935             writer->readonly = 1;
13936             Py_INCREF(str);
13937             writer->buffer = str;
13938             _PyUnicodeWriter_Update(writer);
13939             writer->pos += len;
13940             return 0;
13941         }
13942         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13943             return -1;
13944     }
13945     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13946                                   str, 0, len);
13947     writer->pos += len;
13948     return 0;
13949 }
13950 
13951 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13952 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13953                                 Py_ssize_t start, Py_ssize_t end)
13954 {
13955     Py_UCS4 maxchar;
13956     Py_ssize_t len;
13957 
13958     if (PyUnicode_READY(str) == -1)
13959         return -1;
13960 
13961     assert(0 <= start);
13962     assert(end <= PyUnicode_GET_LENGTH(str));
13963     assert(start <= end);
13964 
13965     if (end == 0)
13966         return 0;
13967 
13968     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13969         return _PyUnicodeWriter_WriteStr(writer, str);
13970 
13971     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13972         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13973     else
13974         maxchar = writer->maxchar;
13975     len = end - start;
13976 
13977     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13978         return -1;
13979 
13980     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13981                                   str, start, len);
13982     writer->pos += len;
13983     return 0;
13984 }
13985 
13986 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13987 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13988                                   const char *ascii, Py_ssize_t len)
13989 {
13990     if (len == -1)
13991         len = strlen(ascii);
13992 
13993     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13994 
13995     if (writer->buffer == NULL && !writer->overallocate) {
13996         PyObject *str;
13997 
13998         str = _PyUnicode_FromASCII(ascii, len);
13999         if (str == NULL)
14000             return -1;
14001 
14002         writer->readonly = 1;
14003         writer->buffer = str;
14004         _PyUnicodeWriter_Update(writer);
14005         writer->pos += len;
14006         return 0;
14007     }
14008 
14009     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14010         return -1;
14011 
14012     switch (writer->kind)
14013     {
14014     case PyUnicode_1BYTE_KIND:
14015     {
14016         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14017         Py_UCS1 *data = writer->data;
14018 
14019         memcpy(data + writer->pos, str, len);
14020         break;
14021     }
14022     case PyUnicode_2BYTE_KIND:
14023     {
14024         _PyUnicode_CONVERT_BYTES(
14025             Py_UCS1, Py_UCS2,
14026             ascii, ascii + len,
14027             (Py_UCS2 *)writer->data + writer->pos);
14028         break;
14029     }
14030     case PyUnicode_4BYTE_KIND:
14031     {
14032         _PyUnicode_CONVERT_BYTES(
14033             Py_UCS1, Py_UCS4,
14034             ascii, ascii + len,
14035             (Py_UCS4 *)writer->data + writer->pos);
14036         break;
14037     }
14038     default:
14039         Py_UNREACHABLE();
14040     }
14041 
14042     writer->pos += len;
14043     return 0;
14044 }
14045 
14046 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14047 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14048                                    const char *str, Py_ssize_t len)
14049 {
14050     Py_UCS4 maxchar;
14051 
14052     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14053     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14054         return -1;
14055     unicode_write_cstr(writer->buffer, writer->pos, str, len);
14056     writer->pos += len;
14057     return 0;
14058 }
14059 
14060 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14061 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14062 {
14063     PyObject *str;
14064 
14065     if (writer->pos == 0) {
14066         Py_CLEAR(writer->buffer);
14067         _Py_RETURN_UNICODE_EMPTY();
14068     }
14069 
14070     str = writer->buffer;
14071     writer->buffer = NULL;
14072 
14073     if (writer->readonly) {
14074         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14075         return str;
14076     }
14077 
14078     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14079         PyObject *str2;
14080         str2 = resize_compact(str, writer->pos);
14081         if (str2 == NULL) {
14082             Py_DECREF(str);
14083             return NULL;
14084         }
14085         str = str2;
14086     }
14087 
14088     assert(_PyUnicode_CheckConsistency(str, 1));
14089     return unicode_result_ready(str);
14090 }
14091 
14092 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14093 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14094 {
14095     Py_CLEAR(writer->buffer);
14096 }
14097 
14098 #include "stringlib/unicode_format.h"
14099 
14100 PyDoc_STRVAR(format__doc__,
14101              "S.format(*args, **kwargs) -> str\n\
14102 \n\
14103 Return a formatted version of S, using substitutions from args and kwargs.\n\
14104 The substitutions are identified by braces ('{' and '}').");
14105 
14106 PyDoc_STRVAR(format_map__doc__,
14107              "S.format_map(mapping) -> str\n\
14108 \n\
14109 Return a formatted version of S, using substitutions from mapping.\n\
14110 The substitutions are identified by braces ('{' and '}').");
14111 
14112 /*[clinic input]
14113 str.__format__ as unicode___format__
14114 
14115     format_spec: unicode
14116     /
14117 
14118 Return a formatted version of the string as described by format_spec.
14119 [clinic start generated code]*/
14120 
14121 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14122 unicode___format___impl(PyObject *self, PyObject *format_spec)
14123 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14124 {
14125     _PyUnicodeWriter writer;
14126     int ret;
14127 
14128     if (PyUnicode_READY(self) == -1)
14129         return NULL;
14130     _PyUnicodeWriter_Init(&writer);
14131     ret = _PyUnicode_FormatAdvancedWriter(&writer,
14132                                           self, format_spec, 0,
14133                                           PyUnicode_GET_LENGTH(format_spec));
14134     if (ret == -1) {
14135         _PyUnicodeWriter_Dealloc(&writer);
14136         return NULL;
14137     }
14138     return _PyUnicodeWriter_Finish(&writer);
14139 }
14140 
14141 /*[clinic input]
14142 str.__sizeof__ as unicode_sizeof
14143 
14144 Return the size of the string in memory, in bytes.
14145 [clinic start generated code]*/
14146 
14147 static PyObject *
unicode_sizeof_impl(PyObject * self)14148 unicode_sizeof_impl(PyObject *self)
14149 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14150 {
14151     Py_ssize_t size;
14152 
14153     /* If it's a compact object, account for base structure +
14154        character data. */
14155     if (PyUnicode_IS_COMPACT_ASCII(self))
14156         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14157     else if (PyUnicode_IS_COMPACT(self))
14158         size = sizeof(PyCompactUnicodeObject) +
14159             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14160     else {
14161         /* If it is a two-block object, account for base object, and
14162            for character block if present. */
14163         size = sizeof(PyUnicodeObject);
14164         if (_PyUnicode_DATA_ANY(self))
14165             size += (PyUnicode_GET_LENGTH(self) + 1) *
14166                 PyUnicode_KIND(self);
14167     }
14168     /* If the wstr pointer is present, account for it unless it is shared
14169        with the data pointer. Check if the data is not shared. */
14170     if (_PyUnicode_HAS_WSTR_MEMORY(self))
14171         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14172     if (_PyUnicode_HAS_UTF8_MEMORY(self))
14173         size += PyUnicode_UTF8_LENGTH(self) + 1;
14174 
14175     return PyLong_FromSsize_t(size);
14176 }
14177 
14178 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14179 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14180 {
14181     PyObject *copy = _PyUnicode_Copy(v);
14182     if (!copy)
14183         return NULL;
14184     return Py_BuildValue("(N)", copy);
14185 }
14186 
14187 static PyMethodDef unicode_methods[] = {
14188     UNICODE_ENCODE_METHODDEF
14189     UNICODE_REPLACE_METHODDEF
14190     UNICODE_SPLIT_METHODDEF
14191     UNICODE_RSPLIT_METHODDEF
14192     UNICODE_JOIN_METHODDEF
14193     UNICODE_CAPITALIZE_METHODDEF
14194     UNICODE_CASEFOLD_METHODDEF
14195     UNICODE_TITLE_METHODDEF
14196     UNICODE_CENTER_METHODDEF
14197     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14198     UNICODE_EXPANDTABS_METHODDEF
14199     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14200     UNICODE_PARTITION_METHODDEF
14201     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14202     UNICODE_LJUST_METHODDEF
14203     UNICODE_LOWER_METHODDEF
14204     UNICODE_LSTRIP_METHODDEF
14205     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14206     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14207     UNICODE_RJUST_METHODDEF
14208     UNICODE_RSTRIP_METHODDEF
14209     UNICODE_RPARTITION_METHODDEF
14210     UNICODE_SPLITLINES_METHODDEF
14211     UNICODE_STRIP_METHODDEF
14212     UNICODE_SWAPCASE_METHODDEF
14213     UNICODE_TRANSLATE_METHODDEF
14214     UNICODE_UPPER_METHODDEF
14215     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14216     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14217     UNICODE_REMOVEPREFIX_METHODDEF
14218     UNICODE_REMOVESUFFIX_METHODDEF
14219     UNICODE_ISASCII_METHODDEF
14220     UNICODE_ISLOWER_METHODDEF
14221     UNICODE_ISUPPER_METHODDEF
14222     UNICODE_ISTITLE_METHODDEF
14223     UNICODE_ISSPACE_METHODDEF
14224     UNICODE_ISDECIMAL_METHODDEF
14225     UNICODE_ISDIGIT_METHODDEF
14226     UNICODE_ISNUMERIC_METHODDEF
14227     UNICODE_ISALPHA_METHODDEF
14228     UNICODE_ISALNUM_METHODDEF
14229     UNICODE_ISIDENTIFIER_METHODDEF
14230     UNICODE_ISPRINTABLE_METHODDEF
14231     UNICODE_ZFILL_METHODDEF
14232     {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14233     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14234     UNICODE___FORMAT___METHODDEF
14235     UNICODE_MAKETRANS_METHODDEF
14236     UNICODE_SIZEOF_METHODDEF
14237     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14238     {NULL, NULL}
14239 };
14240 
14241 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14242 unicode_mod(PyObject *v, PyObject *w)
14243 {
14244     if (!PyUnicode_Check(v))
14245         Py_RETURN_NOTIMPLEMENTED;
14246     return PyUnicode_Format(v, w);
14247 }
14248 
14249 static PyNumberMethods unicode_as_number = {
14250     0,              /*nb_add*/
14251     0,              /*nb_subtract*/
14252     0,              /*nb_multiply*/
14253     unicode_mod,            /*nb_remainder*/
14254 };
14255 
14256 static PySequenceMethods unicode_as_sequence = {
14257     (lenfunc) unicode_length,       /* sq_length */
14258     PyUnicode_Concat,           /* sq_concat */
14259     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14260     (ssizeargfunc) unicode_getitem,     /* sq_item */
14261     0,                  /* sq_slice */
14262     0,                  /* sq_ass_item */
14263     0,                  /* sq_ass_slice */
14264     PyUnicode_Contains,         /* sq_contains */
14265 };
14266 
14267 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14268 unicode_subscript(PyObject* self, PyObject* item)
14269 {
14270     if (PyUnicode_READY(self) == -1)
14271         return NULL;
14272 
14273     if (_PyIndex_Check(item)) {
14274         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14275         if (i == -1 && PyErr_Occurred())
14276             return NULL;
14277         if (i < 0)
14278             i += PyUnicode_GET_LENGTH(self);
14279         return unicode_getitem(self, i);
14280     } else if (PySlice_Check(item)) {
14281         Py_ssize_t start, stop, step, slicelength, i;
14282         size_t cur;
14283         PyObject *result;
14284         const void *src_data;
14285         void *dest_data;
14286         int src_kind, dest_kind;
14287         Py_UCS4 ch, max_char, kind_limit;
14288 
14289         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14290             return NULL;
14291         }
14292         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14293                                             &start, &stop, step);
14294 
14295         if (slicelength <= 0) {
14296             _Py_RETURN_UNICODE_EMPTY();
14297         } else if (start == 0 && step == 1 &&
14298                    slicelength == PyUnicode_GET_LENGTH(self)) {
14299             return unicode_result_unchanged(self);
14300         } else if (step == 1) {
14301             return PyUnicode_Substring(self,
14302                                        start, start + slicelength);
14303         }
14304         /* General case */
14305         src_kind = PyUnicode_KIND(self);
14306         src_data = PyUnicode_DATA(self);
14307         if (!PyUnicode_IS_ASCII(self)) {
14308             kind_limit = kind_maxchar_limit(src_kind);
14309             max_char = 0;
14310             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14311                 ch = PyUnicode_READ(src_kind, src_data, cur);
14312                 if (ch > max_char) {
14313                     max_char = ch;
14314                     if (max_char >= kind_limit)
14315                         break;
14316                 }
14317             }
14318         }
14319         else
14320             max_char = 127;
14321         result = PyUnicode_New(slicelength, max_char);
14322         if (result == NULL)
14323             return NULL;
14324         dest_kind = PyUnicode_KIND(result);
14325         dest_data = PyUnicode_DATA(result);
14326 
14327         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14328             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14329             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14330         }
14331         assert(_PyUnicode_CheckConsistency(result, 1));
14332         return result;
14333     } else {
14334         PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14335                      Py_TYPE(item)->tp_name);
14336         return NULL;
14337     }
14338 }
14339 
14340 static PyMappingMethods unicode_as_mapping = {
14341     (lenfunc)unicode_length,        /* mp_length */
14342     (binaryfunc)unicode_subscript,  /* mp_subscript */
14343     (objobjargproc)0,           /* mp_ass_subscript */
14344 };
14345 
14346 
14347 /* Helpers for PyUnicode_Format() */
14348 
14349 struct unicode_formatter_t {
14350     PyObject *args;
14351     int args_owned;
14352     Py_ssize_t arglen, argidx;
14353     PyObject *dict;
14354 
14355     enum PyUnicode_Kind fmtkind;
14356     Py_ssize_t fmtcnt, fmtpos;
14357     const void *fmtdata;
14358     PyObject *fmtstr;
14359 
14360     _PyUnicodeWriter writer;
14361 };
14362 
14363 struct unicode_format_arg_t {
14364     Py_UCS4 ch;
14365     int flags;
14366     Py_ssize_t width;
14367     int prec;
14368     int sign;
14369 };
14370 
14371 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14372 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14373 {
14374     Py_ssize_t argidx = ctx->argidx;
14375 
14376     if (argidx < ctx->arglen) {
14377         ctx->argidx++;
14378         if (ctx->arglen < 0)
14379             return ctx->args;
14380         else
14381             return PyTuple_GetItem(ctx->args, argidx);
14382     }
14383     PyErr_SetString(PyExc_TypeError,
14384                     "not enough arguments for format string");
14385     return NULL;
14386 }
14387 
14388 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14389 
14390 /* Format a float into the writer if the writer is not NULL, or into *p_output
14391    otherwise.
14392 
14393    Return 0 on success, raise an exception and return -1 on error. */
14394 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14395 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14396             PyObject **p_output,
14397             _PyUnicodeWriter *writer)
14398 {
14399     char *p;
14400     double x;
14401     Py_ssize_t len;
14402     int prec;
14403     int dtoa_flags = 0;
14404 
14405     x = PyFloat_AsDouble(v);
14406     if (x == -1.0 && PyErr_Occurred())
14407         return -1;
14408 
14409     prec = arg->prec;
14410     if (prec < 0)
14411         prec = 6;
14412 
14413     if (arg->flags & F_ALT)
14414         dtoa_flags |= Py_DTSF_ALT;
14415     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14416     if (p == NULL)
14417         return -1;
14418     len = strlen(p);
14419     if (writer) {
14420         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14421             PyMem_Free(p);
14422             return -1;
14423         }
14424     }
14425     else
14426         *p_output = _PyUnicode_FromASCII(p, len);
14427     PyMem_Free(p);
14428     return 0;
14429 }
14430 
14431 /* formatlong() emulates the format codes d, u, o, x and X, and
14432  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14433  * Python's regular ints.
14434  * Return value:  a new PyUnicodeObject*, or NULL if error.
14435  *     The output string is of the form
14436  *         "-"? ("0x" | "0X")? digit+
14437  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14438  *         set in flags.  The case of hex digits will be correct,
14439  *     There will be at least prec digits, zero-filled on the left if
14440  *         necessary to get that many.
14441  * val          object to be converted
14442  * flags        bitmask of format flags; only F_ALT is looked at
14443  * prec         minimum number of digits; 0-fill on left if needed
14444  * type         a character in [duoxX]; u acts the same as d
14445  *
14446  * CAUTION:  o, x and X conversions on regular ints can never
14447  * produce a '-' sign, but can for Python's unbounded ints.
14448  */
14449 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14450 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14451 {
14452     PyObject *result = NULL;
14453     char *buf;
14454     Py_ssize_t i;
14455     int sign;           /* 1 if '-', else 0 */
14456     int len;            /* number of characters */
14457     Py_ssize_t llen;
14458     int numdigits;      /* len == numnondigits + numdigits */
14459     int numnondigits = 0;
14460 
14461     /* Avoid exceeding SSIZE_T_MAX */
14462     if (prec > INT_MAX-3) {
14463         PyErr_SetString(PyExc_OverflowError,
14464                         "precision too large");
14465         return NULL;
14466     }
14467 
14468     assert(PyLong_Check(val));
14469 
14470     switch (type) {
14471     default:
14472         Py_UNREACHABLE();
14473     case 'd':
14474     case 'i':
14475     case 'u':
14476         /* int and int subclasses should print numerically when a numeric */
14477         /* format code is used (see issue18780) */
14478         result = PyNumber_ToBase(val, 10);
14479         break;
14480     case 'o':
14481         numnondigits = 2;
14482         result = PyNumber_ToBase(val, 8);
14483         break;
14484     case 'x':
14485     case 'X':
14486         numnondigits = 2;
14487         result = PyNumber_ToBase(val, 16);
14488         break;
14489     }
14490     if (!result)
14491         return NULL;
14492 
14493     assert(unicode_modifiable(result));
14494     assert(PyUnicode_IS_READY(result));
14495     assert(PyUnicode_IS_ASCII(result));
14496 
14497     /* To modify the string in-place, there can only be one reference. */
14498     if (Py_REFCNT(result) != 1) {
14499         Py_DECREF(result);
14500         PyErr_BadInternalCall();
14501         return NULL;
14502     }
14503     buf = PyUnicode_DATA(result);
14504     llen = PyUnicode_GET_LENGTH(result);
14505     if (llen > INT_MAX) {
14506         Py_DECREF(result);
14507         PyErr_SetString(PyExc_ValueError,
14508                         "string too large in _PyUnicode_FormatLong");
14509         return NULL;
14510     }
14511     len = (int)llen;
14512     sign = buf[0] == '-';
14513     numnondigits += sign;
14514     numdigits = len - numnondigits;
14515     assert(numdigits > 0);
14516 
14517     /* Get rid of base marker unless F_ALT */
14518     if (((alt) == 0 &&
14519         (type == 'o' || type == 'x' || type == 'X'))) {
14520         assert(buf[sign] == '0');
14521         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14522                buf[sign+1] == 'o');
14523         numnondigits -= 2;
14524         buf += 2;
14525         len -= 2;
14526         if (sign)
14527             buf[0] = '-';
14528         assert(len == numnondigits + numdigits);
14529         assert(numdigits > 0);
14530     }
14531 
14532     /* Fill with leading zeroes to meet minimum width. */
14533     if (prec > numdigits) {
14534         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14535                                 numnondigits + prec);
14536         char *b1;
14537         if (!r1) {
14538             Py_DECREF(result);
14539             return NULL;
14540         }
14541         b1 = PyBytes_AS_STRING(r1);
14542         for (i = 0; i < numnondigits; ++i)
14543             *b1++ = *buf++;
14544         for (i = 0; i < prec - numdigits; i++)
14545             *b1++ = '0';
14546         for (i = 0; i < numdigits; i++)
14547             *b1++ = *buf++;
14548         *b1 = '\0';
14549         Py_DECREF(result);
14550         result = r1;
14551         buf = PyBytes_AS_STRING(result);
14552         len = numnondigits + prec;
14553     }
14554 
14555     /* Fix up case for hex conversions. */
14556     if (type == 'X') {
14557         /* Need to convert all lower case letters to upper case.
14558            and need to convert 0x to 0X (and -0x to -0X). */
14559         for (i = 0; i < len; i++)
14560             if (buf[i] >= 'a' && buf[i] <= 'x')
14561                 buf[i] -= 'a'-'A';
14562     }
14563     if (!PyUnicode_Check(result)
14564         || buf != PyUnicode_DATA(result)) {
14565         PyObject *unicode;
14566         unicode = _PyUnicode_FromASCII(buf, len);
14567         Py_DECREF(result);
14568         result = unicode;
14569     }
14570     else if (len != PyUnicode_GET_LENGTH(result)) {
14571         if (PyUnicode_Resize(&result, len) < 0)
14572             Py_CLEAR(result);
14573     }
14574     return result;
14575 }
14576 
14577 /* Format an integer or a float as an integer.
14578  * Return 1 if the number has been formatted into the writer,
14579  *        0 if the number has been formatted into *p_output
14580  *       -1 and raise an exception on error */
14581 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14582 mainformatlong(PyObject *v,
14583                struct unicode_format_arg_t *arg,
14584                PyObject **p_output,
14585                _PyUnicodeWriter *writer)
14586 {
14587     PyObject *iobj, *res;
14588     char type = (char)arg->ch;
14589 
14590     if (!PyNumber_Check(v))
14591         goto wrongtype;
14592 
14593     /* make sure number is a type of integer for o, x, and X */
14594     if (!PyLong_Check(v)) {
14595         if (type == 'o' || type == 'x' || type == 'X') {
14596             iobj = _PyNumber_Index(v);
14597         }
14598         else {
14599             iobj = PyNumber_Long(v);
14600         }
14601         if (iobj == NULL ) {
14602             if (PyErr_ExceptionMatches(PyExc_TypeError))
14603                 goto wrongtype;
14604             return -1;
14605         }
14606         assert(PyLong_Check(iobj));
14607     }
14608     else {
14609         iobj = v;
14610         Py_INCREF(iobj);
14611     }
14612 
14613     if (PyLong_CheckExact(v)
14614         && arg->width == -1 && arg->prec == -1
14615         && !(arg->flags & (F_SIGN | F_BLANK))
14616         && type != 'X')
14617     {
14618         /* Fast path */
14619         int alternate = arg->flags & F_ALT;
14620         int base;
14621 
14622         switch(type)
14623         {
14624             default:
14625                 Py_UNREACHABLE();
14626             case 'd':
14627             case 'i':
14628             case 'u':
14629                 base = 10;
14630                 break;
14631             case 'o':
14632                 base = 8;
14633                 break;
14634             case 'x':
14635             case 'X':
14636                 base = 16;
14637                 break;
14638         }
14639 
14640         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14641             Py_DECREF(iobj);
14642             return -1;
14643         }
14644         Py_DECREF(iobj);
14645         return 1;
14646     }
14647 
14648     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14649     Py_DECREF(iobj);
14650     if (res == NULL)
14651         return -1;
14652     *p_output = res;
14653     return 0;
14654 
14655 wrongtype:
14656     switch(type)
14657     {
14658         case 'o':
14659         case 'x':
14660         case 'X':
14661             PyErr_Format(PyExc_TypeError,
14662                     "%%%c format: an integer is required, "
14663                     "not %.200s",
14664                     type, Py_TYPE(v)->tp_name);
14665             break;
14666         default:
14667             PyErr_Format(PyExc_TypeError,
14668                     "%%%c format: a real number is required, "
14669                     "not %.200s",
14670                     type, Py_TYPE(v)->tp_name);
14671             break;
14672     }
14673     return -1;
14674 }
14675 
14676 static Py_UCS4
formatchar(PyObject * v)14677 formatchar(PyObject *v)
14678 {
14679     /* presume that the buffer is at least 3 characters long */
14680     if (PyUnicode_Check(v)) {
14681         if (PyUnicode_GET_LENGTH(v) == 1) {
14682             return PyUnicode_READ_CHAR(v, 0);
14683         }
14684         goto onError;
14685     }
14686     else {
14687         int overflow;
14688         long x = PyLong_AsLongAndOverflow(v, &overflow);
14689         if (x == -1 && PyErr_Occurred()) {
14690             if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14691                 goto onError;
14692             }
14693             return (Py_UCS4) -1;
14694         }
14695 
14696         if (x < 0 || x > MAX_UNICODE) {
14697             /* this includes an overflow in converting to C long */
14698             PyErr_SetString(PyExc_OverflowError,
14699                             "%c arg not in range(0x110000)");
14700             return (Py_UCS4) -1;
14701         }
14702 
14703         return (Py_UCS4) x;
14704     }
14705 
14706   onError:
14707     PyErr_SetString(PyExc_TypeError,
14708                     "%c requires int or char");
14709     return (Py_UCS4) -1;
14710 }
14711 
14712 /* Parse options of an argument: flags, width, precision.
14713    Handle also "%(name)" syntax.
14714 
14715    Return 0 if the argument has been formatted into arg->str.
14716    Return 1 if the argument has been written into ctx->writer,
14717    Raise an exception and return -1 on error. */
14718 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14719 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14720                          struct unicode_format_arg_t *arg)
14721 {
14722 #define FORMAT_READ(ctx) \
14723         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14724 
14725     PyObject *v;
14726 
14727     if (arg->ch == '(') {
14728         /* Get argument value from a dictionary. Example: "%(name)s". */
14729         Py_ssize_t keystart;
14730         Py_ssize_t keylen;
14731         PyObject *key;
14732         int pcount = 1;
14733 
14734         if (ctx->dict == NULL) {
14735             PyErr_SetString(PyExc_TypeError,
14736                             "format requires a mapping");
14737             return -1;
14738         }
14739         ++ctx->fmtpos;
14740         --ctx->fmtcnt;
14741         keystart = ctx->fmtpos;
14742         /* Skip over balanced parentheses */
14743         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14744             arg->ch = FORMAT_READ(ctx);
14745             if (arg->ch == ')')
14746                 --pcount;
14747             else if (arg->ch == '(')
14748                 ++pcount;
14749             ctx->fmtpos++;
14750         }
14751         keylen = ctx->fmtpos - keystart - 1;
14752         if (ctx->fmtcnt < 0 || pcount > 0) {
14753             PyErr_SetString(PyExc_ValueError,
14754                             "incomplete format key");
14755             return -1;
14756         }
14757         key = PyUnicode_Substring(ctx->fmtstr,
14758                                   keystart, keystart + keylen);
14759         if (key == NULL)
14760             return -1;
14761         if (ctx->args_owned) {
14762             ctx->args_owned = 0;
14763             Py_DECREF(ctx->args);
14764         }
14765         ctx->args = PyObject_GetItem(ctx->dict, key);
14766         Py_DECREF(key);
14767         if (ctx->args == NULL)
14768             return -1;
14769         ctx->args_owned = 1;
14770         ctx->arglen = -1;
14771         ctx->argidx = -2;
14772     }
14773 
14774     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14775     while (--ctx->fmtcnt >= 0) {
14776         arg->ch = FORMAT_READ(ctx);
14777         ctx->fmtpos++;
14778         switch (arg->ch) {
14779         case '-': arg->flags |= F_LJUST; continue;
14780         case '+': arg->flags |= F_SIGN; continue;
14781         case ' ': arg->flags |= F_BLANK; continue;
14782         case '#': arg->flags |= F_ALT; continue;
14783         case '0': arg->flags |= F_ZERO; continue;
14784         }
14785         break;
14786     }
14787 
14788     /* Parse width. Example: "%10s" => width=10 */
14789     if (arg->ch == '*') {
14790         v = unicode_format_getnextarg(ctx);
14791         if (v == NULL)
14792             return -1;
14793         if (!PyLong_Check(v)) {
14794             PyErr_SetString(PyExc_TypeError,
14795                             "* wants int");
14796             return -1;
14797         }
14798         arg->width = PyLong_AsSsize_t(v);
14799         if (arg->width == -1 && PyErr_Occurred())
14800             return -1;
14801         if (arg->width < 0) {
14802             arg->flags |= F_LJUST;
14803             arg->width = -arg->width;
14804         }
14805         if (--ctx->fmtcnt >= 0) {
14806             arg->ch = FORMAT_READ(ctx);
14807             ctx->fmtpos++;
14808         }
14809     }
14810     else if (arg->ch >= '0' && arg->ch <= '9') {
14811         arg->width = arg->ch - '0';
14812         while (--ctx->fmtcnt >= 0) {
14813             arg->ch = FORMAT_READ(ctx);
14814             ctx->fmtpos++;
14815             if (arg->ch < '0' || arg->ch > '9')
14816                 break;
14817             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14818                mixing signed and unsigned comparison. Since arg->ch is between
14819                '0' and '9', casting to int is safe. */
14820             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14821                 PyErr_SetString(PyExc_ValueError,
14822                                 "width too big");
14823                 return -1;
14824             }
14825             arg->width = arg->width*10 + (arg->ch - '0');
14826         }
14827     }
14828 
14829     /* Parse precision. Example: "%.3f" => prec=3 */
14830     if (arg->ch == '.') {
14831         arg->prec = 0;
14832         if (--ctx->fmtcnt >= 0) {
14833             arg->ch = FORMAT_READ(ctx);
14834             ctx->fmtpos++;
14835         }
14836         if (arg->ch == '*') {
14837             v = unicode_format_getnextarg(ctx);
14838             if (v == NULL)
14839                 return -1;
14840             if (!PyLong_Check(v)) {
14841                 PyErr_SetString(PyExc_TypeError,
14842                                 "* wants int");
14843                 return -1;
14844             }
14845             arg->prec = _PyLong_AsInt(v);
14846             if (arg->prec == -1 && PyErr_Occurred())
14847                 return -1;
14848             if (arg->prec < 0)
14849                 arg->prec = 0;
14850             if (--ctx->fmtcnt >= 0) {
14851                 arg->ch = FORMAT_READ(ctx);
14852                 ctx->fmtpos++;
14853             }
14854         }
14855         else if (arg->ch >= '0' && arg->ch <= '9') {
14856             arg->prec = arg->ch - '0';
14857             while (--ctx->fmtcnt >= 0) {
14858                 arg->ch = FORMAT_READ(ctx);
14859                 ctx->fmtpos++;
14860                 if (arg->ch < '0' || arg->ch > '9')
14861                     break;
14862                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14863                     PyErr_SetString(PyExc_ValueError,
14864                                     "precision too big");
14865                     return -1;
14866                 }
14867                 arg->prec = arg->prec*10 + (arg->ch - '0');
14868             }
14869         }
14870     }
14871 
14872     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14873     if (ctx->fmtcnt >= 0) {
14874         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14875             if (--ctx->fmtcnt >= 0) {
14876                 arg->ch = FORMAT_READ(ctx);
14877                 ctx->fmtpos++;
14878             }
14879         }
14880     }
14881     if (ctx->fmtcnt < 0) {
14882         PyErr_SetString(PyExc_ValueError,
14883                         "incomplete format");
14884         return -1;
14885     }
14886     return 0;
14887 
14888 #undef FORMAT_READ
14889 }
14890 
14891 /* Format one argument. Supported conversion specifiers:
14892 
14893    - "s", "r", "a": any type
14894    - "i", "d", "u": int or float
14895    - "o", "x", "X": int
14896    - "e", "E", "f", "F", "g", "G": float
14897    - "c": int or str (1 character)
14898 
14899    When possible, the output is written directly into the Unicode writer
14900    (ctx->writer). A string is created when padding is required.
14901 
14902    Return 0 if the argument has been formatted into *p_str,
14903           1 if the argument has been written into ctx->writer,
14904          -1 on error. */
14905 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14906 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14907                           struct unicode_format_arg_t *arg,
14908                           PyObject **p_str)
14909 {
14910     PyObject *v;
14911     _PyUnicodeWriter *writer = &ctx->writer;
14912 
14913     if (ctx->fmtcnt == 0)
14914         ctx->writer.overallocate = 0;
14915 
14916     v = unicode_format_getnextarg(ctx);
14917     if (v == NULL)
14918         return -1;
14919 
14920 
14921     switch (arg->ch) {
14922     case 's':
14923     case 'r':
14924     case 'a':
14925         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14926             /* Fast path */
14927             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14928                 return -1;
14929             return 1;
14930         }
14931 
14932         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14933             *p_str = v;
14934             Py_INCREF(*p_str);
14935         }
14936         else {
14937             if (arg->ch == 's')
14938                 *p_str = PyObject_Str(v);
14939             else if (arg->ch == 'r')
14940                 *p_str = PyObject_Repr(v);
14941             else
14942                 *p_str = PyObject_ASCII(v);
14943         }
14944         break;
14945 
14946     case 'i':
14947     case 'd':
14948     case 'u':
14949     case 'o':
14950     case 'x':
14951     case 'X':
14952     {
14953         int ret = mainformatlong(v, arg, p_str, writer);
14954         if (ret != 0)
14955             return ret;
14956         arg->sign = 1;
14957         break;
14958     }
14959 
14960     case 'e':
14961     case 'E':
14962     case 'f':
14963     case 'F':
14964     case 'g':
14965     case 'G':
14966         if (arg->width == -1 && arg->prec == -1
14967             && !(arg->flags & (F_SIGN | F_BLANK)))
14968         {
14969             /* Fast path */
14970             if (formatfloat(v, arg, NULL, writer) == -1)
14971                 return -1;
14972             return 1;
14973         }
14974 
14975         arg->sign = 1;
14976         if (formatfloat(v, arg, p_str, NULL) == -1)
14977             return -1;
14978         break;
14979 
14980     case 'c':
14981     {
14982         Py_UCS4 ch = formatchar(v);
14983         if (ch == (Py_UCS4) -1)
14984             return -1;
14985         if (arg->width == -1 && arg->prec == -1) {
14986             /* Fast path */
14987             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14988                 return -1;
14989             return 1;
14990         }
14991         *p_str = PyUnicode_FromOrdinal(ch);
14992         break;
14993     }
14994 
14995     default:
14996         PyErr_Format(PyExc_ValueError,
14997                      "unsupported format character '%c' (0x%x) "
14998                      "at index %zd",
14999                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15000                      (int)arg->ch,
15001                      ctx->fmtpos - 1);
15002         return -1;
15003     }
15004     if (*p_str == NULL)
15005         return -1;
15006     assert (PyUnicode_Check(*p_str));
15007     return 0;
15008 }
15009 
15010 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)15011 unicode_format_arg_output(struct unicode_formatter_t *ctx,
15012                           struct unicode_format_arg_t *arg,
15013                           PyObject *str)
15014 {
15015     Py_ssize_t len;
15016     enum PyUnicode_Kind kind;
15017     const void *pbuf;
15018     Py_ssize_t pindex;
15019     Py_UCS4 signchar;
15020     Py_ssize_t buflen;
15021     Py_UCS4 maxchar;
15022     Py_ssize_t sublen;
15023     _PyUnicodeWriter *writer = &ctx->writer;
15024     Py_UCS4 fill;
15025 
15026     fill = ' ';
15027     if (arg->sign && arg->flags & F_ZERO)
15028         fill = '0';
15029 
15030     if (PyUnicode_READY(str) == -1)
15031         return -1;
15032 
15033     len = PyUnicode_GET_LENGTH(str);
15034     if ((arg->width == -1 || arg->width <= len)
15035         && (arg->prec == -1 || arg->prec >= len)
15036         && !(arg->flags & (F_SIGN | F_BLANK)))
15037     {
15038         /* Fast path */
15039         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15040             return -1;
15041         return 0;
15042     }
15043 
15044     /* Truncate the string for "s", "r" and "a" formats
15045        if the precision is set */
15046     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15047         if (arg->prec >= 0 && len > arg->prec)
15048             len = arg->prec;
15049     }
15050 
15051     /* Adjust sign and width */
15052     kind = PyUnicode_KIND(str);
15053     pbuf = PyUnicode_DATA(str);
15054     pindex = 0;
15055     signchar = '\0';
15056     if (arg->sign) {
15057         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15058         if (ch == '-' || ch == '+') {
15059             signchar = ch;
15060             len--;
15061             pindex++;
15062         }
15063         else if (arg->flags & F_SIGN)
15064             signchar = '+';
15065         else if (arg->flags & F_BLANK)
15066             signchar = ' ';
15067         else
15068             arg->sign = 0;
15069     }
15070     if (arg->width < len)
15071         arg->width = len;
15072 
15073     /* Prepare the writer */
15074     maxchar = writer->maxchar;
15075     if (!(arg->flags & F_LJUST)) {
15076         if (arg->sign) {
15077             if ((arg->width-1) > len)
15078                 maxchar = Py_MAX(maxchar, fill);
15079         }
15080         else {
15081             if (arg->width > len)
15082                 maxchar = Py_MAX(maxchar, fill);
15083         }
15084     }
15085     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15086         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15087         maxchar = Py_MAX(maxchar, strmaxchar);
15088     }
15089 
15090     buflen = arg->width;
15091     if (arg->sign && len == arg->width)
15092         buflen++;
15093     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15094         return -1;
15095 
15096     /* Write the sign if needed */
15097     if (arg->sign) {
15098         if (fill != ' ') {
15099             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15100             writer->pos += 1;
15101         }
15102         if (arg->width > len)
15103             arg->width--;
15104     }
15105 
15106     /* Write the numeric prefix for "x", "X" and "o" formats
15107        if the alternate form is used.
15108        For example, write "0x" for the "%#x" format. */
15109     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15110         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15111         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15112         if (fill != ' ') {
15113             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15114             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15115             writer->pos += 2;
15116             pindex += 2;
15117         }
15118         arg->width -= 2;
15119         if (arg->width < 0)
15120             arg->width = 0;
15121         len -= 2;
15122     }
15123 
15124     /* Pad left with the fill character if needed */
15125     if (arg->width > len && !(arg->flags & F_LJUST)) {
15126         sublen = arg->width - len;
15127         unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15128         writer->pos += sublen;
15129         arg->width = len;
15130     }
15131 
15132     /* If padding with spaces: write sign if needed and/or numeric prefix if
15133        the alternate form is used */
15134     if (fill == ' ') {
15135         if (arg->sign) {
15136             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15137             writer->pos += 1;
15138         }
15139         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15140             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15141             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15142             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15143             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15144             writer->pos += 2;
15145             pindex += 2;
15146         }
15147     }
15148 
15149     /* Write characters */
15150     if (len) {
15151         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15152                                       str, pindex, len);
15153         writer->pos += len;
15154     }
15155 
15156     /* Pad right with the fill character if needed */
15157     if (arg->width > len) {
15158         sublen = arg->width - len;
15159         unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15160         writer->pos += sublen;
15161     }
15162     return 0;
15163 }
15164 
15165 /* Helper of PyUnicode_Format(): format one arg.
15166    Return 0 on success, raise an exception and return -1 on error. */
15167 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15168 unicode_format_arg(struct unicode_formatter_t *ctx)
15169 {
15170     struct unicode_format_arg_t arg;
15171     PyObject *str;
15172     int ret;
15173 
15174     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15175     if (arg.ch == '%') {
15176         ctx->fmtpos++;
15177         ctx->fmtcnt--;
15178         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15179             return -1;
15180         return 0;
15181     }
15182     arg.flags = 0;
15183     arg.width = -1;
15184     arg.prec = -1;
15185     arg.sign = 0;
15186     str = NULL;
15187 
15188     ret = unicode_format_arg_parse(ctx, &arg);
15189     if (ret == -1)
15190         return -1;
15191 
15192     ret = unicode_format_arg_format(ctx, &arg, &str);
15193     if (ret == -1)
15194         return -1;
15195 
15196     if (ret != 1) {
15197         ret = unicode_format_arg_output(ctx, &arg, str);
15198         Py_DECREF(str);
15199         if (ret == -1)
15200             return -1;
15201     }
15202 
15203     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15204         PyErr_SetString(PyExc_TypeError,
15205                         "not all arguments converted during string formatting");
15206         return -1;
15207     }
15208     return 0;
15209 }
15210 
15211 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15212 PyUnicode_Format(PyObject *format, PyObject *args)
15213 {
15214     struct unicode_formatter_t ctx;
15215 
15216     if (format == NULL || args == NULL) {
15217         PyErr_BadInternalCall();
15218         return NULL;
15219     }
15220 
15221     if (ensure_unicode(format) < 0)
15222         return NULL;
15223 
15224     ctx.fmtstr = format;
15225     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15226     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15227     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15228     ctx.fmtpos = 0;
15229 
15230     _PyUnicodeWriter_Init(&ctx.writer);
15231     ctx.writer.min_length = ctx.fmtcnt + 100;
15232     ctx.writer.overallocate = 1;
15233 
15234     if (PyTuple_Check(args)) {
15235         ctx.arglen = PyTuple_Size(args);
15236         ctx.argidx = 0;
15237     }
15238     else {
15239         ctx.arglen = -1;
15240         ctx.argidx = -2;
15241     }
15242     ctx.args_owned = 0;
15243     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15244         ctx.dict = args;
15245     else
15246         ctx.dict = NULL;
15247     ctx.args = args;
15248 
15249     while (--ctx.fmtcnt >= 0) {
15250         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15251             Py_ssize_t nonfmtpos;
15252 
15253             nonfmtpos = ctx.fmtpos++;
15254             while (ctx.fmtcnt >= 0 &&
15255                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15256                 ctx.fmtpos++;
15257                 ctx.fmtcnt--;
15258             }
15259             if (ctx.fmtcnt < 0) {
15260                 ctx.fmtpos--;
15261                 ctx.writer.overallocate = 0;
15262             }
15263 
15264             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15265                                                 nonfmtpos, ctx.fmtpos) < 0)
15266                 goto onError;
15267         }
15268         else {
15269             ctx.fmtpos++;
15270             if (unicode_format_arg(&ctx) == -1)
15271                 goto onError;
15272         }
15273     }
15274 
15275     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15276         PyErr_SetString(PyExc_TypeError,
15277                         "not all arguments converted during string formatting");
15278         goto onError;
15279     }
15280 
15281     if (ctx.args_owned) {
15282         Py_DECREF(ctx.args);
15283     }
15284     return _PyUnicodeWriter_Finish(&ctx.writer);
15285 
15286   onError:
15287     _PyUnicodeWriter_Dealloc(&ctx.writer);
15288     if (ctx.args_owned) {
15289         Py_DECREF(ctx.args);
15290     }
15291     return NULL;
15292 }
15293 
15294 static PyObject *
15295 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15296 
15297 /*[clinic input]
15298 @classmethod
15299 str.__new__ as unicode_new
15300 
15301     object as x: object = NULL
15302     encoding: str = NULL
15303     errors: str = NULL
15304 
15305 [clinic start generated code]*/
15306 
15307 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15308 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15309                  const char *errors)
15310 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15311 {
15312     PyObject *unicode;
15313     if (x == NULL) {
15314         unicode = unicode_new_empty();
15315     }
15316     else if (encoding == NULL && errors == NULL) {
15317         unicode = PyObject_Str(x);
15318     }
15319     else {
15320         unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15321     }
15322 
15323     if (unicode != NULL && type != &PyUnicode_Type) {
15324         Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15325     }
15326     return unicode;
15327 }
15328 
15329 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15330 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15331 {
15332     PyObject *self;
15333     Py_ssize_t length, char_size;
15334     int share_wstr, share_utf8;
15335     unsigned int kind;
15336     void *data;
15337 
15338     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15339     assert(_PyUnicode_CHECK(unicode));
15340     if (PyUnicode_READY(unicode) == -1) {
15341         return NULL;
15342     }
15343 
15344     self = type->tp_alloc(type, 0);
15345     if (self == NULL) {
15346         return NULL;
15347     }
15348     kind = PyUnicode_KIND(unicode);
15349     length = PyUnicode_GET_LENGTH(unicode);
15350 
15351     _PyUnicode_LENGTH(self) = length;
15352 #ifdef Py_DEBUG
15353     _PyUnicode_HASH(self) = -1;
15354 #else
15355     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15356 #endif
15357     _PyUnicode_STATE(self).interned = 0;
15358     _PyUnicode_STATE(self).kind = kind;
15359     _PyUnicode_STATE(self).compact = 0;
15360     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15361     _PyUnicode_STATE(self).ready = 1;
15362     _PyUnicode_WSTR(self) = NULL;
15363     _PyUnicode_UTF8_LENGTH(self) = 0;
15364     _PyUnicode_UTF8(self) = NULL;
15365     _PyUnicode_WSTR_LENGTH(self) = 0;
15366     _PyUnicode_DATA_ANY(self) = NULL;
15367 
15368     share_utf8 = 0;
15369     share_wstr = 0;
15370     if (kind == PyUnicode_1BYTE_KIND) {
15371         char_size = 1;
15372         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15373             share_utf8 = 1;
15374     }
15375     else if (kind == PyUnicode_2BYTE_KIND) {
15376         char_size = 2;
15377         if (sizeof(wchar_t) == 2)
15378             share_wstr = 1;
15379     }
15380     else {
15381         assert(kind == PyUnicode_4BYTE_KIND);
15382         char_size = 4;
15383         if (sizeof(wchar_t) == 4)
15384             share_wstr = 1;
15385     }
15386 
15387     /* Ensure we won't overflow the length. */
15388     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15389         PyErr_NoMemory();
15390         goto onError;
15391     }
15392     data = PyObject_Malloc((length + 1) * char_size);
15393     if (data == NULL) {
15394         PyErr_NoMemory();
15395         goto onError;
15396     }
15397 
15398     _PyUnicode_DATA_ANY(self) = data;
15399     if (share_utf8) {
15400         _PyUnicode_UTF8_LENGTH(self) = length;
15401         _PyUnicode_UTF8(self) = data;
15402     }
15403     if (share_wstr) {
15404         _PyUnicode_WSTR_LENGTH(self) = length;
15405         _PyUnicode_WSTR(self) = (wchar_t *)data;
15406     }
15407 
15408     memcpy(data, PyUnicode_DATA(unicode),
15409               kind * (length + 1));
15410     assert(_PyUnicode_CheckConsistency(self, 1));
15411 #ifdef Py_DEBUG
15412     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15413 #endif
15414     return self;
15415 
15416 onError:
15417     Py_DECREF(self);
15418     return NULL;
15419 }
15420 
/* Deallocate `op`, which must be an exact str instance (checked by the
   assertion only), by calling unicode_dealloc() directly. */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
15427 
15428 PyDoc_STRVAR(unicode_doc,
15429 "str(object='') -> str\n\
15430 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15431 \n\
15432 Create a new string object from the given object. If encoding or\n\
15433 errors is specified, then the object must expose a data buffer\n\
15434 that will be decoded using the given encoding and error handler.\n\
15435 Otherwise, returns the result of object.__str__() (if defined)\n\
15436 or repr(object).\n\
15437 encoding defaults to sys.getdefaultencoding().\n\
15438 errors defaults to 'strict'.");
15439 
15440 static PyObject *unicode_iter(PyObject *seq);
15441 
15442 PyTypeObject PyUnicode_Type = {
15443     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15444     "str",                        /* tp_name */
15445     sizeof(PyUnicodeObject),      /* tp_basicsize */
15446     0,                            /* tp_itemsize */
15447     /* Slots */
15448     (destructor)unicode_dealloc,  /* tp_dealloc */
15449     0,                            /* tp_vectorcall_offset */
15450     0,                            /* tp_getattr */
15451     0,                            /* tp_setattr */
15452     0,                            /* tp_as_async */
15453     unicode_repr,                 /* tp_repr */
15454     &unicode_as_number,           /* tp_as_number */
15455     &unicode_as_sequence,         /* tp_as_sequence */
15456     &unicode_as_mapping,          /* tp_as_mapping */
15457     (hashfunc) unicode_hash,      /* tp_hash*/
15458     0,                            /* tp_call*/
15459     (reprfunc) unicode_str,       /* tp_str */
15460     PyObject_GenericGetAttr,      /* tp_getattro */
15461     0,                            /* tp_setattro */
15462     0,                            /* tp_as_buffer */
15463     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15464         Py_TPFLAGS_UNICODE_SUBCLASS |
15465         _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15466     unicode_doc,                  /* tp_doc */
15467     0,                            /* tp_traverse */
15468     0,                            /* tp_clear */
15469     PyUnicode_RichCompare,        /* tp_richcompare */
15470     0,                            /* tp_weaklistoffset */
15471     unicode_iter,                 /* tp_iter */
15472     0,                            /* tp_iternext */
15473     unicode_methods,              /* tp_methods */
15474     0,                            /* tp_members */
15475     0,                            /* tp_getset */
15476     0,                            /* tp_base */
15477     0,                            /* tp_dict */
15478     0,                            /* tp_descr_get */
15479     0,                            /* tp_descr_set */
15480     0,                            /* tp_dictoffset */
15481     0,                            /* tp_init */
15482     0,                            /* tp_alloc */
15483     unicode_new,                  /* tp_new */
15484     PyObject_Del,                 /* tp_free */
15485 };
15486 
15487 /* Initialize the Unicode implementation */
15488 
15489 void
_PyUnicode_InitState(PyInterpreterState * interp)15490 _PyUnicode_InitState(PyInterpreterState *interp)
15491 {
15492     if (!_Py_IsMainInterpreter(interp)) {
15493         return;
15494     }
15495 
15496     /* initialize the linebreak bloom filter */
15497     const Py_UCS2 linebreak[] = {
15498         0x000A, /* LINE FEED */
15499         0x000D, /* CARRIAGE RETURN */
15500         0x001C, /* FILE SEPARATOR */
15501         0x001D, /* GROUP SEPARATOR */
15502         0x001E, /* RECORD SEPARATOR */
15503         0x0085, /* NEXT LINE */
15504         0x2028, /* LINE SEPARATOR */
15505         0x2029, /* PARAGRAPH SEPARATOR */
15506     };
15507     bloom_linebreak = make_bloom_mask(
15508         PyUnicode_2BYTE_KIND, linebreak,
15509         Py_ARRAY_LENGTH(linebreak));
15510 }
15511 
15512 
15513 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15514 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15515 {
15516     if (!_Py_IsMainInterpreter(interp)) {
15517         return _PyStatus_OK();
15518     }
15519 
15520 #ifdef Py_DEBUG
15521     assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15522 
15523     for (int i = 0; i < 256; i++) {
15524         assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15525     }
15526 #endif
15527 
15528     return _PyStatus_OK();
15529 }
15530 
15531 
15532 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15533 _PyUnicode_InitTypes(PyInterpreterState *interp)
15534 {
15535     if (!_Py_IsMainInterpreter(interp)) {
15536         return _PyStatus_OK();
15537     }
15538 
15539     if (PyType_Ready(&EncodingMapType) < 0) {
15540         goto error;
15541     }
15542     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15543         goto error;
15544     }
15545     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15546         goto error;
15547     }
15548     return _PyStatus_OK();
15549 
15550 error:
15551     return _PyStatus_ERR("Can't initialize unicode types");
15552 }
15553 
15554 
15555 void
PyUnicode_InternInPlace(PyObject ** p)15556 PyUnicode_InternInPlace(PyObject **p)
15557 {
15558     PyObject *s = *p;
15559 #ifdef Py_DEBUG
15560     assert(s != NULL);
15561     assert(_PyUnicode_CHECK(s));
15562 #else
15563     if (s == NULL || !PyUnicode_Check(s)) {
15564         return;
15565     }
15566 #endif
15567 
15568     /* If it's a subclass, we don't really know what putting
15569        it in the interned dict might do. */
15570     if (!PyUnicode_CheckExact(s)) {
15571         return;
15572     }
15573 
15574     if (PyUnicode_CHECK_INTERNED(s)) {
15575         return;
15576     }
15577 
15578     if (PyUnicode_READY(s) == -1) {
15579         PyErr_Clear();
15580         return;
15581     }
15582 
15583     if (interned == NULL) {
15584         interned = PyDict_New();
15585         if (interned == NULL) {
15586             PyErr_Clear(); /* Don't leave an exception */
15587             return;
15588         }
15589     }
15590 
15591     PyObject *t = PyDict_SetDefault(interned, s, s);
15592     if (t == NULL) {
15593         PyErr_Clear();
15594         return;
15595     }
15596 
15597     if (t != s) {
15598         Py_INCREF(t);
15599         Py_SETREF(*p, t);
15600         return;
15601     }
15602 
15603     /* The two references in interned dict (key and value) are not counted by
15604        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15605        this. */
15606     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15607     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15608 }
15609 
15610 void
PyUnicode_InternImmortal(PyObject ** p)15611 PyUnicode_InternImmortal(PyObject **p)
15612 {
15613     if (PyErr_WarnEx(PyExc_DeprecationWarning,
15614             "PyUnicode_InternImmortal() is deprecated; "
15615             "use PyUnicode_InternInPlace() instead", 1) < 0)
15616     {
15617         // The function has no return value, the exception cannot
15618         // be reported to the caller, so just log it.
15619         PyErr_WriteUnraisable(NULL);
15620     }
15621 
15622     PyUnicode_InternInPlace(p);
15623     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15624         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15625         Py_INCREF(*p);
15626     }
15627 }
15628 
15629 PyObject *
PyUnicode_InternFromString(const char * cp)15630 PyUnicode_InternFromString(const char *cp)
15631 {
15632     PyObject *s = PyUnicode_FromString(cp);
15633     if (s == NULL)
15634         return NULL;
15635     PyUnicode_InternInPlace(&s);
15636     return s;
15637 }
15638 
15639 
15640 void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15641 _PyUnicode_ClearInterned(PyInterpreterState *interp)
15642 {
15643     if (!_Py_IsMainInterpreter(interp)) {
15644         // interned dict is shared by all interpreters
15645         return;
15646     }
15647 
15648     if (interned == NULL) {
15649         return;
15650     }
15651     assert(PyDict_CheckExact(interned));
15652 
15653     /* Interned unicode strings are not forcibly deallocated; rather, we give
15654        them their stolen references back, and then clear and DECREF the
15655        interned dict. */
15656 
15657 #ifdef INTERNED_STATS
15658     fprintf(stderr, "releasing %zd interned strings\n",
15659             PyDict_GET_SIZE(interned));
15660 
15661     Py_ssize_t immortal_size = 0, mortal_size = 0;
15662 #endif
15663     Py_ssize_t pos = 0;
15664     PyObject *s, *ignored_value;
15665     while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15666         assert(PyUnicode_IS_READY(s));
15667 
15668         switch (PyUnicode_CHECK_INTERNED(s)) {
15669         case SSTATE_INTERNED_IMMORTAL:
15670             Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15671 #ifdef INTERNED_STATS
15672             immortal_size += PyUnicode_GET_LENGTH(s);
15673 #endif
15674             break;
15675         case SSTATE_INTERNED_MORTAL:
15676             // Restore the two references (key and value) ignored
15677             // by PyUnicode_InternInPlace().
15678             Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15679 #ifdef INTERNED_STATS
15680             mortal_size += PyUnicode_GET_LENGTH(s);
15681 #endif
15682             break;
15683         case SSTATE_NOT_INTERNED:
15684             /* fall through */
15685         default:
15686             Py_UNREACHABLE();
15687         }
15688         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15689     }
15690 #ifdef INTERNED_STATS
15691     fprintf(stderr,
15692             "total size of all interned strings: %zd/%zd mortal/immortal\n",
15693             mortal_size, immortal_size);
15694 #endif
15695 
15696     PyDict_Clear(interned);
15697     Py_CLEAR(interned);
15698 }
15699 
15700 
15701 /********************* Unicode Iterator **************************/
15702 
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15708 
/* Destructor of the str iterator: untrack the object from the GC, release
   the iterated string and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator is exhausted, hence XDECREF. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15716 
/* GC traversal of the str iterator: the iterated string is the only object
   it references.  Py_VISIT() needs the parameters to be named `visit` and
   `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15723 
15724 static PyObject *
unicodeiter_next(unicodeiterobject * it)15725 unicodeiter_next(unicodeiterobject *it)
15726 {
15727     PyObject *seq;
15728 
15729     assert(it != NULL);
15730     seq = it->it_seq;
15731     if (seq == NULL)
15732         return NULL;
15733     assert(_PyUnicode_CHECK(seq));
15734 
15735     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15736         int kind = PyUnicode_KIND(seq);
15737         const void *data = PyUnicode_DATA(seq);
15738         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15739         it->it_index++;
15740         return unicode_char(chr);
15741     }
15742 
15743     it->it_seq = NULL;
15744     Py_DECREF(seq);
15745     return NULL;
15746 }
15747 
15748 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15749 unicode_ascii_iter_next(unicodeiterobject *it)
15750 {
15751     assert(it != NULL);
15752     PyObject *seq = it->it_seq;
15753     if (seq == NULL) {
15754         return NULL;
15755     }
15756     assert(_PyUnicode_CHECK(seq));
15757     assert(PyUnicode_IS_COMPACT_ASCII(seq));
15758     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15759         const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15760         Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15761                                               data, it->it_index);
15762         it->it_index++;
15763         PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15764         return Py_NewRef(item);
15765     }
15766     it->it_seq = NULL;
15767     Py_DECREF(seq);
15768     return NULL;
15769 }
15770 
15771 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15772 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15773 {
15774     Py_ssize_t len = 0;
15775     if (it->it_seq)
15776         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15777     return PyLong_FromSsize_t(len);
15778 }
15779 
15780 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15781 
15782 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15783 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15784 {
15785     PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15786 
15787     /* _PyEval_GetBuiltin can invoke arbitrary code,
15788      * call must be before access of iterator pointers.
15789      * see issue #101765 */
15790 
15791     if (it->it_seq != NULL) {
15792         return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15793     } else {
15794         PyObject *u = (PyObject *)_PyUnicode_New(0);
15795         if (u == NULL) {
15796             Py_XDECREF(iter);
15797             return NULL;
15798         }
15799         return Py_BuildValue("N(N)", iter, u);
15800     }
15801 }
15802 
15803 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15804 
15805 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15806 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15807 {
15808     Py_ssize_t index = PyLong_AsSsize_t(state);
15809     if (index == -1 && PyErr_Occurred())
15810         return NULL;
15811     if (it->it_seq != NULL) {
15812         if (index < 0)
15813             index = 0;
15814         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15815             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15816         it->it_index = index;
15817     }
15818     Py_RETURN_NONE;
15819 }
15820 
15821 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15822 
15823 static PyMethodDef unicodeiter_methods[] = {
15824     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15825      length_hint_doc},
15826     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15827      reduce_doc},
15828     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15829      setstate_doc},
15830     {NULL,      NULL}       /* sentinel */
15831 };
15832 
15833 PyTypeObject PyUnicodeIter_Type = {
15834     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15835     "str_iterator",         /* tp_name */
15836     sizeof(unicodeiterobject),      /* tp_basicsize */
15837     0,                  /* tp_itemsize */
15838     /* methods */
15839     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15840     0,                  /* tp_vectorcall_offset */
15841     0,                  /* tp_getattr */
15842     0,                  /* tp_setattr */
15843     0,                  /* tp_as_async */
15844     0,                  /* tp_repr */
15845     0,                  /* tp_as_number */
15846     0,                  /* tp_as_sequence */
15847     0,                  /* tp_as_mapping */
15848     0,                  /* tp_hash */
15849     0,                  /* tp_call */
15850     0,                  /* tp_str */
15851     PyObject_GenericGetAttr,        /* tp_getattro */
15852     0,                  /* tp_setattro */
15853     0,                  /* tp_as_buffer */
15854     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15855     0,                  /* tp_doc */
15856     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15857     0,                  /* tp_clear */
15858     0,                  /* tp_richcompare */
15859     0,                  /* tp_weaklistoffset */
15860     PyObject_SelfIter,          /* tp_iter */
15861     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15862     unicodeiter_methods,            /* tp_methods */
15863     0,
15864 };
15865 
15866 PyTypeObject _PyUnicodeASCIIIter_Type = {
15867     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15868     .tp_name = "str_ascii_iterator",
15869     .tp_basicsize = sizeof(unicodeiterobject),
15870     .tp_dealloc = (destructor)unicodeiter_dealloc,
15871     .tp_getattro = PyObject_GenericGetAttr,
15872     .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
15873     .tp_traverse = (traverseproc)unicodeiter_traverse,
15874     .tp_iter = PyObject_SelfIter,
15875     .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
15876     .tp_methods = unicodeiter_methods,
15877 };
15878 
15879 static PyObject *
unicode_iter(PyObject * seq)15880 unicode_iter(PyObject *seq)
15881 {
15882     unicodeiterobject *it;
15883 
15884     if (!PyUnicode_Check(seq)) {
15885         PyErr_BadInternalCall();
15886         return NULL;
15887     }
15888     if (PyUnicode_READY(seq) == -1)
15889         return NULL;
15890     if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15891         it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15892     }
15893     else {
15894         it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15895     }
15896     if (it == NULL)
15897         return NULL;
15898     it->it_index = 0;
15899     Py_INCREF(seq);
15900     it->it_seq = seq;
15901     _PyObject_GC_TRACK(it);
15902     return (PyObject *)it;
15903 }
15904 
15905 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15906 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15907 {
15908     int res;
15909     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15910     if (res == -2) {
15911         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15912         return -1;
15913     }
15914     if (res < 0) {
15915         PyErr_NoMemory();
15916         return -1;
15917     }
15918     return 0;
15919 }
15920 
15921 
15922 static int
config_get_codec_name(wchar_t ** config_encoding)15923 config_get_codec_name(wchar_t **config_encoding)
15924 {
15925     char *encoding;
15926     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15927         return -1;
15928     }
15929 
15930     PyObject *name_obj = NULL;
15931     PyObject *codec = _PyCodec_Lookup(encoding);
15932     PyMem_RawFree(encoding);
15933 
15934     if (!codec)
15935         goto error;
15936 
15937     name_obj = PyObject_GetAttrString(codec, "name");
15938     Py_CLEAR(codec);
15939     if (!name_obj) {
15940         goto error;
15941     }
15942 
15943     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15944     Py_DECREF(name_obj);
15945     if (wname == NULL) {
15946         goto error;
15947     }
15948 
15949     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15950     if (raw_wname == NULL) {
15951         PyMem_Free(wname);
15952         PyErr_NoMemory();
15953         goto error;
15954     }
15955 
15956     PyMem_RawFree(*config_encoding);
15957     *config_encoding = raw_wname;
15958 
15959     PyMem_Free(wname);
15960     return 0;
15961 
15962 error:
15963     Py_XDECREF(codec);
15964     Py_XDECREF(name_obj);
15965     return -1;
15966 }
15967 
15968 
15969 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15970 init_stdio_encoding(PyInterpreterState *interp)
15971 {
15972     /* Update the stdio encoding to the normalized Python codec name. */
15973     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15974     if (config_get_codec_name(&config->stdio_encoding) < 0) {
15975         return _PyStatus_ERR("failed to get the Python codec name "
15976                              "of the stdio encoding");
15977     }
15978     return _PyStatus_OK();
15979 }
15980 
15981 
15982 static int
init_fs_codec(PyInterpreterState * interp)15983 init_fs_codec(PyInterpreterState *interp)
15984 {
15985     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15986 
15987     _Py_error_handler error_handler;
15988     error_handler = get_error_handler_wide(config->filesystem_errors);
15989     if (error_handler == _Py_ERROR_UNKNOWN) {
15990         PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15991         return -1;
15992     }
15993 
15994     char *encoding, *errors;
15995     if (encode_wstr_utf8(config->filesystem_encoding,
15996                          &encoding,
15997                          "filesystem_encoding") < 0) {
15998         return -1;
15999     }
16000 
16001     if (encode_wstr_utf8(config->filesystem_errors,
16002                          &errors,
16003                          "filesystem_errors") < 0) {
16004         PyMem_RawFree(encoding);
16005         return -1;
16006     }
16007 
16008     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16009     PyMem_RawFree(fs_codec->encoding);
16010     fs_codec->encoding = encoding;
16011     /* encoding has been normalized by init_fs_encoding() */
16012     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16013     PyMem_RawFree(fs_codec->errors);
16014     fs_codec->errors = errors;
16015     fs_codec->error_handler = error_handler;
16016 
16017 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16018     assert(fs_codec->utf8 == 1);
16019 #endif
16020 
16021     /* At this point, PyUnicode_EncodeFSDefault() and
16022        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16023        the C implementation of the filesystem encoding. */
16024 
16025     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16026        global configuration variables. */
16027     if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16028                                   fs_codec->errors) < 0) {
16029         PyErr_NoMemory();
16030         return -1;
16031     }
16032     return 0;
16033 }
16034 
16035 
16036 static PyStatus
init_fs_encoding(PyThreadState * tstate)16037 init_fs_encoding(PyThreadState *tstate)
16038 {
16039     PyInterpreterState *interp = tstate->interp;
16040 
16041     /* Update the filesystem encoding to the normalized Python codec name.
16042        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16043        (Python codec name). */
16044     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16045     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16046         _Py_DumpPathConfig(tstate);
16047         return _PyStatus_ERR("failed to get the Python codec "
16048                              "of the filesystem encoding");
16049     }
16050 
16051     if (init_fs_codec(interp) < 0) {
16052         return _PyStatus_ERR("cannot initialize filesystem codec");
16053     }
16054     return _PyStatus_OK();
16055 }
16056 
16057 
16058 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16059 _PyUnicode_InitEncodings(PyThreadState *tstate)
16060 {
16061     PyStatus status = init_fs_encoding(tstate);
16062     if (_PyStatus_EXCEPTION(status)) {
16063         return status;
16064     }
16065 
16066     return init_stdio_encoding(tstate->interp);
16067 }
16068 
16069 
16070 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16071 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16072 {
16073     PyMem_RawFree(fs_codec->encoding);
16074     fs_codec->encoding = NULL;
16075     fs_codec->utf8 = 0;
16076     PyMem_RawFree(fs_codec->errors);
16077     fs_codec->errors = NULL;
16078     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16079 }
16080 
16081 
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
 * (PEP 529) and re-initialize the filesystem codec.
 *
 * Returns the result of init_fs_codec(), or -1 with MemoryError set if the
 * new config strings cannot be allocated (the config is then unchanged).
 */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyInterpreterState *interp = _PyInterpreterState_GET();
        PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

        /* The config takes ownership of the new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed; free whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16107 
16108 
#ifdef Py_DEBUG
/* Debug-build helper: returns 1 when the `interned` pointer is NULL,
 * 0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    if (interned == NULL) {
        return 1;
    }
    return 0;
}
#endif
16116 
16117 
16118 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)16119 _PyUnicode_FiniTypes(PyInterpreterState *interp)
16120 {
16121     if (!_Py_IsMainInterpreter(interp)) {
16122         return;
16123     }
16124 
16125     _PyStaticType_Dealloc(&EncodingMapType);
16126     _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16127     _PyStaticType_Dealloc(&PyFormatterIter_Type);
16128 }
16129 
16130 
/* Free the cached representations attached to a statically allocated,
 * compact string object and reset the corresponding fields.  The object
 * itself is not freed.
 *
 * - ASCII strings: only the wstr buffer is released.
 * - Other compact strings: the wstr buffer is released unless it points at
 *   the inline character data, and the cached utf8 buffer is released.
 */
static void
unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        if (ascii->wstr) {
            PyObject_Free(ascii->wstr);
            ascii->wstr = NULL;
        }
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject*)op;
    /* Inline character data starts right after the compact header. */
    void *inline_data = (void*)(compact + 1);

    if (ascii->wstr && ascii->wstr != inline_data) {
        PyObject_Free(ascii->wstr);
        ascii->wstr = NULL;
        compact->wstr_length = 0;
    }

    if (compact->utf8) {
        PyObject_Free(compact->utf8);
        compact->utf8 = NULL;
        compact->utf8_length = 0;
    }
}
16158 
16159 
16160 void
_PyUnicode_Fini(PyInterpreterState * interp)16161 _PyUnicode_Fini(PyInterpreterState *interp)
16162 {
16163     struct _Py_unicode_state *state = &interp->unicode;
16164 
16165     if (_Py_IsMainInterpreter(interp)) {
16166         // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16167         assert(interned == NULL);
16168         // bpo-47182: force a unicodedata CAPI capsule re-import on
16169         // subsequent initialization of main interpreter.
16170         ucnhash_capi = NULL;
16171     }
16172 
16173     _PyUnicode_FiniEncodings(&state->fs_codec);
16174 
16175     unicode_clear_identifiers(state);
16176 
16177     // Clear the single character singletons
16178     for (int i = 0; i < 128; i++) {
16179         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16180     }
16181     for (int i = 0; i < 128; i++) {
16182         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16183     }
16184 }
16185 
16186 
16187 void
_PyStaticUnicode_Dealloc(PyObject * op)16188 _PyStaticUnicode_Dealloc(PyObject *op)
16189 {
16190     unicode_static_dealloc(op);
16191 }
16192 
16193 
16194 /* A _string module, to export formatter_parser and formatter_field_name_split
16195    to the string.Formatter class implemented in Python. */
16196 
16197 static PyMethodDef _string_methods[] = {
16198     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16199      METH_O, PyDoc_STR("split the argument as a field name")},
16200     {"formatter_parser", (PyCFunction) formatter_parser,
16201      METH_O, PyDoc_STR("parse the argument as a format string")},
16202     {NULL, NULL}
16203 };
16204 
16205 static struct PyModuleDef _string_module = {
16206     PyModuleDef_HEAD_INIT,
16207     .m_name = "_string",
16208     .m_doc = PyDoc_STR("string helper module"),
16209     .m_size = 0,
16210     .m_methods = _string_methods,
16211 };
16212 
16213 PyMODINIT_FUNC
PyInit__string(void)16214 PyInit__string(void)
16215 {
16216     return PyModuleDef_Init(&_string_module);
16217 }
16218 
16219 
16220 #ifdef __cplusplus
16221 }
16222 #endif
16223