• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h"       // _PyIndex_Check()
44 #include "pycore_bytes_methods.h"
45 #include "pycore_fileutils.h"
46 #include "pycore_initconfig.h"
47 #include "pycore_interp.h"         // PyInterpreterState.fs_codec
48 #include "pycore_object.h"
49 #include "pycore_pathconfig.h"
50 #include "pycore_pylifecycle.h"
51 #include "pycore_pystate.h"        // _PyInterpreterState_GET()
52 #include "ucnhash.h"
53 #include "stringlib/eq.h"
54 
55 #ifdef MS_WINDOWS
56 #include <windows.h>
57 #endif
58 
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61 /* #define INTERNED_STATS 1 */
62 
63 
64 /*[clinic input]
65 class str "PyObject *" "&PyUnicode_Type"
66 [clinic start generated code]*/
67 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68 
69 /*[python input]
70 class Py_UCS4_converter(CConverter):
71     type = 'Py_UCS4'
72     converter = 'convert_uc'
73 
74     def converter_init(self):
75         if self.default is not unspecified:
76             self.c_default = ascii(self.default)
77             if len(self.c_default) > 4 or self.c_default[0] != "'":
78                 self.c_default = hex(ord(self.default))
79 
80 [python start generated code]*/
81 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
82 
83 /* --- Globals ------------------------------------------------------------
84 
85 NOTE: In the interpreter's initialization phase, some globals are currently
86       initialized dynamically as needed. In the process Unicode objects may
87       be created before the Unicode type is ready.
88 
89 */
90 
91 
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95 
96 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97 #define MAX_UNICODE 0x10ffff
98 
99 #ifdef Py_DEBUG
100 #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
101 #else
102 #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
103 #endif
104 
105 #define _PyUnicode_UTF8(op)                             \
106     (((PyCompactUnicodeObject*)(op))->utf8)
107 #define PyUnicode_UTF8(op)                              \
108     (assert(_PyUnicode_CHECK(op)),                      \
109      assert(PyUnicode_IS_READY(op)),                    \
110      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
111          ((char*)((PyASCIIObject*)(op) + 1)) :          \
112          _PyUnicode_UTF8(op))
113 #define _PyUnicode_UTF8_LENGTH(op)                      \
114     (((PyCompactUnicodeObject*)(op))->utf8_length)
115 #define PyUnicode_UTF8_LENGTH(op)                       \
116     (assert(_PyUnicode_CHECK(op)),                      \
117      assert(PyUnicode_IS_READY(op)),                    \
118      PyUnicode_IS_COMPACT_ASCII(op) ?                   \
119          ((PyASCIIObject*)(op))->length :               \
120          _PyUnicode_UTF8_LENGTH(op))
121 #define _PyUnicode_WSTR(op)                             \
122     (((PyASCIIObject*)(op))->wstr)
123 
/* Don't use deprecated macro of unicodeobject.h.

   Length of the wstr representation of `op`: taken from
   PyASCIIObject.length when PyUnicode_IS_COMPACT_ASCII(op) is true and
   from PyCompactUnicodeObject.wstr_length otherwise.  `op` is evaluated
   more than once, so it must not have side effects; it is parenthesized
   before each cast so that compound expressions are cast as a whole. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
130 #define _PyUnicode_WSTR_LENGTH(op)                      \
131     (((PyCompactUnicodeObject*)(op))->wstr_length)
132 #define _PyUnicode_LENGTH(op)                           \
133     (((PyASCIIObject *)(op))->length)
134 #define _PyUnicode_STATE(op)                            \
135     (((PyASCIIObject *)(op))->state)
136 #define _PyUnicode_HASH(op)                             \
137     (((PyASCIIObject *)(op))->hash)
138 #define _PyUnicode_KIND(op)                             \
139     (assert(_PyUnicode_CHECK(op)),                      \
140      ((PyASCIIObject *)(op))->state.kind)
141 #define _PyUnicode_GET_LENGTH(op)                       \
142     (assert(_PyUnicode_CHECK(op)),                      \
143      ((PyASCIIObject *)(op))->length)
144 #define _PyUnicode_DATA_ANY(op)                         \
145     (((PyUnicodeObject*)(op))->data.any)
146 
147 #undef PyUnicode_READY
148 #define PyUnicode_READY(op)                             \
149     (assert(_PyUnicode_CHECK(op)),                      \
150      (PyUnicode_IS_READY(op) ?                          \
151       0 :                                               \
152       _PyUnicode_Ready(op)))
153 
/* True if the UTF-8 pointer of `op` is the same pointer as its character
   data.  Asserts that `op` is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data.  The comparison uses the macro argument itself rather than
   whatever variable named `unicode` happens to be in the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161 
162 /* true if the Unicode object has an allocated UTF-8 memory block
163    (not shared with other data) */
164 #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
165     ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
166       && _PyUnicode_UTF8(op)                            \
167       && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
168 
169 /* true if the Unicode object has an allocated wstr memory block
170    (not shared with other data) */
171 #define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
172     ((_PyUnicode_WSTR(op) &&                            \
173       (!PyUnicode_IS_READY(op) ||                       \
174        _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175 
176 /* Generic helper macro to convert characters of different types.
177    from_type and to_type have to be valid type names, begin and end
178    are pointers to the source characters which should be of type
179    "from_type *".  to is a pointer of type "to_type *" and points to the
180    buffer where the result characters are written to. */
181 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
182     do {                                                \
183         to_type *_to = (to_type *)(to);                \
184         const from_type *_iter = (const from_type *)(begin);\
185         const from_type *_end = (const from_type *)(end);\
186         Py_ssize_t n = (_end) - (_iter);                \
187         const from_type *_unrolled_end =                \
188             _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
189         while (_iter < (_unrolled_end)) {               \
190             _to[0] = (to_type) _iter[0];                \
191             _to[1] = (to_type) _iter[1];                \
192             _to[2] = (to_type) _iter[2];                \
193             _to[3] = (to_type) _iter[3];                \
194             _iter += 4; _to += 4;                       \
195         }                                               \
196         while (_iter < (_end))                          \
197             *_to++ = (to_type) *_iter++;                \
198     } while (0)
199 
200 #ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
202 #  define OVERALLOCATE_FACTOR 2
203 #else
   /* On Linux, overallocating by 25% is the best factor */
205 #  define OVERALLOCATE_FACTOR 4
206 #endif
207 
208 #define INTERNED_STRINGS
209 
210 /* This dictionary holds all interned unicode strings.  Note that references
211    to strings in this dictionary are *not* counted in the string's ob_refcnt.
212    When the interned string reaches a refcnt of 0 the string deallocation
213    function will delete the reference from this dictionary.
214 
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
217 */
218 #ifdef INTERNED_STRINGS
219 static PyObject *interned = NULL;
220 #endif
221 
222 /* The empty Unicode object is shared to improve performance. */
223 static PyObject *unicode_empty = NULL;
224 
225 #define _Py_INCREF_UNICODE_EMPTY()                      \
226     do {                                                \
227         if (unicode_empty != NULL)                      \
228             Py_INCREF(unicode_empty);                   \
229         else {                                          \
230             unicode_empty = PyUnicode_New(0, 0);        \
231             if (unicode_empty != NULL) {                \
232                 Py_INCREF(unicode_empty);               \
233                 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
234             }                                           \
235         }                                               \
236     } while (0)
237 
238 #define _Py_RETURN_UNICODE_EMPTY()                      \
239     do {                                                \
240         _Py_INCREF_UNICODE_EMPTY();                     \
241         return unicode_empty;                           \
242     } while (0)
243 
244 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)245 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
246              Py_ssize_t start, Py_ssize_t length)
247 {
248     assert(0 <= start);
249     assert(kind != PyUnicode_WCHAR_KIND);
250     switch (kind) {
251     case PyUnicode_1BYTE_KIND: {
252         assert(value <= 0xff);
253         Py_UCS1 ch = (unsigned char)value;
254         Py_UCS1 *to = (Py_UCS1 *)data + start;
255         memset(to, ch, length);
256         break;
257     }
258     case PyUnicode_2BYTE_KIND: {
259         assert(value <= 0xffff);
260         Py_UCS2 ch = (Py_UCS2)value;
261         Py_UCS2 *to = (Py_UCS2 *)data + start;
262         const Py_UCS2 *end = to + length;
263         for (; to < end; ++to) *to = ch;
264         break;
265     }
266     case PyUnicode_4BYTE_KIND: {
267         assert(value <= MAX_UNICODE);
268         Py_UCS4 ch = value;
269         Py_UCS4 * to = (Py_UCS4 *)data + start;
270         const Py_UCS4 *end = to + length;
271         for (; to < end; ++to) *to = ch;
272         break;
273     }
274     default: Py_UNREACHABLE();
275     }
276 }
277 
278 
279 /* Forward declaration */
280 static inline int
281 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
282 static inline void
283 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
284 static PyObject *
285 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
286                     const char *errors);
287 static PyObject *
288 unicode_decode_utf8(const char *s, Py_ssize_t size,
289                     _Py_error_handler error_handler, const char *errors,
290                     Py_ssize_t *consumed);
291 
292 /* List of static strings. */
293 static _Py_Identifier *static_strings = NULL;
294 
295 #define LATIN1_SINGLETONS
296 
297 #ifdef LATIN1_SINGLETONS
298 /* Single character Unicode strings in the Latin-1 range are being
299    shared as well. */
300 static PyObject *unicode_latin1[256] = {NULL};
301 #endif
302 
303 /* Fast detection of the most frequent whitespace characters */
304 const unsigned char _Py_ascii_whitespace[] = {
305     0, 0, 0, 0, 0, 0, 0, 0,
306 /*     case 0x0009: * CHARACTER TABULATION */
307 /*     case 0x000A: * LINE FEED */
308 /*     case 0x000B: * LINE TABULATION */
309 /*     case 0x000C: * FORM FEED */
310 /*     case 0x000D: * CARRIAGE RETURN */
311     0, 1, 1, 1, 1, 1, 0, 0,
312     0, 0, 0, 0, 0, 0, 0, 0,
313 /*     case 0x001C: * FILE SEPARATOR */
314 /*     case 0x001D: * GROUP SEPARATOR */
315 /*     case 0x001E: * RECORD SEPARATOR */
316 /*     case 0x001F: * UNIT SEPARATOR */
317     0, 0, 0, 0, 1, 1, 1, 1,
318 /*     case 0x0020: * SPACE */
319     1, 0, 0, 0, 0, 0, 0, 0,
320     0, 0, 0, 0, 0, 0, 0, 0,
321     0, 0, 0, 0, 0, 0, 0, 0,
322     0, 0, 0, 0, 0, 0, 0, 0,
323 
324     0, 0, 0, 0, 0, 0, 0, 0,
325     0, 0, 0, 0, 0, 0, 0, 0,
326     0, 0, 0, 0, 0, 0, 0, 0,
327     0, 0, 0, 0, 0, 0, 0, 0,
328     0, 0, 0, 0, 0, 0, 0, 0,
329     0, 0, 0, 0, 0, 0, 0, 0,
330     0, 0, 0, 0, 0, 0, 0, 0,
331     0, 0, 0, 0, 0, 0, 0, 0
332 };
333 
334 /* forward */
335 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
336 static PyObject* get_latin1_char(unsigned char ch);
337 static int unicode_modifiable(PyObject *unicode);
338 
339 
340 static PyObject *
341 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
342 static PyObject *
343 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
344 static PyObject *
345 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
346 
347 static PyObject *
348 unicode_encode_call_errorhandler(const char *errors,
349        PyObject **errorHandler,const char *encoding, const char *reason,
350        PyObject *unicode, PyObject **exceptionObject,
351        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
352 
353 static void
354 raise_encode_exception(PyObject **exceptionObject,
355                        const char *encoding,
356                        PyObject *unicode,
357                        Py_ssize_t startpos, Py_ssize_t endpos,
358                        const char *reason);
359 
360 /* Same for linebreaks */
361 static const unsigned char ascii_linebreak[] = {
362     0, 0, 0, 0, 0, 0, 0, 0,
363 /*         0x000A, * LINE FEED */
364 /*         0x000B, * LINE TABULATION */
365 /*         0x000C, * FORM FEED */
366 /*         0x000D, * CARRIAGE RETURN */
367     0, 0, 1, 1, 1, 1, 0, 0,
368     0, 0, 0, 0, 0, 0, 0, 0,
369 /*         0x001C, * FILE SEPARATOR */
370 /*         0x001D, * GROUP SEPARATOR */
371 /*         0x001E, * RECORD SEPARATOR */
372     0, 0, 0, 0, 1, 1, 1, 0,
373     0, 0, 0, 0, 0, 0, 0, 0,
374     0, 0, 0, 0, 0, 0, 0, 0,
375     0, 0, 0, 0, 0, 0, 0, 0,
376     0, 0, 0, 0, 0, 0, 0, 0,
377 
378     0, 0, 0, 0, 0, 0, 0, 0,
379     0, 0, 0, 0, 0, 0, 0, 0,
380     0, 0, 0, 0, 0, 0, 0, 0,
381     0, 0, 0, 0, 0, 0, 0, 0,
382     0, 0, 0, 0, 0, 0, 0, 0,
383     0, 0, 0, 0, 0, 0, 0, 0,
384     0, 0, 0, 0, 0, 0, 0, 0,
385     0, 0, 0, 0, 0, 0, 0, 0
386 };
387 
388 static int convert_uc(PyObject *obj, void *addr);
389 
390 #include "clinic/unicodeobject.c.h"
391 
392 _Py_error_handler
_Py_GetErrorHandler(const char * errors)393 _Py_GetErrorHandler(const char *errors)
394 {
395     if (errors == NULL || strcmp(errors, "strict") == 0) {
396         return _Py_ERROR_STRICT;
397     }
398     if (strcmp(errors, "surrogateescape") == 0) {
399         return _Py_ERROR_SURROGATEESCAPE;
400     }
401     if (strcmp(errors, "replace") == 0) {
402         return _Py_ERROR_REPLACE;
403     }
404     if (strcmp(errors, "ignore") == 0) {
405         return _Py_ERROR_IGNORE;
406     }
407     if (strcmp(errors, "backslashreplace") == 0) {
408         return _Py_ERROR_BACKSLASHREPLACE;
409     }
410     if (strcmp(errors, "surrogatepass") == 0) {
411         return _Py_ERROR_SURROGATEPASS;
412     }
413     if (strcmp(errors, "xmlcharrefreplace") == 0) {
414         return _Py_ERROR_XMLCHARREFREPLACE;
415     }
416     return _Py_ERROR_OTHER;
417 }
418 
419 
420 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)421 get_error_handler_wide(const wchar_t *errors)
422 {
423     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
424         return _Py_ERROR_STRICT;
425     }
426     if (wcscmp(errors, L"surrogateescape") == 0) {
427         return _Py_ERROR_SURROGATEESCAPE;
428     }
429     if (wcscmp(errors, L"replace") == 0) {
430         return _Py_ERROR_REPLACE;
431     }
432     if (wcscmp(errors, L"ignore") == 0) {
433         return _Py_ERROR_IGNORE;
434     }
435     if (wcscmp(errors, L"backslashreplace") == 0) {
436         return _Py_ERROR_BACKSLASHREPLACE;
437     }
438     if (wcscmp(errors, L"surrogatepass") == 0) {
439         return _Py_ERROR_SURROGATEPASS;
440     }
441     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
442         return _Py_ERROR_XMLCHARREFREPLACE;
443     }
444     return _Py_ERROR_OTHER;
445 }
446 
447 
448 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)449 unicode_check_encoding_errors(const char *encoding, const char *errors)
450 {
451     if (encoding == NULL && errors == NULL) {
452         return 0;
453     }
454 
455     PyInterpreterState *interp = _PyInterpreterState_GET();
456 #ifndef Py_DEBUG
457     /* In release mode, only check in development mode (-X dev) */
458     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
459         return 0;
460     }
461 #else
462     /* Always check in debug mode */
463 #endif
464 
465     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
466        codec registry is ready: before_PyUnicode_InitEncodings() is called. */
467     if (!interp->unicode.fs_codec.encoding) {
468         return 0;
469     }
470 
471     /* Disable checks during Python finalization. For example, it allows to
472        call _PyObject_Dump() during finalization for debugging purpose. */
473     if (interp->finalizing) {
474         return 0;
475     }
476 
477     if (encoding != NULL) {
478         PyObject *handler = _PyCodec_Lookup(encoding);
479         if (handler == NULL) {
480             return -1;
481         }
482         Py_DECREF(handler);
483     }
484 
485     if (errors != NULL) {
486         PyObject *handler = PyCodec_LookupError(errors);
487         if (handler == NULL) {
488             return -1;
489         }
490         Py_DECREF(handler);
491     }
492     return 0;
493 }
494 
495 
496 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
497    This function is kept for backward compatibility with the old API. */
498 Py_UNICODE
PyUnicode_GetMax(void)499 PyUnicode_GetMax(void)
500 {
501 #ifdef Py_UNICODE_WIDE
502     return 0x10FFFF;
503 #else
504     /* This is actually an illegal character, so it should
505        not be passed to unichr. */
506     return 0xFFFF;
507 #endif
508 }
509 
510 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)511 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
512 {
513 #define CHECK(expr) \
514     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
515 
516     PyASCIIObject *ascii;
517     unsigned int kind;
518 
519     assert(op != NULL);
520     CHECK(PyUnicode_Check(op));
521 
522     ascii = (PyASCIIObject *)op;
523     kind = ascii->state.kind;
524 
525     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
526         CHECK(kind == PyUnicode_1BYTE_KIND);
527         CHECK(ascii->state.ready == 1);
528     }
529     else {
530         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
531         void *data;
532 
533         if (ascii->state.compact == 1) {
534             data = compact + 1;
535             CHECK(kind == PyUnicode_1BYTE_KIND
536                                  || kind == PyUnicode_2BYTE_KIND
537                                  || kind == PyUnicode_4BYTE_KIND);
538             CHECK(ascii->state.ascii == 0);
539             CHECK(ascii->state.ready == 1);
540             CHECK(compact->utf8 != data);
541         }
542         else {
543             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
544 
545             data = unicode->data.any;
546             if (kind == PyUnicode_WCHAR_KIND) {
547                 CHECK(ascii->length == 0);
548                 CHECK(ascii->hash == -1);
549                 CHECK(ascii->state.compact == 0);
550                 CHECK(ascii->state.ascii == 0);
551                 CHECK(ascii->state.ready == 0);
552                 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
553                 CHECK(ascii->wstr != NULL);
554                 CHECK(data == NULL);
555                 CHECK(compact->utf8 == NULL);
556             }
557             else {
558                 CHECK(kind == PyUnicode_1BYTE_KIND
559                                      || kind == PyUnicode_2BYTE_KIND
560                                      || kind == PyUnicode_4BYTE_KIND);
561                 CHECK(ascii->state.compact == 0);
562                 CHECK(ascii->state.ready == 1);
563                 CHECK(data != NULL);
564                 if (ascii->state.ascii) {
565                     CHECK(compact->utf8 == data);
566                     CHECK(compact->utf8_length == ascii->length);
567                 }
568                 else
569                     CHECK(compact->utf8 != data);
570             }
571         }
572         if (kind != PyUnicode_WCHAR_KIND) {
573             if (
574 #if SIZEOF_WCHAR_T == 2
575                 kind == PyUnicode_2BYTE_KIND
576 #else
577                 kind == PyUnicode_4BYTE_KIND
578 #endif
579                )
580             {
581                 CHECK(ascii->wstr == data);
582                 CHECK(compact->wstr_length == ascii->length);
583             } else
584                 CHECK(ascii->wstr != data);
585         }
586 
587         if (compact->utf8 == NULL)
588             CHECK(compact->utf8_length == 0);
589         if (ascii->wstr == NULL)
590             CHECK(compact->wstr_length == 0);
591     }
592 
593     /* check that the best kind is used: O(n) operation */
594     if (check_content && kind != PyUnicode_WCHAR_KIND) {
595         Py_ssize_t i;
596         Py_UCS4 maxchar = 0;
597         const void *data;
598         Py_UCS4 ch;
599 
600         data = PyUnicode_DATA(ascii);
601         for (i=0; i < ascii->length; i++)
602         {
603             ch = PyUnicode_READ(kind, data, i);
604             if (ch > maxchar)
605                 maxchar = ch;
606         }
607         if (kind == PyUnicode_1BYTE_KIND) {
608             if (ascii->state.ascii == 0) {
609                 CHECK(maxchar >= 128);
610                 CHECK(maxchar <= 255);
611             }
612             else
613                 CHECK(maxchar < 128);
614         }
615         else if (kind == PyUnicode_2BYTE_KIND) {
616             CHECK(maxchar >= 0x100);
617             CHECK(maxchar <= 0xFFFF);
618         }
619         else {
620             CHECK(maxchar >= 0x10000);
621             CHECK(maxchar <= MAX_UNICODE);
622         }
623         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
624     }
625     return 1;
626 
627 #undef CHECK
628 }
629 
630 
631 static PyObject*
unicode_result_wchar(PyObject * unicode)632 unicode_result_wchar(PyObject *unicode)
633 {
634 #ifndef Py_DEBUG
635     Py_ssize_t len;
636 
637     len = _PyUnicode_WSTR_LENGTH(unicode);
638     if (len == 0) {
639         Py_DECREF(unicode);
640         _Py_RETURN_UNICODE_EMPTY();
641     }
642 
643     if (len == 1) {
644         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
645         if ((Py_UCS4)ch < 256) {
646             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
647             Py_DECREF(unicode);
648             return latin1_char;
649         }
650     }
651 
652     if (_PyUnicode_Ready(unicode) < 0) {
653         Py_DECREF(unicode);
654         return NULL;
655     }
656 #else
657     assert(Py_REFCNT(unicode) == 1);
658 
659     /* don't make the result ready in debug mode to ensure that the caller
660        makes the string ready before using it */
661     assert(_PyUnicode_CheckConsistency(unicode, 1));
662 #endif
663     return unicode;
664 }
665 
666 static PyObject*
unicode_result_ready(PyObject * unicode)667 unicode_result_ready(PyObject *unicode)
668 {
669     Py_ssize_t length;
670 
671     length = PyUnicode_GET_LENGTH(unicode);
672     if (length == 0) {
673         if (unicode != unicode_empty) {
674             Py_DECREF(unicode);
675             _Py_RETURN_UNICODE_EMPTY();
676         }
677         return unicode_empty;
678     }
679 
680 #ifdef LATIN1_SINGLETONS
681     if (length == 1) {
682         const void *data = PyUnicode_DATA(unicode);
683         int kind = PyUnicode_KIND(unicode);
684         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
685         if (ch < 256) {
686             PyObject *latin1_char = unicode_latin1[ch];
687             if (latin1_char != NULL) {
688                 if (unicode != latin1_char) {
689                     Py_INCREF(latin1_char);
690                     Py_DECREF(unicode);
691                 }
692                 return latin1_char;
693             }
694             else {
695                 assert(_PyUnicode_CheckConsistency(unicode, 1));
696                 Py_INCREF(unicode);
697                 unicode_latin1[ch] = unicode;
698                 return unicode;
699             }
700         }
701     }
702 #endif
703 
704     assert(_PyUnicode_CheckConsistency(unicode, 1));
705     return unicode;
706 }
707 
708 static PyObject*
unicode_result(PyObject * unicode)709 unicode_result(PyObject *unicode)
710 {
711     assert(_PyUnicode_CHECK(unicode));
712     if (PyUnicode_IS_READY(unicode))
713         return unicode_result_ready(unicode);
714     else
715         return unicode_result_wchar(unicode);
716 }
717 
718 static PyObject*
unicode_result_unchanged(PyObject * unicode)719 unicode_result_unchanged(PyObject *unicode)
720 {
721     if (PyUnicode_CheckExact(unicode)) {
722         if (PyUnicode_READY(unicode) == -1)
723             return NULL;
724         Py_INCREF(unicode);
725         return unicode;
726     }
727     else
728         /* Subtype -- return genuine unicode string with the same value. */
729         return _PyUnicode_Copy(unicode);
730 }
731 
732 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
733    ASCII, Latin1, UTF-8, etc. */
734 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)735 backslashreplace(_PyBytesWriter *writer, char *str,
736                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
737 {
738     Py_ssize_t size, i;
739     Py_UCS4 ch;
740     enum PyUnicode_Kind kind;
741     const void *data;
742 
743     assert(PyUnicode_IS_READY(unicode));
744     kind = PyUnicode_KIND(unicode);
745     data = PyUnicode_DATA(unicode);
746 
747     size = 0;
748     /* determine replacement size */
749     for (i = collstart; i < collend; ++i) {
750         Py_ssize_t incr;
751 
752         ch = PyUnicode_READ(kind, data, i);
753         if (ch < 0x100)
754             incr = 2+2;
755         else if (ch < 0x10000)
756             incr = 2+4;
757         else {
758             assert(ch <= MAX_UNICODE);
759             incr = 2+8;
760         }
761         if (size > PY_SSIZE_T_MAX - incr) {
762             PyErr_SetString(PyExc_OverflowError,
763                             "encoded result is too long for a Python string");
764             return NULL;
765         }
766         size += incr;
767     }
768 
769     str = _PyBytesWriter_Prepare(writer, str, size);
770     if (str == NULL)
771         return NULL;
772 
773     /* generate replacement */
774     for (i = collstart; i < collend; ++i) {
775         ch = PyUnicode_READ(kind, data, i);
776         *str++ = '\\';
777         if (ch >= 0x00010000) {
778             *str++ = 'U';
779             *str++ = Py_hexdigits[(ch>>28)&0xf];
780             *str++ = Py_hexdigits[(ch>>24)&0xf];
781             *str++ = Py_hexdigits[(ch>>20)&0xf];
782             *str++ = Py_hexdigits[(ch>>16)&0xf];
783             *str++ = Py_hexdigits[(ch>>12)&0xf];
784             *str++ = Py_hexdigits[(ch>>8)&0xf];
785         }
786         else if (ch >= 0x100) {
787             *str++ = 'u';
788             *str++ = Py_hexdigits[(ch>>12)&0xf];
789             *str++ = Py_hexdigits[(ch>>8)&0xf];
790         }
791         else
792             *str++ = 'x';
793         *str++ = Py_hexdigits[(ch>>4)&0xf];
794         *str++ = Py_hexdigits[ch&0xf];
795     }
796     return str;
797 }
798 
799 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
800    ASCII, Latin1, UTF-8, etc. */
801 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)802 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
803                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
804 {
805     Py_ssize_t size, i;
806     Py_UCS4 ch;
807     enum PyUnicode_Kind kind;
808     const void *data;
809 
810     assert(PyUnicode_IS_READY(unicode));
811     kind = PyUnicode_KIND(unicode);
812     data = PyUnicode_DATA(unicode);
813 
814     size = 0;
815     /* determine replacement size */
816     for (i = collstart; i < collend; ++i) {
817         Py_ssize_t incr;
818 
819         ch = PyUnicode_READ(kind, data, i);
820         if (ch < 10)
821             incr = 2+1+1;
822         else if (ch < 100)
823             incr = 2+2+1;
824         else if (ch < 1000)
825             incr = 2+3+1;
826         else if (ch < 10000)
827             incr = 2+4+1;
828         else if (ch < 100000)
829             incr = 2+5+1;
830         else if (ch < 1000000)
831             incr = 2+6+1;
832         else {
833             assert(ch <= MAX_UNICODE);
834             incr = 2+7+1;
835         }
836         if (size > PY_SSIZE_T_MAX - incr) {
837             PyErr_SetString(PyExc_OverflowError,
838                             "encoded result is too long for a Python string");
839             return NULL;
840         }
841         size += incr;
842     }
843 
844     str = _PyBytesWriter_Prepare(writer, str, size);
845     if (str == NULL)
846         return NULL;
847 
848     /* generate replacement */
849     for (i = collstart; i < collend; ++i) {
850         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
851         if (size < 0) {
852             return NULL;
853         }
854         str += size;
855     }
856     return str;
857 }
858 
859 /* --- Bloom Filters ----------------------------------------------------- */
860 
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
864 
865 /* the linebreak mask is set up by Unicode_Init below */
866 
867 #if LONG_BIT >= 128
868 #define BLOOM_WIDTH 128
869 #elif LONG_BIT >= 64
870 #define BLOOM_WIDTH 64
871 #elif LONG_BIT >= 32
872 #define BLOOM_WIDTH 32
873 #else
874 #error "LONG_BIT is smaller than 32"
875 #endif
876 
877 #define BLOOM_MASK unsigned long
878 
879 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
880 
881 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
882 
883 #define BLOOM_LINEBREAK(ch)                                             \
884     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
885      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
886 
887 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)888 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
889 {
890 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
891     do {                                               \
892         TYPE *data = (TYPE *)PTR;                      \
893         TYPE *end = data + LEN;                        \
894         Py_UCS4 ch;                                    \
895         for (; data != end; data++) {                  \
896             ch = *data;                                \
897             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
898         }                                              \
899         break;                                         \
900     } while (0)
901 
902     /* calculate simple bloom-style bitmask for a given unicode string */
903 
904     BLOOM_MASK mask;
905 
906     mask = 0;
907     switch (kind) {
908     case PyUnicode_1BYTE_KIND:
909         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
910         break;
911     case PyUnicode_2BYTE_KIND:
912         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
913         break;
914     case PyUnicode_4BYTE_KIND:
915         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
916         break;
917     default:
918         Py_UNREACHABLE();
919     }
920     return mask;
921 
922 #undef BLOOM_UPDATE
923 }
924 
925 static int
ensure_unicode(PyObject * obj)926 ensure_unicode(PyObject *obj)
927 {
928     if (!PyUnicode_Check(obj)) {
929         PyErr_Format(PyExc_TypeError,
930                      "must be str, not %.100s",
931                      Py_TYPE(obj)->tp_name);
932         return -1;
933     }
934     return PyUnicode_READY(obj);
935 }
936 
937 /* Compilation of templated routines */
938 
939 #include "stringlib/asciilib.h"
940 #include "stringlib/fastsearch.h"
941 #include "stringlib/partition.h"
942 #include "stringlib/split.h"
943 #include "stringlib/count.h"
944 #include "stringlib/find.h"
945 #include "stringlib/find_max_char.h"
946 #include "stringlib/undef.h"
947 
948 #include "stringlib/ucs1lib.h"
949 #include "stringlib/fastsearch.h"
950 #include "stringlib/partition.h"
951 #include "stringlib/split.h"
952 #include "stringlib/count.h"
953 #include "stringlib/find.h"
954 #include "stringlib/replace.h"
955 #include "stringlib/find_max_char.h"
956 #include "stringlib/undef.h"
957 
958 #include "stringlib/ucs2lib.h"
959 #include "stringlib/fastsearch.h"
960 #include "stringlib/partition.h"
961 #include "stringlib/split.h"
962 #include "stringlib/count.h"
963 #include "stringlib/find.h"
964 #include "stringlib/replace.h"
965 #include "stringlib/find_max_char.h"
966 #include "stringlib/undef.h"
967 
968 #include "stringlib/ucs4lib.h"
969 #include "stringlib/fastsearch.h"
970 #include "stringlib/partition.h"
971 #include "stringlib/split.h"
972 #include "stringlib/count.h"
973 #include "stringlib/find.h"
974 #include "stringlib/replace.h"
975 #include "stringlib/find_max_char.h"
976 #include "stringlib/undef.h"
977 
978 _Py_COMP_DIAG_PUSH
979 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
980 #include "stringlib/unicodedefs.h"
981 #include "stringlib/fastsearch.h"
982 #include "stringlib/count.h"
983 #include "stringlib/find.h"
984 #include "stringlib/undef.h"
985 _Py_COMP_DIAG_POP
986 
987 /* --- Unicode Object ----------------------------------------------------- */
988 
989 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)990 findchar(const void *s, int kind,
991          Py_ssize_t size, Py_UCS4 ch,
992          int direction)
993 {
994     switch (kind) {
995     case PyUnicode_1BYTE_KIND:
996         if ((Py_UCS1) ch != ch)
997             return -1;
998         if (direction > 0)
999             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000         else
1001             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002     case PyUnicode_2BYTE_KIND:
1003         if ((Py_UCS2) ch != ch)
1004             return -1;
1005         if (direction > 0)
1006             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007         else
1008             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009     case PyUnicode_4BYTE_KIND:
1010         if (direction > 0)
1011             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012         else
1013             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014     default:
1015         Py_UNREACHABLE();
1016     }
1017 }
1018 
1019 #ifdef Py_DEBUG
1020 /* Fill the data of a Unicode string with invalid characters to detect bugs
1021    earlier.
1022 
1023    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1024    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1025    invalid character in Unicode 6.0. */
1026 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1027 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1028 {
1029     int kind = PyUnicode_KIND(unicode);
1030     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1031     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1032     if (length <= old_length)
1033         return;
1034     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1035 }
1036 #endif
1037 
1038 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1039 resize_compact(PyObject *unicode, Py_ssize_t length)
1040 {
1041     Py_ssize_t char_size;
1042     Py_ssize_t struct_size;
1043     Py_ssize_t new_size;
1044     int share_wstr;
1045     PyObject *new_unicode;
1046 #ifdef Py_DEBUG
1047     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1048 #endif
1049 
1050     assert(unicode_modifiable(unicode));
1051     assert(PyUnicode_IS_READY(unicode));
1052     assert(PyUnicode_IS_COMPACT(unicode));
1053 
1054     char_size = PyUnicode_KIND(unicode);
1055     if (PyUnicode_IS_ASCII(unicode))
1056         struct_size = sizeof(PyASCIIObject);
1057     else
1058         struct_size = sizeof(PyCompactUnicodeObject);
1059     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1060 
1061     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1062         PyErr_NoMemory();
1063         return NULL;
1064     }
1065     new_size = (struct_size + (length + 1) * char_size);
1066 
1067     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1068         PyObject_DEL(_PyUnicode_UTF8(unicode));
1069         _PyUnicode_UTF8(unicode) = NULL;
1070         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1071     }
1072 #ifdef Py_REF_DEBUG
1073     _Py_RefTotal--;
1074 #endif
1075 #ifdef Py_TRACE_REFS
1076     _Py_ForgetReference(unicode);
1077 #endif
1078 
1079     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
1080     if (new_unicode == NULL) {
1081         _Py_NewReference(unicode);
1082         PyErr_NoMemory();
1083         return NULL;
1084     }
1085     unicode = new_unicode;
1086     _Py_NewReference(unicode);
1087 
1088     _PyUnicode_LENGTH(unicode) = length;
1089     if (share_wstr) {
1090         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1091         if (!PyUnicode_IS_ASCII(unicode))
1092             _PyUnicode_WSTR_LENGTH(unicode) = length;
1093     }
1094     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1095         PyObject_DEL(_PyUnicode_WSTR(unicode));
1096         _PyUnicode_WSTR(unicode) = NULL;
1097         if (!PyUnicode_IS_ASCII(unicode))
1098             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1099     }
1100 #ifdef Py_DEBUG
1101     unicode_fill_invalid(unicode, old_length);
1102 #endif
1103     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1104                     length, 0);
1105     assert(_PyUnicode_CheckConsistency(unicode, 0));
1106     return unicode;
1107 }
1108 
1109 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1110 resize_inplace(PyObject *unicode, Py_ssize_t length)
1111 {
1112     wchar_t *wstr;
1113     Py_ssize_t new_size;
1114     assert(!PyUnicode_IS_COMPACT(unicode));
1115     assert(Py_REFCNT(unicode) == 1);
1116 
1117     if (PyUnicode_IS_READY(unicode)) {
1118         Py_ssize_t char_size;
1119         int share_wstr, share_utf8;
1120         void *data;
1121 #ifdef Py_DEBUG
1122         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1123 #endif
1124 
1125         data = _PyUnicode_DATA_ANY(unicode);
1126         char_size = PyUnicode_KIND(unicode);
1127         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1128         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1129 
1130         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1131             PyErr_NoMemory();
1132             return -1;
1133         }
1134         new_size = (length + 1) * char_size;
1135 
1136         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1137         {
1138             PyObject_DEL(_PyUnicode_UTF8(unicode));
1139             _PyUnicode_UTF8(unicode) = NULL;
1140             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141         }
1142 
1143         data = (PyObject *)PyObject_REALLOC(data, new_size);
1144         if (data == NULL) {
1145             PyErr_NoMemory();
1146             return -1;
1147         }
1148         _PyUnicode_DATA_ANY(unicode) = data;
1149         if (share_wstr) {
1150             _PyUnicode_WSTR(unicode) = data;
1151             _PyUnicode_WSTR_LENGTH(unicode) = length;
1152         }
1153         if (share_utf8) {
1154             _PyUnicode_UTF8(unicode) = data;
1155             _PyUnicode_UTF8_LENGTH(unicode) = length;
1156         }
1157         _PyUnicode_LENGTH(unicode) = length;
1158         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1159 #ifdef Py_DEBUG
1160         unicode_fill_invalid(unicode, old_length);
1161 #endif
1162         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1163             assert(_PyUnicode_CheckConsistency(unicode, 0));
1164             return 0;
1165         }
1166     }
1167     assert(_PyUnicode_WSTR(unicode) != NULL);
1168 
1169     /* check for integer overflow */
1170     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171         PyErr_NoMemory();
1172         return -1;
1173     }
1174     new_size = sizeof(wchar_t) * (length + 1);
1175     wstr =  _PyUnicode_WSTR(unicode);
1176     wstr = PyObject_REALLOC(wstr, new_size);
1177     if (!wstr) {
1178         PyErr_NoMemory();
1179         return -1;
1180     }
1181     _PyUnicode_WSTR(unicode) = wstr;
1182     _PyUnicode_WSTR(unicode)[length] = 0;
1183     _PyUnicode_WSTR_LENGTH(unicode) = length;
1184     assert(_PyUnicode_CheckConsistency(unicode, 0));
1185     return 0;
1186 }
1187 
1188 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1189 resize_copy(PyObject *unicode, Py_ssize_t length)
1190 {
1191     Py_ssize_t copy_length;
1192     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193         PyObject *copy;
1194 
1195         assert(PyUnicode_IS_READY(unicode));
1196 
1197         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198         if (copy == NULL)
1199             return NULL;
1200 
1201         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203         return copy;
1204     }
1205     else {
1206         PyObject *w;
1207 
1208         w = (PyObject*)_PyUnicode_New(length);
1209         if (w == NULL)
1210             return NULL;
1211         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212         copy_length = Py_MIN(copy_length, length);
1213         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214                   copy_length * sizeof(wchar_t));
1215         return w;
1216     }
1217 }
1218 
1219 /* We allocate one more byte to make sure the string is
1220    Ux0000 terminated; some code (e.g. new_identifier)
1221    relies on that.
1222 
1223    XXX This allocator could further be enhanced by assuring that the
1224    free list never reduces its size below 1.
1225 
1226 */
1227 
1228 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1229 _PyUnicode_New(Py_ssize_t length)
1230 {
1231     PyUnicodeObject *unicode;
1232     size_t new_size;
1233 
1234     /* Optimization for empty strings */
1235     if (length == 0 && unicode_empty != NULL) {
1236         Py_INCREF(unicode_empty);
1237         return (PyUnicodeObject*)unicode_empty;
1238     }
1239 
1240     /* Ensure we won't overflow the size. */
1241     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1242         return (PyUnicodeObject *)PyErr_NoMemory();
1243     }
1244     if (length < 0) {
1245         PyErr_SetString(PyExc_SystemError,
1246                         "Negative size passed to _PyUnicode_New");
1247         return NULL;
1248     }
1249 
1250     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1251     if (unicode == NULL)
1252         return NULL;
1253     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1254 
1255     _PyUnicode_WSTR_LENGTH(unicode) = length;
1256     _PyUnicode_HASH(unicode) = -1;
1257     _PyUnicode_STATE(unicode).interned = 0;
1258     _PyUnicode_STATE(unicode).kind = 0;
1259     _PyUnicode_STATE(unicode).compact = 0;
1260     _PyUnicode_STATE(unicode).ready = 0;
1261     _PyUnicode_STATE(unicode).ascii = 0;
1262     _PyUnicode_DATA_ANY(unicode) = NULL;
1263     _PyUnicode_LENGTH(unicode) = 0;
1264     _PyUnicode_UTF8(unicode) = NULL;
1265     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1266 
1267     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1268     if (!_PyUnicode_WSTR(unicode)) {
1269         Py_DECREF(unicode);
1270         PyErr_NoMemory();
1271         return NULL;
1272     }
1273 
1274     /* Initialize the first element to guard against cases where
1275      * the caller fails before initializing str -- unicode_resize()
1276      * reads str[0], and the Keep-Alive optimization can keep memory
1277      * allocated for str alive across a call to unicode_dealloc(unicode).
1278      * We don't want unicode_resize to read uninitialized memory in
1279      * that case.
1280      */
1281     _PyUnicode_WSTR(unicode)[0] = 0;
1282     _PyUnicode_WSTR(unicode)[length] = 0;
1283 
1284     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1285     return unicode;
1286 }
1287 
1288 static const char*
unicode_kind_name(PyObject * unicode)1289 unicode_kind_name(PyObject *unicode)
1290 {
1291     /* don't check consistency: unicode_kind_name() is called from
1292        _PyUnicode_Dump() */
1293     if (!PyUnicode_IS_COMPACT(unicode))
1294     {
1295         if (!PyUnicode_IS_READY(unicode))
1296             return "wstr";
1297         switch (PyUnicode_KIND(unicode))
1298         {
1299         case PyUnicode_1BYTE_KIND:
1300             if (PyUnicode_IS_ASCII(unicode))
1301                 return "legacy ascii";
1302             else
1303                 return "legacy latin1";
1304         case PyUnicode_2BYTE_KIND:
1305             return "legacy UCS2";
1306         case PyUnicode_4BYTE_KIND:
1307             return "legacy UCS4";
1308         default:
1309             return "<legacy invalid kind>";
1310         }
1311     }
1312     assert(PyUnicode_IS_READY(unicode));
1313     switch (PyUnicode_KIND(unicode)) {
1314     case PyUnicode_1BYTE_KIND:
1315         if (PyUnicode_IS_ASCII(unicode))
1316             return "ascii";
1317         else
1318             return "latin1";
1319     case PyUnicode_2BYTE_KIND:
1320         return "UCS2";
1321     case PyUnicode_4BYTE_KIND:
1322         return "UCS4";
1323     default:
1324         return "<invalid compact kind>";
1325     }
1326 }
1327 
1328 #ifdef Py_DEBUG
1329 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1330 const char *_PyUnicode_utf8(void *unicode_raw){
1331     PyObject *unicode = _PyObject_CAST(unicode_raw);
1332     return PyUnicode_UTF8(unicode);
1333 }
1334 
_PyUnicode_compact_data(void * unicode_raw)1335 const void *_PyUnicode_compact_data(void *unicode_raw) {
1336     PyObject *unicode = _PyObject_CAST(unicode_raw);
1337     return _PyUnicode_COMPACT_DATA(unicode);
1338 }
_PyUnicode_data(void * unicode_raw)1339 const void *_PyUnicode_data(void *unicode_raw) {
1340     PyObject *unicode = _PyObject_CAST(unicode_raw);
1341     printf("obj %p\n", (void*)unicode);
1342     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1343     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1344     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1345     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1346     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1347     return PyUnicode_DATA(unicode);
1348 }
1349 
1350 void
_PyUnicode_Dump(PyObject * op)1351 _PyUnicode_Dump(PyObject *op)
1352 {
1353     PyASCIIObject *ascii = (PyASCIIObject *)op;
1354     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1355     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1356     const void *data;
1357 
1358     if (ascii->state.compact)
1359     {
1360         if (ascii->state.ascii)
1361             data = (ascii + 1);
1362         else
1363             data = (compact + 1);
1364     }
1365     else
1366         data = unicode->data.any;
1367     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1368            unicode_kind_name(op), ascii->length);
1369 
1370     if (ascii->wstr == data)
1371         printf("shared ");
1372     printf("wstr=%p", (void *)ascii->wstr);
1373 
1374     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1375         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1376         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1377             printf("shared ");
1378         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1379                (void *)compact->utf8, compact->utf8_length);
1380     }
1381     printf(", data=%p\n", data);
1382 }
1383 #endif
1384 
1385 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1386 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1387 {
1388     PyObject *obj;
1389     PyCompactUnicodeObject *unicode;
1390     void *data;
1391     enum PyUnicode_Kind kind;
1392     int is_sharing, is_ascii;
1393     Py_ssize_t char_size;
1394     Py_ssize_t struct_size;
1395 
1396     /* Optimization for empty strings */
1397     if (size == 0 && unicode_empty != NULL) {
1398         Py_INCREF(unicode_empty);
1399         return unicode_empty;
1400     }
1401 
1402     is_ascii = 0;
1403     is_sharing = 0;
1404     struct_size = sizeof(PyCompactUnicodeObject);
1405     if (maxchar < 128) {
1406         kind = PyUnicode_1BYTE_KIND;
1407         char_size = 1;
1408         is_ascii = 1;
1409         struct_size = sizeof(PyASCIIObject);
1410     }
1411     else if (maxchar < 256) {
1412         kind = PyUnicode_1BYTE_KIND;
1413         char_size = 1;
1414     }
1415     else if (maxchar < 65536) {
1416         kind = PyUnicode_2BYTE_KIND;
1417         char_size = 2;
1418         if (sizeof(wchar_t) == 2)
1419             is_sharing = 1;
1420     }
1421     else {
1422         if (maxchar > MAX_UNICODE) {
1423             PyErr_SetString(PyExc_SystemError,
1424                             "invalid maximum character passed to PyUnicode_New");
1425             return NULL;
1426         }
1427         kind = PyUnicode_4BYTE_KIND;
1428         char_size = 4;
1429         if (sizeof(wchar_t) == 4)
1430             is_sharing = 1;
1431     }
1432 
1433     /* Ensure we won't overflow the size. */
1434     if (size < 0) {
1435         PyErr_SetString(PyExc_SystemError,
1436                         "Negative size passed to PyUnicode_New");
1437         return NULL;
1438     }
1439     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1440         return PyErr_NoMemory();
1441 
1442     /* Duplicated allocation code from _PyObject_New() instead of a call to
1443      * PyObject_New() so we are able to allocate space for the object and
1444      * it's data buffer.
1445      */
1446     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1447     if (obj == NULL)
1448         return PyErr_NoMemory();
1449     obj = PyObject_INIT(obj, &PyUnicode_Type);
1450     if (obj == NULL)
1451         return NULL;
1452 
1453     unicode = (PyCompactUnicodeObject *)obj;
1454     if (is_ascii)
1455         data = ((PyASCIIObject*)obj) + 1;
1456     else
1457         data = unicode + 1;
1458     _PyUnicode_LENGTH(unicode) = size;
1459     _PyUnicode_HASH(unicode) = -1;
1460     _PyUnicode_STATE(unicode).interned = 0;
1461     _PyUnicode_STATE(unicode).kind = kind;
1462     _PyUnicode_STATE(unicode).compact = 1;
1463     _PyUnicode_STATE(unicode).ready = 1;
1464     _PyUnicode_STATE(unicode).ascii = is_ascii;
1465     if (is_ascii) {
1466         ((char*)data)[size] = 0;
1467         _PyUnicode_WSTR(unicode) = NULL;
1468     }
1469     else if (kind == PyUnicode_1BYTE_KIND) {
1470         ((char*)data)[size] = 0;
1471         _PyUnicode_WSTR(unicode) = NULL;
1472         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1473         unicode->utf8 = NULL;
1474         unicode->utf8_length = 0;
1475     }
1476     else {
1477         unicode->utf8 = NULL;
1478         unicode->utf8_length = 0;
1479         if (kind == PyUnicode_2BYTE_KIND)
1480             ((Py_UCS2*)data)[size] = 0;
1481         else /* kind == PyUnicode_4BYTE_KIND */
1482             ((Py_UCS4*)data)[size] = 0;
1483         if (is_sharing) {
1484             _PyUnicode_WSTR_LENGTH(unicode) = size;
1485             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486         }
1487         else {
1488             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489             _PyUnicode_WSTR(unicode) = NULL;
1490         }
1491     }
1492 #ifdef Py_DEBUG
1493     unicode_fill_invalid((PyObject*)unicode, 0);
1494 #endif
1495     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1496     return obj;
1497 }
1498 
1499 #if SIZEOF_WCHAR_T == 2
1500 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1501    will decode surrogate pairs, the other conversions are implemented as macros
1502    for efficiency.
1503 
1504    This function assumes that unicode can hold one more code point than wstr
1505    characters for a terminating null character. */
1506 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1507 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1508                               PyObject *unicode)
1509 {
1510     const wchar_t *iter;
1511     Py_UCS4 *ucs4_out;
1512 
1513     assert(unicode != NULL);
1514     assert(_PyUnicode_CHECK(unicode));
1515     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1516     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1517 
1518     for (iter = begin; iter < end; ) {
1519         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1520                            _PyUnicode_GET_LENGTH(unicode)));
1521         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1522             && (iter+1) < end
1523             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1524         {
1525             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1526             iter += 2;
1527         }
1528         else {
1529             *ucs4_out++ = *iter;
1530             iter++;
1531         }
1532     }
1533     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1534                         _PyUnicode_GET_LENGTH(unicode)));
1535 
1536 }
1537 #endif
1538 
1539 static int
unicode_check_modifiable(PyObject * unicode)1540 unicode_check_modifiable(PyObject *unicode)
1541 {
1542     if (!unicode_modifiable(unicode)) {
1543         PyErr_SetString(PyExc_SystemError,
1544                         "Cannot modify a string currently used");
1545         return -1;
1546     }
1547     return 0;
1548 }
1549 
1550 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1551 _copy_characters(PyObject *to, Py_ssize_t to_start,
1552                  PyObject *from, Py_ssize_t from_start,
1553                  Py_ssize_t how_many, int check_maxchar)
1554 {
1555     unsigned int from_kind, to_kind;
1556     const void *from_data;
1557     void *to_data;
1558 
1559     assert(0 <= how_many);
1560     assert(0 <= from_start);
1561     assert(0 <= to_start);
1562     assert(PyUnicode_Check(from));
1563     assert(PyUnicode_IS_READY(from));
1564     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1565 
1566     assert(PyUnicode_Check(to));
1567     assert(PyUnicode_IS_READY(to));
1568     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569 
1570     if (how_many == 0)
1571         return 0;
1572 
1573     from_kind = PyUnicode_KIND(from);
1574     from_data = PyUnicode_DATA(from);
1575     to_kind = PyUnicode_KIND(to);
1576     to_data = PyUnicode_DATA(to);
1577 
1578 #ifdef Py_DEBUG
1579     if (!check_maxchar
1580         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581     {
1582         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1583         Py_UCS4 ch;
1584         Py_ssize_t i;
1585         for (i=0; i < how_many; i++) {
1586             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587             assert(ch <= to_maxchar);
1588         }
1589     }
1590 #endif
1591 
1592     if (from_kind == to_kind) {
1593         if (check_maxchar
1594             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595         {
1596             /* Writing Latin-1 characters into an ASCII string requires to
1597                check that all written characters are pure ASCII */
1598             Py_UCS4 max_char;
1599             max_char = ucs1lib_find_max_char(from_data,
1600                                              (const Py_UCS1*)from_data + how_many);
1601             if (max_char >= 128)
1602                 return -1;
1603         }
1604         memcpy((char*)to_data + to_kind * to_start,
1605                   (const char*)from_data + from_kind * from_start,
1606                   to_kind * how_many);
1607     }
1608     else if (from_kind == PyUnicode_1BYTE_KIND
1609              && to_kind == PyUnicode_2BYTE_KIND)
1610     {
1611         _PyUnicode_CONVERT_BYTES(
1612             Py_UCS1, Py_UCS2,
1613             PyUnicode_1BYTE_DATA(from) + from_start,
1614             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615             PyUnicode_2BYTE_DATA(to) + to_start
1616             );
1617     }
1618     else if (from_kind == PyUnicode_1BYTE_KIND
1619              && to_kind == PyUnicode_4BYTE_KIND)
1620     {
1621         _PyUnicode_CONVERT_BYTES(
1622             Py_UCS1, Py_UCS4,
1623             PyUnicode_1BYTE_DATA(from) + from_start,
1624             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625             PyUnicode_4BYTE_DATA(to) + to_start
1626             );
1627     }
1628     else if (from_kind == PyUnicode_2BYTE_KIND
1629              && to_kind == PyUnicode_4BYTE_KIND)
1630     {
1631         _PyUnicode_CONVERT_BYTES(
1632             Py_UCS2, Py_UCS4,
1633             PyUnicode_2BYTE_DATA(from) + from_start,
1634             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635             PyUnicode_4BYTE_DATA(to) + to_start
1636             );
1637     }
1638     else {
1639         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640 
1641         if (!check_maxchar) {
1642             if (from_kind == PyUnicode_2BYTE_KIND
1643                 && to_kind == PyUnicode_1BYTE_KIND)
1644             {
1645                 _PyUnicode_CONVERT_BYTES(
1646                     Py_UCS2, Py_UCS1,
1647                     PyUnicode_2BYTE_DATA(from) + from_start,
1648                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649                     PyUnicode_1BYTE_DATA(to) + to_start
1650                     );
1651             }
1652             else if (from_kind == PyUnicode_4BYTE_KIND
1653                      && to_kind == PyUnicode_1BYTE_KIND)
1654             {
1655                 _PyUnicode_CONVERT_BYTES(
1656                     Py_UCS4, Py_UCS1,
1657                     PyUnicode_4BYTE_DATA(from) + from_start,
1658                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659                     PyUnicode_1BYTE_DATA(to) + to_start
1660                     );
1661             }
1662             else if (from_kind == PyUnicode_4BYTE_KIND
1663                      && to_kind == PyUnicode_2BYTE_KIND)
1664             {
1665                 _PyUnicode_CONVERT_BYTES(
1666                     Py_UCS4, Py_UCS2,
1667                     PyUnicode_4BYTE_DATA(from) + from_start,
1668                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669                     PyUnicode_2BYTE_DATA(to) + to_start
1670                     );
1671             }
1672             else {
1673                 Py_UNREACHABLE();
1674             }
1675         }
1676         else {
1677             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1678             Py_UCS4 ch;
1679             Py_ssize_t i;
1680 
1681             for (i=0; i < how_many; i++) {
1682                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1683                 if (ch > to_maxchar)
1684                     return -1;
1685                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686             }
1687         }
1688     }
1689     return 0;
1690 }
1691 
1692 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1693 _PyUnicode_FastCopyCharacters(
1694     PyObject *to, Py_ssize_t to_start,
1695     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1696 {
1697     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698 }
1699 
1700 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1701 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702                          PyObject *from, Py_ssize_t from_start,
1703                          Py_ssize_t how_many)
1704 {
1705     int err;
1706 
1707     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708         PyErr_BadInternalCall();
1709         return -1;
1710     }
1711 
1712     if (PyUnicode_READY(from) == -1)
1713         return -1;
1714     if (PyUnicode_READY(to) == -1)
1715         return -1;
1716 
1717     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1718         PyErr_SetString(PyExc_IndexError, "string index out of range");
1719         return -1;
1720     }
1721     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1722         PyErr_SetString(PyExc_IndexError, "string index out of range");
1723         return -1;
1724     }
1725     if (how_many < 0) {
1726         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727         return -1;
1728     }
1729     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1730     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731         PyErr_Format(PyExc_SystemError,
1732                      "Cannot write %zi characters at %zi "
1733                      "in a string of %zi characters",
1734                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1735         return -1;
1736     }
1737 
1738     if (how_many == 0)
1739         return 0;
1740 
1741     if (unicode_check_modifiable(to))
1742         return -1;
1743 
1744     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745     if (err) {
1746         PyErr_Format(PyExc_SystemError,
1747                      "Cannot copy %s characters "
1748                      "into a string of %s characters",
1749                      unicode_kind_name(from),
1750                      unicode_kind_name(to));
1751         return -1;
1752     }
1753     return how_many;
1754 }
1755 
1756 /* Find the maximum code point and count the number of surrogate pairs so a
1757    correct string length can be computed before converting a string to UCS4.
1758    This function counts single surrogates as a character and not as a pair.
1759 
1760    Return 0 on success, or -1 on error. */
1761 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1762 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1764 {
1765     const wchar_t *iter;
1766     Py_UCS4 ch;
1767 
1768     assert(num_surrogates != NULL && maxchar != NULL);
1769     *num_surrogates = 0;
1770     *maxchar = 0;
1771 
1772     for (iter = begin; iter < end; ) {
1773 #if SIZEOF_WCHAR_T == 2
1774         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775             && (iter+1) < end
1776             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777         {
1778             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779             ++(*num_surrogates);
1780             iter += 2;
1781         }
1782         else
1783 #endif
1784         {
1785             ch = *iter;
1786             iter++;
1787         }
1788         if (ch > *maxchar) {
1789             *maxchar = ch;
1790             if (*maxchar > MAX_UNICODE) {
1791                 PyErr_Format(PyExc_ValueError,
1792                              "character U+%x is not in range [U+0000; U+10ffff]",
1793                              ch);
1794                 return -1;
1795             }
1796         }
1797     }
1798     return 0;
1799 }
1800 
1801 int
_PyUnicode_Ready(PyObject * unicode)1802 _PyUnicode_Ready(PyObject *unicode)
1803 {
1804     wchar_t *end;
1805     Py_UCS4 maxchar = 0;
1806     Py_ssize_t num_surrogates;
1807 #if SIZEOF_WCHAR_T == 2
1808     Py_ssize_t length_wo_surrogates;
1809 #endif
1810 
1811     /* _PyUnicode_Ready() is only intended for old-style API usage where
1812        strings were created using _PyObject_New() and where no canonical
1813        representation (the str field) has been set yet aka strings
1814        which are not yet ready. */
1815     assert(_PyUnicode_CHECK(unicode));
1816     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1817     assert(_PyUnicode_WSTR(unicode) != NULL);
1818     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1819     assert(_PyUnicode_UTF8(unicode) == NULL);
1820     /* Actually, it should neither be interned nor be anything else: */
1821     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1822 
1823     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1824     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1825                                 &maxchar, &num_surrogates) == -1)
1826         return -1;
1827 
1828     if (maxchar < 256) {
1829         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1830         if (!_PyUnicode_DATA_ANY(unicode)) {
1831             PyErr_NoMemory();
1832             return -1;
1833         }
1834         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1835                                 _PyUnicode_WSTR(unicode), end,
1836                                 PyUnicode_1BYTE_DATA(unicode));
1837         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840         if (maxchar < 128) {
1841             _PyUnicode_STATE(unicode).ascii = 1;
1842             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1843             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1844         }
1845         else {
1846             _PyUnicode_STATE(unicode).ascii = 0;
1847             _PyUnicode_UTF8(unicode) = NULL;
1848             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1849         }
1850         PyObject_FREE(_PyUnicode_WSTR(unicode));
1851         _PyUnicode_WSTR(unicode) = NULL;
1852         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853     }
1854     /* In this case we might have to convert down from 4-byte native
1855        wchar_t to 2-byte unicode. */
1856     else if (maxchar < 65536) {
1857         assert(num_surrogates == 0 &&
1858                "FindMaxCharAndNumSurrogatePairs() messed up");
1859 
1860 #if SIZEOF_WCHAR_T == 2
1861         /* We can share representations and are done. */
1862         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1863         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1866         _PyUnicode_UTF8(unicode) = NULL;
1867         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1868 #else
1869         /* sizeof(wchar_t) == 4 */
1870         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1871             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1872         if (!_PyUnicode_DATA_ANY(unicode)) {
1873             PyErr_NoMemory();
1874             return -1;
1875         }
1876         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877                                 _PyUnicode_WSTR(unicode), end,
1878                                 PyUnicode_2BYTE_DATA(unicode));
1879         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882         _PyUnicode_UTF8(unicode) = NULL;
1883         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1884         PyObject_FREE(_PyUnicode_WSTR(unicode));
1885         _PyUnicode_WSTR(unicode) = NULL;
1886         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887 #endif
1888     }
1889     /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1890     else {
1891 #if SIZEOF_WCHAR_T == 2
1892         /* in case the native representation is 2-bytes, we need to allocate a
1893            new normalized 4-byte version. */
1894         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1895         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896             PyErr_NoMemory();
1897             return -1;
1898         }
1899         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1900         if (!_PyUnicode_DATA_ANY(unicode)) {
1901             PyErr_NoMemory();
1902             return -1;
1903         }
1904         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1906         _PyUnicode_UTF8(unicode) = NULL;
1907         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1908         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909         _PyUnicode_STATE(unicode).ready = 1;
1910         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1911         PyObject_FREE(_PyUnicode_WSTR(unicode));
1912         _PyUnicode_WSTR(unicode) = NULL;
1913         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914 #else
1915         assert(num_surrogates == 0);
1916 
1917         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1918         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1919         _PyUnicode_UTF8(unicode) = NULL;
1920         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1921         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922 #endif
1923         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924     }
1925     _PyUnicode_STATE(unicode).ready = 1;
1926     assert(_PyUnicode_CheckConsistency(unicode, 1));
1927     return 0;
1928 }
1929 
1930 static void
unicode_dealloc(PyObject * unicode)1931 unicode_dealloc(PyObject *unicode)
1932 {
1933     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1934     case SSTATE_NOT_INTERNED:
1935         break;
1936 
1937     case SSTATE_INTERNED_MORTAL:
1938         /* revive dead object temporarily for DelItem */
1939         Py_SET_REFCNT(unicode, 3);
1940 #ifdef INTERNED_STRINGS
1941         if (PyDict_DelItem(interned, unicode) != 0) {
1942             _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1943                                       NULL);
1944         }
1945 #endif
1946         break;
1947 
1948     case SSTATE_INTERNED_IMMORTAL:
1949         _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1950         break;
1951 
1952     default:
1953         Py_UNREACHABLE();
1954     }
1955 
1956     if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1957         PyObject_DEL(_PyUnicode_WSTR(unicode));
1958     }
1959     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1960         PyObject_DEL(_PyUnicode_UTF8(unicode));
1961     }
1962     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1963         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1964     }
1965 
1966     Py_TYPE(unicode)->tp_free(unicode);
1967 }
1968 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   (when LATIN1_SINGLETONS is defined) the cached one-character Latin-1
   string for its character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == unicode_empty) {
        return 1;
    }
#ifdef LATIN1_SINGLETONS
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    if (header->length == 1 && header->state.kind != PyUnicode_WCHAR_KIND) {
        const Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
        if (first < 256 && unicode_latin1[first] == unicode) {
            return 1;
        }
    }
#endif
    return 0;
}
#endif
1988 
1989 static int
unicode_modifiable(PyObject * unicode)1990 unicode_modifiable(PyObject *unicode)
1991 {
1992     assert(_PyUnicode_CHECK(unicode));
1993     if (Py_REFCNT(unicode) != 1)
1994         return 0;
1995     if (_PyUnicode_HASH(unicode) != -1)
1996         return 0;
1997     if (PyUnicode_CHECK_INTERNED(unicode))
1998         return 0;
1999     if (!PyUnicode_CheckExact(unicode))
2000         return 0;
2001 #ifdef Py_DEBUG
2002     /* singleton refcount is greater than 1 */
2003     assert(!unicode_is_singleton(unicode));
2004 #endif
2005     return 1;
2006 }
2007 
2008 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)2009 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2010 {
2011     PyObject *unicode;
2012     Py_ssize_t old_length;
2013 
2014     assert(p_unicode != NULL);
2015     unicode = *p_unicode;
2016 
2017     assert(unicode != NULL);
2018     assert(PyUnicode_Check(unicode));
2019     assert(0 <= length);
2020 
2021     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2022         old_length = PyUnicode_WSTR_LENGTH(unicode);
2023     else
2024         old_length = PyUnicode_GET_LENGTH(unicode);
2025     if (old_length == length)
2026         return 0;
2027 
2028     if (length == 0) {
2029         _Py_INCREF_UNICODE_EMPTY();
2030         if (!unicode_empty)
2031             return -1;
2032         Py_SETREF(*p_unicode, unicode_empty);
2033         return 0;
2034     }
2035 
2036     if (!unicode_modifiable(unicode)) {
2037         PyObject *copy = resize_copy(unicode, length);
2038         if (copy == NULL)
2039             return -1;
2040         Py_SETREF(*p_unicode, copy);
2041         return 0;
2042     }
2043 
2044     if (PyUnicode_IS_COMPACT(unicode)) {
2045         PyObject *new_unicode = resize_compact(unicode, length);
2046         if (new_unicode == NULL)
2047             return -1;
2048         *p_unicode = new_unicode;
2049         return 0;
2050     }
2051     return resize_inplace(unicode, length);
2052 }
2053 
2054 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2055 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2056 {
2057     PyObject *unicode;
2058     if (p_unicode == NULL) {
2059         PyErr_BadInternalCall();
2060         return -1;
2061     }
2062     unicode = *p_unicode;
2063     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2064     {
2065         PyErr_BadInternalCall();
2066         return -1;
2067     }
2068     return unicode_resize(p_unicode, length);
2069 }
2070 
2071 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2072 
2073    WARNING: The function doesn't copy the terminating null character and
2074    doesn't check the maximum character (may write a latin1 character in an
2075    ASCII string). */
2076 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2077 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2078                    const char *str, Py_ssize_t len)
2079 {
2080     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2081     const void *data = PyUnicode_DATA(unicode);
2082     const char *end = str + len;
2083 
2084     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2085     switch (kind) {
2086     case PyUnicode_1BYTE_KIND: {
2087 #ifdef Py_DEBUG
2088         if (PyUnicode_IS_ASCII(unicode)) {
2089             Py_UCS4 maxchar = ucs1lib_find_max_char(
2090                 (const Py_UCS1*)str,
2091                 (const Py_UCS1*)str + len);
2092             assert(maxchar < 128);
2093         }
2094 #endif
2095         memcpy((char *) data + index, str, len);
2096         break;
2097     }
2098     case PyUnicode_2BYTE_KIND: {
2099         Py_UCS2 *start = (Py_UCS2 *)data + index;
2100         Py_UCS2 *ucs2 = start;
2101 
2102         for (; str < end; ++ucs2, ++str)
2103             *ucs2 = (Py_UCS2)*str;
2104 
2105         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2106         break;
2107     }
2108     case PyUnicode_4BYTE_KIND: {
2109         Py_UCS4 *start = (Py_UCS4 *)data + index;
2110         Py_UCS4 *ucs4 = start;
2111 
2112         for (; str < end; ++ucs4, ++str)
2113             *ucs4 = (Py_UCS4)*str;
2114 
2115         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2116         break;
2117     }
2118     default:
2119         Py_UNREACHABLE();
2120     }
2121 }
2122 
2123 static PyObject*
get_latin1_char(unsigned char ch)2124 get_latin1_char(unsigned char ch)
2125 {
2126     PyObject *unicode;
2127 
2128 #ifdef LATIN1_SINGLETONS
2129     unicode = unicode_latin1[ch];
2130     if (unicode) {
2131         Py_INCREF(unicode);
2132         return unicode;
2133     }
2134 #endif
2135 
2136     unicode = PyUnicode_New(1, ch);
2137     if (!unicode) {
2138         return NULL;
2139     }
2140 
2141     PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2142     assert(_PyUnicode_CheckConsistency(unicode, 1));
2143 
2144 #ifdef LATIN1_SINGLETONS
2145     Py_INCREF(unicode);
2146     unicode_latin1[ch] = unicode;
2147 #endif
2148     return unicode;
2149 }
2150 
2151 static PyObject*
unicode_char(Py_UCS4 ch)2152 unicode_char(Py_UCS4 ch)
2153 {
2154     PyObject *unicode;
2155 
2156     assert(ch <= MAX_UNICODE);
2157 
2158     if (ch < 256)
2159         return get_latin1_char(ch);
2160 
2161     unicode = PyUnicode_New(1, ch);
2162     if (unicode == NULL)
2163         return NULL;
2164 
2165     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2166     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2167         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2168     } else {
2169         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2170         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2171     }
2172     assert(_PyUnicode_CheckConsistency(unicode, 1));
2173     return unicode;
2174 }
2175 
2176 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2177 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2178 {
2179     if (u == NULL)
2180         return (PyObject*)_PyUnicode_New(size);
2181 
2182     if (size < 0) {
2183         PyErr_BadInternalCall();
2184         return NULL;
2185     }
2186 
2187     return PyUnicode_FromWideChar(u, size);
2188 }
2189 
2190 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2191 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2192 {
2193     PyObject *unicode;
2194     Py_UCS4 maxchar = 0;
2195     Py_ssize_t num_surrogates;
2196 
2197     if (u == NULL && size != 0) {
2198         PyErr_BadInternalCall();
2199         return NULL;
2200     }
2201 
2202     if (size == -1) {
2203         size = wcslen(u);
2204     }
2205 
2206     /* If the Unicode data is known at construction time, we can apply
2207        some optimizations which share commonly used objects. */
2208 
2209     /* Optimization for empty strings */
2210     if (size == 0)
2211         _Py_RETURN_UNICODE_EMPTY();
2212 
2213     /* Single character Unicode objects in the Latin-1 range are
2214        shared when using this constructor */
2215     if (size == 1 && (Py_UCS4)*u < 256)
2216         return get_latin1_char((unsigned char)*u);
2217 
2218     /* If not empty and not single character, copy the Unicode data
2219        into the new object */
2220     if (find_maxchar_surrogates(u, u + size,
2221                                 &maxchar, &num_surrogates) == -1)
2222         return NULL;
2223 
2224     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2225     if (!unicode)
2226         return NULL;
2227 
2228     switch (PyUnicode_KIND(unicode)) {
2229     case PyUnicode_1BYTE_KIND:
2230         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2231                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2232         break;
2233     case PyUnicode_2BYTE_KIND:
2234 #if Py_UNICODE_SIZE == 2
2235         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2236 #else
2237         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2238                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2239 #endif
2240         break;
2241     case PyUnicode_4BYTE_KIND:
2242 #if SIZEOF_WCHAR_T == 2
2243         /* This is the only case which has to process surrogates, thus
2244            a simple copy loop is not enough and we need a function. */
2245         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2246 #else
2247         assert(num_surrogates == 0);
2248         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2249 #endif
2250         break;
2251     default:
2252         Py_UNREACHABLE();
2253     }
2254 
2255     return unicode_result(unicode);
2256 }
2257 
2258 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2259 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2260 {
2261     if (size < 0) {
2262         PyErr_SetString(PyExc_SystemError,
2263                         "Negative size passed to PyUnicode_FromStringAndSize");
2264         return NULL;
2265     }
2266     if (u != NULL)
2267         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2268     else
2269         return (PyObject *)_PyUnicode_New(size);
2270 }
2271 
2272 PyObject *
PyUnicode_FromString(const char * u)2273 PyUnicode_FromString(const char *u)
2274 {
2275     size_t size = strlen(u);
2276     if (size > PY_SSIZE_T_MAX) {
2277         PyErr_SetString(PyExc_OverflowError, "input too long");
2278         return NULL;
2279     }
2280     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2281 }
2282 
2283 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2284 _PyUnicode_FromId(_Py_Identifier *id)
2285 {
2286     if (!id->object) {
2287         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2288                                                   strlen(id->string),
2289                                                   NULL, NULL);
2290         if (!id->object)
2291             return NULL;
2292         PyUnicode_InternInPlace(&id->object);
2293         assert(!id->next);
2294         id->next = static_strings;
2295         static_strings = id;
2296     }
2297     return id->object;
2298 }
2299 
2300 static void
unicode_clear_static_strings(void)2301 unicode_clear_static_strings(void)
2302 {
2303     _Py_Identifier *tmp, *s = static_strings;
2304     while (s) {
2305         Py_CLEAR(s->object);
2306         tmp = s->next;
2307         s->next = NULL;
2308         s = tmp;
2309     }
2310     static_strings = NULL;
2311 }
2312 
2313 /* Internal function, doesn't check maximum character */
2314 
2315 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2316 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2317 {
2318     const unsigned char *s = (const unsigned char *)buffer;
2319     PyObject *unicode;
2320     if (size == 1) {
2321 #ifdef Py_DEBUG
2322         assert((unsigned char)s[0] < 128);
2323 #endif
2324         return get_latin1_char(s[0]);
2325     }
2326     unicode = PyUnicode_New(size, 127);
2327     if (!unicode)
2328         return NULL;
2329     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2330     assert(_PyUnicode_CheckConsistency(unicode, 1));
2331     return unicode;
2332 }
2333 
2334 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2335 kind_maxchar_limit(unsigned int kind)
2336 {
2337     switch (kind) {
2338     case PyUnicode_1BYTE_KIND:
2339         return 0x80;
2340     case PyUnicode_2BYTE_KIND:
2341         return 0x100;
2342     case PyUnicode_4BYTE_KIND:
2343         return 0x10000;
2344     default:
2345         Py_UNREACHABLE();
2346     }
2347 }
2348 
2349 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2350 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2351 {
2352     PyObject *res;
2353     unsigned char max_char;
2354 
2355     if (size == 0)
2356         _Py_RETURN_UNICODE_EMPTY();
2357     assert(size > 0);
2358     if (size == 1)
2359         return get_latin1_char(u[0]);
2360 
2361     max_char = ucs1lib_find_max_char(u, u + size);
2362     res = PyUnicode_New(size, max_char);
2363     if (!res)
2364         return NULL;
2365     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2366     assert(_PyUnicode_CheckConsistency(res, 1));
2367     return res;
2368 }
2369 
2370 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2371 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2372 {
2373     PyObject *res;
2374     Py_UCS2 max_char;
2375 
2376     if (size == 0)
2377         _Py_RETURN_UNICODE_EMPTY();
2378     assert(size > 0);
2379     if (size == 1)
2380         return unicode_char(u[0]);
2381 
2382     max_char = ucs2lib_find_max_char(u, u + size);
2383     res = PyUnicode_New(size, max_char);
2384     if (!res)
2385         return NULL;
2386     if (max_char >= 256)
2387         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2388     else {
2389         _PyUnicode_CONVERT_BYTES(
2390             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2391     }
2392     assert(_PyUnicode_CheckConsistency(res, 1));
2393     return res;
2394 }
2395 
2396 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2397 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2398 {
2399     PyObject *res;
2400     Py_UCS4 max_char;
2401 
2402     if (size == 0)
2403         _Py_RETURN_UNICODE_EMPTY();
2404     assert(size > 0);
2405     if (size == 1)
2406         return unicode_char(u[0]);
2407 
2408     max_char = ucs4lib_find_max_char(u, u + size);
2409     res = PyUnicode_New(size, max_char);
2410     if (!res)
2411         return NULL;
2412     if (max_char < 256)
2413         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2414                                  PyUnicode_1BYTE_DATA(res));
2415     else if (max_char < 0x10000)
2416         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2417                                  PyUnicode_2BYTE_DATA(res));
2418     else
2419         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2420     assert(_PyUnicode_CheckConsistency(res, 1));
2421     return res;
2422 }
2423 
2424 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2425 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2426 {
2427     if (size < 0) {
2428         PyErr_SetString(PyExc_ValueError, "size must be positive");
2429         return NULL;
2430     }
2431     switch (kind) {
2432     case PyUnicode_1BYTE_KIND:
2433         return _PyUnicode_FromUCS1(buffer, size);
2434     case PyUnicode_2BYTE_KIND:
2435         return _PyUnicode_FromUCS2(buffer, size);
2436     case PyUnicode_4BYTE_KIND:
2437         return _PyUnicode_FromUCS4(buffer, size);
2438     default:
2439         PyErr_SetString(PyExc_SystemError, "invalid kind");
2440         return NULL;
2441     }
2442 }
2443 
2444 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2445 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2446 {
2447     enum PyUnicode_Kind kind;
2448     const void *startptr, *endptr;
2449 
2450     assert(PyUnicode_IS_READY(unicode));
2451     assert(0 <= start);
2452     assert(end <= PyUnicode_GET_LENGTH(unicode));
2453     assert(start <= end);
2454 
2455     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2456         return PyUnicode_MAX_CHAR_VALUE(unicode);
2457 
2458     if (start == end)
2459         return 127;
2460 
2461     if (PyUnicode_IS_ASCII(unicode))
2462         return 127;
2463 
2464     kind = PyUnicode_KIND(unicode);
2465     startptr = PyUnicode_DATA(unicode);
2466     endptr = (char *)startptr + end * kind;
2467     startptr = (char *)startptr + start * kind;
2468     switch(kind) {
2469     case PyUnicode_1BYTE_KIND:
2470         return ucs1lib_find_max_char(startptr, endptr);
2471     case PyUnicode_2BYTE_KIND:
2472         return ucs2lib_find_max_char(startptr, endptr);
2473     case PyUnicode_4BYTE_KIND:
2474         return ucs4lib_find_max_char(startptr, endptr);
2475     default:
2476         Py_UNREACHABLE();
2477     }
2478 }
2479 
2480 /* Ensure that a string uses the most efficient storage, if it is not the
2481    case: create a new string with of the right kind. Write NULL into *p_unicode
2482    on error. */
2483 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2484 unicode_adjust_maxchar(PyObject **p_unicode)
2485 {
2486     PyObject *unicode, *copy;
2487     Py_UCS4 max_char;
2488     Py_ssize_t len;
2489     unsigned int kind;
2490 
2491     assert(p_unicode != NULL);
2492     unicode = *p_unicode;
2493     assert(PyUnicode_IS_READY(unicode));
2494     if (PyUnicode_IS_ASCII(unicode))
2495         return;
2496 
2497     len = PyUnicode_GET_LENGTH(unicode);
2498     kind = PyUnicode_KIND(unicode);
2499     if (kind == PyUnicode_1BYTE_KIND) {
2500         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2501         max_char = ucs1lib_find_max_char(u, u + len);
2502         if (max_char >= 128)
2503             return;
2504     }
2505     else if (kind == PyUnicode_2BYTE_KIND) {
2506         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2507         max_char = ucs2lib_find_max_char(u, u + len);
2508         if (max_char >= 256)
2509             return;
2510     }
2511     else if (kind == PyUnicode_4BYTE_KIND) {
2512         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2513         max_char = ucs4lib_find_max_char(u, u + len);
2514         if (max_char >= 0x10000)
2515             return;
2516     }
2517     else
2518         Py_UNREACHABLE();
2519 
2520     copy = PyUnicode_New(len, max_char);
2521     if (copy != NULL)
2522         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2523     Py_DECREF(unicode);
2524     *p_unicode = copy;
2525 }
2526 
2527 PyObject*
_PyUnicode_Copy(PyObject * unicode)2528 _PyUnicode_Copy(PyObject *unicode)
2529 {
2530     Py_ssize_t length;
2531     PyObject *copy;
2532 
2533     if (!PyUnicode_Check(unicode)) {
2534         PyErr_BadInternalCall();
2535         return NULL;
2536     }
2537     if (PyUnicode_READY(unicode) == -1)
2538         return NULL;
2539 
2540     length = PyUnicode_GET_LENGTH(unicode);
2541     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2542     if (!copy)
2543         return NULL;
2544     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2545 
2546     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2547               length * PyUnicode_KIND(unicode));
2548     assert(_PyUnicode_CheckConsistency(copy, 1));
2549     return copy;
2550 }
2551 
2552 
2553 /* Widen Unicode objects to larger buffers. Don't write terminating null
2554    character. Return NULL on error. */
2555 
2556 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2557 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2558 {
2559     void *result;
2560 
2561     assert(skind < kind);
2562     switch (kind) {
2563     case PyUnicode_2BYTE_KIND:
2564         result = PyMem_New(Py_UCS2, len);
2565         if (!result)
2566             return PyErr_NoMemory();
2567         assert(skind == PyUnicode_1BYTE_KIND);
2568         _PyUnicode_CONVERT_BYTES(
2569             Py_UCS1, Py_UCS2,
2570             (const Py_UCS1 *)data,
2571             ((const Py_UCS1 *)data) + len,
2572             result);
2573         return result;
2574     case PyUnicode_4BYTE_KIND:
2575         result = PyMem_New(Py_UCS4, len);
2576         if (!result)
2577             return PyErr_NoMemory();
2578         if (skind == PyUnicode_2BYTE_KIND) {
2579             _PyUnicode_CONVERT_BYTES(
2580                 Py_UCS2, Py_UCS4,
2581                 (const Py_UCS2 *)data,
2582                 ((const Py_UCS2 *)data) + len,
2583                 result);
2584         }
2585         else {
2586             assert(skind == PyUnicode_1BYTE_KIND);
2587             _PyUnicode_CONVERT_BYTES(
2588                 Py_UCS1, Py_UCS4,
2589                 (const Py_UCS1 *)data,
2590                 ((const Py_UCS1 *)data) + len,
2591                 result);
2592         }
2593         return result;
2594     default:
2595         Py_UNREACHABLE();
2596         return NULL;
2597     }
2598 }
2599 
2600 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2601 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2602         int copy_null)
2603 {
2604     int kind;
2605     const void *data;
2606     Py_ssize_t len, targetlen;
2607     if (PyUnicode_READY(string) == -1)
2608         return NULL;
2609     kind = PyUnicode_KIND(string);
2610     data = PyUnicode_DATA(string);
2611     len = PyUnicode_GET_LENGTH(string);
2612     targetlen = len;
2613     if (copy_null)
2614         targetlen++;
2615     if (!target) {
2616         target = PyMem_New(Py_UCS4, targetlen);
2617         if (!target) {
2618             PyErr_NoMemory();
2619             return NULL;
2620         }
2621     }
2622     else {
2623         if (targetsize < targetlen) {
2624             PyErr_Format(PyExc_SystemError,
2625                          "string is longer than the buffer");
2626             if (copy_null && 0 < targetsize)
2627                 target[0] = 0;
2628             return NULL;
2629         }
2630     }
2631     if (kind == PyUnicode_1BYTE_KIND) {
2632         const Py_UCS1 *start = (const Py_UCS1 *) data;
2633         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2634     }
2635     else if (kind == PyUnicode_2BYTE_KIND) {
2636         const Py_UCS2 *start = (const Py_UCS2 *) data;
2637         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2638     }
2639     else if (kind == PyUnicode_4BYTE_KIND) {
2640         memcpy(target, data, len * sizeof(Py_UCS4));
2641     }
2642     else {
2643         Py_UNREACHABLE();
2644     }
2645     if (copy_null)
2646         target[len] = 0;
2647     return target;
2648 }
2649 
2650 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2651 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2652                  int copy_null)
2653 {
2654     if (target == NULL || targetsize < 0) {
2655         PyErr_BadInternalCall();
2656         return NULL;
2657     }
2658     return as_ucs4(string, target, targetsize, copy_null);
2659 }
2660 
2661 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2662 PyUnicode_AsUCS4Copy(PyObject *string)
2663 {
2664     return as_ucs4(string, NULL, 0, 1);
2665 }
2666 
/* Size of the scratch buffers used to format %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   (x*53 - 1)/22 + 1 is ceil(x*53/22) in integer arithmetic. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
2671 
2672 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2673 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2674                              Py_ssize_t width, Py_ssize_t precision)
2675 {
2676     Py_ssize_t length, fill, arglen;
2677     Py_UCS4 maxchar;
2678 
2679     if (PyUnicode_READY(str) == -1)
2680         return -1;
2681 
2682     length = PyUnicode_GET_LENGTH(str);
2683     if ((precision == -1 || precision >= length)
2684         && width <= length)
2685         return _PyUnicodeWriter_WriteStr(writer, str);
2686 
2687     if (precision != -1)
2688         length = Py_MIN(precision, length);
2689 
2690     arglen = Py_MAX(length, width);
2691     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2692         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2693     else
2694         maxchar = writer->maxchar;
2695 
2696     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2697         return -1;
2698 
2699     if (width > length) {
2700         fill = width - length;
2701         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2702             return -1;
2703         writer->pos += fill;
2704     }
2705 
2706     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2707                                   str, 0, length);
2708     writer->pos += length;
2709     return 0;
2710 }
2711 
2712 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2713 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2714                               Py_ssize_t width, Py_ssize_t precision)
2715 {
2716     /* UTF-8 */
2717     Py_ssize_t length;
2718     PyObject *unicode;
2719     int res;
2720 
2721     if (precision == -1) {
2722         length = strlen(str);
2723     }
2724     else {
2725         length = 0;
2726         while (length < precision && str[length]) {
2727             length++;
2728         }
2729     }
2730     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2731     if (unicode == NULL)
2732         return -1;
2733 
2734     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2735     Py_DECREF(unicode);
2736     return res;
2737 }
2738 
2739 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2740 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2741                        const char *f, va_list *vargs)
2742 {
2743     const char *p;
2744     Py_ssize_t len;
2745     int zeropad;
2746     Py_ssize_t width;
2747     Py_ssize_t precision;
2748     int longflag;
2749     int longlongflag;
2750     int size_tflag;
2751     Py_ssize_t fill;
2752 
2753     p = f;
2754     f++;
2755     zeropad = 0;
2756     if (*f == '0') {
2757         zeropad = 1;
2758         f++;
2759     }
2760 
2761     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2762     width = -1;
2763     if (Py_ISDIGIT((unsigned)*f)) {
2764         width = *f - '0';
2765         f++;
2766         while (Py_ISDIGIT((unsigned)*f)) {
2767             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2768                 PyErr_SetString(PyExc_ValueError,
2769                                 "width too big");
2770                 return NULL;
2771             }
2772             width = (width * 10) + (*f - '0');
2773             f++;
2774         }
2775     }
2776     precision = -1;
2777     if (*f == '.') {
2778         f++;
2779         if (Py_ISDIGIT((unsigned)*f)) {
2780             precision = (*f - '0');
2781             f++;
2782             while (Py_ISDIGIT((unsigned)*f)) {
2783                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2784                     PyErr_SetString(PyExc_ValueError,
2785                                     "precision too big");
2786                     return NULL;
2787                 }
2788                 precision = (precision * 10) + (*f - '0');
2789                 f++;
2790             }
2791         }
2792         if (*f == '%') {
2793             /* "%.3%s" => f points to "3" */
2794             f--;
2795         }
2796     }
2797     if (*f == '\0') {
2798         /* bogus format "%.123" => go backward, f points to "3" */
2799         f--;
2800     }
2801 
2802     /* Handle %ld, %lu, %lld and %llu. */
2803     longflag = 0;
2804     longlongflag = 0;
2805     size_tflag = 0;
2806     if (*f == 'l') {
2807         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2808             longflag = 1;
2809             ++f;
2810         }
2811         else if (f[1] == 'l' &&
2812                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2813             longlongflag = 1;
2814             f += 2;
2815         }
2816     }
2817     /* handle the size_t flag. */
2818     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2819         size_tflag = 1;
2820         ++f;
2821     }
2822 
2823     if (f[1] == '\0')
2824         writer->overallocate = 0;
2825 
2826     switch (*f) {
2827     case 'c':
2828     {
2829         int ordinal = va_arg(*vargs, int);
2830         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2831             PyErr_SetString(PyExc_OverflowError,
2832                             "character argument not in range(0x110000)");
2833             return NULL;
2834         }
2835         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2836             return NULL;
2837         break;
2838     }
2839 
2840     case 'i':
2841     case 'd':
2842     case 'u':
2843     case 'x':
2844     {
2845         /* used by sprintf */
2846         char buffer[MAX_LONG_LONG_CHARS];
2847         Py_ssize_t arglen;
2848 
2849         if (*f == 'u') {
2850             if (longflag)
2851                 len = sprintf(buffer, "%lu",
2852                         va_arg(*vargs, unsigned long));
2853             else if (longlongflag)
2854                 len = sprintf(buffer, "%llu",
2855                         va_arg(*vargs, unsigned long long));
2856             else if (size_tflag)
2857                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2858                         va_arg(*vargs, size_t));
2859             else
2860                 len = sprintf(buffer, "%u",
2861                         va_arg(*vargs, unsigned int));
2862         }
2863         else if (*f == 'x') {
2864             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2865         }
2866         else {
2867             if (longflag)
2868                 len = sprintf(buffer, "%li",
2869                         va_arg(*vargs, long));
2870             else if (longlongflag)
2871                 len = sprintf(buffer, "%lli",
2872                         va_arg(*vargs, long long));
2873             else if (size_tflag)
2874                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2875                         va_arg(*vargs, Py_ssize_t));
2876             else
2877                 len = sprintf(buffer, "%i",
2878                         va_arg(*vargs, int));
2879         }
2880         assert(len >= 0);
2881 
2882         if (precision < len)
2883             precision = len;
2884 
2885         arglen = Py_MAX(precision, width);
2886         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2887             return NULL;
2888 
2889         if (width > precision) {
2890             Py_UCS4 fillchar;
2891             fill = width - precision;
2892             fillchar = zeropad?'0':' ';
2893             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2894                 return NULL;
2895             writer->pos += fill;
2896         }
2897         if (precision > len) {
2898             fill = precision - len;
2899             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2900                 return NULL;
2901             writer->pos += fill;
2902         }
2903 
2904         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2905             return NULL;
2906         break;
2907     }
2908 
2909     case 'p':
2910     {
2911         char number[MAX_LONG_LONG_CHARS];
2912 
2913         len = sprintf(number, "%p", va_arg(*vargs, void*));
2914         assert(len >= 0);
2915 
2916         /* %p is ill-defined:  ensure leading 0x. */
2917         if (number[1] == 'X')
2918             number[1] = 'x';
2919         else if (number[1] != 'x') {
2920             memmove(number + 2, number,
2921                     strlen(number) + 1);
2922             number[0] = '0';
2923             number[1] = 'x';
2924             len += 2;
2925         }
2926 
2927         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2928             return NULL;
2929         break;
2930     }
2931 
2932     case 's':
2933     {
2934         /* UTF-8 */
2935         const char *s = va_arg(*vargs, const char*);
2936         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2937             return NULL;
2938         break;
2939     }
2940 
2941     case 'U':
2942     {
2943         PyObject *obj = va_arg(*vargs, PyObject *);
2944         assert(obj && _PyUnicode_CHECK(obj));
2945 
2946         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2947             return NULL;
2948         break;
2949     }
2950 
2951     case 'V':
2952     {
2953         PyObject *obj = va_arg(*vargs, PyObject *);
2954         const char *str = va_arg(*vargs, const char *);
2955         if (obj) {
2956             assert(_PyUnicode_CHECK(obj));
2957             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2958                 return NULL;
2959         }
2960         else {
2961             assert(str != NULL);
2962             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2963                 return NULL;
2964         }
2965         break;
2966     }
2967 
2968     case 'S':
2969     {
2970         PyObject *obj = va_arg(*vargs, PyObject *);
2971         PyObject *str;
2972         assert(obj);
2973         str = PyObject_Str(obj);
2974         if (!str)
2975             return NULL;
2976         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2977             Py_DECREF(str);
2978             return NULL;
2979         }
2980         Py_DECREF(str);
2981         break;
2982     }
2983 
2984     case 'R':
2985     {
2986         PyObject *obj = va_arg(*vargs, PyObject *);
2987         PyObject *repr;
2988         assert(obj);
2989         repr = PyObject_Repr(obj);
2990         if (!repr)
2991             return NULL;
2992         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2993             Py_DECREF(repr);
2994             return NULL;
2995         }
2996         Py_DECREF(repr);
2997         break;
2998     }
2999 
3000     case 'A':
3001     {
3002         PyObject *obj = va_arg(*vargs, PyObject *);
3003         PyObject *ascii;
3004         assert(obj);
3005         ascii = PyObject_ASCII(obj);
3006         if (!ascii)
3007             return NULL;
3008         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3009             Py_DECREF(ascii);
3010             return NULL;
3011         }
3012         Py_DECREF(ascii);
3013         break;
3014     }
3015 
3016     case '%':
3017         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3018             return NULL;
3019         break;
3020 
3021     default:
3022         /* if we stumble upon an unknown formatting code, copy the rest
3023            of the format string to the output string. (we cannot just
3024            skip the code, since there's no way to know what's in the
3025            argument list) */
3026         len = strlen(p);
3027         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3028             return NULL;
3029         f = p+len;
3030         return f;
3031     }
3032 
3033     f++;
3034     return f;
3035 }
3036 
3037 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3038 PyUnicode_FromFormatV(const char *format, va_list vargs)
3039 {
3040     va_list vargs2;
3041     const char *f;
3042     _PyUnicodeWriter writer;
3043 
3044     _PyUnicodeWriter_Init(&writer);
3045     writer.min_length = strlen(format) + 100;
3046     writer.overallocate = 1;
3047 
3048     // Copy varags to be able to pass a reference to a subfunction.
3049     va_copy(vargs2, vargs);
3050 
3051     for (f = format; *f; ) {
3052         if (*f == '%') {
3053             f = unicode_fromformat_arg(&writer, f, &vargs2);
3054             if (f == NULL)
3055                 goto fail;
3056         }
3057         else {
3058             const char *p;
3059             Py_ssize_t len;
3060 
3061             p = f;
3062             do
3063             {
3064                 if ((unsigned char)*p > 127) {
3065                     PyErr_Format(PyExc_ValueError,
3066                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3067                         "string, got a non-ASCII byte: 0x%02x",
3068                         (unsigned char)*p);
3069                     goto fail;
3070                 }
3071                 p++;
3072             }
3073             while (*p != '\0' && *p != '%');
3074             len = p - f;
3075 
3076             if (*p == '\0')
3077                 writer.overallocate = 0;
3078 
3079             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3080                 goto fail;
3081 
3082             f = p;
3083         }
3084     }
3085     va_end(vargs2);
3086     return _PyUnicodeWriter_Finish(&writer);
3087 
3088   fail:
3089     va_end(vargs2);
3090     _PyUnicodeWriter_Dealloc(&writer);
3091     return NULL;
3092 }
3093 
3094 PyObject *
PyUnicode_FromFormat(const char * format,...)3095 PyUnicode_FromFormat(const char *format, ...)
3096 {
3097     PyObject* ret;
3098     va_list vargs;
3099 
3100 #ifdef HAVE_STDARG_PROTOTYPES
3101     va_start(vargs, format);
3102 #else
3103     va_start(vargs);
3104 #endif
3105     ret = PyUnicode_FromFormatV(format, vargs);
3106     va_end(vargs);
3107     return ret;
3108 }
3109 
3110 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3111 unicode_get_widechar_size(PyObject *unicode)
3112 {
3113     Py_ssize_t res;
3114 
3115     assert(unicode != NULL);
3116     assert(_PyUnicode_CHECK(unicode));
3117 
3118     if (_PyUnicode_WSTR(unicode) != NULL) {
3119         return PyUnicode_WSTR_LENGTH(unicode);
3120     }
3121     assert(PyUnicode_IS_READY(unicode));
3122 
3123     res = _PyUnicode_LENGTH(unicode);
3124 #if SIZEOF_WCHAR_T == 2
3125     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3126         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3127         const Py_UCS4 *end = s + res;
3128         for (; s < end; ++s) {
3129             if (*s > 0xFFFF) {
3130                 ++res;
3131             }
3132         }
3133     }
3134 #endif
3135     return res;
3136 }
3137 
3138 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3139 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3140 {
3141     const wchar_t *wstr;
3142 
3143     assert(unicode != NULL);
3144     assert(_PyUnicode_CHECK(unicode));
3145 
3146     wstr = _PyUnicode_WSTR(unicode);
3147     if (wstr != NULL) {
3148         memcpy(w, wstr, size * sizeof(wchar_t));
3149         return;
3150     }
3151     assert(PyUnicode_IS_READY(unicode));
3152 
3153     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3154         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3155         for (; size--; ++s, ++w) {
3156             *w = *s;
3157         }
3158     }
3159     else {
3160 #if SIZEOF_WCHAR_T == 4
3161         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3162         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3163         for (; size--; ++s, ++w) {
3164             *w = *s;
3165         }
3166 #else
3167         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3168         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3169         for (; size--; ++s, ++w) {
3170             Py_UCS4 ch = *s;
3171             if (ch > 0xFFFF) {
3172                 assert(ch <= MAX_UNICODE);
3173                 /* encode surrogate pair in this case */
3174                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3175                 if (!size--)
3176                     break;
3177                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3178             }
3179             else {
3180                 *w = ch;
3181             }
3182         }
3183 #endif
3184     }
3185 }
3186 
3187 #ifdef HAVE_WCHAR_H
3188 
3189 /* Convert a Unicode object to a wide character string.
3190 
3191    - If w is NULL: return the number of wide characters (including the null
3192      character) required to convert the unicode object. Ignore size argument.
3193 
3194    - Otherwise: return the number of wide characters (excluding the null
3195      character) written into w. Write at most size wide characters (including
3196      the null character). */
3197 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3198 PyUnicode_AsWideChar(PyObject *unicode,
3199                      wchar_t *w,
3200                      Py_ssize_t size)
3201 {
3202     Py_ssize_t res;
3203 
3204     if (unicode == NULL) {
3205         PyErr_BadInternalCall();
3206         return -1;
3207     }
3208     if (!PyUnicode_Check(unicode)) {
3209         PyErr_BadArgument();
3210         return -1;
3211     }
3212 
3213     res = unicode_get_widechar_size(unicode);
3214     if (w == NULL) {
3215         return res + 1;
3216     }
3217 
3218     if (size > res) {
3219         size = res + 1;
3220     }
3221     else {
3222         res = size;
3223     }
3224     unicode_copy_as_widechar(unicode, w, size);
3225     return res;
3226 }
3227 
3228 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3229 PyUnicode_AsWideCharString(PyObject *unicode,
3230                            Py_ssize_t *size)
3231 {
3232     wchar_t *buffer;
3233     Py_ssize_t buflen;
3234 
3235     if (unicode == NULL) {
3236         PyErr_BadInternalCall();
3237         return NULL;
3238     }
3239     if (!PyUnicode_Check(unicode)) {
3240         PyErr_BadArgument();
3241         return NULL;
3242     }
3243 
3244     buflen = unicode_get_widechar_size(unicode);
3245     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3246     if (buffer == NULL) {
3247         PyErr_NoMemory();
3248         return NULL;
3249     }
3250     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3251     if (size != NULL) {
3252         *size = buflen;
3253     }
3254     else if (wcslen(buffer) != (size_t)buflen) {
3255         PyMem_FREE(buffer);
3256         PyErr_SetString(PyExc_ValueError,
3257                         "embedded null character");
3258         return NULL;
3259     }
3260     return buffer;
3261 }
3262 
3263 #endif /* HAVE_WCHAR_H */
3264 
3265 PyObject *
PyUnicode_FromOrdinal(int ordinal)3266 PyUnicode_FromOrdinal(int ordinal)
3267 {
3268     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3269         PyErr_SetString(PyExc_ValueError,
3270                         "chr() arg not in range(0x110000)");
3271         return NULL;
3272     }
3273 
3274     return unicode_char((Py_UCS4)ordinal);
3275 }
3276 
3277 PyObject *
PyUnicode_FromObject(PyObject * obj)3278 PyUnicode_FromObject(PyObject *obj)
3279 {
3280     /* XXX Perhaps we should make this API an alias of
3281        PyObject_Str() instead ?! */
3282     if (PyUnicode_CheckExact(obj)) {
3283         if (PyUnicode_READY(obj) == -1)
3284             return NULL;
3285         Py_INCREF(obj);
3286         return obj;
3287     }
3288     if (PyUnicode_Check(obj)) {
3289         /* For a Unicode subtype that's not a Unicode object,
3290            return a true Unicode object with the same data. */
3291         return _PyUnicode_Copy(obj);
3292     }
3293     PyErr_Format(PyExc_TypeError,
3294                  "Can't convert '%.100s' object to str implicitly",
3295                  Py_TYPE(obj)->tp_name);
3296     return NULL;
3297 }
3298 
3299 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3300 PyUnicode_FromEncodedObject(PyObject *obj,
3301                             const char *encoding,
3302                             const char *errors)
3303 {
3304     Py_buffer buffer;
3305     PyObject *v;
3306 
3307     if (obj == NULL) {
3308         PyErr_BadInternalCall();
3309         return NULL;
3310     }
3311 
3312     /* Decoding bytes objects is the most common case and should be fast */
3313     if (PyBytes_Check(obj)) {
3314         if (PyBytes_GET_SIZE(obj) == 0) {
3315             if (unicode_check_encoding_errors(encoding, errors) < 0) {
3316                 return NULL;
3317             }
3318             _Py_RETURN_UNICODE_EMPTY();
3319         }
3320         return PyUnicode_Decode(
3321                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3322                 encoding, errors);
3323     }
3324 
3325     if (PyUnicode_Check(obj)) {
3326         PyErr_SetString(PyExc_TypeError,
3327                         "decoding str is not supported");
3328         return NULL;
3329     }
3330 
3331     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3332     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3333         PyErr_Format(PyExc_TypeError,
3334                      "decoding to str: need a bytes-like object, %.80s found",
3335                      Py_TYPE(obj)->tp_name);
3336         return NULL;
3337     }
3338 
3339     if (buffer.len == 0) {
3340         PyBuffer_Release(&buffer);
3341         if (unicode_check_encoding_errors(encoding, errors) < 0) {
3342             return NULL;
3343         }
3344         _Py_RETURN_UNICODE_EMPTY();
3345     }
3346 
3347     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3348     PyBuffer_Release(&buffer);
3349     return v;
3350 }
3351 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Any
   run of other characters is replaced by a single '_', except at the very
   start of the output (where it is dropped) and at the end of the input
   (where nothing is emitted).  The output is null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3400 
3401 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3402 PyUnicode_Decode(const char *s,
3403                  Py_ssize_t size,
3404                  const char *encoding,
3405                  const char *errors)
3406 {
3407     PyObject *buffer = NULL, *unicode;
3408     Py_buffer info;
3409     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3410 
3411     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3412         return NULL;
3413     }
3414 
3415     if (size == 0) {
3416         _Py_RETURN_UNICODE_EMPTY();
3417     }
3418 
3419     if (encoding == NULL) {
3420         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3421     }
3422 
3423     /* Shortcuts for common default encodings */
3424     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3425         char *lower = buflower;
3426 
3427         /* Fast paths */
3428         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3429             lower += 3;
3430             if (*lower == '_') {
3431                 /* Match "utf8" and "utf_8" */
3432                 lower++;
3433             }
3434 
3435             if (lower[0] == '8' && lower[1] == 0) {
3436                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3437             }
3438             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3439                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3440             }
3441             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3442                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3443             }
3444         }
3445         else {
3446             if (strcmp(lower, "ascii") == 0
3447                 || strcmp(lower, "us_ascii") == 0) {
3448                 return PyUnicode_DecodeASCII(s, size, errors);
3449             }
3450     #ifdef MS_WINDOWS
3451             else if (strcmp(lower, "mbcs") == 0) {
3452                 return PyUnicode_DecodeMBCS(s, size, errors);
3453             }
3454     #endif
3455             else if (strcmp(lower, "latin1") == 0
3456                      || strcmp(lower, "latin_1") == 0
3457                      || strcmp(lower, "iso_8859_1") == 0
3458                      || strcmp(lower, "iso8859_1") == 0) {
3459                 return PyUnicode_DecodeLatin1(s, size, errors);
3460             }
3461         }
3462     }
3463 
3464     /* Decode via the codec registry */
3465     buffer = NULL;
3466     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3467         goto onError;
3468     buffer = PyMemoryView_FromBuffer(&info);
3469     if (buffer == NULL)
3470         goto onError;
3471     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3472     if (unicode == NULL)
3473         goto onError;
3474     if (!PyUnicode_Check(unicode)) {
3475         PyErr_Format(PyExc_TypeError,
3476                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3477                      "use codecs.decode() to decode to arbitrary types",
3478                      encoding,
3479                      Py_TYPE(unicode)->tp_name);
3480         Py_DECREF(unicode);
3481         goto onError;
3482     }
3483     Py_DECREF(buffer);
3484     return unicode_result(unicode);
3485 
3486   onError:
3487     Py_XDECREF(buffer);
3488     return NULL;
3489 }
3490 
3491 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3492 PyUnicode_AsDecodedObject(PyObject *unicode,
3493                           const char *encoding,
3494                           const char *errors)
3495 {
3496     if (!PyUnicode_Check(unicode)) {
3497         PyErr_BadArgument();
3498         return NULL;
3499     }
3500 
3501     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3502                      "PyUnicode_AsDecodedObject() is deprecated; "
3503                      "use PyCodec_Decode() to decode from str", 1) < 0)
3504         return NULL;
3505 
3506     if (encoding == NULL)
3507         encoding = PyUnicode_GetDefaultEncoding();
3508 
3509     /* Decode via the codec registry */
3510     return PyCodec_Decode(unicode, encoding, errors);
3511 }
3512 
3513 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3514 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3515                            const char *encoding,
3516                            const char *errors)
3517 {
3518     PyObject *v;
3519 
3520     if (!PyUnicode_Check(unicode)) {
3521         PyErr_BadArgument();
3522         goto onError;
3523     }
3524 
3525     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3526                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3527                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3528         return NULL;
3529 
3530     if (encoding == NULL)
3531         encoding = PyUnicode_GetDefaultEncoding();
3532 
3533     /* Decode via the codec registry */
3534     v = PyCodec_Decode(unicode, encoding, errors);
3535     if (v == NULL)
3536         goto onError;
3537     if (!PyUnicode_Check(v)) {
3538         PyErr_Format(PyExc_TypeError,
3539                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3540                      "use codecs.decode() to decode to arbitrary types",
3541                      encoding,
3542                      Py_TYPE(unicode)->tp_name);
3543         Py_DECREF(v);
3544         goto onError;
3545     }
3546     return unicode_result(v);
3547 
3548   onError:
3549     return NULL;
3550 }
3551 
3552 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3553 PyUnicode_Encode(const Py_UNICODE *s,
3554                  Py_ssize_t size,
3555                  const char *encoding,
3556                  const char *errors)
3557 {
3558     PyObject *v, *unicode;
3559 
3560     unicode = PyUnicode_FromWideChar(s, size);
3561     if (unicode == NULL)
3562         return NULL;
3563     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3564     Py_DECREF(unicode);
3565     return v;
3566 }
3567 
3568 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3569 PyUnicode_AsEncodedObject(PyObject *unicode,
3570                           const char *encoding,
3571                           const char *errors)
3572 {
3573     PyObject *v;
3574 
3575     if (!PyUnicode_Check(unicode)) {
3576         PyErr_BadArgument();
3577         goto onError;
3578     }
3579 
3580     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3581                      "PyUnicode_AsEncodedObject() is deprecated; "
3582                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3583                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3584         return NULL;
3585 
3586     if (encoding == NULL)
3587         encoding = PyUnicode_GetDefaultEncoding();
3588 
3589     /* Encode via the codec registry */
3590     v = PyCodec_Encode(unicode, encoding, errors);
3591     if (v == NULL)
3592         goto onError;
3593     return v;
3594 
3595   onError:
3596     return NULL;
3597 }
3598 
3599 
3600 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3601 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3602                       int current_locale)
3603 {
3604     Py_ssize_t wlen;
3605     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3606     if (wstr == NULL) {
3607         return NULL;
3608     }
3609 
3610     if ((size_t)wlen != wcslen(wstr)) {
3611         PyErr_SetString(PyExc_ValueError, "embedded null character");
3612         PyMem_Free(wstr);
3613         return NULL;
3614     }
3615 
3616     char *str;
3617     size_t error_pos;
3618     const char *reason;
3619     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3620                                  current_locale, error_handler);
3621     PyMem_Free(wstr);
3622 
3623     if (res != 0) {
3624         if (res == -2) {
3625             PyObject *exc;
3626             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3627                     "locale", unicode,
3628                     (Py_ssize_t)error_pos,
3629                     (Py_ssize_t)(error_pos+1),
3630                     reason);
3631             if (exc != NULL) {
3632                 PyCodec_StrictErrors(exc);
3633                 Py_DECREF(exc);
3634             }
3635         }
3636         else if (res == -3) {
3637             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3638         }
3639         else {
3640             PyErr_NoMemory();
3641         }
3642         return NULL;
3643     }
3644 
3645     PyObject *bytes = PyBytes_FromString(str);
3646     PyMem_RawFree(str);
3647     return bytes;
3648 }
3649 
3650 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3651 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3652 {
3653     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3654     return unicode_encode_locale(unicode, error_handler, 1);
3655 }
3656 
3657 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3658 PyUnicode_EncodeFSDefault(PyObject *unicode)
3659 {
3660     PyInterpreterState *interp = _PyInterpreterState_GET();
3661     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3662     if (fs_codec->utf8) {
3663         return unicode_encode_utf8(unicode,
3664                                    fs_codec->error_handler,
3665                                    fs_codec->errors);
3666     }
3667 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3668     else if (fs_codec->encoding) {
3669         return PyUnicode_AsEncodedString(unicode,
3670                                          fs_codec->encoding,
3671                                          fs_codec->errors);
3672     }
3673 #endif
3674     else {
3675         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3676            machinery is not ready and so cannot be used:
3677            use wcstombs() in this case. */
3678         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3679         const wchar_t *filesystem_errors = config->filesystem_errors;
3680         assert(filesystem_errors != NULL);
3681         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3682         assert(errors != _Py_ERROR_UNKNOWN);
3683 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3684         return unicode_encode_utf8(unicode, errors, NULL);
3685 #else
3686         return unicode_encode_locale(unicode, errors, 0);
3687 #endif
3688     }
3689 }
3690 
3691 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3692 PyUnicode_AsEncodedString(PyObject *unicode,
3693                           const char *encoding,
3694                           const char *errors)
3695 {
3696     PyObject *v;
3697     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3698 
3699     if (!PyUnicode_Check(unicode)) {
3700         PyErr_BadArgument();
3701         return NULL;
3702     }
3703 
3704     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3705         return NULL;
3706     }
3707 
3708     if (encoding == NULL) {
3709         return _PyUnicode_AsUTF8String(unicode, errors);
3710     }
3711 
3712     /* Shortcuts for common default encodings */
3713     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3714         char *lower = buflower;
3715 
3716         /* Fast paths */
3717         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3718             lower += 3;
3719             if (*lower == '_') {
3720                 /* Match "utf8" and "utf_8" */
3721                 lower++;
3722             }
3723 
3724             if (lower[0] == '8' && lower[1] == 0) {
3725                 return _PyUnicode_AsUTF8String(unicode, errors);
3726             }
3727             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3728                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3729             }
3730             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3731                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3732             }
3733         }
3734         else {
3735             if (strcmp(lower, "ascii") == 0
3736                 || strcmp(lower, "us_ascii") == 0) {
3737                 return _PyUnicode_AsASCIIString(unicode, errors);
3738             }
3739 #ifdef MS_WINDOWS
3740             else if (strcmp(lower, "mbcs") == 0) {
3741                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3742             }
3743 #endif
3744             else if (strcmp(lower, "latin1") == 0 ||
3745                      strcmp(lower, "latin_1") == 0 ||
3746                      strcmp(lower, "iso_8859_1") == 0 ||
3747                      strcmp(lower, "iso8859_1") == 0) {
3748                 return _PyUnicode_AsLatin1String(unicode, errors);
3749             }
3750         }
3751     }
3752 
3753     /* Encode via the codec registry */
3754     v = _PyCodec_EncodeText(unicode, encoding, errors);
3755     if (v == NULL)
3756         return NULL;
3757 
3758     /* The normal path */
3759     if (PyBytes_Check(v))
3760         return v;
3761 
3762     /* If the codec returns a buffer, raise a warning and convert to bytes */
3763     if (PyByteArray_Check(v)) {
3764         int error;
3765         PyObject *b;
3766 
3767         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3768             "encoder %s returned bytearray instead of bytes; "
3769             "use codecs.encode() to encode to arbitrary types",
3770             encoding);
3771         if (error) {
3772             Py_DECREF(v);
3773             return NULL;
3774         }
3775 
3776         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3777                                       PyByteArray_GET_SIZE(v));
3778         Py_DECREF(v);
3779         return b;
3780     }
3781 
3782     PyErr_Format(PyExc_TypeError,
3783                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3784                  "use codecs.encode() to encode to arbitrary types",
3785                  encoding,
3786                  Py_TYPE(v)->tp_name);
3787     Py_DECREF(v);
3788     return NULL;
3789 }
3790 
3791 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3792 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3793                            const char *encoding,
3794                            const char *errors)
3795 {
3796     PyObject *v;
3797 
3798     if (!PyUnicode_Check(unicode)) {
3799         PyErr_BadArgument();
3800         goto onError;
3801     }
3802 
3803     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3804                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3805                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3806         return NULL;
3807 
3808     if (encoding == NULL)
3809         encoding = PyUnicode_GetDefaultEncoding();
3810 
3811     /* Encode via the codec registry */
3812     v = PyCodec_Encode(unicode, encoding, errors);
3813     if (v == NULL)
3814         goto onError;
3815     if (!PyUnicode_Check(v)) {
3816         PyErr_Format(PyExc_TypeError,
3817                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3818                      "use codecs.encode() to encode to arbitrary types",
3819                      encoding,
3820                      Py_TYPE(v)->tp_name);
3821         Py_DECREF(v);
3822         goto onError;
3823     }
3824     return v;
3825 
3826   onError:
3827     return NULL;
3828 }
3829 
3830 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3831 unicode_decode_locale(const char *str, Py_ssize_t len,
3832                       _Py_error_handler errors, int current_locale)
3833 {
3834     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3835         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3836         return NULL;
3837     }
3838 
3839     wchar_t *wstr;
3840     size_t wlen;
3841     const char *reason;
3842     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3843                                  current_locale, errors);
3844     if (res != 0) {
3845         if (res == -2) {
3846             PyObject *exc;
3847             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3848                                         "locale", str, len,
3849                                         (Py_ssize_t)wlen,
3850                                         (Py_ssize_t)(wlen + 1),
3851                                         reason);
3852             if (exc != NULL) {
3853                 PyCodec_StrictErrors(exc);
3854                 Py_DECREF(exc);
3855             }
3856         }
3857         else if (res == -3) {
3858             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3859         }
3860         else {
3861             PyErr_NoMemory();
3862         }
3863         return NULL;
3864     }
3865 
3866     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3867     PyMem_RawFree(wstr);
3868     return unicode;
3869 }
3870 
3871 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3872 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3873                               const char *errors)
3874 {
3875     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3876     return unicode_decode_locale(str, len, error_handler, 1);
3877 }
3878 
3879 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3880 PyUnicode_DecodeLocale(const char *str, const char *errors)
3881 {
3882     Py_ssize_t size = (Py_ssize_t)strlen(str);
3883     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3884     return unicode_decode_locale(str, size, error_handler, 1);
3885 }
3886 
3887 
3888 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3889 PyUnicode_DecodeFSDefault(const char *s) {
3890     Py_ssize_t size = (Py_ssize_t)strlen(s);
3891     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3892 }
3893 
3894 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3895 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3896 {
3897     PyInterpreterState *interp = _PyInterpreterState_GET();
3898     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3899     if (fs_codec->utf8) {
3900         return unicode_decode_utf8(s, size,
3901                                    fs_codec->error_handler,
3902                                    fs_codec->errors,
3903                                    NULL);
3904     }
3905 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3906     else if (fs_codec->encoding) {
3907         return PyUnicode_Decode(s, size,
3908                                 fs_codec->encoding,
3909                                 fs_codec->errors);
3910     }
3911 #endif
3912     else {
3913         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3914            machinery is not ready and so cannot be used:
3915            use mbstowcs() in this case. */
3916         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3917         const wchar_t *filesystem_errors = config->filesystem_errors;
3918         assert(filesystem_errors != NULL);
3919         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3920         assert(errors != _Py_ERROR_UNKNOWN);
3921 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3922         return unicode_decode_utf8(s, size, errors, NULL, NULL);
3923 #else
3924         return unicode_decode_locale(s, size, errors, 0);
3925 #endif
3926     }
3927 }
3928 
3929 
3930 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3931 PyUnicode_FSConverter(PyObject* arg, void* addr)
3932 {
3933     PyObject *path = NULL;
3934     PyObject *output = NULL;
3935     Py_ssize_t size;
3936     const char *data;
3937     if (arg == NULL) {
3938         Py_DECREF(*(PyObject**)addr);
3939         *(PyObject**)addr = NULL;
3940         return 1;
3941     }
3942     path = PyOS_FSPath(arg);
3943     if (path == NULL) {
3944         return 0;
3945     }
3946     if (PyBytes_Check(path)) {
3947         output = path;
3948     }
3949     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3950         output = PyUnicode_EncodeFSDefault(path);
3951         Py_DECREF(path);
3952         if (!output) {
3953             return 0;
3954         }
3955         assert(PyBytes_Check(output));
3956     }
3957 
3958     size = PyBytes_GET_SIZE(output);
3959     data = PyBytes_AS_STRING(output);
3960     if ((size_t)size != strlen(data)) {
3961         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3962         Py_DECREF(output);
3963         return 0;
3964     }
3965     *(PyObject**)addr = output;
3966     return Py_CLEANUP_SUPPORTED;
3967 }
3968 
3969 
3970 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3971 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3972 {
3973     int is_buffer = 0;
3974     PyObject *path = NULL;
3975     PyObject *output = NULL;
3976     if (arg == NULL) {
3977         Py_DECREF(*(PyObject**)addr);
3978         *(PyObject**)addr = NULL;
3979         return 1;
3980     }
3981 
3982     is_buffer = PyObject_CheckBuffer(arg);
3983     if (!is_buffer) {
3984         path = PyOS_FSPath(arg);
3985         if (path == NULL) {
3986             return 0;
3987         }
3988     }
3989     else {
3990         path = arg;
3991         Py_INCREF(arg);
3992     }
3993 
3994     if (PyUnicode_Check(path)) {
3995         output = path;
3996     }
3997     else if (PyBytes_Check(path) || is_buffer) {
3998         PyObject *path_bytes = NULL;
3999 
4000         if (!PyBytes_Check(path) &&
4001             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4002             "path should be string, bytes, or os.PathLike, not %.200s",
4003             Py_TYPE(arg)->tp_name)) {
4004                 Py_DECREF(path);
4005             return 0;
4006         }
4007         path_bytes = PyBytes_FromObject(path);
4008         Py_DECREF(path);
4009         if (!path_bytes) {
4010             return 0;
4011         }
4012         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4013                                                   PyBytes_GET_SIZE(path_bytes));
4014         Py_DECREF(path_bytes);
4015         if (!output) {
4016             return 0;
4017         }
4018     }
4019     else {
4020         PyErr_Format(PyExc_TypeError,
4021                      "path should be string, bytes, or os.PathLike, not %.200s",
4022                      Py_TYPE(arg)->tp_name);
4023         Py_DECREF(path);
4024         return 0;
4025     }
4026     if (PyUnicode_READY(output) == -1) {
4027         Py_DECREF(output);
4028         return 0;
4029     }
4030     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4031                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4032         PyErr_SetString(PyExc_ValueError, "embedded null character");
4033         Py_DECREF(output);
4034         return 0;
4035     }
4036     *(PyObject**)addr = output;
4037     return Py_CLEANUP_SUPPORTED;
4038 }
4039 
4040 
4041 static int unicode_fill_utf8(PyObject *unicode);
4042 
4043 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4044 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4045 {
4046     if (!PyUnicode_Check(unicode)) {
4047         PyErr_BadArgument();
4048         return NULL;
4049     }
4050     if (PyUnicode_READY(unicode) == -1)
4051         return NULL;
4052 
4053     if (PyUnicode_UTF8(unicode) == NULL) {
4054         if (unicode_fill_utf8(unicode) == -1) {
4055             return NULL;
4056         }
4057     }
4058 
4059     if (psize)
4060         *psize = PyUnicode_UTF8_LENGTH(unicode);
4061     return PyUnicode_UTF8(unicode);
4062 }
4063 
4064 const char *
PyUnicode_AsUTF8(PyObject * unicode)4065 PyUnicode_AsUTF8(PyObject *unicode)
4066 {
4067     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4068 }
4069 
4070 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4071 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4072 {
4073     if (!PyUnicode_Check(unicode)) {
4074         PyErr_BadArgument();
4075         return NULL;
4076     }
4077     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4078     if (w == NULL) {
4079         /* Non-ASCII compact unicode object */
4080         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4081         assert(PyUnicode_IS_READY(unicode));
4082 
4083         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4084         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4085             PyErr_NoMemory();
4086             return NULL;
4087         }
4088         w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4089         if (w == NULL) {
4090             PyErr_NoMemory();
4091             return NULL;
4092         }
4093         unicode_copy_as_widechar(unicode, w, wlen + 1);
4094         _PyUnicode_WSTR(unicode) = w;
4095         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4096             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4097         }
4098     }
4099     if (size != NULL)
4100         *size = PyUnicode_WSTR_LENGTH(unicode);
4101     return w;
4102 }
4103 
4104 /* Deprecated APIs */
4105 
4106 _Py_COMP_DIAG_PUSH
4107 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4108 
4109 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4110 PyUnicode_AsUnicode(PyObject *unicode)
4111 {
4112     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4113 }
4114 
4115 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4116 _PyUnicode_AsUnicode(PyObject *unicode)
4117 {
4118     Py_ssize_t size;
4119     const Py_UNICODE *wstr;
4120 
4121     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4122     if (wstr && wcslen(wstr) != (size_t)size) {
4123         PyErr_SetString(PyExc_ValueError, "embedded null character");
4124         return NULL;
4125     }
4126     return wstr;
4127 }
4128 
4129 
4130 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4131 PyUnicode_GetSize(PyObject *unicode)
4132 {
4133     if (!PyUnicode_Check(unicode)) {
4134         PyErr_BadArgument();
4135         goto onError;
4136     }
4137     if (_PyUnicode_WSTR(unicode) == NULL) {
4138         if (PyUnicode_AsUnicode(unicode) == NULL)
4139             goto onError;
4140     }
4141     return PyUnicode_WSTR_LENGTH(unicode);
4142 
4143   onError:
4144     return -1;
4145 }
4146 
4147 _Py_COMP_DIAG_POP
4148 
4149 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4150 PyUnicode_GetLength(PyObject *unicode)
4151 {
4152     if (!PyUnicode_Check(unicode)) {
4153         PyErr_BadArgument();
4154         return -1;
4155     }
4156     if (PyUnicode_READY(unicode) == -1)
4157         return -1;
4158     return PyUnicode_GET_LENGTH(unicode);
4159 }
4160 
4161 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163 {
4164     const void *data;
4165     int kind;
4166 
4167     if (!PyUnicode_Check(unicode)) {
4168         PyErr_BadArgument();
4169         return (Py_UCS4)-1;
4170     }
4171     if (PyUnicode_READY(unicode) == -1) {
4172         return (Py_UCS4)-1;
4173     }
4174     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4175         PyErr_SetString(PyExc_IndexError, "string index out of range");
4176         return (Py_UCS4)-1;
4177     }
4178     data = PyUnicode_DATA(unicode);
4179     kind = PyUnicode_KIND(unicode);
4180     return PyUnicode_READ(kind, data, index);
4181 }
4182 
4183 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4184 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4185 {
4186     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4187         PyErr_BadArgument();
4188         return -1;
4189     }
4190     assert(PyUnicode_IS_READY(unicode));
4191     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4192         PyErr_SetString(PyExc_IndexError, "string index out of range");
4193         return -1;
4194     }
4195     if (unicode_check_modifiable(unicode))
4196         return -1;
4197     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4198         PyErr_SetString(PyExc_ValueError, "character out of range");
4199         return -1;
4200     }
4201     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4202                     index, ch);
4203     return 0;
4204 }
4205 
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4211 
4212 /* create or adjust a UnicodeDecodeError */
4213 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4214 make_decode_exception(PyObject **exceptionObject,
4215                       const char *encoding,
4216                       const char *input, Py_ssize_t length,
4217                       Py_ssize_t startpos, Py_ssize_t endpos,
4218                       const char *reason)
4219 {
4220     if (*exceptionObject == NULL) {
4221         *exceptionObject = PyUnicodeDecodeError_Create(
4222             encoding, input, length, startpos, endpos, reason);
4223     }
4224     else {
4225         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4226             goto onError;
4227         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4228             goto onError;
4229         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4230             goto onError;
4231     }
4232     return;
4233 
4234 onError:
4235     Py_CLEAR(*exceptionObject);
4236 }
4237 
4238 #ifdef MS_WINDOWS
4239 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4240 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4241 {
4242     if (newsize > *size) {
4243         wchar_t *newbuf = *buf;
4244         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4245             PyErr_NoMemory();
4246             return -1;
4247         }
4248         *buf = newbuf;
4249     }
4250     *size = newsize;
4251     return 0;
4252 }
4253 
4254 /* error handling callback helper:
4255    build arguments, call the callback and check the arguments,
4256    if no exception occurred, copy the replacement to the output
4257    and adjust various state variables.
4258    return 0 on success, -1 on error
4259 */
4260 
4261 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4262 unicode_decode_call_errorhandler_wchar(
4263     const char *errors, PyObject **errorHandler,
4264     const char *encoding, const char *reason,
4265     const char **input, const char **inend, Py_ssize_t *startinpos,
4266     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4267     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4268 {
4269     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4270 
4271     PyObject *restuple = NULL;
4272     PyObject *repunicode = NULL;
4273     Py_ssize_t outsize;
4274     Py_ssize_t insize;
4275     Py_ssize_t requiredsize;
4276     Py_ssize_t newpos;
4277     PyObject *inputobj = NULL;
4278     wchar_t *repwstr;
4279     Py_ssize_t repwlen;
4280 
4281     if (*errorHandler == NULL) {
4282         *errorHandler = PyCodec_LookupError(errors);
4283         if (*errorHandler == NULL)
4284             goto onError;
4285     }
4286 
4287     make_decode_exception(exceptionObject,
4288         encoding,
4289         *input, *inend - *input,
4290         *startinpos, *endinpos,
4291         reason);
4292     if (*exceptionObject == NULL)
4293         goto onError;
4294 
4295     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4296     if (restuple == NULL)
4297         goto onError;
4298     if (!PyTuple_Check(restuple)) {
4299         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4300         goto onError;
4301     }
4302     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4303         goto onError;
4304 
4305     /* Copy back the bytes variables, which might have been modified by the
4306        callback */
4307     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4308     if (!inputobj)
4309         goto onError;
4310     *input = PyBytes_AS_STRING(inputobj);
4311     insize = PyBytes_GET_SIZE(inputobj);
4312     *inend = *input + insize;
4313     /* we can DECREF safely, as the exception has another reference,
4314        so the object won't go away. */
4315     Py_DECREF(inputobj);
4316 
4317     if (newpos<0)
4318         newpos = insize+newpos;
4319     if (newpos<0 || newpos>insize) {
4320         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4321         goto onError;
4322     }
4323 
4324     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4325     if (repwstr == NULL)
4326         goto onError;
4327     /* need more space? (at least enough for what we
4328        have+the replacement+the rest of the string (starting
4329        at the new input position), so we won't have to check space
4330        when there are no errors in the rest of the string) */
4331     requiredsize = *outpos;
4332     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4333         goto overflow;
4334     requiredsize += repwlen;
4335     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4336         goto overflow;
4337     requiredsize += insize - newpos;
4338     outsize = *bufsize;
4339     if (requiredsize > outsize) {
4340         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4341             requiredsize = 2*outsize;
4342         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4343             goto onError;
4344         }
4345     }
4346     wcsncpy(*buf + *outpos, repwstr, repwlen);
4347     *outpos += repwlen;
4348     *endinpos = newpos;
4349     *inptr = *input + newpos;
4350 
4351     /* we made it! */
4352     Py_DECREF(restuple);
4353     return 0;
4354 
4355   overflow:
4356     PyErr_SetString(PyExc_OverflowError,
4357                     "decoded result is too long for a Python string");
4358 
4359   onError:
4360     Py_XDECREF(restuple);
4361     return -1;
4362 }
4363 #endif   /* MS_WINDOWS */
4364 
4365 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4366 unicode_decode_call_errorhandler_writer(
4367     const char *errors, PyObject **errorHandler,
4368     const char *encoding, const char *reason,
4369     const char **input, const char **inend, Py_ssize_t *startinpos,
4370     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4371     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4372 {
4373     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4374 
4375     PyObject *restuple = NULL;
4376     PyObject *repunicode = NULL;
4377     Py_ssize_t insize;
4378     Py_ssize_t newpos;
4379     Py_ssize_t replen;
4380     Py_ssize_t remain;
4381     PyObject *inputobj = NULL;
4382     int need_to_grow = 0;
4383     const char *new_inptr;
4384 
4385     if (*errorHandler == NULL) {
4386         *errorHandler = PyCodec_LookupError(errors);
4387         if (*errorHandler == NULL)
4388             goto onError;
4389     }
4390 
4391     make_decode_exception(exceptionObject,
4392         encoding,
4393         *input, *inend - *input,
4394         *startinpos, *endinpos,
4395         reason);
4396     if (*exceptionObject == NULL)
4397         goto onError;
4398 
4399     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4400     if (restuple == NULL)
4401         goto onError;
4402     if (!PyTuple_Check(restuple)) {
4403         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4404         goto onError;
4405     }
4406     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4407         goto onError;
4408 
4409     /* Copy back the bytes variables, which might have been modified by the
4410        callback */
4411     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412     if (!inputobj)
4413         goto onError;
4414     remain = *inend - *input - *endinpos;
4415     *input = PyBytes_AS_STRING(inputobj);
4416     insize = PyBytes_GET_SIZE(inputobj);
4417     *inend = *input + insize;
4418     /* we can DECREF safely, as the exception has another reference,
4419        so the object won't go away. */
4420     Py_DECREF(inputobj);
4421 
4422     if (newpos<0)
4423         newpos = insize+newpos;
4424     if (newpos<0 || newpos>insize) {
4425         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4426         goto onError;
4427     }
4428 
4429     replen = PyUnicode_GET_LENGTH(repunicode);
4430     if (replen > 1) {
4431         writer->min_length += replen - 1;
4432         need_to_grow = 1;
4433     }
4434     new_inptr = *input + newpos;
4435     if (*inend - new_inptr > remain) {
4436         /* We don't know the decoding algorithm here so we make the worst
4437            assumption that one byte decodes to one unicode character.
4438            If unfortunately one byte could decode to more unicode characters,
4439            the decoder may write out-of-bound then.  Is it possible for the
4440            algorithms using this function? */
4441         writer->min_length += *inend - new_inptr - remain;
4442         need_to_grow = 1;
4443     }
4444     if (need_to_grow) {
4445         writer->overallocate = 1;
4446         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4447                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4448             goto onError;
4449     }
4450     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4451         goto onError;
4452 
4453     *endinpos = newpos;
4454     *inptr = new_inptr;
4455 
4456     /* we made it! */
4457     Py_DECREF(restuple);
4458     return 0;
4459 
4460   onError:
4461     Py_XDECREF(restuple);
4462     return -1;
4463 }
4464 
4465 /* --- UTF-7 Codec -------------------------------------------------------- */
4466 
4467 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4468 
4469 /* Three simple macros defining base-64. */
4470 
/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
   NOTE: c is evaluated several times; pass an expression without side
   effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4478 
/* Given that c is a base-64 character, what is its base-64 value (0..63)?
   The result is only meaningful when IS_BASE64(c) holds: any character that
   is not a letter, digit or '+' maps to 63.  c is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4486 
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off, so any integer value is accepted. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4491 
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly.  c is evaluated
 * twice, so it must not have side effects. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4499 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written to,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to classify; NUL and anything >= 128 is never direct
 * directO  - non-zero if Set O characters may be written as themselves
 * directWS - non-zero if whitespace may be written as itself
 *
 * c is evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4545 
4546 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4547 PyUnicode_DecodeUTF7(const char *s,
4548                      Py_ssize_t size,
4549                      const char *errors)
4550 {
4551     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4552 }
4553 
4554 /* The decoder.  The only state we preserve is our read position,
4555  * i.e. how many characters we have consumed.  So if we end in the
4556  * middle of a shift sequence we have to back off the read position
4557  * and the output to the beginning of the sequence, otherwise we lose
4558  * all the shift state (seen bits, number of bits seen, high
4559  * surrogate). */
4560 
4561 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4562 PyUnicode_DecodeUTF7Stateful(const char *s,
4563                              Py_ssize_t size,
4564                              const char *errors,
4565                              Py_ssize_t *consumed)
4566 {
4567     const char *starts = s;
4568     Py_ssize_t startinpos;
4569     Py_ssize_t endinpos;
4570     const char *e;
4571     _PyUnicodeWriter writer;
4572     const char *errmsg = "";
4573     int inShift = 0;
4574     Py_ssize_t shiftOutStart;
4575     unsigned int base64bits = 0;
4576     unsigned long base64buffer = 0;
4577     Py_UCS4 surrogate = 0;
4578     PyObject *errorHandler = NULL;
4579     PyObject *exc = NULL;
4580 
4581     if (size == 0) {
4582         if (consumed)
4583             *consumed = 0;
4584         _Py_RETURN_UNICODE_EMPTY();
4585     }
4586 
4587     /* Start off assuming it's all ASCII. Widen later as necessary. */
4588     _PyUnicodeWriter_Init(&writer);
4589     writer.min_length = size;
4590 
4591     shiftOutStart = 0;
4592     e = s + size;
4593 
4594     while (s < e) {
4595         Py_UCS4 ch;
4596       restart:
4597         ch = (unsigned char) *s;
4598 
4599         if (inShift) { /* in a base-64 section */
4600             if (IS_BASE64(ch)) { /* consume a base-64 character */
4601                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4602                 base64bits += 6;
4603                 s++;
4604                 if (base64bits >= 16) {
4605                     /* we have enough bits for a UTF-16 value */
4606                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4607                     base64bits -= 16;
4608                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4609                     assert(outCh <= 0xffff);
4610                     if (surrogate) {
4611                         /* expecting a second surrogate */
4612                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4613                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4614                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4615                                 goto onError;
4616                             surrogate = 0;
4617                             continue;
4618                         }
4619                         else {
4620                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4621                                 goto onError;
4622                             surrogate = 0;
4623                         }
4624                     }
4625                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4626                         /* first surrogate */
4627                         surrogate = outCh;
4628                     }
4629                     else {
4630                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4631                             goto onError;
4632                     }
4633                 }
4634             }
4635             else { /* now leaving a base-64 section */
4636                 inShift = 0;
4637                 if (base64bits > 0) { /* left-over bits */
4638                     if (base64bits >= 6) {
4639                         /* We've seen at least one base-64 character */
4640                         s++;
4641                         errmsg = "partial character in shift sequence";
4642                         goto utf7Error;
4643                     }
4644                     else {
4645                         /* Some bits remain; they should be zero */
4646                         if (base64buffer != 0) {
4647                             s++;
4648                             errmsg = "non-zero padding bits in shift sequence";
4649                             goto utf7Error;
4650                         }
4651                     }
4652                 }
4653                 if (surrogate && DECODE_DIRECT(ch)) {
4654                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4655                         goto onError;
4656                 }
4657                 surrogate = 0;
4658                 if (ch == '-') {
4659                     /* '-' is absorbed; other terminating
4660                        characters are preserved */
4661                     s++;
4662                 }
4663             }
4664         }
4665         else if ( ch == '+' ) {
4666             startinpos = s-starts;
4667             s++; /* consume '+' */
4668             if (s < e && *s == '-') { /* '+-' encodes '+' */
4669                 s++;
4670                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4671                     goto onError;
4672             }
4673             else if (s < e && !IS_BASE64(*s)) {
4674                 s++;
4675                 errmsg = "ill-formed sequence";
4676                 goto utf7Error;
4677             }
4678             else { /* begin base64-encoded section */
4679                 inShift = 1;
4680                 surrogate = 0;
4681                 shiftOutStart = writer.pos;
4682                 base64bits = 0;
4683                 base64buffer = 0;
4684             }
4685         }
4686         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4687             s++;
4688             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4689                 goto onError;
4690         }
4691         else {
4692             startinpos = s-starts;
4693             s++;
4694             errmsg = "unexpected special character";
4695             goto utf7Error;
4696         }
4697         continue;
4698 utf7Error:
4699         endinpos = s-starts;
4700         if (unicode_decode_call_errorhandler_writer(
4701                 errors, &errorHandler,
4702                 "utf7", errmsg,
4703                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4704                 &writer))
4705             goto onError;
4706     }
4707 
4708     /* end of string */
4709 
4710     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4711         /* if we're in an inconsistent state, that's an error */
4712         inShift = 0;
4713         if (surrogate ||
4714                 (base64bits >= 6) ||
4715                 (base64bits > 0 && base64buffer != 0)) {
4716             endinpos = size;
4717             if (unicode_decode_call_errorhandler_writer(
4718                     errors, &errorHandler,
4719                     "utf7", "unterminated shift sequence",
4720                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4721                     &writer))
4722                 goto onError;
4723             if (s < e)
4724                 goto restart;
4725         }
4726     }
4727 
4728     /* return state */
4729     if (consumed) {
4730         if (inShift) {
4731             *consumed = startinpos;
4732             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4733                 PyObject *result = PyUnicode_FromKindAndData(
4734                         writer.kind, writer.data, shiftOutStart);
4735                 Py_XDECREF(errorHandler);
4736                 Py_XDECREF(exc);
4737                 _PyUnicodeWriter_Dealloc(&writer);
4738                 return result;
4739             }
4740             writer.pos = shiftOutStart; /* back off output */
4741         }
4742         else {
4743             *consumed = s-starts;
4744         }
4745     }
4746 
4747     Py_XDECREF(errorHandler);
4748     Py_XDECREF(exc);
4749     return _PyUnicodeWriter_Finish(&writer);
4750 
4751   onError:
4752     Py_XDECREF(errorHandler);
4753     Py_XDECREF(exc);
4754     _PyUnicodeWriter_Dealloc(&writer);
4755     return NULL;
4756 }
4757 
4758 
4759 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4760 _PyUnicode_EncodeUTF7(PyObject *str,
4761                       int base64SetO,
4762                       int base64WhiteSpace,
4763                       const char *errors)
4764 {
4765     int kind;
4766     const void *data;
4767     Py_ssize_t len;
4768     PyObject *v;
4769     int inShift = 0;
4770     Py_ssize_t i;
4771     unsigned int base64bits = 0;
4772     unsigned long base64buffer = 0;
4773     char * out;
4774     const char * start;
4775 
4776     if (PyUnicode_READY(str) == -1)
4777         return NULL;
4778     kind = PyUnicode_KIND(str);
4779     data = PyUnicode_DATA(str);
4780     len = PyUnicode_GET_LENGTH(str);
4781 
4782     if (len == 0)
4783         return PyBytes_FromStringAndSize(NULL, 0);
4784 
4785     /* It might be possible to tighten this worst case */
4786     if (len > PY_SSIZE_T_MAX / 8)
4787         return PyErr_NoMemory();
4788     v = PyBytes_FromStringAndSize(NULL, len * 8);
4789     if (v == NULL)
4790         return NULL;
4791 
4792     start = out = PyBytes_AS_STRING(v);
4793     for (i = 0; i < len; ++i) {
4794         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4795 
4796         if (inShift) {
4797             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4798                 /* shifting out */
4799                 if (base64bits) { /* output remaining bits */
4800                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4801                     base64buffer = 0;
4802                     base64bits = 0;
4803                 }
4804                 inShift = 0;
4805                 /* Characters not in the BASE64 set implicitly unshift the sequence
4806                    so no '-' is required, except if the character is itself a '-' */
4807                 if (IS_BASE64(ch) || ch == '-') {
4808                     *out++ = '-';
4809                 }
4810                 *out++ = (char) ch;
4811             }
4812             else {
4813                 goto encode_char;
4814             }
4815         }
4816         else { /* not in a shift sequence */
4817             if (ch == '+') {
4818                 *out++ = '+';
4819                         *out++ = '-';
4820             }
4821             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4822                 *out++ = (char) ch;
4823             }
4824             else {
4825                 *out++ = '+';
4826                 inShift = 1;
4827                 goto encode_char;
4828             }
4829         }
4830         continue;
4831 encode_char:
4832         if (ch >= 0x10000) {
4833             assert(ch <= MAX_UNICODE);
4834 
4835             /* code first surrogate */
4836             base64bits += 16;
4837             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4838             while (base64bits >= 6) {
4839                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4840                 base64bits -= 6;
4841             }
4842             /* prepare second surrogate */
4843             ch = Py_UNICODE_LOW_SURROGATE(ch);
4844         }
4845         base64bits += 16;
4846         base64buffer = (base64buffer << 16) | ch;
4847         while (base64bits >= 6) {
4848             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4849             base64bits -= 6;
4850         }
4851     }
4852     if (base64bits)
4853         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4854     if (inShift)
4855         *out++ = '-';
4856     if (_PyBytes_Resize(&v, out - start) < 0)
4857         return NULL;
4858     return v;
4859 }
4860 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4861 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4862                      Py_ssize_t size,
4863                      int base64SetO,
4864                      int base64WhiteSpace,
4865                      const char *errors)
4866 {
4867     PyObject *result;
4868     PyObject *tmp = PyUnicode_FromWideChar(s, size);
4869     if (tmp == NULL)
4870         return NULL;
4871     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4872                                    base64WhiteSpace, errors);
4873     Py_DECREF(tmp);
4874     return result;
4875 }
4876 
4877 #undef IS_BASE64
4878 #undef FROM_BASE64
4879 #undef TO_BASE64
4880 #undef DECODE_DIRECT
4881 #undef ENCODE_DIRECT
4882 
4883 /* --- UTF-8 Codec -------------------------------------------------------- */
4884 
4885 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4886 PyUnicode_DecodeUTF8(const char *s,
4887                      Py_ssize_t size,
4888                      const char *errors)
4889 {
4890     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4891 }
4892 
4893 #include "stringlib/asciilib.h"
4894 #include "stringlib/codecs.h"
4895 #include "stringlib/undef.h"
4896 
4897 #include "stringlib/ucs1lib.h"
4898 #include "stringlib/codecs.h"
4899 #include "stringlib/undef.h"
4900 
4901 #include "stringlib/ucs2lib.h"
4902 #include "stringlib/codecs.h"
4903 #include "stringlib/undef.h"
4904 
4905 #include "stringlib/ucs4lib.h"
4906 #include "stringlib/codecs.h"
4907 #include "stringlib/undef.h"
4908 
4909 /* Mask to quickly check whether a C 'long' contains a
4910    non-ASCII, UTF8-encoded char. */
4911 #if (SIZEOF_LONG == 8)
4912 # define ASCII_CHAR_MASK 0x8080808080808080UL
4913 #elif (SIZEOF_LONG == 4)
4914 # define ASCII_CHAR_MASK 0x80808080UL
4915 #else
4916 # error C 'long' size should be either 4 or 8!
4917 #endif
4918 
4919 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4920 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4921 {
4922     const char *p = start;
4923     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4924 
4925     /*
4926      * Issue #17237: m68k is a bit different from most architectures in
4927      * that objects do not use "natural alignment" - for example, int and
4928      * long are only aligned at 2-byte boundaries.  Therefore the assert()
4929      * won't work; also, tests have shown that skipping the "optimised
4930      * version" will even speed up m68k.
4931      */
4932 #if !defined(__m68k__)
4933 #if SIZEOF_LONG <= SIZEOF_VOID_P
4934     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4935     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4936         /* Fast path, see in STRINGLIB(utf8_decode) for
4937            an explanation. */
4938         /* Help allocation */
4939         const char *_p = p;
4940         Py_UCS1 * q = dest;
4941         while (_p < aligned_end) {
4942             unsigned long value = *(const unsigned long *) _p;
4943             if (value & ASCII_CHAR_MASK)
4944                 break;
4945             *((unsigned long *)q) = value;
4946             _p += SIZEOF_LONG;
4947             q += SIZEOF_LONG;
4948         }
4949         p = _p;
4950         while (p < end) {
4951             if ((unsigned char)*p & 0x80)
4952                 break;
4953             *q++ = *p++;
4954         }
4955         return p - start;
4956     }
4957 #endif
4958 #endif
4959     while (p < end) {
4960         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4961            for an explanation. */
4962         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4963             /* Help allocation */
4964             const char *_p = p;
4965             while (_p < aligned_end) {
4966                 unsigned long value = *(const unsigned long *) _p;
4967                 if (value & ASCII_CHAR_MASK)
4968                     break;
4969                 _p += SIZEOF_LONG;
4970             }
4971             p = _p;
4972             if (_p == end)
4973                 break;
4974         }
4975         if ((unsigned char)*p & 0x80)
4976             break;
4977         ++p;
4978     }
4979     memcpy(dest, start, p - start);
4980     return p - start;
4981 }
4982 
4983 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)4984 unicode_decode_utf8(const char *s, Py_ssize_t size,
4985                     _Py_error_handler error_handler, const char *errors,
4986                     Py_ssize_t *consumed)
4987 {
4988     if (size == 0) {
4989         if (consumed)
4990             *consumed = 0;
4991         _Py_RETURN_UNICODE_EMPTY();
4992     }
4993 
4994     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4995     if (size == 1 && (unsigned char)s[0] < 128) {
4996         if (consumed)
4997             *consumed = 1;
4998         return get_latin1_char((unsigned char)s[0]);
4999     }
5000 
5001     const char *starts = s;
5002     const char *end = s + size;
5003 
5004     // fast path: try ASCII string.
5005     PyObject *u = PyUnicode_New(size, 127);
5006     if (u == NULL) {
5007         return NULL;
5008     }
5009     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5010     if (s == end) {
5011         return u;
5012     }
5013 
5014     // Use _PyUnicodeWriter after fast path is failed.
5015     _PyUnicodeWriter writer;
5016     _PyUnicodeWriter_InitWithBuffer(&writer, u);
5017     writer.pos = s - starts;
5018 
5019     Py_ssize_t startinpos, endinpos;
5020     const char *errmsg = "";
5021     PyObject *error_handler_obj = NULL;
5022     PyObject *exc = NULL;
5023 
5024     while (s < end) {
5025         Py_UCS4 ch;
5026         int kind = writer.kind;
5027 
5028         if (kind == PyUnicode_1BYTE_KIND) {
5029             if (PyUnicode_IS_ASCII(writer.buffer))
5030                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5031             else
5032                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5033         } else if (kind == PyUnicode_2BYTE_KIND) {
5034             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5035         } else {
5036             assert(kind == PyUnicode_4BYTE_KIND);
5037             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5038         }
5039 
5040         switch (ch) {
5041         case 0:
5042             if (s == end || consumed)
5043                 goto End;
5044             errmsg = "unexpected end of data";
5045             startinpos = s - starts;
5046             endinpos = end - starts;
5047             break;
5048         case 1:
5049             errmsg = "invalid start byte";
5050             startinpos = s - starts;
5051             endinpos = startinpos + 1;
5052             break;
5053         case 2:
5054             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5055                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5056             {
5057                 /* Truncated surrogate code in range D800-DFFF */
5058                 goto End;
5059             }
5060             /* fall through */
5061         case 3:
5062         case 4:
5063             errmsg = "invalid continuation byte";
5064             startinpos = s - starts;
5065             endinpos = startinpos + ch - 1;
5066             break;
5067         default:
5068             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5069                 goto onError;
5070             continue;
5071         }
5072 
5073         if (error_handler == _Py_ERROR_UNKNOWN)
5074             error_handler = _Py_GetErrorHandler(errors);
5075 
5076         switch (error_handler) {
5077         case _Py_ERROR_IGNORE:
5078             s += (endinpos - startinpos);
5079             break;
5080 
5081         case _Py_ERROR_REPLACE:
5082             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5083                 goto onError;
5084             s += (endinpos - startinpos);
5085             break;
5086 
5087         case _Py_ERROR_SURROGATEESCAPE:
5088         {
5089             Py_ssize_t i;
5090 
5091             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5092                 goto onError;
5093             for (i=startinpos; i<endinpos; i++) {
5094                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5095                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5096                                 ch + 0xdc00);
5097                 writer.pos++;
5098             }
5099             s += (endinpos - startinpos);
5100             break;
5101         }
5102 
5103         default:
5104             if (unicode_decode_call_errorhandler_writer(
5105                     errors, &error_handler_obj,
5106                     "utf-8", errmsg,
5107                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5108                     &writer))
5109                 goto onError;
5110         }
5111     }
5112 
5113 End:
5114     if (consumed)
5115         *consumed = s - starts;
5116 
5117     Py_XDECREF(error_handler_obj);
5118     Py_XDECREF(exc);
5119     return _PyUnicodeWriter_Finish(&writer);
5120 
5121 onError:
5122     Py_XDECREF(error_handler_obj);
5123     Py_XDECREF(exc);
5124     _PyUnicodeWriter_Dealloc(&writer);
5125     return NULL;
5126 }
5127 
5128 
5129 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5130 PyUnicode_DecodeUTF8Stateful(const char *s,
5131                              Py_ssize_t size,
5132                              const char *errors,
5133                              Py_ssize_t *consumed)
5134 {
5135     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5136 }
5137 
5138 
5139 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5140    non-zero, use strict error handler otherwise.
5141 
5142    On success, write a pointer to a newly allocated wide character string into
5143    *wstr (use PyMem_RawFree() to free the memory) and write the output length
5144    (in number of wchar_t units) into *wlen (if wlen is set).
5145 
5146    On memory allocation failure, return -1.
5147 
5148    On decoding error (if surrogateescape is zero), return -2. If wlen is
5149    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5150    is not NULL, write the decoding error message into *reason. */
5151 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5152 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5153                  const char **reason, _Py_error_handler errors)
5154 {
5155     const char *orig_s = s;
5156     const char *e;
5157     wchar_t *unicode;
5158     Py_ssize_t outpos;
5159 
5160     int surrogateescape = 0;
5161     int surrogatepass = 0;
5162     switch (errors)
5163     {
5164     case _Py_ERROR_STRICT:
5165         break;
5166     case _Py_ERROR_SURROGATEESCAPE:
5167         surrogateescape = 1;
5168         break;
5169     case _Py_ERROR_SURROGATEPASS:
5170         surrogatepass = 1;
5171         break;
5172     default:
5173         return -3;
5174     }
5175 
5176     /* Note: size will always be longer than the resulting Unicode
5177        character count */
5178     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5179         return -1;
5180     }
5181 
5182     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5183     if (!unicode) {
5184         return -1;
5185     }
5186 
5187     /* Unpack UTF-8 encoded data */
5188     e = s + size;
5189     outpos = 0;
5190     while (s < e) {
5191         Py_UCS4 ch;
5192 #if SIZEOF_WCHAR_T == 4
5193         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5194 #else
5195         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5196 #endif
5197         if (ch > 0xFF) {
5198 #if SIZEOF_WCHAR_T == 4
5199             Py_UNREACHABLE();
5200 #else
5201             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5202             /* write a surrogate pair */
5203             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5204             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5205 #endif
5206         }
5207         else {
5208             if (!ch && s == e) {
5209                 break;
5210             }
5211 
5212             if (surrogateescape) {
5213                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5214             }
5215             else {
5216                 /* Is it a valid three-byte code? */
5217                 if (surrogatepass
5218                     && (e - s) >= 3
5219                     && (s[0] & 0xf0) == 0xe0
5220                     && (s[1] & 0xc0) == 0x80
5221                     && (s[2] & 0xc0) == 0x80)
5222                 {
5223                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5224                     s += 3;
5225                     unicode[outpos++] = ch;
5226                 }
5227                 else {
5228                     PyMem_RawFree(unicode );
5229                     if (reason != NULL) {
5230                         switch (ch) {
5231                         case 0:
5232                             *reason = "unexpected end of data";
5233                             break;
5234                         case 1:
5235                             *reason = "invalid start byte";
5236                             break;
5237                         /* 2, 3, 4 */
5238                         default:
5239                             *reason = "invalid continuation byte";
5240                             break;
5241                         }
5242                     }
5243                     if (wlen != NULL) {
5244                         *wlen = s - orig_s;
5245                     }
5246                     return -2;
5247                 }
5248             }
5249         }
5250     }
5251     unicode[outpos] = L'\0';
5252     if (wlen) {
5253         *wlen = outpos;
5254     }
5255     *wstr = unicode;
5256     return 0;
5257 }
5258 
5259 
5260 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5261 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5262                                size_t *wlen)
5263 {
5264     wchar_t *wstr;
5265     int res = _Py_DecodeUTF8Ex(arg, arglen,
5266                                &wstr, wlen,
5267                                NULL, _Py_ERROR_SURROGATEESCAPE);
5268     if (res != 0) {
5269         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5270         assert(res != -3);
5271         if (wlen) {
5272             *wlen = (size_t)res;
5273         }
5274         return NULL;
5275     }
5276     return wstr;
5277 }
5278 
5279 
/* UTF-8 encoder.  'errors' selects the error handler and must be one of
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.

   On success, return 0 and write the newly allocated character string into
   *str (the memory comes from PyMem_RawMalloc() if raw_malloc is non-zero
   and from PyMem_Malloc() otherwise; free it with the matching function).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If 'errors' is any other error handler, return -3. */
5290 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5291 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5292                  const char **reason, int raw_malloc, _Py_error_handler errors)
5293 {
5294     const Py_ssize_t max_char_size = 4;
5295     Py_ssize_t len = wcslen(text);
5296 
5297     assert(len >= 0);
5298 
5299     int surrogateescape = 0;
5300     int surrogatepass = 0;
5301     switch (errors)
5302     {
5303     case _Py_ERROR_STRICT:
5304         break;
5305     case _Py_ERROR_SURROGATEESCAPE:
5306         surrogateescape = 1;
5307         break;
5308     case _Py_ERROR_SURROGATEPASS:
5309         surrogatepass = 1;
5310         break;
5311     default:
5312         return -3;
5313     }
5314 
5315     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5316         return -1;
5317     }
5318     char *bytes;
5319     if (raw_malloc) {
5320         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5321     }
5322     else {
5323         bytes = PyMem_Malloc((len + 1) * max_char_size);
5324     }
5325     if (bytes == NULL) {
5326         return -1;
5327     }
5328 
5329     char *p = bytes;
5330     Py_ssize_t i;
5331     for (i = 0; i < len; ) {
5332         Py_ssize_t ch_pos = i;
5333         Py_UCS4 ch = text[i];
5334         i++;
5335 #if Py_UNICODE_SIZE == 2
5336         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5337             && i < len
5338             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5339         {
5340             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5341             i++;
5342         }
5343 #endif
5344 
5345         if (ch < 0x80) {
5346             /* Encode ASCII */
5347             *p++ = (char) ch;
5348 
5349         }
5350         else if (ch < 0x0800) {
5351             /* Encode Latin-1 */
5352             *p++ = (char)(0xc0 | (ch >> 6));
5353             *p++ = (char)(0x80 | (ch & 0x3f));
5354         }
5355         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5356             /* surrogateescape error handler */
5357             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5358                 if (error_pos != NULL) {
5359                     *error_pos = (size_t)ch_pos;
5360                 }
5361                 if (reason != NULL) {
5362                     *reason = "encoding error";
5363                 }
5364                 if (raw_malloc) {
5365                     PyMem_RawFree(bytes);
5366                 }
5367                 else {
5368                     PyMem_Free(bytes);
5369                 }
5370                 return -2;
5371             }
5372             *p++ = (char)(ch & 0xff);
5373         }
5374         else if (ch < 0x10000) {
5375             *p++ = (char)(0xe0 | (ch >> 12));
5376             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5377             *p++ = (char)(0x80 | (ch & 0x3f));
5378         }
5379         else {  /* ch >= 0x10000 */
5380             assert(ch <= MAX_UNICODE);
5381             /* Encode UCS4 Unicode ordinals */
5382             *p++ = (char)(0xf0 | (ch >> 18));
5383             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5384             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5385             *p++ = (char)(0x80 | (ch & 0x3f));
5386         }
5387     }
5388     *p++ = '\0';
5389 
5390     size_t final_size = (p - bytes);
5391     char *bytes2;
5392     if (raw_malloc) {
5393         bytes2 = PyMem_RawRealloc(bytes, final_size);
5394     }
5395     else {
5396         bytes2 = PyMem_Realloc(bytes, final_size);
5397     }
5398     if (bytes2 == NULL) {
5399         if (error_pos != NULL) {
5400             *error_pos = (size_t)-1;
5401         }
5402         if (raw_malloc) {
5403             PyMem_RawFree(bytes);
5404         }
5405         else {
5406             PyMem_Free(bytes);
5407         }
5408         return -1;
5409     }
5410     *str = bytes2;
5411     return 0;
5412 }
5413 
5414 
5415 /* Primary internal function which creates utf8 encoded bytes objects.
5416 
5417    Allocation strategy:  if the string is short, convert into a stack buffer
5418    and allocate exactly as much space needed at the end.  Else allocate the
5419    maximum possible needed (4 result bytes per Unicode character), and return
5420    the excess memory at the end.
5421 */
5422 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5423 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5424                     const char *errors)
5425 {
5426     if (!PyUnicode_Check(unicode)) {
5427         PyErr_BadArgument();
5428         return NULL;
5429     }
5430 
5431     if (PyUnicode_READY(unicode) == -1)
5432         return NULL;
5433 
5434     if (PyUnicode_UTF8(unicode))
5435         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5436                                          PyUnicode_UTF8_LENGTH(unicode));
5437 
5438     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5439     const void *data = PyUnicode_DATA(unicode);
5440     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5441 
5442     _PyBytesWriter writer;
5443     char *end;
5444 
5445     switch (kind) {
5446     default:
5447         Py_UNREACHABLE();
5448     case PyUnicode_1BYTE_KIND:
5449         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5450         assert(!PyUnicode_IS_ASCII(unicode));
5451         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5452         break;
5453     case PyUnicode_2BYTE_KIND:
5454         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5455         break;
5456     case PyUnicode_4BYTE_KIND:
5457         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5458         break;
5459     }
5460 
5461     if (end == NULL) {
5462         _PyBytesWriter_Dealloc(&writer);
5463         return NULL;
5464     }
5465     return _PyBytesWriter_Finish(&writer, end);
5466 }
5467 
5468 static int
unicode_fill_utf8(PyObject * unicode)5469 unicode_fill_utf8(PyObject *unicode)
5470 {
5471     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5472     assert(!PyUnicode_IS_ASCII(unicode));
5473 
5474     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5475     const void *data = PyUnicode_DATA(unicode);
5476     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5477 
5478     _PyBytesWriter writer;
5479     char *end;
5480 
5481     switch (kind) {
5482     default:
5483         Py_UNREACHABLE();
5484     case PyUnicode_1BYTE_KIND:
5485         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5486                                    _Py_ERROR_STRICT, NULL);
5487         break;
5488     case PyUnicode_2BYTE_KIND:
5489         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5490                                    _Py_ERROR_STRICT, NULL);
5491         break;
5492     case PyUnicode_4BYTE_KIND:
5493         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5494                                    _Py_ERROR_STRICT, NULL);
5495         break;
5496     }
5497     if (end == NULL) {
5498         _PyBytesWriter_Dealloc(&writer);
5499         return -1;
5500     }
5501 
5502     const char *start = writer.use_small_buffer ? writer.small_buffer :
5503                     PyBytes_AS_STRING(writer.buffer);
5504     Py_ssize_t len = end - start;
5505 
5506     char *cache = PyObject_MALLOC(len + 1);
5507     if (cache == NULL) {
5508         _PyBytesWriter_Dealloc(&writer);
5509         PyErr_NoMemory();
5510         return -1;
5511     }
5512     _PyUnicode_UTF8(unicode) = cache;
5513     _PyUnicode_UTF8_LENGTH(unicode) = len;
5514     memcpy(cache, start, len);
5515     cache[len] = '\0';
5516     _PyBytesWriter_Dealloc(&writer);
5517     return 0;
5518 }
5519 
5520 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5521 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5522 {
5523     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5524 }
5525 
5526 
5527 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5528 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5529                      Py_ssize_t size,
5530                      const char *errors)
5531 {
5532     PyObject *v, *unicode;
5533 
5534     unicode = PyUnicode_FromWideChar(s, size);
5535     if (unicode == NULL)
5536         return NULL;
5537     v = _PyUnicode_AsUTF8String(unicode, errors);
5538     Py_DECREF(unicode);
5539     return v;
5540 }
5541 
5542 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5543 PyUnicode_AsUTF8String(PyObject *unicode)
5544 {
5545     return _PyUnicode_AsUTF8String(unicode, NULL);
5546 }
5547 
5548 /* --- UTF-32 Codec ------------------------------------------------------- */
5549 
5550 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5551 PyUnicode_DecodeUTF32(const char *s,
5552                       Py_ssize_t size,
5553                       const char *errors,
5554                       int *byteorder)
5555 {
5556     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5557 }
5558 
5559 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5560 PyUnicode_DecodeUTF32Stateful(const char *s,
5561                               Py_ssize_t size,
5562                               const char *errors,
5563                               int *byteorder,
5564                               Py_ssize_t *consumed)
5565 {
5566     const char *starts = s;
5567     Py_ssize_t startinpos;
5568     Py_ssize_t endinpos;
5569     _PyUnicodeWriter writer;
5570     const unsigned char *q, *e;
5571     int le, bo = 0;       /* assume native ordering by default */
5572     const char *encoding;
5573     const char *errmsg = "";
5574     PyObject *errorHandler = NULL;
5575     PyObject *exc = NULL;
5576 
5577     q = (const unsigned char *)s;
5578     e = q + size;
5579 
5580     if (byteorder)
5581         bo = *byteorder;
5582 
5583     /* Check for BOM marks (U+FEFF) in the input and adjust current
5584        byte order setting accordingly. In native mode, the leading BOM
5585        mark is skipped, in all other modes, it is copied to the output
5586        stream as-is (giving a ZWNBSP character). */
5587     if (bo == 0 && size >= 4) {
5588         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5589         if (bom == 0x0000FEFF) {
5590             bo = -1;
5591             q += 4;
5592         }
5593         else if (bom == 0xFFFE0000) {
5594             bo = 1;
5595             q += 4;
5596         }
5597         if (byteorder)
5598             *byteorder = bo;
5599     }
5600 
5601     if (q == e) {
5602         if (consumed)
5603             *consumed = size;
5604         _Py_RETURN_UNICODE_EMPTY();
5605     }
5606 
5607 #ifdef WORDS_BIGENDIAN
5608     le = bo < 0;
5609 #else
5610     le = bo <= 0;
5611 #endif
5612     encoding = le ? "utf-32-le" : "utf-32-be";
5613 
5614     _PyUnicodeWriter_Init(&writer);
5615     writer.min_length = (e - q + 3) / 4;
5616     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5617         goto onError;
5618 
5619     while (1) {
5620         Py_UCS4 ch = 0;
5621         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5622 
5623         if (e - q >= 4) {
5624             enum PyUnicode_Kind kind = writer.kind;
5625             void *data = writer.data;
5626             const unsigned char *last = e - 4;
5627             Py_ssize_t pos = writer.pos;
5628             if (le) {
5629                 do {
5630                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5631                     if (ch > maxch)
5632                         break;
5633                     if (kind != PyUnicode_1BYTE_KIND &&
5634                         Py_UNICODE_IS_SURROGATE(ch))
5635                         break;
5636                     PyUnicode_WRITE(kind, data, pos++, ch);
5637                     q += 4;
5638                 } while (q <= last);
5639             }
5640             else {
5641                 do {
5642                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5643                     if (ch > maxch)
5644                         break;
5645                     if (kind != PyUnicode_1BYTE_KIND &&
5646                         Py_UNICODE_IS_SURROGATE(ch))
5647                         break;
5648                     PyUnicode_WRITE(kind, data, pos++, ch);
5649                     q += 4;
5650                 } while (q <= last);
5651             }
5652             writer.pos = pos;
5653         }
5654 
5655         if (Py_UNICODE_IS_SURROGATE(ch)) {
5656             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5657             startinpos = ((const char *)q) - starts;
5658             endinpos = startinpos + 4;
5659         }
5660         else if (ch <= maxch) {
5661             if (q == e || consumed)
5662                 break;
5663             /* remaining bytes at the end? (size should be divisible by 4) */
5664             errmsg = "truncated data";
5665             startinpos = ((const char *)q) - starts;
5666             endinpos = ((const char *)e) - starts;
5667         }
5668         else {
5669             if (ch < 0x110000) {
5670                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5671                     goto onError;
5672                 q += 4;
5673                 continue;
5674             }
5675             errmsg = "code point not in range(0x110000)";
5676             startinpos = ((const char *)q) - starts;
5677             endinpos = startinpos + 4;
5678         }
5679 
5680         /* The remaining input chars are ignored if the callback
5681            chooses to skip the input */
5682         if (unicode_decode_call_errorhandler_writer(
5683                 errors, &errorHandler,
5684                 encoding, errmsg,
5685                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5686                 &writer))
5687             goto onError;
5688     }
5689 
5690     if (consumed)
5691         *consumed = (const char *)q-starts;
5692 
5693     Py_XDECREF(errorHandler);
5694     Py_XDECREF(exc);
5695     return _PyUnicodeWriter_Finish(&writer);
5696 
5697   onError:
5698     _PyUnicodeWriter_Dealloc(&writer);
5699     Py_XDECREF(errorHandler);
5700     Py_XDECREF(exc);
5701     return NULL;
5702 }
5703 
5704 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5705 _PyUnicode_EncodeUTF32(PyObject *str,
5706                        const char *errors,
5707                        int byteorder)
5708 {
5709     enum PyUnicode_Kind kind;
5710     const void *data;
5711     Py_ssize_t len;
5712     PyObject *v;
5713     uint32_t *out;
5714 #if PY_LITTLE_ENDIAN
5715     int native_ordering = byteorder <= 0;
5716 #else
5717     int native_ordering = byteorder >= 0;
5718 #endif
5719     const char *encoding;
5720     Py_ssize_t nsize, pos;
5721     PyObject *errorHandler = NULL;
5722     PyObject *exc = NULL;
5723     PyObject *rep = NULL;
5724 
5725     if (!PyUnicode_Check(str)) {
5726         PyErr_BadArgument();
5727         return NULL;
5728     }
5729     if (PyUnicode_READY(str) == -1)
5730         return NULL;
5731     kind = PyUnicode_KIND(str);
5732     data = PyUnicode_DATA(str);
5733     len = PyUnicode_GET_LENGTH(str);
5734 
5735     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5736         return PyErr_NoMemory();
5737     nsize = len + (byteorder == 0);
5738     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5739     if (v == NULL)
5740         return NULL;
5741 
5742     /* output buffer is 4-bytes aligned */
5743     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5744     out = (uint32_t *)PyBytes_AS_STRING(v);
5745     if (byteorder == 0)
5746         *out++ = 0xFEFF;
5747     if (len == 0)
5748         goto done;
5749 
5750     if (byteorder == -1)
5751         encoding = "utf-32-le";
5752     else if (byteorder == 1)
5753         encoding = "utf-32-be";
5754     else
5755         encoding = "utf-32";
5756 
5757     if (kind == PyUnicode_1BYTE_KIND) {
5758         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5759         goto done;
5760     }
5761 
5762     pos = 0;
5763     while (pos < len) {
5764         Py_ssize_t repsize, moreunits;
5765 
5766         if (kind == PyUnicode_2BYTE_KIND) {
5767             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5768                                         &out, native_ordering);
5769         }
5770         else {
5771             assert(kind == PyUnicode_4BYTE_KIND);
5772             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5773                                         &out, native_ordering);
5774         }
5775         if (pos == len)
5776             break;
5777 
5778         rep = unicode_encode_call_errorhandler(
5779                 errors, &errorHandler,
5780                 encoding, "surrogates not allowed",
5781                 str, &exc, pos, pos + 1, &pos);
5782         if (!rep)
5783             goto error;
5784 
5785         if (PyBytes_Check(rep)) {
5786             repsize = PyBytes_GET_SIZE(rep);
5787             if (repsize & 3) {
5788                 raise_encode_exception(&exc, encoding,
5789                                        str, pos - 1, pos,
5790                                        "surrogates not allowed");
5791                 goto error;
5792             }
5793             moreunits = repsize / 4;
5794         }
5795         else {
5796             assert(PyUnicode_Check(rep));
5797             if (PyUnicode_READY(rep) < 0)
5798                 goto error;
5799             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5800             if (!PyUnicode_IS_ASCII(rep)) {
5801                 raise_encode_exception(&exc, encoding,
5802                                        str, pos - 1, pos,
5803                                        "surrogates not allowed");
5804                 goto error;
5805             }
5806         }
5807 
5808         /* four bytes are reserved for each surrogate */
5809         if (moreunits > 1) {
5810             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5811             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5812                 /* integer overflow */
5813                 PyErr_NoMemory();
5814                 goto error;
5815             }
5816             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5817                 goto error;
5818             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5819         }
5820 
5821         if (PyBytes_Check(rep)) {
5822             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5823             out += moreunits;
5824         } else /* rep is unicode */ {
5825             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5826             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5827                                  &out, native_ordering);
5828         }
5829 
5830         Py_CLEAR(rep);
5831     }
5832 
5833     /* Cut back to size actually needed. This is necessary for, for example,
5834        encoding of a string containing isolated surrogates and the 'ignore'
5835        handler is used. */
5836     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5837     if (nsize != PyBytes_GET_SIZE(v))
5838       _PyBytes_Resize(&v, nsize);
5839     Py_XDECREF(errorHandler);
5840     Py_XDECREF(exc);
5841   done:
5842     return v;
5843   error:
5844     Py_XDECREF(rep);
5845     Py_XDECREF(errorHandler);
5846     Py_XDECREF(exc);
5847     Py_XDECREF(v);
5848     return NULL;
5849 }
5850 
5851 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5852 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5853                       Py_ssize_t size,
5854                       const char *errors,
5855                       int byteorder)
5856 {
5857     PyObject *result;
5858     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5859     if (tmp == NULL)
5860         return NULL;
5861     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5862     Py_DECREF(tmp);
5863     return result;
5864 }
5865 
5866 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5867 PyUnicode_AsUTF32String(PyObject *unicode)
5868 {
5869     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5870 }
5871 
5872 /* --- UTF-16 Codec ------------------------------------------------------- */
5873 
5874 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5875 PyUnicode_DecodeUTF16(const char *s,
5876                       Py_ssize_t size,
5877                       const char *errors,
5878                       int *byteorder)
5879 {
5880     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5881 }
5882 
5883 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5884 PyUnicode_DecodeUTF16Stateful(const char *s,
5885                               Py_ssize_t size,
5886                               const char *errors,
5887                               int *byteorder,
5888                               Py_ssize_t *consumed)
5889 {
5890     const char *starts = s;
5891     Py_ssize_t startinpos;
5892     Py_ssize_t endinpos;
5893     _PyUnicodeWriter writer;
5894     const unsigned char *q, *e;
5895     int bo = 0;       /* assume native ordering by default */
5896     int native_ordering;
5897     const char *errmsg = "";
5898     PyObject *errorHandler = NULL;
5899     PyObject *exc = NULL;
5900     const char *encoding;
5901 
5902     q = (const unsigned char *)s;
5903     e = q + size;
5904 
5905     if (byteorder)
5906         bo = *byteorder;
5907 
5908     /* Check for BOM marks (U+FEFF) in the input and adjust current
5909        byte order setting accordingly. In native mode, the leading BOM
5910        mark is skipped, in all other modes, it is copied to the output
5911        stream as-is (giving a ZWNBSP character). */
5912     if (bo == 0 && size >= 2) {
5913         const Py_UCS4 bom = (q[1] << 8) | q[0];
5914         if (bom == 0xFEFF) {
5915             q += 2;
5916             bo = -1;
5917         }
5918         else if (bom == 0xFFFE) {
5919             q += 2;
5920             bo = 1;
5921         }
5922         if (byteorder)
5923             *byteorder = bo;
5924     }
5925 
5926     if (q == e) {
5927         if (consumed)
5928             *consumed = size;
5929         _Py_RETURN_UNICODE_EMPTY();
5930     }
5931 
5932 #if PY_LITTLE_ENDIAN
5933     native_ordering = bo <= 0;
5934     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5935 #else
5936     native_ordering = bo >= 0;
5937     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5938 #endif
5939 
5940     /* Note: size will always be longer than the resulting Unicode
5941        character count normally.  Error handler will take care of
5942        resizing when needed. */
5943     _PyUnicodeWriter_Init(&writer);
5944     writer.min_length = (e - q + 1) / 2;
5945     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5946         goto onError;
5947 
5948     while (1) {
5949         Py_UCS4 ch = 0;
5950         if (e - q >= 2) {
5951             int kind = writer.kind;
5952             if (kind == PyUnicode_1BYTE_KIND) {
5953                 if (PyUnicode_IS_ASCII(writer.buffer))
5954                     ch = asciilib_utf16_decode(&q, e,
5955                             (Py_UCS1*)writer.data, &writer.pos,
5956                             native_ordering);
5957                 else
5958                     ch = ucs1lib_utf16_decode(&q, e,
5959                             (Py_UCS1*)writer.data, &writer.pos,
5960                             native_ordering);
5961             } else if (kind == PyUnicode_2BYTE_KIND) {
5962                 ch = ucs2lib_utf16_decode(&q, e,
5963                         (Py_UCS2*)writer.data, &writer.pos,
5964                         native_ordering);
5965             } else {
5966                 assert(kind == PyUnicode_4BYTE_KIND);
5967                 ch = ucs4lib_utf16_decode(&q, e,
5968                         (Py_UCS4*)writer.data, &writer.pos,
5969                         native_ordering);
5970             }
5971         }
5972 
5973         switch (ch)
5974         {
5975         case 0:
5976             /* remaining byte at the end? (size should be even) */
5977             if (q == e || consumed)
5978                 goto End;
5979             errmsg = "truncated data";
5980             startinpos = ((const char *)q) - starts;
5981             endinpos = ((const char *)e) - starts;
5982             break;
5983             /* The remaining input chars are ignored if the callback
5984                chooses to skip the input */
5985         case 1:
5986             q -= 2;
5987             if (consumed)
5988                 goto End;
5989             errmsg = "unexpected end of data";
5990             startinpos = ((const char *)q) - starts;
5991             endinpos = ((const char *)e) - starts;
5992             break;
5993         case 2:
5994             errmsg = "illegal encoding";
5995             startinpos = ((const char *)q) - 2 - starts;
5996             endinpos = startinpos + 2;
5997             break;
5998         case 3:
5999             errmsg = "illegal UTF-16 surrogate";
6000             startinpos = ((const char *)q) - 4 - starts;
6001             endinpos = startinpos + 2;
6002             break;
6003         default:
6004             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6005                 goto onError;
6006             continue;
6007         }
6008 
6009         if (unicode_decode_call_errorhandler_writer(
6010                 errors,
6011                 &errorHandler,
6012                 encoding, errmsg,
6013                 &starts,
6014                 (const char **)&e,
6015                 &startinpos,
6016                 &endinpos,
6017                 &exc,
6018                 (const char **)&q,
6019                 &writer))
6020             goto onError;
6021     }
6022 
6023 End:
6024     if (consumed)
6025         *consumed = (const char *)q-starts;
6026 
6027     Py_XDECREF(errorHandler);
6028     Py_XDECREF(exc);
6029     return _PyUnicodeWriter_Finish(&writer);
6030 
6031   onError:
6032     _PyUnicodeWriter_Dealloc(&writer);
6033     Py_XDECREF(errorHandler);
6034     Py_XDECREF(exc);
6035     return NULL;
6036 }
6037 
6038 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6039 _PyUnicode_EncodeUTF16(PyObject *str,
6040                        const char *errors,
6041                        int byteorder)
6042 {
6043     enum PyUnicode_Kind kind;
6044     const void *data;
6045     Py_ssize_t len;
6046     PyObject *v;
6047     unsigned short *out;
6048     Py_ssize_t pairs;
6049 #if PY_BIG_ENDIAN
6050     int native_ordering = byteorder >= 0;
6051 #else
6052     int native_ordering = byteorder <= 0;
6053 #endif
6054     const char *encoding;
6055     Py_ssize_t nsize, pos;
6056     PyObject *errorHandler = NULL;
6057     PyObject *exc = NULL;
6058     PyObject *rep = NULL;
6059 
6060     if (!PyUnicode_Check(str)) {
6061         PyErr_BadArgument();
6062         return NULL;
6063     }
6064     if (PyUnicode_READY(str) == -1)
6065         return NULL;
6066     kind = PyUnicode_KIND(str);
6067     data = PyUnicode_DATA(str);
6068     len = PyUnicode_GET_LENGTH(str);
6069 
6070     pairs = 0;
6071     if (kind == PyUnicode_4BYTE_KIND) {
6072         const Py_UCS4 *in = (const Py_UCS4 *)data;
6073         const Py_UCS4 *end = in + len;
6074         while (in < end) {
6075             if (*in++ >= 0x10000) {
6076                 pairs++;
6077             }
6078         }
6079     }
6080     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6081         return PyErr_NoMemory();
6082     }
6083     nsize = len + pairs + (byteorder == 0);
6084     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6085     if (v == NULL) {
6086         return NULL;
6087     }
6088 
6089     /* output buffer is 2-bytes aligned */
6090     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6091     out = (unsigned short *)PyBytes_AS_STRING(v);
6092     if (byteorder == 0) {
6093         *out++ = 0xFEFF;
6094     }
6095     if (len == 0) {
6096         goto done;
6097     }
6098 
6099     if (kind == PyUnicode_1BYTE_KIND) {
6100         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6101         goto done;
6102     }
6103 
6104     if (byteorder < 0) {
6105         encoding = "utf-16-le";
6106     }
6107     else if (byteorder > 0) {
6108         encoding = "utf-16-be";
6109     }
6110     else {
6111         encoding = "utf-16";
6112     }
6113 
6114     pos = 0;
6115     while (pos < len) {
6116         Py_ssize_t repsize, moreunits;
6117 
6118         if (kind == PyUnicode_2BYTE_KIND) {
6119             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6120                                         &out, native_ordering);
6121         }
6122         else {
6123             assert(kind == PyUnicode_4BYTE_KIND);
6124             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6125                                         &out, native_ordering);
6126         }
6127         if (pos == len)
6128             break;
6129 
6130         rep = unicode_encode_call_errorhandler(
6131                 errors, &errorHandler,
6132                 encoding, "surrogates not allowed",
6133                 str, &exc, pos, pos + 1, &pos);
6134         if (!rep)
6135             goto error;
6136 
6137         if (PyBytes_Check(rep)) {
6138             repsize = PyBytes_GET_SIZE(rep);
6139             if (repsize & 1) {
6140                 raise_encode_exception(&exc, encoding,
6141                                        str, pos - 1, pos,
6142                                        "surrogates not allowed");
6143                 goto error;
6144             }
6145             moreunits = repsize / 2;
6146         }
6147         else {
6148             assert(PyUnicode_Check(rep));
6149             if (PyUnicode_READY(rep) < 0)
6150                 goto error;
6151             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6152             if (!PyUnicode_IS_ASCII(rep)) {
6153                 raise_encode_exception(&exc, encoding,
6154                                        str, pos - 1, pos,
6155                                        "surrogates not allowed");
6156                 goto error;
6157             }
6158         }
6159 
6160         /* two bytes are reserved for each surrogate */
6161         if (moreunits > 1) {
6162             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6163             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6164                 /* integer overflow */
6165                 PyErr_NoMemory();
6166                 goto error;
6167             }
6168             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6169                 goto error;
6170             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6171         }
6172 
6173         if (PyBytes_Check(rep)) {
6174             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6175             out += moreunits;
6176         } else /* rep is unicode */ {
6177             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6178             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6179                                  &out, native_ordering);
6180         }
6181 
6182         Py_CLEAR(rep);
6183     }
6184 
6185     /* Cut back to size actually needed. This is necessary for, for example,
6186     encoding of a string containing isolated surrogates and the 'ignore' handler
6187     is used. */
6188     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6189     if (nsize != PyBytes_GET_SIZE(v))
6190       _PyBytes_Resize(&v, nsize);
6191     Py_XDECREF(errorHandler);
6192     Py_XDECREF(exc);
6193   done:
6194     return v;
6195   error:
6196     Py_XDECREF(rep);
6197     Py_XDECREF(errorHandler);
6198     Py_XDECREF(exc);
6199     Py_XDECREF(v);
6200     return NULL;
6201 #undef STORECHAR
6202 }
6203 
6204 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6205 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6206                       Py_ssize_t size,
6207                       const char *errors,
6208                       int byteorder)
6209 {
6210     PyObject *result;
6211     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6212     if (tmp == NULL)
6213         return NULL;
6214     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6215     Py_DECREF(tmp);
6216     return result;
6217 }
6218 
6219 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6220 PyUnicode_AsUTF16String(PyObject *unicode)
6221 {
6222     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6223 }
6224 
6225 /* --- Unicode Escape Codec ----------------------------------------------- */
6226 
6227 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6228 
6229 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)6230 _PyUnicode_DecodeUnicodeEscape(const char *s,
6231                                Py_ssize_t size,
6232                                const char *errors,
6233                                const char **first_invalid_escape)
6234 {
6235     const char *starts = s;
6236     _PyUnicodeWriter writer;
6237     const char *end;
6238     PyObject *errorHandler = NULL;
6239     PyObject *exc = NULL;
6240 
6241     // so we can remember if we've seen an invalid escape char or not
6242     *first_invalid_escape = NULL;
6243 
6244     if (size == 0) {
6245         _Py_RETURN_UNICODE_EMPTY();
6246     }
6247     /* Escaped strings will always be longer than the resulting
6248        Unicode string, so we start with size here and then reduce the
6249        length after conversion to the true value.
6250        (but if the error callback returns a long replacement string
6251        we'll have to allocate more space) */
6252     _PyUnicodeWriter_Init(&writer);
6253     writer.min_length = size;
6254     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6255         goto onError;
6256     }
6257 
6258     end = s + size;
6259     while (s < end) {
6260         unsigned char c = (unsigned char) *s++;
6261         Py_UCS4 ch;
6262         int count;
6263         Py_ssize_t startinpos;
6264         Py_ssize_t endinpos;
6265         const char *message;
6266 
6267 #define WRITE_ASCII_CHAR(ch)                                                  \
6268             do {                                                              \
6269                 assert(ch <= 127);                                            \
6270                 assert(writer.pos < writer.size);                             \
6271                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6272             } while(0)
6273 
6274 #define WRITE_CHAR(ch)                                                        \
6275             do {                                                              \
6276                 if (ch <= writer.maxchar) {                                   \
6277                     assert(writer.pos < writer.size);                         \
6278                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6279                 }                                                             \
6280                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6281                     goto onError;                                             \
6282                 }                                                             \
6283             } while(0)
6284 
6285         /* Non-escape characters are interpreted as Unicode ordinals */
6286         if (c != '\\') {
6287             WRITE_CHAR(c);
6288             continue;
6289         }
6290 
6291         startinpos = s - starts - 1;
6292         /* \ - Escapes */
6293         if (s >= end) {
6294             message = "\\ at end of string";
6295             goto error;
6296         }
6297         c = (unsigned char) *s++;
6298 
6299         assert(writer.pos < writer.size);
6300         switch (c) {
6301 
6302             /* \x escapes */
6303         case '\n': continue;
6304         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6305         case '\'': WRITE_ASCII_CHAR('\''); continue;
6306         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6307         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6308         /* FF */
6309         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6310         case 't': WRITE_ASCII_CHAR('\t'); continue;
6311         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6312         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6313         /* VT */
6314         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6315         /* BEL, not classic C */
6316         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6317 
6318             /* \OOO (octal) escapes */
6319         case '0': case '1': case '2': case '3':
6320         case '4': case '5': case '6': case '7':
6321             ch = c - '0';
6322             if (s < end && '0' <= *s && *s <= '7') {
6323                 ch = (ch<<3) + *s++ - '0';
6324                 if (s < end && '0' <= *s && *s <= '7') {
6325                     ch = (ch<<3) + *s++ - '0';
6326                 }
6327             }
6328             WRITE_CHAR(ch);
6329             continue;
6330 
6331             /* hex escapes */
6332             /* \xXX */
6333         case 'x':
6334             count = 2;
6335             message = "truncated \\xXX escape";
6336             goto hexescape;
6337 
6338             /* \uXXXX */
6339         case 'u':
6340             count = 4;
6341             message = "truncated \\uXXXX escape";
6342             goto hexescape;
6343 
6344             /* \UXXXXXXXX */
6345         case 'U':
6346             count = 8;
6347             message = "truncated \\UXXXXXXXX escape";
6348         hexescape:
6349             for (ch = 0; count && s < end; ++s, --count) {
6350                 c = (unsigned char)*s;
6351                 ch <<= 4;
6352                 if (c >= '0' && c <= '9') {
6353                     ch += c - '0';
6354                 }
6355                 else if (c >= 'a' && c <= 'f') {
6356                     ch += c - ('a' - 10);
6357                 }
6358                 else if (c >= 'A' && c <= 'F') {
6359                     ch += c - ('A' - 10);
6360                 }
6361                 else {
6362                     break;
6363                 }
6364             }
6365             if (count) {
6366                 goto error;
6367             }
6368 
6369             /* when we get here, ch is a 32-bit unicode character */
6370             if (ch > MAX_UNICODE) {
6371                 message = "illegal Unicode character";
6372                 goto error;
6373             }
6374 
6375             WRITE_CHAR(ch);
6376             continue;
6377 
6378             /* \N{name} */
6379         case 'N':
6380             if (ucnhash_CAPI == NULL) {
6381                 /* load the unicode data module */
6382                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6383                                                 PyUnicodeData_CAPSULE_NAME, 1);
6384                 if (ucnhash_CAPI == NULL) {
6385                     PyErr_SetString(
6386                         PyExc_UnicodeError,
6387                         "\\N escapes not supported (can't load unicodedata module)"
6388                         );
6389                     goto onError;
6390                 }
6391             }
6392 
6393             message = "malformed \\N character escape";
6394             if (s < end && *s == '{') {
6395                 const char *start = ++s;
6396                 size_t namelen;
6397                 /* look for the closing brace */
6398                 while (s < end && *s != '}')
6399                     s++;
6400                 namelen = s - start;
6401                 if (namelen && s < end) {
6402                     /* found a name.  look it up in the unicode database */
6403                     s++;
6404                     ch = 0xffffffff; /* in case 'getcode' messes up */
6405                     if (namelen <= INT_MAX &&
6406                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6407                                               &ch, 0)) {
6408                         assert(ch <= MAX_UNICODE);
6409                         WRITE_CHAR(ch);
6410                         continue;
6411                     }
6412                     message = "unknown Unicode character name";
6413                 }
6414             }
6415             goto error;
6416 
6417         default:
6418             if (*first_invalid_escape == NULL) {
6419                 *first_invalid_escape = s-1; /* Back up one char, since we've
6420                                                 already incremented s. */
6421             }
6422             WRITE_ASCII_CHAR('\\');
6423             WRITE_CHAR(c);
6424             continue;
6425         }
6426 
6427       error:
6428         endinpos = s-starts;
6429         writer.min_length = end - s + writer.pos;
6430         if (unicode_decode_call_errorhandler_writer(
6431                 errors, &errorHandler,
6432                 "unicodeescape", message,
6433                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6434                 &writer)) {
6435             goto onError;
6436         }
6437         assert(end - s <= writer.size - writer.pos);
6438 
6439 #undef WRITE_ASCII_CHAR
6440 #undef WRITE_CHAR
6441     }
6442 
6443     Py_XDECREF(errorHandler);
6444     Py_XDECREF(exc);
6445     return _PyUnicodeWriter_Finish(&writer);
6446 
6447   onError:
6448     _PyUnicodeWriter_Dealloc(&writer);
6449     Py_XDECREF(errorHandler);
6450     Py_XDECREF(exc);
6451     return NULL;
6452 }
6453 
6454 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6455 PyUnicode_DecodeUnicodeEscape(const char *s,
6456                               Py_ssize_t size,
6457                               const char *errors)
6458 {
6459     const char *first_invalid_escape;
6460     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6461                                                       &first_invalid_escape);
6462     if (result == NULL)
6463         return NULL;
6464     if (first_invalid_escape != NULL) {
6465         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6466                              "invalid escape sequence '\\%c'",
6467                              (unsigned char)*first_invalid_escape) < 0) {
6468             Py_DECREF(result);
6469             return NULL;
6470         }
6471     }
6472     return result;
6473 }
6474 
6475 /* Return a Unicode-Escape string version of the Unicode object. */
6476 
6477 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6478 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6479 {
6480     Py_ssize_t i, len;
6481     PyObject *repr;
6482     char *p;
6483     enum PyUnicode_Kind kind;
6484     const void *data;
6485     Py_ssize_t expandsize;
6486 
6487     /* Initial allocation is based on the longest-possible character
6488        escape.
6489 
6490        For UCS1 strings it's '\xxx', 4 bytes per source character.
6491        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6492        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6493     */
6494 
6495     if (!PyUnicode_Check(unicode)) {
6496         PyErr_BadArgument();
6497         return NULL;
6498     }
6499     if (PyUnicode_READY(unicode) == -1) {
6500         return NULL;
6501     }
6502 
6503     len = PyUnicode_GET_LENGTH(unicode);
6504     if (len == 0) {
6505         return PyBytes_FromStringAndSize(NULL, 0);
6506     }
6507 
6508     kind = PyUnicode_KIND(unicode);
6509     data = PyUnicode_DATA(unicode);
6510     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6511        bytes, and 1 byte characters 4. */
6512     expandsize = kind * 2 + 2;
6513     if (len > PY_SSIZE_T_MAX / expandsize) {
6514         return PyErr_NoMemory();
6515     }
6516     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6517     if (repr == NULL) {
6518         return NULL;
6519     }
6520 
6521     p = PyBytes_AS_STRING(repr);
6522     for (i = 0; i < len; i++) {
6523         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6524 
6525         /* U+0000-U+00ff range */
6526         if (ch < 0x100) {
6527             if (ch >= ' ' && ch < 127) {
6528                 if (ch != '\\') {
6529                     /* Copy printable US ASCII as-is */
6530                     *p++ = (char) ch;
6531                 }
6532                 /* Escape backslashes */
6533                 else {
6534                     *p++ = '\\';
6535                     *p++ = '\\';
6536                 }
6537             }
6538 
6539             /* Map special whitespace to '\t', \n', '\r' */
6540             else if (ch == '\t') {
6541                 *p++ = '\\';
6542                 *p++ = 't';
6543             }
6544             else if (ch == '\n') {
6545                 *p++ = '\\';
6546                 *p++ = 'n';
6547             }
6548             else if (ch == '\r') {
6549                 *p++ = '\\';
6550                 *p++ = 'r';
6551             }
6552 
6553             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6554             else {
6555                 *p++ = '\\';
6556                 *p++ = 'x';
6557                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6558                 *p++ = Py_hexdigits[ch & 0x000F];
6559             }
6560         }
6561         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6562         else if (ch < 0x10000) {
6563             *p++ = '\\';
6564             *p++ = 'u';
6565             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6566             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6567             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6568             *p++ = Py_hexdigits[ch & 0x000F];
6569         }
6570         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6571         else {
6572 
6573             /* Make sure that the first two digits are zero */
6574             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6575             *p++ = '\\';
6576             *p++ = 'U';
6577             *p++ = '0';
6578             *p++ = '0';
6579             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6580             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6581             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6582             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6583             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6584             *p++ = Py_hexdigits[ch & 0x0000000F];
6585         }
6586     }
6587 
6588     assert(p - PyBytes_AS_STRING(repr) > 0);
6589     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6590         return NULL;
6591     }
6592     return repr;
6593 }
6594 
6595 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6596 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6597                               Py_ssize_t size)
6598 {
6599     PyObject *result;
6600     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6601     if (tmp == NULL) {
6602         return NULL;
6603     }
6604 
6605     result = PyUnicode_AsUnicodeEscapeString(tmp);
6606     Py_DECREF(tmp);
6607     return result;
6608 }
6609 
6610 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6611 
6612 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6613 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6614                                  Py_ssize_t size,
6615                                  const char *errors)
6616 {
6617     const char *starts = s;
6618     _PyUnicodeWriter writer;
6619     const char *end;
6620     PyObject *errorHandler = NULL;
6621     PyObject *exc = NULL;
6622 
6623     if (size == 0) {
6624         _Py_RETURN_UNICODE_EMPTY();
6625     }
6626 
6627     /* Escaped strings will always be longer than the resulting
6628        Unicode string, so we start with size here and then reduce the
6629        length after conversion to the true value. (But decoding error
6630        handler might have to resize the string) */
6631     _PyUnicodeWriter_Init(&writer);
6632     writer.min_length = size;
6633     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6634         goto onError;
6635     }
6636 
6637     end = s + size;
6638     while (s < end) {
6639         unsigned char c = (unsigned char) *s++;
6640         Py_UCS4 ch;
6641         int count;
6642         Py_ssize_t startinpos;
6643         Py_ssize_t endinpos;
6644         const char *message;
6645 
6646 #define WRITE_CHAR(ch)                                                        \
6647             do {                                                              \
6648                 if (ch <= writer.maxchar) {                                   \
6649                     assert(writer.pos < writer.size);                         \
6650                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6651                 }                                                             \
6652                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6653                     goto onError;                                             \
6654                 }                                                             \
6655             } while(0)
6656 
6657         /* Non-escape characters are interpreted as Unicode ordinals */
6658         if (c != '\\' || s >= end) {
6659             WRITE_CHAR(c);
6660             continue;
6661         }
6662 
6663         c = (unsigned char) *s++;
6664         if (c == 'u') {
6665             count = 4;
6666             message = "truncated \\uXXXX escape";
6667         }
6668         else if (c == 'U') {
6669             count = 8;
6670             message = "truncated \\UXXXXXXXX escape";
6671         }
6672         else {
6673             assert(writer.pos < writer.size);
6674             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6675             WRITE_CHAR(c);
6676             continue;
6677         }
6678         startinpos = s - starts - 2;
6679 
6680         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6681         for (ch = 0; count && s < end; ++s, --count) {
6682             c = (unsigned char)*s;
6683             ch <<= 4;
6684             if (c >= '0' && c <= '9') {
6685                 ch += c - '0';
6686             }
6687             else if (c >= 'a' && c <= 'f') {
6688                 ch += c - ('a' - 10);
6689             }
6690             else if (c >= 'A' && c <= 'F') {
6691                 ch += c - ('A' - 10);
6692             }
6693             else {
6694                 break;
6695             }
6696         }
6697         if (!count) {
6698             if (ch <= MAX_UNICODE) {
6699                 WRITE_CHAR(ch);
6700                 continue;
6701             }
6702             message = "\\Uxxxxxxxx out of range";
6703         }
6704 
6705         endinpos = s-starts;
6706         writer.min_length = end - s + writer.pos;
6707         if (unicode_decode_call_errorhandler_writer(
6708                 errors, &errorHandler,
6709                 "rawunicodeescape", message,
6710                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6711                 &writer)) {
6712             goto onError;
6713         }
6714         assert(end - s <= writer.size - writer.pos);
6715 
6716 #undef WRITE_CHAR
6717     }
6718     Py_XDECREF(errorHandler);
6719     Py_XDECREF(exc);
6720     return _PyUnicodeWriter_Finish(&writer);
6721 
6722   onError:
6723     _PyUnicodeWriter_Dealloc(&writer);
6724     Py_XDECREF(errorHandler);
6725     Py_XDECREF(exc);
6726     return NULL;
6727 
6728 }
6729 
6730 
6731 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6732 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6733 {
6734     PyObject *repr;
6735     char *p;
6736     Py_ssize_t expandsize, pos;
6737     int kind;
6738     const void *data;
6739     Py_ssize_t len;
6740 
6741     if (!PyUnicode_Check(unicode)) {
6742         PyErr_BadArgument();
6743         return NULL;
6744     }
6745     if (PyUnicode_READY(unicode) == -1) {
6746         return NULL;
6747     }
6748     kind = PyUnicode_KIND(unicode);
6749     data = PyUnicode_DATA(unicode);
6750     len = PyUnicode_GET_LENGTH(unicode);
6751     if (kind == PyUnicode_1BYTE_KIND) {
6752         return PyBytes_FromStringAndSize(data, len);
6753     }
6754 
6755     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6756        bytes, and 1 byte characters 4. */
6757     expandsize = kind * 2 + 2;
6758 
6759     if (len > PY_SSIZE_T_MAX / expandsize) {
6760         return PyErr_NoMemory();
6761     }
6762     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6763     if (repr == NULL) {
6764         return NULL;
6765     }
6766     if (len == 0) {
6767         return repr;
6768     }
6769 
6770     p = PyBytes_AS_STRING(repr);
6771     for (pos = 0; pos < len; pos++) {
6772         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6773 
6774         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6775         if (ch < 0x100) {
6776             *p++ = (char) ch;
6777         }
6778         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6779         else if (ch < 0x10000) {
6780             *p++ = '\\';
6781             *p++ = 'u';
6782             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6783             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6784             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6785             *p++ = Py_hexdigits[ch & 15];
6786         }
6787         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6788         else {
6789             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6790             *p++ = '\\';
6791             *p++ = 'U';
6792             *p++ = '0';
6793             *p++ = '0';
6794             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6795             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6796             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6797             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6798             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6799             *p++ = Py_hexdigits[ch & 15];
6800         }
6801     }
6802 
6803     assert(p > PyBytes_AS_STRING(repr));
6804     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6805         return NULL;
6806     }
6807     return repr;
6808 }
6809 
6810 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6811 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6812                                  Py_ssize_t size)
6813 {
6814     PyObject *result;
6815     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6816     if (tmp == NULL)
6817         return NULL;
6818     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6819     Py_DECREF(tmp);
6820     return result;
6821 }
6822 
6823 /* --- Latin-1 Codec ------------------------------------------------------ */
6824 
6825 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6826 PyUnicode_DecodeLatin1(const char *s,
6827                        Py_ssize_t size,
6828                        const char *errors)
6829 {
6830     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6831     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6832 }
6833 
6834 /* create or adjust a UnicodeEncodeError */
6835 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6836 make_encode_exception(PyObject **exceptionObject,
6837                       const char *encoding,
6838                       PyObject *unicode,
6839                       Py_ssize_t startpos, Py_ssize_t endpos,
6840                       const char *reason)
6841 {
6842     if (*exceptionObject == NULL) {
6843         *exceptionObject = PyObject_CallFunction(
6844             PyExc_UnicodeEncodeError, "sOnns",
6845             encoding, unicode, startpos, endpos, reason);
6846     }
6847     else {
6848         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6849             goto onError;
6850         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6851             goto onError;
6852         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6853             goto onError;
6854         return;
6855       onError:
6856         Py_CLEAR(*exceptionObject);
6857     }
6858 }
6859 
6860 /* raises a UnicodeEncodeError */
6861 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6862 raise_encode_exception(PyObject **exceptionObject,
6863                        const char *encoding,
6864                        PyObject *unicode,
6865                        Py_ssize_t startpos, Py_ssize_t endpos,
6866                        const char *reason)
6867 {
6868     make_encode_exception(exceptionObject,
6869                           encoding, unicode, startpos, endpos, reason);
6870     if (*exceptionObject != NULL)
6871         PyCodec_StrictErrors(*exceptionObject);
6872 }
6873 
6874 /* error handling callback helper:
6875    build arguments, call the callback and check the arguments,
6876    put the result into newpos and return the replacement string, which
6877    has to be freed by the caller */
6878 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6879 unicode_encode_call_errorhandler(const char *errors,
6880                                  PyObject **errorHandler,
6881                                  const char *encoding, const char *reason,
6882                                  PyObject *unicode, PyObject **exceptionObject,
6883                                  Py_ssize_t startpos, Py_ssize_t endpos,
6884                                  Py_ssize_t *newpos)
6885 {
6886     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6887     Py_ssize_t len;
6888     PyObject *restuple;
6889     PyObject *resunicode;
6890 
6891     if (*errorHandler == NULL) {
6892         *errorHandler = PyCodec_LookupError(errors);
6893         if (*errorHandler == NULL)
6894             return NULL;
6895     }
6896 
6897     if (PyUnicode_READY(unicode) == -1)
6898         return NULL;
6899     len = PyUnicode_GET_LENGTH(unicode);
6900 
6901     make_encode_exception(exceptionObject,
6902                           encoding, unicode, startpos, endpos, reason);
6903     if (*exceptionObject == NULL)
6904         return NULL;
6905 
6906     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6907     if (restuple == NULL)
6908         return NULL;
6909     if (!PyTuple_Check(restuple)) {
6910         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6911         Py_DECREF(restuple);
6912         return NULL;
6913     }
6914     if (!PyArg_ParseTuple(restuple, argparse,
6915                           &resunicode, newpos)) {
6916         Py_DECREF(restuple);
6917         return NULL;
6918     }
6919     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6920         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6921         Py_DECREF(restuple);
6922         return NULL;
6923     }
6924     if (*newpos<0)
6925         *newpos = len + *newpos;
6926     if (*newpos<0 || *newpos>len) {
6927         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6928         Py_DECREF(restuple);
6929         return NULL;
6930     }
6931     Py_INCREF(resunicode);
6932     Py_DECREF(restuple);
6933     return resunicode;
6934 }
6935 
6936 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6937 unicode_encode_ucs1(PyObject *unicode,
6938                     const char *errors,
6939                     const Py_UCS4 limit)
6940 {
6941     /* input state */
6942     Py_ssize_t pos=0, size;
6943     int kind;
6944     const void *data;
6945     /* pointer into the output */
6946     char *str;
6947     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6948     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6949     PyObject *error_handler_obj = NULL;
6950     PyObject *exc = NULL;
6951     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6952     PyObject *rep = NULL;
6953     /* output object */
6954     _PyBytesWriter writer;
6955 
6956     if (PyUnicode_READY(unicode) == -1)
6957         return NULL;
6958     size = PyUnicode_GET_LENGTH(unicode);
6959     kind = PyUnicode_KIND(unicode);
6960     data = PyUnicode_DATA(unicode);
6961     /* allocate enough for a simple encoding without
6962        replacements, if we need more, we'll resize */
6963     if (size == 0)
6964         return PyBytes_FromStringAndSize(NULL, 0);
6965 
6966     _PyBytesWriter_Init(&writer);
6967     str = _PyBytesWriter_Alloc(&writer, size);
6968     if (str == NULL)
6969         return NULL;
6970 
6971     while (pos < size) {
6972         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6973 
6974         /* can we encode this? */
6975         if (ch < limit) {
6976             /* no overflow check, because we know that the space is enough */
6977             *str++ = (char)ch;
6978             ++pos;
6979         }
6980         else {
6981             Py_ssize_t newpos, i;
6982             /* startpos for collecting unencodable chars */
6983             Py_ssize_t collstart = pos;
6984             Py_ssize_t collend = collstart + 1;
6985             /* find all unecodable characters */
6986 
6987             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6988                 ++collend;
6989 
6990             /* Only overallocate the buffer if it's not the last write */
6991             writer.overallocate = (collend < size);
6992 
6993             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6994             if (error_handler == _Py_ERROR_UNKNOWN)
6995                 error_handler = _Py_GetErrorHandler(errors);
6996 
6997             switch (error_handler) {
6998             case _Py_ERROR_STRICT:
6999                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7000                 goto onError;
7001 
7002             case _Py_ERROR_REPLACE:
7003                 memset(str, '?', collend - collstart);
7004                 str += (collend - collstart);
7005                 /* fall through */
7006             case _Py_ERROR_IGNORE:
7007                 pos = collend;
7008                 break;
7009 
7010             case _Py_ERROR_BACKSLASHREPLACE:
7011                 /* subtract preallocated bytes */
7012                 writer.min_size -= (collend - collstart);
7013                 str = backslashreplace(&writer, str,
7014                                        unicode, collstart, collend);
7015                 if (str == NULL)
7016                     goto onError;
7017                 pos = collend;
7018                 break;
7019 
7020             case _Py_ERROR_XMLCHARREFREPLACE:
7021                 /* subtract preallocated bytes */
7022                 writer.min_size -= (collend - collstart);
7023                 str = xmlcharrefreplace(&writer, str,
7024                                         unicode, collstart, collend);
7025                 if (str == NULL)
7026                     goto onError;
7027                 pos = collend;
7028                 break;
7029 
7030             case _Py_ERROR_SURROGATEESCAPE:
7031                 for (i = collstart; i < collend; ++i) {
7032                     ch = PyUnicode_READ(kind, data, i);
7033                     if (ch < 0xdc80 || 0xdcff < ch) {
7034                         /* Not a UTF-8b surrogate */
7035                         break;
7036                     }
7037                     *str++ = (char)(ch - 0xdc00);
7038                     ++pos;
7039                 }
7040                 if (i >= collend)
7041                     break;
7042                 collstart = pos;
7043                 assert(collstart != collend);
7044                 /* fall through */
7045 
7046             default:
7047                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7048                                                        encoding, reason, unicode, &exc,
7049                                                        collstart, collend, &newpos);
7050                 if (rep == NULL)
7051                     goto onError;
7052 
7053                 /* subtract preallocated bytes */
7054                 writer.min_size -= newpos - collstart;
7055 
7056                 if (PyBytes_Check(rep)) {
7057                     /* Directly copy bytes result to output. */
7058                     str = _PyBytesWriter_WriteBytes(&writer, str,
7059                                                     PyBytes_AS_STRING(rep),
7060                                                     PyBytes_GET_SIZE(rep));
7061                 }
7062                 else {
7063                     assert(PyUnicode_Check(rep));
7064 
7065                     if (PyUnicode_READY(rep) < 0)
7066                         goto onError;
7067 
7068                     if (limit == 256 ?
7069                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7070                         !PyUnicode_IS_ASCII(rep))
7071                     {
7072                         /* Not all characters are smaller than limit */
7073                         raise_encode_exception(&exc, encoding, unicode,
7074                                                collstart, collend, reason);
7075                         goto onError;
7076                     }
7077                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7078                     str = _PyBytesWriter_WriteBytes(&writer, str,
7079                                                     PyUnicode_DATA(rep),
7080                                                     PyUnicode_GET_LENGTH(rep));
7081                 }
7082                 if (str == NULL)
7083                     goto onError;
7084 
7085                 pos = newpos;
7086                 Py_CLEAR(rep);
7087             }
7088 
7089             /* If overallocation was disabled, ensure that it was the last
7090                write. Otherwise, we missed an optimization */
7091             assert(writer.overallocate || pos == size);
7092         }
7093     }
7094 
7095     Py_XDECREF(error_handler_obj);
7096     Py_XDECREF(exc);
7097     return _PyBytesWriter_Finish(&writer, str);
7098 
7099   onError:
7100     Py_XDECREF(rep);
7101     _PyBytesWriter_Dealloc(&writer);
7102     Py_XDECREF(error_handler_obj);
7103     Py_XDECREF(exc);
7104     return NULL;
7105 }
7106 
7107 /* Deprecated */
7108 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7109 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7110                        Py_ssize_t size,
7111                        const char *errors)
7112 {
7113     PyObject *result;
7114     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7115     if (unicode == NULL)
7116         return NULL;
7117     result = unicode_encode_ucs1(unicode, errors, 256);
7118     Py_DECREF(unicode);
7119     return result;
7120 }
7121 
7122 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7123 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7124 {
7125     if (!PyUnicode_Check(unicode)) {
7126         PyErr_BadArgument();
7127         return NULL;
7128     }
7129     if (PyUnicode_READY(unicode) == -1)
7130         return NULL;
7131     /* Fast path: if it is a one-byte string, construct
7132        bytes object directly. */
7133     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7134         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7135                                          PyUnicode_GET_LENGTH(unicode));
7136     /* Non-Latin-1 characters present. Defer to above function to
7137        raise the exception. */
7138     return unicode_encode_ucs1(unicode, errors, 256);
7139 }
7140 
7141 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7142 PyUnicode_AsLatin1String(PyObject *unicode)
7143 {
7144     return _PyUnicode_AsLatin1String(unicode, NULL);
7145 }
7146 
7147 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7148 
7149 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7150 PyUnicode_DecodeASCII(const char *s,
7151                       Py_ssize_t size,
7152                       const char *errors)
7153 {
7154     const char *starts = s;
7155     const char *e = s + size;
7156     PyObject *error_handler_obj = NULL;
7157     PyObject *exc = NULL;
7158     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7159 
7160     if (size == 0)
7161         _Py_RETURN_UNICODE_EMPTY();
7162 
7163     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7164     if (size == 1 && (unsigned char)s[0] < 128)
7165         return get_latin1_char((unsigned char)s[0]);
7166 
7167     // Shortcut for simple case
7168     PyObject *u = PyUnicode_New(size, 127);
7169     if (u == NULL) {
7170         return NULL;
7171     }
7172     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7173     if (outpos == size) {
7174         return u;
7175     }
7176 
7177     _PyUnicodeWriter writer;
7178     _PyUnicodeWriter_InitWithBuffer(&writer, u);
7179     writer.pos = outpos;
7180 
7181     s += outpos;
7182     int kind = writer.kind;
7183     void *data = writer.data;
7184     Py_ssize_t startinpos, endinpos;
7185 
7186     while (s < e) {
7187         unsigned char c = (unsigned char)*s;
7188         if (c < 128) {
7189             PyUnicode_WRITE(kind, data, writer.pos, c);
7190             writer.pos++;
7191             ++s;
7192             continue;
7193         }
7194 
7195         /* byte outsize range 0x00..0x7f: call the error handler */
7196 
7197         if (error_handler == _Py_ERROR_UNKNOWN)
7198             error_handler = _Py_GetErrorHandler(errors);
7199 
7200         switch (error_handler)
7201         {
7202         case _Py_ERROR_REPLACE:
7203         case _Py_ERROR_SURROGATEESCAPE:
7204             /* Fast-path: the error handler only writes one character,
7205                but we may switch to UCS2 at the first write */
7206             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7207                 goto onError;
7208             kind = writer.kind;
7209             data = writer.data;
7210 
7211             if (error_handler == _Py_ERROR_REPLACE)
7212                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7213             else
7214                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7215             writer.pos++;
7216             ++s;
7217             break;
7218 
7219         case _Py_ERROR_IGNORE:
7220             ++s;
7221             break;
7222 
7223         default:
7224             startinpos = s-starts;
7225             endinpos = startinpos + 1;
7226             if (unicode_decode_call_errorhandler_writer(
7227                     errors, &error_handler_obj,
7228                     "ascii", "ordinal not in range(128)",
7229                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7230                     &writer))
7231                 goto onError;
7232             kind = writer.kind;
7233             data = writer.data;
7234         }
7235     }
7236     Py_XDECREF(error_handler_obj);
7237     Py_XDECREF(exc);
7238     return _PyUnicodeWriter_Finish(&writer);
7239 
7240   onError:
7241     _PyUnicodeWriter_Dealloc(&writer);
7242     Py_XDECREF(error_handler_obj);
7243     Py_XDECREF(exc);
7244     return NULL;
7245 }
7246 
7247 /* Deprecated */
7248 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7249 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7250                       Py_ssize_t size,
7251                       const char *errors)
7252 {
7253     PyObject *result;
7254     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7255     if (unicode == NULL)
7256         return NULL;
7257     result = unicode_encode_ucs1(unicode, errors, 128);
7258     Py_DECREF(unicode);
7259     return result;
7260 }
7261 
7262 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7263 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7264 {
7265     if (!PyUnicode_Check(unicode)) {
7266         PyErr_BadArgument();
7267         return NULL;
7268     }
7269     if (PyUnicode_READY(unicode) == -1)
7270         return NULL;
7271     /* Fast path: if it is an ASCII-only string, construct bytes object
7272        directly. Else defer to above function to raise the exception. */
7273     if (PyUnicode_IS_ASCII(unicode))
7274         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7275                                          PyUnicode_GET_LENGTH(unicode));
7276     return unicode_encode_ucs1(unicode, errors, 128);
7277 }
7278 
7279 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7280 PyUnicode_AsASCIIString(PyObject *unicode)
7281 {
7282     return _PyUnicode_AsASCIIString(unicode, NULL);
7283 }
7284 
7285 #ifdef MS_WINDOWS
7286 
7287 /* --- MBCS codecs for Windows -------------------------------------------- */
7288 
7289 #if SIZEOF_INT < SIZEOF_SIZE_T
7290 #define NEED_RETRY
7291 #endif
7292 
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7297 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7298 
7299 #ifndef WC_ERR_INVALID_CHARS
7300 #  define WC_ERR_INVALID_CHARS 0x0080
7301 #endif
7302 
7303 static const char*
code_page_name(UINT code_page,PyObject ** obj)7304 code_page_name(UINT code_page, PyObject **obj)
7305 {
7306     *obj = NULL;
7307     if (code_page == CP_ACP)
7308         return "mbcs";
7309     if (code_page == CP_UTF7)
7310         return "CP_UTF7";
7311     if (code_page == CP_UTF8)
7312         return "CP_UTF8";
7313 
7314     *obj = PyBytes_FromFormat("cp%u", code_page);
7315     if (*obj == NULL)
7316         return NULL;
7317     return PyBytes_AS_STRING(*obj);
7318 }
7319 
7320 static DWORD
decode_code_page_flags(UINT code_page)7321 decode_code_page_flags(UINT code_page)
7322 {
7323     if (code_page == CP_UTF7) {
7324         /* The CP_UTF7 decoder only supports flags=0 */
7325         return 0;
7326     }
7327     else
7328         return MB_ERR_INVALID_CHARS;
7329 }
7330 
7331 /*
7332  * Decode a byte string from a Windows code page into unicode object in strict
7333  * mode.
7334  *
7335  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7336  * OSError and returns -1 on other error.
7337  */
7338 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7339 decode_code_page_strict(UINT code_page,
7340                         wchar_t **buf,
7341                         Py_ssize_t *bufsize,
7342                         const char *in,
7343                         int insize)
7344 {
7345     DWORD flags = MB_ERR_INVALID_CHARS;
7346     wchar_t *out;
7347     DWORD outsize;
7348 
7349     /* First get the size of the result */
7350     assert(insize > 0);
7351     while ((outsize = MultiByteToWideChar(code_page, flags,
7352                                           in, insize, NULL, 0)) <= 0)
7353     {
7354         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7355             goto error;
7356         }
7357         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7358         flags = 0;
7359     }
7360 
7361     /* Extend a wchar_t* buffer */
7362     Py_ssize_t n = *bufsize;   /* Get the current length */
7363     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7364         return -1;
7365     }
7366     out = *buf + n;
7367 
7368     /* Do the conversion */
7369     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7370     if (outsize <= 0)
7371         goto error;
7372     return insize;
7373 
7374 error:
7375     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7376         return -2;
7377     PyErr_SetFromWindowsErr(0);
7378     return -1;
7379 }
7380 
/*
 * Decode a byte string from a code page into a wchar_t buffer, using an
 * error handler.
 *
 * The 'size' bytes at 'in' are decoded one character at a time and the
 * result is appended to *buf starting at index *bufsize; on success
 * *bufsize is updated to the new length.  'errors' names the error handler
 * (NULL is treated like "strict").  When 'final' is false, an undecodable
 * sequence that reaches the end of the input is left unconsumed instead of
 * being reported.
 *
 * Returns the number of bytes consumed on success, or raises an OSError,
 * MemoryError or UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* Codec name for error messages; encoding_obj (if any) owns the
       string and is released at the 'error' label. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer: reserve room for the worst case of
       Py_ARRAY_LENGTH(buffer) wide characters per input byte, after
       checking that the new length cannot overflow Py_ssize_t. */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try with 1 input byte, then keep adding one
           byte until the conversion succeeds or the limits below are hit. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* 'continue' re-evaluates the loop condition with the same
                   insize, i.e. the same bytes are retried with flags=0. */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                /* Any other failure is reported as an OSError. */
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            /* In non-final mode the remaining bytes are left unconsumed
               and only the bytes decoded so far are counted. */
            if (in + insize >= endin && !final)
                break;

            /* Report a one-byte error at the current position; the handler
               call receives pointers to 'in', 'buf', 'bufsize' and
               'outpos', so 'out' is recomputed from them afterwards. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* Success: consume the bytes and append the decoded wide
               character(s) to the output. */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    /* Only the recorded length is updated here; no reallocation is done. */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared exit path: reached on success as well as on failure ('ret'
       is still -1 in the latter case). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7510 
/* Decode 'size' bytes at 's' from the Windows code page 'code_page' and
   return the result built with PyUnicode_FromWideChar().

   'errors' names the error handler passed on to decode_code_page_errors().
   If 'consumed' is not NULL, decoding is treated as non-final and
   *consumed receives the number of input bytes that were decoded.

   Raises ValueError for a negative code page and reports a bad internal
   call for a negative size.  Returns NULL with an exception set on error. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    /* Wide-character output buffer shared by all chunks; it is extended by
       the decode helpers and freed before returning. */
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
        /* When NEED_RETRY is defined, input larger than
           DECODING_CHUNK_SIZE is processed in several non-final chunks;
           note that the 'else' below pairs with the braces that follow
           the #endif. */
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk: it is final only when the caller did
               not ask for the consumed count. */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        if (chunk_size == 0 && done) {
            /* Nothing left to decode: finish with what earlier chunks
               produced, or return the empty string if there is nothing. */
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        /* Try the strict decoder first; -2 means a decode error, in which
           case the chunk is decoded again with the error handler. */
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        /* A non-final iteration must make progress. */
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        /* Advance past the bytes that were decoded. */
        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7578 
7579 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7580 PyUnicode_DecodeCodePageStateful(int code_page,
7581                                  const char *s,
7582                                  Py_ssize_t size,
7583                                  const char *errors,
7584                                  Py_ssize_t *consumed)
7585 {
7586     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7587 }
7588 
7589 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7590 PyUnicode_DecodeMBCSStateful(const char *s,
7591                              Py_ssize_t size,
7592                              const char *errors,
7593                              Py_ssize_t *consumed)
7594 {
7595     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7596 }
7597 
7598 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7599 PyUnicode_DecodeMBCS(const char *s,
7600                      Py_ssize_t size,
7601                      const char *errors)
7602 {
7603     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7604 }
7605 
7606 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7607 encode_code_page_flags(UINT code_page, const char *errors)
7608 {
7609     if (code_page == CP_UTF8) {
7610         return WC_ERR_INVALID_CHARS;
7611     }
7612     else if (code_page == CP_UTF7) {
7613         /* CP_UTF7 only supports flags=0 */
7614         return 0;
7615     }
7616     else {
7617         if (errors != NULL && strcmp(errors, "replace") == 0)
7618             return 0;
7619         else
7620             return WC_NO_BEST_FIT_CHARS;
7621     }
7622 }
7623 
7624 /*
7625  * Encode a Unicode string to a Windows code page into a byte string in strict
7626  * mode.
7627  *
7628  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7629  * an OSError and returns -1 on other error.
7630  */
7631 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7632 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7633                         PyObject *unicode, Py_ssize_t offset, int len,
7634                         const char* errors)
7635 {
7636     BOOL usedDefaultChar = FALSE;
7637     BOOL *pusedDefaultChar = &usedDefaultChar;
7638     int outsize;
7639     wchar_t *p;
7640     Py_ssize_t size;
7641     const DWORD flags = encode_code_page_flags(code_page, NULL);
7642     char *out;
7643     /* Create a substring so that we can get the UTF-16 representation
7644        of just the slice under consideration. */
7645     PyObject *substring;
7646 
7647     assert(len > 0);
7648 
7649     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7650         pusedDefaultChar = &usedDefaultChar;
7651     else
7652         pusedDefaultChar = NULL;
7653 
7654     substring = PyUnicode_Substring(unicode, offset, offset+len);
7655     if (substring == NULL)
7656         return -1;
7657     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7658     if (p == NULL) {
7659         Py_DECREF(substring);
7660         return -1;
7661     }
7662     assert(size <= INT_MAX);
7663 
7664     /* First get the size of the result */
7665     outsize = WideCharToMultiByte(code_page, flags,
7666                                   p, (int)size,
7667                                   NULL, 0,
7668                                   NULL, pusedDefaultChar);
7669     if (outsize <= 0)
7670         goto error;
7671     /* If we used a default char, then we failed! */
7672     if (pusedDefaultChar && *pusedDefaultChar) {
7673         Py_DECREF(substring);
7674         return -2;
7675     }
7676 
7677     if (*outbytes == NULL) {
7678         /* Create string object */
7679         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7680         if (*outbytes == NULL) {
7681             Py_DECREF(substring);
7682             return -1;
7683         }
7684         out = PyBytes_AS_STRING(*outbytes);
7685     }
7686     else {
7687         /* Extend string object */
7688         const Py_ssize_t n = PyBytes_Size(*outbytes);
7689         if (outsize > PY_SSIZE_T_MAX - n) {
7690             PyErr_NoMemory();
7691             Py_DECREF(substring);
7692             return -1;
7693         }
7694         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7695             Py_DECREF(substring);
7696             return -1;
7697         }
7698         out = PyBytes_AS_STRING(*outbytes) + n;
7699     }
7700 
7701     /* Do the conversion */
7702     outsize = WideCharToMultiByte(code_page, flags,
7703                                   p, (int)size,
7704                                   out, outsize,
7705                                   NULL, pusedDefaultChar);
7706     Py_CLEAR(substring);
7707     if (outsize <= 0)
7708         goto error;
7709     if (pusedDefaultChar && *pusedDefaultChar)
7710         return -2;
7711     return 0;
7712 
7713 error:
7714     Py_XDECREF(substring);
7715     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7716         return -2;
7717     PyErr_SetFromWindowsErr(0);
7718     return -1;
7719 }
7720 
7721 /*
7722  * Encode a Unicode string to a Windows code page into a byte string using an
7723  * error handler.
7724  *
7725  * Returns consumed characters if succeed, or raise an OSError and returns
7726  * -1 on other error.
7727  */
7728 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7729 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7730                         PyObject *unicode, Py_ssize_t unicode_offset,
7731                         Py_ssize_t insize, const char* errors)
7732 {
7733     const DWORD flags = encode_code_page_flags(code_page, errors);
7734     Py_ssize_t pos = unicode_offset;
7735     Py_ssize_t endin = unicode_offset + insize;
7736     /* Ideally, we should get reason from FormatMessage. This is the Windows
7737        2000 English version of the message. */
7738     const char *reason = "invalid character";
7739     /* 4=maximum length of a UTF-8 sequence */
7740     char buffer[4];
7741     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7742     Py_ssize_t outsize;
7743     char *out;
7744     PyObject *errorHandler = NULL;
7745     PyObject *exc = NULL;
7746     PyObject *encoding_obj = NULL;
7747     const char *encoding;
7748     Py_ssize_t newpos, newoutsize;
7749     PyObject *rep;
7750     int ret = -1;
7751 
7752     assert(insize > 0);
7753 
7754     encoding = code_page_name(code_page, &encoding_obj);
7755     if (encoding == NULL)
7756         return -1;
7757 
7758     if (errors == NULL || strcmp(errors, "strict") == 0) {
7759         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7760            then we raise a UnicodeEncodeError. */
7761         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7762         if (exc != NULL) {
7763             PyCodec_StrictErrors(exc);
7764             Py_DECREF(exc);
7765         }
7766         Py_XDECREF(encoding_obj);
7767         return -1;
7768     }
7769 
7770     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7771         pusedDefaultChar = &usedDefaultChar;
7772     else
7773         pusedDefaultChar = NULL;
7774 
7775     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7776         PyErr_NoMemory();
7777         goto error;
7778     }
7779     outsize = insize * Py_ARRAY_LENGTH(buffer);
7780 
7781     if (*outbytes == NULL) {
7782         /* Create string object */
7783         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7784         if (*outbytes == NULL)
7785             goto error;
7786         out = PyBytes_AS_STRING(*outbytes);
7787     }
7788     else {
7789         /* Extend string object */
7790         Py_ssize_t n = PyBytes_Size(*outbytes);
7791         if (n > PY_SSIZE_T_MAX - outsize) {
7792             PyErr_NoMemory();
7793             goto error;
7794         }
7795         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7796             goto error;
7797         out = PyBytes_AS_STRING(*outbytes) + n;
7798     }
7799 
7800     /* Encode the string character per character */
7801     while (pos < endin)
7802     {
7803         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7804         wchar_t chars[2];
7805         int charsize;
7806         if (ch < 0x10000) {
7807             chars[0] = (wchar_t)ch;
7808             charsize = 1;
7809         }
7810         else {
7811             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7812             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7813             charsize = 2;
7814         }
7815 
7816         outsize = WideCharToMultiByte(code_page, flags,
7817                                       chars, charsize,
7818                                       buffer, Py_ARRAY_LENGTH(buffer),
7819                                       NULL, pusedDefaultChar);
7820         if (outsize > 0) {
7821             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7822             {
7823                 pos++;
7824                 memcpy(out, buffer, outsize);
7825                 out += outsize;
7826                 continue;
7827             }
7828         }
7829         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7830             PyErr_SetFromWindowsErr(0);
7831             goto error;
7832         }
7833 
7834         rep = unicode_encode_call_errorhandler(
7835                   errors, &errorHandler, encoding, reason,
7836                   unicode, &exc,
7837                   pos, pos + 1, &newpos);
7838         if (rep == NULL)
7839             goto error;
7840         pos = newpos;
7841 
7842         if (PyBytes_Check(rep)) {
7843             outsize = PyBytes_GET_SIZE(rep);
7844             if (outsize != 1) {
7845                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7846                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7847                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7848                     Py_DECREF(rep);
7849                     goto error;
7850                 }
7851                 out = PyBytes_AS_STRING(*outbytes) + offset;
7852             }
7853             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7854             out += outsize;
7855         }
7856         else {
7857             Py_ssize_t i;
7858             enum PyUnicode_Kind kind;
7859             const void *data;
7860 
7861             if (PyUnicode_READY(rep) == -1) {
7862                 Py_DECREF(rep);
7863                 goto error;
7864             }
7865 
7866             outsize = PyUnicode_GET_LENGTH(rep);
7867             if (outsize != 1) {
7868                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7869                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7870                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7871                     Py_DECREF(rep);
7872                     goto error;
7873                 }
7874                 out = PyBytes_AS_STRING(*outbytes) + offset;
7875             }
7876             kind = PyUnicode_KIND(rep);
7877             data = PyUnicode_DATA(rep);
7878             for (i=0; i < outsize; i++) {
7879                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7880                 if (ch > 127) {
7881                     raise_encode_exception(&exc,
7882                         encoding, unicode,
7883                         pos, pos + 1,
7884                         "unable to encode error handler result to ASCII");
7885                     Py_DECREF(rep);
7886                     goto error;
7887                 }
7888                 *out = (unsigned char)ch;
7889                 out++;
7890             }
7891         }
7892         Py_DECREF(rep);
7893     }
7894     /* write a NUL byte */
7895     *out = 0;
7896     outsize = out - PyBytes_AS_STRING(*outbytes);
7897     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7898     if (_PyBytes_Resize(outbytes, outsize) < 0)
7899         goto error;
7900     ret = 0;
7901 
7902 error:
7903     Py_XDECREF(encoding_obj);
7904     Py_XDECREF(errorHandler);
7905     Py_XDECREF(exc);
7906     return ret;
7907 }
7908 
7909 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7910 encode_code_page(int code_page,
7911                  PyObject *unicode,
7912                  const char *errors)
7913 {
7914     Py_ssize_t len;
7915     PyObject *outbytes = NULL;
7916     Py_ssize_t offset;
7917     int chunk_len, ret, done;
7918 
7919     if (!PyUnicode_Check(unicode)) {
7920         PyErr_BadArgument();
7921         return NULL;
7922     }
7923 
7924     if (PyUnicode_READY(unicode) == -1)
7925         return NULL;
7926     len = PyUnicode_GET_LENGTH(unicode);
7927 
7928     if (code_page < 0) {
7929         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7930         return NULL;
7931     }
7932 
7933     if (len == 0)
7934         return PyBytes_FromStringAndSize(NULL, 0);
7935 
7936     offset = 0;
7937     do
7938     {
7939 #ifdef NEED_RETRY
7940         if (len > DECODING_CHUNK_SIZE) {
7941             chunk_len = DECODING_CHUNK_SIZE;
7942             done = 0;
7943         }
7944         else
7945 #endif
7946         {
7947             chunk_len = (int)len;
7948             done = 1;
7949         }
7950 
7951         ret = encode_code_page_strict(code_page, &outbytes,
7952                                       unicode, offset, chunk_len,
7953                                       errors);
7954         if (ret == -2)
7955             ret = encode_code_page_errors(code_page, &outbytes,
7956                                           unicode, offset,
7957                                           chunk_len, errors);
7958         if (ret < 0) {
7959             Py_XDECREF(outbytes);
7960             return NULL;
7961         }
7962 
7963         offset += chunk_len;
7964         len -= chunk_len;
7965     } while (!done);
7966 
7967     return outbytes;
7968 }
7969 
7970 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7971 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7972                      Py_ssize_t size,
7973                      const char *errors)
7974 {
7975     PyObject *unicode, *res;
7976     unicode = PyUnicode_FromWideChar(p, size);
7977     if (unicode == NULL)
7978         return NULL;
7979     res = encode_code_page(CP_ACP, unicode, errors);
7980     Py_DECREF(unicode);
7981     return res;
7982 }
7983 
7984 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7985 PyUnicode_EncodeCodePage(int code_page,
7986                          PyObject *unicode,
7987                          const char *errors)
7988 {
7989     return encode_code_page(code_page, unicode, errors);
7990 }
7991 
7992 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7993 PyUnicode_AsMBCSString(PyObject *unicode)
7994 {
7995     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7996 }
7997 
7998 #undef NEED_RETRY
7999 
8000 #endif /* MS_WINDOWS */
8001 
8002 /* --- Character Mapping Codec -------------------------------------------- */
8003 
8004 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8005 charmap_decode_string(const char *s,
8006                       Py_ssize_t size,
8007                       PyObject *mapping,
8008                       const char *errors,
8009                       _PyUnicodeWriter *writer)
8010 {
8011     const char *starts = s;
8012     const char *e;
8013     Py_ssize_t startinpos, endinpos;
8014     PyObject *errorHandler = NULL, *exc = NULL;
8015     Py_ssize_t maplen;
8016     enum PyUnicode_Kind mapkind;
8017     const void *mapdata;
8018     Py_UCS4 x;
8019     unsigned char ch;
8020 
8021     if (PyUnicode_READY(mapping) == -1)
8022         return -1;
8023 
8024     maplen = PyUnicode_GET_LENGTH(mapping);
8025     mapdata = PyUnicode_DATA(mapping);
8026     mapkind = PyUnicode_KIND(mapping);
8027 
8028     e = s + size;
8029 
8030     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8031         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8032          * is disabled in encoding aliases, latin1 is preferred because
8033          * its implementation is faster. */
8034         const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8035         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8036         Py_UCS4 maxchar = writer->maxchar;
8037 
8038         assert (writer->kind == PyUnicode_1BYTE_KIND);
8039         while (s < e) {
8040             ch = *s;
8041             x = mapdata_ucs1[ch];
8042             if (x > maxchar) {
8043                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8044                     goto onError;
8045                 maxchar = writer->maxchar;
8046                 outdata = (Py_UCS1 *)writer->data;
8047             }
8048             outdata[writer->pos] = x;
8049             writer->pos++;
8050             ++s;
8051         }
8052         return 0;
8053     }
8054 
8055     while (s < e) {
8056         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8057             enum PyUnicode_Kind outkind = writer->kind;
8058             const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8059             if (outkind == PyUnicode_1BYTE_KIND) {
8060                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8061                 Py_UCS4 maxchar = writer->maxchar;
8062                 while (s < e) {
8063                     ch = *s;
8064                     x = mapdata_ucs2[ch];
8065                     if (x > maxchar)
8066                         goto Error;
8067                     outdata[writer->pos] = x;
8068                     writer->pos++;
8069                     ++s;
8070                 }
8071                 break;
8072             }
8073             else if (outkind == PyUnicode_2BYTE_KIND) {
8074                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8075                 while (s < e) {
8076                     ch = *s;
8077                     x = mapdata_ucs2[ch];
8078                     if (x == 0xFFFE)
8079                         goto Error;
8080                     outdata[writer->pos] = x;
8081                     writer->pos++;
8082                     ++s;
8083                 }
8084                 break;
8085             }
8086         }
8087         ch = *s;
8088 
8089         if (ch < maplen)
8090             x = PyUnicode_READ(mapkind, mapdata, ch);
8091         else
8092             x = 0xfffe; /* invalid value */
8093 Error:
8094         if (x == 0xfffe)
8095         {
8096             /* undefined mapping */
8097             startinpos = s-starts;
8098             endinpos = startinpos+1;
8099             if (unicode_decode_call_errorhandler_writer(
8100                     errors, &errorHandler,
8101                     "charmap", "character maps to <undefined>",
8102                     &starts, &e, &startinpos, &endinpos, &exc, &s,
8103                     writer)) {
8104                 goto onError;
8105             }
8106             continue;
8107         }
8108 
8109         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8110             goto onError;
8111         ++s;
8112     }
8113     Py_XDECREF(errorHandler);
8114     Py_XDECREF(exc);
8115     return 0;
8116 
8117 onError:
8118     Py_XDECREF(errorHandler);
8119     Py_XDECREF(exc);
8120     return -1;
8121 }
8122 
8123 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8124 charmap_decode_mapping(const char *s,
8125                        Py_ssize_t size,
8126                        PyObject *mapping,
8127                        const char *errors,
8128                        _PyUnicodeWriter *writer)
8129 {
8130     const char *starts = s;
8131     const char *e;
8132     Py_ssize_t startinpos, endinpos;
8133     PyObject *errorHandler = NULL, *exc = NULL;
8134     unsigned char ch;
8135     PyObject *key, *item = NULL;
8136 
8137     e = s + size;
8138 
8139     while (s < e) {
8140         ch = *s;
8141 
8142         /* Get mapping (char ordinal -> integer, Unicode char or None) */
8143         key = PyLong_FromLong((long)ch);
8144         if (key == NULL)
8145             goto onError;
8146 
8147         item = PyObject_GetItem(mapping, key);
8148         Py_DECREF(key);
8149         if (item == NULL) {
8150             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8151                 /* No mapping found means: mapping is undefined. */
8152                 PyErr_Clear();
8153                 goto Undefined;
8154             } else
8155                 goto onError;
8156         }
8157 
8158         /* Apply mapping */
8159         if (item == Py_None)
8160             goto Undefined;
8161         if (PyLong_Check(item)) {
8162             long value = PyLong_AS_LONG(item);
8163             if (value == 0xFFFE)
8164                 goto Undefined;
8165             if (value < 0 || value > MAX_UNICODE) {
8166                 PyErr_Format(PyExc_TypeError,
8167                              "character mapping must be in range(0x%x)",
8168                              (unsigned long)MAX_UNICODE + 1);
8169                 goto onError;
8170             }
8171 
8172             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8173                 goto onError;
8174         }
8175         else if (PyUnicode_Check(item)) {
8176             if (PyUnicode_READY(item) == -1)
8177                 goto onError;
8178             if (PyUnicode_GET_LENGTH(item) == 1) {
8179                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8180                 if (value == 0xFFFE)
8181                     goto Undefined;
8182                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8183                     goto onError;
8184             }
8185             else {
8186                 writer->overallocate = 1;
8187                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8188                     goto onError;
8189             }
8190         }
8191         else {
8192             /* wrong return value */
8193             PyErr_SetString(PyExc_TypeError,
8194                             "character mapping must return integer, None or str");
8195             goto onError;
8196         }
8197         Py_CLEAR(item);
8198         ++s;
8199         continue;
8200 
8201 Undefined:
8202         /* undefined mapping */
8203         Py_CLEAR(item);
8204         startinpos = s-starts;
8205         endinpos = startinpos+1;
8206         if (unicode_decode_call_errorhandler_writer(
8207                 errors, &errorHandler,
8208                 "charmap", "character maps to <undefined>",
8209                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8210                 writer)) {
8211             goto onError;
8212         }
8213     }
8214     Py_XDECREF(errorHandler);
8215     Py_XDECREF(exc);
8216     return 0;
8217 
8218 onError:
8219     Py_XDECREF(item);
8220     Py_XDECREF(errorHandler);
8221     Py_XDECREF(exc);
8222     return -1;
8223 }
8224 
8225 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8226 PyUnicode_DecodeCharmap(const char *s,
8227                         Py_ssize_t size,
8228                         PyObject *mapping,
8229                         const char *errors)
8230 {
8231     _PyUnicodeWriter writer;
8232 
8233     /* Default to Latin-1 */
8234     if (mapping == NULL)
8235         return PyUnicode_DecodeLatin1(s, size, errors);
8236 
8237     if (size == 0)
8238         _Py_RETURN_UNICODE_EMPTY();
8239     _PyUnicodeWriter_Init(&writer);
8240     writer.min_length = size;
8241     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8242         goto onError;
8243 
8244     if (PyUnicode_CheckExact(mapping)) {
8245         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8246             goto onError;
8247     }
8248     else {
8249         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8250             goto onError;
8251     }
8252     return _PyUnicodeWriter_Finish(&writer);
8253 
8254   onError:
8255     _PyUnicodeWriter_Dealloc(&writer);
8256     return NULL;
8257 }
8258 
8259 /* Charmap encoding: the lookup table */
8260 
8261 struct encoding_map {
8262     PyObject_HEAD
8263     unsigned char level1[32];
8264     int count2, count3;
8265     unsigned char level23[1];
8266 };
8267 
8268 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8269 encoding_map_size(PyObject *obj, PyObject* args)
8270 {
8271     struct encoding_map *map = (struct encoding_map*)obj;
8272     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8273                            128*map->count3);
8274 }
8275 
8276 static PyMethodDef encoding_map_methods[] = {
8277     {"size", encoding_map_size, METH_NOARGS,
8278      PyDoc_STR("Return the size (in bytes) of this object") },
8279     { 0 }
8280 };
8281 
8282 static PyTypeObject EncodingMapType = {
8283     PyVarObject_HEAD_INIT(NULL, 0)
8284     "EncodingMap",          /*tp_name*/
8285     sizeof(struct encoding_map),   /*tp_basicsize*/
8286     0,                      /*tp_itemsize*/
8287     /* methods */
8288     0,                      /*tp_dealloc*/
8289     0,                      /*tp_vectorcall_offset*/
8290     0,                      /*tp_getattr*/
8291     0,                      /*tp_setattr*/
8292     0,                      /*tp_as_async*/
8293     0,                      /*tp_repr*/
8294     0,                      /*tp_as_number*/
8295     0,                      /*tp_as_sequence*/
8296     0,                      /*tp_as_mapping*/
8297     0,                      /*tp_hash*/
8298     0,                      /*tp_call*/
8299     0,                      /*tp_str*/
8300     0,                      /*tp_getattro*/
8301     0,                      /*tp_setattro*/
8302     0,                      /*tp_as_buffer*/
8303     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8304     0,                      /*tp_doc*/
8305     0,                      /*tp_traverse*/
8306     0,                      /*tp_clear*/
8307     0,                      /*tp_richcompare*/
8308     0,                      /*tp_weaklistoffset*/
8309     0,                      /*tp_iter*/
8310     0,                      /*tp_iternext*/
8311     encoding_map_methods,   /*tp_methods*/
8312     0,                      /*tp_members*/
8313     0,                      /*tp_getset*/
8314     0,                      /*tp_base*/
8315     0,                      /*tp_dict*/
8316     0,                      /*tp_descr_get*/
8317     0,                      /*tp_descr_set*/
8318     0,                      /*tp_dictoffset*/
8319     0,                      /*tp_init*/
8320     0,                      /*tp_alloc*/
8321     0,                      /*tp_new*/
8322     0,                      /*tp_free*/
8323     0,                      /*tp_is_gc*/
8324 };
8325 
8326 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8327 PyUnicode_BuildEncodingMap(PyObject* string)
8328 {
8329     PyObject *result;
8330     struct encoding_map *mresult;
8331     int i;
8332     int need_dict = 0;
8333     unsigned char level1[32];
8334     unsigned char level2[512];
8335     unsigned char *mlevel1, *mlevel2, *mlevel3;
8336     int count2 = 0, count3 = 0;
8337     int kind;
8338     const void *data;
8339     Py_ssize_t length;
8340     Py_UCS4 ch;
8341 
8342     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8343         PyErr_BadArgument();
8344         return NULL;
8345     }
8346     kind = PyUnicode_KIND(string);
8347     data = PyUnicode_DATA(string);
8348     length = PyUnicode_GET_LENGTH(string);
8349     length = Py_MIN(length, 256);
8350     memset(level1, 0xFF, sizeof level1);
8351     memset(level2, 0xFF, sizeof level2);
8352 
8353     /* If there isn't a one-to-one mapping of NULL to \0,
8354        or if there are non-BMP characters, we need to use
8355        a mapping dictionary. */
8356     if (PyUnicode_READ(kind, data, 0) != 0)
8357         need_dict = 1;
8358     for (i = 1; i < length; i++) {
8359         int l1, l2;
8360         ch = PyUnicode_READ(kind, data, i);
8361         if (ch == 0 || ch > 0xFFFF) {
8362             need_dict = 1;
8363             break;
8364         }
8365         if (ch == 0xFFFE)
8366             /* unmapped character */
8367             continue;
8368         l1 = ch >> 11;
8369         l2 = ch >> 7;
8370         if (level1[l1] == 0xFF)
8371             level1[l1] = count2++;
8372         if (level2[l2] == 0xFF)
8373             level2[l2] = count3++;
8374     }
8375 
8376     if (count2 >= 0xFF || count3 >= 0xFF)
8377         need_dict = 1;
8378 
8379     if (need_dict) {
8380         PyObject *result = PyDict_New();
8381         PyObject *key, *value;
8382         if (!result)
8383             return NULL;
8384         for (i = 0; i < length; i++) {
8385             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8386             value = PyLong_FromLong(i);
8387             if (!key || !value)
8388                 goto failed1;
8389             if (PyDict_SetItem(result, key, value) == -1)
8390                 goto failed1;
8391             Py_DECREF(key);
8392             Py_DECREF(value);
8393         }
8394         return result;
8395       failed1:
8396         Py_XDECREF(key);
8397         Py_XDECREF(value);
8398         Py_DECREF(result);
8399         return NULL;
8400     }
8401 
8402     /* Create a three-level trie */
8403     result = PyObject_MALLOC(sizeof(struct encoding_map) +
8404                              16*count2 + 128*count3 - 1);
8405     if (!result)
8406         return PyErr_NoMemory();
8407     PyObject_Init(result, &EncodingMapType);
8408     mresult = (struct encoding_map*)result;
8409     mresult->count2 = count2;
8410     mresult->count3 = count3;
8411     mlevel1 = mresult->level1;
8412     mlevel2 = mresult->level23;
8413     mlevel3 = mresult->level23 + 16*count2;
8414     memcpy(mlevel1, level1, 32);
8415     memset(mlevel2, 0xFF, 16*count2);
8416     memset(mlevel3, 0, 128*count3);
8417     count3 = 0;
8418     for (i = 1; i < length; i++) {
8419         int o1, o2, o3, i2, i3;
8420         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8421         if (ch == 0xFFFE)
8422             /* unmapped character */
8423             continue;
8424         o1 = ch>>11;
8425         o2 = (ch>>7) & 0xF;
8426         i2 = 16*mlevel1[o1] + o2;
8427         if (mlevel2[i2] == 0xFF)
8428             mlevel2[i2] = count3++;
8429         o3 = ch & 0x7F;
8430         i3 = 128*mlevel2[i2] + o3;
8431         mlevel3[i3] = i;
8432     }
8433     return result;
8434 }
8435 
8436 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8437 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8438 {
8439     struct encoding_map *map = (struct encoding_map*)mapping;
8440     int l1 = c>>11;
8441     int l2 = (c>>7) & 0xF;
8442     int l3 = c & 0x7F;
8443     int i;
8444 
8445     if (c > 0xFFFF)
8446         return -1;
8447     if (c == 0)
8448         return 0;
8449     /* level 1*/
8450     i = map->level1[l1];
8451     if (i == 0xFF) {
8452         return -1;
8453     }
8454     /* level 2*/
8455     i = map->level23[16*i+l2];
8456     if (i == 0xFF) {
8457         return -1;
8458     }
8459     /* level 3 */
8460     i = map->level23[16*map->count2 + 128*i + l3];
8461     if (i == 0) {
8462         return -1;
8463     }
8464     return i;
8465 }
8466 
8467 /* Lookup the character ch in the mapping. If the character
8468    can't be found, Py_None is returned (or NULL, if another
8469    error occurred). */
8470 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8471 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8472 {
8473     PyObject *w = PyLong_FromLong((long)c);
8474     PyObject *x;
8475 
8476     if (w == NULL)
8477         return NULL;
8478     x = PyObject_GetItem(mapping, w);
8479     Py_DECREF(w);
8480     if (x == NULL) {
8481         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8482             /* No mapping found means: mapping is undefined. */
8483             PyErr_Clear();
8484             Py_RETURN_NONE;
8485         } else
8486             return NULL;
8487     }
8488     else if (x == Py_None)
8489         return x;
8490     else if (PyLong_Check(x)) {
8491         long value = PyLong_AS_LONG(x);
8492         if (value < 0 || value > 255) {
8493             PyErr_SetString(PyExc_TypeError,
8494                             "character mapping must be in range(256)");
8495             Py_DECREF(x);
8496             return NULL;
8497         }
8498         return x;
8499     }
8500     else if (PyBytes_Check(x))
8501         return x;
8502     else {
8503         /* wrong return value */
8504         PyErr_Format(PyExc_TypeError,
8505                      "character mapping must return integer, bytes or None, not %.400s",
8506                      Py_TYPE(x)->tp_name);
8507         Py_DECREF(x);
8508         return NULL;
8509     }
8510 }
8511 
8512 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8513 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8514 {
8515     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8516     /* exponentially overallocate to minimize reallocations */
8517     if (requiredsize < 2*outsize)
8518         requiredsize = 2*outsize;
8519     if (_PyBytes_Resize(outobj, requiredsize))
8520         return -1;
8521     return 0;
8522 }
8523 
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
8527 /* lookup the character, put the result in the output string and adjust
8528    various state variables. Resize the output bytes object if not enough
8529    space is available. Return a new reference to the object that
8530    was put in the output buffer, or Py_None, if the mapping was undefined
8531    (in which case no character was written) or NULL, if a
8532    reallocation error occurred. The caller must decref the result */
8533 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8534 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8535                      PyObject **outobj, Py_ssize_t *outpos)
8536 {
8537     PyObject *rep;
8538     char *outstart;
8539     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8540 
8541     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8542         int res = encoding_map_lookup(c, mapping);
8543         Py_ssize_t requiredsize = *outpos+1;
8544         if (res == -1)
8545             return enc_FAILED;
8546         if (outsize<requiredsize)
8547             if (charmapencode_resize(outobj, outpos, requiredsize))
8548                 return enc_EXCEPTION;
8549         outstart = PyBytes_AS_STRING(*outobj);
8550         outstart[(*outpos)++] = (char)res;
8551         return enc_SUCCESS;
8552     }
8553 
8554     rep = charmapencode_lookup(c, mapping);
8555     if (rep==NULL)
8556         return enc_EXCEPTION;
8557     else if (rep==Py_None) {
8558         Py_DECREF(rep);
8559         return enc_FAILED;
8560     } else {
8561         if (PyLong_Check(rep)) {
8562             Py_ssize_t requiredsize = *outpos+1;
8563             if (outsize<requiredsize)
8564                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8565                     Py_DECREF(rep);
8566                     return enc_EXCEPTION;
8567                 }
8568             outstart = PyBytes_AS_STRING(*outobj);
8569             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8570         }
8571         else {
8572             const char *repchars = PyBytes_AS_STRING(rep);
8573             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8574             Py_ssize_t requiredsize = *outpos+repsize;
8575             if (outsize<requiredsize)
8576                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8577                     Py_DECREF(rep);
8578                     return enc_EXCEPTION;
8579                 }
8580             outstart = PyBytes_AS_STRING(*outobj);
8581             memcpy(outstart + *outpos, repchars, repsize);
8582             *outpos += repsize;
8583         }
8584     }
8585     Py_DECREF(rep);
8586     return enc_SUCCESS;
8587 }
8588 
8589 /* handle an error in PyUnicode_EncodeCharmap
8590    Return 0 on success, -1 on error */
8591 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8592 charmap_encoding_error(
8593     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8594     PyObject **exceptionObject,
8595     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8596     PyObject **res, Py_ssize_t *respos)
8597 {
8598     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8599     Py_ssize_t size, repsize;
8600     Py_ssize_t newpos;
8601     enum PyUnicode_Kind kind;
8602     const void *data;
8603     Py_ssize_t index;
8604     /* startpos for collecting unencodable chars */
8605     Py_ssize_t collstartpos = *inpos;
8606     Py_ssize_t collendpos = *inpos+1;
8607     Py_ssize_t collpos;
8608     const char *encoding = "charmap";
8609     const char *reason = "character maps to <undefined>";
8610     charmapencode_result x;
8611     Py_UCS4 ch;
8612     int val;
8613 
8614     if (PyUnicode_READY(unicode) == -1)
8615         return -1;
8616     size = PyUnicode_GET_LENGTH(unicode);
8617     /* find all unencodable characters */
8618     while (collendpos < size) {
8619         PyObject *rep;
8620         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8621             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8622             val = encoding_map_lookup(ch, mapping);
8623             if (val != -1)
8624                 break;
8625             ++collendpos;
8626             continue;
8627         }
8628 
8629         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8630         rep = charmapencode_lookup(ch, mapping);
8631         if (rep==NULL)
8632             return -1;
8633         else if (rep!=Py_None) {
8634             Py_DECREF(rep);
8635             break;
8636         }
8637         Py_DECREF(rep);
8638         ++collendpos;
8639     }
8640     /* cache callback name lookup
8641      * (if not done yet, i.e. it's the first error) */
8642     if (*error_handler == _Py_ERROR_UNKNOWN)
8643         *error_handler = _Py_GetErrorHandler(errors);
8644 
8645     switch (*error_handler) {
8646     case _Py_ERROR_STRICT:
8647         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8648         return -1;
8649 
8650     case _Py_ERROR_REPLACE:
8651         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8652             x = charmapencode_output('?', mapping, res, respos);
8653             if (x==enc_EXCEPTION) {
8654                 return -1;
8655             }
8656             else if (x==enc_FAILED) {
8657                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8658                 return -1;
8659             }
8660         }
8661         /* fall through */
8662     case _Py_ERROR_IGNORE:
8663         *inpos = collendpos;
8664         break;
8665 
8666     case _Py_ERROR_XMLCHARREFREPLACE:
8667         /* generate replacement (temporarily (mis)uses p) */
8668         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8669             char buffer[2+29+1+1];
8670             char *cp;
8671             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8672             for (cp = buffer; *cp; ++cp) {
8673                 x = charmapencode_output(*cp, mapping, res, respos);
8674                 if (x==enc_EXCEPTION)
8675                     return -1;
8676                 else if (x==enc_FAILED) {
8677                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8678                     return -1;
8679                 }
8680             }
8681         }
8682         *inpos = collendpos;
8683         break;
8684 
8685     default:
8686         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8687                                                       encoding, reason, unicode, exceptionObject,
8688                                                       collstartpos, collendpos, &newpos);
8689         if (repunicode == NULL)
8690             return -1;
8691         if (PyBytes_Check(repunicode)) {
8692             /* Directly copy bytes result to output. */
8693             Py_ssize_t outsize = PyBytes_Size(*res);
8694             Py_ssize_t requiredsize;
8695             repsize = PyBytes_Size(repunicode);
8696             requiredsize = *respos + repsize;
8697             if (requiredsize > outsize)
8698                 /* Make room for all additional bytes. */
8699                 if (charmapencode_resize(res, respos, requiredsize)) {
8700                     Py_DECREF(repunicode);
8701                     return -1;
8702                 }
8703             memcpy(PyBytes_AsString(*res) + *respos,
8704                    PyBytes_AsString(repunicode),  repsize);
8705             *respos += repsize;
8706             *inpos = newpos;
8707             Py_DECREF(repunicode);
8708             break;
8709         }
8710         /* generate replacement  */
8711         if (PyUnicode_READY(repunicode) == -1) {
8712             Py_DECREF(repunicode);
8713             return -1;
8714         }
8715         repsize = PyUnicode_GET_LENGTH(repunicode);
8716         data = PyUnicode_DATA(repunicode);
8717         kind = PyUnicode_KIND(repunicode);
8718         for (index = 0; index < repsize; index++) {
8719             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8720             x = charmapencode_output(repch, mapping, res, respos);
8721             if (x==enc_EXCEPTION) {
8722                 Py_DECREF(repunicode);
8723                 return -1;
8724             }
8725             else if (x==enc_FAILED) {
8726                 Py_DECREF(repunicode);
8727                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8728                 return -1;
8729             }
8730         }
8731         *inpos = newpos;
8732         Py_DECREF(repunicode);
8733     }
8734     return 0;
8735 }
8736 
8737 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8738 _PyUnicode_EncodeCharmap(PyObject *unicode,
8739                          PyObject *mapping,
8740                          const char *errors)
8741 {
8742     /* output object */
8743     PyObject *res = NULL;
8744     /* current input position */
8745     Py_ssize_t inpos = 0;
8746     Py_ssize_t size;
8747     /* current output position */
8748     Py_ssize_t respos = 0;
8749     PyObject *error_handler_obj = NULL;
8750     PyObject *exc = NULL;
8751     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8752     const void *data;
8753     int kind;
8754 
8755     if (PyUnicode_READY(unicode) == -1)
8756         return NULL;
8757     size = PyUnicode_GET_LENGTH(unicode);
8758     data = PyUnicode_DATA(unicode);
8759     kind = PyUnicode_KIND(unicode);
8760 
8761     /* Default to Latin-1 */
8762     if (mapping == NULL)
8763         return unicode_encode_ucs1(unicode, errors, 256);
8764 
8765     /* allocate enough for a simple encoding without
8766        replacements, if we need more, we'll resize */
8767     res = PyBytes_FromStringAndSize(NULL, size);
8768     if (res == NULL)
8769         goto onError;
8770     if (size == 0)
8771         return res;
8772 
8773     while (inpos<size) {
8774         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8775         /* try to encode it */
8776         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8777         if (x==enc_EXCEPTION) /* error */
8778             goto onError;
8779         if (x==enc_FAILED) { /* unencodable character */
8780             if (charmap_encoding_error(unicode, &inpos, mapping,
8781                                        &exc,
8782                                        &error_handler, &error_handler_obj, errors,
8783                                        &res, &respos)) {
8784                 goto onError;
8785             }
8786         }
8787         else
8788             /* done with this character => adjust input position */
8789             ++inpos;
8790     }
8791 
8792     /* Resize if we allocated to much */
8793     if (respos<PyBytes_GET_SIZE(res))
8794         if (_PyBytes_Resize(&res, respos) < 0)
8795             goto onError;
8796 
8797     Py_XDECREF(exc);
8798     Py_XDECREF(error_handler_obj);
8799     return res;
8800 
8801   onError:
8802     Py_XDECREF(res);
8803     Py_XDECREF(exc);
8804     Py_XDECREF(error_handler_obj);
8805     return NULL;
8806 }
8807 
8808 /* Deprecated */
8809 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8810 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8811                         Py_ssize_t size,
8812                         PyObject *mapping,
8813                         const char *errors)
8814 {
8815     PyObject *result;
8816     PyObject *unicode = PyUnicode_FromWideChar(p, size);
8817     if (unicode == NULL)
8818         return NULL;
8819     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8820     Py_DECREF(unicode);
8821     return result;
8822 }
8823 
8824 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8825 PyUnicode_AsCharmapString(PyObject *unicode,
8826                           PyObject *mapping)
8827 {
8828     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8829         PyErr_BadArgument();
8830         return NULL;
8831     }
8832     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8833 }
8834 
8835 /* create or adjust a UnicodeTranslateError */
8836 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8837 make_translate_exception(PyObject **exceptionObject,
8838                          PyObject *unicode,
8839                          Py_ssize_t startpos, Py_ssize_t endpos,
8840                          const char *reason)
8841 {
8842     if (*exceptionObject == NULL) {
8843         *exceptionObject = _PyUnicodeTranslateError_Create(
8844             unicode, startpos, endpos, reason);
8845     }
8846     else {
8847         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8848             goto onError;
8849         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8850             goto onError;
8851         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8852             goto onError;
8853         return;
8854       onError:
8855         Py_CLEAR(*exceptionObject);
8856     }
8857 }
8858 
8859 /* error handling callback helper:
8860    build arguments, call the callback and check the arguments,
8861    put the result into newpos and return the replacement string, which
8862    has to be freed by the caller */
8863 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8864 unicode_translate_call_errorhandler(const char *errors,
8865                                     PyObject **errorHandler,
8866                                     const char *reason,
8867                                     PyObject *unicode, PyObject **exceptionObject,
8868                                     Py_ssize_t startpos, Py_ssize_t endpos,
8869                                     Py_ssize_t *newpos)
8870 {
8871     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8872 
8873     Py_ssize_t i_newpos;
8874     PyObject *restuple;
8875     PyObject *resunicode;
8876 
8877     if (*errorHandler == NULL) {
8878         *errorHandler = PyCodec_LookupError(errors);
8879         if (*errorHandler == NULL)
8880             return NULL;
8881     }
8882 
8883     make_translate_exception(exceptionObject,
8884                              unicode, startpos, endpos, reason);
8885     if (*exceptionObject == NULL)
8886         return NULL;
8887 
8888     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8889     if (restuple == NULL)
8890         return NULL;
8891     if (!PyTuple_Check(restuple)) {
8892         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8893         Py_DECREF(restuple);
8894         return NULL;
8895     }
8896     if (!PyArg_ParseTuple(restuple, argparse,
8897                           &resunicode, &i_newpos)) {
8898         Py_DECREF(restuple);
8899         return NULL;
8900     }
8901     if (i_newpos<0)
8902         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8903     else
8904         *newpos = i_newpos;
8905     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8906         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8907         Py_DECREF(restuple);
8908         return NULL;
8909     }
8910     Py_INCREF(resunicode);
8911     Py_DECREF(restuple);
8912     return resunicode;
8913 }
8914 
8915 /* Lookup the character ch in the mapping and put the result in result,
8916    which must be decrefed by the caller.
8917    Return 0 on success, -1 on error */
8918 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8919 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8920 {
8921     PyObject *w = PyLong_FromLong((long)c);
8922     PyObject *x;
8923 
8924     if (w == NULL)
8925         return -1;
8926     x = PyObject_GetItem(mapping, w);
8927     Py_DECREF(w);
8928     if (x == NULL) {
8929         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8930             /* No mapping found means: use 1:1 mapping. */
8931             PyErr_Clear();
8932             *result = NULL;
8933             return 0;
8934         } else
8935             return -1;
8936     }
8937     else if (x == Py_None) {
8938         *result = x;
8939         return 0;
8940     }
8941     else if (PyLong_Check(x)) {
8942         long value = PyLong_AS_LONG(x);
8943         if (value < 0 || value > MAX_UNICODE) {
8944             PyErr_Format(PyExc_ValueError,
8945                          "character mapping must be in range(0x%x)",
8946                          MAX_UNICODE+1);
8947             Py_DECREF(x);
8948             return -1;
8949         }
8950         *result = x;
8951         return 0;
8952     }
8953     else if (PyUnicode_Check(x)) {
8954         *result = x;
8955         return 0;
8956     }
8957     else {
8958         /* wrong return value */
8959         PyErr_SetString(PyExc_TypeError,
8960                         "character mapping must return integer, None or str");
8961         Py_DECREF(x);
8962         return -1;
8963     }
8964 }
8965 
8966 /* lookup the character, write the result into the writer.
8967    Return 1 if the result was written into the writer, return 0 if the mapping
8968    was undefined, raise an exception return -1 on error. */
8969 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8970 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8971                         _PyUnicodeWriter *writer)
8972 {
8973     PyObject *item;
8974 
8975     if (charmaptranslate_lookup(ch, mapping, &item))
8976         return -1;
8977 
8978     if (item == NULL) {
8979         /* not found => default to 1:1 mapping */
8980         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8981             return -1;
8982         }
8983         return 1;
8984     }
8985 
8986     if (item == Py_None) {
8987         Py_DECREF(item);
8988         return 0;
8989     }
8990 
8991     if (PyLong_Check(item)) {
8992         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8993         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8994            used it */
8995         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8996             Py_DECREF(item);
8997             return -1;
8998         }
8999         Py_DECREF(item);
9000         return 1;
9001     }
9002 
9003     if (!PyUnicode_Check(item)) {
9004         Py_DECREF(item);
9005         return -1;
9006     }
9007 
9008     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9009         Py_DECREF(item);
9010         return -1;
9011     }
9012 
9013     Py_DECREF(item);
9014     return 1;
9015 }
9016 
9017 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9018 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9019                               Py_UCS1 *translate)
9020 {
9021     PyObject *item = NULL;
9022     int ret = 0;
9023 
9024     if (charmaptranslate_lookup(ch, mapping, &item)) {
9025         return -1;
9026     }
9027 
9028     if (item == Py_None) {
9029         /* deletion */
9030         translate[ch] = 0xfe;
9031     }
9032     else if (item == NULL) {
9033         /* not found => default to 1:1 mapping */
9034         translate[ch] = ch;
9035         return 1;
9036     }
9037     else if (PyLong_Check(item)) {
9038         long replace = PyLong_AS_LONG(item);
9039         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9040            used it */
9041         if (127 < replace) {
9042             /* invalid character or character outside ASCII:
9043                skip the fast translate */
9044             goto exit;
9045         }
9046         translate[ch] = (Py_UCS1)replace;
9047     }
9048     else if (PyUnicode_Check(item)) {
9049         Py_UCS4 replace;
9050 
9051         if (PyUnicode_READY(item) == -1) {
9052             Py_DECREF(item);
9053             return -1;
9054         }
9055         if (PyUnicode_GET_LENGTH(item) != 1)
9056             goto exit;
9057 
9058         replace = PyUnicode_READ_CHAR(item, 0);
9059         if (replace > 127)
9060             goto exit;
9061         translate[ch] = (Py_UCS1)replace;
9062     }
9063     else {
9064         /* not None, NULL, long or unicode */
9065         goto exit;
9066     }
9067     ret = 1;
9068 
9069   exit:
9070     Py_DECREF(item);
9071     return ret;
9072 }
9073 
9074 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9075    was translated into writer, return 0 if the input string was partially
9076    translated into writer, raise an exception and return -1 on error. */
9077 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9078 unicode_fast_translate(PyObject *input, PyObject *mapping,
9079                        _PyUnicodeWriter *writer, int ignore,
9080                        Py_ssize_t *input_pos)
9081 {
9082     Py_UCS1 ascii_table[128], ch, ch2;
9083     Py_ssize_t len;
9084     const Py_UCS1 *in, *end;
9085     Py_UCS1 *out;
9086     int res = 0;
9087 
9088     len = PyUnicode_GET_LENGTH(input);
9089 
9090     memset(ascii_table, 0xff, 128);
9091 
9092     in = PyUnicode_1BYTE_DATA(input);
9093     end = in + len;
9094 
9095     assert(PyUnicode_IS_ASCII(writer->buffer));
9096     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9097     out = PyUnicode_1BYTE_DATA(writer->buffer);
9098 
9099     for (; in < end; in++) {
9100         ch = *in;
9101         ch2 = ascii_table[ch];
9102         if (ch2 == 0xff) {
9103             int translate = unicode_fast_translate_lookup(mapping, ch,
9104                                                           ascii_table);
9105             if (translate < 0)
9106                 return -1;
9107             if (translate == 0)
9108                 goto exit;
9109             ch2 = ascii_table[ch];
9110         }
9111         if (ch2 == 0xfe) {
9112             if (ignore)
9113                 continue;
9114             goto exit;
9115         }
9116         assert(ch2 < 128);
9117         *out = ch2;
9118         out++;
9119     }
9120     res = 1;
9121 
9122 exit:
9123     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9124     *input_pos = in - PyUnicode_1BYTE_DATA(input);
9125     return res;
9126 }
9127 
9128 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9129 _PyUnicode_TranslateCharmap(PyObject *input,
9130                             PyObject *mapping,
9131                             const char *errors)
9132 {
9133     /* input object */
9134     const void *data;
9135     Py_ssize_t size, i;
9136     int kind;
9137     /* output buffer */
9138     _PyUnicodeWriter writer;
9139     /* error handler */
9140     const char *reason = "character maps to <undefined>";
9141     PyObject *errorHandler = NULL;
9142     PyObject *exc = NULL;
9143     int ignore;
9144     int res;
9145 
9146     if (mapping == NULL) {
9147         PyErr_BadArgument();
9148         return NULL;
9149     }
9150 
9151     if (PyUnicode_READY(input) == -1)
9152         return NULL;
9153     data = PyUnicode_DATA(input);
9154     kind = PyUnicode_KIND(input);
9155     size = PyUnicode_GET_LENGTH(input);
9156 
9157     if (size == 0)
9158         return PyUnicode_FromObject(input);
9159 
9160     /* allocate enough for a simple 1:1 translation without
9161        replacements, if we need more, we'll resize */
9162     _PyUnicodeWriter_Init(&writer);
9163     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9164         goto onError;
9165 
9166     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9167 
9168     if (PyUnicode_READY(input) == -1)
9169         return NULL;
9170     if (PyUnicode_IS_ASCII(input)) {
9171         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9172         if (res < 0) {
9173             _PyUnicodeWriter_Dealloc(&writer);
9174             return NULL;
9175         }
9176         if (res == 1)
9177             return _PyUnicodeWriter_Finish(&writer);
9178     }
9179     else {
9180         i = 0;
9181     }
9182 
9183     while (i<size) {
9184         /* try to encode it */
9185         int translate;
9186         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9187         Py_ssize_t newpos;
9188         /* startpos for collecting untranslatable chars */
9189         Py_ssize_t collstart;
9190         Py_ssize_t collend;
9191         Py_UCS4 ch;
9192 
9193         ch = PyUnicode_READ(kind, data, i);
9194         translate = charmaptranslate_output(ch, mapping, &writer);
9195         if (translate < 0)
9196             goto onError;
9197 
9198         if (translate != 0) {
9199             /* it worked => adjust input pointer */
9200             ++i;
9201             continue;
9202         }
9203 
9204         /* untranslatable character */
9205         collstart = i;
9206         collend = i+1;
9207 
9208         /* find all untranslatable characters */
9209         while (collend < size) {
9210             PyObject *x;
9211             ch = PyUnicode_READ(kind, data, collend);
9212             if (charmaptranslate_lookup(ch, mapping, &x))
9213                 goto onError;
9214             Py_XDECREF(x);
9215             if (x != Py_None)
9216                 break;
9217             ++collend;
9218         }
9219 
9220         if (ignore) {
9221             i = collend;
9222         }
9223         else {
9224             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9225                                                              reason, input, &exc,
9226                                                              collstart, collend, &newpos);
9227             if (repunicode == NULL)
9228                 goto onError;
9229             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9230                 Py_DECREF(repunicode);
9231                 goto onError;
9232             }
9233             Py_DECREF(repunicode);
9234             i = newpos;
9235         }
9236     }
9237     Py_XDECREF(exc);
9238     Py_XDECREF(errorHandler);
9239     return _PyUnicodeWriter_Finish(&writer);
9240 
9241   onError:
9242     _PyUnicodeWriter_Dealloc(&writer);
9243     Py_XDECREF(exc);
9244     Py_XDECREF(errorHandler);
9245     return NULL;
9246 }
9247 
9248 /* Deprecated. Use PyUnicode_Translate instead. */
9249 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9250 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9251                            Py_ssize_t size,
9252                            PyObject *mapping,
9253                            const char *errors)
9254 {
9255     PyObject *result;
9256     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9257     if (!unicode)
9258         return NULL;
9259     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9260     Py_DECREF(unicode);
9261     return result;
9262 }
9263 
9264 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9265 PyUnicode_Translate(PyObject *str,
9266                     PyObject *mapping,
9267                     const char *errors)
9268 {
9269     if (ensure_unicode(str) < 0)
9270         return NULL;
9271     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9272 }
9273 
9274 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9275 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9276 {
9277     if (!PyUnicode_Check(unicode)) {
9278         PyErr_BadInternalCall();
9279         return NULL;
9280     }
9281     if (PyUnicode_READY(unicode) == -1)
9282         return NULL;
9283     if (PyUnicode_IS_ASCII(unicode)) {
9284         /* If the string is already ASCII, just return the same string */
9285         Py_INCREF(unicode);
9286         return unicode;
9287     }
9288 
9289     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9290     PyObject *result = PyUnicode_New(len, 127);
9291     if (result == NULL) {
9292         return NULL;
9293     }
9294 
9295     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9296     int kind = PyUnicode_KIND(unicode);
9297     const void *data = PyUnicode_DATA(unicode);
9298     Py_ssize_t i;
9299     for (i = 0; i < len; ++i) {
9300         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9301         if (ch < 127) {
9302             out[i] = ch;
9303         }
9304         else if (Py_UNICODE_ISSPACE(ch)) {
9305             out[i] = ' ';
9306         }
9307         else {
9308             int decimal = Py_UNICODE_TODECIMAL(ch);
9309             if (decimal < 0) {
9310                 out[i] = '?';
9311                 out[i+1] = '\0';
9312                 _PyUnicode_LENGTH(result) = i + 1;
9313                 break;
9314             }
9315             out[i] = '0' + decimal;
9316         }
9317     }
9318 
9319     assert(_PyUnicode_CheckConsistency(result, 1));
9320     return result;
9321 }
9322 
9323 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9324 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9325                                   Py_ssize_t length)
9326 {
9327     PyObject *decimal;
9328     Py_ssize_t i;
9329     Py_UCS4 maxchar;
9330     enum PyUnicode_Kind kind;
9331     const void *data;
9332 
9333     maxchar = 127;
9334     for (i = 0; i < length; i++) {
9335         Py_UCS4 ch = s[i];
9336         if (ch > 127) {
9337             int decimal = Py_UNICODE_TODECIMAL(ch);
9338             if (decimal >= 0)
9339                 ch = '0' + decimal;
9340             maxchar = Py_MAX(maxchar, ch);
9341         }
9342     }
9343 
9344     /* Copy to a new string */
9345     decimal = PyUnicode_New(length, maxchar);
9346     if (decimal == NULL)
9347         return decimal;
9348     kind = PyUnicode_KIND(decimal);
9349     data = PyUnicode_DATA(decimal);
9350     /* Iterate over code points */
9351     for (i = 0; i < length; i++) {
9352         Py_UCS4 ch = s[i];
9353         if (ch > 127) {
9354             int decimal = Py_UNICODE_TODECIMAL(ch);
9355             if (decimal >= 0)
9356                 ch = '0' + decimal;
9357         }
9358         PyUnicode_WRITE(kind, data, i, ch);
9359     }
9360     return unicode_result(decimal);
9361 }
9362 /* --- Decimal Encoder ---------------------------------------------------- */
9363 
9364 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9365 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9366                         Py_ssize_t length,
9367                         char *output,
9368                         const char *errors)
9369 {
9370     PyObject *unicode;
9371     Py_ssize_t i;
9372     enum PyUnicode_Kind kind;
9373     const void *data;
9374 
9375     if (output == NULL) {
9376         PyErr_BadArgument();
9377         return -1;
9378     }
9379 
9380     unicode = PyUnicode_FromWideChar(s, length);
9381     if (unicode == NULL)
9382         return -1;
9383 
9384     kind = PyUnicode_KIND(unicode);
9385     data = PyUnicode_DATA(unicode);
9386 
9387     for (i=0; i < length; ) {
9388         PyObject *exc;
9389         Py_UCS4 ch;
9390         int decimal;
9391         Py_ssize_t startpos;
9392 
9393         ch = PyUnicode_READ(kind, data, i);
9394 
9395         if (Py_UNICODE_ISSPACE(ch)) {
9396             *output++ = ' ';
9397             i++;
9398             continue;
9399         }
9400         decimal = Py_UNICODE_TODECIMAL(ch);
9401         if (decimal >= 0) {
9402             *output++ = '0' + decimal;
9403             i++;
9404             continue;
9405         }
9406         if (0 < ch && ch < 256) {
9407             *output++ = (char)ch;
9408             i++;
9409             continue;
9410         }
9411 
9412         startpos = i;
9413         exc = NULL;
9414         raise_encode_exception(&exc, "decimal", unicode,
9415                                startpos, startpos+1,
9416                                "invalid decimal Unicode string");
9417         Py_XDECREF(exc);
9418         Py_DECREF(unicode);
9419         return -1;
9420     }
9421     /* 0-terminate the output string */
9422     *output++ = '\0';
9423     Py_DECREF(unicode);
9424     return 0;
9425 }
9426 
9427 /* --- Helpers ------------------------------------------------------------ */
9428 
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, interprets negative `start`/`end` as offsets from
   `len`, and floors both at 0.  `start` and `end` must be modifiable
   lvalues; every argument is evaluated more than once, so none may have
   side effects.  The do { } while (0) wrapper makes an invocation behave
   as a single statement (safe in unbraced if/else); it must be followed by
   a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9443 
9444 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9445 any_find_slice(PyObject* s1, PyObject* s2,
9446                Py_ssize_t start,
9447                Py_ssize_t end,
9448                int direction)
9449 {
9450     int kind1, kind2;
9451     const void *buf1, *buf2;
9452     Py_ssize_t len1, len2, result;
9453 
9454     kind1 = PyUnicode_KIND(s1);
9455     kind2 = PyUnicode_KIND(s2);
9456     if (kind1 < kind2)
9457         return -1;
9458 
9459     len1 = PyUnicode_GET_LENGTH(s1);
9460     len2 = PyUnicode_GET_LENGTH(s2);
9461     ADJUST_INDICES(start, end, len1);
9462     if (end - start < len2)
9463         return -1;
9464 
9465     buf1 = PyUnicode_DATA(s1);
9466     buf2 = PyUnicode_DATA(s2);
9467     if (len2 == 1) {
9468         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9469         result = findchar((const char *)buf1 + kind1*start,
9470                           kind1, end - start, ch, direction);
9471         if (result == -1)
9472             return -1;
9473         else
9474             return start + result;
9475     }
9476 
9477     if (kind2 != kind1) {
9478         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9479         if (!buf2)
9480             return -2;
9481     }
9482 
9483     if (direction > 0) {
9484         switch (kind1) {
9485         case PyUnicode_1BYTE_KIND:
9486             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9487                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9488             else
9489                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9490             break;
9491         case PyUnicode_2BYTE_KIND:
9492             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9493             break;
9494         case PyUnicode_4BYTE_KIND:
9495             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9496             break;
9497         default:
9498             Py_UNREACHABLE();
9499         }
9500     }
9501     else {
9502         switch (kind1) {
9503         case PyUnicode_1BYTE_KIND:
9504             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9505                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9506             else
9507                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9508             break;
9509         case PyUnicode_2BYTE_KIND:
9510             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9511             break;
9512         case PyUnicode_4BYTE_KIND:
9513             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9514             break;
9515         default:
9516             Py_UNREACHABLE();
9517         }
9518     }
9519 
9520     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9521     if (kind2 != kind1)
9522         PyMem_Free((void *)buf2);
9523 
9524     return result;
9525 }
9526 
9527 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9528 #include "stringlib/localeutil.h"
9529 
9530 /**
9531  * InsertThousandsGrouping:
9532  * @writer: Unicode writer.
9533  * @n_buffer: Number of characters in @buffer.
9534  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9535  * @d_pos: Start of digits string.
9536  * @n_digits: The number of digits in the string, in which we want
9537  *            to put the grouping chars.
9538  * @min_width: The minimum width of the digits in the output string.
9539  *             Output will be zero-padded on the left to fill.
9540  * @grouping: see definition in localeconv().
9541  * @thousands_sep: see definition in localeconv().
9542  *
9543  * There are 2 modes: counting and filling. If @writer is NULL,
9544  *  we are in counting mode, else filling mode.
9545  * If counting, the required buffer size is returned.
9546  * If filling, we know the buffer will be large enough, so we don't
9547  *  need to pass in the buffer size.
9548  * Inserts thousand grouping characters (as defined by grouping and
9549  *  thousands_sep) into @writer.
9550  *
9551  * Return value: -1 on error, number of characters otherwise.
9552  **/
9553 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9554 _PyUnicode_InsertThousandsGrouping(
9555     _PyUnicodeWriter *writer,
9556     Py_ssize_t n_buffer,
9557     PyObject *digits,
9558     Py_ssize_t d_pos,
9559     Py_ssize_t n_digits,
9560     Py_ssize_t min_width,
9561     const char *grouping,
9562     PyObject *thousands_sep,
9563     Py_UCS4 *maxchar)
9564 {
9565     min_width = Py_MAX(0, min_width);
9566     if (writer) {
9567         assert(digits != NULL);
9568         assert(maxchar == NULL);
9569     }
9570     else {
9571         assert(digits == NULL);
9572         assert(maxchar != NULL);
9573     }
9574     assert(0 <= d_pos);
9575     assert(0 <= n_digits);
9576     assert(grouping != NULL);
9577 
9578     if (digits != NULL) {
9579         if (PyUnicode_READY(digits) == -1) {
9580             return -1;
9581         }
9582     }
9583     if (PyUnicode_READY(thousands_sep) == -1) {
9584         return -1;
9585     }
9586 
9587     Py_ssize_t count = 0;
9588     Py_ssize_t n_zeros;
9589     int loop_broken = 0;
9590     int use_separator = 0; /* First time through, don't append the
9591                               separator. They only go between
9592                               groups. */
9593     Py_ssize_t buffer_pos;
9594     Py_ssize_t digits_pos;
9595     Py_ssize_t len;
9596     Py_ssize_t n_chars;
9597     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9598                                         be looked at */
9599     /* A generator that returns all of the grouping widths, until it
9600        returns 0. */
9601     GroupGenerator groupgen;
9602     GroupGenerator_init(&groupgen, grouping);
9603     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9604 
9605     /* if digits are not grouped, thousands separator
9606        should be an empty string */
9607     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9608 
9609     digits_pos = d_pos + n_digits;
9610     if (writer) {
9611         buffer_pos = writer->pos + n_buffer;
9612         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9613         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9614     }
9615     else {
9616         buffer_pos = n_buffer;
9617     }
9618 
9619     if (!writer) {
9620         *maxchar = 127;
9621     }
9622 
9623     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9624         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9625         n_zeros = Py_MAX(0, len - remaining);
9626         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9627 
9628         /* Use n_zero zero's and n_chars chars */
9629 
9630         /* Count only, don't do anything. */
9631         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9632 
9633         /* Copy into the writer. */
9634         InsertThousandsGrouping_fill(writer, &buffer_pos,
9635                                      digits, &digits_pos,
9636                                      n_chars, n_zeros,
9637                                      use_separator ? thousands_sep : NULL,
9638                                      thousands_sep_len, maxchar);
9639 
9640         /* Use a separator next time. */
9641         use_separator = 1;
9642 
9643         remaining -= n_chars;
9644         min_width -= len;
9645 
9646         if (remaining <= 0 && min_width <= 0) {
9647             loop_broken = 1;
9648             break;
9649         }
9650         min_width -= thousands_sep_len;
9651     }
9652     if (!loop_broken) {
9653         /* We left the loop without using a break statement. */
9654 
9655         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9656         n_zeros = Py_MAX(0, len - remaining);
9657         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9658 
9659         /* Use n_zero zero's and n_chars chars */
9660         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9661 
9662         /* Copy into the writer. */
9663         InsertThousandsGrouping_fill(writer, &buffer_pos,
9664                                      digits, &digits_pos,
9665                                      n_chars, n_zeros,
9666                                      use_separator ? thousands_sep : NULL,
9667                                      thousands_sep_len, maxchar);
9668     }
9669     return count;
9670 }
9671 
9672 
9673 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9674 PyUnicode_Count(PyObject *str,
9675                 PyObject *substr,
9676                 Py_ssize_t start,
9677                 Py_ssize_t end)
9678 {
9679     Py_ssize_t result;
9680     int kind1, kind2;
9681     const void *buf1 = NULL, *buf2 = NULL;
9682     Py_ssize_t len1, len2;
9683 
9684     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9685         return -1;
9686 
9687     kind1 = PyUnicode_KIND(str);
9688     kind2 = PyUnicode_KIND(substr);
9689     if (kind1 < kind2)
9690         return 0;
9691 
9692     len1 = PyUnicode_GET_LENGTH(str);
9693     len2 = PyUnicode_GET_LENGTH(substr);
9694     ADJUST_INDICES(start, end, len1);
9695     if (end - start < len2)
9696         return 0;
9697 
9698     buf1 = PyUnicode_DATA(str);
9699     buf2 = PyUnicode_DATA(substr);
9700     if (kind2 != kind1) {
9701         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9702         if (!buf2)
9703             goto onError;
9704     }
9705 
9706     switch (kind1) {
9707     case PyUnicode_1BYTE_KIND:
9708         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9709             result = asciilib_count(
9710                 ((const Py_UCS1*)buf1) + start, end - start,
9711                 buf2, len2, PY_SSIZE_T_MAX
9712                 );
9713         else
9714             result = ucs1lib_count(
9715                 ((const Py_UCS1*)buf1) + start, end - start,
9716                 buf2, len2, PY_SSIZE_T_MAX
9717                 );
9718         break;
9719     case PyUnicode_2BYTE_KIND:
9720         result = ucs2lib_count(
9721             ((const Py_UCS2*)buf1) + start, end - start,
9722             buf2, len2, PY_SSIZE_T_MAX
9723             );
9724         break;
9725     case PyUnicode_4BYTE_KIND:
9726         result = ucs4lib_count(
9727             ((const Py_UCS4*)buf1) + start, end - start,
9728             buf2, len2, PY_SSIZE_T_MAX
9729             );
9730         break;
9731     default:
9732         Py_UNREACHABLE();
9733     }
9734 
9735     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9736     if (kind2 != kind1)
9737         PyMem_Free((void *)buf2);
9738 
9739     return result;
9740   onError:
9741     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9742     if (kind2 != kind1)
9743         PyMem_Free((void *)buf2);
9744     return -1;
9745 }
9746 
9747 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9748 PyUnicode_Find(PyObject *str,
9749                PyObject *substr,
9750                Py_ssize_t start,
9751                Py_ssize_t end,
9752                int direction)
9753 {
9754     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9755         return -2;
9756 
9757     return any_find_slice(str, substr, start, end, direction);
9758 }
9759 
9760 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9761 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9762                    Py_ssize_t start, Py_ssize_t end,
9763                    int direction)
9764 {
9765     int kind;
9766     Py_ssize_t len, result;
9767     if (PyUnicode_READY(str) == -1)
9768         return -2;
9769     len = PyUnicode_GET_LENGTH(str);
9770     ADJUST_INDICES(start, end, len);
9771     if (end - start < 1)
9772         return -1;
9773     kind = PyUnicode_KIND(str);
9774     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9775                       kind, end-start, ch, direction);
9776     if (result == -1)
9777         return -1;
9778     else
9779         return start + result;
9780 }
9781 
9782 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9783 tailmatch(PyObject *self,
9784           PyObject *substring,
9785           Py_ssize_t start,
9786           Py_ssize_t end,
9787           int direction)
9788 {
9789     int kind_self;
9790     int kind_sub;
9791     const void *data_self;
9792     const void *data_sub;
9793     Py_ssize_t offset;
9794     Py_ssize_t i;
9795     Py_ssize_t end_sub;
9796 
9797     if (PyUnicode_READY(self) == -1 ||
9798         PyUnicode_READY(substring) == -1)
9799         return -1;
9800 
9801     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9802     end -= PyUnicode_GET_LENGTH(substring);
9803     if (end < start)
9804         return 0;
9805 
9806     if (PyUnicode_GET_LENGTH(substring) == 0)
9807         return 1;
9808 
9809     kind_self = PyUnicode_KIND(self);
9810     data_self = PyUnicode_DATA(self);
9811     kind_sub = PyUnicode_KIND(substring);
9812     data_sub = PyUnicode_DATA(substring);
9813     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9814 
9815     if (direction > 0)
9816         offset = end;
9817     else
9818         offset = start;
9819 
9820     if (PyUnicode_READ(kind_self, data_self, offset) ==
9821         PyUnicode_READ(kind_sub, data_sub, 0) &&
9822         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9823         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9824         /* If both are of the same kind, memcmp is sufficient */
9825         if (kind_self == kind_sub) {
9826             return ! memcmp((char *)data_self +
9827                                 (offset * PyUnicode_KIND(substring)),
9828                             data_sub,
9829                             PyUnicode_GET_LENGTH(substring) *
9830                                 PyUnicode_KIND(substring));
9831         }
9832         /* otherwise we have to compare each character by first accessing it */
9833         else {
9834             /* We do not need to compare 0 and len(substring)-1 because
9835                the if statement above ensured already that they are equal
9836                when we end up here. */
9837             for (i = 1; i < end_sub; ++i) {
9838                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9839                     PyUnicode_READ(kind_sub, data_sub, i))
9840                     return 0;
9841             }
9842             return 1;
9843         }
9844     }
9845 
9846     return 0;
9847 }
9848 
9849 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9850 PyUnicode_Tailmatch(PyObject *str,
9851                     PyObject *substr,
9852                     Py_ssize_t start,
9853                     Py_ssize_t end,
9854                     int direction)
9855 {
9856     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9857         return -1;
9858 
9859     return tailmatch(str, substr, start, end, direction);
9860 }
9861 
9862 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9863 ascii_upper_or_lower(PyObject *self, int lower)
9864 {
9865     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9866     const char *data = PyUnicode_DATA(self);
9867     char *resdata;
9868     PyObject *res;
9869 
9870     res = PyUnicode_New(len, 127);
9871     if (res == NULL)
9872         return NULL;
9873     resdata = PyUnicode_DATA(res);
9874     if (lower)
9875         _Py_bytes_lower(resdata, data, len);
9876     else
9877         _Py_bytes_upper(resdata, data, len);
9878     return res;
9879 }
9880 
9881 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9882 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9883 {
9884     Py_ssize_t j;
9885     int final_sigma;
9886     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9887     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9888 
9889      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9890 
9891     where ! is a negation and \p{xxx} is a character with property xxx.
9892     */
9893     for (j = i - 1; j >= 0; j--) {
9894         c = PyUnicode_READ(kind, data, j);
9895         if (!_PyUnicode_IsCaseIgnorable(c))
9896             break;
9897     }
9898     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9899     if (final_sigma) {
9900         for (j = i + 1; j < length; j++) {
9901             c = PyUnicode_READ(kind, data, j);
9902             if (!_PyUnicode_IsCaseIgnorable(c))
9903                 break;
9904         }
9905         final_sigma = j == length || !_PyUnicode_IsCased(c);
9906     }
9907     return (final_sigma) ? 0x3C2 : 0x3C3;
9908 }
9909 
9910 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9911 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9912            Py_UCS4 c, Py_UCS4 *mapped)
9913 {
9914     /* Obscure special case. */
9915     if (c == 0x3A3) {
9916         mapped[0] = handle_capital_sigma(kind, data, length, i);
9917         return 1;
9918     }
9919     return _PyUnicode_ToLowerFull(c, mapped);
9920 }
9921 
9922 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9923 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9924 {
9925     Py_ssize_t i, k = 0;
9926     int n_res, j;
9927     Py_UCS4 c, mapped[3];
9928 
9929     c = PyUnicode_READ(kind, data, 0);
9930     n_res = _PyUnicode_ToTitleFull(c, mapped);
9931     for (j = 0; j < n_res; j++) {
9932         *maxchar = Py_MAX(*maxchar, mapped[j]);
9933         res[k++] = mapped[j];
9934     }
9935     for (i = 1; i < length; i++) {
9936         c = PyUnicode_READ(kind, data, i);
9937         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9938         for (j = 0; j < n_res; j++) {
9939             *maxchar = Py_MAX(*maxchar, mapped[j]);
9940             res[k++] = mapped[j];
9941         }
9942     }
9943     return k;
9944 }
9945 
9946 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9948     Py_ssize_t i, k = 0;
9949 
9950     for (i = 0; i < length; i++) {
9951         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9952         int n_res, j;
9953         if (Py_UNICODE_ISUPPER(c)) {
9954             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9955         }
9956         else if (Py_UNICODE_ISLOWER(c)) {
9957             n_res = _PyUnicode_ToUpperFull(c, mapped);
9958         }
9959         else {
9960             n_res = 1;
9961             mapped[0] = c;
9962         }
9963         for (j = 0; j < n_res; j++) {
9964             *maxchar = Py_MAX(*maxchar, mapped[j]);
9965             res[k++] = mapped[j];
9966         }
9967     }
9968     return k;
9969 }
9970 
9971 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9972 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9973                   Py_UCS4 *maxchar, int lower)
9974 {
9975     Py_ssize_t i, k = 0;
9976 
9977     for (i = 0; i < length; i++) {
9978         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9979         int n_res, j;
9980         if (lower)
9981             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9982         else
9983             n_res = _PyUnicode_ToUpperFull(c, mapped);
9984         for (j = 0; j < n_res; j++) {
9985             *maxchar = Py_MAX(*maxchar, mapped[j]);
9986             res[k++] = mapped[j];
9987         }
9988     }
9989     return k;
9990 }
9991 
9992 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9993 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9994 {
9995     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9996 }
9997 
9998 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9999 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10000 {
10001     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10002 }
10003 
10004 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10005 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10006 {
10007     Py_ssize_t i, k = 0;
10008 
10009     for (i = 0; i < length; i++) {
10010         Py_UCS4 c = PyUnicode_READ(kind, data, i);
10011         Py_UCS4 mapped[3];
10012         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10013         for (j = 0; j < n_res; j++) {
10014             *maxchar = Py_MAX(*maxchar, mapped[j]);
10015             res[k++] = mapped[j];
10016         }
10017     }
10018     return k;
10019 }
10020 
10021 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10022 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10023 {
10024     Py_ssize_t i, k = 0;
10025     int previous_is_cased;
10026 
10027     previous_is_cased = 0;
10028     for (i = 0; i < length; i++) {
10029         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10030         Py_UCS4 mapped[3];
10031         int n_res, j;
10032 
10033         if (previous_is_cased)
10034             n_res = lower_ucs4(kind, data, length, i, c, mapped);
10035         else
10036             n_res = _PyUnicode_ToTitleFull(c, mapped);
10037 
10038         for (j = 0; j < n_res; j++) {
10039             *maxchar = Py_MAX(*maxchar, mapped[j]);
10040             res[k++] = mapped[j];
10041         }
10042 
10043         previous_is_cased = _PyUnicode_IsCased(c);
10044     }
10045     return k;
10046 }
10047 
10048 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10049 case_operation(PyObject *self,
10050                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10051 {
10052     PyObject *res = NULL;
10053     Py_ssize_t length, newlength = 0;
10054     int kind, outkind;
10055     const void *data;
10056     void *outdata;
10057     Py_UCS4 maxchar = 0, *tmp, *tmpend;
10058 
10059     assert(PyUnicode_IS_READY(self));
10060 
10061     kind = PyUnicode_KIND(self);
10062     data = PyUnicode_DATA(self);
10063     length = PyUnicode_GET_LENGTH(self);
10064     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10065         PyErr_SetString(PyExc_OverflowError, "string is too long");
10066         return NULL;
10067     }
10068     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
10069     if (tmp == NULL)
10070         return PyErr_NoMemory();
10071     newlength = perform(kind, data, length, tmp, &maxchar);
10072     res = PyUnicode_New(newlength, maxchar);
10073     if (res == NULL)
10074         goto leave;
10075     tmpend = tmp + newlength;
10076     outdata = PyUnicode_DATA(res);
10077     outkind = PyUnicode_KIND(res);
10078     switch (outkind) {
10079     case PyUnicode_1BYTE_KIND:
10080         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10081         break;
10082     case PyUnicode_2BYTE_KIND:
10083         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10084         break;
10085     case PyUnicode_4BYTE_KIND:
10086         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10087         break;
10088     default:
10089         Py_UNREACHABLE();
10090     }
10091   leave:
10092     PyMem_FREE(tmp);
10093     return res;
10094 }
10095 
10096 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10097 PyUnicode_Join(PyObject *separator, PyObject *seq)
10098 {
10099     PyObject *res;
10100     PyObject *fseq;
10101     Py_ssize_t seqlen;
10102     PyObject **items;
10103 
10104     fseq = PySequence_Fast(seq, "can only join an iterable");
10105     if (fseq == NULL) {
10106         return NULL;
10107     }
10108 
10109     /* NOTE: the following code can't call back into Python code,
10110      * so we are sure that fseq won't be mutated.
10111      */
10112 
10113     items = PySequence_Fast_ITEMS(fseq);
10114     seqlen = PySequence_Fast_GET_SIZE(fseq);
10115     res = _PyUnicode_JoinArray(separator, items, seqlen);
10116     Py_DECREF(fseq);
10117     return res;
10118 }
10119 
10120 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10121 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10122 {
10123     PyObject *res = NULL; /* the result */
10124     PyObject *sep = NULL;
10125     Py_ssize_t seplen;
10126     PyObject *item;
10127     Py_ssize_t sz, i, res_offset;
10128     Py_UCS4 maxchar;
10129     Py_UCS4 item_maxchar;
10130     int use_memcpy;
10131     unsigned char *res_data = NULL, *sep_data = NULL;
10132     PyObject *last_obj;
10133     unsigned int kind = 0;
10134 
10135     /* If empty sequence, return u"". */
10136     if (seqlen == 0) {
10137         _Py_RETURN_UNICODE_EMPTY();
10138     }
10139 
10140     /* If singleton sequence with an exact Unicode, return that. */
10141     last_obj = NULL;
10142     if (seqlen == 1) {
10143         if (PyUnicode_CheckExact(items[0])) {
10144             res = items[0];
10145             Py_INCREF(res);
10146             return res;
10147         }
10148         seplen = 0;
10149         maxchar = 0;
10150     }
10151     else {
10152         /* Set up sep and seplen */
10153         if (separator == NULL) {
10154             /* fall back to a blank space separator */
10155             sep = PyUnicode_FromOrdinal(' ');
10156             if (!sep)
10157                 goto onError;
10158             seplen = 1;
10159             maxchar = 32;
10160         }
10161         else {
10162             if (!PyUnicode_Check(separator)) {
10163                 PyErr_Format(PyExc_TypeError,
10164                              "separator: expected str instance,"
10165                              " %.80s found",
10166                              Py_TYPE(separator)->tp_name);
10167                 goto onError;
10168             }
10169             if (PyUnicode_READY(separator))
10170                 goto onError;
10171             sep = separator;
10172             seplen = PyUnicode_GET_LENGTH(separator);
10173             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10174             /* inc refcount to keep this code path symmetric with the
10175                above case of a blank separator */
10176             Py_INCREF(sep);
10177         }
10178         last_obj = sep;
10179     }
10180 
10181     /* There are at least two things to join, or else we have a subclass
10182      * of str in the sequence.
10183      * Do a pre-pass to figure out the total amount of space we'll
10184      * need (sz), and see whether all argument are strings.
10185      */
10186     sz = 0;
10187 #ifdef Py_DEBUG
10188     use_memcpy = 0;
10189 #else
10190     use_memcpy = 1;
10191 #endif
10192     for (i = 0; i < seqlen; i++) {
10193         size_t add_sz;
10194         item = items[i];
10195         if (!PyUnicode_Check(item)) {
10196             PyErr_Format(PyExc_TypeError,
10197                          "sequence item %zd: expected str instance,"
10198                          " %.80s found",
10199                          i, Py_TYPE(item)->tp_name);
10200             goto onError;
10201         }
10202         if (PyUnicode_READY(item) == -1)
10203             goto onError;
10204         add_sz = PyUnicode_GET_LENGTH(item);
10205         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10206         maxchar = Py_MAX(maxchar, item_maxchar);
10207         if (i != 0) {
10208             add_sz += seplen;
10209         }
10210         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10211             PyErr_SetString(PyExc_OverflowError,
10212                             "join() result is too long for a Python string");
10213             goto onError;
10214         }
10215         sz += add_sz;
10216         if (use_memcpy && last_obj != NULL) {
10217             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10218                 use_memcpy = 0;
10219         }
10220         last_obj = item;
10221     }
10222 
10223     res = PyUnicode_New(sz, maxchar);
10224     if (res == NULL)
10225         goto onError;
10226 
10227     /* Catenate everything. */
10228 #ifdef Py_DEBUG
10229     use_memcpy = 0;
10230 #else
10231     if (use_memcpy) {
10232         res_data = PyUnicode_1BYTE_DATA(res);
10233         kind = PyUnicode_KIND(res);
10234         if (seplen != 0)
10235             sep_data = PyUnicode_1BYTE_DATA(sep);
10236     }
10237 #endif
10238     if (use_memcpy) {
10239         for (i = 0; i < seqlen; ++i) {
10240             Py_ssize_t itemlen;
10241             item = items[i];
10242 
10243             /* Copy item, and maybe the separator. */
10244             if (i && seplen != 0) {
10245                 memcpy(res_data,
10246                           sep_data,
10247                           kind * seplen);
10248                 res_data += kind * seplen;
10249             }
10250 
10251             itemlen = PyUnicode_GET_LENGTH(item);
10252             if (itemlen != 0) {
10253                 memcpy(res_data,
10254                           PyUnicode_DATA(item),
10255                           kind * itemlen);
10256                 res_data += kind * itemlen;
10257             }
10258         }
10259         assert(res_data == PyUnicode_1BYTE_DATA(res)
10260                            + kind * PyUnicode_GET_LENGTH(res));
10261     }
10262     else {
10263         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10264             Py_ssize_t itemlen;
10265             item = items[i];
10266 
10267             /* Copy item, and maybe the separator. */
10268             if (i && seplen != 0) {
10269                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10270                 res_offset += seplen;
10271             }
10272 
10273             itemlen = PyUnicode_GET_LENGTH(item);
10274             if (itemlen != 0) {
10275                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10276                 res_offset += itemlen;
10277             }
10278         }
10279         assert(res_offset == PyUnicode_GET_LENGTH(res));
10280     }
10281 
10282     Py_XDECREF(sep);
10283     assert(_PyUnicode_CheckConsistency(res, 1));
10284     return res;
10285 
10286   onError:
10287     Py_XDECREF(sep);
10288     Py_XDECREF(res);
10289     return NULL;
10290 }
10291 
10292 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10293 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10294                     Py_UCS4 fill_char)
10295 {
10296     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10297     void *data = PyUnicode_DATA(unicode);
10298     assert(PyUnicode_IS_READY(unicode));
10299     assert(unicode_modifiable(unicode));
10300     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10301     assert(start >= 0);
10302     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10303     unicode_fill(kind, data, fill_char, start, length);
10304 }
10305 
10306 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10307 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10308                Py_UCS4 fill_char)
10309 {
10310     Py_ssize_t maxlen;
10311 
10312     if (!PyUnicode_Check(unicode)) {
10313         PyErr_BadInternalCall();
10314         return -1;
10315     }
10316     if (PyUnicode_READY(unicode) == -1)
10317         return -1;
10318     if (unicode_check_modifiable(unicode))
10319         return -1;
10320 
10321     if (start < 0) {
10322         PyErr_SetString(PyExc_IndexError, "string index out of range");
10323         return -1;
10324     }
10325     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10326         PyErr_SetString(PyExc_ValueError,
10327                          "fill character is bigger than "
10328                          "the string maximum character");
10329         return -1;
10330     }
10331 
10332     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10333     length = Py_MIN(maxlen, length);
10334     if (length <= 0)
10335         return 0;
10336 
10337     _PyUnicode_FastFill(unicode, start, length, fill_char);
10338     return length;
10339 }
10340 
10341 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10342 pad(PyObject *self,
10343     Py_ssize_t left,
10344     Py_ssize_t right,
10345     Py_UCS4 fill)
10346 {
10347     PyObject *u;
10348     Py_UCS4 maxchar;
10349     int kind;
10350     void *data;
10351 
10352     if (left < 0)
10353         left = 0;
10354     if (right < 0)
10355         right = 0;
10356 
10357     if (left == 0 && right == 0)
10358         return unicode_result_unchanged(self);
10359 
10360     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10361         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10362         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10363         return NULL;
10364     }
10365     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10366     maxchar = Py_MAX(maxchar, fill);
10367     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10368     if (!u)
10369         return NULL;
10370 
10371     kind = PyUnicode_KIND(u);
10372     data = PyUnicode_DATA(u);
10373     if (left)
10374         unicode_fill(kind, data, fill, 0, left);
10375     if (right)
10376         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10377     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10378     assert(_PyUnicode_CheckConsistency(u, 1));
10379     return u;
10380 }
10381 
10382 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10383 PyUnicode_Splitlines(PyObject *string, int keepends)
10384 {
10385     PyObject *list;
10386 
10387     if (ensure_unicode(string) < 0)
10388         return NULL;
10389 
10390     switch (PyUnicode_KIND(string)) {
10391     case PyUnicode_1BYTE_KIND:
10392         if (PyUnicode_IS_ASCII(string))
10393             list = asciilib_splitlines(
10394                 string, PyUnicode_1BYTE_DATA(string),
10395                 PyUnicode_GET_LENGTH(string), keepends);
10396         else
10397             list = ucs1lib_splitlines(
10398                 string, PyUnicode_1BYTE_DATA(string),
10399                 PyUnicode_GET_LENGTH(string), keepends);
10400         break;
10401     case PyUnicode_2BYTE_KIND:
10402         list = ucs2lib_splitlines(
10403             string, PyUnicode_2BYTE_DATA(string),
10404             PyUnicode_GET_LENGTH(string), keepends);
10405         break;
10406     case PyUnicode_4BYTE_KIND:
10407         list = ucs4lib_splitlines(
10408             string, PyUnicode_4BYTE_DATA(string),
10409             PyUnicode_GET_LENGTH(string), keepends);
10410         break;
10411     default:
10412         Py_UNREACHABLE();
10413     }
10414     return list;
10415 }
10416 
10417 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10418 split(PyObject *self,
10419       PyObject *substring,
10420       Py_ssize_t maxcount)
10421 {
10422     int kind1, kind2;
10423     const void *buf1, *buf2;
10424     Py_ssize_t len1, len2;
10425     PyObject* out;
10426 
10427     if (maxcount < 0)
10428         maxcount = PY_SSIZE_T_MAX;
10429 
10430     if (PyUnicode_READY(self) == -1)
10431         return NULL;
10432 
10433     if (substring == NULL)
10434         switch (PyUnicode_KIND(self)) {
10435         case PyUnicode_1BYTE_KIND:
10436             if (PyUnicode_IS_ASCII(self))
10437                 return asciilib_split_whitespace(
10438                     self,  PyUnicode_1BYTE_DATA(self),
10439                     PyUnicode_GET_LENGTH(self), maxcount
10440                     );
10441             else
10442                 return ucs1lib_split_whitespace(
10443                     self,  PyUnicode_1BYTE_DATA(self),
10444                     PyUnicode_GET_LENGTH(self), maxcount
10445                     );
10446         case PyUnicode_2BYTE_KIND:
10447             return ucs2lib_split_whitespace(
10448                 self,  PyUnicode_2BYTE_DATA(self),
10449                 PyUnicode_GET_LENGTH(self), maxcount
10450                 );
10451         case PyUnicode_4BYTE_KIND:
10452             return ucs4lib_split_whitespace(
10453                 self,  PyUnicode_4BYTE_DATA(self),
10454                 PyUnicode_GET_LENGTH(self), maxcount
10455                 );
10456         default:
10457             Py_UNREACHABLE();
10458         }
10459 
10460     if (PyUnicode_READY(substring) == -1)
10461         return NULL;
10462 
10463     kind1 = PyUnicode_KIND(self);
10464     kind2 = PyUnicode_KIND(substring);
10465     len1 = PyUnicode_GET_LENGTH(self);
10466     len2 = PyUnicode_GET_LENGTH(substring);
10467     if (kind1 < kind2 || len1 < len2) {
10468         out = PyList_New(1);
10469         if (out == NULL)
10470             return NULL;
10471         Py_INCREF(self);
10472         PyList_SET_ITEM(out, 0, self);
10473         return out;
10474     }
10475     buf1 = PyUnicode_DATA(self);
10476     buf2 = PyUnicode_DATA(substring);
10477     if (kind2 != kind1) {
10478         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10479         if (!buf2)
10480             return NULL;
10481     }
10482 
10483     switch (kind1) {
10484     case PyUnicode_1BYTE_KIND:
10485         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10486             out = asciilib_split(
10487                 self,  buf1, len1, buf2, len2, maxcount);
10488         else
10489             out = ucs1lib_split(
10490                 self,  buf1, len1, buf2, len2, maxcount);
10491         break;
10492     case PyUnicode_2BYTE_KIND:
10493         out = ucs2lib_split(
10494             self,  buf1, len1, buf2, len2, maxcount);
10495         break;
10496     case PyUnicode_4BYTE_KIND:
10497         out = ucs4lib_split(
10498             self,  buf1, len1, buf2, len2, maxcount);
10499         break;
10500     default:
10501         out = NULL;
10502     }
10503     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10504     if (kind2 != kind1)
10505         PyMem_Free((void *)buf2);
10506     return out;
10507 }
10508 
10509 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10510 rsplit(PyObject *self,
10511        PyObject *substring,
10512        Py_ssize_t maxcount)
10513 {
10514     int kind1, kind2;
10515     const void *buf1, *buf2;
10516     Py_ssize_t len1, len2;
10517     PyObject* out;
10518 
10519     if (maxcount < 0)
10520         maxcount = PY_SSIZE_T_MAX;
10521 
10522     if (PyUnicode_READY(self) == -1)
10523         return NULL;
10524 
10525     if (substring == NULL)
10526         switch (PyUnicode_KIND(self)) {
10527         case PyUnicode_1BYTE_KIND:
10528             if (PyUnicode_IS_ASCII(self))
10529                 return asciilib_rsplit_whitespace(
10530                     self,  PyUnicode_1BYTE_DATA(self),
10531                     PyUnicode_GET_LENGTH(self), maxcount
10532                     );
10533             else
10534                 return ucs1lib_rsplit_whitespace(
10535                     self,  PyUnicode_1BYTE_DATA(self),
10536                     PyUnicode_GET_LENGTH(self), maxcount
10537                     );
10538         case PyUnicode_2BYTE_KIND:
10539             return ucs2lib_rsplit_whitespace(
10540                 self,  PyUnicode_2BYTE_DATA(self),
10541                 PyUnicode_GET_LENGTH(self), maxcount
10542                 );
10543         case PyUnicode_4BYTE_KIND:
10544             return ucs4lib_rsplit_whitespace(
10545                 self,  PyUnicode_4BYTE_DATA(self),
10546                 PyUnicode_GET_LENGTH(self), maxcount
10547                 );
10548         default:
10549             Py_UNREACHABLE();
10550         }
10551 
10552     if (PyUnicode_READY(substring) == -1)
10553         return NULL;
10554 
10555     kind1 = PyUnicode_KIND(self);
10556     kind2 = PyUnicode_KIND(substring);
10557     len1 = PyUnicode_GET_LENGTH(self);
10558     len2 = PyUnicode_GET_LENGTH(substring);
10559     if (kind1 < kind2 || len1 < len2) {
10560         out = PyList_New(1);
10561         if (out == NULL)
10562             return NULL;
10563         Py_INCREF(self);
10564         PyList_SET_ITEM(out, 0, self);
10565         return out;
10566     }
10567     buf1 = PyUnicode_DATA(self);
10568     buf2 = PyUnicode_DATA(substring);
10569     if (kind2 != kind1) {
10570         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10571         if (!buf2)
10572             return NULL;
10573     }
10574 
10575     switch (kind1) {
10576     case PyUnicode_1BYTE_KIND:
10577         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10578             out = asciilib_rsplit(
10579                 self,  buf1, len1, buf2, len2, maxcount);
10580         else
10581             out = ucs1lib_rsplit(
10582                 self,  buf1, len1, buf2, len2, maxcount);
10583         break;
10584     case PyUnicode_2BYTE_KIND:
10585         out = ucs2lib_rsplit(
10586             self,  buf1, len1, buf2, len2, maxcount);
10587         break;
10588     case PyUnicode_4BYTE_KIND:
10589         out = ucs4lib_rsplit(
10590             self,  buf1, len1, buf2, len2, maxcount);
10591         break;
10592     default:
10593         out = NULL;
10594     }
10595     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10596     if (kind2 != kind1)
10597         PyMem_Free((void *)buf2);
10598     return out;
10599 }
10600 
10601 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10602 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10603             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10604 {
10605     switch (kind) {
10606     case PyUnicode_1BYTE_KIND:
10607         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10608             return asciilib_find(buf1, len1, buf2, len2, offset);
10609         else
10610             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10611     case PyUnicode_2BYTE_KIND:
10612         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10613     case PyUnicode_4BYTE_KIND:
10614         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10615     }
10616     Py_UNREACHABLE();
10617 }
10618 
10619 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10620 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10621              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10622 {
10623     switch (kind) {
10624     case PyUnicode_1BYTE_KIND:
10625         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10626             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10627         else
10628             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10629     case PyUnicode_2BYTE_KIND:
10630         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10631     case PyUnicode_4BYTE_KIND:
10632         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10633     }
10634     Py_UNREACHABLE();
10635 }
10636 
10637 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10638 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10639                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10640 {
10641     int kind = PyUnicode_KIND(u);
10642     void *data = PyUnicode_DATA(u);
10643     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10644     if (kind == PyUnicode_1BYTE_KIND) {
10645         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10646                                       (Py_UCS1 *)data + len,
10647                                       u1, u2, maxcount);
10648     }
10649     else if (kind == PyUnicode_2BYTE_KIND) {
10650         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10651                                       (Py_UCS2 *)data + len,
10652                                       u1, u2, maxcount);
10653     }
10654     else {
10655         assert(kind == PyUnicode_4BYTE_KIND);
10656         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10657                                       (Py_UCS4 *)data + len,
10658                                       u1, u2, maxcount);
10659     }
10660 }
10661 
10662 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10663 replace(PyObject *self, PyObject *str1,
10664         PyObject *str2, Py_ssize_t maxcount)
10665 {
10666     PyObject *u;
10667     const char *sbuf = PyUnicode_DATA(self);
10668     const void *buf1 = PyUnicode_DATA(str1);
10669     const void *buf2 = PyUnicode_DATA(str2);
10670     int srelease = 0, release1 = 0, release2 = 0;
10671     int skind = PyUnicode_KIND(self);
10672     int kind1 = PyUnicode_KIND(str1);
10673     int kind2 = PyUnicode_KIND(str2);
10674     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10675     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10676     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10677     int mayshrink;
10678     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10679 
10680     if (slen < len1)
10681         goto nothing;
10682 
10683     if (maxcount < 0)
10684         maxcount = PY_SSIZE_T_MAX;
10685     else if (maxcount == 0)
10686         goto nothing;
10687 
10688     if (str1 == str2)
10689         goto nothing;
10690 
10691     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10692     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10693     if (maxchar < maxchar_str1)
10694         /* substring too wide to be present */
10695         goto nothing;
10696     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10697     /* Replacing str1 with str2 may cause a maxchar reduction in the
10698        result string. */
10699     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10700     maxchar = Py_MAX(maxchar, maxchar_str2);
10701 
10702     if (len1 == len2) {
10703         /* same length */
10704         if (len1 == 0)
10705             goto nothing;
10706         if (len1 == 1) {
10707             /* replace characters */
10708             Py_UCS4 u1, u2;
10709             Py_ssize_t pos;
10710 
10711             u1 = PyUnicode_READ(kind1, buf1, 0);
10712             pos = findchar(sbuf, skind, slen, u1, 1);
10713             if (pos < 0)
10714                 goto nothing;
10715             u2 = PyUnicode_READ(kind2, buf2, 0);
10716             u = PyUnicode_New(slen, maxchar);
10717             if (!u)
10718                 goto error;
10719 
10720             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10721             replace_1char_inplace(u, pos, u1, u2, maxcount);
10722         }
10723         else {
10724             int rkind = skind;
10725             char *res;
10726             Py_ssize_t i;
10727 
10728             if (kind1 < rkind) {
10729                 /* widen substring */
10730                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10731                 if (!buf1) goto error;
10732                 release1 = 1;
10733             }
10734             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10735             if (i < 0)
10736                 goto nothing;
10737             if (rkind > kind2) {
10738                 /* widen replacement */
10739                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10740                 if (!buf2) goto error;
10741                 release2 = 1;
10742             }
10743             else if (rkind < kind2) {
10744                 /* widen self and buf1 */
10745                 rkind = kind2;
10746                 if (release1) {
10747                     assert(buf1 != PyUnicode_DATA(str1));
10748                     PyMem_Free((void *)buf1);
10749                     buf1 = PyUnicode_DATA(str1);
10750                     release1 = 0;
10751                 }
10752                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10753                 if (!sbuf) goto error;
10754                 srelease = 1;
10755                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10756                 if (!buf1) goto error;
10757                 release1 = 1;
10758             }
10759             u = PyUnicode_New(slen, maxchar);
10760             if (!u)
10761                 goto error;
10762             assert(PyUnicode_KIND(u) == rkind);
10763             res = PyUnicode_DATA(u);
10764 
10765             memcpy(res, sbuf, rkind * slen);
10766             /* change everything in-place, starting with this one */
10767             memcpy(res + rkind * i,
10768                    buf2,
10769                    rkind * len2);
10770             i += len1;
10771 
10772             while ( --maxcount > 0) {
10773                 i = anylib_find(rkind, self,
10774                                 sbuf+rkind*i, slen-i,
10775                                 str1, buf1, len1, i);
10776                 if (i == -1)
10777                     break;
10778                 memcpy(res + rkind * i,
10779                        buf2,
10780                        rkind * len2);
10781                 i += len1;
10782             }
10783         }
10784     }
10785     else {
10786         Py_ssize_t n, i, j, ires;
10787         Py_ssize_t new_size;
10788         int rkind = skind;
10789         char *res;
10790 
10791         if (kind1 < rkind) {
10792             /* widen substring */
10793             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10794             if (!buf1) goto error;
10795             release1 = 1;
10796         }
10797         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10798         if (n == 0)
10799             goto nothing;
10800         if (kind2 < rkind) {
10801             /* widen replacement */
10802             buf2 = unicode_askind(kind2, buf2, len2, rkind);
10803             if (!buf2) goto error;
10804             release2 = 1;
10805         }
10806         else if (kind2 > rkind) {
10807             /* widen self and buf1 */
10808             rkind = kind2;
10809             sbuf = unicode_askind(skind, sbuf, slen, rkind);
10810             if (!sbuf) goto error;
10811             srelease = 1;
10812             if (release1) {
10813                 assert(buf1 != PyUnicode_DATA(str1));
10814                 PyMem_Free((void *)buf1);
10815                 buf1 = PyUnicode_DATA(str1);
10816                 release1 = 0;
10817             }
10818             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10819             if (!buf1) goto error;
10820             release1 = 1;
10821         }
10822         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10823            PyUnicode_GET_LENGTH(str1))); */
10824         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10825                 PyErr_SetString(PyExc_OverflowError,
10826                                 "replace string is too long");
10827                 goto error;
10828         }
10829         new_size = slen + n * (len2 - len1);
10830         if (new_size == 0) {
10831             _Py_INCREF_UNICODE_EMPTY();
10832             if (!unicode_empty)
10833                 goto error;
10834             u = unicode_empty;
10835             goto done;
10836         }
10837         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10838             PyErr_SetString(PyExc_OverflowError,
10839                             "replace string is too long");
10840             goto error;
10841         }
10842         u = PyUnicode_New(new_size, maxchar);
10843         if (!u)
10844             goto error;
10845         assert(PyUnicode_KIND(u) == rkind);
10846         res = PyUnicode_DATA(u);
10847         ires = i = 0;
10848         if (len1 > 0) {
10849             while (n-- > 0) {
10850                 /* look for next match */
10851                 j = anylib_find(rkind, self,
10852                                 sbuf + rkind * i, slen-i,
10853                                 str1, buf1, len1, i);
10854                 if (j == -1)
10855                     break;
10856                 else if (j > i) {
10857                     /* copy unchanged part [i:j] */
10858                     memcpy(res + rkind * ires,
10859                            sbuf + rkind * i,
10860                            rkind * (j-i));
10861                     ires += j - i;
10862                 }
10863                 /* copy substitution string */
10864                 if (len2 > 0) {
10865                     memcpy(res + rkind * ires,
10866                            buf2,
10867                            rkind * len2);
10868                     ires += len2;
10869                 }
10870                 i = j + len1;
10871             }
10872             if (i < slen)
10873                 /* copy tail [i:] */
10874                 memcpy(res + rkind * ires,
10875                        sbuf + rkind * i,
10876                        rkind * (slen-i));
10877         }
10878         else {
10879             /* interleave */
10880             while (n > 0) {
10881                 memcpy(res + rkind * ires,
10882                        buf2,
10883                        rkind * len2);
10884                 ires += len2;
10885                 if (--n <= 0)
10886                     break;
10887                 memcpy(res + rkind * ires,
10888                        sbuf + rkind * i,
10889                        rkind);
10890                 ires++;
10891                 i++;
10892             }
10893             memcpy(res + rkind * ires,
10894                    sbuf + rkind * i,
10895                    rkind * (slen-i));
10896         }
10897     }
10898 
10899     if (mayshrink) {
10900         unicode_adjust_maxchar(&u);
10901         if (u == NULL)
10902             goto error;
10903     }
10904 
10905   done:
10906     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10907     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10908     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10909     if (srelease)
10910         PyMem_FREE((void *)sbuf);
10911     if (release1)
10912         PyMem_FREE((void *)buf1);
10913     if (release2)
10914         PyMem_FREE((void *)buf2);
10915     assert(_PyUnicode_CheckConsistency(u, 1));
10916     return u;
10917 
10918   nothing:
10919     /* nothing to replace; return original string (when possible) */
10920     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10921     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10922     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10923     if (srelease)
10924         PyMem_FREE((void *)sbuf);
10925     if (release1)
10926         PyMem_FREE((void *)buf1);
10927     if (release2)
10928         PyMem_FREE((void *)buf2);
10929     return unicode_result_unchanged(self);
10930 
10931   error:
10932     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10933     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10934     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10935     if (srelease)
10936         PyMem_FREE((void *)sbuf);
10937     if (release1)
10938         PyMem_FREE((void *)buf1);
10939     if (release2)
10940         PyMem_FREE((void *)buf2);
10941     return NULL;
10942 }
10943 
10944 /* --- Unicode Object Methods --------------------------------------------- */
10945 
10946 /*[clinic input]
10947 str.title as unicode_title
10948 
10949 Return a version of the string where each word is titlecased.
10950 
10951 More specifically, words start with uppercased characters and all remaining
10952 cased characters have lower case.
10953 [clinic start generated code]*/
10954 
10955 static PyObject *
unicode_title_impl(PyObject * self)10956 unicode_title_impl(PyObject *self)
10957 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10958 {
10959     if (PyUnicode_READY(self) == -1)
10960         return NULL;
10961     return case_operation(self, do_title);
10962 }
10963 
10964 /*[clinic input]
10965 str.capitalize as unicode_capitalize
10966 
10967 Return a capitalized version of the string.
10968 
10969 More specifically, make the first character have upper case and the rest lower
10970 case.
10971 [clinic start generated code]*/
10972 
10973 static PyObject *
unicode_capitalize_impl(PyObject * self)10974 unicode_capitalize_impl(PyObject *self)
10975 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10976 {
10977     if (PyUnicode_READY(self) == -1)
10978         return NULL;
10979     if (PyUnicode_GET_LENGTH(self) == 0)
10980         return unicode_result_unchanged(self);
10981     return case_operation(self, do_capitalize);
10982 }
10983 
10984 /*[clinic input]
10985 str.casefold as unicode_casefold
10986 
10987 Return a version of the string suitable for caseless comparisons.
10988 [clinic start generated code]*/
10989 
10990 static PyObject *
unicode_casefold_impl(PyObject * self)10991 unicode_casefold_impl(PyObject *self)
10992 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10993 {
10994     if (PyUnicode_READY(self) == -1)
10995         return NULL;
10996     if (PyUnicode_IS_ASCII(self))
10997         return ascii_upper_or_lower(self, 1);
10998     return case_operation(self, do_casefold);
10999 }
11000 
11001 
11002 /* Argument converter. Accepts a single Unicode character. */
11003 
11004 static int
convert_uc(PyObject * obj,void * addr)11005 convert_uc(PyObject *obj, void *addr)
11006 {
11007     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11008 
11009     if (!PyUnicode_Check(obj)) {
11010         PyErr_Format(PyExc_TypeError,
11011                      "The fill character must be a unicode character, "
11012                      "not %.100s", Py_TYPE(obj)->tp_name);
11013         return 0;
11014     }
11015     if (PyUnicode_READY(obj) < 0)
11016         return 0;
11017     if (PyUnicode_GET_LENGTH(obj) != 1) {
11018         PyErr_SetString(PyExc_TypeError,
11019                         "The fill character must be exactly one character long");
11020         return 0;
11021     }
11022     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11023     return 1;
11024 }
11025 
11026 /*[clinic input]
11027 str.center as unicode_center
11028 
11029     width: Py_ssize_t
11030     fillchar: Py_UCS4 = ' '
11031     /
11032 
11033 Return a centered string of length width.
11034 
11035 Padding is done using the specified fill character (default is a space).
11036 [clinic start generated code]*/
11037 
11038 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11039 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11040 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11041 {
11042     Py_ssize_t marg, left;
11043 
11044     if (PyUnicode_READY(self) == -1)
11045         return NULL;
11046 
11047     if (PyUnicode_GET_LENGTH(self) >= width)
11048         return unicode_result_unchanged(self);
11049 
11050     marg = width - PyUnicode_GET_LENGTH(self);
11051     left = marg / 2 + (marg & width & 1);
11052 
11053     return pad(self, left, marg - left, fillchar);
11054 }
11055 
11056 /* This function assumes that str1 and str2 are readied by the caller. */
11057 
11058 static int
unicode_compare(PyObject * str1,PyObject * str2)11059 unicode_compare(PyObject *str1, PyObject *str2)
11060 {
11061 #define COMPARE(TYPE1, TYPE2) \
11062     do { \
11063         TYPE1* p1 = (TYPE1 *)data1; \
11064         TYPE2* p2 = (TYPE2 *)data2; \
11065         TYPE1* end = p1 + len; \
11066         Py_UCS4 c1, c2; \
11067         for (; p1 != end; p1++, p2++) { \
11068             c1 = *p1; \
11069             c2 = *p2; \
11070             if (c1 != c2) \
11071                 return (c1 < c2) ? -1 : 1; \
11072         } \
11073     } \
11074     while (0)
11075 
11076     int kind1, kind2;
11077     const void *data1, *data2;
11078     Py_ssize_t len1, len2, len;
11079 
11080     kind1 = PyUnicode_KIND(str1);
11081     kind2 = PyUnicode_KIND(str2);
11082     data1 = PyUnicode_DATA(str1);
11083     data2 = PyUnicode_DATA(str2);
11084     len1 = PyUnicode_GET_LENGTH(str1);
11085     len2 = PyUnicode_GET_LENGTH(str2);
11086     len = Py_MIN(len1, len2);
11087 
11088     switch(kind1) {
11089     case PyUnicode_1BYTE_KIND:
11090     {
11091         switch(kind2) {
11092         case PyUnicode_1BYTE_KIND:
11093         {
11094             int cmp = memcmp(data1, data2, len);
11095             /* normalize result of memcmp() into the range [-1; 1] */
11096             if (cmp < 0)
11097                 return -1;
11098             if (cmp > 0)
11099                 return 1;
11100             break;
11101         }
11102         case PyUnicode_2BYTE_KIND:
11103             COMPARE(Py_UCS1, Py_UCS2);
11104             break;
11105         case PyUnicode_4BYTE_KIND:
11106             COMPARE(Py_UCS1, Py_UCS4);
11107             break;
11108         default:
11109             Py_UNREACHABLE();
11110         }
11111         break;
11112     }
11113     case PyUnicode_2BYTE_KIND:
11114     {
11115         switch(kind2) {
11116         case PyUnicode_1BYTE_KIND:
11117             COMPARE(Py_UCS2, Py_UCS1);
11118             break;
11119         case PyUnicode_2BYTE_KIND:
11120         {
11121             COMPARE(Py_UCS2, Py_UCS2);
11122             break;
11123         }
11124         case PyUnicode_4BYTE_KIND:
11125             COMPARE(Py_UCS2, Py_UCS4);
11126             break;
11127         default:
11128             Py_UNREACHABLE();
11129         }
11130         break;
11131     }
11132     case PyUnicode_4BYTE_KIND:
11133     {
11134         switch(kind2) {
11135         case PyUnicode_1BYTE_KIND:
11136             COMPARE(Py_UCS4, Py_UCS1);
11137             break;
11138         case PyUnicode_2BYTE_KIND:
11139             COMPARE(Py_UCS4, Py_UCS2);
11140             break;
11141         case PyUnicode_4BYTE_KIND:
11142         {
11143 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11144             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11145             /* normalize result of wmemcmp() into the range [-1; 1] */
11146             if (cmp < 0)
11147                 return -1;
11148             if (cmp > 0)
11149                 return 1;
11150 #else
11151             COMPARE(Py_UCS4, Py_UCS4);
11152 #endif
11153             break;
11154         }
11155         default:
11156             Py_UNREACHABLE();
11157         }
11158         break;
11159     }
11160     default:
11161         Py_UNREACHABLE();
11162     }
11163 
11164     if (len1 == len2)
11165         return 0;
11166     if (len1 < len2)
11167         return -1;
11168     else
11169         return 1;
11170 
11171 #undef COMPARE
11172 }
11173 
11174 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11175 unicode_compare_eq(PyObject *str1, PyObject *str2)
11176 {
11177     int kind;
11178     const void *data1, *data2;
11179     Py_ssize_t len;
11180     int cmp;
11181 
11182     len = PyUnicode_GET_LENGTH(str1);
11183     if (PyUnicode_GET_LENGTH(str2) != len)
11184         return 0;
11185     kind = PyUnicode_KIND(str1);
11186     if (PyUnicode_KIND(str2) != kind)
11187         return 0;
11188     data1 = PyUnicode_DATA(str1);
11189     data2 = PyUnicode_DATA(str2);
11190 
11191     cmp = memcmp(data1, data2, len * kind);
11192     return (cmp == 0);
11193 }
11194 
11195 
11196 int
PyUnicode_Compare(PyObject * left,PyObject * right)11197 PyUnicode_Compare(PyObject *left, PyObject *right)
11198 {
11199     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11200         if (PyUnicode_READY(left) == -1 ||
11201             PyUnicode_READY(right) == -1)
11202             return -1;
11203 
11204         /* a string is equal to itself */
11205         if (left == right)
11206             return 0;
11207 
11208         return unicode_compare(left, right);
11209     }
11210     PyErr_Format(PyExc_TypeError,
11211                  "Can't compare %.100s and %.100s",
11212                  Py_TYPE(left)->tp_name,
11213                  Py_TYPE(right)->tp_name);
11214     return -1;
11215 }
11216 
11217 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11218 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11219 {
11220     Py_ssize_t i;
11221     int kind;
11222     Py_UCS4 chr;
11223     const unsigned char *ustr = (const unsigned char *)str;
11224 
11225     assert(_PyUnicode_CHECK(uni));
11226     if (!PyUnicode_IS_READY(uni)) {
11227         const wchar_t *ws = _PyUnicode_WSTR(uni);
11228         /* Compare Unicode string and source character set string */
11229         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11230             if (chr != ustr[i])
11231                 return (chr < ustr[i]) ? -1 : 1;
11232         }
11233         /* This check keeps Python strings that end in '\0' from comparing equal
11234          to C strings identical up to that point. */
11235         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11236             return 1; /* uni is longer */
11237         if (ustr[i])
11238             return -1; /* str is longer */
11239         return 0;
11240     }
11241     kind = PyUnicode_KIND(uni);
11242     if (kind == PyUnicode_1BYTE_KIND) {
11243         const void *data = PyUnicode_1BYTE_DATA(uni);
11244         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11245         size_t len, len2 = strlen(str);
11246         int cmp;
11247 
11248         len = Py_MIN(len1, len2);
11249         cmp = memcmp(data, str, len);
11250         if (cmp != 0) {
11251             if (cmp < 0)
11252                 return -1;
11253             else
11254                 return 1;
11255         }
11256         if (len1 > len2)
11257             return 1; /* uni is longer */
11258         if (len1 < len2)
11259             return -1; /* str is longer */
11260         return 0;
11261     }
11262     else {
11263         const void *data = PyUnicode_DATA(uni);
11264         /* Compare Unicode string and source character set string */
11265         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11266             if (chr != (unsigned char)str[i])
11267                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11268         /* This check keeps Python strings that end in '\0' from comparing equal
11269          to C strings identical up to that point. */
11270         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11271             return 1; /* uni is longer */
11272         if (str[i])
11273             return -1; /* str is longer */
11274         return 0;
11275     }
11276 }
11277 
11278 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11279 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11280 {
11281     size_t i, len;
11282     const wchar_t *p;
11283     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11284     if (strlen(str) != len)
11285         return 0;
11286     p = _PyUnicode_WSTR(unicode);
11287     assert(p);
11288     for (i = 0; i < len; i++) {
11289         unsigned char c = (unsigned char)str[i];
11290         if (c >= 128 || p[i] != (wchar_t)c)
11291             return 0;
11292     }
11293     return 1;
11294 }
11295 
11296 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11297 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11298 {
11299     size_t len;
11300     assert(_PyUnicode_CHECK(unicode));
11301     assert(str);
11302 #ifndef NDEBUG
11303     for (const char *p = str; *p; p++) {
11304         assert((unsigned char)*p < 128);
11305     }
11306 #endif
11307     if (PyUnicode_READY(unicode) == -1) {
11308         /* Memory error or bad data */
11309         PyErr_Clear();
11310         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11311     }
11312     if (!PyUnicode_IS_ASCII(unicode))
11313         return 0;
11314     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11315     return strlen(str) == len &&
11316            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11317 }
11318 
11319 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11320 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11321 {
11322     PyObject *right_uni;
11323 
11324     assert(_PyUnicode_CHECK(left));
11325     assert(right->string);
11326 #ifndef NDEBUG
11327     for (const char *p = right->string; *p; p++) {
11328         assert((unsigned char)*p < 128);
11329     }
11330 #endif
11331 
11332     if (PyUnicode_READY(left) == -1) {
11333         /* memory error or bad data */
11334         PyErr_Clear();
11335         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11336     }
11337 
11338     if (!PyUnicode_IS_ASCII(left))
11339         return 0;
11340 
11341     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11342     if (right_uni == NULL) {
11343         /* memory error or bad data */
11344         PyErr_Clear();
11345         return _PyUnicode_EqualToASCIIString(left, right->string);
11346     }
11347 
11348     if (left == right_uni)
11349         return 1;
11350 
11351     if (PyUnicode_CHECK_INTERNED(left))
11352         return 0;
11353 
11354 #ifdef INTERNED_STRINGS
11355     assert(_PyUnicode_HASH(right_uni) != -1);
11356     Py_hash_t hash = _PyUnicode_HASH(left);
11357     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11358         return 0;
11359 #endif
11360 
11361     return unicode_compare_eq(left, right_uni);
11362 }
11363 
11364 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11365 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11366 {
11367     int result;
11368 
11369     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11370         Py_RETURN_NOTIMPLEMENTED;
11371 
11372     if (PyUnicode_READY(left) == -1 ||
11373         PyUnicode_READY(right) == -1)
11374         return NULL;
11375 
11376     if (left == right) {
11377         switch (op) {
11378         case Py_EQ:
11379         case Py_LE:
11380         case Py_GE:
11381             /* a string is equal to itself */
11382             Py_RETURN_TRUE;
11383         case Py_NE:
11384         case Py_LT:
11385         case Py_GT:
11386             Py_RETURN_FALSE;
11387         default:
11388             PyErr_BadArgument();
11389             return NULL;
11390         }
11391     }
11392     else if (op == Py_EQ || op == Py_NE) {
11393         result = unicode_compare_eq(left, right);
11394         result ^= (op == Py_NE);
11395         return PyBool_FromLong(result);
11396     }
11397     else {
11398         result = unicode_compare(left, right);
11399         Py_RETURN_RICHCOMPARE(result, 0, op);
11400     }
11401 }
11402 
11403 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11404 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11405 {
11406     return unicode_eq(aa, bb);
11407 }
11408 
11409 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11410 PyUnicode_Contains(PyObject *str, PyObject *substr)
11411 {
11412     int kind1, kind2;
11413     const void *buf1, *buf2;
11414     Py_ssize_t len1, len2;
11415     int result;
11416 
11417     if (!PyUnicode_Check(substr)) {
11418         PyErr_Format(PyExc_TypeError,
11419                      "'in <string>' requires string as left operand, not %.100s",
11420                      Py_TYPE(substr)->tp_name);
11421         return -1;
11422     }
11423     if (PyUnicode_READY(substr) == -1)
11424         return -1;
11425     if (ensure_unicode(str) < 0)
11426         return -1;
11427 
11428     kind1 = PyUnicode_KIND(str);
11429     kind2 = PyUnicode_KIND(substr);
11430     if (kind1 < kind2)
11431         return 0;
11432     len1 = PyUnicode_GET_LENGTH(str);
11433     len2 = PyUnicode_GET_LENGTH(substr);
11434     if (len1 < len2)
11435         return 0;
11436     buf1 = PyUnicode_DATA(str);
11437     buf2 = PyUnicode_DATA(substr);
11438     if (len2 == 1) {
11439         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11440         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11441         return result;
11442     }
11443     if (kind2 != kind1) {
11444         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11445         if (!buf2)
11446             return -1;
11447     }
11448 
11449     switch (kind1) {
11450     case PyUnicode_1BYTE_KIND:
11451         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11452         break;
11453     case PyUnicode_2BYTE_KIND:
11454         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11455         break;
11456     case PyUnicode_4BYTE_KIND:
11457         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11458         break;
11459     default:
11460         Py_UNREACHABLE();
11461     }
11462 
11463     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11464     if (kind2 != kind1)
11465         PyMem_Free((void *)buf2);
11466 
11467     return result;
11468 }
11469 
11470 /* Concat to string or Unicode object giving a new Unicode object. */
11471 
11472 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11473 PyUnicode_Concat(PyObject *left, PyObject *right)
11474 {
11475     PyObject *result;
11476     Py_UCS4 maxchar, maxchar2;
11477     Py_ssize_t left_len, right_len, new_len;
11478 
11479     if (ensure_unicode(left) < 0)
11480         return NULL;
11481 
11482     if (!PyUnicode_Check(right)) {
11483         PyErr_Format(PyExc_TypeError,
11484                      "can only concatenate str (not \"%.200s\") to str",
11485                      Py_TYPE(right)->tp_name);
11486         return NULL;
11487     }
11488     if (PyUnicode_READY(right) < 0)
11489         return NULL;
11490 
11491     /* Shortcuts */
11492     if (left == unicode_empty)
11493         return PyUnicode_FromObject(right);
11494     if (right == unicode_empty)
11495         return PyUnicode_FromObject(left);
11496 
11497     left_len = PyUnicode_GET_LENGTH(left);
11498     right_len = PyUnicode_GET_LENGTH(right);
11499     if (left_len > PY_SSIZE_T_MAX - right_len) {
11500         PyErr_SetString(PyExc_OverflowError,
11501                         "strings are too large to concat");
11502         return NULL;
11503     }
11504     new_len = left_len + right_len;
11505 
11506     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11507     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11508     maxchar = Py_MAX(maxchar, maxchar2);
11509 
11510     /* Concat the two Unicode strings */
11511     result = PyUnicode_New(new_len, maxchar);
11512     if (result == NULL)
11513         return NULL;
11514     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11515     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11516     assert(_PyUnicode_CheckConsistency(result, 1));
11517     return result;
11518 }
11519 
11520 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11521 PyUnicode_Append(PyObject **p_left, PyObject *right)
11522 {
11523     PyObject *left, *res;
11524     Py_UCS4 maxchar, maxchar2;
11525     Py_ssize_t left_len, right_len, new_len;
11526 
11527     if (p_left == NULL) {
11528         if (!PyErr_Occurred())
11529             PyErr_BadInternalCall();
11530         return;
11531     }
11532     left = *p_left;
11533     if (right == NULL || left == NULL
11534         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11535         if (!PyErr_Occurred())
11536             PyErr_BadInternalCall();
11537         goto error;
11538     }
11539 
11540     if (PyUnicode_READY(left) == -1)
11541         goto error;
11542     if (PyUnicode_READY(right) == -1)
11543         goto error;
11544 
11545     /* Shortcuts */
11546     if (left == unicode_empty) {
11547         Py_DECREF(left);
11548         Py_INCREF(right);
11549         *p_left = right;
11550         return;
11551     }
11552     if (right == unicode_empty)
11553         return;
11554 
11555     left_len = PyUnicode_GET_LENGTH(left);
11556     right_len = PyUnicode_GET_LENGTH(right);
11557     if (left_len > PY_SSIZE_T_MAX - right_len) {
11558         PyErr_SetString(PyExc_OverflowError,
11559                         "strings are too large to concat");
11560         goto error;
11561     }
11562     new_len = left_len + right_len;
11563 
11564     if (unicode_modifiable(left)
11565         && PyUnicode_CheckExact(right)
11566         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11567         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11568            to change the structure size, but characters are stored just after
11569            the structure, and so it requires to move all characters which is
11570            not so different than duplicating the string. */
11571         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11572     {
11573         /* append inplace */
11574         if (unicode_resize(p_left, new_len) != 0)
11575             goto error;
11576 
11577         /* copy 'right' into the newly allocated area of 'left' */
11578         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11579     }
11580     else {
11581         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11582         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11583         maxchar = Py_MAX(maxchar, maxchar2);
11584 
11585         /* Concat the two Unicode strings */
11586         res = PyUnicode_New(new_len, maxchar);
11587         if (res == NULL)
11588             goto error;
11589         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11590         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11591         Py_DECREF(left);
11592         *p_left = res;
11593     }
11594     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11595     return;
11596 
11597 error:
11598     Py_CLEAR(*p_left);
11599 }
11600 
11601 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11602 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11603 {
11604     PyUnicode_Append(pleft, right);
11605     Py_XDECREF(right);
11606 }
11607 
11608 /*
11609 Wraps stringlib_parse_args_finds() and additionally ensures that the
11610 first argument is a unicode object.
11611 */
11612 
11613 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11614 parse_args_finds_unicode(const char * function_name, PyObject *args,
11615                          PyObject **substring,
11616                          Py_ssize_t *start, Py_ssize_t *end)
11617 {
11618     if(stringlib_parse_args_finds(function_name, args, substring,
11619                                   start, end)) {
11620         if (ensure_unicode(*substring) < 0)
11621             return 0;
11622         return 1;
11623     }
11624     return 0;
11625 }
11626 
11627 PyDoc_STRVAR(count__doc__,
11628              "S.count(sub[, start[, end]]) -> int\n\
11629 \n\
11630 Return the number of non-overlapping occurrences of substring sub in\n\
11631 string S[start:end].  Optional arguments start and end are\n\
11632 interpreted as in slice notation.");
11633 
11634 static PyObject *
unicode_count(PyObject * self,PyObject * args)11635 unicode_count(PyObject *self, PyObject *args)
11636 {
11637     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11638     Py_ssize_t start = 0;
11639     Py_ssize_t end = PY_SSIZE_T_MAX;
11640     PyObject *result;
11641     int kind1, kind2;
11642     const void *buf1, *buf2;
11643     Py_ssize_t len1, len2, iresult;
11644 
11645     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11646         return NULL;
11647 
11648     kind1 = PyUnicode_KIND(self);
11649     kind2 = PyUnicode_KIND(substring);
11650     if (kind1 < kind2)
11651         return PyLong_FromLong(0);
11652 
11653     len1 = PyUnicode_GET_LENGTH(self);
11654     len2 = PyUnicode_GET_LENGTH(substring);
11655     ADJUST_INDICES(start, end, len1);
11656     if (end - start < len2)
11657         return PyLong_FromLong(0);
11658 
11659     buf1 = PyUnicode_DATA(self);
11660     buf2 = PyUnicode_DATA(substring);
11661     if (kind2 != kind1) {
11662         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11663         if (!buf2)
11664             return NULL;
11665     }
11666     switch (kind1) {
11667     case PyUnicode_1BYTE_KIND:
11668         iresult = ucs1lib_count(
11669             ((const Py_UCS1*)buf1) + start, end - start,
11670             buf2, len2, PY_SSIZE_T_MAX
11671             );
11672         break;
11673     case PyUnicode_2BYTE_KIND:
11674         iresult = ucs2lib_count(
11675             ((const Py_UCS2*)buf1) + start, end - start,
11676             buf2, len2, PY_SSIZE_T_MAX
11677             );
11678         break;
11679     case PyUnicode_4BYTE_KIND:
11680         iresult = ucs4lib_count(
11681             ((const Py_UCS4*)buf1) + start, end - start,
11682             buf2, len2, PY_SSIZE_T_MAX
11683             );
11684         break;
11685     default:
11686         Py_UNREACHABLE();
11687     }
11688 
11689     result = PyLong_FromSsize_t(iresult);
11690 
11691     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11692     if (kind2 != kind1)
11693         PyMem_Free((void *)buf2);
11694 
11695     return result;
11696 }
11697 
11698 /*[clinic input]
11699 str.encode as unicode_encode
11700 
11701     encoding: str(c_default="NULL") = 'utf-8'
11702         The encoding in which to encode the string.
11703     errors: str(c_default="NULL") = 'strict'
11704         The error handling scheme to use for encoding errors.
11705         The default is 'strict' meaning that encoding errors raise a
11706         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11707         'xmlcharrefreplace' as well as any other name registered with
11708         codecs.register_error that can handle UnicodeEncodeErrors.
11709 
11710 Encode the string using the codec registered for encoding.
11711 [clinic start generated code]*/
11712 
11713 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11714 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11715 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11716 {
11717     return PyUnicode_AsEncodedString(self, encoding, errors);
11718 }
11719 
11720 /*[clinic input]
11721 str.expandtabs as unicode_expandtabs
11722 
11723     tabsize: int = 8
11724 
11725 Return a copy where all tab characters are expanded using spaces.
11726 
11727 If tabsize is not given, a tab size of 8 characters is assumed.
11728 [clinic start generated code]*/
11729 
11730 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11731 unicode_expandtabs_impl(PyObject *self, int tabsize)
11732 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11733 {
11734     Py_ssize_t i, j, line_pos, src_len, incr;
11735     Py_UCS4 ch;
11736     PyObject *u;
11737     const void *src_data;
11738     void *dest_data;
11739     int kind;
11740     int found;
11741 
11742     if (PyUnicode_READY(self) == -1)
11743         return NULL;
11744 
11745     /* First pass: determine size of output string */
11746     src_len = PyUnicode_GET_LENGTH(self);
11747     i = j = line_pos = 0;
11748     kind = PyUnicode_KIND(self);
11749     src_data = PyUnicode_DATA(self);
11750     found = 0;
11751     for (; i < src_len; i++) {
11752         ch = PyUnicode_READ(kind, src_data, i);
11753         if (ch == '\t') {
11754             found = 1;
11755             if (tabsize > 0) {
11756                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11757                 if (j > PY_SSIZE_T_MAX - incr)
11758                     goto overflow;
11759                 line_pos += incr;
11760                 j += incr;
11761             }
11762         }
11763         else {
11764             if (j > PY_SSIZE_T_MAX - 1)
11765                 goto overflow;
11766             line_pos++;
11767             j++;
11768             if (ch == '\n' || ch == '\r')
11769                 line_pos = 0;
11770         }
11771     }
11772     if (!found)
11773         return unicode_result_unchanged(self);
11774 
11775     /* Second pass: create output string and fill it */
11776     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11777     if (!u)
11778         return NULL;
11779     dest_data = PyUnicode_DATA(u);
11780 
11781     i = j = line_pos = 0;
11782 
11783     for (; i < src_len; i++) {
11784         ch = PyUnicode_READ(kind, src_data, i);
11785         if (ch == '\t') {
11786             if (tabsize > 0) {
11787                 incr = tabsize - (line_pos % tabsize);
11788                 line_pos += incr;
11789                 unicode_fill(kind, dest_data, ' ', j, incr);
11790                 j += incr;
11791             }
11792         }
11793         else {
11794             line_pos++;
11795             PyUnicode_WRITE(kind, dest_data, j, ch);
11796             j++;
11797             if (ch == '\n' || ch == '\r')
11798                 line_pos = 0;
11799         }
11800     }
11801     assert (j == PyUnicode_GET_LENGTH(u));
11802     return unicode_result(u);
11803 
11804   overflow:
11805     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11806     return NULL;
11807 }
11808 
11809 PyDoc_STRVAR(find__doc__,
11810              "S.find(sub[, start[, end]]) -> int\n\
11811 \n\
11812 Return the lowest index in S where substring sub is found,\n\
11813 such that sub is contained within S[start:end].  Optional\n\
11814 arguments start and end are interpreted as in slice notation.\n\
11815 \n\
11816 Return -1 on failure.");
11817 
11818 static PyObject *
unicode_find(PyObject * self,PyObject * args)11819 unicode_find(PyObject *self, PyObject *args)
11820 {
11821     /* initialize variables to prevent gcc warning */
11822     PyObject *substring = NULL;
11823     Py_ssize_t start = 0;
11824     Py_ssize_t end = 0;
11825     Py_ssize_t result;
11826 
11827     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11828         return NULL;
11829 
11830     if (PyUnicode_READY(self) == -1)
11831         return NULL;
11832 
11833     result = any_find_slice(self, substring, start, end, 1);
11834 
11835     if (result == -2)
11836         return NULL;
11837 
11838     return PyLong_FromSsize_t(result);
11839 }
11840 
11841 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11842 unicode_getitem(PyObject *self, Py_ssize_t index)
11843 {
11844     const void *data;
11845     enum PyUnicode_Kind kind;
11846     Py_UCS4 ch;
11847 
11848     if (!PyUnicode_Check(self)) {
11849         PyErr_BadArgument();
11850         return NULL;
11851     }
11852     if (PyUnicode_READY(self) == -1) {
11853         return NULL;
11854     }
11855     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11856         PyErr_SetString(PyExc_IndexError, "string index out of range");
11857         return NULL;
11858     }
11859     kind = PyUnicode_KIND(self);
11860     data = PyUnicode_DATA(self);
11861     ch = PyUnicode_READ(kind, data, index);
11862     return unicode_char(ch);
11863 }
11864 
11865 /* Believe it or not, this produces the same value for ASCII strings
11866    as bytes_hash(). */
11867 static Py_hash_t
unicode_hash(PyObject * self)11868 unicode_hash(PyObject *self)
11869 {
11870     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11871 
11872 #ifdef Py_DEBUG
11873     assert(_Py_HashSecret_Initialized);
11874 #endif
11875     if (_PyUnicode_HASH(self) != -1)
11876         return _PyUnicode_HASH(self);
11877     if (PyUnicode_READY(self) == -1)
11878         return -1;
11879 
11880     x = _Py_HashBytes(PyUnicode_DATA(self),
11881                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11882     _PyUnicode_HASH(self) = x;
11883     return x;
11884 }
11885 
11886 PyDoc_STRVAR(index__doc__,
11887              "S.index(sub[, start[, end]]) -> int\n\
11888 \n\
11889 Return the lowest index in S where substring sub is found,\n\
11890 such that sub is contained within S[start:end].  Optional\n\
11891 arguments start and end are interpreted as in slice notation.\n\
11892 \n\
11893 Raises ValueError when the substring is not found.");
11894 
11895 static PyObject *
unicode_index(PyObject * self,PyObject * args)11896 unicode_index(PyObject *self, PyObject *args)
11897 {
11898     /* initialize variables to prevent gcc warning */
11899     Py_ssize_t result;
11900     PyObject *substring = NULL;
11901     Py_ssize_t start = 0;
11902     Py_ssize_t end = 0;
11903 
11904     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11905         return NULL;
11906 
11907     if (PyUnicode_READY(self) == -1)
11908         return NULL;
11909 
11910     result = any_find_slice(self, substring, start, end, 1);
11911 
11912     if (result == -2)
11913         return NULL;
11914 
11915     if (result < 0) {
11916         PyErr_SetString(PyExc_ValueError, "substring not found");
11917         return NULL;
11918     }
11919 
11920     return PyLong_FromSsize_t(result);
11921 }
11922 
11923 /*[clinic input]
11924 str.isascii as unicode_isascii
11925 
11926 Return True if all characters in the string are ASCII, False otherwise.
11927 
11928 ASCII characters have code points in the range U+0000-U+007F.
11929 Empty string is ASCII too.
11930 [clinic start generated code]*/
11931 
11932 static PyObject *
unicode_isascii_impl(PyObject * self)11933 unicode_isascii_impl(PyObject *self)
11934 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11935 {
11936     if (PyUnicode_READY(self) == -1) {
11937         return NULL;
11938     }
11939     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11940 }
11941 
11942 /*[clinic input]
11943 str.islower as unicode_islower
11944 
11945 Return True if the string is a lowercase string, False otherwise.
11946 
11947 A string is lowercase if all cased characters in the string are lowercase and
11948 there is at least one cased character in the string.
11949 [clinic start generated code]*/
11950 
11951 static PyObject *
unicode_islower_impl(PyObject * self)11952 unicode_islower_impl(PyObject *self)
11953 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11954 {
11955     Py_ssize_t i, length;
11956     int kind;
11957     const void *data;
11958     int cased;
11959 
11960     if (PyUnicode_READY(self) == -1)
11961         return NULL;
11962     length = PyUnicode_GET_LENGTH(self);
11963     kind = PyUnicode_KIND(self);
11964     data = PyUnicode_DATA(self);
11965 
11966     /* Shortcut for single character strings */
11967     if (length == 1)
11968         return PyBool_FromLong(
11969             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11970 
11971     /* Special case for empty strings */
11972     if (length == 0)
11973         Py_RETURN_FALSE;
11974 
11975     cased = 0;
11976     for (i = 0; i < length; i++) {
11977         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11978 
11979         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11980             Py_RETURN_FALSE;
11981         else if (!cased && Py_UNICODE_ISLOWER(ch))
11982             cased = 1;
11983     }
11984     return PyBool_FromLong(cased);
11985 }
11986 
11987 /*[clinic input]
11988 str.isupper as unicode_isupper
11989 
11990 Return True if the string is an uppercase string, False otherwise.
11991 
11992 A string is uppercase if all cased characters in the string are uppercase and
11993 there is at least one cased character in the string.
11994 [clinic start generated code]*/
11995 
11996 static PyObject *
unicode_isupper_impl(PyObject * self)11997 unicode_isupper_impl(PyObject *self)
11998 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11999 {
12000     Py_ssize_t i, length;
12001     int kind;
12002     const void *data;
12003     int cased;
12004 
12005     if (PyUnicode_READY(self) == -1)
12006         return NULL;
12007     length = PyUnicode_GET_LENGTH(self);
12008     kind = PyUnicode_KIND(self);
12009     data = PyUnicode_DATA(self);
12010 
12011     /* Shortcut for single character strings */
12012     if (length == 1)
12013         return PyBool_FromLong(
12014             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12015 
12016     /* Special case for empty strings */
12017     if (length == 0)
12018         Py_RETURN_FALSE;
12019 
12020     cased = 0;
12021     for (i = 0; i < length; i++) {
12022         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12023 
12024         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12025             Py_RETURN_FALSE;
12026         else if (!cased && Py_UNICODE_ISUPPER(ch))
12027             cased = 1;
12028     }
12029     return PyBool_FromLong(cased);
12030 }
12031 
12032 /*[clinic input]
12033 str.istitle as unicode_istitle
12034 
12035 Return True if the string is a title-cased string, False otherwise.
12036 
12037 In a title-cased string, upper- and title-case characters may only
12038 follow uncased characters and lowercase characters only cased ones.
12039 [clinic start generated code]*/
12040 
12041 static PyObject *
unicode_istitle_impl(PyObject * self)12042 unicode_istitle_impl(PyObject *self)
12043 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12044 {
12045     Py_ssize_t i, length;
12046     int kind;
12047     const void *data;
12048     int cased, previous_is_cased;
12049 
12050     if (PyUnicode_READY(self) == -1)
12051         return NULL;
12052     length = PyUnicode_GET_LENGTH(self);
12053     kind = PyUnicode_KIND(self);
12054     data = PyUnicode_DATA(self);
12055 
12056     /* Shortcut for single character strings */
12057     if (length == 1) {
12058         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12059         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12060                                (Py_UNICODE_ISUPPER(ch) != 0));
12061     }
12062 
12063     /* Special case for empty strings */
12064     if (length == 0)
12065         Py_RETURN_FALSE;
12066 
12067     cased = 0;
12068     previous_is_cased = 0;
12069     for (i = 0; i < length; i++) {
12070         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12071 
12072         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12073             if (previous_is_cased)
12074                 Py_RETURN_FALSE;
12075             previous_is_cased = 1;
12076             cased = 1;
12077         }
12078         else if (Py_UNICODE_ISLOWER(ch)) {
12079             if (!previous_is_cased)
12080                 Py_RETURN_FALSE;
12081             previous_is_cased = 1;
12082             cased = 1;
12083         }
12084         else
12085             previous_is_cased = 0;
12086     }
12087     return PyBool_FromLong(cased);
12088 }
12089 
12090 /*[clinic input]
12091 str.isspace as unicode_isspace
12092 
12093 Return True if the string is a whitespace string, False otherwise.
12094 
12095 A string is whitespace if all characters in the string are whitespace and there
12096 is at least one character in the string.
12097 [clinic start generated code]*/
12098 
12099 static PyObject *
unicode_isspace_impl(PyObject * self)12100 unicode_isspace_impl(PyObject *self)
12101 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12102 {
12103     Py_ssize_t i, length;
12104     int kind;
12105     const void *data;
12106 
12107     if (PyUnicode_READY(self) == -1)
12108         return NULL;
12109     length = PyUnicode_GET_LENGTH(self);
12110     kind = PyUnicode_KIND(self);
12111     data = PyUnicode_DATA(self);
12112 
12113     /* Shortcut for single character strings */
12114     if (length == 1)
12115         return PyBool_FromLong(
12116             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12117 
12118     /* Special case for empty strings */
12119     if (length == 0)
12120         Py_RETURN_FALSE;
12121 
12122     for (i = 0; i < length; i++) {
12123         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12124         if (!Py_UNICODE_ISSPACE(ch))
12125             Py_RETURN_FALSE;
12126     }
12127     Py_RETURN_TRUE;
12128 }
12129 
12130 /*[clinic input]
12131 str.isalpha as unicode_isalpha
12132 
12133 Return True if the string is an alphabetic string, False otherwise.
12134 
12135 A string is alphabetic if all characters in the string are alphabetic and there
12136 is at least one character in the string.
12137 [clinic start generated code]*/
12138 
12139 static PyObject *
unicode_isalpha_impl(PyObject * self)12140 unicode_isalpha_impl(PyObject *self)
12141 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12142 {
12143     Py_ssize_t i, length;
12144     int kind;
12145     const void *data;
12146 
12147     if (PyUnicode_READY(self) == -1)
12148         return NULL;
12149     length = PyUnicode_GET_LENGTH(self);
12150     kind = PyUnicode_KIND(self);
12151     data = PyUnicode_DATA(self);
12152 
12153     /* Shortcut for single character strings */
12154     if (length == 1)
12155         return PyBool_FromLong(
12156             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12157 
12158     /* Special case for empty strings */
12159     if (length == 0)
12160         Py_RETURN_FALSE;
12161 
12162     for (i = 0; i < length; i++) {
12163         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12164             Py_RETURN_FALSE;
12165     }
12166     Py_RETURN_TRUE;
12167 }
12168 
12169 /*[clinic input]
12170 str.isalnum as unicode_isalnum
12171 
12172 Return True if the string is an alpha-numeric string, False otherwise.
12173 
12174 A string is alpha-numeric if all characters in the string are alpha-numeric and
12175 there is at least one character in the string.
12176 [clinic start generated code]*/
12177 
12178 static PyObject *
unicode_isalnum_impl(PyObject * self)12179 unicode_isalnum_impl(PyObject *self)
12180 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12181 {
12182     int kind;
12183     const void *data;
12184     Py_ssize_t len, i;
12185 
12186     if (PyUnicode_READY(self) == -1)
12187         return NULL;
12188 
12189     kind = PyUnicode_KIND(self);
12190     data = PyUnicode_DATA(self);
12191     len = PyUnicode_GET_LENGTH(self);
12192 
12193     /* Shortcut for single character strings */
12194     if (len == 1) {
12195         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12196         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12197     }
12198 
12199     /* Special case for empty strings */
12200     if (len == 0)
12201         Py_RETURN_FALSE;
12202 
12203     for (i = 0; i < len; i++) {
12204         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12205         if (!Py_UNICODE_ISALNUM(ch))
12206             Py_RETURN_FALSE;
12207     }
12208     Py_RETURN_TRUE;
12209 }
12210 
12211 /*[clinic input]
12212 str.isdecimal as unicode_isdecimal
12213 
12214 Return True if the string is a decimal string, False otherwise.
12215 
12216 A string is a decimal string if all characters in the string are decimal and
12217 there is at least one character in the string.
12218 [clinic start generated code]*/
12219 
12220 static PyObject *
unicode_isdecimal_impl(PyObject * self)12221 unicode_isdecimal_impl(PyObject *self)
12222 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12223 {
12224     Py_ssize_t i, length;
12225     int kind;
12226     const void *data;
12227 
12228     if (PyUnicode_READY(self) == -1)
12229         return NULL;
12230     length = PyUnicode_GET_LENGTH(self);
12231     kind = PyUnicode_KIND(self);
12232     data = PyUnicode_DATA(self);
12233 
12234     /* Shortcut for single character strings */
12235     if (length == 1)
12236         return PyBool_FromLong(
12237             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12238 
12239     /* Special case for empty strings */
12240     if (length == 0)
12241         Py_RETURN_FALSE;
12242 
12243     for (i = 0; i < length; i++) {
12244         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12245             Py_RETURN_FALSE;
12246     }
12247     Py_RETURN_TRUE;
12248 }
12249 
12250 /*[clinic input]
12251 str.isdigit as unicode_isdigit
12252 
12253 Return True if the string is a digit string, False otherwise.
12254 
12255 A string is a digit string if all characters in the string are digits and there
12256 is at least one character in the string.
12257 [clinic start generated code]*/
12258 
12259 static PyObject *
unicode_isdigit_impl(PyObject * self)12260 unicode_isdigit_impl(PyObject *self)
12261 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12262 {
12263     Py_ssize_t i, length;
12264     int kind;
12265     const void *data;
12266 
12267     if (PyUnicode_READY(self) == -1)
12268         return NULL;
12269     length = PyUnicode_GET_LENGTH(self);
12270     kind = PyUnicode_KIND(self);
12271     data = PyUnicode_DATA(self);
12272 
12273     /* Shortcut for single character strings */
12274     if (length == 1) {
12275         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12276         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12277     }
12278 
12279     /* Special case for empty strings */
12280     if (length == 0)
12281         Py_RETURN_FALSE;
12282 
12283     for (i = 0; i < length; i++) {
12284         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12285             Py_RETURN_FALSE;
12286     }
12287     Py_RETURN_TRUE;
12288 }
12289 
12290 /*[clinic input]
12291 str.isnumeric as unicode_isnumeric
12292 
12293 Return True if the string is a numeric string, False otherwise.
12294 
12295 A string is numeric if all characters in the string are numeric and there is at
12296 least one character in the string.
12297 [clinic start generated code]*/
12298 
12299 static PyObject *
unicode_isnumeric_impl(PyObject * self)12300 unicode_isnumeric_impl(PyObject *self)
12301 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12302 {
12303     Py_ssize_t i, length;
12304     int kind;
12305     const void *data;
12306 
12307     if (PyUnicode_READY(self) == -1)
12308         return NULL;
12309     length = PyUnicode_GET_LENGTH(self);
12310     kind = PyUnicode_KIND(self);
12311     data = PyUnicode_DATA(self);
12312 
12313     /* Shortcut for single character strings */
12314     if (length == 1)
12315         return PyBool_FromLong(
12316             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12317 
12318     /* Special case for empty strings */
12319     if (length == 0)
12320         Py_RETURN_FALSE;
12321 
12322     for (i = 0; i < length; i++) {
12323         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12324             Py_RETURN_FALSE;
12325     }
12326     Py_RETURN_TRUE;
12327 }
12328 
12329 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12330 _PyUnicode_ScanIdentifier(PyObject *self)
12331 {
12332     Py_ssize_t i;
12333     if (PyUnicode_READY(self) == -1)
12334         return -1;
12335 
12336     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12337     if (len == 0) {
12338         /* an empty string is not a valid identifier */
12339         return 0;
12340     }
12341 
12342     int kind = PyUnicode_KIND(self);
12343     const void *data = PyUnicode_DATA(self);
12344     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12345     /* PEP 3131 says that the first character must be in
12346        XID_Start and subsequent characters in XID_Continue,
12347        and for the ASCII range, the 2.x rules apply (i.e
12348        start with letters and underscore, continue with
12349        letters, digits, underscore). However, given the current
12350        definition of XID_Start and XID_Continue, it is sufficient
12351        to check just for these, except that _ must be allowed
12352        as starting an identifier.  */
12353     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12354         return 0;
12355     }
12356 
12357     for (i = 1; i < len; i++) {
12358         ch = PyUnicode_READ(kind, data, i);
12359         if (!_PyUnicode_IsXidContinue(ch)) {
12360             return i;
12361         }
12362     }
12363     return i;
12364 }
12365 
12366 int
PyUnicode_IsIdentifier(PyObject * self)12367 PyUnicode_IsIdentifier(PyObject *self)
12368 {
12369     if (PyUnicode_IS_READY(self)) {
12370         Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12371         Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12372         /* an empty string is not a valid identifier */
12373         return len && i == len;
12374     }
12375     else {
12376 _Py_COMP_DIAG_PUSH
12377 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12378         Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12379         if (len == 0) {
12380             /* an empty string is not a valid identifier */
12381             return 0;
12382         }
12383 
12384         const wchar_t *wstr = _PyUnicode_WSTR(self);
12385         Py_UCS4 ch = wstr[i++];
12386 #if SIZEOF_WCHAR_T == 2
12387         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12388             && i < len
12389             && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12390         {
12391             ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12392             i++;
12393         }
12394 #endif
12395         if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12396             return 0;
12397         }
12398 
12399         while (i < len) {
12400             ch = wstr[i++];
12401 #if SIZEOF_WCHAR_T == 2
12402             if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12403                 && i < len
12404                 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12405             {
12406                 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12407                 i++;
12408             }
12409 #endif
12410             if (!_PyUnicode_IsXidContinue(ch)) {
12411                 return 0;
12412             }
12413         }
12414         return 1;
12415 _Py_COMP_DIAG_POP
12416     }
12417 }
12418 
12419 /*[clinic input]
12420 str.isidentifier as unicode_isidentifier
12421 
12422 Return True if the string is a valid Python identifier, False otherwise.
12423 
12424 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12425 such as "def" or "class".
12426 [clinic start generated code]*/
12427 
12428 static PyObject *
unicode_isidentifier_impl(PyObject * self)12429 unicode_isidentifier_impl(PyObject *self)
12430 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12431 {
12432     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12433 }
12434 
12435 /*[clinic input]
12436 str.isprintable as unicode_isprintable
12437 
12438 Return True if the string is printable, False otherwise.
12439 
12440 A string is printable if all of its characters are considered printable in
12441 repr() or if it is empty.
12442 [clinic start generated code]*/
12443 
12444 static PyObject *
unicode_isprintable_impl(PyObject * self)12445 unicode_isprintable_impl(PyObject *self)
12446 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12447 {
12448     Py_ssize_t i, length;
12449     int kind;
12450     const void *data;
12451 
12452     if (PyUnicode_READY(self) == -1)
12453         return NULL;
12454     length = PyUnicode_GET_LENGTH(self);
12455     kind = PyUnicode_KIND(self);
12456     data = PyUnicode_DATA(self);
12457 
12458     /* Shortcut for single character strings */
12459     if (length == 1)
12460         return PyBool_FromLong(
12461             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12462 
12463     for (i = 0; i < length; i++) {
12464         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12465             Py_RETURN_FALSE;
12466         }
12467     }
12468     Py_RETURN_TRUE;
12469 }
12470 
12471 /*[clinic input]
12472 str.join as unicode_join
12473 
12474     iterable: object
12475     /
12476 
12477 Concatenate any number of strings.
12478 
12479 The string whose method is called is inserted in between each given string.
12480 The result is returned as a new string.
12481 
12482 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12483 [clinic start generated code]*/
12484 
12485 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12486 unicode_join(PyObject *self, PyObject *iterable)
12487 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12488 {
12489     return PyUnicode_Join(self, iterable);
12490 }
12491 
12492 static Py_ssize_t
unicode_length(PyObject * self)12493 unicode_length(PyObject *self)
12494 {
12495     if (PyUnicode_READY(self) == -1)
12496         return -1;
12497     return PyUnicode_GET_LENGTH(self);
12498 }
12499 
12500 /*[clinic input]
12501 str.ljust as unicode_ljust
12502 
12503     width: Py_ssize_t
12504     fillchar: Py_UCS4 = ' '
12505     /
12506 
12507 Return a left-justified string of length width.
12508 
12509 Padding is done using the specified fill character (default is a space).
12510 [clinic start generated code]*/
12511 
12512 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12513 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12514 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12515 {
12516     if (PyUnicode_READY(self) == -1)
12517         return NULL;
12518 
12519     if (PyUnicode_GET_LENGTH(self) >= width)
12520         return unicode_result_unchanged(self);
12521 
12522     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12523 }
12524 
12525 /*[clinic input]
12526 str.lower as unicode_lower
12527 
12528 Return a copy of the string converted to lowercase.
12529 [clinic start generated code]*/
12530 
12531 static PyObject *
unicode_lower_impl(PyObject * self)12532 unicode_lower_impl(PyObject *self)
12533 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12534 {
12535     if (PyUnicode_READY(self) == -1)
12536         return NULL;
12537     if (PyUnicode_IS_ASCII(self))
12538         return ascii_upper_or_lower(self, 1);
12539     return case_operation(self, do_lower);
12540 }
12541 
/* Strip modes.  Each value doubles as an index into stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the table itself are const so the table cannot be modified by accident. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode `i` (one of the modes above). */
#define STRIPNAME(i) (stripfuncnames[i])
12550 
12551 /* externally visible for str.strip(unicode) */
12552 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12553 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12554 {
12555     const void *data;
12556     int kind;
12557     Py_ssize_t i, j, len;
12558     BLOOM_MASK sepmask;
12559     Py_ssize_t seplen;
12560 
12561     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12562         return NULL;
12563 
12564     kind = PyUnicode_KIND(self);
12565     data = PyUnicode_DATA(self);
12566     len = PyUnicode_GET_LENGTH(self);
12567     seplen = PyUnicode_GET_LENGTH(sepobj);
12568     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12569                               PyUnicode_DATA(sepobj),
12570                               seplen);
12571 
12572     i = 0;
12573     if (striptype != RIGHTSTRIP) {
12574         while (i < len) {
12575             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12576             if (!BLOOM(sepmask, ch))
12577                 break;
12578             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12579                 break;
12580             i++;
12581         }
12582     }
12583 
12584     j = len;
12585     if (striptype != LEFTSTRIP) {
12586         j--;
12587         while (j >= i) {
12588             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12589             if (!BLOOM(sepmask, ch))
12590                 break;
12591             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12592                 break;
12593             j--;
12594         }
12595 
12596         j++;
12597     }
12598 
12599     return PyUnicode_Substring(self, i, j);
12600 }
12601 
12602 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12603 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12604 {
12605     const unsigned char *data;
12606     int kind;
12607     Py_ssize_t length;
12608 
12609     if (PyUnicode_READY(self) == -1)
12610         return NULL;
12611 
12612     length = PyUnicode_GET_LENGTH(self);
12613     end = Py_MIN(end, length);
12614 
12615     if (start == 0 && end == length)
12616         return unicode_result_unchanged(self);
12617 
12618     if (start < 0 || end < 0) {
12619         PyErr_SetString(PyExc_IndexError, "string index out of range");
12620         return NULL;
12621     }
12622     if (start >= length || end < start)
12623         _Py_RETURN_UNICODE_EMPTY();
12624 
12625     length = end - start;
12626     if (PyUnicode_IS_ASCII(self)) {
12627         data = PyUnicode_1BYTE_DATA(self);
12628         return _PyUnicode_FromASCII((const char*)(data + start), length);
12629     }
12630     else {
12631         kind = PyUnicode_KIND(self);
12632         data = PyUnicode_1BYTE_DATA(self);
12633         return PyUnicode_FromKindAndData(kind,
12634                                          data + kind * start,
12635                                          length);
12636     }
12637 }
12638 
12639 static PyObject *
do_strip(PyObject * self,int striptype)12640 do_strip(PyObject *self, int striptype)
12641 {
12642     Py_ssize_t len, i, j;
12643 
12644     if (PyUnicode_READY(self) == -1)
12645         return NULL;
12646 
12647     len = PyUnicode_GET_LENGTH(self);
12648 
12649     if (PyUnicode_IS_ASCII(self)) {
12650         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12651 
12652         i = 0;
12653         if (striptype != RIGHTSTRIP) {
12654             while (i < len) {
12655                 Py_UCS1 ch = data[i];
12656                 if (!_Py_ascii_whitespace[ch])
12657                     break;
12658                 i++;
12659             }
12660         }
12661 
12662         j = len;
12663         if (striptype != LEFTSTRIP) {
12664             j--;
12665             while (j >= i) {
12666                 Py_UCS1 ch = data[j];
12667                 if (!_Py_ascii_whitespace[ch])
12668                     break;
12669                 j--;
12670             }
12671             j++;
12672         }
12673     }
12674     else {
12675         int kind = PyUnicode_KIND(self);
12676         const void *data = PyUnicode_DATA(self);
12677 
12678         i = 0;
12679         if (striptype != RIGHTSTRIP) {
12680             while (i < len) {
12681                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12682                 if (!Py_UNICODE_ISSPACE(ch))
12683                     break;
12684                 i++;
12685             }
12686         }
12687 
12688         j = len;
12689         if (striptype != LEFTSTRIP) {
12690             j--;
12691             while (j >= i) {
12692                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12693                 if (!Py_UNICODE_ISSPACE(ch))
12694                     break;
12695                 j--;
12696             }
12697             j++;
12698         }
12699     }
12700 
12701     return PyUnicode_Substring(self, i, j);
12702 }
12703 
12704 
12705 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12706 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12707 {
12708     if (sep != Py_None) {
12709         if (PyUnicode_Check(sep))
12710             return _PyUnicode_XStrip(self, striptype, sep);
12711         else {
12712             PyErr_Format(PyExc_TypeError,
12713                          "%s arg must be None or str",
12714                          STRIPNAME(striptype));
12715             return NULL;
12716         }
12717     }
12718 
12719     return do_strip(self, striptype);
12720 }
12721 
12722 
12723 /*[clinic input]
12724 str.strip as unicode_strip
12725 
12726     chars: object = None
12727     /
12728 
12729 Return a copy of the string with leading and trailing whitespace removed.
12730 
12731 If chars is given and not None, remove characters in chars instead.
12732 [clinic start generated code]*/
12733 
12734 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12735 unicode_strip_impl(PyObject *self, PyObject *chars)
12736 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12737 {
12738     return do_argstrip(self, BOTHSTRIP, chars);
12739 }
12740 
12741 
12742 /*[clinic input]
12743 str.lstrip as unicode_lstrip
12744 
12745     chars: object = None
12746     /
12747 
12748 Return a copy of the string with leading whitespace removed.
12749 
12750 If chars is given and not None, remove characters in chars instead.
12751 [clinic start generated code]*/
12752 
12753 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12754 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12755 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12756 {
12757     return do_argstrip(self, LEFTSTRIP, chars);
12758 }
12759 
12760 
12761 /*[clinic input]
12762 str.rstrip as unicode_rstrip
12763 
12764     chars: object = None
12765     /
12766 
12767 Return a copy of the string with trailing whitespace removed.
12768 
12769 If chars is given and not None, remove characters in chars instead.
12770 [clinic start generated code]*/
12771 
12772 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12773 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12774 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12775 {
12776     return do_argstrip(self, RIGHTSTRIP, chars);
12777 }
12778 
12779 
12780 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12781 unicode_repeat(PyObject *str, Py_ssize_t len)
12782 {
12783     PyObject *u;
12784     Py_ssize_t nchars, n;
12785 
12786     if (len < 1)
12787         _Py_RETURN_UNICODE_EMPTY();
12788 
12789     /* no repeat, return original string */
12790     if (len == 1)
12791         return unicode_result_unchanged(str);
12792 
12793     if (PyUnicode_READY(str) == -1)
12794         return NULL;
12795 
12796     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12797         PyErr_SetString(PyExc_OverflowError,
12798                         "repeated string is too long");
12799         return NULL;
12800     }
12801     nchars = len * PyUnicode_GET_LENGTH(str);
12802 
12803     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12804     if (!u)
12805         return NULL;
12806     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12807 
12808     if (PyUnicode_GET_LENGTH(str) == 1) {
12809         int kind = PyUnicode_KIND(str);
12810         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12811         if (kind == PyUnicode_1BYTE_KIND) {
12812             void *to = PyUnicode_DATA(u);
12813             memset(to, (unsigned char)fill_char, len);
12814         }
12815         else if (kind == PyUnicode_2BYTE_KIND) {
12816             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12817             for (n = 0; n < len; ++n)
12818                 ucs2[n] = fill_char;
12819         } else {
12820             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12821             assert(kind == PyUnicode_4BYTE_KIND);
12822             for (n = 0; n < len; ++n)
12823                 ucs4[n] = fill_char;
12824         }
12825     }
12826     else {
12827         /* number of characters copied this far */
12828         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12829         Py_ssize_t char_size = PyUnicode_KIND(str);
12830         char *to = (char *) PyUnicode_DATA(u);
12831         memcpy(to, PyUnicode_DATA(str),
12832                   PyUnicode_GET_LENGTH(str) * char_size);
12833         while (done < nchars) {
12834             n = (done <= nchars-done) ? done : nchars-done;
12835             memcpy(to + (done * char_size), to, n * char_size);
12836             done += n;
12837         }
12838     }
12839 
12840     assert(_PyUnicode_CheckConsistency(u, 1));
12841     return u;
12842 }
12843 
12844 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12845 PyUnicode_Replace(PyObject *str,
12846                   PyObject *substr,
12847                   PyObject *replstr,
12848                   Py_ssize_t maxcount)
12849 {
12850     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12851             ensure_unicode(replstr) < 0)
12852         return NULL;
12853     return replace(str, substr, replstr, maxcount);
12854 }
12855 
12856 /*[clinic input]
12857 str.replace as unicode_replace
12858 
12859     old: unicode
12860     new: unicode
12861     count: Py_ssize_t = -1
12862         Maximum number of occurrences to replace.
12863         -1 (the default value) means replace all occurrences.
12864     /
12865 
12866 Return a copy with all occurrences of substring old replaced by new.
12867 
12868 If the optional argument count is given, only the first count occurrences are
12869 replaced.
12870 [clinic start generated code]*/
12871 
12872 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12873 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12874                      Py_ssize_t count)
12875 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12876 {
12877     if (PyUnicode_READY(self) == -1)
12878         return NULL;
12879     return replace(self, old, new, count);
12880 }
12881 
12882 /*[clinic input]
12883 str.removeprefix as unicode_removeprefix
12884 
12885     prefix: unicode
12886     /
12887 
12888 Return a str with the given prefix string removed if present.
12889 
12890 If the string starts with the prefix string, return string[len(prefix):].
12891 Otherwise, return a copy of the original string.
12892 [clinic start generated code]*/
12893 
12894 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12895 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12896 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12897 {
12898     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12899     if (match == -1) {
12900         return NULL;
12901     }
12902     if (match) {
12903         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12904                                    PyUnicode_GET_LENGTH(self));
12905     }
12906     return unicode_result_unchanged(self);
12907 }
12908 
12909 /*[clinic input]
12910 str.removesuffix as unicode_removesuffix
12911 
12912     suffix: unicode
12913     /
12914 
12915 Return a str with the given suffix string removed if present.
12916 
12917 If the string ends with the suffix string and that suffix is not empty,
12918 return string[:-len(suffix)]. Otherwise, return a copy of the original
12919 string.
12920 [clinic start generated code]*/
12921 
12922 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12923 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12924 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12925 {
12926     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12927     if (match == -1) {
12928         return NULL;
12929     }
12930     if (match) {
12931         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12932                                             - PyUnicode_GET_LENGTH(suffix));
12933     }
12934     return unicode_result_unchanged(self);
12935 }
12936 
12937 static PyObject *
unicode_repr(PyObject * unicode)12938 unicode_repr(PyObject *unicode)
12939 {
12940     PyObject *repr;
12941     Py_ssize_t isize;
12942     Py_ssize_t osize, squote, dquote, i, o;
12943     Py_UCS4 max, quote;
12944     int ikind, okind, unchanged;
12945     const void *idata;
12946     void *odata;
12947 
12948     if (PyUnicode_READY(unicode) == -1)
12949         return NULL;
12950 
12951     isize = PyUnicode_GET_LENGTH(unicode);
12952     idata = PyUnicode_DATA(unicode);
12953 
12954     /* Compute length of output, quote characters, and
12955        maximum character */
12956     osize = 0;
12957     max = 127;
12958     squote = dquote = 0;
12959     ikind = PyUnicode_KIND(unicode);
12960     for (i = 0; i < isize; i++) {
12961         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12962         Py_ssize_t incr = 1;
12963         switch (ch) {
12964         case '\'': squote++; break;
12965         case '"':  dquote++; break;
12966         case '\\': case '\t': case '\r': case '\n':
12967             incr = 2;
12968             break;
12969         default:
12970             /* Fast-path ASCII */
12971             if (ch < ' ' || ch == 0x7f)
12972                 incr = 4; /* \xHH */
12973             else if (ch < 0x7f)
12974                 ;
12975             else if (Py_UNICODE_ISPRINTABLE(ch))
12976                 max = ch > max ? ch : max;
12977             else if (ch < 0x100)
12978                 incr = 4; /* \xHH */
12979             else if (ch < 0x10000)
12980                 incr = 6; /* \uHHHH */
12981             else
12982                 incr = 10; /* \uHHHHHHHH */
12983         }
12984         if (osize > PY_SSIZE_T_MAX - incr) {
12985             PyErr_SetString(PyExc_OverflowError,
12986                             "string is too long to generate repr");
12987             return NULL;
12988         }
12989         osize += incr;
12990     }
12991 
12992     quote = '\'';
12993     unchanged = (osize == isize);
12994     if (squote) {
12995         unchanged = 0;
12996         if (dquote)
12997             /* Both squote and dquote present. Use squote,
12998                and escape them */
12999             osize += squote;
13000         else
13001             quote = '"';
13002     }
13003     osize += 2;   /* quotes */
13004 
13005     repr = PyUnicode_New(osize, max);
13006     if (repr == NULL)
13007         return NULL;
13008     okind = PyUnicode_KIND(repr);
13009     odata = PyUnicode_DATA(repr);
13010 
13011     PyUnicode_WRITE(okind, odata, 0, quote);
13012     PyUnicode_WRITE(okind, odata, osize-1, quote);
13013     if (unchanged) {
13014         _PyUnicode_FastCopyCharacters(repr, 1,
13015                                       unicode, 0,
13016                                       isize);
13017     }
13018     else {
13019         for (i = 0, o = 1; i < isize; i++) {
13020             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13021 
13022             /* Escape quotes and backslashes */
13023             if ((ch == quote) || (ch == '\\')) {
13024                 PyUnicode_WRITE(okind, odata, o++, '\\');
13025                 PyUnicode_WRITE(okind, odata, o++, ch);
13026                 continue;
13027             }
13028 
13029             /* Map special whitespace to '\t', \n', '\r' */
13030             if (ch == '\t') {
13031                 PyUnicode_WRITE(okind, odata, o++, '\\');
13032                 PyUnicode_WRITE(okind, odata, o++, 't');
13033             }
13034             else if (ch == '\n') {
13035                 PyUnicode_WRITE(okind, odata, o++, '\\');
13036                 PyUnicode_WRITE(okind, odata, o++, 'n');
13037             }
13038             else if (ch == '\r') {
13039                 PyUnicode_WRITE(okind, odata, o++, '\\');
13040                 PyUnicode_WRITE(okind, odata, o++, 'r');
13041             }
13042 
13043             /* Map non-printable US ASCII to '\xhh' */
13044             else if (ch < ' ' || ch == 0x7F) {
13045                 PyUnicode_WRITE(okind, odata, o++, '\\');
13046                 PyUnicode_WRITE(okind, odata, o++, 'x');
13047                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13048                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13049             }
13050 
13051             /* Copy ASCII characters as-is */
13052             else if (ch < 0x7F) {
13053                 PyUnicode_WRITE(okind, odata, o++, ch);
13054             }
13055 
13056             /* Non-ASCII characters */
13057             else {
13058                 /* Map Unicode whitespace and control characters
13059                    (categories Z* and C* except ASCII space)
13060                 */
13061                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13062                     PyUnicode_WRITE(okind, odata, o++, '\\');
13063                     /* Map 8-bit characters to '\xhh' */
13064                     if (ch <= 0xff) {
13065                         PyUnicode_WRITE(okind, odata, o++, 'x');
13066                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13067                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13068                     }
13069                     /* Map 16-bit characters to '\uxxxx' */
13070                     else if (ch <= 0xffff) {
13071                         PyUnicode_WRITE(okind, odata, o++, 'u');
13072                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13073                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13074                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13075                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13076                     }
13077                     /* Map 21-bit characters to '\U00xxxxxx' */
13078                     else {
13079                         PyUnicode_WRITE(okind, odata, o++, 'U');
13080                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13081                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13082                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13083                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13084                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13085                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13086                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13087                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13088                     }
13089                 }
13090                 /* Copy characters as-is */
13091                 else {
13092                     PyUnicode_WRITE(okind, odata, o++, ch);
13093                 }
13094             }
13095         }
13096     }
13097     /* Closing quote already added at the beginning */
13098     assert(_PyUnicode_CheckConsistency(repr, 1));
13099     return repr;
13100 }
13101 
13102 PyDoc_STRVAR(rfind__doc__,
13103              "S.rfind(sub[, start[, end]]) -> int\n\
13104 \n\
13105 Return the highest index in S where substring sub is found,\n\
13106 such that sub is contained within S[start:end].  Optional\n\
13107 arguments start and end are interpreted as in slice notation.\n\
13108 \n\
13109 Return -1 on failure.");
13110 
13111 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13112 unicode_rfind(PyObject *self, PyObject *args)
13113 {
13114     /* initialize variables to prevent gcc warning */
13115     PyObject *substring = NULL;
13116     Py_ssize_t start = 0;
13117     Py_ssize_t end = 0;
13118     Py_ssize_t result;
13119 
13120     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13121         return NULL;
13122 
13123     if (PyUnicode_READY(self) == -1)
13124         return NULL;
13125 
13126     result = any_find_slice(self, substring, start, end, -1);
13127 
13128     if (result == -2)
13129         return NULL;
13130 
13131     return PyLong_FromSsize_t(result);
13132 }
13133 
13134 PyDoc_STRVAR(rindex__doc__,
13135              "S.rindex(sub[, start[, end]]) -> int\n\
13136 \n\
13137 Return the highest index in S where substring sub is found,\n\
13138 such that sub is contained within S[start:end].  Optional\n\
13139 arguments start and end are interpreted as in slice notation.\n\
13140 \n\
13141 Raises ValueError when the substring is not found.");
13142 
13143 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13144 unicode_rindex(PyObject *self, PyObject *args)
13145 {
13146     /* initialize variables to prevent gcc warning */
13147     PyObject *substring = NULL;
13148     Py_ssize_t start = 0;
13149     Py_ssize_t end = 0;
13150     Py_ssize_t result;
13151 
13152     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13153         return NULL;
13154 
13155     if (PyUnicode_READY(self) == -1)
13156         return NULL;
13157 
13158     result = any_find_slice(self, substring, start, end, -1);
13159 
13160     if (result == -2)
13161         return NULL;
13162 
13163     if (result < 0) {
13164         PyErr_SetString(PyExc_ValueError, "substring not found");
13165         return NULL;
13166     }
13167 
13168     return PyLong_FromSsize_t(result);
13169 }
13170 
13171 /*[clinic input]
13172 str.rjust as unicode_rjust
13173 
13174     width: Py_ssize_t
13175     fillchar: Py_UCS4 = ' '
13176     /
13177 
13178 Return a right-justified string of length width.
13179 
13180 Padding is done using the specified fill character (default is a space).
13181 [clinic start generated code]*/
13182 
13183 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13184 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13185 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13186 {
13187     if (PyUnicode_READY(self) == -1)
13188         return NULL;
13189 
13190     if (PyUnicode_GET_LENGTH(self) >= width)
13191         return unicode_result_unchanged(self);
13192 
13193     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13194 }
13195 
13196 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13197 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13198 {
13199     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13200         return NULL;
13201 
13202     return split(s, sep, maxsplit);
13203 }
13204 
13205 /*[clinic input]
13206 str.split as unicode_split
13207 
13208     sep: object = None
13209         The delimiter according which to split the string.
13210         None (the default value) means split according to any whitespace,
13211         and discard empty strings from the result.
13212     maxsplit: Py_ssize_t = -1
13213         Maximum number of splits to do.
13214         -1 (the default value) means no limit.
13215 
13216 Return a list of the words in the string, using sep as the delimiter string.
13217 [clinic start generated code]*/
13218 
13219 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13220 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13221 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
13222 {
13223     if (sep == Py_None)
13224         return split(self, NULL, maxsplit);
13225     if (PyUnicode_Check(sep))
13226         return split(self, sep, maxsplit);
13227 
13228     PyErr_Format(PyExc_TypeError,
13229                  "must be str or None, not %.100s",
13230                  Py_TYPE(sep)->tp_name);
13231     return NULL;
13232 }
13233 
13234 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13235 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13236 {
13237     PyObject* out;
13238     int kind1, kind2;
13239     const void *buf1, *buf2;
13240     Py_ssize_t len1, len2;
13241 
13242     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13243         return NULL;
13244 
13245     kind1 = PyUnicode_KIND(str_obj);
13246     kind2 = PyUnicode_KIND(sep_obj);
13247     len1 = PyUnicode_GET_LENGTH(str_obj);
13248     len2 = PyUnicode_GET_LENGTH(sep_obj);
13249     if (kind1 < kind2 || len1 < len2) {
13250         _Py_INCREF_UNICODE_EMPTY();
13251         if (!unicode_empty)
13252             out = NULL;
13253         else {
13254             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13255             Py_DECREF(unicode_empty);
13256         }
13257         return out;
13258     }
13259     buf1 = PyUnicode_DATA(str_obj);
13260     buf2 = PyUnicode_DATA(sep_obj);
13261     if (kind2 != kind1) {
13262         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13263         if (!buf2)
13264             return NULL;
13265     }
13266 
13267     switch (kind1) {
13268     case PyUnicode_1BYTE_KIND:
13269         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13270             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13271         else
13272             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13273         break;
13274     case PyUnicode_2BYTE_KIND:
13275         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276         break;
13277     case PyUnicode_4BYTE_KIND:
13278         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279         break;
13280     default:
13281         Py_UNREACHABLE();
13282     }
13283 
13284     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13285     if (kind2 != kind1)
13286         PyMem_Free((void *)buf2);
13287 
13288     return out;
13289 }
13290 
13291 
13292 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13293 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13294 {
13295     PyObject* out;
13296     int kind1, kind2;
13297     const void *buf1, *buf2;
13298     Py_ssize_t len1, len2;
13299 
13300     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13301         return NULL;
13302 
13303     kind1 = PyUnicode_KIND(str_obj);
13304     kind2 = PyUnicode_KIND(sep_obj);
13305     len1 = PyUnicode_GET_LENGTH(str_obj);
13306     len2 = PyUnicode_GET_LENGTH(sep_obj);
13307     if (kind1 < kind2 || len1 < len2) {
13308         _Py_INCREF_UNICODE_EMPTY();
13309         if (!unicode_empty)
13310             out = NULL;
13311         else {
13312             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13313             Py_DECREF(unicode_empty);
13314         }
13315         return out;
13316     }
13317     buf1 = PyUnicode_DATA(str_obj);
13318     buf2 = PyUnicode_DATA(sep_obj);
13319     if (kind2 != kind1) {
13320         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13321         if (!buf2)
13322             return NULL;
13323     }
13324 
13325     switch (kind1) {
13326     case PyUnicode_1BYTE_KIND:
13327         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13328             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13329         else
13330             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13331         break;
13332     case PyUnicode_2BYTE_KIND:
13333         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13334         break;
13335     case PyUnicode_4BYTE_KIND:
13336         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13337         break;
13338     default:
13339         Py_UNREACHABLE();
13340     }
13341 
13342     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13343     if (kind2 != kind1)
13344         PyMem_Free((void *)buf2);
13345 
13346     return out;
13347 }
13348 
13349 /*[clinic input]
13350 str.partition as unicode_partition
13351 
13352     sep: object
13353     /
13354 
13355 Partition the string into three parts using the given separator.
13356 
13357 This will search for the separator in the string.  If the separator is found,
13358 returns a 3-tuple containing the part before the separator, the separator
13359 itself, and the part after it.
13360 
13361 If the separator is not found, returns a 3-tuple containing the original string
13362 and two empty strings.
13363 [clinic start generated code]*/
13364 
13365 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13366 unicode_partition(PyObject *self, PyObject *sep)
13367 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13368 {
13369     return PyUnicode_Partition(self, sep);
13370 }
13371 
13372 /*[clinic input]
13373 str.rpartition as unicode_rpartition = str.partition
13374 
13375 Partition the string into three parts using the given separator.
13376 
13377 This will search for the separator in the string, starting at the end. If
13378 the separator is found, returns a 3-tuple containing the part before the
13379 separator, the separator itself, and the part after it.
13380 
13381 If the separator is not found, returns a 3-tuple containing two empty strings
13382 and the original string.
13383 [clinic start generated code]*/
13384 
13385 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13386 unicode_rpartition(PyObject *self, PyObject *sep)
13387 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13388 {
13389     return PyUnicode_RPartition(self, sep);
13390 }
13391 
13392 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13393 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13394 {
13395     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13396         return NULL;
13397 
13398     return rsplit(s, sep, maxsplit);
13399 }
13400 
13401 /*[clinic input]
13402 str.rsplit as unicode_rsplit = str.split
13403 
13404 Return a list of the words in the string, using sep as the delimiter string.
13405 
13406 Splits are done starting at the end of the string and working to the front.
13407 [clinic start generated code]*/
13408 
13409 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13410 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13411 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13412 {
13413     if (sep == Py_None)
13414         return rsplit(self, NULL, maxsplit);
13415     if (PyUnicode_Check(sep))
13416         return rsplit(self, sep, maxsplit);
13417 
13418     PyErr_Format(PyExc_TypeError,
13419                  "must be str or None, not %.100s",
13420                  Py_TYPE(sep)->tp_name);
13421     return NULL;
13422 }
13423 
13424 /*[clinic input]
13425 str.splitlines as unicode_splitlines
13426 
13427     keepends: bool(accept={int}) = False
13428 
13429 Return a list of the lines in the string, breaking at line boundaries.
13430 
13431 Line breaks are not included in the resulting list unless keepends is given and
13432 true.
13433 [clinic start generated code]*/
13434 
13435 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13436 unicode_splitlines_impl(PyObject *self, int keepends)
13437 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13438 {
13439     return PyUnicode_Splitlines(self, keepends);
13440 }
13441 
13442 static
unicode_str(PyObject * self)13443 PyObject *unicode_str(PyObject *self)
13444 {
13445     return unicode_result_unchanged(self);
13446 }
13447 
13448 /*[clinic input]
13449 str.swapcase as unicode_swapcase
13450 
13451 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13452 [clinic start generated code]*/
13453 
13454 static PyObject *
unicode_swapcase_impl(PyObject * self)13455 unicode_swapcase_impl(PyObject *self)
13456 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13457 {
13458     if (PyUnicode_READY(self) == -1)
13459         return NULL;
13460     return case_operation(self, do_swapcase);
13461 }
13462 
13463 /*[clinic input]
13464 
13465 @staticmethod
13466 str.maketrans as unicode_maketrans
13467 
13468   x: object
13469 
13470   y: unicode=NULL
13471 
13472   z: unicode=NULL
13473 
13474   /
13475 
13476 Return a translation table usable for str.translate().
13477 
13478 If there is only one argument, it must be a dictionary mapping Unicode
13479 ordinals (integers) or characters to Unicode ordinals, strings or None.
13480 Character keys will be then converted to ordinals.
13481 If there are two arguments, they must be strings of equal length, and
13482 in the resulting dictionary, each character in x will be mapped to the
13483 character at the same position in y. If there is a third argument, it
13484 must be a string, whose characters will be mapped to None in the result.
13485 [clinic start generated code]*/
13486 
13487 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13488 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13489 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13490 {
13491     PyObject *new = NULL, *key, *value;
13492     Py_ssize_t i = 0;
13493     int res;
13494 
13495     new = PyDict_New();
13496     if (!new)
13497         return NULL;
13498     if (y != NULL) {
13499         int x_kind, y_kind, z_kind;
13500         const void *x_data, *y_data, *z_data;
13501 
13502         /* x must be a string too, of equal length */
13503         if (!PyUnicode_Check(x)) {
13504             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13505                             "be a string if there is a second argument");
13506             goto err;
13507         }
13508         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13509             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13510                             "arguments must have equal length");
13511             goto err;
13512         }
13513         /* create entries for translating chars in x to those in y */
13514         x_kind = PyUnicode_KIND(x);
13515         y_kind = PyUnicode_KIND(y);
13516         x_data = PyUnicode_DATA(x);
13517         y_data = PyUnicode_DATA(y);
13518         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13519             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13520             if (!key)
13521                 goto err;
13522             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13523             if (!value) {
13524                 Py_DECREF(key);
13525                 goto err;
13526             }
13527             res = PyDict_SetItem(new, key, value);
13528             Py_DECREF(key);
13529             Py_DECREF(value);
13530             if (res < 0)
13531                 goto err;
13532         }
13533         /* create entries for deleting chars in z */
13534         if (z != NULL) {
13535             z_kind = PyUnicode_KIND(z);
13536             z_data = PyUnicode_DATA(z);
13537             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13538                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13539                 if (!key)
13540                     goto err;
13541                 res = PyDict_SetItem(new, key, Py_None);
13542                 Py_DECREF(key);
13543                 if (res < 0)
13544                     goto err;
13545             }
13546         }
13547     } else {
13548         int kind;
13549         const void *data;
13550 
13551         /* x must be a dict */
13552         if (!PyDict_CheckExact(x)) {
13553             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13554                             "to maketrans it must be a dict");
13555             goto err;
13556         }
13557         /* copy entries into the new dict, converting string keys to int keys */
13558         while (PyDict_Next(x, &i, &key, &value)) {
13559             if (PyUnicode_Check(key)) {
13560                 /* convert string keys to integer keys */
13561                 PyObject *newkey;
13562                 if (PyUnicode_GET_LENGTH(key) != 1) {
13563                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13564                                     "table must be of length 1");
13565                     goto err;
13566                 }
13567                 kind = PyUnicode_KIND(key);
13568                 data = PyUnicode_DATA(key);
13569                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13570                 if (!newkey)
13571                     goto err;
13572                 res = PyDict_SetItem(new, newkey, value);
13573                 Py_DECREF(newkey);
13574                 if (res < 0)
13575                     goto err;
13576             } else if (PyLong_Check(key)) {
13577                 /* just keep integer keys */
13578                 if (PyDict_SetItem(new, key, value) < 0)
13579                     goto err;
13580             } else {
13581                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13582                                 "be strings or integers");
13583                 goto err;
13584             }
13585         }
13586     }
13587     return new;
13588   err:
13589     Py_DECREF(new);
13590     return NULL;
13591 }
13592 
13593 /*[clinic input]
13594 str.translate as unicode_translate
13595 
13596     table: object
13597         Translation table, which must be a mapping of Unicode ordinals to
13598         Unicode ordinals, strings, or None.
13599     /
13600 
13601 Replace each character in the string using the given translation table.
13602 
13603 The table must implement lookup/indexing via __getitem__, for instance a
13604 dictionary or list.  If this operation raises LookupError, the character is
13605 left untouched.  Characters mapped to None are deleted.
13606 [clinic start generated code]*/
13607 
13608 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13609 unicode_translate(PyObject *self, PyObject *table)
13610 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13611 {
13612     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13613 }
13614 
13615 /*[clinic input]
13616 str.upper as unicode_upper
13617 
13618 Return a copy of the string converted to uppercase.
13619 [clinic start generated code]*/
13620 
13621 static PyObject *
unicode_upper_impl(PyObject * self)13622 unicode_upper_impl(PyObject *self)
13623 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13624 {
13625     if (PyUnicode_READY(self) == -1)
13626         return NULL;
13627     if (PyUnicode_IS_ASCII(self))
13628         return ascii_upper_or_lower(self, 0);
13629     return case_operation(self, do_upper);
13630 }
13631 
13632 /*[clinic input]
13633 str.zfill as unicode_zfill
13634 
13635     width: Py_ssize_t
13636     /
13637 
13638 Pad a numeric string with zeros on the left, to fill a field of the given width.
13639 
13640 The string is never truncated.
13641 [clinic start generated code]*/
13642 
13643 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13644 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13645 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13646 {
13647     Py_ssize_t fill;
13648     PyObject *u;
13649     int kind;
13650     const void *data;
13651     Py_UCS4 chr;
13652 
13653     if (PyUnicode_READY(self) == -1)
13654         return NULL;
13655 
13656     if (PyUnicode_GET_LENGTH(self) >= width)
13657         return unicode_result_unchanged(self);
13658 
13659     fill = width - PyUnicode_GET_LENGTH(self);
13660 
13661     u = pad(self, fill, 0, '0');
13662 
13663     if (u == NULL)
13664         return NULL;
13665 
13666     kind = PyUnicode_KIND(u);
13667     data = PyUnicode_DATA(u);
13668     chr = PyUnicode_READ(kind, data, fill);
13669 
13670     if (chr == '+' || chr == '-') {
13671         /* move sign to beginning of string */
13672         PyUnicode_WRITE(kind, data, 0, chr);
13673         PyUnicode_WRITE(kind, data, fill, '0');
13674     }
13675 
13676     assert(_PyUnicode_CheckConsistency(u, 1));
13677     return u;
13678 }
13679 
#if 0
/* Compiled out by the surrounding "#if 0": a thin wrapper that would forward
   self to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13687 
13688 PyDoc_STRVAR(startswith__doc__,
13689              "S.startswith(prefix[, start[, end]]) -> bool\n\
13690 \n\
13691 Return True if S starts with the specified prefix, False otherwise.\n\
13692 With optional start, test S beginning at that position.\n\
13693 With optional end, stop comparing S at that position.\n\
13694 prefix can also be a tuple of strings to try.");
13695 
13696 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13697 unicode_startswith(PyObject *self,
13698                    PyObject *args)
13699 {
13700     PyObject *subobj;
13701     PyObject *substring;
13702     Py_ssize_t start = 0;
13703     Py_ssize_t end = PY_SSIZE_T_MAX;
13704     int result;
13705 
13706     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13707         return NULL;
13708     if (PyTuple_Check(subobj)) {
13709         Py_ssize_t i;
13710         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13711             substring = PyTuple_GET_ITEM(subobj, i);
13712             if (!PyUnicode_Check(substring)) {
13713                 PyErr_Format(PyExc_TypeError,
13714                              "tuple for startswith must only contain str, "
13715                              "not %.100s",
13716                              Py_TYPE(substring)->tp_name);
13717                 return NULL;
13718             }
13719             result = tailmatch(self, substring, start, end, -1);
13720             if (result == -1)
13721                 return NULL;
13722             if (result) {
13723                 Py_RETURN_TRUE;
13724             }
13725         }
13726         /* nothing matched */
13727         Py_RETURN_FALSE;
13728     }
13729     if (!PyUnicode_Check(subobj)) {
13730         PyErr_Format(PyExc_TypeError,
13731                      "startswith first arg must be str or "
13732                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13733         return NULL;
13734     }
13735     result = tailmatch(self, subobj, start, end, -1);
13736     if (result == -1)
13737         return NULL;
13738     return PyBool_FromLong(result);
13739 }
13740 
13741 
13742 PyDoc_STRVAR(endswith__doc__,
13743              "S.endswith(suffix[, start[, end]]) -> bool\n\
13744 \n\
13745 Return True if S ends with the specified suffix, False otherwise.\n\
13746 With optional start, test S beginning at that position.\n\
13747 With optional end, stop comparing S at that position.\n\
13748 suffix can also be a tuple of strings to try.");
13749 
13750 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13751 unicode_endswith(PyObject *self,
13752                  PyObject *args)
13753 {
13754     PyObject *subobj;
13755     PyObject *substring;
13756     Py_ssize_t start = 0;
13757     Py_ssize_t end = PY_SSIZE_T_MAX;
13758     int result;
13759 
13760     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13761         return NULL;
13762     if (PyTuple_Check(subobj)) {
13763         Py_ssize_t i;
13764         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13765             substring = PyTuple_GET_ITEM(subobj, i);
13766             if (!PyUnicode_Check(substring)) {
13767                 PyErr_Format(PyExc_TypeError,
13768                              "tuple for endswith must only contain str, "
13769                              "not %.100s",
13770                              Py_TYPE(substring)->tp_name);
13771                 return NULL;
13772             }
13773             result = tailmatch(self, substring, start, end, +1);
13774             if (result == -1)
13775                 return NULL;
13776             if (result) {
13777                 Py_RETURN_TRUE;
13778             }
13779         }
13780         Py_RETURN_FALSE;
13781     }
13782     if (!PyUnicode_Check(subobj)) {
13783         PyErr_Format(PyExc_TypeError,
13784                      "endswith first arg must be str or "
13785                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13786         return NULL;
13787     }
13788     result = tailmatch(self, subobj, start, end, +1);
13789     if (result == -1)
13790         return NULL;
13791     return PyBool_FromLong(result);
13792 }
13793 
13794 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13795 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13796 {
13797     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13798     writer->data = PyUnicode_DATA(writer->buffer);
13799 
13800     if (!writer->readonly) {
13801         writer->kind = PyUnicode_KIND(writer->buffer);
13802         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13803     }
13804     else {
13805         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13806            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13807         writer->kind = PyUnicode_WCHAR_KIND;
13808         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13809 
13810         /* Copy-on-write mode: set buffer size to 0 so
13811          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13812          * next write. */
13813         writer->size = 0;
13814     }
13815 }
13816 
13817 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13818 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13819 {
13820     memset(writer, 0, sizeof(*writer));
13821 
13822     /* ASCII is the bare minimum */
13823     writer->min_char = 127;
13824 
13825     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13826        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13827     writer->kind = PyUnicode_WCHAR_KIND;
13828     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13829 }
13830 
13831 // Initialize _PyUnicodeWriter with initial buffer
13832 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13833 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13834 {
13835     memset(writer, 0, sizeof(*writer));
13836     writer->buffer = buffer;
13837     _PyUnicodeWriter_Update(writer);
13838     writer->min_length = writer->size;
13839 }
13840 
13841 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13842 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13843                                  Py_ssize_t length, Py_UCS4 maxchar)
13844 {
13845     Py_ssize_t newlen;
13846     PyObject *newbuffer;
13847 
13848     assert(maxchar <= MAX_UNICODE);
13849 
13850     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13851     assert((maxchar > writer->maxchar && length >= 0)
13852            || length > 0);
13853 
13854     if (length > PY_SSIZE_T_MAX - writer->pos) {
13855         PyErr_NoMemory();
13856         return -1;
13857     }
13858     newlen = writer->pos + length;
13859 
13860     maxchar = Py_MAX(maxchar, writer->min_char);
13861 
13862     if (writer->buffer == NULL) {
13863         assert(!writer->readonly);
13864         if (writer->overallocate
13865             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13866             /* overallocate to limit the number of realloc() */
13867             newlen += newlen / OVERALLOCATE_FACTOR;
13868         }
13869         if (newlen < writer->min_length)
13870             newlen = writer->min_length;
13871 
13872         writer->buffer = PyUnicode_New(newlen, maxchar);
13873         if (writer->buffer == NULL)
13874             return -1;
13875     }
13876     else if (newlen > writer->size) {
13877         if (writer->overallocate
13878             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13879             /* overallocate to limit the number of realloc() */
13880             newlen += newlen / OVERALLOCATE_FACTOR;
13881         }
13882         if (newlen < writer->min_length)
13883             newlen = writer->min_length;
13884 
13885         if (maxchar > writer->maxchar || writer->readonly) {
13886             /* resize + widen */
13887             maxchar = Py_MAX(maxchar, writer->maxchar);
13888             newbuffer = PyUnicode_New(newlen, maxchar);
13889             if (newbuffer == NULL)
13890                 return -1;
13891             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13892                                           writer->buffer, 0, writer->pos);
13893             Py_DECREF(writer->buffer);
13894             writer->readonly = 0;
13895         }
13896         else {
13897             newbuffer = resize_compact(writer->buffer, newlen);
13898             if (newbuffer == NULL)
13899                 return -1;
13900         }
13901         writer->buffer = newbuffer;
13902     }
13903     else if (maxchar > writer->maxchar) {
13904         assert(!writer->readonly);
13905         newbuffer = PyUnicode_New(writer->size, maxchar);
13906         if (newbuffer == NULL)
13907             return -1;
13908         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13909                                       writer->buffer, 0, writer->pos);
13910         Py_SETREF(writer->buffer, newbuffer);
13911     }
13912     _PyUnicodeWriter_Update(writer);
13913     return 0;
13914 
13915 #undef OVERALLOCATE_FACTOR
13916 }
13917 
13918 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13919 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13920                                      enum PyUnicode_Kind kind)
13921 {
13922     Py_UCS4 maxchar;
13923 
13924     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13925     assert(writer->kind < kind);
13926 
13927     switch (kind)
13928     {
13929     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13930     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13931     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13932     default:
13933         Py_UNREACHABLE();
13934     }
13935 
13936     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13937 }
13938 
13939 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13940 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13941 {
13942     assert(ch <= MAX_UNICODE);
13943     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13944         return -1;
13945     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13946     writer->pos++;
13947     return 0;
13948 }
13949 
13950 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13951 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13952 {
13953     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13954 }
13955 
13956 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13957 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13958 {
13959     Py_UCS4 maxchar;
13960     Py_ssize_t len;
13961 
13962     if (PyUnicode_READY(str) == -1)
13963         return -1;
13964     len = PyUnicode_GET_LENGTH(str);
13965     if (len == 0)
13966         return 0;
13967     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13968     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13969         if (writer->buffer == NULL && !writer->overallocate) {
13970             assert(_PyUnicode_CheckConsistency(str, 1));
13971             writer->readonly = 1;
13972             Py_INCREF(str);
13973             writer->buffer = str;
13974             _PyUnicodeWriter_Update(writer);
13975             writer->pos += len;
13976             return 0;
13977         }
13978         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13979             return -1;
13980     }
13981     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13982                                   str, 0, len);
13983     writer->pos += len;
13984     return 0;
13985 }
13986 
13987 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13988 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13989                                 Py_ssize_t start, Py_ssize_t end)
13990 {
13991     Py_UCS4 maxchar;
13992     Py_ssize_t len;
13993 
13994     if (PyUnicode_READY(str) == -1)
13995         return -1;
13996 
13997     assert(0 <= start);
13998     assert(end <= PyUnicode_GET_LENGTH(str));
13999     assert(start <= end);
14000 
14001     if (end == 0)
14002         return 0;
14003 
14004     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14005         return _PyUnicodeWriter_WriteStr(writer, str);
14006 
14007     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14008         maxchar = _PyUnicode_FindMaxChar(str, start, end);
14009     else
14010         maxchar = writer->maxchar;
14011     len = end - start;
14012 
14013     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14014         return -1;
14015 
14016     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14017                                   str, start, len);
14018     writer->pos += len;
14019     return 0;
14020 }
14021 
14022 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)14023 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14024                                   const char *ascii, Py_ssize_t len)
14025 {
14026     if (len == -1)
14027         len = strlen(ascii);
14028 
14029     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14030 
14031     if (writer->buffer == NULL && !writer->overallocate) {
14032         PyObject *str;
14033 
14034         str = _PyUnicode_FromASCII(ascii, len);
14035         if (str == NULL)
14036             return -1;
14037 
14038         writer->readonly = 1;
14039         writer->buffer = str;
14040         _PyUnicodeWriter_Update(writer);
14041         writer->pos += len;
14042         return 0;
14043     }
14044 
14045     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14046         return -1;
14047 
14048     switch (writer->kind)
14049     {
14050     case PyUnicode_1BYTE_KIND:
14051     {
14052         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14053         Py_UCS1 *data = writer->data;
14054 
14055         memcpy(data + writer->pos, str, len);
14056         break;
14057     }
14058     case PyUnicode_2BYTE_KIND:
14059     {
14060         _PyUnicode_CONVERT_BYTES(
14061             Py_UCS1, Py_UCS2,
14062             ascii, ascii + len,
14063             (Py_UCS2 *)writer->data + writer->pos);
14064         break;
14065     }
14066     case PyUnicode_4BYTE_KIND:
14067     {
14068         _PyUnicode_CONVERT_BYTES(
14069             Py_UCS1, Py_UCS4,
14070             ascii, ascii + len,
14071             (Py_UCS4 *)writer->data + writer->pos);
14072         break;
14073     }
14074     default:
14075         Py_UNREACHABLE();
14076     }
14077 
14078     writer->pos += len;
14079     return 0;
14080 }
14081 
14082 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14083 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14084                                    const char *str, Py_ssize_t len)
14085 {
14086     Py_UCS4 maxchar;
14087 
14088     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14089     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14090         return -1;
14091     unicode_write_cstr(writer->buffer, writer->pos, str, len);
14092     writer->pos += len;
14093     return 0;
14094 }
14095 
14096 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14097 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14098 {
14099     PyObject *str;
14100 
14101     if (writer->pos == 0) {
14102         Py_CLEAR(writer->buffer);
14103         _Py_RETURN_UNICODE_EMPTY();
14104     }
14105 
14106     str = writer->buffer;
14107     writer->buffer = NULL;
14108 
14109     if (writer->readonly) {
14110         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14111         return str;
14112     }
14113 
14114     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14115         PyObject *str2;
14116         str2 = resize_compact(str, writer->pos);
14117         if (str2 == NULL) {
14118             Py_DECREF(str);
14119             return NULL;
14120         }
14121         str = str2;
14122     }
14123 
14124     assert(_PyUnicode_CheckConsistency(str, 1));
14125     return unicode_result_ready(str);
14126 }
14127 
14128 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14129 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14130 {
14131     Py_CLEAR(writer->buffer);
14132 }
14133 
14134 #include "stringlib/unicode_format.h"
14135 
14136 PyDoc_STRVAR(format__doc__,
14137              "S.format(*args, **kwargs) -> str\n\
14138 \n\
14139 Return a formatted version of S, using substitutions from args and kwargs.\n\
14140 The substitutions are identified by braces ('{' and '}').");
14141 
14142 PyDoc_STRVAR(format_map__doc__,
14143              "S.format_map(mapping) -> str\n\
14144 \n\
14145 Return a formatted version of S, using substitutions from mapping.\n\
14146 The substitutions are identified by braces ('{' and '}').");
14147 
14148 /*[clinic input]
14149 str.__format__ as unicode___format__
14150 
14151     format_spec: unicode
14152     /
14153 
14154 Return a formatted version of the string as described by format_spec.
14155 [clinic start generated code]*/
14156 
14157 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14158 unicode___format___impl(PyObject *self, PyObject *format_spec)
14159 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14160 {
14161     _PyUnicodeWriter writer;
14162     int ret;
14163 
14164     if (PyUnicode_READY(self) == -1)
14165         return NULL;
14166     _PyUnicodeWriter_Init(&writer);
14167     ret = _PyUnicode_FormatAdvancedWriter(&writer,
14168                                           self, format_spec, 0,
14169                                           PyUnicode_GET_LENGTH(format_spec));
14170     if (ret == -1) {
14171         _PyUnicodeWriter_Dealloc(&writer);
14172         return NULL;
14173     }
14174     return _PyUnicodeWriter_Finish(&writer);
14175 }
14176 
14177 /*[clinic input]
14178 str.__sizeof__ as unicode_sizeof
14179 
14180 Return the size of the string in memory, in bytes.
14181 [clinic start generated code]*/
14182 
14183 static PyObject *
unicode_sizeof_impl(PyObject * self)14184 unicode_sizeof_impl(PyObject *self)
14185 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14186 {
14187     Py_ssize_t size;
14188 
14189     /* If it's a compact object, account for base structure +
14190        character data. */
14191     if (PyUnicode_IS_COMPACT_ASCII(self))
14192         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14193     else if (PyUnicode_IS_COMPACT(self))
14194         size = sizeof(PyCompactUnicodeObject) +
14195             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14196     else {
14197         /* If it is a two-block object, account for base object, and
14198            for character block if present. */
14199         size = sizeof(PyUnicodeObject);
14200         if (_PyUnicode_DATA_ANY(self))
14201             size += (PyUnicode_GET_LENGTH(self) + 1) *
14202                 PyUnicode_KIND(self);
14203     }
14204     /* If the wstr pointer is present, account for it unless it is shared
14205        with the data pointer. Check if the data is not shared. */
14206     if (_PyUnicode_HAS_WSTR_MEMORY(self))
14207         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14208     if (_PyUnicode_HAS_UTF8_MEMORY(self))
14209         size += PyUnicode_UTF8_LENGTH(self) + 1;
14210 
14211     return PyLong_FromSsize_t(size);
14212 }
14213 
14214 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14215 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14216 {
14217     PyObject *copy = _PyUnicode_Copy(v);
14218     if (!copy)
14219         return NULL;
14220     return Py_BuildValue("(N)", copy);
14221 }
14222 
14223 static PyMethodDef unicode_methods[] = {
14224     UNICODE_ENCODE_METHODDEF
14225     UNICODE_REPLACE_METHODDEF
14226     UNICODE_SPLIT_METHODDEF
14227     UNICODE_RSPLIT_METHODDEF
14228     UNICODE_JOIN_METHODDEF
14229     UNICODE_CAPITALIZE_METHODDEF
14230     UNICODE_CASEFOLD_METHODDEF
14231     UNICODE_TITLE_METHODDEF
14232     UNICODE_CENTER_METHODDEF
14233     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14234     UNICODE_EXPANDTABS_METHODDEF
14235     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14236     UNICODE_PARTITION_METHODDEF
14237     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14238     UNICODE_LJUST_METHODDEF
14239     UNICODE_LOWER_METHODDEF
14240     UNICODE_LSTRIP_METHODDEF
14241     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14242     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14243     UNICODE_RJUST_METHODDEF
14244     UNICODE_RSTRIP_METHODDEF
14245     UNICODE_RPARTITION_METHODDEF
14246     UNICODE_SPLITLINES_METHODDEF
14247     UNICODE_STRIP_METHODDEF
14248     UNICODE_SWAPCASE_METHODDEF
14249     UNICODE_TRANSLATE_METHODDEF
14250     UNICODE_UPPER_METHODDEF
14251     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14252     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14253     UNICODE_REMOVEPREFIX_METHODDEF
14254     UNICODE_REMOVESUFFIX_METHODDEF
14255     UNICODE_ISASCII_METHODDEF
14256     UNICODE_ISLOWER_METHODDEF
14257     UNICODE_ISUPPER_METHODDEF
14258     UNICODE_ISTITLE_METHODDEF
14259     UNICODE_ISSPACE_METHODDEF
14260     UNICODE_ISDECIMAL_METHODDEF
14261     UNICODE_ISDIGIT_METHODDEF
14262     UNICODE_ISNUMERIC_METHODDEF
14263     UNICODE_ISALPHA_METHODDEF
14264     UNICODE_ISALNUM_METHODDEF
14265     UNICODE_ISIDENTIFIER_METHODDEF
14266     UNICODE_ISPRINTABLE_METHODDEF
14267     UNICODE_ZFILL_METHODDEF
14268     {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
14269     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14270     UNICODE___FORMAT___METHODDEF
14271     UNICODE_MAKETRANS_METHODDEF
14272     UNICODE_SIZEOF_METHODDEF
14273 #if 0
14274     /* These methods are just used for debugging the implementation. */
14275     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
14276 #endif
14277 
14278     {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14279     {NULL, NULL}
14280 };
14281 
14282 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14283 unicode_mod(PyObject *v, PyObject *w)
14284 {
14285     if (!PyUnicode_Check(v))
14286         Py_RETURN_NOTIMPLEMENTED;
14287     return PyUnicode_Format(v, w);
14288 }
14289 
14290 static PyNumberMethods unicode_as_number = {
14291     0,              /*nb_add*/
14292     0,              /*nb_subtract*/
14293     0,              /*nb_multiply*/
14294     unicode_mod,            /*nb_remainder*/
14295 };
14296 
14297 static PySequenceMethods unicode_as_sequence = {
14298     (lenfunc) unicode_length,       /* sq_length */
14299     PyUnicode_Concat,           /* sq_concat */
14300     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14301     (ssizeargfunc) unicode_getitem,     /* sq_item */
14302     0,                  /* sq_slice */
14303     0,                  /* sq_ass_item */
14304     0,                  /* sq_ass_slice */
14305     PyUnicode_Contains,         /* sq_contains */
14306 };
14307 
14308 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14309 unicode_subscript(PyObject* self, PyObject* item)
14310 {
14311     if (PyUnicode_READY(self) == -1)
14312         return NULL;
14313 
14314     if (_PyIndex_Check(item)) {
14315         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14316         if (i == -1 && PyErr_Occurred())
14317             return NULL;
14318         if (i < 0)
14319             i += PyUnicode_GET_LENGTH(self);
14320         return unicode_getitem(self, i);
14321     } else if (PySlice_Check(item)) {
14322         Py_ssize_t start, stop, step, slicelength, i;
14323         size_t cur;
14324         PyObject *result;
14325         const void *src_data;
14326         void *dest_data;
14327         int src_kind, dest_kind;
14328         Py_UCS4 ch, max_char, kind_limit;
14329 
14330         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14331             return NULL;
14332         }
14333         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14334                                             &start, &stop, step);
14335 
14336         if (slicelength <= 0) {
14337             _Py_RETURN_UNICODE_EMPTY();
14338         } else if (start == 0 && step == 1 &&
14339                    slicelength == PyUnicode_GET_LENGTH(self)) {
14340             return unicode_result_unchanged(self);
14341         } else if (step == 1) {
14342             return PyUnicode_Substring(self,
14343                                        start, start + slicelength);
14344         }
14345         /* General case */
14346         src_kind = PyUnicode_KIND(self);
14347         src_data = PyUnicode_DATA(self);
14348         if (!PyUnicode_IS_ASCII(self)) {
14349             kind_limit = kind_maxchar_limit(src_kind);
14350             max_char = 0;
14351             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14352                 ch = PyUnicode_READ(src_kind, src_data, cur);
14353                 if (ch > max_char) {
14354                     max_char = ch;
14355                     if (max_char >= kind_limit)
14356                         break;
14357                 }
14358             }
14359         }
14360         else
14361             max_char = 127;
14362         result = PyUnicode_New(slicelength, max_char);
14363         if (result == NULL)
14364             return NULL;
14365         dest_kind = PyUnicode_KIND(result);
14366         dest_data = PyUnicode_DATA(result);
14367 
14368         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14369             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14370             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14371         }
14372         assert(_PyUnicode_CheckConsistency(result, 1));
14373         return result;
14374     } else {
14375         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14376         return NULL;
14377     }
14378 }
14379 
14380 static PyMappingMethods unicode_as_mapping = {
14381     (lenfunc)unicode_length,        /* mp_length */
14382     (binaryfunc)unicode_subscript,  /* mp_subscript */
14383     (objobjargproc)0,           /* mp_ass_subscript */
14384 };
14385 
14386 
14387 /* Helpers for PyUnicode_Format() */
14388 
14389 struct unicode_formatter_t {
14390     PyObject *args;
14391     int args_owned;
14392     Py_ssize_t arglen, argidx;
14393     PyObject *dict;
14394 
14395     enum PyUnicode_Kind fmtkind;
14396     Py_ssize_t fmtcnt, fmtpos;
14397     const void *fmtdata;
14398     PyObject *fmtstr;
14399 
14400     _PyUnicodeWriter writer;
14401 };
14402 
14403 struct unicode_format_arg_t {
14404     Py_UCS4 ch;
14405     int flags;
14406     Py_ssize_t width;
14407     int prec;
14408     int sign;
14409 };
14410 
14411 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14412 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14413 {
14414     Py_ssize_t argidx = ctx->argidx;
14415 
14416     if (argidx < ctx->arglen) {
14417         ctx->argidx++;
14418         if (ctx->arglen < 0)
14419             return ctx->args;
14420         else
14421             return PyTuple_GetItem(ctx->args, argidx);
14422     }
14423     PyErr_SetString(PyExc_TypeError,
14424                     "not enough arguments for format string");
14425     return NULL;
14426 }
14427 
14428 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14429 
14430 /* Format a float into the writer if the writer is not NULL, or into *p_output
14431    otherwise.
14432 
14433    Return 0 on success, raise an exception and return -1 on error. */
14434 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14435 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14436             PyObject **p_output,
14437             _PyUnicodeWriter *writer)
14438 {
14439     char *p;
14440     double x;
14441     Py_ssize_t len;
14442     int prec;
14443     int dtoa_flags;
14444 
14445     x = PyFloat_AsDouble(v);
14446     if (x == -1.0 && PyErr_Occurred())
14447         return -1;
14448 
14449     prec = arg->prec;
14450     if (prec < 0)
14451         prec = 6;
14452 
14453     if (arg->flags & F_ALT)
14454         dtoa_flags = Py_DTSF_ALT;
14455     else
14456         dtoa_flags = 0;
14457     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14458     if (p == NULL)
14459         return -1;
14460     len = strlen(p);
14461     if (writer) {
14462         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14463             PyMem_Free(p);
14464             return -1;
14465         }
14466     }
14467     else
14468         *p_output = _PyUnicode_FromASCII(p, len);
14469     PyMem_Free(p);
14470     return 0;
14471 }
14472 
14473 /* formatlong() emulates the format codes d, u, o, x and X, and
14474  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14475  * Python's regular ints.
14476  * Return value:  a new PyUnicodeObject*, or NULL if error.
14477  *     The output string is of the form
14478  *         "-"? ("0x" | "0X")? digit+
14479  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14480  *         set in flags.  The case of hex digits will be correct,
14481  *     There will be at least prec digits, zero-filled on the left if
14482  *         necessary to get that many.
14483  * val          object to be converted
14484  * flags        bitmask of format flags; only F_ALT is looked at
14485  * prec         minimum number of digits; 0-fill on left if needed
14486  * type         a character in [duoxX]; u acts the same as d
14487  *
14488  * CAUTION:  o, x and X conversions on regular ints can never
14489  * produce a '-' sign, but can for Python's unbounded ints.
14490  */
14491 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14492 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14493 {
14494     PyObject *result = NULL;
14495     char *buf;
14496     Py_ssize_t i;
14497     int sign;           /* 1 if '-', else 0 */
14498     int len;            /* number of characters */
14499     Py_ssize_t llen;
14500     int numdigits;      /* len == numnondigits + numdigits */
14501     int numnondigits = 0;
14502 
14503     /* Avoid exceeding SSIZE_T_MAX */
14504     if (prec > INT_MAX-3) {
14505         PyErr_SetString(PyExc_OverflowError,
14506                         "precision too large");
14507         return NULL;
14508     }
14509 
14510     assert(PyLong_Check(val));
14511 
14512     switch (type) {
14513     default:
14514         Py_UNREACHABLE();
14515     case 'd':
14516     case 'i':
14517     case 'u':
14518         /* int and int subclasses should print numerically when a numeric */
14519         /* format code is used (see issue18780) */
14520         result = PyNumber_ToBase(val, 10);
14521         break;
14522     case 'o':
14523         numnondigits = 2;
14524         result = PyNumber_ToBase(val, 8);
14525         break;
14526     case 'x':
14527     case 'X':
14528         numnondigits = 2;
14529         result = PyNumber_ToBase(val, 16);
14530         break;
14531     }
14532     if (!result)
14533         return NULL;
14534 
14535     assert(unicode_modifiable(result));
14536     assert(PyUnicode_IS_READY(result));
14537     assert(PyUnicode_IS_ASCII(result));
14538 
14539     /* To modify the string in-place, there can only be one reference. */
14540     if (Py_REFCNT(result) != 1) {
14541         Py_DECREF(result);
14542         PyErr_BadInternalCall();
14543         return NULL;
14544     }
14545     buf = PyUnicode_DATA(result);
14546     llen = PyUnicode_GET_LENGTH(result);
14547     if (llen > INT_MAX) {
14548         Py_DECREF(result);
14549         PyErr_SetString(PyExc_ValueError,
14550                         "string too large in _PyUnicode_FormatLong");
14551         return NULL;
14552     }
14553     len = (int)llen;
14554     sign = buf[0] == '-';
14555     numnondigits += sign;
14556     numdigits = len - numnondigits;
14557     assert(numdigits > 0);
14558 
14559     /* Get rid of base marker unless F_ALT */
14560     if (((alt) == 0 &&
14561         (type == 'o' || type == 'x' || type == 'X'))) {
14562         assert(buf[sign] == '0');
14563         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14564                buf[sign+1] == 'o');
14565         numnondigits -= 2;
14566         buf += 2;
14567         len -= 2;
14568         if (sign)
14569             buf[0] = '-';
14570         assert(len == numnondigits + numdigits);
14571         assert(numdigits > 0);
14572     }
14573 
14574     /* Fill with leading zeroes to meet minimum width. */
14575     if (prec > numdigits) {
14576         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14577                                 numnondigits + prec);
14578         char *b1;
14579         if (!r1) {
14580             Py_DECREF(result);
14581             return NULL;
14582         }
14583         b1 = PyBytes_AS_STRING(r1);
14584         for (i = 0; i < numnondigits; ++i)
14585             *b1++ = *buf++;
14586         for (i = 0; i < prec - numdigits; i++)
14587             *b1++ = '0';
14588         for (i = 0; i < numdigits; i++)
14589             *b1++ = *buf++;
14590         *b1 = '\0';
14591         Py_DECREF(result);
14592         result = r1;
14593         buf = PyBytes_AS_STRING(result);
14594         len = numnondigits + prec;
14595     }
14596 
14597     /* Fix up case for hex conversions. */
14598     if (type == 'X') {
14599         /* Need to convert all lower case letters to upper case.
14600            and need to convert 0x to 0X (and -0x to -0X). */
14601         for (i = 0; i < len; i++)
14602             if (buf[i] >= 'a' && buf[i] <= 'x')
14603                 buf[i] -= 'a'-'A';
14604     }
14605     if (!PyUnicode_Check(result)
14606         || buf != PyUnicode_DATA(result)) {
14607         PyObject *unicode;
14608         unicode = _PyUnicode_FromASCII(buf, len);
14609         Py_DECREF(result);
14610         result = unicode;
14611     }
14612     else if (len != PyUnicode_GET_LENGTH(result)) {
14613         if (PyUnicode_Resize(&result, len) < 0)
14614             Py_CLEAR(result);
14615     }
14616     return result;
14617 }
14618 
14619 /* Format an integer or a float as an integer.
14620  * Return 1 if the number has been formatted into the writer,
14621  *        0 if the number has been formatted into *p_output
14622  *       -1 and raise an exception on error */
14623 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14624 mainformatlong(PyObject *v,
14625                struct unicode_format_arg_t *arg,
14626                PyObject **p_output,
14627                _PyUnicodeWriter *writer)
14628 {
14629     PyObject *iobj, *res;
14630     char type = (char)arg->ch;
14631 
14632     if (!PyNumber_Check(v))
14633         goto wrongtype;
14634 
14635     /* make sure number is a type of integer for o, x, and X */
14636     if (!PyLong_Check(v)) {
14637         if (type == 'o' || type == 'x' || type == 'X') {
14638             iobj = PyNumber_Index(v);
14639             if (iobj == NULL) {
14640                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14641                     goto wrongtype;
14642                 return -1;
14643             }
14644         }
14645         else {
14646             iobj = PyNumber_Long(v);
14647             if (iobj == NULL ) {
14648                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14649                     goto wrongtype;
14650                 return -1;
14651             }
14652         }
14653         assert(PyLong_Check(iobj));
14654     }
14655     else {
14656         iobj = v;
14657         Py_INCREF(iobj);
14658     }
14659 
14660     if (PyLong_CheckExact(v)
14661         && arg->width == -1 && arg->prec == -1
14662         && !(arg->flags & (F_SIGN | F_BLANK))
14663         && type != 'X')
14664     {
14665         /* Fast path */
14666         int alternate = arg->flags & F_ALT;
14667         int base;
14668 
14669         switch(type)
14670         {
14671             default:
14672                 Py_UNREACHABLE();
14673             case 'd':
14674             case 'i':
14675             case 'u':
14676                 base = 10;
14677                 break;
14678             case 'o':
14679                 base = 8;
14680                 break;
14681             case 'x':
14682             case 'X':
14683                 base = 16;
14684                 break;
14685         }
14686 
14687         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14688             Py_DECREF(iobj);
14689             return -1;
14690         }
14691         Py_DECREF(iobj);
14692         return 1;
14693     }
14694 
14695     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14696     Py_DECREF(iobj);
14697     if (res == NULL)
14698         return -1;
14699     *p_output = res;
14700     return 0;
14701 
14702 wrongtype:
14703     switch(type)
14704     {
14705         case 'o':
14706         case 'x':
14707         case 'X':
14708             PyErr_Format(PyExc_TypeError,
14709                     "%%%c format: an integer is required, "
14710                     "not %.200s",
14711                     type, Py_TYPE(v)->tp_name);
14712             break;
14713         default:
14714             PyErr_Format(PyExc_TypeError,
14715                     "%%%c format: a number is required, "
14716                     "not %.200s",
14717                     type, Py_TYPE(v)->tp_name);
14718             break;
14719     }
14720     return -1;
14721 }
14722 
14723 static Py_UCS4
formatchar(PyObject * v)14724 formatchar(PyObject *v)
14725 {
14726     /* presume that the buffer is at least 3 characters long */
14727     if (PyUnicode_Check(v)) {
14728         if (PyUnicode_GET_LENGTH(v) == 1) {
14729             return PyUnicode_READ_CHAR(v, 0);
14730         }
14731         goto onError;
14732     }
14733     else {
14734         PyObject *iobj;
14735         long x;
14736         /* make sure number is a type of integer */
14737         if (!PyLong_Check(v)) {
14738             iobj = PyNumber_Index(v);
14739             if (iobj == NULL) {
14740                 goto onError;
14741             }
14742             x = PyLong_AsLong(iobj);
14743             Py_DECREF(iobj);
14744         }
14745         else {
14746             x = PyLong_AsLong(v);
14747         }
14748         if (x == -1 && PyErr_Occurred())
14749             goto onError;
14750 
14751         if (x < 0 || x > MAX_UNICODE) {
14752             PyErr_SetString(PyExc_OverflowError,
14753                             "%c arg not in range(0x110000)");
14754             return (Py_UCS4) -1;
14755         }
14756 
14757         return (Py_UCS4) x;
14758     }
14759 
14760   onError:
14761     PyErr_SetString(PyExc_TypeError,
14762                     "%c requires int or char");
14763     return (Py_UCS4) -1;
14764 }
14765 
14766 /* Parse options of an argument: flags, width, precision.
14767    Handle also "%(name)" syntax.
14768 
14769    Return 0 if the argument has been formatted into arg->str.
14770    Return 1 if the argument has been written into ctx->writer,
14771    Raise an exception and return -1 on error. */
14772 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14773 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14774                          struct unicode_format_arg_t *arg)
14775 {
14776 #define FORMAT_READ(ctx) \
14777         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14778 
14779     PyObject *v;
14780 
14781     if (arg->ch == '(') {
14782         /* Get argument value from a dictionary. Example: "%(name)s". */
14783         Py_ssize_t keystart;
14784         Py_ssize_t keylen;
14785         PyObject *key;
14786         int pcount = 1;
14787 
14788         if (ctx->dict == NULL) {
14789             PyErr_SetString(PyExc_TypeError,
14790                             "format requires a mapping");
14791             return -1;
14792         }
14793         ++ctx->fmtpos;
14794         --ctx->fmtcnt;
14795         keystart = ctx->fmtpos;
14796         /* Skip over balanced parentheses */
14797         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14798             arg->ch = FORMAT_READ(ctx);
14799             if (arg->ch == ')')
14800                 --pcount;
14801             else if (arg->ch == '(')
14802                 ++pcount;
14803             ctx->fmtpos++;
14804         }
14805         keylen = ctx->fmtpos - keystart - 1;
14806         if (ctx->fmtcnt < 0 || pcount > 0) {
14807             PyErr_SetString(PyExc_ValueError,
14808                             "incomplete format key");
14809             return -1;
14810         }
14811         key = PyUnicode_Substring(ctx->fmtstr,
14812                                   keystart, keystart + keylen);
14813         if (key == NULL)
14814             return -1;
14815         if (ctx->args_owned) {
14816             ctx->args_owned = 0;
14817             Py_DECREF(ctx->args);
14818         }
14819         ctx->args = PyObject_GetItem(ctx->dict, key);
14820         Py_DECREF(key);
14821         if (ctx->args == NULL)
14822             return -1;
14823         ctx->args_owned = 1;
14824         ctx->arglen = -1;
14825         ctx->argidx = -2;
14826     }
14827 
14828     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14829     while (--ctx->fmtcnt >= 0) {
14830         arg->ch = FORMAT_READ(ctx);
14831         ctx->fmtpos++;
14832         switch (arg->ch) {
14833         case '-': arg->flags |= F_LJUST; continue;
14834         case '+': arg->flags |= F_SIGN; continue;
14835         case ' ': arg->flags |= F_BLANK; continue;
14836         case '#': arg->flags |= F_ALT; continue;
14837         case '0': arg->flags |= F_ZERO; continue;
14838         }
14839         break;
14840     }
14841 
14842     /* Parse width. Example: "%10s" => width=10 */
14843     if (arg->ch == '*') {
14844         v = unicode_format_getnextarg(ctx);
14845         if (v == NULL)
14846             return -1;
14847         if (!PyLong_Check(v)) {
14848             PyErr_SetString(PyExc_TypeError,
14849                             "* wants int");
14850             return -1;
14851         }
14852         arg->width = PyLong_AsSsize_t(v);
14853         if (arg->width == -1 && PyErr_Occurred())
14854             return -1;
14855         if (arg->width < 0) {
14856             arg->flags |= F_LJUST;
14857             arg->width = -arg->width;
14858         }
14859         if (--ctx->fmtcnt >= 0) {
14860             arg->ch = FORMAT_READ(ctx);
14861             ctx->fmtpos++;
14862         }
14863     }
14864     else if (arg->ch >= '0' && arg->ch <= '9') {
14865         arg->width = arg->ch - '0';
14866         while (--ctx->fmtcnt >= 0) {
14867             arg->ch = FORMAT_READ(ctx);
14868             ctx->fmtpos++;
14869             if (arg->ch < '0' || arg->ch > '9')
14870                 break;
14871             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14872                mixing signed and unsigned comparison. Since arg->ch is between
14873                '0' and '9', casting to int is safe. */
14874             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14875                 PyErr_SetString(PyExc_ValueError,
14876                                 "width too big");
14877                 return -1;
14878             }
14879             arg->width = arg->width*10 + (arg->ch - '0');
14880         }
14881     }
14882 
14883     /* Parse precision. Example: "%.3f" => prec=3 */
14884     if (arg->ch == '.') {
14885         arg->prec = 0;
14886         if (--ctx->fmtcnt >= 0) {
14887             arg->ch = FORMAT_READ(ctx);
14888             ctx->fmtpos++;
14889         }
14890         if (arg->ch == '*') {
14891             v = unicode_format_getnextarg(ctx);
14892             if (v == NULL)
14893                 return -1;
14894             if (!PyLong_Check(v)) {
14895                 PyErr_SetString(PyExc_TypeError,
14896                                 "* wants int");
14897                 return -1;
14898             }
14899             arg->prec = _PyLong_AsInt(v);
14900             if (arg->prec == -1 && PyErr_Occurred())
14901                 return -1;
14902             if (arg->prec < 0)
14903                 arg->prec = 0;
14904             if (--ctx->fmtcnt >= 0) {
14905                 arg->ch = FORMAT_READ(ctx);
14906                 ctx->fmtpos++;
14907             }
14908         }
14909         else if (arg->ch >= '0' && arg->ch <= '9') {
14910             arg->prec = arg->ch - '0';
14911             while (--ctx->fmtcnt >= 0) {
14912                 arg->ch = FORMAT_READ(ctx);
14913                 ctx->fmtpos++;
14914                 if (arg->ch < '0' || arg->ch > '9')
14915                     break;
14916                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14917                     PyErr_SetString(PyExc_ValueError,
14918                                     "precision too big");
14919                     return -1;
14920                 }
14921                 arg->prec = arg->prec*10 + (arg->ch - '0');
14922             }
14923         }
14924     }
14925 
14926     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14927     if (ctx->fmtcnt >= 0) {
14928         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14929             if (--ctx->fmtcnt >= 0) {
14930                 arg->ch = FORMAT_READ(ctx);
14931                 ctx->fmtpos++;
14932             }
14933         }
14934     }
14935     if (ctx->fmtcnt < 0) {
14936         PyErr_SetString(PyExc_ValueError,
14937                         "incomplete format");
14938         return -1;
14939     }
14940     return 0;
14941 
14942 #undef FORMAT_READ
14943 }
14944 
14945 /* Format one argument. Supported conversion specifiers:
14946 
14947    - "s", "r", "a": any type
14948    - "i", "d", "u": int or float
14949    - "o", "x", "X": int
14950    - "e", "E", "f", "F", "g", "G": float
14951    - "c": int or str (1 character)
14952 
14953    When possible, the output is written directly into the Unicode writer
14954    (ctx->writer). A string is created when padding is required.
14955 
14956    Return 0 if the argument has been formatted into *p_str,
14957           1 if the argument has been written into ctx->writer,
14958          -1 on error. */
14959 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14960 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14961                           struct unicode_format_arg_t *arg,
14962                           PyObject **p_str)
14963 {
14964     PyObject *v;
14965     _PyUnicodeWriter *writer = &ctx->writer;
14966 
14967     if (ctx->fmtcnt == 0)
14968         ctx->writer.overallocate = 0;
14969 
14970     v = unicode_format_getnextarg(ctx);
14971     if (v == NULL)
14972         return -1;
14973 
14974 
14975     switch (arg->ch) {
14976     case 's':
14977     case 'r':
14978     case 'a':
14979         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14980             /* Fast path */
14981             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14982                 return -1;
14983             return 1;
14984         }
14985 
14986         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14987             *p_str = v;
14988             Py_INCREF(*p_str);
14989         }
14990         else {
14991             if (arg->ch == 's')
14992                 *p_str = PyObject_Str(v);
14993             else if (arg->ch == 'r')
14994                 *p_str = PyObject_Repr(v);
14995             else
14996                 *p_str = PyObject_ASCII(v);
14997         }
14998         break;
14999 
15000     case 'i':
15001     case 'd':
15002     case 'u':
15003     case 'o':
15004     case 'x':
15005     case 'X':
15006     {
15007         int ret = mainformatlong(v, arg, p_str, writer);
15008         if (ret != 0)
15009             return ret;
15010         arg->sign = 1;
15011         break;
15012     }
15013 
15014     case 'e':
15015     case 'E':
15016     case 'f':
15017     case 'F':
15018     case 'g':
15019     case 'G':
15020         if (arg->width == -1 && arg->prec == -1
15021             && !(arg->flags & (F_SIGN | F_BLANK)))
15022         {
15023             /* Fast path */
15024             if (formatfloat(v, arg, NULL, writer) == -1)
15025                 return -1;
15026             return 1;
15027         }
15028 
15029         arg->sign = 1;
15030         if (formatfloat(v, arg, p_str, NULL) == -1)
15031             return -1;
15032         break;
15033 
15034     case 'c':
15035     {
15036         Py_UCS4 ch = formatchar(v);
15037         if (ch == (Py_UCS4) -1)
15038             return -1;
15039         if (arg->width == -1 && arg->prec == -1) {
15040             /* Fast path */
15041             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15042                 return -1;
15043             return 1;
15044         }
15045         *p_str = PyUnicode_FromOrdinal(ch);
15046         break;
15047     }
15048 
15049     default:
15050         PyErr_Format(PyExc_ValueError,
15051                      "unsupported format character '%c' (0x%x) "
15052                      "at index %zd",
15053                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15054                      (int)arg->ch,
15055                      ctx->fmtpos - 1);
15056         return -1;
15057     }
15058     if (*p_str == NULL)
15059         return -1;
15060     assert (PyUnicode_Check(*p_str));
15061     return 0;
15062 }
15063 
15064 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)15065 unicode_format_arg_output(struct unicode_formatter_t *ctx,
15066                           struct unicode_format_arg_t *arg,
15067                           PyObject *str)
15068 {
15069     Py_ssize_t len;
15070     enum PyUnicode_Kind kind;
15071     const void *pbuf;
15072     Py_ssize_t pindex;
15073     Py_UCS4 signchar;
15074     Py_ssize_t buflen;
15075     Py_UCS4 maxchar;
15076     Py_ssize_t sublen;
15077     _PyUnicodeWriter *writer = &ctx->writer;
15078     Py_UCS4 fill;
15079 
15080     fill = ' ';
15081     if (arg->sign && arg->flags & F_ZERO)
15082         fill = '0';
15083 
15084     if (PyUnicode_READY(str) == -1)
15085         return -1;
15086 
15087     len = PyUnicode_GET_LENGTH(str);
15088     if ((arg->width == -1 || arg->width <= len)
15089         && (arg->prec == -1 || arg->prec >= len)
15090         && !(arg->flags & (F_SIGN | F_BLANK)))
15091     {
15092         /* Fast path */
15093         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15094             return -1;
15095         return 0;
15096     }
15097 
15098     /* Truncate the string for "s", "r" and "a" formats
15099        if the precision is set */
15100     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15101         if (arg->prec >= 0 && len > arg->prec)
15102             len = arg->prec;
15103     }
15104 
15105     /* Adjust sign and width */
15106     kind = PyUnicode_KIND(str);
15107     pbuf = PyUnicode_DATA(str);
15108     pindex = 0;
15109     signchar = '\0';
15110     if (arg->sign) {
15111         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15112         if (ch == '-' || ch == '+') {
15113             signchar = ch;
15114             len--;
15115             pindex++;
15116         }
15117         else if (arg->flags & F_SIGN)
15118             signchar = '+';
15119         else if (arg->flags & F_BLANK)
15120             signchar = ' ';
15121         else
15122             arg->sign = 0;
15123     }
15124     if (arg->width < len)
15125         arg->width = len;
15126 
15127     /* Prepare the writer */
15128     maxchar = writer->maxchar;
15129     if (!(arg->flags & F_LJUST)) {
15130         if (arg->sign) {
15131             if ((arg->width-1) > len)
15132                 maxchar = Py_MAX(maxchar, fill);
15133         }
15134         else {
15135             if (arg->width > len)
15136                 maxchar = Py_MAX(maxchar, fill);
15137         }
15138     }
15139     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15140         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15141         maxchar = Py_MAX(maxchar, strmaxchar);
15142     }
15143 
15144     buflen = arg->width;
15145     if (arg->sign && len == arg->width)
15146         buflen++;
15147     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15148         return -1;
15149 
15150     /* Write the sign if needed */
15151     if (arg->sign) {
15152         if (fill != ' ') {
15153             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15154             writer->pos += 1;
15155         }
15156         if (arg->width > len)
15157             arg->width--;
15158     }
15159 
15160     /* Write the numeric prefix for "x", "X" and "o" formats
15161        if the alternate form is used.
15162        For example, write "0x" for the "%#x" format. */
15163     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15164         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15165         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15166         if (fill != ' ') {
15167             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15168             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15169             writer->pos += 2;
15170             pindex += 2;
15171         }
15172         arg->width -= 2;
15173         if (arg->width < 0)
15174             arg->width = 0;
15175         len -= 2;
15176     }
15177 
15178     /* Pad left with the fill character if needed */
15179     if (arg->width > len && !(arg->flags & F_LJUST)) {
15180         sublen = arg->width - len;
15181         unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15182         writer->pos += sublen;
15183         arg->width = len;
15184     }
15185 
15186     /* If padding with spaces: write sign if needed and/or numeric prefix if
15187        the alternate form is used */
15188     if (fill == ' ') {
15189         if (arg->sign) {
15190             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15191             writer->pos += 1;
15192         }
15193         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15194             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15195             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15196             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15197             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15198             writer->pos += 2;
15199             pindex += 2;
15200         }
15201     }
15202 
15203     /* Write characters */
15204     if (len) {
15205         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15206                                       str, pindex, len);
15207         writer->pos += len;
15208     }
15209 
15210     /* Pad right with the fill character if needed */
15211     if (arg->width > len) {
15212         sublen = arg->width - len;
15213         unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15214         writer->pos += sublen;
15215     }
15216     return 0;
15217 }
15218 
15219 /* Helper of PyUnicode_Format(): format one arg.
15220    Return 0 on success, raise an exception and return -1 on error. */
15221 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15222 unicode_format_arg(struct unicode_formatter_t *ctx)
15223 {
15224     struct unicode_format_arg_t arg;
15225     PyObject *str;
15226     int ret;
15227 
15228     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15229     if (arg.ch == '%') {
15230         ctx->fmtpos++;
15231         ctx->fmtcnt--;
15232         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15233             return -1;
15234         return 0;
15235     }
15236     arg.flags = 0;
15237     arg.width = -1;
15238     arg.prec = -1;
15239     arg.sign = 0;
15240     str = NULL;
15241 
15242     ret = unicode_format_arg_parse(ctx, &arg);
15243     if (ret == -1)
15244         return -1;
15245 
15246     ret = unicode_format_arg_format(ctx, &arg, &str);
15247     if (ret == -1)
15248         return -1;
15249 
15250     if (ret != 1) {
15251         ret = unicode_format_arg_output(ctx, &arg, str);
15252         Py_DECREF(str);
15253         if (ret == -1)
15254             return -1;
15255     }
15256 
15257     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15258         PyErr_SetString(PyExc_TypeError,
15259                         "not all arguments converted during string formatting");
15260         return -1;
15261     }
15262     return 0;
15263 }
15264 
15265 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15266 PyUnicode_Format(PyObject *format, PyObject *args)
15267 {
15268     struct unicode_formatter_t ctx;
15269 
15270     if (format == NULL || args == NULL) {
15271         PyErr_BadInternalCall();
15272         return NULL;
15273     }
15274 
15275     if (ensure_unicode(format) < 0)
15276         return NULL;
15277 
15278     ctx.fmtstr = format;
15279     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15280     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15281     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15282     ctx.fmtpos = 0;
15283 
15284     _PyUnicodeWriter_Init(&ctx.writer);
15285     ctx.writer.min_length = ctx.fmtcnt + 100;
15286     ctx.writer.overallocate = 1;
15287 
15288     if (PyTuple_Check(args)) {
15289         ctx.arglen = PyTuple_Size(args);
15290         ctx.argidx = 0;
15291     }
15292     else {
15293         ctx.arglen = -1;
15294         ctx.argidx = -2;
15295     }
15296     ctx.args_owned = 0;
15297     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15298         ctx.dict = args;
15299     else
15300         ctx.dict = NULL;
15301     ctx.args = args;
15302 
15303     while (--ctx.fmtcnt >= 0) {
15304         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15305             Py_ssize_t nonfmtpos;
15306 
15307             nonfmtpos = ctx.fmtpos++;
15308             while (ctx.fmtcnt >= 0 &&
15309                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15310                 ctx.fmtpos++;
15311                 ctx.fmtcnt--;
15312             }
15313             if (ctx.fmtcnt < 0) {
15314                 ctx.fmtpos--;
15315                 ctx.writer.overallocate = 0;
15316             }
15317 
15318             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15319                                                 nonfmtpos, ctx.fmtpos) < 0)
15320                 goto onError;
15321         }
15322         else {
15323             ctx.fmtpos++;
15324             if (unicode_format_arg(&ctx) == -1)
15325                 goto onError;
15326         }
15327     }
15328 
15329     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15330         PyErr_SetString(PyExc_TypeError,
15331                         "not all arguments converted during string formatting");
15332         goto onError;
15333     }
15334 
15335     if (ctx.args_owned) {
15336         Py_DECREF(ctx.args);
15337     }
15338     return _PyUnicodeWriter_Finish(&ctx.writer);
15339 
15340   onError:
15341     _PyUnicodeWriter_Dealloc(&ctx.writer);
15342     if (ctx.args_owned) {
15343         Py_DECREF(ctx.args);
15344     }
15345     return NULL;
15346 }
15347 
15348 static PyObject *
15349 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15350 
15351 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15352 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15353 {
15354     PyObject *x = NULL;
15355     static char *kwlist[] = {"object", "encoding", "errors", 0};
15356     char *encoding = NULL;
15357     char *errors = NULL;
15358 
15359     if (type != &PyUnicode_Type)
15360         return unicode_subtype_new(type, args, kwds);
15361     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15362                                      kwlist, &x, &encoding, &errors))
15363         return NULL;
15364     if (x == NULL)
15365         _Py_RETURN_UNICODE_EMPTY();
15366     if (encoding == NULL && errors == NULL)
15367         return PyObject_Str(x);
15368     else
15369         return PyUnicode_FromEncodedObject(x, encoding, errors);
15370 }
15371 
15372 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15373 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15374 {
15375     PyObject *unicode, *self;
15376     Py_ssize_t length, char_size;
15377     int share_wstr, share_utf8;
15378     unsigned int kind;
15379     void *data;
15380 
15381     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15382 
15383     unicode = unicode_new(&PyUnicode_Type, args, kwds);
15384     if (unicode == NULL)
15385         return NULL;
15386     assert(_PyUnicode_CHECK(unicode));
15387     if (PyUnicode_READY(unicode) == -1) {
15388         Py_DECREF(unicode);
15389         return NULL;
15390     }
15391 
15392     self = type->tp_alloc(type, 0);
15393     if (self == NULL) {
15394         Py_DECREF(unicode);
15395         return NULL;
15396     }
15397     kind = PyUnicode_KIND(unicode);
15398     length = PyUnicode_GET_LENGTH(unicode);
15399 
15400     _PyUnicode_LENGTH(self) = length;
15401 #ifdef Py_DEBUG
15402     _PyUnicode_HASH(self) = -1;
15403 #else
15404     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15405 #endif
15406     _PyUnicode_STATE(self).interned = 0;
15407     _PyUnicode_STATE(self).kind = kind;
15408     _PyUnicode_STATE(self).compact = 0;
15409     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15410     _PyUnicode_STATE(self).ready = 1;
15411     _PyUnicode_WSTR(self) = NULL;
15412     _PyUnicode_UTF8_LENGTH(self) = 0;
15413     _PyUnicode_UTF8(self) = NULL;
15414     _PyUnicode_WSTR_LENGTH(self) = 0;
15415     _PyUnicode_DATA_ANY(self) = NULL;
15416 
15417     share_utf8 = 0;
15418     share_wstr = 0;
15419     if (kind == PyUnicode_1BYTE_KIND) {
15420         char_size = 1;
15421         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15422             share_utf8 = 1;
15423     }
15424     else if (kind == PyUnicode_2BYTE_KIND) {
15425         char_size = 2;
15426         if (sizeof(wchar_t) == 2)
15427             share_wstr = 1;
15428     }
15429     else {
15430         assert(kind == PyUnicode_4BYTE_KIND);
15431         char_size = 4;
15432         if (sizeof(wchar_t) == 4)
15433             share_wstr = 1;
15434     }
15435 
15436     /* Ensure we won't overflow the length. */
15437     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15438         PyErr_NoMemory();
15439         goto onError;
15440     }
15441     data = PyObject_MALLOC((length + 1) * char_size);
15442     if (data == NULL) {
15443         PyErr_NoMemory();
15444         goto onError;
15445     }
15446 
15447     _PyUnicode_DATA_ANY(self) = data;
15448     if (share_utf8) {
15449         _PyUnicode_UTF8_LENGTH(self) = length;
15450         _PyUnicode_UTF8(self) = data;
15451     }
15452     if (share_wstr) {
15453         _PyUnicode_WSTR_LENGTH(self) = length;
15454         _PyUnicode_WSTR(self) = (wchar_t *)data;
15455     }
15456 
15457     memcpy(data, PyUnicode_DATA(unicode),
15458               kind * (length + 1));
15459     assert(_PyUnicode_CheckConsistency(self, 1));
15460 #ifdef Py_DEBUG
15461     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15462 #endif
15463     Py_DECREF(unicode);
15464     return self;
15465 
15466 onError:
15467     Py_DECREF(unicode);
15468     Py_DECREF(self);
15469     return NULL;
15470 }
15471 
15472 PyDoc_STRVAR(unicode_doc,
15473 "str(object='') -> str\n\
15474 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15475 \n\
15476 Create a new string object from the given object. If encoding or\n\
15477 errors is specified, then the object must expose a data buffer\n\
15478 that will be decoded using the given encoding and error handler.\n\
15479 Otherwise, returns the result of object.__str__() (if defined)\n\
15480 or repr(object).\n\
15481 encoding defaults to sys.getdefaultencoding().\n\
15482 errors defaults to 'strict'.");
15483 
15484 static PyObject *unicode_iter(PyObject *seq);
15485 
15486 PyTypeObject PyUnicode_Type = {
15487     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15488     "str",                        /* tp_name */
15489     sizeof(PyUnicodeObject),      /* tp_basicsize */
15490     0,                            /* tp_itemsize */
15491     /* Slots */
15492     (destructor)unicode_dealloc,  /* tp_dealloc */
15493     0,                            /* tp_vectorcall_offset */
15494     0,                            /* tp_getattr */
15495     0,                            /* tp_setattr */
15496     0,                            /* tp_as_async */
15497     unicode_repr,                 /* tp_repr */
15498     &unicode_as_number,           /* tp_as_number */
15499     &unicode_as_sequence,         /* tp_as_sequence */
15500     &unicode_as_mapping,          /* tp_as_mapping */
15501     (hashfunc) unicode_hash,      /* tp_hash*/
15502     0,                            /* tp_call*/
15503     (reprfunc) unicode_str,       /* tp_str */
15504     PyObject_GenericGetAttr,      /* tp_getattro */
15505     0,                            /* tp_setattro */
15506     0,                            /* tp_as_buffer */
15507     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15508     Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */
15509     unicode_doc,                  /* tp_doc */
15510     0,                            /* tp_traverse */
15511     0,                            /* tp_clear */
15512     PyUnicode_RichCompare,        /* tp_richcompare */
15513     0,                            /* tp_weaklistoffset */
15514     unicode_iter,                 /* tp_iter */
15515     0,                            /* tp_iternext */
15516     unicode_methods,              /* tp_methods */
15517     0,                            /* tp_members */
15518     0,                            /* tp_getset */
15519     &PyBaseObject_Type,           /* tp_base */
15520     0,                            /* tp_dict */
15521     0,                            /* tp_descr_get */
15522     0,                            /* tp_descr_set */
15523     0,                            /* tp_dictoffset */
15524     0,                            /* tp_init */
15525     0,                            /* tp_alloc */
15526     unicode_new,                  /* tp_new */
15527     PyObject_Del,                 /* tp_free */
15528 };
15529 
15530 /* Initialize the Unicode implementation */
15531 
15532 PyStatus
_PyUnicode_Init(void)15533 _PyUnicode_Init(void)
15534 {
15535     /* XXX - move this array to unicodectype.c ? */
15536     Py_UCS2 linebreak[] = {
15537         0x000A, /* LINE FEED */
15538         0x000D, /* CARRIAGE RETURN */
15539         0x001C, /* FILE SEPARATOR */
15540         0x001D, /* GROUP SEPARATOR */
15541         0x001E, /* RECORD SEPARATOR */
15542         0x0085, /* NEXT LINE */
15543         0x2028, /* LINE SEPARATOR */
15544         0x2029, /* PARAGRAPH SEPARATOR */
15545     };
15546 
15547     /* Init the implementation */
15548     _Py_INCREF_UNICODE_EMPTY();
15549     if (!unicode_empty) {
15550         return _PyStatus_ERR("Can't create empty string");
15551     }
15552     Py_DECREF(unicode_empty);
15553 
15554     if (PyType_Ready(&PyUnicode_Type) < 0) {
15555         return _PyStatus_ERR("Can't initialize unicode type");
15556     }
15557 
15558     /* initialize the linebreak bloom filter */
15559     bloom_linebreak = make_bloom_mask(
15560         PyUnicode_2BYTE_KIND, linebreak,
15561         Py_ARRAY_LENGTH(linebreak));
15562 
15563     if (PyType_Ready(&EncodingMapType) < 0) {
15564          return _PyStatus_ERR("Can't initialize encoding map type");
15565     }
15566     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15567         return _PyStatus_ERR("Can't initialize field name iterator type");
15568     }
15569     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15570         return _PyStatus_ERR("Can't initialize formatter iter type");
15571     }
15572     return _PyStatus_OK();
15573 }
15574 
15575 
15576 void
PyUnicode_InternInPlace(PyObject ** p)15577 PyUnicode_InternInPlace(PyObject **p)
15578 {
15579     PyObject *s = *p;
15580 #ifdef Py_DEBUG
15581     assert(s != NULL);
15582     assert(_PyUnicode_CHECK(s));
15583 #else
15584     if (s == NULL || !PyUnicode_Check(s)) {
15585         return;
15586     }
15587 #endif
15588 
15589     /* If it's a subclass, we don't really know what putting
15590        it in the interned dict might do. */
15591     if (!PyUnicode_CheckExact(s)) {
15592         return;
15593     }
15594 
15595     if (PyUnicode_CHECK_INTERNED(s)) {
15596         return;
15597     }
15598 
15599 #ifdef INTERNED_STRINGS
15600     if (interned == NULL) {
15601         interned = PyDict_New();
15602         if (interned == NULL) {
15603             PyErr_Clear(); /* Don't leave an exception */
15604             return;
15605         }
15606     }
15607 
15608     PyObject *t;
15609     t = PyDict_SetDefault(interned, s, s);
15610 
15611     if (t == NULL) {
15612         PyErr_Clear();
15613         return;
15614     }
15615 
15616     if (t != s) {
15617         Py_INCREF(t);
15618         Py_SETREF(*p, t);
15619         return;
15620     }
15621 
15622     /* The two references in interned are not counted by refcnt.
15623        The deallocator will take care of this */
15624     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15625     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15626 #endif
15627 }
15628 
15629 void
PyUnicode_InternImmortal(PyObject ** p)15630 PyUnicode_InternImmortal(PyObject **p)
15631 {
15632     PyUnicode_InternInPlace(p);
15633     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15634         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15635         Py_INCREF(*p);
15636     }
15637 }
15638 
15639 PyObject *
PyUnicode_InternFromString(const char * cp)15640 PyUnicode_InternFromString(const char *cp)
15641 {
15642     PyObject *s = PyUnicode_FromString(cp);
15643     if (s == NULL)
15644         return NULL;
15645     PyUnicode_InternInPlace(&s);
15646     return s;
15647 }
15648 
15649 
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the 'interned' dict.

   Since unicode_release_interned() is intended to help a leak detector,
   interned unicode strings are not forcibly deallocated; rather, we give
   them their stolen references back (1 for immortal strings, 2 for mortal
   ones), mark them as not interned, and then clear and DECREF the
   interned dict. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Don't leak the object if it is unexpectedly not a list */
        Py_XDECREF(keys);
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* The reference count is updated with Py_SET_REFCNT(), like in
           PyUnicode_InternInPlace(), instead of assigning through
           Py_REFCNT(). */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15710 
15711 
15712 /********************* Unicode Iterator **************************/
15713 
15714 typedef struct {
15715     PyObject_HEAD
15716     Py_ssize_t it_index;
15717     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15718 } unicodeiterobject;
15719 
15720 static void
unicodeiter_dealloc(unicodeiterobject * it)15721 unicodeiter_dealloc(unicodeiterobject *it)
15722 {
15723     _PyObject_GC_UNTRACK(it);
15724     Py_XDECREF(it->it_seq);
15725     PyObject_GC_Del(it);
15726 }
15727 
15728 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15729 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15730 {
15731     Py_VISIT(it->it_seq);
15732     return 0;
15733 }
15734 
15735 static PyObject *
unicodeiter_next(unicodeiterobject * it)15736 unicodeiter_next(unicodeiterobject *it)
15737 {
15738     PyObject *seq, *item;
15739 
15740     assert(it != NULL);
15741     seq = it->it_seq;
15742     if (seq == NULL)
15743         return NULL;
15744     assert(_PyUnicode_CHECK(seq));
15745 
15746     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15747         int kind = PyUnicode_KIND(seq);
15748         const void *data = PyUnicode_DATA(seq);
15749         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15750         item = PyUnicode_FromOrdinal(chr);
15751         if (item != NULL)
15752             ++it->it_index;
15753         return item;
15754     }
15755 
15756     it->it_seq = NULL;
15757     Py_DECREF(seq);
15758     return NULL;
15759 }
15760 
15761 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15762 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15763 {
15764     Py_ssize_t len = 0;
15765     if (it->it_seq)
15766         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15767     return PyLong_FromSsize_t(len);
15768 }
15769 
15770 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15771 
15772 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15773 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15774 {
15775     _Py_IDENTIFIER(iter);
15776     if (it->it_seq != NULL) {
15777         return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15778                              it->it_seq, it->it_index);
15779     } else {
15780         PyObject *u = (PyObject *)_PyUnicode_New(0);
15781         if (u == NULL)
15782             return NULL;
15783         return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15784     }
15785 }
15786 
15787 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15788 
15789 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15790 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15791 {
15792     Py_ssize_t index = PyLong_AsSsize_t(state);
15793     if (index == -1 && PyErr_Occurred())
15794         return NULL;
15795     if (it->it_seq != NULL) {
15796         if (index < 0)
15797             index = 0;
15798         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15799             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15800         it->it_index = index;
15801     }
15802     Py_RETURN_NONE;
15803 }
15804 
15805 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15806 
15807 static PyMethodDef unicodeiter_methods[] = {
15808     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15809      length_hint_doc},
15810     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15811      reduce_doc},
15812     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15813      setstate_doc},
15814     {NULL,      NULL}       /* sentinel */
15815 };
15816 
15817 PyTypeObject PyUnicodeIter_Type = {
15818     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15819     "str_iterator",         /* tp_name */
15820     sizeof(unicodeiterobject),      /* tp_basicsize */
15821     0,                  /* tp_itemsize */
15822     /* methods */
15823     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15824     0,                  /* tp_vectorcall_offset */
15825     0,                  /* tp_getattr */
15826     0,                  /* tp_setattr */
15827     0,                  /* tp_as_async */
15828     0,                  /* tp_repr */
15829     0,                  /* tp_as_number */
15830     0,                  /* tp_as_sequence */
15831     0,                  /* tp_as_mapping */
15832     0,                  /* tp_hash */
15833     0,                  /* tp_call */
15834     0,                  /* tp_str */
15835     PyObject_GenericGetAttr,        /* tp_getattro */
15836     0,                  /* tp_setattro */
15837     0,                  /* tp_as_buffer */
15838     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15839     0,                  /* tp_doc */
15840     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15841     0,                  /* tp_clear */
15842     0,                  /* tp_richcompare */
15843     0,                  /* tp_weaklistoffset */
15844     PyObject_SelfIter,          /* tp_iter */
15845     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15846     unicodeiter_methods,            /* tp_methods */
15847     0,
15848 };
15849 
15850 static PyObject *
unicode_iter(PyObject * seq)15851 unicode_iter(PyObject *seq)
15852 {
15853     unicodeiterobject *it;
15854 
15855     if (!PyUnicode_Check(seq)) {
15856         PyErr_BadInternalCall();
15857         return NULL;
15858     }
15859     if (PyUnicode_READY(seq) == -1)
15860         return NULL;
15861     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15862     if (it == NULL)
15863         return NULL;
15864     it->it_index = 0;
15865     Py_INCREF(seq);
15866     it->it_seq = seq;
15867     _PyObject_GC_TRACK(it);
15868     return (PyObject *)it;
15869 }
15870 
15871 
15872 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15873 Py_UNICODE_strlen(const Py_UNICODE *u)
15874 {
15875     return wcslen(u);
15876 }
15877 
15878 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15879 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15880 {
15881     Py_UNICODE *u = s1;
15882     while ((*u++ = *s2++));
15883     return s1;
15884 }
15885 
15886 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15887 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15888 {
15889     Py_UNICODE *u = s1;
15890     while ((*u++ = *s2++))
15891         if (n-- == 0)
15892             break;
15893     return s1;
15894 }
15895 
15896 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15897 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15898 {
15899     Py_UNICODE *u1 = s1;
15900     u1 += wcslen(u1);
15901     while ((*u1++ = *s2++));
15902     return s1;
15903 }
15904 
15905 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15906 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15907 {
15908     while (*s1 && *s2 && *s1 == *s2)
15909         s1++, s2++;
15910     if (*s1 && *s2)
15911         return (*s1 < *s2) ? -1 : +1;
15912     if (*s1)
15913         return 1;
15914     if (*s2)
15915         return -1;
15916     return 0;
15917 }
15918 
15919 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15920 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15921 {
15922     Py_UNICODE u1, u2;
15923     for (; n != 0; n--) {
15924         u1 = *s1;
15925         u2 = *s2;
15926         if (u1 != u2)
15927             return (u1 < u2) ? -1 : +1;
15928         if (u1 == '\0')
15929             return 0;
15930         s1++;
15931         s2++;
15932     }
15933     return 0;
15934 }
15935 
15936 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15937 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15938 {
15939     const Py_UNICODE *p;
15940     for (p = s; *p; p++)
15941         if (*p == c)
15942             return (Py_UNICODE*)p;
15943     return NULL;
15944 }
15945 
15946 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15947 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15948 {
15949     const Py_UNICODE *p;
15950     p = s + wcslen(s);
15951     while (p != s) {
15952         p--;
15953         if (*p == c)
15954             return (Py_UNICODE*)p;
15955     }
15956     return NULL;
15957 }
15958 
15959 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15960 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15961 {
15962     Py_UNICODE *u, *copy;
15963     Py_ssize_t len, size;
15964 
15965     if (!PyUnicode_Check(unicode)) {
15966         PyErr_BadArgument();
15967         return NULL;
15968     }
15969 _Py_COMP_DIAG_PUSH
15970 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
15971     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15972 _Py_COMP_DIAG_POP
15973     if (u == NULL)
15974         return NULL;
15975     /* Ensure we won't overflow the size. */
15976     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15977         PyErr_NoMemory();
15978         return NULL;
15979     }
15980     size = len + 1; /* copy the null character */
15981     size *= sizeof(Py_UNICODE);
15982     copy = PyMem_Malloc(size);
15983     if (copy == NULL) {
15984         PyErr_NoMemory();
15985         return NULL;
15986     }
15987     memcpy(copy, u, size);
15988     return copy;
15989 }
15990 
15991 
15992 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15993 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15994 {
15995     int res;
15996     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15997     if (res == -2) {
15998         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15999         return -1;
16000     }
16001     if (res < 0) {
16002         PyErr_NoMemory();
16003         return -1;
16004     }
16005     return 0;
16006 }
16007 
16008 
16009 static int
config_get_codec_name(wchar_t ** config_encoding)16010 config_get_codec_name(wchar_t **config_encoding)
16011 {
16012     char *encoding;
16013     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16014         return -1;
16015     }
16016 
16017     PyObject *name_obj = NULL;
16018     PyObject *codec = _PyCodec_Lookup(encoding);
16019     PyMem_RawFree(encoding);
16020 
16021     if (!codec)
16022         goto error;
16023 
16024     name_obj = PyObject_GetAttrString(codec, "name");
16025     Py_CLEAR(codec);
16026     if (!name_obj) {
16027         goto error;
16028     }
16029 
16030     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16031     Py_DECREF(name_obj);
16032     if (wname == NULL) {
16033         goto error;
16034     }
16035 
16036     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16037     if (raw_wname == NULL) {
16038         PyMem_Free(wname);
16039         PyErr_NoMemory();
16040         goto error;
16041     }
16042 
16043     PyMem_RawFree(*config_encoding);
16044     *config_encoding = raw_wname;
16045 
16046     PyMem_Free(wname);
16047     return 0;
16048 
16049 error:
16050     Py_XDECREF(codec);
16051     Py_XDECREF(name_obj);
16052     return -1;
16053 }
16054 
16055 
16056 static PyStatus
init_stdio_encoding(PyThreadState * tstate)16057 init_stdio_encoding(PyThreadState *tstate)
16058 {
16059     /* Update the stdio encoding to the normalized Python codec name. */
16060     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
16061     if (config_get_codec_name(&config->stdio_encoding) < 0) {
16062         return _PyStatus_ERR("failed to get the Python codec name "
16063                              "of the stdio encoding");
16064     }
16065     return _PyStatus_OK();
16066 }
16067 
16068 
16069 static int
init_fs_codec(PyInterpreterState * interp)16070 init_fs_codec(PyInterpreterState *interp)
16071 {
16072     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16073 
16074     _Py_error_handler error_handler;
16075     error_handler = get_error_handler_wide(config->filesystem_errors);
16076     if (error_handler == _Py_ERROR_UNKNOWN) {
16077         PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16078         return -1;
16079     }
16080 
16081     char *encoding, *errors;
16082     if (encode_wstr_utf8(config->filesystem_encoding,
16083                          &encoding,
16084                          "filesystem_encoding") < 0) {
16085         return -1;
16086     }
16087 
16088     if (encode_wstr_utf8(config->filesystem_errors,
16089                          &errors,
16090                          "filesystem_errors") < 0) {
16091         PyMem_RawFree(encoding);
16092         return -1;
16093     }
16094 
16095     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16096     PyMem_RawFree(fs_codec->encoding);
16097     fs_codec->encoding = encoding;
16098     /* encoding has been normalized by init_fs_encoding() */
16099     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16100     PyMem_RawFree(fs_codec->errors);
16101     fs_codec->errors = errors;
16102     fs_codec->error_handler = error_handler;
16103 
16104 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16105     assert(fs_codec->utf8 == 1);
16106 #endif
16107 
16108     /* At this point, PyUnicode_EncodeFSDefault() and
16109        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16110        the C implementation of the filesystem encoding. */
16111 
16112     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16113        global configuration variables. */
16114     if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16115                                   fs_codec->errors) < 0) {
16116         PyErr_NoMemory();
16117         return -1;
16118     }
16119     return 0;
16120 }
16121 
16122 
16123 static PyStatus
init_fs_encoding(PyThreadState * tstate)16124 init_fs_encoding(PyThreadState *tstate)
16125 {
16126     PyInterpreterState *interp = tstate->interp;
16127 
16128     /* Update the filesystem encoding to the normalized Python codec name.
16129        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16130        (Python codec name). */
16131     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16132     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16133         _Py_DumpPathConfig(tstate);
16134         return _PyStatus_ERR("failed to get the Python codec "
16135                              "of the filesystem encoding");
16136     }
16137 
16138     if (init_fs_codec(interp) < 0) {
16139         return _PyStatus_ERR("cannot initialize filesystem codec");
16140     }
16141     return _PyStatus_OK();
16142 }
16143 
16144 
16145 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16146 _PyUnicode_InitEncodings(PyThreadState *tstate)
16147 {
16148     PyStatus status = init_fs_encoding(tstate);
16149     if (_PyStatus_EXCEPTION(status)) {
16150         return status;
16151     }
16152 
16153     return init_stdio_encoding(tstate);
16154 }
16155 
16156 
16157 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16158 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16159 {
16160     PyMem_RawFree(fs_codec->encoding);
16161     fs_codec->encoding = NULL;
16162     fs_codec->utf8 = 0;
16163     PyMem_RawFree(fs_codec->errors);
16164     fs_codec->errors = NULL;
16165     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16166 }
16167 
16168 
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem
   codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if
   the new configuration strings cannot be allocated; in that case the
   configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed: drop whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16194 
16195 
16196 void
_PyUnicode_Fini(PyThreadState * tstate)16197 _PyUnicode_Fini(PyThreadState *tstate)
16198 {
16199     if (_Py_IsMainInterpreter(tstate)) {
16200 #if defined(WITH_VALGRIND) || defined(__INSURE__)
16201         /* Insure++ is a memory analysis tool that aids in discovering
16202          * memory leaks and other memory problems.  On Python exit, the
16203          * interned string dictionaries are flagged as being in use at exit
16204          * (which it is).  Under normal circumstances, this is fine because
16205          * the memory will be automatically reclaimed by the system.  Under
16206          * memory debugging, it's a huge source of useless noise, so we
16207          * trade off slower shutdown for less distraction in the memory
16208          * reports.  -baw
16209          */
16210         unicode_release_interned();
16211 #endif /* __INSURE__ */
16212 
16213         Py_CLEAR(unicode_empty);
16214 
16215 #ifdef LATIN1_SINGLETONS
16216         for (Py_ssize_t i = 0; i < 256; i++) {
16217             Py_CLEAR(unicode_latin1[i]);
16218         }
16219 #endif
16220         unicode_clear_static_strings();
16221     }
16222 
16223     _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
16224 }
16225 
16226 
16227 /* A _string module, to export formatter_parser and formatter_field_name_split
16228    to the string.Formatter class implemented in Python. */
16229 
16230 static PyMethodDef _string_methods[] = {
16231     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16232      METH_O, PyDoc_STR("split the argument as a field name")},
16233     {"formatter_parser", (PyCFunction) formatter_parser,
16234      METH_O, PyDoc_STR("parse the argument as a format string")},
16235     {NULL, NULL}
16236 };
16237 
16238 static struct PyModuleDef _string_module = {
16239     PyModuleDef_HEAD_INIT,
16240     "_string",
16241     PyDoc_STR("string helper module"),
16242     0,
16243     _string_methods,
16244     NULL,
16245     NULL,
16246     NULL,
16247     NULL
16248 };
16249 
16250 PyMODINIT_FUNC
PyInit__string(void)16251 PyInit__string(void)
16252 {
16253     return PyModule_Create(&_string_module);
16254 }
16255 
16256 
16257 #ifdef __cplusplus
16258 }
16259 #endif
16260