• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "internal/pystate.h"
44 #include "ucnhash.h"
45 #include "bytes_methods.h"
46 #include "stringlib/eq.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
52 /*[clinic input]
53 class str "PyObject *" "&PyUnicode_Type"
54 [clinic start generated code]*/
55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56 
57 /*[python input]
58 class Py_UCS4_converter(CConverter):
59     type = 'Py_UCS4'
60     converter = 'convert_uc'
61 
62     def converter_init(self):
63         if self.default is not unspecified:
64             self.c_default = ascii(self.default)
65             if len(self.c_default) > 4 or self.c_default[0] != "'":
66                 self.c_default = hex(ord(self.default))
67 
68 [python start generated code]*/
69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
70 
71 /* --- Globals ------------------------------------------------------------
72 
73 NOTE: In the interpreter's initialization phase, some globals are currently
74       initialized dynamically as needed. In the process Unicode objects may
75       be created before the Unicode type is ready.
76 
77 */
78 
79 
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83 
84 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
85 #define MAX_UNICODE 0x10ffff
86 
/* Type check used by the assertions in the macros of this file.
   With Py_DEBUG it expands to _PyUnicode_CheckConsistency(op, 0);
   otherwise it expands to the plain PyUnicode_Check(op) type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
92 
/* Field accessors for the Unicode object structs.
   The variants with a leading underscore (except _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH) cast and read the field without any assertion;
   the others assert _PyUnicode_CHECK(op) first. */

/* Raw utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII object this is the
   length field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr field of the PyASCIIObject header (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw wstr_length field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length field of the PyASCIIObject header (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw state field of the PyASCIIObject header (no checks). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw hash field of the PyASCIIObject header (no checks). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind of the object, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length field of the object, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
127 
/* File-local replacement for any previous PyUnicode_READY definition
   (hence the #undef).  Asserts _PyUnicode_CHECK(op), then evaluates to 0
   when the object is already ready and otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
134 
/* True if the utf8 pointer of `op` is the same address as its character
   data (PyUnicode_DATA), i.e. no separate UTF-8 buffer is involved.
   Asserts that `op` is not a compact ASCII object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data (PyUnicode_DATA), i.e. no separate wchar_t buffer is involved.
   Every reference in the body goes through the macro parameter `op`, so
   the macro does not depend on how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142 
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the object is not ready yet or wstr differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156 
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is cast to to_type and stored.  begin, end and to
   are each evaluated exactly once (they are copied into locals).  The
   first loop handles four elements per iteration up to the element count
   rounded down to a multiple of 4; the second loop copies the remainder
   one element at a time.  Note that the expansion declares a local named
   `n` in its own do { } scope. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
180 
/* Divisor that controls over-allocation.
   NOTE(review): presumably the extra space requested is
   size / OVERALLOCATE_FACTOR (2 -> 50%, 4 -> 25%); confirm at the use
   sites, which are not visible here. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#endif
188 
189 /* This dictionary holds all interned unicode strings.  Note that references
190    to strings in this dictionary are *not* counted in the string's ob_refcnt.
191    When the interned string reaches a refcnt of 0 the string deallocation
192    function will delete the reference from this dictionary.
193 
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
196 */
197 static PyObject *interned = NULL;
198 
199 /* The empty Unicode object is shared to improve performance. */
200 static PyObject *unicode_empty = NULL;
201 
/* Take a new reference to the shared empty string.
   If the singleton does not exist yet it is created with
   PyUnicode_New(0, 0) and then INCREF'ed, so that one reference stays in
   the global and one belongs to the caller.  If that creation fails,
   unicode_empty is left NULL and no reference is taken; callers must
   check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)
214 
/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL when _Py_INCREF_UNICODE_EMPTY()
   could not create the singleton. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
220 
/* Store `value` into `length` consecutive characters of `data`, beginning
   at character index `start`.  `kind` selects the character width:
   1-byte data is filled with memset(), 2- and 4-byte data with a loop.
   Asserts that start is non-negative, that kind is not
   PyUnicode_WCHAR_KIND, and that value fits the selected width.
   The arguments are pasted unparenthesized and some are used more than
   once, so pass simple expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= start); \
        assert(kind != PyUnicode_WCHAR_KIND); \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: { \
            assert(value <= 0xff); \
            Py_UCS1 ch = (unsigned char)value; \
            Py_UCS1 *to = (Py_UCS1 *)data + start; \
            memset(to, ch, length); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert(value <= 0xffff); \
            Py_UCS2 ch = (Py_UCS2)value; \
            Py_UCS2 *to = (Py_UCS2 *)data + start; \
            const Py_UCS2 *end = to + length; \
            for (; to < end; ++to) *to = ch; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert(value <= MAX_UNICODE); \
            Py_UCS4 ch = value; \
            Py_UCS4 * to = (Py_UCS4 *)data + start; \
            const Py_UCS4 *end = to + length; \
            for (; to < end; ++to) *to = ch; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
252 
253 
254 /* Forward declaration */
255 static inline int
256 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257 
258 /* List of static strings. */
259 static _Py_Identifier *static_strings = NULL;
260 
261 /* Single character Unicode strings in the Latin-1 range are being
262    shared as well. */
263 static PyObject *unicode_latin1[256] = {NULL};
264 
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
295 
296 /* forward */
297 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
298 static PyObject* get_latin1_char(unsigned char ch);
299 static int unicode_modifiable(PyObject *unicode);
300 
301 
302 static PyObject *
303 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
304 static PyObject *
305 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306 static PyObject *
307 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308 
309 static PyObject *
310 unicode_encode_call_errorhandler(const char *errors,
311        PyObject **errorHandler,const char *encoding, const char *reason,
312        PyObject *unicode, PyObject **exceptionObject,
313        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314 
315 static void
316 raise_encode_exception(PyObject **exceptionObject,
317                        const char *encoding,
318                        PyObject *unicode,
319                        Py_ssize_t startpos, Py_ssize_t endpos,
320                        const char *reason);
321 
/* Same for linebreaks.
   128-entry lookup table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
349 
350 static int convert_uc(PyObject *obj, void *addr);
351 
352 #include "clinic/unicodeobject.c.h"
353 
/* Numeric identifiers for codec error handlers, as produced by
   get_error_handler().  _Py_ERROR_UNKNOWN is 0; _Py_ERROR_OTHER stands
   for any handler name that is not one of the built-in names below. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate an error handler name into its _Py_error_handler value.

   errors: handler name, or NULL.
   Returns _Py_ERROR_STRICT for NULL or "strict", the matching enum value
   for the other recognised names (exact, case-sensitive comparison), and
   _Py_ERROR_OTHER for anything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* A missing name means the default handler. */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
392 
393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394    This function is kept for backward compatibility with the old API. */
395 Py_UNICODE
PyUnicode_GetMax(void)396 PyUnicode_GetMax(void)
397 {
398 #ifdef Py_UNICODE_WIDE
399     return 0x10FFFF;
400 #else
401     /* This is actually an illegal character, so it should
402        not be passed to unichr. */
403     return 0xFFFF;
404 #endif
405 }
406 
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal invariants.

   op:            object to verify; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not in the legacy wchar
                  state), also scan the characters to verify that the
                  narrowest suitable kind is used and that the data is
                  followed by a zero character.

   Every violation trips an assert(); when all checks pass the function
   returns 1, which lets callers wrap the call itself in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: must be 1-byte kind and ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the character data follows the
               PyCompactUnicodeObject header directly. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Non-compact object: data lives behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* An ASCII string shares its data as the UTF-8 form;
                   a non-ASCII string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind's width
               equals sizeof(wchar_t), and must not alias it otherwise. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* An absent buffer must have a zero recorded length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest character in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest character must fall in the range that requires
           exactly this kind (and this ascii flag for 1-byte data). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character at index `length` must be the zero terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522 
523 static PyObject*
unicode_result_wchar(PyObject * unicode)524 unicode_result_wchar(PyObject *unicode)
525 {
526 #ifndef Py_DEBUG
527     Py_ssize_t len;
528 
529     len = _PyUnicode_WSTR_LENGTH(unicode);
530     if (len == 0) {
531         Py_DECREF(unicode);
532         _Py_RETURN_UNICODE_EMPTY();
533     }
534 
535     if (len == 1) {
536         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
537         if ((Py_UCS4)ch < 256) {
538             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539             Py_DECREF(unicode);
540             return latin1_char;
541         }
542     }
543 
544     if (_PyUnicode_Ready(unicode) < 0) {
545         Py_DECREF(unicode);
546         return NULL;
547     }
548 #else
549     assert(Py_REFCNT(unicode) == 1);
550 
551     /* don't make the result ready in debug mode to ensure that the caller
552        makes the string ready before using it */
553     assert(_PyUnicode_CheckConsistency(unicode, 1));
554 #endif
555     return unicode;
556 }
557 
558 static PyObject*
unicode_result_ready(PyObject * unicode)559 unicode_result_ready(PyObject *unicode)
560 {
561     Py_ssize_t length;
562 
563     length = PyUnicode_GET_LENGTH(unicode);
564     if (length == 0) {
565         if (unicode != unicode_empty) {
566             Py_DECREF(unicode);
567             _Py_RETURN_UNICODE_EMPTY();
568         }
569         return unicode_empty;
570     }
571 
572     if (length == 1) {
573         void *data = PyUnicode_DATA(unicode);
574         int kind = PyUnicode_KIND(unicode);
575         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
576         if (ch < 256) {
577             PyObject *latin1_char = unicode_latin1[ch];
578             if (latin1_char != NULL) {
579                 if (unicode != latin1_char) {
580                     Py_INCREF(latin1_char);
581                     Py_DECREF(unicode);
582                 }
583                 return latin1_char;
584             }
585             else {
586                 assert(_PyUnicode_CheckConsistency(unicode, 1));
587                 Py_INCREF(unicode);
588                 unicode_latin1[ch] = unicode;
589                 return unicode;
590             }
591         }
592     }
593 
594     assert(_PyUnicode_CheckConsistency(unicode, 1));
595     return unicode;
596 }
597 
598 static PyObject*
unicode_result(PyObject * unicode)599 unicode_result(PyObject *unicode)
600 {
601     assert(_PyUnicode_CHECK(unicode));
602     if (PyUnicode_IS_READY(unicode))
603         return unicode_result_ready(unicode);
604     else
605         return unicode_result_wchar(unicode);
606 }
607 
608 static PyObject*
unicode_result_unchanged(PyObject * unicode)609 unicode_result_unchanged(PyObject *unicode)
610 {
611     if (PyUnicode_CheckExact(unicode)) {
612         if (PyUnicode_READY(unicode) == -1)
613             return NULL;
614         Py_INCREF(unicode);
615         return unicode;
616     }
617     else
618         /* Subtype -- return genuine unicode string with the same value. */
619         return _PyUnicode_Copy(unicode);
620 }
621 
622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623    ASCII, Latin1, UTF-8, etc. */
624 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)625 backslashreplace(_PyBytesWriter *writer, char *str,
626                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627 {
628     Py_ssize_t size, i;
629     Py_UCS4 ch;
630     enum PyUnicode_Kind kind;
631     void *data;
632 
633     assert(PyUnicode_IS_READY(unicode));
634     kind = PyUnicode_KIND(unicode);
635     data = PyUnicode_DATA(unicode);
636 
637     size = 0;
638     /* determine replacement size */
639     for (i = collstart; i < collend; ++i) {
640         Py_ssize_t incr;
641 
642         ch = PyUnicode_READ(kind, data, i);
643         if (ch < 0x100)
644             incr = 2+2;
645         else if (ch < 0x10000)
646             incr = 2+4;
647         else {
648             assert(ch <= MAX_UNICODE);
649             incr = 2+8;
650         }
651         if (size > PY_SSIZE_T_MAX - incr) {
652             PyErr_SetString(PyExc_OverflowError,
653                             "encoded result is too long for a Python string");
654             return NULL;
655         }
656         size += incr;
657     }
658 
659     str = _PyBytesWriter_Prepare(writer, str, size);
660     if (str == NULL)
661         return NULL;
662 
663     /* generate replacement */
664     for (i = collstart; i < collend; ++i) {
665         ch = PyUnicode_READ(kind, data, i);
666         *str++ = '\\';
667         if (ch >= 0x00010000) {
668             *str++ = 'U';
669             *str++ = Py_hexdigits[(ch>>28)&0xf];
670             *str++ = Py_hexdigits[(ch>>24)&0xf];
671             *str++ = Py_hexdigits[(ch>>20)&0xf];
672             *str++ = Py_hexdigits[(ch>>16)&0xf];
673             *str++ = Py_hexdigits[(ch>>12)&0xf];
674             *str++ = Py_hexdigits[(ch>>8)&0xf];
675         }
676         else if (ch >= 0x100) {
677             *str++ = 'u';
678             *str++ = Py_hexdigits[(ch>>12)&0xf];
679             *str++ = Py_hexdigits[(ch>>8)&0xf];
680         }
681         else
682             *str++ = 'x';
683         *str++ = Py_hexdigits[(ch>>4)&0xf];
684         *str++ = Py_hexdigits[ch&0xf];
685     }
686     return str;
687 }
688 
689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690    ASCII, Latin1, UTF-8, etc. */
691 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)692 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
693                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694 {
695     Py_ssize_t size, i;
696     Py_UCS4 ch;
697     enum PyUnicode_Kind kind;
698     void *data;
699 
700     assert(PyUnicode_IS_READY(unicode));
701     kind = PyUnicode_KIND(unicode);
702     data = PyUnicode_DATA(unicode);
703 
704     size = 0;
705     /* determine replacement size */
706     for (i = collstart; i < collend; ++i) {
707         Py_ssize_t incr;
708 
709         ch = PyUnicode_READ(kind, data, i);
710         if (ch < 10)
711             incr = 2+1+1;
712         else if (ch < 100)
713             incr = 2+2+1;
714         else if (ch < 1000)
715             incr = 2+3+1;
716         else if (ch < 10000)
717             incr = 2+4+1;
718         else if (ch < 100000)
719             incr = 2+5+1;
720         else if (ch < 1000000)
721             incr = 2+6+1;
722         else {
723             assert(ch <= MAX_UNICODE);
724             incr = 2+7+1;
725         }
726         if (size > PY_SSIZE_T_MAX - incr) {
727             PyErr_SetString(PyExc_OverflowError,
728                             "encoded result is too long for a Python string");
729             return NULL;
730         }
731         size += incr;
732     }
733 
734     str = _PyBytesWriter_Prepare(writer, str, size);
735     if (str == NULL)
736         return NULL;
737 
738     /* generate replacement */
739     for (i = collstart; i < collend; ++i) {
740         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741     }
742     return str;
743 }
744 
745 /* --- Bloom Filters ----------------------------------------------------- */
746 
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
750 
751 /* the linebreak mask is set up by Unicode_Init below */
752 
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32
   that does not exceed LONG_BIT.  Compilation fails if LONG_BIT is
   below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters.  It starts with every bit set, so
   until it is replaced the BLOOM() test below accepts every character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch & (BLOOM_WIDTH - 1) is set in mask.
   A zero result means ch was definitely not added to the mask; a
   non-zero result may be a false positive.  `mask` is pasted
   unparenthesized, so pass a simple expression. */
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in
   ascii_linebreak[]; all others must pass the bloom mask first and are
   then confirmed with Py_UNICODE_ISLINEBREAK().  `ch` is evaluated more
   than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
772 
773 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
775 {
776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
777     do {                                               \
778         TYPE *data = (TYPE *)PTR;                      \
779         TYPE *end = data + LEN;                        \
780         Py_UCS4 ch;                                    \
781         for (; data != end; data++) {                  \
782             ch = *data;                                \
783             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784         }                                              \
785         break;                                         \
786     } while (0)
787 
788     /* calculate simple bloom-style bitmask for a given unicode string */
789 
790     BLOOM_MASK mask;
791 
792     mask = 0;
793     switch (kind) {
794     case PyUnicode_1BYTE_KIND:
795         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796         break;
797     case PyUnicode_2BYTE_KIND:
798         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799         break;
800     case PyUnicode_4BYTE_KIND:
801         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802         break;
803     default:
804         Py_UNREACHABLE();
805     }
806     return mask;
807 
808 #undef BLOOM_UPDATE
809 }
810 
811 static int
ensure_unicode(PyObject * obj)812 ensure_unicode(PyObject *obj)
813 {
814     if (!PyUnicode_Check(obj)) {
815         PyErr_Format(PyExc_TypeError,
816                      "must be str, not %.100s",
817                      Py_TYPE(obj)->tp_name);
818         return -1;
819     }
820     return PyUnicode_READY(obj);
821 }
822 
823 /* Compilation of templated routines */
824 
825 #include "stringlib/asciilib.h"
826 #include "stringlib/fastsearch.h"
827 #include "stringlib/partition.h"
828 #include "stringlib/split.h"
829 #include "stringlib/count.h"
830 #include "stringlib/find.h"
831 #include "stringlib/find_max_char.h"
832 #include "stringlib/undef.h"
833 
834 #include "stringlib/ucs1lib.h"
835 #include "stringlib/fastsearch.h"
836 #include "stringlib/partition.h"
837 #include "stringlib/split.h"
838 #include "stringlib/count.h"
839 #include "stringlib/find.h"
840 #include "stringlib/replace.h"
841 #include "stringlib/find_max_char.h"
842 #include "stringlib/undef.h"
843 
844 #include "stringlib/ucs2lib.h"
845 #include "stringlib/fastsearch.h"
846 #include "stringlib/partition.h"
847 #include "stringlib/split.h"
848 #include "stringlib/count.h"
849 #include "stringlib/find.h"
850 #include "stringlib/replace.h"
851 #include "stringlib/find_max_char.h"
852 #include "stringlib/undef.h"
853 
854 #include "stringlib/ucs4lib.h"
855 #include "stringlib/fastsearch.h"
856 #include "stringlib/partition.h"
857 #include "stringlib/split.h"
858 #include "stringlib/count.h"
859 #include "stringlib/find.h"
860 #include "stringlib/replace.h"
861 #include "stringlib/find_max_char.h"
862 #include "stringlib/undef.h"
863 
864 #include "stringlib/unicodedefs.h"
865 #include "stringlib/fastsearch.h"
866 #include "stringlib/count.h"
867 #include "stringlib/find.h"
868 #include "stringlib/undef.h"
869 
870 /* --- Unicode Object ----------------------------------------------------- */
871 
872 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)873 findchar(const void *s, int kind,
874          Py_ssize_t size, Py_UCS4 ch,
875          int direction)
876 {
877     switch (kind) {
878     case PyUnicode_1BYTE_KIND:
879         if ((Py_UCS1) ch != ch)
880             return -1;
881         if (direction > 0)
882             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883         else
884             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
885     case PyUnicode_2BYTE_KIND:
886         if ((Py_UCS2) ch != ch)
887             return -1;
888         if (direction > 0)
889             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890         else
891             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
892     case PyUnicode_4BYTE_KIND:
893         if (direction > 0)
894             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895         else
896             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
897     default:
898         Py_UNREACHABLE();
899     }
900 }
901 
902 #ifdef Py_DEBUG
903 /* Fill the data of a Unicode string with invalid characters to detect bugs
904    earlier.
905 
906    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
907    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
908    invalid character in Unicode 6.0. */
909 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)910 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
911 {
912     int kind = PyUnicode_KIND(unicode);
913     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
914     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
915     if (length <= old_length)
916         return;
917     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
918 }
919 #endif
920 
921 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)922 resize_compact(PyObject *unicode, Py_ssize_t length)
923 {
924     Py_ssize_t char_size;
925     Py_ssize_t struct_size;
926     Py_ssize_t new_size;
927     int share_wstr;
928     PyObject *new_unicode;
929 #ifdef Py_DEBUG
930     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931 #endif
932 
933     assert(unicode_modifiable(unicode));
934     assert(PyUnicode_IS_READY(unicode));
935     assert(PyUnicode_IS_COMPACT(unicode));
936 
937     char_size = PyUnicode_KIND(unicode);
938     if (PyUnicode_IS_ASCII(unicode))
939         struct_size = sizeof(PyASCIIObject);
940     else
941         struct_size = sizeof(PyCompactUnicodeObject);
942     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
943 
944     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945         PyErr_NoMemory();
946         return NULL;
947     }
948     new_size = (struct_size + (length + 1) * char_size);
949 
950     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951         PyObject_DEL(_PyUnicode_UTF8(unicode));
952         _PyUnicode_UTF8(unicode) = NULL;
953         _PyUnicode_UTF8_LENGTH(unicode) = 0;
954     }
955     _Py_DEC_REFTOTAL;
956     _Py_ForgetReference(unicode);
957 
958     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
959     if (new_unicode == NULL) {
960         _Py_NewReference(unicode);
961         PyErr_NoMemory();
962         return NULL;
963     }
964     unicode = new_unicode;
965     _Py_NewReference(unicode);
966 
967     _PyUnicode_LENGTH(unicode) = length;
968     if (share_wstr) {
969         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
970         if (!PyUnicode_IS_ASCII(unicode))
971             _PyUnicode_WSTR_LENGTH(unicode) = length;
972     }
973     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974         PyObject_DEL(_PyUnicode_WSTR(unicode));
975         _PyUnicode_WSTR(unicode) = NULL;
976         if (!PyUnicode_IS_ASCII(unicode))
977             _PyUnicode_WSTR_LENGTH(unicode) = 0;
978     }
979 #ifdef Py_DEBUG
980     unicode_fill_invalid(unicode, old_length);
981 #endif
982     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983                     length, 0);
984     assert(_PyUnicode_CheckConsistency(unicode, 0));
985     return unicode;
986 }
987 
988 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)989 resize_inplace(PyObject *unicode, Py_ssize_t length)
990 {
991     wchar_t *wstr;
992     Py_ssize_t new_size;
993     assert(!PyUnicode_IS_COMPACT(unicode));
994     assert(Py_REFCNT(unicode) == 1);
995 
996     if (PyUnicode_IS_READY(unicode)) {
997         Py_ssize_t char_size;
998         int share_wstr, share_utf8;
999         void *data;
1000 #ifdef Py_DEBUG
1001         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002 #endif
1003 
1004         data = _PyUnicode_DATA_ANY(unicode);
1005         char_size = PyUnicode_KIND(unicode);
1006         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1008 
1009         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010             PyErr_NoMemory();
1011             return -1;
1012         }
1013         new_size = (length + 1) * char_size;
1014 
1015         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016         {
1017             PyObject_DEL(_PyUnicode_UTF8(unicode));
1018             _PyUnicode_UTF8(unicode) = NULL;
1019             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020         }
1021 
1022         data = (PyObject *)PyObject_REALLOC(data, new_size);
1023         if (data == NULL) {
1024             PyErr_NoMemory();
1025             return -1;
1026         }
1027         _PyUnicode_DATA_ANY(unicode) = data;
1028         if (share_wstr) {
1029             _PyUnicode_WSTR(unicode) = data;
1030             _PyUnicode_WSTR_LENGTH(unicode) = length;
1031         }
1032         if (share_utf8) {
1033             _PyUnicode_UTF8(unicode) = data;
1034             _PyUnicode_UTF8_LENGTH(unicode) = length;
1035         }
1036         _PyUnicode_LENGTH(unicode) = length;
1037         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1038 #ifdef Py_DEBUG
1039         unicode_fill_invalid(unicode, old_length);
1040 #endif
1041         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1042             assert(_PyUnicode_CheckConsistency(unicode, 0));
1043             return 0;
1044         }
1045     }
1046     assert(_PyUnicode_WSTR(unicode) != NULL);
1047 
1048     /* check for integer overflow */
1049     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1050         PyErr_NoMemory();
1051         return -1;
1052     }
1053     new_size = sizeof(wchar_t) * (length + 1);
1054     wstr =  _PyUnicode_WSTR(unicode);
1055     wstr = PyObject_REALLOC(wstr, new_size);
1056     if (!wstr) {
1057         PyErr_NoMemory();
1058         return -1;
1059     }
1060     _PyUnicode_WSTR(unicode) = wstr;
1061     _PyUnicode_WSTR(unicode)[length] = 0;
1062     _PyUnicode_WSTR_LENGTH(unicode) = length;
1063     assert(_PyUnicode_CheckConsistency(unicode, 0));
1064     return 0;
1065 }
1066 
1067 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1068 resize_copy(PyObject *unicode, Py_ssize_t length)
1069 {
1070     Py_ssize_t copy_length;
1071     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1072         PyObject *copy;
1073 
1074         assert(PyUnicode_IS_READY(unicode));
1075 
1076         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077         if (copy == NULL)
1078             return NULL;
1079 
1080         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1081         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1082         return copy;
1083     }
1084     else {
1085         PyObject *w;
1086 
1087         w = (PyObject*)_PyUnicode_New(length);
1088         if (w == NULL)
1089             return NULL;
1090         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091         copy_length = Py_MIN(copy_length, length);
1092         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1093                   copy_length * sizeof(wchar_t));
1094         return w;
1095     }
1096 }
1097 
/* We allocate one more character (not just one more byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
1101 
1102    XXX This allocator could further be enhanced by assuring that the
1103    free list never reduces its size below 1.
1104 
1105 */
1106 
1107 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1108 _PyUnicode_New(Py_ssize_t length)
1109 {
1110     PyUnicodeObject *unicode;
1111     size_t new_size;
1112 
1113     /* Optimization for empty strings */
1114     if (length == 0 && unicode_empty != NULL) {
1115         Py_INCREF(unicode_empty);
1116         return (PyUnicodeObject*)unicode_empty;
1117     }
1118 
1119     /* Ensure we won't overflow the size. */
1120     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1121         return (PyUnicodeObject *)PyErr_NoMemory();
1122     }
1123     if (length < 0) {
1124         PyErr_SetString(PyExc_SystemError,
1125                         "Negative size passed to _PyUnicode_New");
1126         return NULL;
1127     }
1128 
1129     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130     if (unicode == NULL)
1131         return NULL;
1132     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1133 
1134     _PyUnicode_WSTR_LENGTH(unicode) = length;
1135     _PyUnicode_HASH(unicode) = -1;
1136     _PyUnicode_STATE(unicode).interned = 0;
1137     _PyUnicode_STATE(unicode).kind = 0;
1138     _PyUnicode_STATE(unicode).compact = 0;
1139     _PyUnicode_STATE(unicode).ready = 0;
1140     _PyUnicode_STATE(unicode).ascii = 0;
1141     _PyUnicode_DATA_ANY(unicode) = NULL;
1142     _PyUnicode_LENGTH(unicode) = 0;
1143     _PyUnicode_UTF8(unicode) = NULL;
1144     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145 
1146     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147     if (!_PyUnicode_WSTR(unicode)) {
1148         Py_DECREF(unicode);
1149         PyErr_NoMemory();
1150         return NULL;
1151     }
1152 
1153     /* Initialize the first element to guard against cases where
1154      * the caller fails before initializing str -- unicode_resize()
1155      * reads str[0], and the Keep-Alive optimization can keep memory
1156      * allocated for str alive across a call to unicode_dealloc(unicode).
1157      * We don't want unicode_resize to read uninitialized memory in
1158      * that case.
1159      */
1160     _PyUnicode_WSTR(unicode)[0] = 0;
1161     _PyUnicode_WSTR(unicode)[length] = 0;
1162 
1163     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1164     return unicode;
1165 }
1166 
1167 static const char*
unicode_kind_name(PyObject * unicode)1168 unicode_kind_name(PyObject *unicode)
1169 {
1170     /* don't check consistency: unicode_kind_name() is called from
1171        _PyUnicode_Dump() */
1172     if (!PyUnicode_IS_COMPACT(unicode))
1173     {
1174         if (!PyUnicode_IS_READY(unicode))
1175             return "wstr";
1176         switch (PyUnicode_KIND(unicode))
1177         {
1178         case PyUnicode_1BYTE_KIND:
1179             if (PyUnicode_IS_ASCII(unicode))
1180                 return "legacy ascii";
1181             else
1182                 return "legacy latin1";
1183         case PyUnicode_2BYTE_KIND:
1184             return "legacy UCS2";
1185         case PyUnicode_4BYTE_KIND:
1186             return "legacy UCS4";
1187         default:
1188             return "<legacy invalid kind>";
1189         }
1190     }
1191     assert(PyUnicode_IS_READY(unicode));
1192     switch (PyUnicode_KIND(unicode)) {
1193     case PyUnicode_1BYTE_KIND:
1194         if (PyUnicode_IS_ASCII(unicode))
1195             return "ascii";
1196         else
1197             return "latin1";
1198     case PyUnicode_2BYTE_KIND:
1199         return "UCS2";
1200     case PyUnicode_4BYTE_KIND:
1201         return "UCS4";
1202     default:
1203         return "<invalid compact kind>";
1204     }
1205 }
1206 
1207 #ifdef Py_DEBUG
1208 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1212 
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
_PyUnicode_data(void * unicode)1216 void *_PyUnicode_data(void *unicode){
1217     printf("obj %p\n", unicode);
1218     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223     return PyUnicode_DATA(unicode);
1224 }
1225 
1226 void
_PyUnicode_Dump(PyObject * op)1227 _PyUnicode_Dump(PyObject *op)
1228 {
1229     PyASCIIObject *ascii = (PyASCIIObject *)op;
1230     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232     void *data;
1233 
1234     if (ascii->state.compact)
1235     {
1236         if (ascii->state.ascii)
1237             data = (ascii + 1);
1238         else
1239             data = (compact + 1);
1240     }
1241     else
1242         data = unicode->data.any;
1243     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244            unicode_kind_name(op), ascii->length);
1245 
1246     if (ascii->wstr == data)
1247         printf("shared ");
1248     printf("wstr=%p", ascii->wstr);
1249 
1250     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1251         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1252         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253             printf("shared ");
1254         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255                compact->utf8, compact->utf8_length);
1256     }
1257     printf(", data=%p\n", data);
1258 }
1259 #endif
1260 
1261 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263 {
1264     PyObject *obj;
1265     PyCompactUnicodeObject *unicode;
1266     void *data;
1267     enum PyUnicode_Kind kind;
1268     int is_sharing, is_ascii;
1269     Py_ssize_t char_size;
1270     Py_ssize_t struct_size;
1271 
1272     /* Optimization for empty strings */
1273     if (size == 0 && unicode_empty != NULL) {
1274         Py_INCREF(unicode_empty);
1275         return unicode_empty;
1276     }
1277 
1278     is_ascii = 0;
1279     is_sharing = 0;
1280     struct_size = sizeof(PyCompactUnicodeObject);
1281     if (maxchar < 128) {
1282         kind = PyUnicode_1BYTE_KIND;
1283         char_size = 1;
1284         is_ascii = 1;
1285         struct_size = sizeof(PyASCIIObject);
1286     }
1287     else if (maxchar < 256) {
1288         kind = PyUnicode_1BYTE_KIND;
1289         char_size = 1;
1290     }
1291     else if (maxchar < 65536) {
1292         kind = PyUnicode_2BYTE_KIND;
1293         char_size = 2;
1294         if (sizeof(wchar_t) == 2)
1295             is_sharing = 1;
1296     }
1297     else {
1298         if (maxchar > MAX_UNICODE) {
1299             PyErr_SetString(PyExc_SystemError,
1300                             "invalid maximum character passed to PyUnicode_New");
1301             return NULL;
1302         }
1303         kind = PyUnicode_4BYTE_KIND;
1304         char_size = 4;
1305         if (sizeof(wchar_t) == 4)
1306             is_sharing = 1;
1307     }
1308 
1309     /* Ensure we won't overflow the size. */
1310     if (size < 0) {
1311         PyErr_SetString(PyExc_SystemError,
1312                         "Negative size passed to PyUnicode_New");
1313         return NULL;
1314     }
1315     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316         return PyErr_NoMemory();
1317 
1318     /* Duplicated allocation code from _PyObject_New() instead of a call to
1319      * PyObject_New() so we are able to allocate space for the object and
1320      * it's data buffer.
1321      */
1322     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323     if (obj == NULL)
1324         return PyErr_NoMemory();
1325     obj = PyObject_INIT(obj, &PyUnicode_Type);
1326     if (obj == NULL)
1327         return NULL;
1328 
1329     unicode = (PyCompactUnicodeObject *)obj;
1330     if (is_ascii)
1331         data = ((PyASCIIObject*)obj) + 1;
1332     else
1333         data = unicode + 1;
1334     _PyUnicode_LENGTH(unicode) = size;
1335     _PyUnicode_HASH(unicode) = -1;
1336     _PyUnicode_STATE(unicode).interned = 0;
1337     _PyUnicode_STATE(unicode).kind = kind;
1338     _PyUnicode_STATE(unicode).compact = 1;
1339     _PyUnicode_STATE(unicode).ready = 1;
1340     _PyUnicode_STATE(unicode).ascii = is_ascii;
1341     if (is_ascii) {
1342         ((char*)data)[size] = 0;
1343         _PyUnicode_WSTR(unicode) = NULL;
1344     }
1345     else if (kind == PyUnicode_1BYTE_KIND) {
1346         ((char*)data)[size] = 0;
1347         _PyUnicode_WSTR(unicode) = NULL;
1348         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1349         unicode->utf8 = NULL;
1350         unicode->utf8_length = 0;
1351     }
1352     else {
1353         unicode->utf8 = NULL;
1354         unicode->utf8_length = 0;
1355         if (kind == PyUnicode_2BYTE_KIND)
1356             ((Py_UCS2*)data)[size] = 0;
1357         else /* kind == PyUnicode_4BYTE_KIND */
1358             ((Py_UCS4*)data)[size] = 0;
1359         if (is_sharing) {
1360             _PyUnicode_WSTR_LENGTH(unicode) = size;
1361             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362         }
1363         else {
1364             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365             _PyUnicode_WSTR(unicode) = NULL;
1366         }
1367     }
1368 #ifdef Py_DEBUG
1369     unicode_fill_invalid((PyObject*)unicode, 0);
1370 #endif
1371     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1372     return obj;
1373 }
1374 
1375 #if SIZEOF_WCHAR_T == 2
1376 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1377    will decode surrogate pairs, the other conversions are implemented as macros
1378    for efficiency.
1379 
1380    This function assumes that unicode can hold one more code point than wstr
1381    characters for a terminating null character. */
1382 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1383 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1384                               PyObject *unicode)
1385 {
1386     const wchar_t *iter;
1387     Py_UCS4 *ucs4_out;
1388 
1389     assert(unicode != NULL);
1390     assert(_PyUnicode_CHECK(unicode));
1391     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1392     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1393 
1394     for (iter = begin; iter < end; ) {
1395         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1396                            _PyUnicode_GET_LENGTH(unicode)));
1397         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1398             && (iter+1) < end
1399             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1400         {
1401             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1402             iter += 2;
1403         }
1404         else {
1405             *ucs4_out++ = *iter;
1406             iter++;
1407         }
1408     }
1409     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1410                         _PyUnicode_GET_LENGTH(unicode)));
1411 
1412 }
1413 #endif
1414 
1415 static int
unicode_check_modifiable(PyObject * unicode)1416 unicode_check_modifiable(PyObject *unicode)
1417 {
1418     if (!unicode_modifiable(unicode)) {
1419         PyErr_SetString(PyExc_SystemError,
1420                         "Cannot modify a string currently used");
1421         return -1;
1422     }
1423     return 0;
1424 }
1425 
1426 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1427 _copy_characters(PyObject *to, Py_ssize_t to_start,
1428                  PyObject *from, Py_ssize_t from_start,
1429                  Py_ssize_t how_many, int check_maxchar)
1430 {
1431     unsigned int from_kind, to_kind;
1432     void *from_data, *to_data;
1433 
1434     assert(0 <= how_many);
1435     assert(0 <= from_start);
1436     assert(0 <= to_start);
1437     assert(PyUnicode_Check(from));
1438     assert(PyUnicode_IS_READY(from));
1439     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1440 
1441     assert(PyUnicode_Check(to));
1442     assert(PyUnicode_IS_READY(to));
1443     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444 
1445     if (how_many == 0)
1446         return 0;
1447 
1448     from_kind = PyUnicode_KIND(from);
1449     from_data = PyUnicode_DATA(from);
1450     to_kind = PyUnicode_KIND(to);
1451     to_data = PyUnicode_DATA(to);
1452 
1453 #ifdef Py_DEBUG
1454     if (!check_maxchar
1455         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456     {
1457         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458         Py_UCS4 ch;
1459         Py_ssize_t i;
1460         for (i=0; i < how_many; i++) {
1461             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462             assert(ch <= to_maxchar);
1463         }
1464     }
1465 #endif
1466 
1467     if (from_kind == to_kind) {
1468         if (check_maxchar
1469             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470         {
1471             /* Writing Latin-1 characters into an ASCII string requires to
1472                check that all written characters are pure ASCII */
1473             Py_UCS4 max_char;
1474             max_char = ucs1lib_find_max_char(from_data,
1475                                              (Py_UCS1*)from_data + how_many);
1476             if (max_char >= 128)
1477                 return -1;
1478         }
1479         memcpy((char*)to_data + to_kind * to_start,
1480                   (char*)from_data + from_kind * from_start,
1481                   to_kind * how_many);
1482     }
1483     else if (from_kind == PyUnicode_1BYTE_KIND
1484              && to_kind == PyUnicode_2BYTE_KIND)
1485     {
1486         _PyUnicode_CONVERT_BYTES(
1487             Py_UCS1, Py_UCS2,
1488             PyUnicode_1BYTE_DATA(from) + from_start,
1489             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490             PyUnicode_2BYTE_DATA(to) + to_start
1491             );
1492     }
1493     else if (from_kind == PyUnicode_1BYTE_KIND
1494              && to_kind == PyUnicode_4BYTE_KIND)
1495     {
1496         _PyUnicode_CONVERT_BYTES(
1497             Py_UCS1, Py_UCS4,
1498             PyUnicode_1BYTE_DATA(from) + from_start,
1499             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500             PyUnicode_4BYTE_DATA(to) + to_start
1501             );
1502     }
1503     else if (from_kind == PyUnicode_2BYTE_KIND
1504              && to_kind == PyUnicode_4BYTE_KIND)
1505     {
1506         _PyUnicode_CONVERT_BYTES(
1507             Py_UCS2, Py_UCS4,
1508             PyUnicode_2BYTE_DATA(from) + from_start,
1509             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510             PyUnicode_4BYTE_DATA(to) + to_start
1511             );
1512     }
1513     else {
1514         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515 
1516         if (!check_maxchar) {
1517             if (from_kind == PyUnicode_2BYTE_KIND
1518                 && to_kind == PyUnicode_1BYTE_KIND)
1519             {
1520                 _PyUnicode_CONVERT_BYTES(
1521                     Py_UCS2, Py_UCS1,
1522                     PyUnicode_2BYTE_DATA(from) + from_start,
1523                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524                     PyUnicode_1BYTE_DATA(to) + to_start
1525                     );
1526             }
1527             else if (from_kind == PyUnicode_4BYTE_KIND
1528                      && to_kind == PyUnicode_1BYTE_KIND)
1529             {
1530                 _PyUnicode_CONVERT_BYTES(
1531                     Py_UCS4, Py_UCS1,
1532                     PyUnicode_4BYTE_DATA(from) + from_start,
1533                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534                     PyUnicode_1BYTE_DATA(to) + to_start
1535                     );
1536             }
1537             else if (from_kind == PyUnicode_4BYTE_KIND
1538                      && to_kind == PyUnicode_2BYTE_KIND)
1539             {
1540                 _PyUnicode_CONVERT_BYTES(
1541                     Py_UCS4, Py_UCS2,
1542                     PyUnicode_4BYTE_DATA(from) + from_start,
1543                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544                     PyUnicode_2BYTE_DATA(to) + to_start
1545                     );
1546             }
1547             else {
1548                 Py_UNREACHABLE();
1549             }
1550         }
1551         else {
1552             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1553             Py_UCS4 ch;
1554             Py_ssize_t i;
1555 
1556             for (i=0; i < how_many; i++) {
1557                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1558                 if (ch > to_maxchar)
1559                     return -1;
1560                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561             }
1562         }
1563     }
1564     return 0;
1565 }
1566 
1567 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1568 _PyUnicode_FastCopyCharacters(
1569     PyObject *to, Py_ssize_t to_start,
1570     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1571 {
1572     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573 }
1574 
1575 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577                          PyObject *from, Py_ssize_t from_start,
1578                          Py_ssize_t how_many)
1579 {
1580     int err;
1581 
1582     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583         PyErr_BadInternalCall();
1584         return -1;
1585     }
1586 
1587     if (PyUnicode_READY(from) == -1)
1588         return -1;
1589     if (PyUnicode_READY(to) == -1)
1590         return -1;
1591 
1592     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1593         PyErr_SetString(PyExc_IndexError, "string index out of range");
1594         return -1;
1595     }
1596     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1597         PyErr_SetString(PyExc_IndexError, "string index out of range");
1598         return -1;
1599     }
1600     if (how_many < 0) {
1601         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602         return -1;
1603     }
1604     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1605     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606         PyErr_Format(PyExc_SystemError,
1607                      "Cannot write %zi characters at %zi "
1608                      "in a string of %zi characters",
1609                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1610         return -1;
1611     }
1612 
1613     if (how_many == 0)
1614         return 0;
1615 
1616     if (unicode_check_modifiable(to))
1617         return -1;
1618 
1619     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620     if (err) {
1621         PyErr_Format(PyExc_SystemError,
1622                      "Cannot copy %s characters "
1623                      "into a string of %s characters",
1624                      unicode_kind_name(from),
1625                      unicode_kind_name(to));
1626         return -1;
1627     }
1628     return how_many;
1629 }
1630 
1631 /* Find the maximum code point and count the number of surrogate pairs so a
1632    correct string length can be computed before converting a string to UCS4.
1633    This function counts single surrogates as a character and not as a pair.
1634 
1635    Return 0 on success, or -1 on error. */
1636 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1639 {
1640     const wchar_t *iter;
1641     Py_UCS4 ch;
1642 
1643     assert(num_surrogates != NULL && maxchar != NULL);
1644     *num_surrogates = 0;
1645     *maxchar = 0;
1646 
1647     for (iter = begin; iter < end; ) {
1648 #if SIZEOF_WCHAR_T == 2
1649         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650             && (iter+1) < end
1651             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652         {
1653             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654             ++(*num_surrogates);
1655             iter += 2;
1656         }
1657         else
1658 #endif
1659         {
1660             ch = *iter;
1661             iter++;
1662         }
1663         if (ch > *maxchar) {
1664             *maxchar = ch;
1665             if (*maxchar > MAX_UNICODE) {
1666                 PyErr_Format(PyExc_ValueError,
1667                              "character U+%x is not in range [U+0000; U+10ffff]",
1668                              ch);
1669                 return -1;
1670             }
1671         }
1672     }
1673     return 0;
1674 }
1675 
1676 int
_PyUnicode_Ready(PyObject * unicode)1677 _PyUnicode_Ready(PyObject *unicode)
1678 {
1679     wchar_t *end;
1680     Py_UCS4 maxchar = 0;
1681     Py_ssize_t num_surrogates;
1682 #if SIZEOF_WCHAR_T == 2
1683     Py_ssize_t length_wo_surrogates;
1684 #endif
1685 
1686     /* _PyUnicode_Ready() is only intended for old-style API usage where
1687        strings were created using _PyObject_New() and where no canonical
1688        representation (the str field) has been set yet aka strings
1689        which are not yet ready. */
1690     assert(_PyUnicode_CHECK(unicode));
1691     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1692     assert(_PyUnicode_WSTR(unicode) != NULL);
1693     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1694     assert(_PyUnicode_UTF8(unicode) == NULL);
1695     /* Actually, it should neither be interned nor be anything else: */
1696     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1697 
1698     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1699     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1700                                 &maxchar, &num_surrogates) == -1)
1701         return -1;
1702 
1703     if (maxchar < 256) {
1704         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705         if (!_PyUnicode_DATA_ANY(unicode)) {
1706             PyErr_NoMemory();
1707             return -1;
1708         }
1709         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1710                                 _PyUnicode_WSTR(unicode), end,
1711                                 PyUnicode_1BYTE_DATA(unicode));
1712         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715         if (maxchar < 128) {
1716             _PyUnicode_STATE(unicode).ascii = 1;
1717             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1718             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1719         }
1720         else {
1721             _PyUnicode_STATE(unicode).ascii = 0;
1722             _PyUnicode_UTF8(unicode) = NULL;
1723             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1724         }
1725         PyObject_FREE(_PyUnicode_WSTR(unicode));
1726         _PyUnicode_WSTR(unicode) = NULL;
1727         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728     }
1729     /* In this case we might have to convert down from 4-byte native
1730        wchar_t to 2-byte unicode. */
1731     else if (maxchar < 65536) {
1732         assert(num_surrogates == 0 &&
1733                "FindMaxCharAndNumSurrogatePairs() messed up");
1734 
1735 #if SIZEOF_WCHAR_T == 2
1736         /* We can share representations and are done. */
1737         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1738         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1741         _PyUnicode_UTF8(unicode) = NULL;
1742         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1743 #else
1744         /* sizeof(wchar_t) == 4 */
1745         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1746             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1747         if (!_PyUnicode_DATA_ANY(unicode)) {
1748             PyErr_NoMemory();
1749             return -1;
1750         }
1751         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752                                 _PyUnicode_WSTR(unicode), end,
1753                                 PyUnicode_2BYTE_DATA(unicode));
1754         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1757         _PyUnicode_UTF8(unicode) = NULL;
1758         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1759         PyObject_FREE(_PyUnicode_WSTR(unicode));
1760         _PyUnicode_WSTR(unicode) = NULL;
1761         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762 #endif
1763     }
1764     /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765     else {
1766 #if SIZEOF_WCHAR_T == 2
1767         /* in case the native representation is 2-bytes, we need to allocate a
1768            new normalized 4-byte version. */
1769         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1770         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771             PyErr_NoMemory();
1772             return -1;
1773         }
1774         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775         if (!_PyUnicode_DATA_ANY(unicode)) {
1776             PyErr_NoMemory();
1777             return -1;
1778         }
1779         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1781         _PyUnicode_UTF8(unicode) = NULL;
1782         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1783         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784         _PyUnicode_STATE(unicode).ready = 1;
1785         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1786         PyObject_FREE(_PyUnicode_WSTR(unicode));
1787         _PyUnicode_WSTR(unicode) = NULL;
1788         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789 #else
1790         assert(num_surrogates == 0);
1791 
1792         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1793         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1794         _PyUnicode_UTF8(unicode) = NULL;
1795         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1796         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797 #endif
1798         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799     }
1800     _PyUnicode_STATE(unicode).ready = 1;
1801     assert(_PyUnicode_CheckConsistency(unicode, 1));
1802     return 0;
1803 }
1804 
1805 static void
unicode_dealloc(PyObject * unicode)1806 unicode_dealloc(PyObject *unicode)
1807 {
1808     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1809     case SSTATE_NOT_INTERNED:
1810         break;
1811 
1812     case SSTATE_INTERNED_MORTAL:
1813         /* revive dead object temporarily for DelItem */
1814         Py_REFCNT(unicode) = 3;
1815         if (PyDict_DelItem(interned, unicode) != 0)
1816             Py_FatalError(
1817                 "deletion of interned string failed");
1818         break;
1819 
1820     case SSTATE_INTERNED_IMMORTAL:
1821         Py_FatalError("Immortal interned string died.");
1822         /* fall through */
1823 
1824     default:
1825         Py_FatalError("Inconsistent interned string state.");
1826     }
1827 
1828     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1829         PyObject_DEL(_PyUnicode_WSTR(unicode));
1830     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1831         PyObject_DEL(_PyUnicode_UTF8(unicode));
1832     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1834 
1835     Py_TYPE(unicode)->tp_free(unicode);
1836 }
1837 
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a one-character string with a canonical representation can be
       an entry of the Latin-1 cache. */
    if (header->state.kind != PyUnicode_WCHAR_KIND && header->length == 1) {
        ch = PyUnicode_READ_CHAR(unicode, 0);
        return (ch < 256 && unicode_latin1[ch] == unicode);
    }
    return 0;
}
#endif
1854 
1855 static int
unicode_modifiable(PyObject * unicode)1856 unicode_modifiable(PyObject *unicode)
1857 {
1858     assert(_PyUnicode_CHECK(unicode));
1859     if (Py_REFCNT(unicode) != 1)
1860         return 0;
1861     if (_PyUnicode_HASH(unicode) != -1)
1862         return 0;
1863     if (PyUnicode_CHECK_INTERNED(unicode))
1864         return 0;
1865     if (!PyUnicode_CheckExact(unicode))
1866         return 0;
1867 #ifdef Py_DEBUG
1868     /* singleton refcount is greater than 1 */
1869     assert(!unicode_is_singleton(unicode));
1870 #endif
1871     return 1;
1872 }
1873 
1874 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876 {
1877     PyObject *unicode;
1878     Py_ssize_t old_length;
1879 
1880     assert(p_unicode != NULL);
1881     unicode = *p_unicode;
1882 
1883     assert(unicode != NULL);
1884     assert(PyUnicode_Check(unicode));
1885     assert(0 <= length);
1886 
1887     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1888         old_length = PyUnicode_WSTR_LENGTH(unicode);
1889     else
1890         old_length = PyUnicode_GET_LENGTH(unicode);
1891     if (old_length == length)
1892         return 0;
1893 
1894     if (length == 0) {
1895         _Py_INCREF_UNICODE_EMPTY();
1896         if (!unicode_empty)
1897             return -1;
1898         Py_SETREF(*p_unicode, unicode_empty);
1899         return 0;
1900     }
1901 
1902     if (!unicode_modifiable(unicode)) {
1903         PyObject *copy = resize_copy(unicode, length);
1904         if (copy == NULL)
1905             return -1;
1906         Py_SETREF(*p_unicode, copy);
1907         return 0;
1908     }
1909 
1910     if (PyUnicode_IS_COMPACT(unicode)) {
1911         PyObject *new_unicode = resize_compact(unicode, length);
1912         if (new_unicode == NULL)
1913             return -1;
1914         *p_unicode = new_unicode;
1915         return 0;
1916     }
1917     return resize_inplace(unicode, length);
1918 }
1919 
1920 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1922 {
1923     PyObject *unicode;
1924     if (p_unicode == NULL) {
1925         PyErr_BadInternalCall();
1926         return -1;
1927     }
1928     unicode = *p_unicode;
1929     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1930     {
1931         PyErr_BadInternalCall();
1932         return -1;
1933     }
1934     return unicode_resize(p_unicode, length);
1935 }
1936 
1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1938 
1939    WARNING: The function doesn't copy the terminating null character and
1940    doesn't check the maximum character (may write a latin1 character in an
1941    ASCII string). */
1942 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944                    const char *str, Py_ssize_t len)
1945 {
1946     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947     void *data = PyUnicode_DATA(unicode);
1948     const char *end = str + len;
1949 
1950     switch (kind) {
1951     case PyUnicode_1BYTE_KIND: {
1952         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1953 #ifdef Py_DEBUG
1954         if (PyUnicode_IS_ASCII(unicode)) {
1955             Py_UCS4 maxchar = ucs1lib_find_max_char(
1956                 (const Py_UCS1*)str,
1957                 (const Py_UCS1*)str + len);
1958             assert(maxchar < 128);
1959         }
1960 #endif
1961         memcpy((char *) data + index, str, len);
1962         break;
1963     }
1964     case PyUnicode_2BYTE_KIND: {
1965         Py_UCS2 *start = (Py_UCS2 *)data + index;
1966         Py_UCS2 *ucs2 = start;
1967         assert(index <= PyUnicode_GET_LENGTH(unicode));
1968 
1969         for (; str < end; ++ucs2, ++str)
1970             *ucs2 = (Py_UCS2)*str;
1971 
1972         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1973         break;
1974     }
1975     default: {
1976         Py_UCS4 *start = (Py_UCS4 *)data + index;
1977         Py_UCS4 *ucs4 = start;
1978         assert(kind == PyUnicode_4BYTE_KIND);
1979         assert(index <= PyUnicode_GET_LENGTH(unicode));
1980 
1981         for (; str < end; ++ucs4, ++str)
1982             *ucs4 = (Py_UCS4)*str;
1983 
1984         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1985     }
1986     }
1987 }
1988 
1989 static PyObject*
get_latin1_char(unsigned char ch)1990 get_latin1_char(unsigned char ch)
1991 {
1992     PyObject *unicode = unicode_latin1[ch];
1993     if (!unicode) {
1994         unicode = PyUnicode_New(1, ch);
1995         if (!unicode)
1996             return NULL;
1997         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1998         assert(_PyUnicode_CheckConsistency(unicode, 1));
1999         unicode_latin1[ch] = unicode;
2000     }
2001     Py_INCREF(unicode);
2002     return unicode;
2003 }
2004 
2005 static PyObject*
unicode_char(Py_UCS4 ch)2006 unicode_char(Py_UCS4 ch)
2007 {
2008     PyObject *unicode;
2009 
2010     assert(ch <= MAX_UNICODE);
2011 
2012     if (ch < 256)
2013         return get_latin1_char(ch);
2014 
2015     unicode = PyUnicode_New(1, ch);
2016     if (unicode == NULL)
2017         return NULL;
2018 
2019     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2021         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2022     } else {
2023         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025     }
2026     assert(_PyUnicode_CheckConsistency(unicode, 1));
2027     return unicode;
2028 }
2029 
2030 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2032 {
2033     if (u == NULL)
2034         return (PyObject*)_PyUnicode_New(size);
2035 
2036     if (size < 0) {
2037         PyErr_BadInternalCall();
2038         return NULL;
2039     }
2040 
2041     return PyUnicode_FromWideChar(u, size);
2042 }
2043 
2044 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046 {
2047     PyObject *unicode;
2048     Py_UCS4 maxchar = 0;
2049     Py_ssize_t num_surrogates;
2050 
2051     if (u == NULL && size != 0) {
2052         PyErr_BadInternalCall();
2053         return NULL;
2054     }
2055 
2056     if (size == -1) {
2057         size = wcslen(u);
2058     }
2059 
2060     /* If the Unicode data is known at construction time, we can apply
2061        some optimizations which share commonly used objects. */
2062 
2063     /* Optimization for empty strings */
2064     if (size == 0)
2065         _Py_RETURN_UNICODE_EMPTY();
2066 
2067     /* Single character Unicode objects in the Latin-1 range are
2068        shared when using this constructor */
2069     if (size == 1 && (Py_UCS4)*u < 256)
2070         return get_latin1_char((unsigned char)*u);
2071 
2072     /* If not empty and not single character, copy the Unicode data
2073        into the new object */
2074     if (find_maxchar_surrogates(u, u + size,
2075                                 &maxchar, &num_surrogates) == -1)
2076         return NULL;
2077 
2078     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2079     if (!unicode)
2080         return NULL;
2081 
2082     switch (PyUnicode_KIND(unicode)) {
2083     case PyUnicode_1BYTE_KIND:
2084         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2085                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086         break;
2087     case PyUnicode_2BYTE_KIND:
2088 #if Py_UNICODE_SIZE == 2
2089         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2090 #else
2091         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2092                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093 #endif
2094         break;
2095     case PyUnicode_4BYTE_KIND:
2096 #if SIZEOF_WCHAR_T == 2
2097         /* This is the only case which has to process surrogates, thus
2098            a simple copy loop is not enough and we need a function. */
2099         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2100 #else
2101         assert(num_surrogates == 0);
2102         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2103 #endif
2104         break;
2105     default:
2106         Py_UNREACHABLE();
2107     }
2108 
2109     return unicode_result(unicode);
2110 }
2111 
2112 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2113 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2114 {
2115     if (size < 0) {
2116         PyErr_SetString(PyExc_SystemError,
2117                         "Negative size passed to PyUnicode_FromStringAndSize");
2118         return NULL;
2119     }
2120     if (u != NULL)
2121         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122     else
2123         return (PyObject *)_PyUnicode_New(size);
2124 }
2125 
2126 PyObject *
PyUnicode_FromString(const char * u)2127 PyUnicode_FromString(const char *u)
2128 {
2129     size_t size = strlen(u);
2130     if (size > PY_SSIZE_T_MAX) {
2131         PyErr_SetString(PyExc_OverflowError, "input too long");
2132         return NULL;
2133     }
2134     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2135 }
2136 
2137 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2138 _PyUnicode_FromId(_Py_Identifier *id)
2139 {
2140     if (!id->object) {
2141         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142                                                   strlen(id->string),
2143                                                   NULL, NULL);
2144         if (!id->object)
2145             return NULL;
2146         PyUnicode_InternInPlace(&id->object);
2147         assert(!id->next);
2148         id->next = static_strings;
2149         static_strings = id;
2150     }
2151     return id->object;
2152 }
2153 
2154 void
_PyUnicode_ClearStaticStrings()2155 _PyUnicode_ClearStaticStrings()
2156 {
2157     _Py_Identifier *tmp, *s = static_strings;
2158     while (s) {
2159         Py_CLEAR(s->object);
2160         tmp = s->next;
2161         s->next = NULL;
2162         s = tmp;
2163     }
2164     static_strings = NULL;
2165 }
2166 
2167 /* Internal function, doesn't check maximum character */
2168 
2169 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2170 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2171 {
2172     const unsigned char *s = (const unsigned char *)buffer;
2173     PyObject *unicode;
2174     if (size == 1) {
2175 #ifdef Py_DEBUG
2176         assert((unsigned char)s[0] < 128);
2177 #endif
2178         return get_latin1_char(s[0]);
2179     }
2180     unicode = PyUnicode_New(size, 127);
2181     if (!unicode)
2182         return NULL;
2183     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184     assert(_PyUnicode_CheckConsistency(unicode, 1));
2185     return unicode;
2186 }
2187 
2188 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2189 kind_maxchar_limit(unsigned int kind)
2190 {
2191     switch (kind) {
2192     case PyUnicode_1BYTE_KIND:
2193         return 0x80;
2194     case PyUnicode_2BYTE_KIND:
2195         return 0x100;
2196     case PyUnicode_4BYTE_KIND:
2197         return 0x10000;
2198     default:
2199         Py_UNREACHABLE();
2200     }
2201 }
2202 
2203 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2204 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2205 {
2206     PyObject *res;
2207     unsigned char max_char;
2208 
2209     if (size == 0)
2210         _Py_RETURN_UNICODE_EMPTY();
2211     assert(size > 0);
2212     if (size == 1)
2213         return get_latin1_char(u[0]);
2214 
2215     max_char = ucs1lib_find_max_char(u, u + size);
2216     res = PyUnicode_New(size, max_char);
2217     if (!res)
2218         return NULL;
2219     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2220     assert(_PyUnicode_CheckConsistency(res, 1));
2221     return res;
2222 }
2223 
2224 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2225 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2226 {
2227     PyObject *res;
2228     Py_UCS2 max_char;
2229 
2230     if (size == 0)
2231         _Py_RETURN_UNICODE_EMPTY();
2232     assert(size > 0);
2233     if (size == 1)
2234         return unicode_char(u[0]);
2235 
2236     max_char = ucs2lib_find_max_char(u, u + size);
2237     res = PyUnicode_New(size, max_char);
2238     if (!res)
2239         return NULL;
2240     if (max_char >= 256)
2241         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2242     else {
2243         _PyUnicode_CONVERT_BYTES(
2244             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245     }
2246     assert(_PyUnicode_CheckConsistency(res, 1));
2247     return res;
2248 }
2249 
2250 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2251 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2252 {
2253     PyObject *res;
2254     Py_UCS4 max_char;
2255 
2256     if (size == 0)
2257         _Py_RETURN_UNICODE_EMPTY();
2258     assert(size > 0);
2259     if (size == 1)
2260         return unicode_char(u[0]);
2261 
2262     max_char = ucs4lib_find_max_char(u, u + size);
2263     res = PyUnicode_New(size, max_char);
2264     if (!res)
2265         return NULL;
2266     if (max_char < 256)
2267         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268                                  PyUnicode_1BYTE_DATA(res));
2269     else if (max_char < 0x10000)
2270         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271                                  PyUnicode_2BYTE_DATA(res));
2272     else
2273         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2274     assert(_PyUnicode_CheckConsistency(res, 1));
2275     return res;
2276 }
2277 
2278 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2279 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280 {
2281     if (size < 0) {
2282         PyErr_SetString(PyExc_ValueError, "size must be positive");
2283         return NULL;
2284     }
2285     switch (kind) {
2286     case PyUnicode_1BYTE_KIND:
2287         return _PyUnicode_FromUCS1(buffer, size);
2288     case PyUnicode_2BYTE_KIND:
2289         return _PyUnicode_FromUCS2(buffer, size);
2290     case PyUnicode_4BYTE_KIND:
2291         return _PyUnicode_FromUCS4(buffer, size);
2292     default:
2293         PyErr_SetString(PyExc_SystemError, "invalid kind");
2294         return NULL;
2295     }
2296 }
2297 
2298 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2299 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300 {
2301     enum PyUnicode_Kind kind;
2302     void *startptr, *endptr;
2303 
2304     assert(PyUnicode_IS_READY(unicode));
2305     assert(0 <= start);
2306     assert(end <= PyUnicode_GET_LENGTH(unicode));
2307     assert(start <= end);
2308 
2309     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310         return PyUnicode_MAX_CHAR_VALUE(unicode);
2311 
2312     if (start == end)
2313         return 127;
2314 
2315     if (PyUnicode_IS_ASCII(unicode))
2316         return 127;
2317 
2318     kind = PyUnicode_KIND(unicode);
2319     startptr = PyUnicode_DATA(unicode);
2320     endptr = (char *)startptr + end * kind;
2321     startptr = (char *)startptr + start * kind;
2322     switch(kind) {
2323     case PyUnicode_1BYTE_KIND:
2324         return ucs1lib_find_max_char(startptr, endptr);
2325     case PyUnicode_2BYTE_KIND:
2326         return ucs2lib_find_max_char(startptr, endptr);
2327     case PyUnicode_4BYTE_KIND:
2328         return ucs4lib_find_max_char(startptr, endptr);
2329     default:
2330         Py_UNREACHABLE();
2331     }
2332 }
2333 
2334 /* Ensure that a string uses the most efficient storage, if it is not the
2335    case: create a new string with of the right kind. Write NULL into *p_unicode
2336    on error. */
2337 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2338 unicode_adjust_maxchar(PyObject **p_unicode)
2339 {
2340     PyObject *unicode, *copy;
2341     Py_UCS4 max_char;
2342     Py_ssize_t len;
2343     unsigned int kind;
2344 
2345     assert(p_unicode != NULL);
2346     unicode = *p_unicode;
2347     assert(PyUnicode_IS_READY(unicode));
2348     if (PyUnicode_IS_ASCII(unicode))
2349         return;
2350 
2351     len = PyUnicode_GET_LENGTH(unicode);
2352     kind = PyUnicode_KIND(unicode);
2353     if (kind == PyUnicode_1BYTE_KIND) {
2354         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2355         max_char = ucs1lib_find_max_char(u, u + len);
2356         if (max_char >= 128)
2357             return;
2358     }
2359     else if (kind == PyUnicode_2BYTE_KIND) {
2360         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2361         max_char = ucs2lib_find_max_char(u, u + len);
2362         if (max_char >= 256)
2363             return;
2364     }
2365     else {
2366         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2367         assert(kind == PyUnicode_4BYTE_KIND);
2368         max_char = ucs4lib_find_max_char(u, u + len);
2369         if (max_char >= 0x10000)
2370             return;
2371     }
2372     copy = PyUnicode_New(len, max_char);
2373     if (copy != NULL)
2374         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2375     Py_DECREF(unicode);
2376     *p_unicode = copy;
2377 }
2378 
2379 PyObject*
_PyUnicode_Copy(PyObject * unicode)2380 _PyUnicode_Copy(PyObject *unicode)
2381 {
2382     Py_ssize_t length;
2383     PyObject *copy;
2384 
2385     if (!PyUnicode_Check(unicode)) {
2386         PyErr_BadInternalCall();
2387         return NULL;
2388     }
2389     if (PyUnicode_READY(unicode) == -1)
2390         return NULL;
2391 
2392     length = PyUnicode_GET_LENGTH(unicode);
2393     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2394     if (!copy)
2395         return NULL;
2396     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397 
2398     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2399               length * PyUnicode_KIND(unicode));
2400     assert(_PyUnicode_CheckConsistency(copy, 1));
2401     return copy;
2402 }
2403 
2404 
2405 /* Widen Unicode objects to larger buffers. Don't write terminating null
2406    character. Return NULL on error. */
2407 
2408 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2409 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410 {
2411     Py_ssize_t len;
2412     void *result;
2413     unsigned int skind;
2414 
2415     if (PyUnicode_READY(s) == -1)
2416         return NULL;
2417 
2418     len = PyUnicode_GET_LENGTH(s);
2419     skind = PyUnicode_KIND(s);
2420     if (skind >= kind) {
2421         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2422         return NULL;
2423     }
2424     switch (kind) {
2425     case PyUnicode_2BYTE_KIND:
2426         result = PyMem_New(Py_UCS2, len);
2427         if (!result)
2428             return PyErr_NoMemory();
2429         assert(skind == PyUnicode_1BYTE_KIND);
2430         _PyUnicode_CONVERT_BYTES(
2431             Py_UCS1, Py_UCS2,
2432             PyUnicode_1BYTE_DATA(s),
2433             PyUnicode_1BYTE_DATA(s) + len,
2434             result);
2435         return result;
2436     case PyUnicode_4BYTE_KIND:
2437         result = PyMem_New(Py_UCS4, len);
2438         if (!result)
2439             return PyErr_NoMemory();
2440         if (skind == PyUnicode_2BYTE_KIND) {
2441             _PyUnicode_CONVERT_BYTES(
2442                 Py_UCS2, Py_UCS4,
2443                 PyUnicode_2BYTE_DATA(s),
2444                 PyUnicode_2BYTE_DATA(s) + len,
2445                 result);
2446         }
2447         else {
2448             assert(skind == PyUnicode_1BYTE_KIND);
2449             _PyUnicode_CONVERT_BYTES(
2450                 Py_UCS1, Py_UCS4,
2451                 PyUnicode_1BYTE_DATA(s),
2452                 PyUnicode_1BYTE_DATA(s) + len,
2453                 result);
2454         }
2455         return result;
2456     default:
2457         break;
2458     }
2459     PyErr_SetString(PyExc_SystemError, "invalid kind");
2460     return NULL;
2461 }
2462 
2463 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2464 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465         int copy_null)
2466 {
2467     int kind;
2468     void *data;
2469     Py_ssize_t len, targetlen;
2470     if (PyUnicode_READY(string) == -1)
2471         return NULL;
2472     kind = PyUnicode_KIND(string);
2473     data = PyUnicode_DATA(string);
2474     len = PyUnicode_GET_LENGTH(string);
2475     targetlen = len;
2476     if (copy_null)
2477         targetlen++;
2478     if (!target) {
2479         target = PyMem_New(Py_UCS4, targetlen);
2480         if (!target) {
2481             PyErr_NoMemory();
2482             return NULL;
2483         }
2484     }
2485     else {
2486         if (targetsize < targetlen) {
2487             PyErr_Format(PyExc_SystemError,
2488                          "string is longer than the buffer");
2489             if (copy_null && 0 < targetsize)
2490                 target[0] = 0;
2491             return NULL;
2492         }
2493     }
2494     if (kind == PyUnicode_1BYTE_KIND) {
2495         Py_UCS1 *start = (Py_UCS1 *) data;
2496         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2497     }
2498     else if (kind == PyUnicode_2BYTE_KIND) {
2499         Py_UCS2 *start = (Py_UCS2 *) data;
2500         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501     }
2502     else {
2503         assert(kind == PyUnicode_4BYTE_KIND);
2504         memcpy(target, data, len * sizeof(Py_UCS4));
2505     }
2506     if (copy_null)
2507         target[len] = 0;
2508     return target;
2509 }
2510 
2511 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2512 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513                  int copy_null)
2514 {
2515     if (target == NULL || targetsize < 0) {
2516         PyErr_BadInternalCall();
2517         return NULL;
2518     }
2519     return as_ucs4(string, target, targetsize, copy_null);
2520 }
2521 
2522 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2523 PyUnicode_AsUCS4Copy(PyObject *string)
2524 {
2525     return as_ucs4(string, NULL, 0, 1);
2526 }
2527 
/* Upper bound on the number of characters produced when formatting %lld or
   %p.  At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus
   one character for the sign; 53/22 serves as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2532 
2533 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2534 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535                              Py_ssize_t width, Py_ssize_t precision)
2536 {
2537     Py_ssize_t length, fill, arglen;
2538     Py_UCS4 maxchar;
2539 
2540     if (PyUnicode_READY(str) == -1)
2541         return -1;
2542 
2543     length = PyUnicode_GET_LENGTH(str);
2544     if ((precision == -1 || precision >= length)
2545         && width <= length)
2546         return _PyUnicodeWriter_WriteStr(writer, str);
2547 
2548     if (precision != -1)
2549         length = Py_MIN(precision, length);
2550 
2551     arglen = Py_MAX(length, width);
2552     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554     else
2555         maxchar = writer->maxchar;
2556 
2557     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558         return -1;
2559 
2560     if (width > length) {
2561         fill = width - length;
2562         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563             return -1;
2564         writer->pos += fill;
2565     }
2566 
2567     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568                                   str, 0, length);
2569     writer->pos += length;
2570     return 0;
2571 }
2572 
2573 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2574 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575                               Py_ssize_t width, Py_ssize_t precision)
2576 {
2577     /* UTF-8 */
2578     Py_ssize_t length;
2579     PyObject *unicode;
2580     int res;
2581 
2582     if (precision == -1) {
2583         length = strlen(str);
2584     }
2585     else {
2586         length = 0;
2587         while (length < precision && str[length]) {
2588             length++;
2589         }
2590     }
2591     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2592     if (unicode == NULL)
2593         return -1;
2594 
2595     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2596     Py_DECREF(unicode);
2597     return res;
2598 }
2599 
2600 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2601 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2602                        const char *f, va_list *vargs)
2603 {
2604     const char *p;
2605     Py_ssize_t len;
2606     int zeropad;
2607     Py_ssize_t width;
2608     Py_ssize_t precision;
2609     int longflag;
2610     int longlongflag;
2611     int size_tflag;
2612     Py_ssize_t fill;
2613 
2614     p = f;
2615     f++;
2616     zeropad = 0;
2617     if (*f == '0') {
2618         zeropad = 1;
2619         f++;
2620     }
2621 
2622     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2623     width = -1;
2624     if (Py_ISDIGIT((unsigned)*f)) {
2625         width = *f - '0';
2626         f++;
2627         while (Py_ISDIGIT((unsigned)*f)) {
2628             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2629                 PyErr_SetString(PyExc_ValueError,
2630                                 "width too big");
2631                 return NULL;
2632             }
2633             width = (width * 10) + (*f - '0');
2634             f++;
2635         }
2636     }
2637     precision = -1;
2638     if (*f == '.') {
2639         f++;
2640         if (Py_ISDIGIT((unsigned)*f)) {
2641             precision = (*f - '0');
2642             f++;
2643             while (Py_ISDIGIT((unsigned)*f)) {
2644                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2645                     PyErr_SetString(PyExc_ValueError,
2646                                     "precision too big");
2647                     return NULL;
2648                 }
2649                 precision = (precision * 10) + (*f - '0');
2650                 f++;
2651             }
2652         }
2653         if (*f == '%') {
2654             /* "%.3%s" => f points to "3" */
2655             f--;
2656         }
2657     }
2658     if (*f == '\0') {
2659         /* bogus format "%.123" => go backward, f points to "3" */
2660         f--;
2661     }
2662 
2663     /* Handle %ld, %lu, %lld and %llu. */
2664     longflag = 0;
2665     longlongflag = 0;
2666     size_tflag = 0;
2667     if (*f == 'l') {
2668         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2669             longflag = 1;
2670             ++f;
2671         }
2672         else if (f[1] == 'l' &&
2673                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2674             longlongflag = 1;
2675             f += 2;
2676         }
2677     }
2678     /* handle the size_t flag. */
2679     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2680         size_tflag = 1;
2681         ++f;
2682     }
2683 
2684     if (f[1] == '\0')
2685         writer->overallocate = 0;
2686 
2687     switch (*f) {
2688     case 'c':
2689     {
2690         int ordinal = va_arg(*vargs, int);
2691         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2692             PyErr_SetString(PyExc_OverflowError,
2693                             "character argument not in range(0x110000)");
2694             return NULL;
2695         }
2696         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2697             return NULL;
2698         break;
2699     }
2700 
2701     case 'i':
2702     case 'd':
2703     case 'u':
2704     case 'x':
2705     {
2706         /* used by sprintf */
2707         char buffer[MAX_LONG_LONG_CHARS];
2708         Py_ssize_t arglen;
2709 
2710         if (*f == 'u') {
2711             if (longflag)
2712                 len = sprintf(buffer, "%lu",
2713                         va_arg(*vargs, unsigned long));
2714             else if (longlongflag)
2715                 len = sprintf(buffer, "%llu",
2716                         va_arg(*vargs, unsigned long long));
2717             else if (size_tflag)
2718                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2719                         va_arg(*vargs, size_t));
2720             else
2721                 len = sprintf(buffer, "%u",
2722                         va_arg(*vargs, unsigned int));
2723         }
2724         else if (*f == 'x') {
2725             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2726         }
2727         else {
2728             if (longflag)
2729                 len = sprintf(buffer, "%li",
2730                         va_arg(*vargs, long));
2731             else if (longlongflag)
2732                 len = sprintf(buffer, "%lli",
2733                         va_arg(*vargs, long long));
2734             else if (size_tflag)
2735                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2736                         va_arg(*vargs, Py_ssize_t));
2737             else
2738                 len = sprintf(buffer, "%i",
2739                         va_arg(*vargs, int));
2740         }
2741         assert(len >= 0);
2742 
2743         if (precision < len)
2744             precision = len;
2745 
2746         arglen = Py_MAX(precision, width);
2747         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2748             return NULL;
2749 
2750         if (width > precision) {
2751             Py_UCS4 fillchar;
2752             fill = width - precision;
2753             fillchar = zeropad?'0':' ';
2754             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2755                 return NULL;
2756             writer->pos += fill;
2757         }
2758         if (precision > len) {
2759             fill = precision - len;
2760             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2761                 return NULL;
2762             writer->pos += fill;
2763         }
2764 
2765         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2766             return NULL;
2767         break;
2768     }
2769 
2770     case 'p':
2771     {
2772         char number[MAX_LONG_LONG_CHARS];
2773 
2774         len = sprintf(number, "%p", va_arg(*vargs, void*));
2775         assert(len >= 0);
2776 
2777         /* %p is ill-defined:  ensure leading 0x. */
2778         if (number[1] == 'X')
2779             number[1] = 'x';
2780         else if (number[1] != 'x') {
2781             memmove(number + 2, number,
2782                     strlen(number) + 1);
2783             number[0] = '0';
2784             number[1] = 'x';
2785             len += 2;
2786         }
2787 
2788         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2789             return NULL;
2790         break;
2791     }
2792 
2793     case 's':
2794     {
2795         /* UTF-8 */
2796         const char *s = va_arg(*vargs, const char*);
2797         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2798             return NULL;
2799         break;
2800     }
2801 
2802     case 'U':
2803     {
2804         PyObject *obj = va_arg(*vargs, PyObject *);
2805         assert(obj && _PyUnicode_CHECK(obj));
2806 
2807         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2808             return NULL;
2809         break;
2810     }
2811 
2812     case 'V':
2813     {
2814         PyObject *obj = va_arg(*vargs, PyObject *);
2815         const char *str = va_arg(*vargs, const char *);
2816         if (obj) {
2817             assert(_PyUnicode_CHECK(obj));
2818             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2819                 return NULL;
2820         }
2821         else {
2822             assert(str != NULL);
2823             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2824                 return NULL;
2825         }
2826         break;
2827     }
2828 
2829     case 'S':
2830     {
2831         PyObject *obj = va_arg(*vargs, PyObject *);
2832         PyObject *str;
2833         assert(obj);
2834         str = PyObject_Str(obj);
2835         if (!str)
2836             return NULL;
2837         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2838             Py_DECREF(str);
2839             return NULL;
2840         }
2841         Py_DECREF(str);
2842         break;
2843     }
2844 
2845     case 'R':
2846     {
2847         PyObject *obj = va_arg(*vargs, PyObject *);
2848         PyObject *repr;
2849         assert(obj);
2850         repr = PyObject_Repr(obj);
2851         if (!repr)
2852             return NULL;
2853         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2854             Py_DECREF(repr);
2855             return NULL;
2856         }
2857         Py_DECREF(repr);
2858         break;
2859     }
2860 
2861     case 'A':
2862     {
2863         PyObject *obj = va_arg(*vargs, PyObject *);
2864         PyObject *ascii;
2865         assert(obj);
2866         ascii = PyObject_ASCII(obj);
2867         if (!ascii)
2868             return NULL;
2869         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2870             Py_DECREF(ascii);
2871             return NULL;
2872         }
2873         Py_DECREF(ascii);
2874         break;
2875     }
2876 
2877     case '%':
2878         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2879             return NULL;
2880         break;
2881 
2882     default:
2883         /* if we stumble upon an unknown formatting code, copy the rest
2884            of the format string to the output string. (we cannot just
2885            skip the code, since there's no way to know what's in the
2886            argument list) */
2887         len = strlen(p);
2888         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2889             return NULL;
2890         f = p+len;
2891         return f;
2892     }
2893 
2894     f++;
2895     return f;
2896 }
2897 
2898 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2899 PyUnicode_FromFormatV(const char *format, va_list vargs)
2900 {
2901     va_list vargs2;
2902     const char *f;
2903     _PyUnicodeWriter writer;
2904 
2905     _PyUnicodeWriter_Init(&writer);
2906     writer.min_length = strlen(format) + 100;
2907     writer.overallocate = 1;
2908 
2909     // Copy varags to be able to pass a reference to a subfunction.
2910     va_copy(vargs2, vargs);
2911 
2912     for (f = format; *f; ) {
2913         if (*f == '%') {
2914             f = unicode_fromformat_arg(&writer, f, &vargs2);
2915             if (f == NULL)
2916                 goto fail;
2917         }
2918         else {
2919             const char *p;
2920             Py_ssize_t len;
2921 
2922             p = f;
2923             do
2924             {
2925                 if ((unsigned char)*p > 127) {
2926                     PyErr_Format(PyExc_ValueError,
2927                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2928                         "string, got a non-ASCII byte: 0x%02x",
2929                         (unsigned char)*p);
2930                     goto fail;
2931                 }
2932                 p++;
2933             }
2934             while (*p != '\0' && *p != '%');
2935             len = p - f;
2936 
2937             if (*p == '\0')
2938                 writer.overallocate = 0;
2939 
2940             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2941                 goto fail;
2942 
2943             f = p;
2944         }
2945     }
2946     va_end(vargs2);
2947     return _PyUnicodeWriter_Finish(&writer);
2948 
2949   fail:
2950     va_end(vargs2);
2951     _PyUnicodeWriter_Dealloc(&writer);
2952     return NULL;
2953 }
2954 
2955 PyObject *
PyUnicode_FromFormat(const char * format,...)2956 PyUnicode_FromFormat(const char *format, ...)
2957 {
2958     PyObject* ret;
2959     va_list vargs;
2960 
2961 #ifdef HAVE_STDARG_PROTOTYPES
2962     va_start(vargs, format);
2963 #else
2964     va_start(vargs);
2965 #endif
2966     ret = PyUnicode_FromFormatV(format, vargs);
2967     va_end(vargs);
2968     return ret;
2969 }
2970 
2971 #ifdef HAVE_WCHAR_H
2972 
2973 /* Convert a Unicode object to a wide character string.
2974 
2975    - If w is NULL: return the number of wide characters (including the null
2976      character) required to convert the unicode object. Ignore size argument.
2977 
2978    - Otherwise: return the number of wide characters (excluding the null
2979      character) written into w. Write at most size wide characters (including
2980      the null character). */
2981 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2982 PyUnicode_AsWideChar(PyObject *unicode,
2983                      wchar_t *w,
2984                      Py_ssize_t size)
2985 {
2986     Py_ssize_t res;
2987     const wchar_t *wstr;
2988 
2989     if (unicode == NULL) {
2990         PyErr_BadInternalCall();
2991         return -1;
2992     }
2993     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2994     if (wstr == NULL)
2995         return -1;
2996 
2997     if (w != NULL) {
2998         if (size > res)
2999             size = res + 1;
3000         else
3001             res = size;
3002         memcpy(w, wstr, size * sizeof(wchar_t));
3003         return res;
3004     }
3005     else
3006         return res + 1;
3007 }
3008 
3009 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3010 PyUnicode_AsWideCharString(PyObject *unicode,
3011                            Py_ssize_t *size)
3012 {
3013     const wchar_t *wstr;
3014     wchar_t *buffer;
3015     Py_ssize_t buflen;
3016 
3017     if (unicode == NULL) {
3018         PyErr_BadInternalCall();
3019         return NULL;
3020     }
3021 
3022     wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3023     if (wstr == NULL) {
3024         return NULL;
3025     }
3026     if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3027         PyErr_SetString(PyExc_ValueError,
3028                         "embedded null character");
3029         return NULL;
3030     }
3031 
3032     buffer = PyMem_NEW(wchar_t, buflen + 1);
3033     if (buffer == NULL) {
3034         PyErr_NoMemory();
3035         return NULL;
3036     }
3037     memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3038     if (size != NULL)
3039         *size = buflen;
3040     return buffer;
3041 }
3042 
3043 #endif /* HAVE_WCHAR_H */
3044 
3045 PyObject *
PyUnicode_FromOrdinal(int ordinal)3046 PyUnicode_FromOrdinal(int ordinal)
3047 {
3048     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3049         PyErr_SetString(PyExc_ValueError,
3050                         "chr() arg not in range(0x110000)");
3051         return NULL;
3052     }
3053 
3054     return unicode_char((Py_UCS4)ordinal);
3055 }
3056 
3057 PyObject *
PyUnicode_FromObject(PyObject * obj)3058 PyUnicode_FromObject(PyObject *obj)
3059 {
3060     /* XXX Perhaps we should make this API an alias of
3061        PyObject_Str() instead ?! */
3062     if (PyUnicode_CheckExact(obj)) {
3063         if (PyUnicode_READY(obj) == -1)
3064             return NULL;
3065         Py_INCREF(obj);
3066         return obj;
3067     }
3068     if (PyUnicode_Check(obj)) {
3069         /* For a Unicode subtype that's not a Unicode object,
3070            return a true Unicode object with the same data. */
3071         return _PyUnicode_Copy(obj);
3072     }
3073     PyErr_Format(PyExc_TypeError,
3074                  "Can't convert '%.100s' object to str implicitly",
3075                  Py_TYPE(obj)->tp_name);
3076     return NULL;
3077 }
3078 
3079 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3080 PyUnicode_FromEncodedObject(PyObject *obj,
3081                             const char *encoding,
3082                             const char *errors)
3083 {
3084     Py_buffer buffer;
3085     PyObject *v;
3086 
3087     if (obj == NULL) {
3088         PyErr_BadInternalCall();
3089         return NULL;
3090     }
3091 
3092     /* Decoding bytes objects is the most common case and should be fast */
3093     if (PyBytes_Check(obj)) {
3094         if (PyBytes_GET_SIZE(obj) == 0)
3095             _Py_RETURN_UNICODE_EMPTY();
3096         v = PyUnicode_Decode(
3097                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3098                 encoding, errors);
3099         return v;
3100     }
3101 
3102     if (PyUnicode_Check(obj)) {
3103         PyErr_SetString(PyExc_TypeError,
3104                         "decoding str is not supported");
3105         return NULL;
3106     }
3107 
3108     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3109     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3110         PyErr_Format(PyExc_TypeError,
3111                      "decoding to str: need a bytes-like object, %.80s found",
3112                      Py_TYPE(obj)->tp_name);
3113         return NULL;
3114     }
3115 
3116     if (buffer.len == 0) {
3117         PyBuffer_Release(&buffer);
3118         _Py_RETURN_UNICODE_EMPTY();
3119     }
3120 
3121     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3122     PyBuffer_Release(&buffer);
3123     return v;
3124 }
3125 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Each
   run of other characters is replaced by a single '_', except that a run
   at the very start of the name is dropped and a run at the end of the
   name produces nothing.  The result is always NUL-terminated on success.

   `lower` must have room for `lower_len` bytes.

   Return 1 on success, or 0 on error (the normalized name, including its
   terminating NUL, does not fit into `lower_len` bytes; this includes
   lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even the terminating NUL fits.  Without this check,
           lower_len - 1 would wrap around and the end pointer computed
           below would lie outside the buffer. */
        return 0;
    }

    const char *e;
    char *l = lower;
    char *l_end = &lower[lower_len - 1];    /* slot reserved for the NUL */
    int punct = 0;                          /* pending separator run? */

    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Separator: remember it, emit it lazily. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            /* Flush the pending separator, unless it would be leading. */
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3174 
3175 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3176 PyUnicode_Decode(const char *s,
3177                  Py_ssize_t size,
3178                  const char *encoding,
3179                  const char *errors)
3180 {
3181     PyObject *buffer = NULL, *unicode;
3182     Py_buffer info;
3183     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3184 
3185     if (encoding == NULL) {
3186         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187     }
3188 
3189     /* Shortcuts for common default encodings */
3190     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3191         char *lower = buflower;
3192 
3193         /* Fast paths */
3194         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3195             lower += 3;
3196             if (*lower == '_') {
3197                 /* Match "utf8" and "utf_8" */
3198                 lower++;
3199             }
3200 
3201             if (lower[0] == '8' && lower[1] == 0) {
3202                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3203             }
3204             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3205                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3206             }
3207             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3208                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3209             }
3210         }
3211         else {
3212             if (strcmp(lower, "ascii") == 0
3213                 || strcmp(lower, "us_ascii") == 0) {
3214                 return PyUnicode_DecodeASCII(s, size, errors);
3215             }
3216     #ifdef MS_WINDOWS
3217             else if (strcmp(lower, "mbcs") == 0) {
3218                 return PyUnicode_DecodeMBCS(s, size, errors);
3219             }
3220     #endif
3221             else if (strcmp(lower, "latin1") == 0
3222                      || strcmp(lower, "latin_1") == 0
3223                      || strcmp(lower, "iso_8859_1") == 0
3224                      || strcmp(lower, "iso8859_1") == 0) {
3225                 return PyUnicode_DecodeLatin1(s, size, errors);
3226             }
3227         }
3228     }
3229 
3230     /* Decode via the codec registry */
3231     buffer = NULL;
3232     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3233         goto onError;
3234     buffer = PyMemoryView_FromBuffer(&info);
3235     if (buffer == NULL)
3236         goto onError;
3237     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3238     if (unicode == NULL)
3239         goto onError;
3240     if (!PyUnicode_Check(unicode)) {
3241         PyErr_Format(PyExc_TypeError,
3242                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3243                      "use codecs.decode() to decode to arbitrary types",
3244                      encoding,
3245                      Py_TYPE(unicode)->tp_name);
3246         Py_DECREF(unicode);
3247         goto onError;
3248     }
3249     Py_DECREF(buffer);
3250     return unicode_result(unicode);
3251 
3252   onError:
3253     Py_XDECREF(buffer);
3254     return NULL;
3255 }
3256 
3257 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3258 PyUnicode_AsDecodedObject(PyObject *unicode,
3259                           const char *encoding,
3260                           const char *errors)
3261 {
3262     if (!PyUnicode_Check(unicode)) {
3263         PyErr_BadArgument();
3264         return NULL;
3265     }
3266 
3267     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3268                      "PyUnicode_AsDecodedObject() is deprecated; "
3269                      "use PyCodec_Decode() to decode from str", 1) < 0)
3270         return NULL;
3271 
3272     if (encoding == NULL)
3273         encoding = PyUnicode_GetDefaultEncoding();
3274 
3275     /* Decode via the codec registry */
3276     return PyCodec_Decode(unicode, encoding, errors);
3277 }
3278 
3279 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3280 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3281                            const char *encoding,
3282                            const char *errors)
3283 {
3284     PyObject *v;
3285 
3286     if (!PyUnicode_Check(unicode)) {
3287         PyErr_BadArgument();
3288         goto onError;
3289     }
3290 
3291     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3292                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3293                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3294         return NULL;
3295 
3296     if (encoding == NULL)
3297         encoding = PyUnicode_GetDefaultEncoding();
3298 
3299     /* Decode via the codec registry */
3300     v = PyCodec_Decode(unicode, encoding, errors);
3301     if (v == NULL)
3302         goto onError;
3303     if (!PyUnicode_Check(v)) {
3304         PyErr_Format(PyExc_TypeError,
3305                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3306                      "use codecs.decode() to decode to arbitrary types",
3307                      encoding,
3308                      Py_TYPE(unicode)->tp_name);
3309         Py_DECREF(v);
3310         goto onError;
3311     }
3312     return unicode_result(v);
3313 
3314   onError:
3315     return NULL;
3316 }
3317 
3318 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3319 PyUnicode_Encode(const Py_UNICODE *s,
3320                  Py_ssize_t size,
3321                  const char *encoding,
3322                  const char *errors)
3323 {
3324     PyObject *v, *unicode;
3325 
3326     unicode = PyUnicode_FromWideChar(s, size);
3327     if (unicode == NULL)
3328         return NULL;
3329     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3330     Py_DECREF(unicode);
3331     return v;
3332 }
3333 
3334 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3335 PyUnicode_AsEncodedObject(PyObject *unicode,
3336                           const char *encoding,
3337                           const char *errors)
3338 {
3339     PyObject *v;
3340 
3341     if (!PyUnicode_Check(unicode)) {
3342         PyErr_BadArgument();
3343         goto onError;
3344     }
3345 
3346     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3347                      "PyUnicode_AsEncodedObject() is deprecated; "
3348                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3349                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3350         return NULL;
3351 
3352     if (encoding == NULL)
3353         encoding = PyUnicode_GetDefaultEncoding();
3354 
3355     /* Encode via the codec registry */
3356     v = PyCodec_Encode(unicode, encoding, errors);
3357     if (v == NULL)
3358         goto onError;
3359     return v;
3360 
3361   onError:
3362     return NULL;
3363 }
3364 
3365 static int
locale_error_handler(const char * errors,int * surrogateescape)3366 locale_error_handler(const char *errors, int *surrogateescape)
3367 {
3368     _Py_error_handler error_handler = get_error_handler(errors);
3369     switch (error_handler)
3370     {
3371     case _Py_ERROR_STRICT:
3372         *surrogateescape = 0;
3373         return 0;
3374     case _Py_ERROR_SURROGATEESCAPE:
3375         *surrogateescape = 1;
3376         return 0;
3377     default:
3378         PyErr_Format(PyExc_ValueError,
3379                      "only 'strict' and 'surrogateescape' error handlers "
3380                      "are supported, not '%s'",
3381                      errors);
3382         return -1;
3383     }
3384 }
3385 
3386 static PyObject *
unicode_encode_locale(PyObject * unicode,const char * errors,int current_locale)3387 unicode_encode_locale(PyObject *unicode, const char *errors,
3388                       int current_locale)
3389 {
3390     int surrogateescape;
3391     if (locale_error_handler(errors, &surrogateescape) < 0)
3392         return NULL;
3393 
3394     Py_ssize_t wlen;
3395     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3396     if (wstr == NULL) {
3397         return NULL;
3398     }
3399 
3400     if ((size_t)wlen != wcslen(wstr)) {
3401         PyErr_SetString(PyExc_ValueError, "embedded null character");
3402         PyMem_Free(wstr);
3403         return NULL;
3404     }
3405 
3406     char *str;
3407     size_t error_pos;
3408     const char *reason;
3409     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3410                                  current_locale, surrogateescape);
3411     PyMem_Free(wstr);
3412 
3413     if (res != 0) {
3414         if (res == -2) {
3415             PyObject *exc;
3416             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3417                     "locale", unicode,
3418                     (Py_ssize_t)error_pos,
3419                     (Py_ssize_t)(error_pos+1),
3420                     reason);
3421             if (exc != NULL) {
3422                 PyCodec_StrictErrors(exc);
3423                 Py_DECREF(exc);
3424             }
3425         }
3426         else {
3427             PyErr_NoMemory();
3428         }
3429         return NULL;
3430     }
3431 
3432     PyObject *bytes = PyBytes_FromString(str);
3433     PyMem_RawFree(str);
3434     return bytes;
3435 }
3436 
3437 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3439 {
3440     return unicode_encode_locale(unicode, errors, 1);
3441 }
3442 
3443 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3444 PyUnicode_EncodeFSDefault(PyObject *unicode)
3445 {
3446 #if defined(__APPLE__)
3447     return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3448 #else
3449     PyInterpreterState *interp = PyThreadState_GET()->interp;
3450     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3451        cannot use it to encode and decode filenames before it is loaded. Load
3452        the Python codec requires to encode at least its own filename. Use the C
3453        version of the locale codec until the codec registry is initialized and
3454        the Python codec is loaded.
3455 
3456        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3457        cannot only rely on it: check also interp->fscodec_initialized for
3458        subinterpreters. */
3459     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3460         return PyUnicode_AsEncodedString(unicode,
3461                                          Py_FileSystemDefaultEncoding,
3462                                          Py_FileSystemDefaultEncodeErrors);
3463     }
3464     else {
3465         return unicode_encode_locale(unicode,
3466                                      Py_FileSystemDefaultEncodeErrors, 0);
3467     }
3468 #endif
3469 }
3470 
3471 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3472 PyUnicode_AsEncodedString(PyObject *unicode,
3473                           const char *encoding,
3474                           const char *errors)
3475 {
3476     PyObject *v;
3477     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3478 
3479     if (!PyUnicode_Check(unicode)) {
3480         PyErr_BadArgument();
3481         return NULL;
3482     }
3483 
3484     if (encoding == NULL) {
3485         return _PyUnicode_AsUTF8String(unicode, errors);
3486     }
3487 
3488     /* Shortcuts for common default encodings */
3489     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3490         char *lower = buflower;
3491 
3492         /* Fast paths */
3493         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3494             lower += 3;
3495             if (*lower == '_') {
3496                 /* Match "utf8" and "utf_8" */
3497                 lower++;
3498             }
3499 
3500             if (lower[0] == '8' && lower[1] == 0) {
3501                 return _PyUnicode_AsUTF8String(unicode, errors);
3502             }
3503             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3504                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3505             }
3506             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3507                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3508             }
3509         }
3510         else {
3511             if (strcmp(lower, "ascii") == 0
3512                 || strcmp(lower, "us_ascii") == 0) {
3513                 return _PyUnicode_AsASCIIString(unicode, errors);
3514             }
3515 #ifdef MS_WINDOWS
3516             else if (strcmp(lower, "mbcs") == 0) {
3517                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3518             }
3519 #endif
3520             else if (strcmp(lower, "latin1") == 0 ||
3521                      strcmp(lower, "latin_1") == 0 ||
3522                      strcmp(lower, "iso_8859_1") == 0 ||
3523                      strcmp(lower, "iso8859_1") == 0) {
3524                 return _PyUnicode_AsLatin1String(unicode, errors);
3525             }
3526         }
3527     }
3528 
3529     /* Encode via the codec registry */
3530     v = _PyCodec_EncodeText(unicode, encoding, errors);
3531     if (v == NULL)
3532         return NULL;
3533 
3534     /* The normal path */
3535     if (PyBytes_Check(v))
3536         return v;
3537 
3538     /* If the codec returns a buffer, raise a warning and convert to bytes */
3539     if (PyByteArray_Check(v)) {
3540         int error;
3541         PyObject *b;
3542 
3543         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3544             "encoder %s returned bytearray instead of bytes; "
3545             "use codecs.encode() to encode to arbitrary types",
3546             encoding);
3547         if (error) {
3548             Py_DECREF(v);
3549             return NULL;
3550         }
3551 
3552         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3553                                       PyByteArray_GET_SIZE(v));
3554         Py_DECREF(v);
3555         return b;
3556     }
3557 
3558     PyErr_Format(PyExc_TypeError,
3559                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3560                  "use codecs.encode() to encode to arbitrary types",
3561                  encoding,
3562                  Py_TYPE(v)->tp_name);
3563     Py_DECREF(v);
3564     return NULL;
3565 }
3566 
3567 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3568 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3569                            const char *encoding,
3570                            const char *errors)
3571 {
3572     PyObject *v;
3573 
3574     if (!PyUnicode_Check(unicode)) {
3575         PyErr_BadArgument();
3576         goto onError;
3577     }
3578 
3579     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3580                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3581                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3582         return NULL;
3583 
3584     if (encoding == NULL)
3585         encoding = PyUnicode_GetDefaultEncoding();
3586 
3587     /* Encode via the codec registry */
3588     v = PyCodec_Encode(unicode, encoding, errors);
3589     if (v == NULL)
3590         goto onError;
3591     if (!PyUnicode_Check(v)) {
3592         PyErr_Format(PyExc_TypeError,
3593                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3594                      "use codecs.encode() to encode to arbitrary types",
3595                      encoding,
3596                      Py_TYPE(v)->tp_name);
3597         Py_DECREF(v);
3598         goto onError;
3599     }
3600     return v;
3601 
3602   onError:
3603     return NULL;
3604 }
3605 
3606 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,const char * errors,int current_locale)3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3608                       int current_locale)
3609 {
3610     int surrogateescape;
3611     if (locale_error_handler(errors, &surrogateescape) < 0)
3612         return NULL;
3613 
3614     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3615         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3616         return NULL;
3617     }
3618 
3619     wchar_t *wstr;
3620     size_t wlen;
3621     const char *reason;
3622     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3623                                  current_locale, surrogateescape);
3624     if (res != 0) {
3625         if (res == -2) {
3626             PyObject *exc;
3627             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3628                                         "locale", str, len,
3629                                         (Py_ssize_t)wlen,
3630                                         (Py_ssize_t)(wlen + 1),
3631                                         reason);
3632             if (exc != NULL) {
3633                 PyCodec_StrictErrors(exc);
3634                 Py_DECREF(exc);
3635             }
3636         }
3637         else {
3638             PyErr_NoMemory();
3639         }
3640         return NULL;
3641     }
3642 
3643     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3644     PyMem_RawFree(wstr);
3645     return unicode;
3646 }
3647 
3648 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3650                               const char *errors)
3651 {
3652     return unicode_decode_locale(str, len, errors, 1);
3653 }
3654 
3655 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3656 PyUnicode_DecodeLocale(const char *str, const char *errors)
3657 {
3658     Py_ssize_t size = (Py_ssize_t)strlen(str);
3659     return unicode_decode_locale(str, size, errors, 1);
3660 }
3661 
3662 
3663 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3664 PyUnicode_DecodeFSDefault(const char *s) {
3665     Py_ssize_t size = (Py_ssize_t)strlen(s);
3666     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3667 }
3668 
3669 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3671 {
3672 #if defined(__APPLE__)
3673     return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3674 #else
3675     PyInterpreterState *interp = PyThreadState_GET()->interp;
3676     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3677        cannot use it to encode and decode filenames before it is loaded. Load
3678        the Python codec requires to encode at least its own filename. Use the C
3679        version of the locale codec until the codec registry is initialized and
3680        the Python codec is loaded.
3681 
3682        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3683        cannot only rely on it: check also interp->fscodec_initialized for
3684        subinterpreters. */
3685     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3686         return PyUnicode_Decode(s, size,
3687                                 Py_FileSystemDefaultEncoding,
3688                                 Py_FileSystemDefaultEncodeErrors);
3689     }
3690     else {
3691         return unicode_decode_locale(s, size,
3692                                      Py_FileSystemDefaultEncodeErrors, 0);
3693     }
3694 #endif
3695 }
3696 
3697 
3698 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3699 PyUnicode_FSConverter(PyObject* arg, void* addr)
3700 {
3701     PyObject *path = NULL;
3702     PyObject *output = NULL;
3703     Py_ssize_t size;
3704     void *data;
3705     if (arg == NULL) {
3706         Py_DECREF(*(PyObject**)addr);
3707         *(PyObject**)addr = NULL;
3708         return 1;
3709     }
3710     path = PyOS_FSPath(arg);
3711     if (path == NULL) {
3712         return 0;
3713     }
3714     if (PyBytes_Check(path)) {
3715         output = path;
3716     }
3717     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3718         output = PyUnicode_EncodeFSDefault(path);
3719         Py_DECREF(path);
3720         if (!output) {
3721             return 0;
3722         }
3723         assert(PyBytes_Check(output));
3724     }
3725 
3726     size = PyBytes_GET_SIZE(output);
3727     data = PyBytes_AS_STRING(output);
3728     if ((size_t)size != strlen(data)) {
3729         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3730         Py_DECREF(output);
3731         return 0;
3732     }
3733     *(PyObject**)addr = output;
3734     return Py_CLEANUP_SUPPORTED;
3735 }
3736 
3737 
3738 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3739 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3740 {
3741     int is_buffer = 0;
3742     PyObject *path = NULL;
3743     PyObject *output = NULL;
3744     if (arg == NULL) {
3745         Py_DECREF(*(PyObject**)addr);
3746         *(PyObject**)addr = NULL;
3747         return 1;
3748     }
3749 
3750     is_buffer = PyObject_CheckBuffer(arg);
3751     if (!is_buffer) {
3752         path = PyOS_FSPath(arg);
3753         if (path == NULL) {
3754             return 0;
3755         }
3756     }
3757     else {
3758         path = arg;
3759         Py_INCREF(arg);
3760     }
3761 
3762     if (PyUnicode_Check(path)) {
3763         if (PyUnicode_READY(path) == -1) {
3764             Py_DECREF(path);
3765             return 0;
3766         }
3767         output = path;
3768     }
3769     else if (PyBytes_Check(path) || is_buffer) {
3770         PyObject *path_bytes = NULL;
3771 
3772         if (!PyBytes_Check(path) &&
3773             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3774             "path should be string, bytes, or os.PathLike, not %.200s",
3775             Py_TYPE(arg)->tp_name)) {
3776                 Py_DECREF(path);
3777             return 0;
3778         }
3779         path_bytes = PyBytes_FromObject(path);
3780         Py_DECREF(path);
3781         if (!path_bytes) {
3782             return 0;
3783         }
3784         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3785                                                   PyBytes_GET_SIZE(path_bytes));
3786         Py_DECREF(path_bytes);
3787         if (!output) {
3788             return 0;
3789         }
3790     }
3791     else {
3792         PyErr_Format(PyExc_TypeError,
3793                      "path should be string, bytes, or os.PathLike, not %.200s",
3794                      Py_TYPE(arg)->tp_name);
3795         Py_DECREF(path);
3796         return 0;
3797     }
3798     if (PyUnicode_READY(output) == -1) {
3799         Py_DECREF(output);
3800         return 0;
3801     }
3802     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3803                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3804         PyErr_SetString(PyExc_ValueError, "embedded null character");
3805         Py_DECREF(output);
3806         return 0;
3807     }
3808     *(PyObject**)addr = output;
3809     return Py_CLEANUP_SUPPORTED;
3810 }
3811 
3812 
3813 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3815 {
3816     PyObject *bytes;
3817 
3818     if (!PyUnicode_Check(unicode)) {
3819         PyErr_BadArgument();
3820         return NULL;
3821     }
3822     if (PyUnicode_READY(unicode) == -1)
3823         return NULL;
3824 
3825     if (PyUnicode_UTF8(unicode) == NULL) {
3826         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3827         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3828         if (bytes == NULL)
3829             return NULL;
3830         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3831         if (_PyUnicode_UTF8(unicode) == NULL) {
3832             PyErr_NoMemory();
3833             Py_DECREF(bytes);
3834             return NULL;
3835         }
3836         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3837         memcpy(_PyUnicode_UTF8(unicode),
3838                   PyBytes_AS_STRING(bytes),
3839                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
3840         Py_DECREF(bytes);
3841     }
3842 
3843     if (psize)
3844         *psize = PyUnicode_UTF8_LENGTH(unicode);
3845     return PyUnicode_UTF8(unicode);
3846 }
3847 
3848 const char *
PyUnicode_AsUTF8(PyObject * unicode)3849 PyUnicode_AsUTF8(PyObject *unicode)
3850 {
3851     return PyUnicode_AsUTF8AndSize(unicode, NULL);
3852 }
3853 
/* Return the wchar_t (Py_UNICODE) representation of unicode and, when size
   is not NULL, store its length in wchar_t units there.

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the object's character data and stored in
   the object, so the returned pointer refers to memory held by the unicode
   object.  The buffer is always NUL-terminated.

   How the buffer is filled depends on the character kind and on
   SIZEOF_WCHAR_T:
   - 4-byte kind with 2-byte wchar_t: code points above 0xFFFF are written
     as surrogate pairs, so the buffer may be longer than the string;
   - 1-byte kind: every byte is widened to one wchar_t;
   - 2-byte kind with 4-byte wchar_t: every unit is widened to one wchar_t;
   - the remaining combinations are treated as impossible states and abort
     via Py_FatalError().

   Returns NULL with an exception set if unicode is not a str or if memory
   cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: count the code points that need two wchar_t. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One slot per code point, one extra per surrogate pair, plus
               one for the terminating NUL. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            /* Second pass: rewind the source pointer and convert. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check: the write position must stay inside the
                   buffer sized by the first pass. */
                if (w > wchar_end) {
                    Py_UNREACHABLE();
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the byte-size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the wstr length is not stored for compact ASCII
               objects; presumably they have no separate slot for it --
               confirm against the object layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                Py_UNREACHABLE();
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3970 
3971 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)3972 PyUnicode_AsUnicode(PyObject *unicode)
3973 {
3974     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3975 }
3976 
3977 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)3978 _PyUnicode_AsUnicode(PyObject *unicode)
3979 {
3980     Py_ssize_t size;
3981     const Py_UNICODE *wstr;
3982 
3983     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3984     if (wstr && wcslen(wstr) != (size_t)size) {
3985         PyErr_SetString(PyExc_ValueError, "embedded null character");
3986         return NULL;
3987     }
3988     return wstr;
3989 }
3990 
3991 
3992 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)3993 PyUnicode_GetSize(PyObject *unicode)
3994 {
3995     if (!PyUnicode_Check(unicode)) {
3996         PyErr_BadArgument();
3997         goto onError;
3998     }
3999     if (_PyUnicode_WSTR(unicode) == NULL) {
4000         if (PyUnicode_AsUnicode(unicode) == NULL)
4001             goto onError;
4002     }
4003     return PyUnicode_WSTR_LENGTH(unicode);
4004 
4005   onError:
4006     return -1;
4007 }
4008 
4009 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4010 PyUnicode_GetLength(PyObject *unicode)
4011 {
4012     if (!PyUnicode_Check(unicode)) {
4013         PyErr_BadArgument();
4014         return -1;
4015     }
4016     if (PyUnicode_READY(unicode) == -1)
4017         return -1;
4018     return PyUnicode_GET_LENGTH(unicode);
4019 }
4020 
4021 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4023 {
4024     void *data;
4025     int kind;
4026 
4027     if (!PyUnicode_Check(unicode)) {
4028         PyErr_BadArgument();
4029         return (Py_UCS4)-1;
4030     }
4031     if (PyUnicode_READY(unicode) == -1) {
4032         return (Py_UCS4)-1;
4033     }
4034     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4035         PyErr_SetString(PyExc_IndexError, "string index out of range");
4036         return (Py_UCS4)-1;
4037     }
4038     data = PyUnicode_DATA(unicode);
4039     kind = PyUnicode_KIND(unicode);
4040     return PyUnicode_READ(kind, data, index);
4041 }
4042 
4043 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4045 {
4046     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4047         PyErr_BadArgument();
4048         return -1;
4049     }
4050     assert(PyUnicode_IS_READY(unicode));
4051     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4052         PyErr_SetString(PyExc_IndexError, "string index out of range");
4053         return -1;
4054     }
4055     if (unicode_check_modifiable(unicode))
4056         return -1;
4057     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4058         PyErr_SetString(PyExc_ValueError, "character out of range");
4059         return -1;
4060     }
4061     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4062                     index, ch);
4063     return 0;
4064 }
4065 
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4071 
4072 /* create or adjust a UnicodeDecodeError */
4073 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4074 make_decode_exception(PyObject **exceptionObject,
4075                       const char *encoding,
4076                       const char *input, Py_ssize_t length,
4077                       Py_ssize_t startpos, Py_ssize_t endpos,
4078                       const char *reason)
4079 {
4080     if (*exceptionObject == NULL) {
4081         *exceptionObject = PyUnicodeDecodeError_Create(
4082             encoding, input, length, startpos, endpos, reason);
4083     }
4084     else {
4085         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4086             goto onError;
4087         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4088             goto onError;
4089         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4090             goto onError;
4091     }
4092     return;
4093 
4094 onError:
4095     Py_CLEAR(*exceptionObject);
4096 }
4097 
4098 #ifdef MS_WINDOWS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: name of the error handler and a cache slot for the
       looked-up callable; the slot is filled on first use.
   encoding, reason: passed to the UnicodeDecodeError.
   input, inend: start and end of the input bytes; both are refreshed from
       the exception object after the callback returns.
   startinpos, endinpos: offsets of the undecodable range; *endinpos is set
       to the position returned by the callback.
   exceptionObject: cache slot for the UnicodeDecodeError instance.
   inptr: receives the input position where decoding should resume.
   output, outpos: the wstr-based output string (resized if needed) and the
       current write offset, advanced past the replacement.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message, which is also reused directly below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    /* Look the handler up once and cache it for later errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to get just the message text. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least double the current size when that cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4209 #endif   /* MS_WINDOWS */
4210 
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter.

   Builds (or updates) the UnicodeDecodeError, calls the error handler,
   validates its (str, int) result, writes the replacement string to the
   writer and updates the decoder's input state.

   errors / errorHandler: name of the error handler and a cache slot for the
       looked-up callable; the slot is filled on first use.
   encoding, reason: passed to the UnicodeDecodeError.
   input, inend: start and end of the input bytes; both are refreshed from
       the exception object after the callback returns.
   startinpos, endinpos: offsets of the undecodable range; *endinpos is set
       to the position returned by the callback.
   exceptionObject: cache slot for the UnicodeDecodeError instance.
   inptr: receives the input position where decoding should resume.
   writer: output writer; its min_length may be raised and it may be
       switched to overallocation.

   Returns 0 on success, -1 with an exception set on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message, which is also reused directly below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once and cache it for later errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the "Un;" prefix to get just the message text. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes after the error, measured against the OLD
       input; this must be computed before *input and *inend are replaced. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* A replacement longer than one character needs extra room. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input is left to decode than before the callback ran. */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4310 
4311 /* --- UTF-7 Codec -------------------------------------------------------- */
4312 
4313 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4314 
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))

/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   everything else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                          \
    ((c) == '+' ? 62 :                                          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :              \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                   \
     63)

/* What is the base-64 character for the bottom 6 bits of n? */

#define TO_BASE64(n)                                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4337 
/* DECODE_DIRECT: should this byte, found in a UTF-7 string outside a base64
 * section, be decoded as itself?  Decoding is permissive: every ASCII byte
 * (0-127) decodes to itself except '+', which begins a base64 string.
 * The argument may be evaluated more than once. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4345 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code (0-127):
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4379 
/* ENCODE_DIRECT: should character c be encoded as itself?  Only ASCII
 * characters other than NUL qualify.  Set D characters are always encoded
 * directly; Set O characters only when directO is true, and whitespace
 * only when directWS is true.  RFC2152 makes it clear that these choices
 * vary between applications, so the code needs to be flexible.
 * The arguments may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
4391 
4392 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4393 PyUnicode_DecodeUTF7(const char *s,
4394                      Py_ssize_t size,
4395                      const char *errors)
4396 {
4397     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4398 }
4399 
4400 /* The decoder.  The only state we preserve is our read position,
4401  * i.e. how many characters we have consumed.  So if we end in the
4402  * middle of a shift sequence we have to back off the read position
4403  * and the output to the beginning of the sequence, otherwise we lose
4404  * all the shift state (seen bits, number of bits seen, high
4405  * surrogate). */
4406 
4407 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4408 PyUnicode_DecodeUTF7Stateful(const char *s,
4409                              Py_ssize_t size,
4410                              const char *errors,
4411                              Py_ssize_t *consumed)
4412 {
4413     const char *starts = s;
4414     Py_ssize_t startinpos;
4415     Py_ssize_t endinpos;
4416     const char *e;
4417     _PyUnicodeWriter writer;
4418     const char *errmsg = "";
4419     int inShift = 0;
4420     Py_ssize_t shiftOutStart;
4421     unsigned int base64bits = 0;
4422     unsigned long base64buffer = 0;
4423     Py_UCS4 surrogate = 0;
4424     PyObject *errorHandler = NULL;
4425     PyObject *exc = NULL;
4426 
4427     if (size == 0) {
4428         if (consumed)
4429             *consumed = 0;
4430         _Py_RETURN_UNICODE_EMPTY();
4431     }
4432 
4433     /* Start off assuming it's all ASCII. Widen later as necessary. */
4434     _PyUnicodeWriter_Init(&writer);
4435     writer.min_length = size;
4436 
4437     shiftOutStart = 0;
4438     e = s + size;
4439 
4440     while (s < e) {
4441         Py_UCS4 ch;
4442       restart:
4443         ch = (unsigned char) *s;
4444 
4445         if (inShift) { /* in a base-64 section */
4446             if (IS_BASE64(ch)) { /* consume a base-64 character */
4447                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4448                 base64bits += 6;
4449                 s++;
4450                 if (base64bits >= 16) {
4451                     /* we have enough bits for a UTF-16 value */
4452                     Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4453                     base64bits -= 16;
4454                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4455                     assert(outCh <= 0xffff);
4456                     if (surrogate) {
4457                         /* expecting a second surrogate */
4458                         if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4459                             Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4460                             if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4461                                 goto onError;
4462                             surrogate = 0;
4463                             continue;
4464                         }
4465                         else {
4466                             if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4467                                 goto onError;
4468                             surrogate = 0;
4469                         }
4470                     }
4471                     if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4472                         /* first surrogate */
4473                         surrogate = outCh;
4474                     }
4475                     else {
4476                         if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4477                             goto onError;
4478                     }
4479                 }
4480             }
4481             else { /* now leaving a base-64 section */
4482                 inShift = 0;
4483                 if (base64bits > 0) { /* left-over bits */
4484                     if (base64bits >= 6) {
4485                         /* We've seen at least one base-64 character */
4486                         s++;
4487                         errmsg = "partial character in shift sequence";
4488                         goto utf7Error;
4489                     }
4490                     else {
4491                         /* Some bits remain; they should be zero */
4492                         if (base64buffer != 0) {
4493                             s++;
4494                             errmsg = "non-zero padding bits in shift sequence";
4495                             goto utf7Error;
4496                         }
4497                     }
4498                 }
4499                 if (surrogate && DECODE_DIRECT(ch)) {
4500                     if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4501                         goto onError;
4502                 }
4503                 surrogate = 0;
4504                 if (ch == '-') {
4505                     /* '-' is absorbed; other terminating
4506                        characters are preserved */
4507                     s++;
4508                 }
4509             }
4510         }
4511         else if ( ch == '+' ) {
4512             startinpos = s-starts;
4513             s++; /* consume '+' */
4514             if (s < e && *s == '-') { /* '+-' encodes '+' */
4515                 s++;
4516                 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4517                     goto onError;
4518             }
4519             else { /* begin base64-encoded section */
4520                 inShift = 1;
4521                 surrogate = 0;
4522                 shiftOutStart = writer.pos;
4523                 base64bits = 0;
4524                 base64buffer = 0;
4525             }
4526         }
4527         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4528             s++;
4529             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4530                 goto onError;
4531         }
4532         else {
4533             startinpos = s-starts;
4534             s++;
4535             errmsg = "unexpected special character";
4536             goto utf7Error;
4537         }
4538         continue;
4539 utf7Error:
4540         endinpos = s-starts;
4541         if (unicode_decode_call_errorhandler_writer(
4542                 errors, &errorHandler,
4543                 "utf7", errmsg,
4544                 &starts, &e, &startinpos, &endinpos, &exc, &s,
4545                 &writer))
4546             goto onError;
4547     }
4548 
4549     /* end of string */
4550 
4551     if (inShift && !consumed) { /* in shift sequence, no more to follow */
4552         /* if we're in an inconsistent state, that's an error */
4553         inShift = 0;
4554         if (surrogate ||
4555                 (base64bits >= 6) ||
4556                 (base64bits > 0 && base64buffer != 0)) {
4557             endinpos = size;
4558             if (unicode_decode_call_errorhandler_writer(
4559                     errors, &errorHandler,
4560                     "utf7", "unterminated shift sequence",
4561                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4562                     &writer))
4563                 goto onError;
4564             if (s < e)
4565                 goto restart;
4566         }
4567     }
4568 
4569     /* return state */
4570     if (consumed) {
4571         if (inShift) {
4572             *consumed = startinpos;
4573             if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4574                 PyObject *result = PyUnicode_FromKindAndData(
4575                         writer.kind, writer.data, shiftOutStart);
4576                 Py_XDECREF(errorHandler);
4577                 Py_XDECREF(exc);
4578                 _PyUnicodeWriter_Dealloc(&writer);
4579                 return result;
4580             }
4581             writer.pos = shiftOutStart; /* back off output */
4582         }
4583         else {
4584             *consumed = s-starts;
4585         }
4586     }
4587 
4588     Py_XDECREF(errorHandler);
4589     Py_XDECREF(exc);
4590     return _PyUnicodeWriter_Finish(&writer);
4591 
4592   onError:
4593     Py_XDECREF(errorHandler);
4594     Py_XDECREF(exc);
4595     _PyUnicodeWriter_Dealloc(&writer);
4596     return NULL;
4597 }
4598 
4599 
4600 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4601 _PyUnicode_EncodeUTF7(PyObject *str,
4602                       int base64SetO,
4603                       int base64WhiteSpace,
4604                       const char *errors)
4605 {
4606     int kind;
4607     void *data;
4608     Py_ssize_t len;
4609     PyObject *v;
4610     int inShift = 0;
4611     Py_ssize_t i;
4612     unsigned int base64bits = 0;
4613     unsigned long base64buffer = 0;
4614     char * out;
4615     char * start;
4616 
4617     if (PyUnicode_READY(str) == -1)
4618         return NULL;
4619     kind = PyUnicode_KIND(str);
4620     data = PyUnicode_DATA(str);
4621     len = PyUnicode_GET_LENGTH(str);
4622 
4623     if (len == 0)
4624         return PyBytes_FromStringAndSize(NULL, 0);
4625 
4626     /* It might be possible to tighten this worst case */
4627     if (len > PY_SSIZE_T_MAX / 8)
4628         return PyErr_NoMemory();
4629     v = PyBytes_FromStringAndSize(NULL, len * 8);
4630     if (v == NULL)
4631         return NULL;
4632 
4633     start = out = PyBytes_AS_STRING(v);
4634     for (i = 0; i < len; ++i) {
4635         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4636 
4637         if (inShift) {
4638             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4639                 /* shifting out */
4640                 if (base64bits) { /* output remaining bits */
4641                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4642                     base64buffer = 0;
4643                     base64bits = 0;
4644                 }
4645                 inShift = 0;
4646                 /* Characters not in the BASE64 set implicitly unshift the sequence
4647                    so no '-' is required, except if the character is itself a '-' */
4648                 if (IS_BASE64(ch) || ch == '-') {
4649                     *out++ = '-';
4650                 }
4651                 *out++ = (char) ch;
4652             }
4653             else {
4654                 goto encode_char;
4655             }
4656         }
4657         else { /* not in a shift sequence */
4658             if (ch == '+') {
4659                 *out++ = '+';
4660                         *out++ = '-';
4661             }
4662             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4663                 *out++ = (char) ch;
4664             }
4665             else {
4666                 *out++ = '+';
4667                 inShift = 1;
4668                 goto encode_char;
4669             }
4670         }
4671         continue;
4672 encode_char:
4673         if (ch >= 0x10000) {
4674             assert(ch <= MAX_UNICODE);
4675 
4676             /* code first surrogate */
4677             base64bits += 16;
4678             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4679             while (base64bits >= 6) {
4680                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4681                 base64bits -= 6;
4682             }
4683             /* prepare second surrogate */
4684             ch = Py_UNICODE_LOW_SURROGATE(ch);
4685         }
4686         base64bits += 16;
4687         base64buffer = (base64buffer << 16) | ch;
4688         while (base64bits >= 6) {
4689             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4690             base64bits -= 6;
4691         }
4692     }
4693     if (base64bits)
4694         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4695     if (inShift)
4696         *out++ = '-';
4697     if (_PyBytes_Resize(&v, out - start) < 0)
4698         return NULL;
4699     return v;
4700 }
4701 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4702 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4703                      Py_ssize_t size,
4704                      int base64SetO,
4705                      int base64WhiteSpace,
4706                      const char *errors)
4707 {
4708     PyObject *result;
4709     PyObject *tmp = PyUnicode_FromWideChar(s, size);
4710     if (tmp == NULL)
4711         return NULL;
4712     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4713                                    base64WhiteSpace, errors);
4714     Py_DECREF(tmp);
4715     return result;
4716 }
4717 
4718 #undef IS_BASE64
4719 #undef FROM_BASE64
4720 #undef TO_BASE64
4721 #undef DECODE_DIRECT
4722 #undef ENCODE_DIRECT
4723 
4724 /* --- UTF-8 Codec -------------------------------------------------------- */
4725 
4726 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4727 PyUnicode_DecodeUTF8(const char *s,
4728                      Py_ssize_t size,
4729                      const char *errors)
4730 {
4731     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4732 }
4733 
4734 #include "stringlib/asciilib.h"
4735 #include "stringlib/codecs.h"
4736 #include "stringlib/undef.h"
4737 
4738 #include "stringlib/ucs1lib.h"
4739 #include "stringlib/codecs.h"
4740 #include "stringlib/undef.h"
4741 
4742 #include "stringlib/ucs2lib.h"
4743 #include "stringlib/codecs.h"
4744 #include "stringlib/undef.h"
4745 
4746 #include "stringlib/ucs4lib.h"
4747 #include "stringlib/codecs.h"
4748 #include "stringlib/undef.h"
4749 
4750 /* Mask to quickly check whether a C 'long' contains a
4751    non-ASCII, UTF8-encoded char. */
4752 #if (SIZEOF_LONG == 8)
4753 # define ASCII_CHAR_MASK 0x8080808080808080UL
4754 #elif (SIZEOF_LONG == 4)
4755 # define ASCII_CHAR_MASK 0x80808080UL
4756 #else
4757 # error C 'long' size should be either 4 or 8!
4758 #endif
4759 
4760 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4761 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4762 {
4763     const char *p = start;
4764     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4765 
4766     /*
4767      * Issue #17237: m68k is a bit different from most architectures in
4768      * that objects do not use "natural alignment" - for example, int and
4769      * long are only aligned at 2-byte boundaries.  Therefore the assert()
4770      * won't work; also, tests have shown that skipping the "optimised
4771      * version" will even speed up m68k.
4772      */
4773 #if !defined(__m68k__)
4774 #if SIZEOF_LONG <= SIZEOF_VOID_P
4775     assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4776     if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4777         /* Fast path, see in STRINGLIB(utf8_decode) for
4778            an explanation. */
4779         /* Help allocation */
4780         const char *_p = p;
4781         Py_UCS1 * q = dest;
4782         while (_p < aligned_end) {
4783             unsigned long value = *(const unsigned long *) _p;
4784             if (value & ASCII_CHAR_MASK)
4785                 break;
4786             *((unsigned long *)q) = value;
4787             _p += SIZEOF_LONG;
4788             q += SIZEOF_LONG;
4789         }
4790         p = _p;
4791         while (p < end) {
4792             if ((unsigned char)*p & 0x80)
4793                 break;
4794             *q++ = *p++;
4795         }
4796         return p - start;
4797     }
4798 #endif
4799 #endif
4800     while (p < end) {
4801         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4802            for an explanation. */
4803         if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4804             /* Help allocation */
4805             const char *_p = p;
4806             while (_p < aligned_end) {
4807                 unsigned long value = *(unsigned long *) _p;
4808                 if (value & ASCII_CHAR_MASK)
4809                     break;
4810                 _p += SIZEOF_LONG;
4811             }
4812             p = _p;
4813             if (_p == end)
4814                 break;
4815         }
4816         if ((unsigned char)*p & 0x80)
4817             break;
4818         ++p;
4819     }
4820     memcpy(dest, start, p - start);
4821     return p - start;
4822 }
4823 
4824 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4825 PyUnicode_DecodeUTF8Stateful(const char *s,
4826                              Py_ssize_t size,
4827                              const char *errors,
4828                              Py_ssize_t *consumed)
4829 {
4830     _PyUnicodeWriter writer;
4831     const char *starts = s;
4832     const char *end = s + size;
4833 
4834     Py_ssize_t startinpos;
4835     Py_ssize_t endinpos;
4836     const char *errmsg = "";
4837     PyObject *error_handler_obj = NULL;
4838     PyObject *exc = NULL;
4839     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
4840 
4841     if (size == 0) {
4842         if (consumed)
4843             *consumed = 0;
4844         _Py_RETURN_UNICODE_EMPTY();
4845     }
4846 
4847     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4848     if (size == 1 && (unsigned char)s[0] < 128) {
4849         if (consumed)
4850             *consumed = 1;
4851         return get_latin1_char((unsigned char)s[0]);
4852     }
4853 
4854     _PyUnicodeWriter_Init(&writer);
4855     writer.min_length = size;
4856     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4857         goto onError;
4858 
4859     writer.pos = ascii_decode(s, end, writer.data);
4860     s += writer.pos;
4861     while (s < end) {
4862         Py_UCS4 ch;
4863         int kind = writer.kind;
4864 
4865         if (kind == PyUnicode_1BYTE_KIND) {
4866             if (PyUnicode_IS_ASCII(writer.buffer))
4867                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4868             else
4869                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4870         } else if (kind == PyUnicode_2BYTE_KIND) {
4871             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4872         } else {
4873             assert(kind == PyUnicode_4BYTE_KIND);
4874             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4875         }
4876 
4877         switch (ch) {
4878         case 0:
4879             if (s == end || consumed)
4880                 goto End;
4881             errmsg = "unexpected end of data";
4882             startinpos = s - starts;
4883             endinpos = end - starts;
4884             break;
4885         case 1:
4886             errmsg = "invalid start byte";
4887             startinpos = s - starts;
4888             endinpos = startinpos + 1;
4889             break;
4890         case 2:
4891         case 3:
4892         case 4:
4893             errmsg = "invalid continuation byte";
4894             startinpos = s - starts;
4895             endinpos = startinpos + ch - 1;
4896             break;
4897         default:
4898             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4899                 goto onError;
4900             continue;
4901         }
4902 
4903         if (error_handler == _Py_ERROR_UNKNOWN)
4904             error_handler = get_error_handler(errors);
4905 
4906         switch (error_handler) {
4907         case _Py_ERROR_IGNORE:
4908             s += (endinpos - startinpos);
4909             break;
4910 
4911         case _Py_ERROR_REPLACE:
4912             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4913                 goto onError;
4914             s += (endinpos - startinpos);
4915             break;
4916 
4917         case _Py_ERROR_SURROGATEESCAPE:
4918         {
4919             Py_ssize_t i;
4920 
4921             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4922                 goto onError;
4923             for (i=startinpos; i<endinpos; i++) {
4924                 ch = (Py_UCS4)(unsigned char)(starts[i]);
4925                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4926                                 ch + 0xdc00);
4927                 writer.pos++;
4928             }
4929             s += (endinpos - startinpos);
4930             break;
4931         }
4932 
4933         default:
4934             if (unicode_decode_call_errorhandler_writer(
4935                     errors, &error_handler_obj,
4936                     "utf-8", errmsg,
4937                     &starts, &end, &startinpos, &endinpos, &exc, &s,
4938                     &writer))
4939                 goto onError;
4940         }
4941     }
4942 
4943 End:
4944     if (consumed)
4945         *consumed = s - starts;
4946 
4947     Py_XDECREF(error_handler_obj);
4948     Py_XDECREF(exc);
4949     return _PyUnicodeWriter_Finish(&writer);
4950 
4951 onError:
4952     Py_XDECREF(error_handler_obj);
4953     Py_XDECREF(exc);
4954     _PyUnicodeWriter_Dealloc(&writer);
4955     return NULL;
4956 }
4957 
4958 
4959 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4960    non-zero, use strict error handler otherwise.
4961 
4962    On success, write a pointer to a newly allocated wide character string into
4963    *wstr (use PyMem_RawFree() to free the memory) and write the output length
4964    (in number of wchar_t units) into *wlen (if wlen is set).
4965 
4966    On memory allocation failure, return -1.
4967 
4968    On decoding error (if surrogateescape is zero), return -2. If wlen is
4969    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4970    is not NULL, write the decoding error message into *reason. */
4971 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,int surrogateescape)4972 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4973                  const char **reason, int surrogateescape)
4974 {
4975     const char *orig_s = s;
4976     const char *e;
4977     wchar_t *unicode;
4978     Py_ssize_t outpos;
4979 
4980     /* Note: size will always be longer than the resulting Unicode
4981        character count */
4982     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
4983         return -1;
4984     }
4985 
4986     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4987     if (!unicode) {
4988         return -1;
4989     }
4990 
4991     /* Unpack UTF-8 encoded data */
4992     e = s + size;
4993     outpos = 0;
4994     while (s < e) {
4995         Py_UCS4 ch;
4996 #if SIZEOF_WCHAR_T == 4
4997         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4998 #else
4999         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5000 #endif
5001         if (ch > 0xFF) {
5002 #if SIZEOF_WCHAR_T == 4
5003             Py_UNREACHABLE();
5004 #else
5005             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5006             /* write a surrogate pair */
5007             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5008             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5009 #endif
5010         }
5011         else {
5012             if (!ch && s == e)
5013                 break;
5014             if (!surrogateescape) {
5015                 PyMem_RawFree(unicode );
5016                 if (reason != NULL) {
5017                     switch (ch) {
5018                     case 0:
5019                         *reason = "unexpected end of data";
5020                         break;
5021                     case 1:
5022                         *reason = "invalid start byte";
5023                         break;
5024                     /* 2, 3, 4 */
5025                     default:
5026                         *reason = "invalid continuation byte";
5027                         break;
5028                     }
5029                 }
5030                 if (wlen != NULL) {
5031                     *wlen = s - orig_s;
5032                 }
5033                 return -2;
5034             }
5035             /* surrogateescape */
5036             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5037         }
5038     }
5039     unicode[outpos] = L'\0';
5040     if (wlen) {
5041         *wlen = outpos;
5042     }
5043     *wstr = unicode;
5044     return 0;
5045 }
5046 
5047 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen)5048 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5049 {
5050     wchar_t *wstr;
5051     int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5052     if (res != 0) {
5053         return NULL;
5054     }
5055     return wstr;
5056 }
5057 
5058 
5059 /* UTF-8 encoder using the surrogateescape error handler .
5060 
5061    On success, return 0 and write the newly allocated character string (use
5062    PyMem_Free() to free the memory) into *str.
5063 
5064    On encoding failure, return -2 and write the position of the invalid
5065    surrogate character into *error_pos (if error_pos is set) and the decoding
5066    error message into *reason (if reason is set).
5067 
5068    On memory allocation failure, return -1. */
5069 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,int surrogateescape)5070 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5071                  const char **reason, int raw_malloc, int surrogateescape)
5072 {
5073     const Py_ssize_t max_char_size = 4;
5074     Py_ssize_t len = wcslen(text);
5075 
5076     assert(len >= 0);
5077 
5078     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5079         return -1;
5080     }
5081     char *bytes;
5082     if (raw_malloc) {
5083         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5084     }
5085     else {
5086         bytes = PyMem_Malloc((len + 1) * max_char_size);
5087     }
5088     if (bytes == NULL) {
5089         return -1;
5090     }
5091 
5092     char *p = bytes;
5093     Py_ssize_t i;
5094     for (i = 0; i < len; i++) {
5095         Py_UCS4 ch = text[i];
5096 
5097         if (ch < 0x80) {
5098             /* Encode ASCII */
5099             *p++ = (char) ch;
5100 
5101         }
5102         else if (ch < 0x0800) {
5103             /* Encode Latin-1 */
5104             *p++ = (char)(0xc0 | (ch >> 6));
5105             *p++ = (char)(0x80 | (ch & 0x3f));
5106         }
5107         else if (Py_UNICODE_IS_SURROGATE(ch)) {
5108             /* surrogateescape error handler */
5109             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5110                 if (error_pos != NULL) {
5111                     *error_pos = (size_t)i;
5112                 }
5113                 if (reason != NULL) {
5114                     *reason = "encoding error";
5115                 }
5116                 if (raw_malloc) {
5117                     PyMem_RawFree(bytes);
5118                 }
5119                 else {
5120                     PyMem_Free(bytes);
5121                 }
5122                 return -2;
5123             }
5124             *p++ = (char)(ch & 0xff);
5125         }
5126         else if (ch < 0x10000) {
5127             *p++ = (char)(0xe0 | (ch >> 12));
5128             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5129             *p++ = (char)(0x80 | (ch & 0x3f));
5130         }
5131         else {  /* ch >= 0x10000 */
5132             assert(ch <= MAX_UNICODE);
5133             /* Encode UCS4 Unicode ordinals */
5134             *p++ = (char)(0xf0 | (ch >> 18));
5135             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5136             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5137             *p++ = (char)(0x80 | (ch & 0x3f));
5138         }
5139     }
5140     *p++ = '\0';
5141 
5142     size_t final_size = (p - bytes);
5143     char *bytes2;
5144     if (raw_malloc) {
5145         bytes2 = PyMem_RawRealloc(bytes, final_size);
5146     }
5147     else {
5148         bytes2 = PyMem_Realloc(bytes, final_size);
5149     }
5150     if (bytes2 == NULL) {
5151         if (error_pos != NULL) {
5152             *error_pos = (size_t)-1;
5153         }
5154         if (raw_malloc) {
5155             PyMem_RawFree(bytes);
5156         }
5157         else {
5158             PyMem_Free(bytes);
5159         }
5160         return -1;
5161     }
5162     *str = bytes2;
5163     return 0;
5164 }
5165 
5166 
5167 /* Primary internal function which creates utf8 encoded bytes objects.
5168 
5169    Allocation strategy:  if the string is short, convert into a stack buffer
5170    and allocate exactly as much space needed at the end.  Else allocate the
5171    maximum possible needed (4 result bytes per Unicode character), and return
5172    the excess memory at the end.
5173 */
5174 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5175 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5176 {
5177     enum PyUnicode_Kind kind;
5178     void *data;
5179     Py_ssize_t size;
5180 
5181     if (!PyUnicode_Check(unicode)) {
5182         PyErr_BadArgument();
5183         return NULL;
5184     }
5185 
5186     if (PyUnicode_READY(unicode) == -1)
5187         return NULL;
5188 
5189     if (PyUnicode_UTF8(unicode))
5190         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5191                                          PyUnicode_UTF8_LENGTH(unicode));
5192 
5193     kind = PyUnicode_KIND(unicode);
5194     data = PyUnicode_DATA(unicode);
5195     size = PyUnicode_GET_LENGTH(unicode);
5196 
5197     switch (kind) {
5198     default:
5199         Py_UNREACHABLE();
5200     case PyUnicode_1BYTE_KIND:
5201         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5202         assert(!PyUnicode_IS_ASCII(unicode));
5203         return ucs1lib_utf8_encoder(unicode, data, size, errors);
5204     case PyUnicode_2BYTE_KIND:
5205         return ucs2lib_utf8_encoder(unicode, data, size, errors);
5206     case PyUnicode_4BYTE_KIND:
5207         return ucs4lib_utf8_encoder(unicode, data, size, errors);
5208     }
5209 }
5210 
5211 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5212 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5213                      Py_ssize_t size,
5214                      const char *errors)
5215 {
5216     PyObject *v, *unicode;
5217 
5218     unicode = PyUnicode_FromWideChar(s, size);
5219     if (unicode == NULL)
5220         return NULL;
5221     v = _PyUnicode_AsUTF8String(unicode, errors);
5222     Py_DECREF(unicode);
5223     return v;
5224 }
5225 
5226 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5227 PyUnicode_AsUTF8String(PyObject *unicode)
5228 {
5229     return _PyUnicode_AsUTF8String(unicode, NULL);
5230 }
5231 
5232 /* --- UTF-32 Codec ------------------------------------------------------- */
5233 
5234 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5235 PyUnicode_DecodeUTF32(const char *s,
5236                       Py_ssize_t size,
5237                       const char *errors,
5238                       int *byteorder)
5239 {
5240     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5241 }
5242 
/* Decode the `size` bytes starting at `s` as UTF-32.

   errors:    forwarded to the decode error handler.
   byteorder: optional in/out flag.  A value < 0 selects little-endian and
              a value > 0 big-endian.  0 (also used when the pointer is
              NULL) means "look for a BOM in the first four bytes, else use
              native order"; in that case, if at least four bytes are
              available, the resulting value (-1, 1 or still 0) is stored
              back through the pointer.
   consumed:  if non-NULL, leftover bytes that do not form a complete
              4-byte unit are not reported as "truncated data"; decoding
              stops there and the number of bytes processed is stored.

   Returns the object produced by _PyUnicodeWriter_Finish(), the empty
   string when no input remains after the BOM, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e is one past the last input byte */
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* first four bytes, assembled as a little-endian value */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* nothing (left) to decode */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* le: non-zero when the input is to be read as little-endian;
       bo == 0 falls back to the platform's native order */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    /* codec name passed to the error handler */
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* one output character per 4 input bytes, rounded up */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        /* Fast path: write characters straight into the writer's buffer for
           as long as they fit its current maximum character and are not
           surrogates.  On an early break, `ch` holds the offending value
           and `q` still points at its first byte. */
        if (e - q >= 4) {
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Slow path: work out why the fast loop stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* the fast loop did not break early: fewer than 4 bytes remain */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the writer's current maximum character */
            if (ch < 0x110000) {
                /* valid code point: write it through the writer API, then
                   go back to the fast path */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        /* starts, e and q are passed by address so the call can update them */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5387 
5388 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5389 _PyUnicode_EncodeUTF32(PyObject *str,
5390                        const char *errors,
5391                        int byteorder)
5392 {
5393     enum PyUnicode_Kind kind;
5394     const void *data;
5395     Py_ssize_t len;
5396     PyObject *v;
5397     uint32_t *out;
5398 #if PY_LITTLE_ENDIAN
5399     int native_ordering = byteorder <= 0;
5400 #else
5401     int native_ordering = byteorder >= 0;
5402 #endif
5403     const char *encoding;
5404     Py_ssize_t nsize, pos;
5405     PyObject *errorHandler = NULL;
5406     PyObject *exc = NULL;
5407     PyObject *rep = NULL;
5408 
5409     if (!PyUnicode_Check(str)) {
5410         PyErr_BadArgument();
5411         return NULL;
5412     }
5413     if (PyUnicode_READY(str) == -1)
5414         return NULL;
5415     kind = PyUnicode_KIND(str);
5416     data = PyUnicode_DATA(str);
5417     len = PyUnicode_GET_LENGTH(str);
5418 
5419     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5420         return PyErr_NoMemory();
5421     nsize = len + (byteorder == 0);
5422     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5423     if (v == NULL)
5424         return NULL;
5425 
5426     /* output buffer is 4-bytes aligned */
5427     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5428     out = (uint32_t *)PyBytes_AS_STRING(v);
5429     if (byteorder == 0)
5430         *out++ = 0xFEFF;
5431     if (len == 0)
5432         goto done;
5433 
5434     if (byteorder == -1)
5435         encoding = "utf-32-le";
5436     else if (byteorder == 1)
5437         encoding = "utf-32-be";
5438     else
5439         encoding = "utf-32";
5440 
5441     if (kind == PyUnicode_1BYTE_KIND) {
5442         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5443         goto done;
5444     }
5445 
5446     pos = 0;
5447     while (pos < len) {
5448         Py_ssize_t repsize, moreunits;
5449 
5450         if (kind == PyUnicode_2BYTE_KIND) {
5451             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5452                                         &out, native_ordering);
5453         }
5454         else {
5455             assert(kind == PyUnicode_4BYTE_KIND);
5456             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5457                                         &out, native_ordering);
5458         }
5459         if (pos == len)
5460             break;
5461 
5462         rep = unicode_encode_call_errorhandler(
5463                 errors, &errorHandler,
5464                 encoding, "surrogates not allowed",
5465                 str, &exc, pos, pos + 1, &pos);
5466         if (!rep)
5467             goto error;
5468 
5469         if (PyBytes_Check(rep)) {
5470             repsize = PyBytes_GET_SIZE(rep);
5471             if (repsize & 3) {
5472                 raise_encode_exception(&exc, encoding,
5473                                        str, pos - 1, pos,
5474                                        "surrogates not allowed");
5475                 goto error;
5476             }
5477             moreunits = repsize / 4;
5478         }
5479         else {
5480             assert(PyUnicode_Check(rep));
5481             if (PyUnicode_READY(rep) < 0)
5482                 goto error;
5483             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5484             if (!PyUnicode_IS_ASCII(rep)) {
5485                 raise_encode_exception(&exc, encoding,
5486                                        str, pos - 1, pos,
5487                                        "surrogates not allowed");
5488                 goto error;
5489             }
5490         }
5491 
5492         /* four bytes are reserved for each surrogate */
5493         if (moreunits > 1) {
5494             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5495             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5496                 /* integer overflow */
5497                 PyErr_NoMemory();
5498                 goto error;
5499             }
5500             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5501                 goto error;
5502             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5503         }
5504 
5505         if (PyBytes_Check(rep)) {
5506             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5507             out += moreunits;
5508         } else /* rep is unicode */ {
5509             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5510             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5511                                  &out, native_ordering);
5512         }
5513 
5514         Py_CLEAR(rep);
5515     }
5516 
5517     /* Cut back to size actually needed. This is necessary for, for example,
5518        encoding of a string containing isolated surrogates and the 'ignore'
5519        handler is used. */
5520     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5521     if (nsize != PyBytes_GET_SIZE(v))
5522       _PyBytes_Resize(&v, nsize);
5523     Py_XDECREF(errorHandler);
5524     Py_XDECREF(exc);
5525   done:
5526     return v;
5527   error:
5528     Py_XDECREF(rep);
5529     Py_XDECREF(errorHandler);
5530     Py_XDECREF(exc);
5531     Py_XDECREF(v);
5532     return NULL;
5533 }
5534 
5535 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5536 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5537                       Py_ssize_t size,
5538                       const char *errors,
5539                       int byteorder)
5540 {
5541     PyObject *result;
5542     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5543     if (tmp == NULL)
5544         return NULL;
5545     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5546     Py_DECREF(tmp);
5547     return result;
5548 }
5549 
5550 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5551 PyUnicode_AsUTF32String(PyObject *unicode)
5552 {
5553     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5554 }
5555 
5556 /* --- UTF-16 Codec ------------------------------------------------------- */
5557 
5558 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5559 PyUnicode_DecodeUTF16(const char *s,
5560                       Py_ssize_t size,
5561                       const char *errors,
5562                       int *byteorder)
5563 {
5564     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5565 }
5566 
/* Decode the `size` bytes starting at `s` as UTF-16.

   errors:    forwarded to the decode error handler.
   byteorder: optional in/out flag.  A value < 0 selects little-endian and
              a value > 0 big-endian.  0 (also used when the pointer is
              NULL) means "look for a BOM in the first two bytes, else use
              native order"; in that case, if at least two bytes are
              available, the resulting value (-1, 1 or still 0) is stored
              back through the pointer.
   consumed:  if non-NULL, an incomplete sequence at the end of the input
              is not reported as an error; decoding stops there and the
              number of bytes processed is stored.

   Returns the object produced by _PyUnicodeWriter_Finish(), the empty
   string when no input remains after the BOM, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    /* q is the read cursor, e is one past the last input byte */
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* first two bytes, assembled as a little-endian value */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* nothing (left) to decode */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* native_ordering: non-zero when the selected byte order matches the
       platform's; encoding: codec name passed to the error handler */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    /* one output character per 2 input bytes, rounded up */
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Run the decode helper that matches the writer's current storage
           kind.  It advances q and writer.pos; its return value is either a
           small status code (0..3, handled by the switch below) or a
           character that it did not store itself. */
        if (e - q >= 2) {
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* input ended in the middle of a sequence: step back over the
               2 bytes already read so they count as unconsumed */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the offending 2-byte unit is the one just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the offending 2-byte unit starts 4 bytes before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* a character the helper did not store: write it through the
               writer API, then resume decoding */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* starts, e and q are passed by address so the call can update them */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5721 
5722 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5723 _PyUnicode_EncodeUTF16(PyObject *str,
5724                        const char *errors,
5725                        int byteorder)
5726 {
5727     enum PyUnicode_Kind kind;
5728     const void *data;
5729     Py_ssize_t len;
5730     PyObject *v;
5731     unsigned short *out;
5732     Py_ssize_t pairs;
5733 #if PY_BIG_ENDIAN
5734     int native_ordering = byteorder >= 0;
5735 #else
5736     int native_ordering = byteorder <= 0;
5737 #endif
5738     const char *encoding;
5739     Py_ssize_t nsize, pos;
5740     PyObject *errorHandler = NULL;
5741     PyObject *exc = NULL;
5742     PyObject *rep = NULL;
5743 
5744     if (!PyUnicode_Check(str)) {
5745         PyErr_BadArgument();
5746         return NULL;
5747     }
5748     if (PyUnicode_READY(str) == -1)
5749         return NULL;
5750     kind = PyUnicode_KIND(str);
5751     data = PyUnicode_DATA(str);
5752     len = PyUnicode_GET_LENGTH(str);
5753 
5754     pairs = 0;
5755     if (kind == PyUnicode_4BYTE_KIND) {
5756         const Py_UCS4 *in = (const Py_UCS4 *)data;
5757         const Py_UCS4 *end = in + len;
5758         while (in < end) {
5759             if (*in++ >= 0x10000) {
5760                 pairs++;
5761             }
5762         }
5763     }
5764     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5765         return PyErr_NoMemory();
5766     }
5767     nsize = len + pairs + (byteorder == 0);
5768     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5769     if (v == NULL) {
5770         return NULL;
5771     }
5772 
5773     /* output buffer is 2-bytes aligned */
5774     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5775     out = (unsigned short *)PyBytes_AS_STRING(v);
5776     if (byteorder == 0) {
5777         *out++ = 0xFEFF;
5778     }
5779     if (len == 0) {
5780         goto done;
5781     }
5782 
5783     if (kind == PyUnicode_1BYTE_KIND) {
5784         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5785         goto done;
5786     }
5787 
5788     if (byteorder < 0) {
5789         encoding = "utf-16-le";
5790     }
5791     else if (byteorder > 0) {
5792         encoding = "utf-16-be";
5793     }
5794     else {
5795         encoding = "utf-16";
5796     }
5797 
5798     pos = 0;
5799     while (pos < len) {
5800         Py_ssize_t repsize, moreunits;
5801 
5802         if (kind == PyUnicode_2BYTE_KIND) {
5803             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5804                                         &out, native_ordering);
5805         }
5806         else {
5807             assert(kind == PyUnicode_4BYTE_KIND);
5808             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5809                                         &out, native_ordering);
5810         }
5811         if (pos == len)
5812             break;
5813 
5814         rep = unicode_encode_call_errorhandler(
5815                 errors, &errorHandler,
5816                 encoding, "surrogates not allowed",
5817                 str, &exc, pos, pos + 1, &pos);
5818         if (!rep)
5819             goto error;
5820 
5821         if (PyBytes_Check(rep)) {
5822             repsize = PyBytes_GET_SIZE(rep);
5823             if (repsize & 1) {
5824                 raise_encode_exception(&exc, encoding,
5825                                        str, pos - 1, pos,
5826                                        "surrogates not allowed");
5827                 goto error;
5828             }
5829             moreunits = repsize / 2;
5830         }
5831         else {
5832             assert(PyUnicode_Check(rep));
5833             if (PyUnicode_READY(rep) < 0)
5834                 goto error;
5835             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5836             if (!PyUnicode_IS_ASCII(rep)) {
5837                 raise_encode_exception(&exc, encoding,
5838                                        str, pos - 1, pos,
5839                                        "surrogates not allowed");
5840                 goto error;
5841             }
5842         }
5843 
5844         /* two bytes are reserved for each surrogate */
5845         if (moreunits > 1) {
5846             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5847             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5848                 /* integer overflow */
5849                 PyErr_NoMemory();
5850                 goto error;
5851             }
5852             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
5853                 goto error;
5854             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5855         }
5856 
5857         if (PyBytes_Check(rep)) {
5858             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5859             out += moreunits;
5860         } else /* rep is unicode */ {
5861             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5862             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5863                                  &out, native_ordering);
5864         }
5865 
5866         Py_CLEAR(rep);
5867     }
5868 
5869     /* Cut back to size actually needed. This is necessary for, for example,
5870     encoding of a string containing isolated surrogates and the 'ignore' handler
5871     is used. */
5872     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5873     if (nsize != PyBytes_GET_SIZE(v))
5874       _PyBytes_Resize(&v, nsize);
5875     Py_XDECREF(errorHandler);
5876     Py_XDECREF(exc);
5877   done:
5878     return v;
5879   error:
5880     Py_XDECREF(rep);
5881     Py_XDECREF(errorHandler);
5882     Py_XDECREF(exc);
5883     Py_XDECREF(v);
5884     return NULL;
5885 #undef STORECHAR
5886 }
5887 
5888 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5889 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5890                       Py_ssize_t size,
5891                       const char *errors,
5892                       int byteorder)
5893 {
5894     PyObject *result;
5895     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5896     if (tmp == NULL)
5897         return NULL;
5898     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5899     Py_DECREF(tmp);
5900     return result;
5901 }
5902 
5903 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5904 PyUnicode_AsUTF16String(PyObject *unicode)
5905 {
5906     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5907 }
5908 
5909 /* --- Unicode Escape Codec ----------------------------------------------- */
5910 
/* Character-name lookup API obtained from the unicodedata capsule.  Starts
   out NULL and is filled in with PyCapsule_Import() the first time the
   unicode-escape decoder meets a \N{...} escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5912 
/* Decode `size` bytes of unicode-escape encoded data at `s`.

   errors:               forwarded to the decode error handler.
   first_invalid_escape: output parameter; must not be NULL (it is written
                         unconditionally).  Set to NULL on entry and, if an
                         unrecognized backslash escape is met, to the address
                         of the character following the first such backslash.
                         Unrecognized escapes are not errors here: backslash
                         and character are copied to the output unchanged.

   Returns the object produced by _PyUnicodeWriter_Finish(), the empty string
   for empty input, or NULL with an exception set on error. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly into the writer's buffer; relies on the
   space reserved up front (checked by the asserts only). */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it fits the writer's current maximum
   character, otherwise through _PyUnicodeWriter_WriteCharInline() (jumping
   to onError if that fails). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* offset of the backslash, used if an error is reported */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* single-character escapes */
        /* backslash followed by a newline produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        /* one to three octal digits; the first one is already in c */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* accumulate exactly `count` hex digits into ch; stop early at
               the end of input or at the first non-hex character */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            /* fewer digits than required: report the "truncated" message */
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    /* not routed through the error handler: fail outright */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* need a non-empty name and a closing brace */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* unrecognized escape: remember the first one for the caller
               and copy backslash + character through unchanged */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* the reported range runs from the backslash to the current cursor */
        endinpos = s-starts;
        /* remaining input plus what has been written so far */
        writer.min_length = end - s + writer.pos;
        /* starts, end and s are passed by address so the call can update
           them */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6137 
6138 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6139 PyUnicode_DecodeUnicodeEscape(const char *s,
6140                               Py_ssize_t size,
6141                               const char *errors)
6142 {
6143     const char *first_invalid_escape;
6144     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6145                                                       &first_invalid_escape);
6146     if (result == NULL)
6147         return NULL;
6148     if (first_invalid_escape != NULL) {
6149         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6150                              "invalid escape sequence '\\%c'",
6151                              (unsigned char)*first_invalid_escape) < 0) {
6152             Py_DECREF(result);
6153             return NULL;
6154         }
6155     }
6156     return result;
6157 }
6158 
6159 /* Return a Unicode-Escape string version of the Unicode object. */
6160 
6161 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6162 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6163 {
6164     Py_ssize_t i, len;
6165     PyObject *repr;
6166     char *p;
6167     enum PyUnicode_Kind kind;
6168     void *data;
6169     Py_ssize_t expandsize;
6170 
6171     /* Initial allocation is based on the longest-possible character
6172        escape.
6173 
6174        For UCS1 strings it's '\xxx', 4 bytes per source character.
6175        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6176        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6177     */
6178 
6179     if (!PyUnicode_Check(unicode)) {
6180         PyErr_BadArgument();
6181         return NULL;
6182     }
6183     if (PyUnicode_READY(unicode) == -1) {
6184         return NULL;
6185     }
6186 
6187     len = PyUnicode_GET_LENGTH(unicode);
6188     if (len == 0) {
6189         return PyBytes_FromStringAndSize(NULL, 0);
6190     }
6191 
6192     kind = PyUnicode_KIND(unicode);
6193     data = PyUnicode_DATA(unicode);
6194     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6195        bytes, and 1 byte characters 4. */
6196     expandsize = kind * 2 + 2;
6197     if (len > PY_SSIZE_T_MAX / expandsize) {
6198         return PyErr_NoMemory();
6199     }
6200     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6201     if (repr == NULL) {
6202         return NULL;
6203     }
6204 
6205     p = PyBytes_AS_STRING(repr);
6206     for (i = 0; i < len; i++) {
6207         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6208 
6209         /* U+0000-U+00ff range */
6210         if (ch < 0x100) {
6211             if (ch >= ' ' && ch < 127) {
6212                 if (ch != '\\') {
6213                     /* Copy printable US ASCII as-is */
6214                     *p++ = (char) ch;
6215                 }
6216                 /* Escape backslashes */
6217                 else {
6218                     *p++ = '\\';
6219                     *p++ = '\\';
6220                 }
6221             }
6222 
6223             /* Map special whitespace to '\t', \n', '\r' */
6224             else if (ch == '\t') {
6225                 *p++ = '\\';
6226                 *p++ = 't';
6227             }
6228             else if (ch == '\n') {
6229                 *p++ = '\\';
6230                 *p++ = 'n';
6231             }
6232             else if (ch == '\r') {
6233                 *p++ = '\\';
6234                 *p++ = 'r';
6235             }
6236 
6237             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6238             else {
6239                 *p++ = '\\';
6240                 *p++ = 'x';
6241                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6242                 *p++ = Py_hexdigits[ch & 0x000F];
6243             }
6244         }
6245         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6246         else if (ch < 0x10000) {
6247             *p++ = '\\';
6248             *p++ = 'u';
6249             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6250             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6251             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6252             *p++ = Py_hexdigits[ch & 0x000F];
6253         }
6254         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6255         else {
6256 
6257             /* Make sure that the first two digits are zero */
6258             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6259             *p++ = '\\';
6260             *p++ = 'U';
6261             *p++ = '0';
6262             *p++ = '0';
6263             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6264             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6265             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6266             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6267             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6268             *p++ = Py_hexdigits[ch & 0x0000000F];
6269         }
6270     }
6271 
6272     assert(p - PyBytes_AS_STRING(repr) > 0);
6273     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6274         return NULL;
6275     }
6276     return repr;
6277 }
6278 
6279 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6280 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6281                               Py_ssize_t size)
6282 {
6283     PyObject *result;
6284     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6285     if (tmp == NULL) {
6286         return NULL;
6287     }
6288 
6289     result = PyUnicode_AsUnicodeEscapeString(tmp);
6290     Py_DECREF(tmp);
6291     return result;
6292 }
6293 
6294 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6295 
6296 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6297 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6298                                  Py_ssize_t size,
6299                                  const char *errors)
6300 {
6301     const char *starts = s;
6302     _PyUnicodeWriter writer;
6303     const char *end;
6304     PyObject *errorHandler = NULL;
6305     PyObject *exc = NULL;
6306 
6307     if (size == 0) {
6308         _Py_RETURN_UNICODE_EMPTY();
6309     }
6310 
6311     /* Escaped strings will always be longer than the resulting
6312        Unicode string, so we start with size here and then reduce the
6313        length after conversion to the true value. (But decoding error
6314        handler might have to resize the string) */
6315     _PyUnicodeWriter_Init(&writer);
6316      writer.min_length = size;
6317     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6318         goto onError;
6319     }
6320 
6321     end = s + size;
6322     while (s < end) {
6323         unsigned char c = (unsigned char) *s++;
6324         Py_UCS4 ch;
6325         int count;
6326         Py_ssize_t startinpos;
6327         Py_ssize_t endinpos;
6328         const char *message;
6329 
6330 #define WRITE_CHAR(ch)                                                        \
6331             do {                                                              \
6332                 if (ch <= writer.maxchar) {                                   \
6333                     assert(writer.pos < writer.size);                         \
6334                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6335                 }                                                             \
6336                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6337                     goto onError;                                             \
6338                 }                                                             \
6339             } while(0)
6340 
6341         /* Non-escape characters are interpreted as Unicode ordinals */
6342         if (c != '\\' || s >= end) {
6343             WRITE_CHAR(c);
6344             continue;
6345         }
6346 
6347         c = (unsigned char) *s++;
6348         if (c == 'u') {
6349             count = 4;
6350             message = "truncated \\uXXXX escape";
6351         }
6352         else if (c == 'U') {
6353             count = 8;
6354             message = "truncated \\UXXXXXXXX escape";
6355         }
6356         else {
6357             assert(writer.pos < writer.size);
6358             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6359             WRITE_CHAR(c);
6360             continue;
6361         }
6362         startinpos = s - starts - 2;
6363 
6364         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6365         for (ch = 0; count && s < end; ++s, --count) {
6366             c = (unsigned char)*s;
6367             ch <<= 4;
6368             if (c >= '0' && c <= '9') {
6369                 ch += c - '0';
6370             }
6371             else if (c >= 'a' && c <= 'f') {
6372                 ch += c - ('a' - 10);
6373             }
6374             else if (c >= 'A' && c <= 'F') {
6375                 ch += c - ('A' - 10);
6376             }
6377             else {
6378                 break;
6379             }
6380         }
6381         if (!count) {
6382             if (ch <= MAX_UNICODE) {
6383                 WRITE_CHAR(ch);
6384                 continue;
6385             }
6386             message = "\\Uxxxxxxxx out of range";
6387         }
6388 
6389         endinpos = s-starts;
6390         writer.min_length = end - s + writer.pos;
6391         if (unicode_decode_call_errorhandler_writer(
6392                 errors, &errorHandler,
6393                 "rawunicodeescape", message,
6394                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6395                 &writer)) {
6396             goto onError;
6397         }
6398         assert(end - s <= writer.size - writer.pos);
6399 
6400 #undef WRITE_CHAR
6401     }
6402     Py_XDECREF(errorHandler);
6403     Py_XDECREF(exc);
6404     return _PyUnicodeWriter_Finish(&writer);
6405 
6406   onError:
6407     _PyUnicodeWriter_Dealloc(&writer);
6408     Py_XDECREF(errorHandler);
6409     Py_XDECREF(exc);
6410     return NULL;
6411 
6412 }
6413 
6414 
6415 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6416 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6417 {
6418     PyObject *repr;
6419     char *p;
6420     Py_ssize_t expandsize, pos;
6421     int kind;
6422     void *data;
6423     Py_ssize_t len;
6424 
6425     if (!PyUnicode_Check(unicode)) {
6426         PyErr_BadArgument();
6427         return NULL;
6428     }
6429     if (PyUnicode_READY(unicode) == -1) {
6430         return NULL;
6431     }
6432     kind = PyUnicode_KIND(unicode);
6433     data = PyUnicode_DATA(unicode);
6434     len = PyUnicode_GET_LENGTH(unicode);
6435     if (kind == PyUnicode_1BYTE_KIND) {
6436         return PyBytes_FromStringAndSize(data, len);
6437     }
6438 
6439     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6440        bytes, and 1 byte characters 4. */
6441     expandsize = kind * 2 + 2;
6442 
6443     if (len > PY_SSIZE_T_MAX / expandsize) {
6444         return PyErr_NoMemory();
6445     }
6446     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6447     if (repr == NULL) {
6448         return NULL;
6449     }
6450     if (len == 0) {
6451         return repr;
6452     }
6453 
6454     p = PyBytes_AS_STRING(repr);
6455     for (pos = 0; pos < len; pos++) {
6456         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6457 
6458         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6459         if (ch < 0x100) {
6460             *p++ = (char) ch;
6461         }
6462         /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6463         else if (ch < 0x10000) {
6464             *p++ = '\\';
6465             *p++ = 'u';
6466             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6467             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6468             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6469             *p++ = Py_hexdigits[ch & 15];
6470         }
6471         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6472         else {
6473             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6474             *p++ = '\\';
6475             *p++ = 'U';
6476             *p++ = '0';
6477             *p++ = '0';
6478             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6479             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6480             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6481             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6482             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6483             *p++ = Py_hexdigits[ch & 15];
6484         }
6485     }
6486 
6487     assert(p > PyBytes_AS_STRING(repr));
6488     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6489         return NULL;
6490     }
6491     return repr;
6492 }
6493 
6494 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6495 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6496                                  Py_ssize_t size)
6497 {
6498     PyObject *result;
6499     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6500     if (tmp == NULL)
6501         return NULL;
6502     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6503     Py_DECREF(tmp);
6504     return result;
6505 }
6506 
6507 /* --- Unicode Internal Codec ------------------------------------------- */
6508 
6509 PyObject *
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)6510 _PyUnicode_DecodeUnicodeInternal(const char *s,
6511                                  Py_ssize_t size,
6512                                  const char *errors)
6513 {
6514     const char *starts = s;
6515     Py_ssize_t startinpos;
6516     Py_ssize_t endinpos;
6517     _PyUnicodeWriter writer;
6518     const char *end;
6519     const char *reason;
6520     PyObject *errorHandler = NULL;
6521     PyObject *exc = NULL;
6522 
6523     if (PyErr_WarnEx(PyExc_DeprecationWarning,
6524                      "unicode_internal codec has been deprecated",
6525                      1))
6526         return NULL;
6527 
6528     if (size < 0) {
6529         PyErr_BadInternalCall();
6530         return NULL;
6531     }
6532     if (size == 0)
6533         _Py_RETURN_UNICODE_EMPTY();
6534 
6535     _PyUnicodeWriter_Init(&writer);
6536     if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6537         PyErr_NoMemory();
6538         goto onError;
6539     }
6540     writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6541 
6542     end = s + size;
6543     while (s < end) {
6544         Py_UNICODE uch;
6545         Py_UCS4 ch;
6546         if (end - s < Py_UNICODE_SIZE) {
6547             endinpos = end-starts;
6548             reason = "truncated input";
6549             goto error;
6550         }
6551         /* We copy the raw representation one byte at a time because the
6552            pointer may be unaligned (see test_codeccallbacks). */
6553         ((char *) &uch)[0] = s[0];
6554         ((char *) &uch)[1] = s[1];
6555 #ifdef Py_UNICODE_WIDE
6556         ((char *) &uch)[2] = s[2];
6557         ((char *) &uch)[3] = s[3];
6558 #endif
6559         ch = uch;
6560 #ifdef Py_UNICODE_WIDE
6561         /* We have to sanity check the raw data, otherwise doom looms for
6562            some malformed UCS-4 data. */
6563         if (ch > 0x10ffff) {
6564             endinpos = s - starts + Py_UNICODE_SIZE;
6565             reason = "illegal code point (> 0x10FFFF)";
6566             goto error;
6567         }
6568 #endif
6569         s += Py_UNICODE_SIZE;
6570 #ifndef Py_UNICODE_WIDE
6571         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6572         {
6573             Py_UNICODE uch2;
6574             ((char *) &uch2)[0] = s[0];
6575             ((char *) &uch2)[1] = s[1];
6576             if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6577             {
6578                 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6579                 s += Py_UNICODE_SIZE;
6580             }
6581         }
6582 #endif
6583 
6584         if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6585             goto onError;
6586         continue;
6587 
6588   error:
6589         startinpos = s - starts;
6590         if (unicode_decode_call_errorhandler_writer(
6591                 errors, &errorHandler,
6592                 "unicode_internal", reason,
6593                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6594                 &writer))
6595             goto onError;
6596     }
6597 
6598     Py_XDECREF(errorHandler);
6599     Py_XDECREF(exc);
6600     return _PyUnicodeWriter_Finish(&writer);
6601 
6602   onError:
6603     _PyUnicodeWriter_Dealloc(&writer);
6604     Py_XDECREF(errorHandler);
6605     Py_XDECREF(exc);
6606     return NULL;
6607 }
6608 
6609 /* --- Latin-1 Codec ------------------------------------------------------ */
6610 
6611 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6612 PyUnicode_DecodeLatin1(const char *s,
6613                        Py_ssize_t size,
6614                        const char *errors)
6615 {
6616     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6617     return _PyUnicode_FromUCS1((unsigned char*)s, size);
6618 }
6619 
6620 /* create or adjust a UnicodeEncodeError */
6621 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6622 make_encode_exception(PyObject **exceptionObject,
6623                       const char *encoding,
6624                       PyObject *unicode,
6625                       Py_ssize_t startpos, Py_ssize_t endpos,
6626                       const char *reason)
6627 {
6628     if (*exceptionObject == NULL) {
6629         *exceptionObject = PyObject_CallFunction(
6630             PyExc_UnicodeEncodeError, "sOnns",
6631             encoding, unicode, startpos, endpos, reason);
6632     }
6633     else {
6634         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6635             goto onError;
6636         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6637             goto onError;
6638         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6639             goto onError;
6640         return;
6641       onError:
6642         Py_CLEAR(*exceptionObject);
6643     }
6644 }
6645 
6646 /* raises a UnicodeEncodeError */
6647 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6648 raise_encode_exception(PyObject **exceptionObject,
6649                        const char *encoding,
6650                        PyObject *unicode,
6651                        Py_ssize_t startpos, Py_ssize_t endpos,
6652                        const char *reason)
6653 {
6654     make_encode_exception(exceptionObject,
6655                           encoding, unicode, startpos, endpos, reason);
6656     if (*exceptionObject != NULL)
6657         PyCodec_StrictErrors(*exceptionObject);
6658 }
6659 
6660 /* error handling callback helper:
6661    build arguments, call the callback and check the arguments,
6662    put the result into newpos and return the replacement string, which
6663    has to be freed by the caller */
6664 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6665 unicode_encode_call_errorhandler(const char *errors,
6666                                  PyObject **errorHandler,
6667                                  const char *encoding, const char *reason,
6668                                  PyObject *unicode, PyObject **exceptionObject,
6669                                  Py_ssize_t startpos, Py_ssize_t endpos,
6670                                  Py_ssize_t *newpos)
6671 {
6672     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6673     Py_ssize_t len;
6674     PyObject *restuple;
6675     PyObject *resunicode;
6676 
6677     if (*errorHandler == NULL) {
6678         *errorHandler = PyCodec_LookupError(errors);
6679         if (*errorHandler == NULL)
6680             return NULL;
6681     }
6682 
6683     if (PyUnicode_READY(unicode) == -1)
6684         return NULL;
6685     len = PyUnicode_GET_LENGTH(unicode);
6686 
6687     make_encode_exception(exceptionObject,
6688                           encoding, unicode, startpos, endpos, reason);
6689     if (*exceptionObject == NULL)
6690         return NULL;
6691 
6692     restuple = PyObject_CallFunctionObjArgs(
6693         *errorHandler, *exceptionObject, NULL);
6694     if (restuple == NULL)
6695         return NULL;
6696     if (!PyTuple_Check(restuple)) {
6697         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698         Py_DECREF(restuple);
6699         return NULL;
6700     }
6701     if (!PyArg_ParseTuple(restuple, argparse,
6702                           &resunicode, newpos)) {
6703         Py_DECREF(restuple);
6704         return NULL;
6705     }
6706     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6707         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6708         Py_DECREF(restuple);
6709         return NULL;
6710     }
6711     if (*newpos<0)
6712         *newpos = len + *newpos;
6713     if (*newpos<0 || *newpos>len) {
6714         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6715         Py_DECREF(restuple);
6716         return NULL;
6717     }
6718     Py_INCREF(resunicode);
6719     Py_DECREF(restuple);
6720     return resunicode;
6721 }
6722 
6723 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6724 unicode_encode_ucs1(PyObject *unicode,
6725                     const char *errors,
6726                     const Py_UCS4 limit)
6727 {
6728     /* input state */
6729     Py_ssize_t pos=0, size;
6730     int kind;
6731     void *data;
6732     /* pointer into the output */
6733     char *str;
6734     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6735     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6736     PyObject *error_handler_obj = NULL;
6737     PyObject *exc = NULL;
6738     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6739     PyObject *rep = NULL;
6740     /* output object */
6741     _PyBytesWriter writer;
6742 
6743     if (PyUnicode_READY(unicode) == -1)
6744         return NULL;
6745     size = PyUnicode_GET_LENGTH(unicode);
6746     kind = PyUnicode_KIND(unicode);
6747     data = PyUnicode_DATA(unicode);
6748     /* allocate enough for a simple encoding without
6749        replacements, if we need more, we'll resize */
6750     if (size == 0)
6751         return PyBytes_FromStringAndSize(NULL, 0);
6752 
6753     _PyBytesWriter_Init(&writer);
6754     str = _PyBytesWriter_Alloc(&writer, size);
6755     if (str == NULL)
6756         return NULL;
6757 
6758     while (pos < size) {
6759         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6760 
6761         /* can we encode this? */
6762         if (ch < limit) {
6763             /* no overflow check, because we know that the space is enough */
6764             *str++ = (char)ch;
6765             ++pos;
6766         }
6767         else {
6768             Py_ssize_t newpos, i;
6769             /* startpos for collecting unencodable chars */
6770             Py_ssize_t collstart = pos;
6771             Py_ssize_t collend = collstart + 1;
6772             /* find all unecodable characters */
6773 
6774             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6775                 ++collend;
6776 
6777             /* Only overallocate the buffer if it's not the last write */
6778             writer.overallocate = (collend < size);
6779 
6780             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6781             if (error_handler == _Py_ERROR_UNKNOWN)
6782                 error_handler = get_error_handler(errors);
6783 
6784             switch (error_handler) {
6785             case _Py_ERROR_STRICT:
6786                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6787                 goto onError;
6788 
6789             case _Py_ERROR_REPLACE:
6790                 memset(str, '?', collend - collstart);
6791                 str += (collend - collstart);
6792                 /* fall through */
6793             case _Py_ERROR_IGNORE:
6794                 pos = collend;
6795                 break;
6796 
6797             case _Py_ERROR_BACKSLASHREPLACE:
6798                 /* subtract preallocated bytes */
6799                 writer.min_size -= (collend - collstart);
6800                 str = backslashreplace(&writer, str,
6801                                        unicode, collstart, collend);
6802                 if (str == NULL)
6803                     goto onError;
6804                 pos = collend;
6805                 break;
6806 
6807             case _Py_ERROR_XMLCHARREFREPLACE:
6808                 /* subtract preallocated bytes */
6809                 writer.min_size -= (collend - collstart);
6810                 str = xmlcharrefreplace(&writer, str,
6811                                         unicode, collstart, collend);
6812                 if (str == NULL)
6813                     goto onError;
6814                 pos = collend;
6815                 break;
6816 
6817             case _Py_ERROR_SURROGATEESCAPE:
6818                 for (i = collstart; i < collend; ++i) {
6819                     ch = PyUnicode_READ(kind, data, i);
6820                     if (ch < 0xdc80 || 0xdcff < ch) {
6821                         /* Not a UTF-8b surrogate */
6822                         break;
6823                     }
6824                     *str++ = (char)(ch - 0xdc00);
6825                     ++pos;
6826                 }
6827                 if (i >= collend)
6828                     break;
6829                 collstart = pos;
6830                 assert(collstart != collend);
6831                 /* fall through */
6832 
6833             default:
6834                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6835                                                        encoding, reason, unicode, &exc,
6836                                                        collstart, collend, &newpos);
6837                 if (rep == NULL)
6838                     goto onError;
6839 
6840                 /* subtract preallocated bytes */
6841                 writer.min_size -= newpos - collstart;
6842 
6843                 if (PyBytes_Check(rep)) {
6844                     /* Directly copy bytes result to output. */
6845                     str = _PyBytesWriter_WriteBytes(&writer, str,
6846                                                     PyBytes_AS_STRING(rep),
6847                                                     PyBytes_GET_SIZE(rep));
6848                 }
6849                 else {
6850                     assert(PyUnicode_Check(rep));
6851 
6852                     if (PyUnicode_READY(rep) < 0)
6853                         goto onError;
6854 
6855                     if (limit == 256 ?
6856                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6857                         !PyUnicode_IS_ASCII(rep))
6858                     {
6859                         /* Not all characters are smaller than limit */
6860                         raise_encode_exception(&exc, encoding, unicode,
6861                                                collstart, collend, reason);
6862                         goto onError;
6863                     }
6864                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6865                     str = _PyBytesWriter_WriteBytes(&writer, str,
6866                                                     PyUnicode_DATA(rep),
6867                                                     PyUnicode_GET_LENGTH(rep));
6868                 }
6869                 if (str == NULL)
6870                     goto onError;
6871 
6872                 pos = newpos;
6873                 Py_CLEAR(rep);
6874             }
6875 
6876             /* If overallocation was disabled, ensure that it was the last
6877                write. Otherwise, we missed an optimization */
6878             assert(writer.overallocate || pos == size);
6879         }
6880     }
6881 
6882     Py_XDECREF(error_handler_obj);
6883     Py_XDECREF(exc);
6884     return _PyBytesWriter_Finish(&writer, str);
6885 
6886   onError:
6887     Py_XDECREF(rep);
6888     _PyBytesWriter_Dealloc(&writer);
6889     Py_XDECREF(error_handler_obj);
6890     Py_XDECREF(exc);
6891     return NULL;
6892 }
6893 
6894 /* Deprecated */
6895 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6896 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6897                        Py_ssize_t size,
6898                        const char *errors)
6899 {
6900     PyObject *result;
6901     PyObject *unicode = PyUnicode_FromWideChar(p, size);
6902     if (unicode == NULL)
6903         return NULL;
6904     result = unicode_encode_ucs1(unicode, errors, 256);
6905     Py_DECREF(unicode);
6906     return result;
6907 }
6908 
6909 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6910 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6911 {
6912     if (!PyUnicode_Check(unicode)) {
6913         PyErr_BadArgument();
6914         return NULL;
6915     }
6916     if (PyUnicode_READY(unicode) == -1)
6917         return NULL;
6918     /* Fast path: if it is a one-byte string, construct
6919        bytes object directly. */
6920     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6921         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6922                                          PyUnicode_GET_LENGTH(unicode));
6923     /* Non-Latin-1 characters present. Defer to above function to
6924        raise the exception. */
6925     return unicode_encode_ucs1(unicode, errors, 256);
6926 }
6927 
6928 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6929 PyUnicode_AsLatin1String(PyObject *unicode)
6930 {
6931     return _PyUnicode_AsLatin1String(unicode, NULL);
6932 }
6933 
6934 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6935 
6936 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6937 PyUnicode_DecodeASCII(const char *s,
6938                       Py_ssize_t size,
6939                       const char *errors)
6940 {
6941     const char *starts = s;
6942     _PyUnicodeWriter writer;
6943     int kind;
6944     void *data;
6945     Py_ssize_t startinpos;
6946     Py_ssize_t endinpos;
6947     Py_ssize_t outpos;
6948     const char *e;
6949     PyObject *error_handler_obj = NULL;
6950     PyObject *exc = NULL;
6951     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6952 
6953     if (size == 0)
6954         _Py_RETURN_UNICODE_EMPTY();
6955 
6956     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6957     if (size == 1 && (unsigned char)s[0] < 128)
6958         return get_latin1_char((unsigned char)s[0]);
6959 
6960     _PyUnicodeWriter_Init(&writer);
6961     writer.min_length = size;
6962     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6963         return NULL;
6964 
6965     e = s + size;
6966     data = writer.data;
6967     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6968     writer.pos = outpos;
6969     if (writer.pos == size)
6970         return _PyUnicodeWriter_Finish(&writer);
6971 
6972     s += writer.pos;
6973     kind = writer.kind;
6974     while (s < e) {
6975         unsigned char c = (unsigned char)*s;
6976         if (c < 128) {
6977             PyUnicode_WRITE(kind, data, writer.pos, c);
6978             writer.pos++;
6979             ++s;
6980             continue;
6981         }
6982 
6983         /* byte outsize range 0x00..0x7f: call the error handler */
6984 
6985         if (error_handler == _Py_ERROR_UNKNOWN)
6986             error_handler = get_error_handler(errors);
6987 
6988         switch (error_handler)
6989         {
6990         case _Py_ERROR_REPLACE:
6991         case _Py_ERROR_SURROGATEESCAPE:
6992             /* Fast-path: the error handler only writes one character,
6993                but we may switch to UCS2 at the first write */
6994             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6995                 goto onError;
6996             kind = writer.kind;
6997             data = writer.data;
6998 
6999             if (error_handler == _Py_ERROR_REPLACE)
7000                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7001             else
7002                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7003             writer.pos++;
7004             ++s;
7005             break;
7006 
7007         case _Py_ERROR_IGNORE:
7008             ++s;
7009             break;
7010 
7011         default:
7012             startinpos = s-starts;
7013             endinpos = startinpos + 1;
7014             if (unicode_decode_call_errorhandler_writer(
7015                     errors, &error_handler_obj,
7016                     "ascii", "ordinal not in range(128)",
7017                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7018                     &writer))
7019                 goto onError;
7020             kind = writer.kind;
7021             data = writer.data;
7022         }
7023     }
7024     Py_XDECREF(error_handler_obj);
7025     Py_XDECREF(exc);
7026     return _PyUnicodeWriter_Finish(&writer);
7027 
7028   onError:
7029     _PyUnicodeWriter_Dealloc(&writer);
7030     Py_XDECREF(error_handler_obj);
7031     Py_XDECREF(exc);
7032     return NULL;
7033 }
7034 
7035 /* Deprecated */
7036 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7037 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7038                       Py_ssize_t size,
7039                       const char *errors)
7040 {
7041     PyObject *result;
7042     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7043     if (unicode == NULL)
7044         return NULL;
7045     result = unicode_encode_ucs1(unicode, errors, 128);
7046     Py_DECREF(unicode);
7047     return result;
7048 }
7049 
7050 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7051 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7052 {
7053     if (!PyUnicode_Check(unicode)) {
7054         PyErr_BadArgument();
7055         return NULL;
7056     }
7057     if (PyUnicode_READY(unicode) == -1)
7058         return NULL;
7059     /* Fast path: if it is an ASCII-only string, construct bytes object
7060        directly. Else defer to above function to raise the exception. */
7061     if (PyUnicode_IS_ASCII(unicode))
7062         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7063                                          PyUnicode_GET_LENGTH(unicode));
7064     return unicode_encode_ucs1(unicode, errors, 128);
7065 }
7066 
7067 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7068 PyUnicode_AsASCIIString(PyObject *unicode)
7069 {
7070     return _PyUnicode_AsASCIIString(unicode, NULL);
7071 }
7072 
7073 #ifdef MS_WINDOWS
7074 
7075 /* --- MBCS codecs for Windows -------------------------------------------- */
7076 
7077 #if SIZEOF_INT < SIZEOF_SIZE_T
7078 #define NEED_RETRY
7079 #endif
7080 
7081 #ifndef WC_ERR_INVALID_CHARS
7082 #  define WC_ERR_INVALID_CHARS 0x0080
7083 #endif
7084 
7085 static const char*
code_page_name(UINT code_page,PyObject ** obj)7086 code_page_name(UINT code_page, PyObject **obj)
7087 {
7088     *obj = NULL;
7089     if (code_page == CP_ACP)
7090         return "mbcs";
7091     if (code_page == CP_UTF7)
7092         return "CP_UTF7";
7093     if (code_page == CP_UTF8)
7094         return "CP_UTF8";
7095 
7096     *obj = PyBytes_FromFormat("cp%u", code_page);
7097     if (*obj == NULL)
7098         return NULL;
7099     return PyBytes_AS_STRING(*obj);
7100 }
7101 
7102 static DWORD
decode_code_page_flags(UINT code_page)7103 decode_code_page_flags(UINT code_page)
7104 {
7105     if (code_page == CP_UTF7) {
7106         /* The CP_UTF7 decoder only supports flags=0 */
7107         return 0;
7108     }
7109     else
7110         return MB_ERR_INVALID_CHARS;
7111 }
7112 
7113 /*
7114  * Decode a byte string from a Windows code page into unicode object in strict
7115  * mode.
7116  *
7117  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7118  * OSError and returns -1 on other error.
7119  */
7120 static int
decode_code_page_strict(UINT code_page,PyObject ** v,const char * in,int insize)7121 decode_code_page_strict(UINT code_page,
7122                         PyObject **v,
7123                         const char *in,
7124                         int insize)
7125 {
7126     const DWORD flags = decode_code_page_flags(code_page);
7127     wchar_t *out;
7128     DWORD outsize;
7129 
7130     /* First get the size of the result */
7131     assert(insize > 0);
7132     outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7133     if (outsize <= 0)
7134         goto error;
7135 
7136     if (*v == NULL) {
7137         /* Create unicode object */
7138         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7139         *v = (PyObject*)_PyUnicode_New(outsize);
7140         if (*v == NULL)
7141             return -1;
7142         out = PyUnicode_AS_UNICODE(*v);
7143     }
7144     else {
7145         /* Extend unicode object */
7146         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7147         if (unicode_resize(v, n + outsize) < 0)
7148             return -1;
7149         out = PyUnicode_AS_UNICODE(*v) + n;
7150     }
7151 
7152     /* Do the conversion */
7153     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7154     if (outsize <= 0)
7155         goto error;
7156     return insize;
7157 
7158 error:
7159     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7160         return -2;
7161     PyErr_SetFromWindowsErr(0);
7162     return -1;
7163 }
7164 
7165 /*
7166  * Decode a byte string from a code page into unicode object with an error
7167  * handler.
7168  *
7169  * Returns consumed size if succeed, or raise an OSError or
7170  * UnicodeDecodeError exception and returns -1 on error.
7171  */
7172 static int
decode_code_page_errors(UINT code_page,PyObject ** v,const char * in,const int size,const char * errors,int final)7173 decode_code_page_errors(UINT code_page,
7174                         PyObject **v,
7175                         const char *in, const int size,
7176                         const char *errors, int final)
7177 {
7178     const char *startin = in;
7179     const char *endin = in + size;
7180     const DWORD flags = decode_code_page_flags(code_page);
7181     /* Ideally, we should get reason from FormatMessage. This is the Windows
7182        2000 English version of the message. */
7183     const char *reason = "No mapping for the Unicode character exists "
7184                          "in the target code page.";
7185     /* each step cannot decode more than 1 character, but a character can be
7186        represented as a surrogate pair */
7187     wchar_t buffer[2], *out;
7188     int insize;
7189     Py_ssize_t outsize;
7190     PyObject *errorHandler = NULL;
7191     PyObject *exc = NULL;
7192     PyObject *encoding_obj = NULL;
7193     const char *encoding;
7194     DWORD err;
7195     int ret = -1;
7196 
7197     assert(size > 0);
7198 
7199     encoding = code_page_name(code_page, &encoding_obj);
7200     if (encoding == NULL)
7201         return -1;
7202 
7203     if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7204         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7205            UnicodeDecodeError. */
7206         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7207         if (exc != NULL) {
7208             PyCodec_StrictErrors(exc);
7209             Py_CLEAR(exc);
7210         }
7211         goto error;
7212     }
7213 
7214     if (*v == NULL) {
7215         /* Create unicode object */
7216         if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7217             PyErr_NoMemory();
7218             goto error;
7219         }
7220         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7221         *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7222         if (*v == NULL)
7223             goto error;
7224         out = PyUnicode_AS_UNICODE(*v);
7225     }
7226     else {
7227         /* Extend unicode object */
7228         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7229         if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7230             PyErr_NoMemory();
7231             goto error;
7232         }
7233         if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7234             goto error;
7235         out = PyUnicode_AS_UNICODE(*v) + n;
7236     }
7237 
7238     /* Decode the byte string character per character */
7239     while (in < endin)
7240     {
7241         /* Decode a character */
7242         insize = 1;
7243         do
7244         {
7245             outsize = MultiByteToWideChar(code_page, flags,
7246                                           in, insize,
7247                                           buffer, Py_ARRAY_LENGTH(buffer));
7248             if (outsize > 0)
7249                 break;
7250             err = GetLastError();
7251             if (err != ERROR_NO_UNICODE_TRANSLATION
7252                 && err != ERROR_INSUFFICIENT_BUFFER)
7253             {
7254                 PyErr_SetFromWindowsErr(0);
7255                 goto error;
7256             }
7257             insize++;
7258         }
7259         /* 4=maximum length of a UTF-8 sequence */
7260         while (insize <= 4 && (in + insize) <= endin);
7261 
7262         if (outsize <= 0) {
7263             Py_ssize_t startinpos, endinpos, outpos;
7264 
7265             /* last character in partial decode? */
7266             if (in + insize >= endin && !final)
7267                 break;
7268 
7269             startinpos = in - startin;
7270             endinpos = startinpos + 1;
7271             outpos = out - PyUnicode_AS_UNICODE(*v);
7272             if (unicode_decode_call_errorhandler_wchar(
7273                     errors, &errorHandler,
7274                     encoding, reason,
7275                     &startin, &endin, &startinpos, &endinpos, &exc, &in,
7276                     v, &outpos))
7277             {
7278                 goto error;
7279             }
7280             out = PyUnicode_AS_UNICODE(*v) + outpos;
7281         }
7282         else {
7283             in += insize;
7284             memcpy(out, buffer, outsize * sizeof(wchar_t));
7285             out += outsize;
7286         }
7287     }
7288 
7289     /* write a NUL character at the end */
7290     *out = 0;
7291 
7292     /* Extend unicode object */
7293     outsize = out - PyUnicode_AS_UNICODE(*v);
7294     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7295     if (unicode_resize(v, outsize) < 0)
7296         goto error;
7297     /* (in - startin) <= size and size is an int */
7298     ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7299 
7300 error:
7301     Py_XDECREF(encoding_obj);
7302     Py_XDECREF(errorHandler);
7303     Py_XDECREF(exc);
7304     return ret;
7305 }
7306 
7307 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7308 decode_code_page_stateful(int code_page,
7309                           const char *s, Py_ssize_t size,
7310                           const char *errors, Py_ssize_t *consumed)
7311 {
7312     PyObject *v = NULL;
7313     int chunk_size, final, converted, done;
7314 
7315     if (code_page < 0) {
7316         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7317         return NULL;
7318     }
7319     if (size < 0) {
7320         PyErr_BadInternalCall();
7321         return NULL;
7322     }
7323 
7324     if (consumed)
7325         *consumed = 0;
7326 
7327     do
7328     {
7329 #ifdef NEED_RETRY
7330         if (size > INT_MAX) {
7331             chunk_size = INT_MAX;
7332             final = 0;
7333             done = 0;
7334         }
7335         else
7336 #endif
7337         {
7338             chunk_size = (int)size;
7339             final = (consumed == NULL);
7340             done = 1;
7341         }
7342 
7343         if (chunk_size == 0 && done) {
7344             if (v != NULL)
7345                 break;
7346             _Py_RETURN_UNICODE_EMPTY();
7347         }
7348 
7349         converted = decode_code_page_strict(code_page, &v,
7350                                             s, chunk_size);
7351         if (converted == -2)
7352             converted = decode_code_page_errors(code_page, &v,
7353                                                 s, chunk_size,
7354                                                 errors, final);
7355         assert(converted != 0 || done);
7356 
7357         if (converted < 0) {
7358             Py_XDECREF(v);
7359             return NULL;
7360         }
7361 
7362         if (consumed)
7363             *consumed += converted;
7364 
7365         s += converted;
7366         size -= converted;
7367     } while (!done);
7368 
7369     return unicode_result(v);
7370 }
7371 
7372 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7373 PyUnicode_DecodeCodePageStateful(int code_page,
7374                                  const char *s,
7375                                  Py_ssize_t size,
7376                                  const char *errors,
7377                                  Py_ssize_t *consumed)
7378 {
7379     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7380 }
7381 
7382 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7383 PyUnicode_DecodeMBCSStateful(const char *s,
7384                              Py_ssize_t size,
7385                              const char *errors,
7386                              Py_ssize_t *consumed)
7387 {
7388     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7389 }
7390 
7391 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7392 PyUnicode_DecodeMBCS(const char *s,
7393                      Py_ssize_t size,
7394                      const char *errors)
7395 {
7396     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7397 }
7398 
7399 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7400 encode_code_page_flags(UINT code_page, const char *errors)
7401 {
7402     if (code_page == CP_UTF8) {
7403         return WC_ERR_INVALID_CHARS;
7404     }
7405     else if (code_page == CP_UTF7) {
7406         /* CP_UTF7 only supports flags=0 */
7407         return 0;
7408     }
7409     else {
7410         if (errors != NULL && strcmp(errors, "replace") == 0)
7411             return 0;
7412         else
7413             return WC_NO_BEST_FIT_CHARS;
7414     }
7415 }
7416 
7417 /*
7418  * Encode a Unicode string to a Windows code page into a byte string in strict
7419  * mode.
7420  *
7421  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7422  * an OSError and returns -1 on other error.
7423  */
7424 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7425 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7426                         PyObject *unicode, Py_ssize_t offset, int len,
7427                         const char* errors)
7428 {
7429     BOOL usedDefaultChar = FALSE;
7430     BOOL *pusedDefaultChar = &usedDefaultChar;
7431     int outsize;
7432     wchar_t *p;
7433     Py_ssize_t size;
7434     const DWORD flags = encode_code_page_flags(code_page, NULL);
7435     char *out;
7436     /* Create a substring so that we can get the UTF-16 representation
7437        of just the slice under consideration. */
7438     PyObject *substring;
7439 
7440     assert(len > 0);
7441 
7442     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7443         pusedDefaultChar = &usedDefaultChar;
7444     else
7445         pusedDefaultChar = NULL;
7446 
7447     substring = PyUnicode_Substring(unicode, offset, offset+len);
7448     if (substring == NULL)
7449         return -1;
7450     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7451     if (p == NULL) {
7452         Py_DECREF(substring);
7453         return -1;
7454     }
7455     assert(size <= INT_MAX);
7456 
7457     /* First get the size of the result */
7458     outsize = WideCharToMultiByte(code_page, flags,
7459                                   p, (int)size,
7460                                   NULL, 0,
7461                                   NULL, pusedDefaultChar);
7462     if (outsize <= 0)
7463         goto error;
7464     /* If we used a default char, then we failed! */
7465     if (pusedDefaultChar && *pusedDefaultChar) {
7466         Py_DECREF(substring);
7467         return -2;
7468     }
7469 
7470     if (*outbytes == NULL) {
7471         /* Create string object */
7472         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7473         if (*outbytes == NULL) {
7474             Py_DECREF(substring);
7475             return -1;
7476         }
7477         out = PyBytes_AS_STRING(*outbytes);
7478     }
7479     else {
7480         /* Extend string object */
7481         const Py_ssize_t n = PyBytes_Size(*outbytes);
7482         if (outsize > PY_SSIZE_T_MAX - n) {
7483             PyErr_NoMemory();
7484             Py_DECREF(substring);
7485             return -1;
7486         }
7487         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7488             Py_DECREF(substring);
7489             return -1;
7490         }
7491         out = PyBytes_AS_STRING(*outbytes) + n;
7492     }
7493 
7494     /* Do the conversion */
7495     outsize = WideCharToMultiByte(code_page, flags,
7496                                   p, (int)size,
7497                                   out, outsize,
7498                                   NULL, pusedDefaultChar);
7499     Py_CLEAR(substring);
7500     if (outsize <= 0)
7501         goto error;
7502     if (pusedDefaultChar && *pusedDefaultChar)
7503         return -2;
7504     return 0;
7505 
7506 error:
7507     Py_XDECREF(substring);
7508     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7509         return -2;
7510     PyErr_SetFromWindowsErr(0);
7511     return -1;
7512 }
7513 
7514 /*
7515  * Encode a Unicode string to a Windows code page into a byte string using an
7516  * error handler.
7517  *
7518  * Returns consumed characters if succeed, or raise an OSError and returns
7519  * -1 on other error.
7520  */
7521 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7522 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7523                         PyObject *unicode, Py_ssize_t unicode_offset,
7524                         Py_ssize_t insize, const char* errors)
7525 {
7526     const DWORD flags = encode_code_page_flags(code_page, errors);
7527     Py_ssize_t pos = unicode_offset;
7528     Py_ssize_t endin = unicode_offset + insize;
7529     /* Ideally, we should get reason from FormatMessage. This is the Windows
7530        2000 English version of the message. */
7531     const char *reason = "invalid character";
7532     /* 4=maximum length of a UTF-8 sequence */
7533     char buffer[4];
7534     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7535     Py_ssize_t outsize;
7536     char *out;
7537     PyObject *errorHandler = NULL;
7538     PyObject *exc = NULL;
7539     PyObject *encoding_obj = NULL;
7540     const char *encoding;
7541     Py_ssize_t newpos, newoutsize;
7542     PyObject *rep;
7543     int ret = -1;
7544 
7545     assert(insize > 0);
7546 
7547     encoding = code_page_name(code_page, &encoding_obj);
7548     if (encoding == NULL)
7549         return -1;
7550 
7551     if (errors == NULL || strcmp(errors, "strict") == 0) {
7552         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7553            then we raise a UnicodeEncodeError. */
7554         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7555         if (exc != NULL) {
7556             PyCodec_StrictErrors(exc);
7557             Py_DECREF(exc);
7558         }
7559         Py_XDECREF(encoding_obj);
7560         return -1;
7561     }
7562 
7563     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7564         pusedDefaultChar = &usedDefaultChar;
7565     else
7566         pusedDefaultChar = NULL;
7567 
7568     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7569         PyErr_NoMemory();
7570         goto error;
7571     }
7572     outsize = insize * Py_ARRAY_LENGTH(buffer);
7573 
7574     if (*outbytes == NULL) {
7575         /* Create string object */
7576         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7577         if (*outbytes == NULL)
7578             goto error;
7579         out = PyBytes_AS_STRING(*outbytes);
7580     }
7581     else {
7582         /* Extend string object */
7583         Py_ssize_t n = PyBytes_Size(*outbytes);
7584         if (n > PY_SSIZE_T_MAX - outsize) {
7585             PyErr_NoMemory();
7586             goto error;
7587         }
7588         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7589             goto error;
7590         out = PyBytes_AS_STRING(*outbytes) + n;
7591     }
7592 
7593     /* Encode the string character per character */
7594     while (pos < endin)
7595     {
7596         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7597         wchar_t chars[2];
7598         int charsize;
7599         if (ch < 0x10000) {
7600             chars[0] = (wchar_t)ch;
7601             charsize = 1;
7602         }
7603         else {
7604             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7605             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7606             charsize = 2;
7607         }
7608 
7609         outsize = WideCharToMultiByte(code_page, flags,
7610                                       chars, charsize,
7611                                       buffer, Py_ARRAY_LENGTH(buffer),
7612                                       NULL, pusedDefaultChar);
7613         if (outsize > 0) {
7614             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7615             {
7616                 pos++;
7617                 memcpy(out, buffer, outsize);
7618                 out += outsize;
7619                 continue;
7620             }
7621         }
7622         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7623             PyErr_SetFromWindowsErr(0);
7624             goto error;
7625         }
7626 
7627         rep = unicode_encode_call_errorhandler(
7628                   errors, &errorHandler, encoding, reason,
7629                   unicode, &exc,
7630                   pos, pos + 1, &newpos);
7631         if (rep == NULL)
7632             goto error;
7633         pos = newpos;
7634 
7635         if (PyBytes_Check(rep)) {
7636             outsize = PyBytes_GET_SIZE(rep);
7637             if (outsize != 1) {
7638                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7639                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7640                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7641                     Py_DECREF(rep);
7642                     goto error;
7643                 }
7644                 out = PyBytes_AS_STRING(*outbytes) + offset;
7645             }
7646             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7647             out += outsize;
7648         }
7649         else {
7650             Py_ssize_t i;
7651             enum PyUnicode_Kind kind;
7652             void *data;
7653 
7654             if (PyUnicode_READY(rep) == -1) {
7655                 Py_DECREF(rep);
7656                 goto error;
7657             }
7658 
7659             outsize = PyUnicode_GET_LENGTH(rep);
7660             if (outsize != 1) {
7661                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7662                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7663                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7664                     Py_DECREF(rep);
7665                     goto error;
7666                 }
7667                 out = PyBytes_AS_STRING(*outbytes) + offset;
7668             }
7669             kind = PyUnicode_KIND(rep);
7670             data = PyUnicode_DATA(rep);
7671             for (i=0; i < outsize; i++) {
7672                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7673                 if (ch > 127) {
7674                     raise_encode_exception(&exc,
7675                         encoding, unicode,
7676                         pos, pos + 1,
7677                         "unable to encode error handler result to ASCII");
7678                     Py_DECREF(rep);
7679                     goto error;
7680                 }
7681                 *out = (unsigned char)ch;
7682                 out++;
7683             }
7684         }
7685         Py_DECREF(rep);
7686     }
7687     /* write a NUL byte */
7688     *out = 0;
7689     outsize = out - PyBytes_AS_STRING(*outbytes);
7690     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7691     if (_PyBytes_Resize(outbytes, outsize) < 0)
7692         goto error;
7693     ret = 0;
7694 
7695 error:
7696     Py_XDECREF(encoding_obj);
7697     Py_XDECREF(errorHandler);
7698     Py_XDECREF(exc);
7699     return ret;
7700 }
7701 
7702 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7703 encode_code_page(int code_page,
7704                  PyObject *unicode,
7705                  const char *errors)
7706 {
7707     Py_ssize_t len;
7708     PyObject *outbytes = NULL;
7709     Py_ssize_t offset;
7710     int chunk_len, ret, done;
7711 
7712     if (!PyUnicode_Check(unicode)) {
7713         PyErr_BadArgument();
7714         return NULL;
7715     }
7716 
7717     if (PyUnicode_READY(unicode) == -1)
7718         return NULL;
7719     len = PyUnicode_GET_LENGTH(unicode);
7720 
7721     if (code_page < 0) {
7722         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7723         return NULL;
7724     }
7725 
7726     if (len == 0)
7727         return PyBytes_FromStringAndSize(NULL, 0);
7728 
7729     offset = 0;
7730     do
7731     {
7732 #ifdef NEED_RETRY
7733         /* UTF-16 encoding may double the size, so use only INT_MAX/2
7734            chunks. */
7735         if (len > INT_MAX/2) {
7736             chunk_len = INT_MAX/2;
7737             done = 0;
7738         }
7739         else
7740 #endif
7741         {
7742             chunk_len = (int)len;
7743             done = 1;
7744         }
7745 
7746         ret = encode_code_page_strict(code_page, &outbytes,
7747                                       unicode, offset, chunk_len,
7748                                       errors);
7749         if (ret == -2)
7750             ret = encode_code_page_errors(code_page, &outbytes,
7751                                           unicode, offset,
7752                                           chunk_len, errors);
7753         if (ret < 0) {
7754             Py_XDECREF(outbytes);
7755             return NULL;
7756         }
7757 
7758         offset += chunk_len;
7759         len -= chunk_len;
7760     } while (!done);
7761 
7762     return outbytes;
7763 }
7764 
7765 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7766 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7767                      Py_ssize_t size,
7768                      const char *errors)
7769 {
7770     PyObject *unicode, *res;
7771     unicode = PyUnicode_FromWideChar(p, size);
7772     if (unicode == NULL)
7773         return NULL;
7774     res = encode_code_page(CP_ACP, unicode, errors);
7775     Py_DECREF(unicode);
7776     return res;
7777 }
7778 
7779 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7780 PyUnicode_EncodeCodePage(int code_page,
7781                          PyObject *unicode,
7782                          const char *errors)
7783 {
7784     return encode_code_page(code_page, unicode, errors);
7785 }
7786 
7787 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7788 PyUnicode_AsMBCSString(PyObject *unicode)
7789 {
7790     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7791 }
7792 
7793 #undef NEED_RETRY
7794 
7795 #endif /* MS_WINDOWS */
7796 
7797 /* --- Character Mapping Codec -------------------------------------------- */
7798 
7799 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7800 charmap_decode_string(const char *s,
7801                       Py_ssize_t size,
7802                       PyObject *mapping,
7803                       const char *errors,
7804                       _PyUnicodeWriter *writer)
7805 {
7806     const char *starts = s;
7807     const char *e;
7808     Py_ssize_t startinpos, endinpos;
7809     PyObject *errorHandler = NULL, *exc = NULL;
7810     Py_ssize_t maplen;
7811     enum PyUnicode_Kind mapkind;
7812     void *mapdata;
7813     Py_UCS4 x;
7814     unsigned char ch;
7815 
7816     if (PyUnicode_READY(mapping) == -1)
7817         return -1;
7818 
7819     maplen = PyUnicode_GET_LENGTH(mapping);
7820     mapdata = PyUnicode_DATA(mapping);
7821     mapkind = PyUnicode_KIND(mapping);
7822 
7823     e = s + size;
7824 
7825     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7826         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7827          * is disabled in encoding aliases, latin1 is preferred because
7828          * its implementation is faster. */
7829         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7830         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7831         Py_UCS4 maxchar = writer->maxchar;
7832 
7833         assert (writer->kind == PyUnicode_1BYTE_KIND);
7834         while (s < e) {
7835             ch = *s;
7836             x = mapdata_ucs1[ch];
7837             if (x > maxchar) {
7838                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7839                     goto onError;
7840                 maxchar = writer->maxchar;
7841                 outdata = (Py_UCS1 *)writer->data;
7842             }
7843             outdata[writer->pos] = x;
7844             writer->pos++;
7845             ++s;
7846         }
7847         return 0;
7848     }
7849 
7850     while (s < e) {
7851         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7852             enum PyUnicode_Kind outkind = writer->kind;
7853             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7854             if (outkind == PyUnicode_1BYTE_KIND) {
7855                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7856                 Py_UCS4 maxchar = writer->maxchar;
7857                 while (s < e) {
7858                     ch = *s;
7859                     x = mapdata_ucs2[ch];
7860                     if (x > maxchar)
7861                         goto Error;
7862                     outdata[writer->pos] = x;
7863                     writer->pos++;
7864                     ++s;
7865                 }
7866                 break;
7867             }
7868             else if (outkind == PyUnicode_2BYTE_KIND) {
7869                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7870                 while (s < e) {
7871                     ch = *s;
7872                     x = mapdata_ucs2[ch];
7873                     if (x == 0xFFFE)
7874                         goto Error;
7875                     outdata[writer->pos] = x;
7876                     writer->pos++;
7877                     ++s;
7878                 }
7879                 break;
7880             }
7881         }
7882         ch = *s;
7883 
7884         if (ch < maplen)
7885             x = PyUnicode_READ(mapkind, mapdata, ch);
7886         else
7887             x = 0xfffe; /* invalid value */
7888 Error:
7889         if (x == 0xfffe)
7890         {
7891             /* undefined mapping */
7892             startinpos = s-starts;
7893             endinpos = startinpos+1;
7894             if (unicode_decode_call_errorhandler_writer(
7895                     errors, &errorHandler,
7896                     "charmap", "character maps to <undefined>",
7897                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7898                     writer)) {
7899                 goto onError;
7900             }
7901             continue;
7902         }
7903 
7904         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7905             goto onError;
7906         ++s;
7907     }
7908     Py_XDECREF(errorHandler);
7909     Py_XDECREF(exc);
7910     return 0;
7911 
7912 onError:
7913     Py_XDECREF(errorHandler);
7914     Py_XDECREF(exc);
7915     return -1;
7916 }
7917 
7918 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7919 charmap_decode_mapping(const char *s,
7920                        Py_ssize_t size,
7921                        PyObject *mapping,
7922                        const char *errors,
7923                        _PyUnicodeWriter *writer)
7924 {
7925     const char *starts = s;
7926     const char *e;
7927     Py_ssize_t startinpos, endinpos;
7928     PyObject *errorHandler = NULL, *exc = NULL;
7929     unsigned char ch;
7930     PyObject *key, *item = NULL;
7931 
7932     e = s + size;
7933 
7934     while (s < e) {
7935         ch = *s;
7936 
7937         /* Get mapping (char ordinal -> integer, Unicode char or None) */
7938         key = PyLong_FromLong((long)ch);
7939         if (key == NULL)
7940             goto onError;
7941 
7942         item = PyObject_GetItem(mapping, key);
7943         Py_DECREF(key);
7944         if (item == NULL) {
7945             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7946                 /* No mapping found means: mapping is undefined. */
7947                 PyErr_Clear();
7948                 goto Undefined;
7949             } else
7950                 goto onError;
7951         }
7952 
7953         /* Apply mapping */
7954         if (item == Py_None)
7955             goto Undefined;
7956         if (PyLong_Check(item)) {
7957             long value = PyLong_AS_LONG(item);
7958             if (value == 0xFFFE)
7959                 goto Undefined;
7960             if (value < 0 || value > MAX_UNICODE) {
7961                 PyErr_Format(PyExc_TypeError,
7962                              "character mapping must be in range(0x%lx)",
7963                              (unsigned long)MAX_UNICODE + 1);
7964                 goto onError;
7965             }
7966 
7967             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968                 goto onError;
7969         }
7970         else if (PyUnicode_Check(item)) {
7971             if (PyUnicode_READY(item) == -1)
7972                 goto onError;
7973             if (PyUnicode_GET_LENGTH(item) == 1) {
7974                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7975                 if (value == 0xFFFE)
7976                     goto Undefined;
7977                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7978                     goto onError;
7979             }
7980             else {
7981                 writer->overallocate = 1;
7982                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7983                     goto onError;
7984             }
7985         }
7986         else {
7987             /* wrong return value */
7988             PyErr_SetString(PyExc_TypeError,
7989                             "character mapping must return integer, None or str");
7990             goto onError;
7991         }
7992         Py_CLEAR(item);
7993         ++s;
7994         continue;
7995 
7996 Undefined:
7997         /* undefined mapping */
7998         Py_CLEAR(item);
7999         startinpos = s-starts;
8000         endinpos = startinpos+1;
8001         if (unicode_decode_call_errorhandler_writer(
8002                 errors, &errorHandler,
8003                 "charmap", "character maps to <undefined>",
8004                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8005                 writer)) {
8006             goto onError;
8007         }
8008     }
8009     Py_XDECREF(errorHandler);
8010     Py_XDECREF(exc);
8011     return 0;
8012 
8013 onError:
8014     Py_XDECREF(item);
8015     Py_XDECREF(errorHandler);
8016     Py_XDECREF(exc);
8017     return -1;
8018 }
8019 
8020 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8021 PyUnicode_DecodeCharmap(const char *s,
8022                         Py_ssize_t size,
8023                         PyObject *mapping,
8024                         const char *errors)
8025 {
8026     _PyUnicodeWriter writer;
8027 
8028     /* Default to Latin-1 */
8029     if (mapping == NULL)
8030         return PyUnicode_DecodeLatin1(s, size, errors);
8031 
8032     if (size == 0)
8033         _Py_RETURN_UNICODE_EMPTY();
8034     _PyUnicodeWriter_Init(&writer);
8035     writer.min_length = size;
8036     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8037         goto onError;
8038 
8039     if (PyUnicode_CheckExact(mapping)) {
8040         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8041             goto onError;
8042     }
8043     else {
8044         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8045             goto onError;
8046     }
8047     return _PyUnicodeWriter_Finish(&writer);
8048 
8049   onError:
8050     _PyUnicodeWriter_Dealloc(&writer);
8051     return NULL;
8052 }
8053 
8054 /* Charmap encoding: the lookup table */
8055 
8056 struct encoding_map {
8057     PyObject_HEAD
8058     unsigned char level1[32];
8059     int count2, count3;
8060     unsigned char level23[1];
8061 };
8062 
8063 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8064 encoding_map_size(PyObject *obj, PyObject* args)
8065 {
8066     struct encoding_map *map = (struct encoding_map*)obj;
8067     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8068                            128*map->count3);
8069 }
8070 
8071 static PyMethodDef encoding_map_methods[] = {
8072     {"size", encoding_map_size, METH_NOARGS,
8073      PyDoc_STR("Return the size (in bytes) of this object") },
8074     { 0 }
8075 };
8076 
8077 static void
encoding_map_dealloc(PyObject * o)8078 encoding_map_dealloc(PyObject* o)
8079 {
8080     PyObject_FREE(o);
8081 }
8082 
8083 static PyTypeObject EncodingMapType = {
8084     PyVarObject_HEAD_INIT(NULL, 0)
8085     "EncodingMap",          /*tp_name*/
8086     sizeof(struct encoding_map),   /*tp_basicsize*/
8087     0,                      /*tp_itemsize*/
8088     /* methods */
8089     encoding_map_dealloc,   /*tp_dealloc*/
8090     0,                      /*tp_print*/
8091     0,                      /*tp_getattr*/
8092     0,                      /*tp_setattr*/
8093     0,                      /*tp_reserved*/
8094     0,                      /*tp_repr*/
8095     0,                      /*tp_as_number*/
8096     0,                      /*tp_as_sequence*/
8097     0,                      /*tp_as_mapping*/
8098     0,                      /*tp_hash*/
8099     0,                      /*tp_call*/
8100     0,                      /*tp_str*/
8101     0,                      /*tp_getattro*/
8102     0,                      /*tp_setattro*/
8103     0,                      /*tp_as_buffer*/
8104     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8105     0,                      /*tp_doc*/
8106     0,                      /*tp_traverse*/
8107     0,                      /*tp_clear*/
8108     0,                      /*tp_richcompare*/
8109     0,                      /*tp_weaklistoffset*/
8110     0,                      /*tp_iter*/
8111     0,                      /*tp_iternext*/
8112     encoding_map_methods,   /*tp_methods*/
8113     0,                      /*tp_members*/
8114     0,                      /*tp_getset*/
8115     0,                      /*tp_base*/
8116     0,                      /*tp_dict*/
8117     0,                      /*tp_descr_get*/
8118     0,                      /*tp_descr_set*/
8119     0,                      /*tp_dictoffset*/
8120     0,                      /*tp_init*/
8121     0,                      /*tp_alloc*/
8122     0,                      /*tp_new*/
8123     0,                      /*tp_free*/
8124     0,                      /*tp_is_gc*/
8125 };
8126 
8127 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8128 PyUnicode_BuildEncodingMap(PyObject* string)
8129 {
8130     PyObject *result;
8131     struct encoding_map *mresult;
8132     int i;
8133     int need_dict = 0;
8134     unsigned char level1[32];
8135     unsigned char level2[512];
8136     unsigned char *mlevel1, *mlevel2, *mlevel3;
8137     int count2 = 0, count3 = 0;
8138     int kind;
8139     void *data;
8140     Py_ssize_t length;
8141     Py_UCS4 ch;
8142 
8143     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8144         PyErr_BadArgument();
8145         return NULL;
8146     }
8147     kind = PyUnicode_KIND(string);
8148     data = PyUnicode_DATA(string);
8149     length = PyUnicode_GET_LENGTH(string);
8150     length = Py_MIN(length, 256);
8151     memset(level1, 0xFF, sizeof level1);
8152     memset(level2, 0xFF, sizeof level2);
8153 
8154     /* If there isn't a one-to-one mapping of NULL to \0,
8155        or if there are non-BMP characters, we need to use
8156        a mapping dictionary. */
8157     if (PyUnicode_READ(kind, data, 0) != 0)
8158         need_dict = 1;
8159     for (i = 1; i < length; i++) {
8160         int l1, l2;
8161         ch = PyUnicode_READ(kind, data, i);
8162         if (ch == 0 || ch > 0xFFFF) {
8163             need_dict = 1;
8164             break;
8165         }
8166         if (ch == 0xFFFE)
8167             /* unmapped character */
8168             continue;
8169         l1 = ch >> 11;
8170         l2 = ch >> 7;
8171         if (level1[l1] == 0xFF)
8172             level1[l1] = count2++;
8173         if (level2[l2] == 0xFF)
8174             level2[l2] = count3++;
8175     }
8176 
8177     if (count2 >= 0xFF || count3 >= 0xFF)
8178         need_dict = 1;
8179 
8180     if (need_dict) {
8181         PyObject *result = PyDict_New();
8182         PyObject *key, *value;
8183         if (!result)
8184             return NULL;
8185         for (i = 0; i < length; i++) {
8186             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8187             value = PyLong_FromLong(i);
8188             if (!key || !value)
8189                 goto failed1;
8190             if (PyDict_SetItem(result, key, value) == -1)
8191                 goto failed1;
8192             Py_DECREF(key);
8193             Py_DECREF(value);
8194         }
8195         return result;
8196       failed1:
8197         Py_XDECREF(key);
8198         Py_XDECREF(value);
8199         Py_DECREF(result);
8200         return NULL;
8201     }
8202 
8203     /* Create a three-level trie */
8204     result = PyObject_MALLOC(sizeof(struct encoding_map) +
8205                              16*count2 + 128*count3 - 1);
8206     if (!result)
8207         return PyErr_NoMemory();
8208     PyObject_Init(result, &EncodingMapType);
8209     mresult = (struct encoding_map*)result;
8210     mresult->count2 = count2;
8211     mresult->count3 = count3;
8212     mlevel1 = mresult->level1;
8213     mlevel2 = mresult->level23;
8214     mlevel3 = mresult->level23 + 16*count2;
8215     memcpy(mlevel1, level1, 32);
8216     memset(mlevel2, 0xFF, 16*count2);
8217     memset(mlevel3, 0, 128*count3);
8218     count3 = 0;
8219     for (i = 1; i < length; i++) {
8220         int o1, o2, o3, i2, i3;
8221         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8222         if (ch == 0xFFFE)
8223             /* unmapped character */
8224             continue;
8225         o1 = ch>>11;
8226         o2 = (ch>>7) & 0xF;
8227         i2 = 16*mlevel1[o1] + o2;
8228         if (mlevel2[i2] == 0xFF)
8229             mlevel2[i2] = count3++;
8230         o3 = ch & 0x7F;
8231         i3 = 128*mlevel2[i2] + o3;
8232         mlevel3[i3] = i;
8233     }
8234     return result;
8235 }
8236 
8237 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8238 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8239 {
8240     struct encoding_map *map = (struct encoding_map*)mapping;
8241     int l1 = c>>11;
8242     int l2 = (c>>7) & 0xF;
8243     int l3 = c & 0x7F;
8244     int i;
8245 
8246     if (c > 0xFFFF)
8247         return -1;
8248     if (c == 0)
8249         return 0;
8250     /* level 1*/
8251     i = map->level1[l1];
8252     if (i == 0xFF) {
8253         return -1;
8254     }
8255     /* level 2*/
8256     i = map->level23[16*i+l2];
8257     if (i == 0xFF) {
8258         return -1;
8259     }
8260     /* level 3 */
8261     i = map->level23[16*map->count2 + 128*i + l3];
8262     if (i == 0) {
8263         return -1;
8264     }
8265     return i;
8266 }
8267 
8268 /* Lookup the character ch in the mapping. If the character
8269    can't be found, Py_None is returned (or NULL, if another
8270    error occurred). */
8271 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8272 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8273 {
8274     PyObject *w = PyLong_FromLong((long)c);
8275     PyObject *x;
8276 
8277     if (w == NULL)
8278         return NULL;
8279     x = PyObject_GetItem(mapping, w);
8280     Py_DECREF(w);
8281     if (x == NULL) {
8282         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8283             /* No mapping found means: mapping is undefined. */
8284             PyErr_Clear();
8285             Py_RETURN_NONE;
8286         } else
8287             return NULL;
8288     }
8289     else if (x == Py_None)
8290         return x;
8291     else if (PyLong_Check(x)) {
8292         long value = PyLong_AS_LONG(x);
8293         if (value < 0 || value > 255) {
8294             PyErr_SetString(PyExc_TypeError,
8295                             "character mapping must be in range(256)");
8296             Py_DECREF(x);
8297             return NULL;
8298         }
8299         return x;
8300     }
8301     else if (PyBytes_Check(x))
8302         return x;
8303     else {
8304         /* wrong return value */
8305         PyErr_Format(PyExc_TypeError,
8306                      "character mapping must return integer, bytes or None, not %.400s",
8307                      x->ob_type->tp_name);
8308         Py_DECREF(x);
8309         return NULL;
8310     }
8311 }
8312 
8313 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8314 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8315 {
8316     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8317     /* exponentially overallocate to minimize reallocations */
8318     if (requiredsize < 2*outsize)
8319         requiredsize = 2*outsize;
8320     if (_PyBytes_Resize(outobj, requiredsize))
8321         return -1;
8322     return 0;
8323 }
8324 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
8328 /* lookup the character, put the result in the output string and adjust
8329    various state variables. Resize the output bytes object if not enough
8330    space is available. Return a new reference to the object that
8331    was put in the output buffer, or Py_None, if the mapping was undefined
8332    (in which case no character was written) or NULL, if a
8333    reallocation error occurred. The caller must decref the result */
8334 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8335 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8336                      PyObject **outobj, Py_ssize_t *outpos)
8337 {
8338     PyObject *rep;
8339     char *outstart;
8340     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8341 
8342     if (Py_TYPE(mapping) == &EncodingMapType) {
8343         int res = encoding_map_lookup(c, mapping);
8344         Py_ssize_t requiredsize = *outpos+1;
8345         if (res == -1)
8346             return enc_FAILED;
8347         if (outsize<requiredsize)
8348             if (charmapencode_resize(outobj, outpos, requiredsize))
8349                 return enc_EXCEPTION;
8350         outstart = PyBytes_AS_STRING(*outobj);
8351         outstart[(*outpos)++] = (char)res;
8352         return enc_SUCCESS;
8353     }
8354 
8355     rep = charmapencode_lookup(c, mapping);
8356     if (rep==NULL)
8357         return enc_EXCEPTION;
8358     else if (rep==Py_None) {
8359         Py_DECREF(rep);
8360         return enc_FAILED;
8361     } else {
8362         if (PyLong_Check(rep)) {
8363             Py_ssize_t requiredsize = *outpos+1;
8364             if (outsize<requiredsize)
8365                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8366                     Py_DECREF(rep);
8367                     return enc_EXCEPTION;
8368                 }
8369             outstart = PyBytes_AS_STRING(*outobj);
8370             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8371         }
8372         else {
8373             const char *repchars = PyBytes_AS_STRING(rep);
8374             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8375             Py_ssize_t requiredsize = *outpos+repsize;
8376             if (outsize<requiredsize)
8377                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8378                     Py_DECREF(rep);
8379                     return enc_EXCEPTION;
8380                 }
8381             outstart = PyBytes_AS_STRING(*outobj);
8382             memcpy(outstart + *outpos, repchars, repsize);
8383             *outpos += repsize;
8384         }
8385     }
8386     Py_DECREF(rep);
8387     return enc_SUCCESS;
8388 }
8389 
8390 /* handle an error in PyUnicode_EncodeCharmap
8391    Return 0 on success, -1 on error */
8392 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8393 charmap_encoding_error(
8394     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8395     PyObject **exceptionObject,
8396     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8397     PyObject **res, Py_ssize_t *respos)
8398 {
8399     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8400     Py_ssize_t size, repsize;
8401     Py_ssize_t newpos;
8402     enum PyUnicode_Kind kind;
8403     void *data;
8404     Py_ssize_t index;
8405     /* startpos for collecting unencodable chars */
8406     Py_ssize_t collstartpos = *inpos;
8407     Py_ssize_t collendpos = *inpos+1;
8408     Py_ssize_t collpos;
8409     const char *encoding = "charmap";
8410     const char *reason = "character maps to <undefined>";
8411     charmapencode_result x;
8412     Py_UCS4 ch;
8413     int val;
8414 
8415     if (PyUnicode_READY(unicode) == -1)
8416         return -1;
8417     size = PyUnicode_GET_LENGTH(unicode);
8418     /* find all unencodable characters */
8419     while (collendpos < size) {
8420         PyObject *rep;
8421         if (Py_TYPE(mapping) == &EncodingMapType) {
8422             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8423             val = encoding_map_lookup(ch, mapping);
8424             if (val != -1)
8425                 break;
8426             ++collendpos;
8427             continue;
8428         }
8429 
8430         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8431         rep = charmapencode_lookup(ch, mapping);
8432         if (rep==NULL)
8433             return -1;
8434         else if (rep!=Py_None) {
8435             Py_DECREF(rep);
8436             break;
8437         }
8438         Py_DECREF(rep);
8439         ++collendpos;
8440     }
8441     /* cache callback name lookup
8442      * (if not done yet, i.e. it's the first error) */
8443     if (*error_handler == _Py_ERROR_UNKNOWN)
8444         *error_handler = get_error_handler(errors);
8445 
8446     switch (*error_handler) {
8447     case _Py_ERROR_STRICT:
8448         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8449         return -1;
8450 
8451     case _Py_ERROR_REPLACE:
8452         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8453             x = charmapencode_output('?', mapping, res, respos);
8454             if (x==enc_EXCEPTION) {
8455                 return -1;
8456             }
8457             else if (x==enc_FAILED) {
8458                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8459                 return -1;
8460             }
8461         }
8462         /* fall through */
8463     case _Py_ERROR_IGNORE:
8464         *inpos = collendpos;
8465         break;
8466 
8467     case _Py_ERROR_XMLCHARREFREPLACE:
8468         /* generate replacement (temporarily (mis)uses p) */
8469         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8470             char buffer[2+29+1+1];
8471             char *cp;
8472             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8473             for (cp = buffer; *cp; ++cp) {
8474                 x = charmapencode_output(*cp, mapping, res, respos);
8475                 if (x==enc_EXCEPTION)
8476                     return -1;
8477                 else if (x==enc_FAILED) {
8478                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8479                     return -1;
8480                 }
8481             }
8482         }
8483         *inpos = collendpos;
8484         break;
8485 
8486     default:
8487         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8488                                                       encoding, reason, unicode, exceptionObject,
8489                                                       collstartpos, collendpos, &newpos);
8490         if (repunicode == NULL)
8491             return -1;
8492         if (PyBytes_Check(repunicode)) {
8493             /* Directly copy bytes result to output. */
8494             Py_ssize_t outsize = PyBytes_Size(*res);
8495             Py_ssize_t requiredsize;
8496             repsize = PyBytes_Size(repunicode);
8497             requiredsize = *respos + repsize;
8498             if (requiredsize > outsize)
8499                 /* Make room for all additional bytes. */
8500                 if (charmapencode_resize(res, respos, requiredsize)) {
8501                     Py_DECREF(repunicode);
8502                     return -1;
8503                 }
8504             memcpy(PyBytes_AsString(*res) + *respos,
8505                    PyBytes_AsString(repunicode),  repsize);
8506             *respos += repsize;
8507             *inpos = newpos;
8508             Py_DECREF(repunicode);
8509             break;
8510         }
8511         /* generate replacement  */
8512         if (PyUnicode_READY(repunicode) == -1) {
8513             Py_DECREF(repunicode);
8514             return -1;
8515         }
8516         repsize = PyUnicode_GET_LENGTH(repunicode);
8517         data = PyUnicode_DATA(repunicode);
8518         kind = PyUnicode_KIND(repunicode);
8519         for (index = 0; index < repsize; index++) {
8520             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8521             x = charmapencode_output(repch, mapping, res, respos);
8522             if (x==enc_EXCEPTION) {
8523                 Py_DECREF(repunicode);
8524                 return -1;
8525             }
8526             else if (x==enc_FAILED) {
8527                 Py_DECREF(repunicode);
8528                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8529                 return -1;
8530             }
8531         }
8532         *inpos = newpos;
8533         Py_DECREF(repunicode);
8534     }
8535     return 0;
8536 }
8537 
8538 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8539 _PyUnicode_EncodeCharmap(PyObject *unicode,
8540                          PyObject *mapping,
8541                          const char *errors)
8542 {
8543     /* output object */
8544     PyObject *res = NULL;
8545     /* current input position */
8546     Py_ssize_t inpos = 0;
8547     Py_ssize_t size;
8548     /* current output position */
8549     Py_ssize_t respos = 0;
8550     PyObject *error_handler_obj = NULL;
8551     PyObject *exc = NULL;
8552     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8553     void *data;
8554     int kind;
8555 
8556     if (PyUnicode_READY(unicode) == -1)
8557         return NULL;
8558     size = PyUnicode_GET_LENGTH(unicode);
8559     data = PyUnicode_DATA(unicode);
8560     kind = PyUnicode_KIND(unicode);
8561 
8562     /* Default to Latin-1 */
8563     if (mapping == NULL)
8564         return unicode_encode_ucs1(unicode, errors, 256);
8565 
8566     /* allocate enough for a simple encoding without
8567        replacements, if we need more, we'll resize */
8568     res = PyBytes_FromStringAndSize(NULL, size);
8569     if (res == NULL)
8570         goto onError;
8571     if (size == 0)
8572         return res;
8573 
8574     while (inpos<size) {
8575         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8576         /* try to encode it */
8577         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8578         if (x==enc_EXCEPTION) /* error */
8579             goto onError;
8580         if (x==enc_FAILED) { /* unencodable character */
8581             if (charmap_encoding_error(unicode, &inpos, mapping,
8582                                        &exc,
8583                                        &error_handler, &error_handler_obj, errors,
8584                                        &res, &respos)) {
8585                 goto onError;
8586             }
8587         }
8588         else
8589             /* done with this character => adjust input position */
8590             ++inpos;
8591     }
8592 
8593     /* Resize if we allocated to much */
8594     if (respos<PyBytes_GET_SIZE(res))
8595         if (_PyBytes_Resize(&res, respos) < 0)
8596             goto onError;
8597 
8598     Py_XDECREF(exc);
8599     Py_XDECREF(error_handler_obj);
8600     return res;
8601 
8602   onError:
8603     Py_XDECREF(res);
8604     Py_XDECREF(exc);
8605     Py_XDECREF(error_handler_obj);
8606     return NULL;
8607 }
8608 
8609 /* Deprecated */
8610 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8611 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8612                         Py_ssize_t size,
8613                         PyObject *mapping,
8614                         const char *errors)
8615 {
8616     PyObject *result;
8617     PyObject *unicode = PyUnicode_FromWideChar(p, size);
8618     if (unicode == NULL)
8619         return NULL;
8620     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8621     Py_DECREF(unicode);
8622     return result;
8623 }
8624 
8625 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8626 PyUnicode_AsCharmapString(PyObject *unicode,
8627                           PyObject *mapping)
8628 {
8629     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8630         PyErr_BadArgument();
8631         return NULL;
8632     }
8633     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8634 }
8635 
8636 /* create or adjust a UnicodeTranslateError */
8637 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8638 make_translate_exception(PyObject **exceptionObject,
8639                          PyObject *unicode,
8640                          Py_ssize_t startpos, Py_ssize_t endpos,
8641                          const char *reason)
8642 {
8643     if (*exceptionObject == NULL) {
8644         *exceptionObject = _PyUnicodeTranslateError_Create(
8645             unicode, startpos, endpos, reason);
8646     }
8647     else {
8648         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8649             goto onError;
8650         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8651             goto onError;
8652         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8653             goto onError;
8654         return;
8655       onError:
8656         Py_CLEAR(*exceptionObject);
8657     }
8658 }
8659 
8660 /* error handling callback helper:
8661    build arguments, call the callback and check the arguments,
8662    put the result into newpos and return the replacement string, which
8663    has to be freed by the caller */
8664 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8665 unicode_translate_call_errorhandler(const char *errors,
8666                                     PyObject **errorHandler,
8667                                     const char *reason,
8668                                     PyObject *unicode, PyObject **exceptionObject,
8669                                     Py_ssize_t startpos, Py_ssize_t endpos,
8670                                     Py_ssize_t *newpos)
8671 {
8672     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8673 
8674     Py_ssize_t i_newpos;
8675     PyObject *restuple;
8676     PyObject *resunicode;
8677 
8678     if (*errorHandler == NULL) {
8679         *errorHandler = PyCodec_LookupError(errors);
8680         if (*errorHandler == NULL)
8681             return NULL;
8682     }
8683 
8684     make_translate_exception(exceptionObject,
8685                              unicode, startpos, endpos, reason);
8686     if (*exceptionObject == NULL)
8687         return NULL;
8688 
8689     restuple = PyObject_CallFunctionObjArgs(
8690         *errorHandler, *exceptionObject, NULL);
8691     if (restuple == NULL)
8692         return NULL;
8693     if (!PyTuple_Check(restuple)) {
8694         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8695         Py_DECREF(restuple);
8696         return NULL;
8697     }
8698     if (!PyArg_ParseTuple(restuple, argparse,
8699                           &resunicode, &i_newpos)) {
8700         Py_DECREF(restuple);
8701         return NULL;
8702     }
8703     if (i_newpos<0)
8704         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8705     else
8706         *newpos = i_newpos;
8707     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8708         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8709         Py_DECREF(restuple);
8710         return NULL;
8711     }
8712     Py_INCREF(resunicode);
8713     Py_DECREF(restuple);
8714     return resunicode;
8715 }
8716 
8717 /* Lookup the character ch in the mapping and put the result in result,
8718    which must be decrefed by the caller.
8719    Return 0 on success, -1 on error */
8720 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8721 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8722 {
8723     PyObject *w = PyLong_FromLong((long)c);
8724     PyObject *x;
8725 
8726     if (w == NULL)
8727         return -1;
8728     x = PyObject_GetItem(mapping, w);
8729     Py_DECREF(w);
8730     if (x == NULL) {
8731         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8732             /* No mapping found means: use 1:1 mapping. */
8733             PyErr_Clear();
8734             *result = NULL;
8735             return 0;
8736         } else
8737             return -1;
8738     }
8739     else if (x == Py_None) {
8740         *result = x;
8741         return 0;
8742     }
8743     else if (PyLong_Check(x)) {
8744         long value = PyLong_AS_LONG(x);
8745         if (value < 0 || value > MAX_UNICODE) {
8746             PyErr_Format(PyExc_ValueError,
8747                          "character mapping must be in range(0x%x)",
8748                          MAX_UNICODE+1);
8749             Py_DECREF(x);
8750             return -1;
8751         }
8752         *result = x;
8753         return 0;
8754     }
8755     else if (PyUnicode_Check(x)) {
8756         *result = x;
8757         return 0;
8758     }
8759     else {
8760         /* wrong return value */
8761         PyErr_SetString(PyExc_TypeError,
8762                         "character mapping must return integer, None or str");
8763         Py_DECREF(x);
8764         return -1;
8765     }
8766 }
8767 
8768 /* lookup the character, write the result into the writer.
8769    Return 1 if the result was written into the writer, return 0 if the mapping
8770    was undefined, raise an exception return -1 on error. */
8771 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8772 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8773                         _PyUnicodeWriter *writer)
8774 {
8775     PyObject *item;
8776 
8777     if (charmaptranslate_lookup(ch, mapping, &item))
8778         return -1;
8779 
8780     if (item == NULL) {
8781         /* not found => default to 1:1 mapping */
8782         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8783             return -1;
8784         }
8785         return 1;
8786     }
8787 
8788     if (item == Py_None) {
8789         Py_DECREF(item);
8790         return 0;
8791     }
8792 
8793     if (PyLong_Check(item)) {
8794         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8795         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8796            used it */
8797         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8798             Py_DECREF(item);
8799             return -1;
8800         }
8801         Py_DECREF(item);
8802         return 1;
8803     }
8804 
8805     if (!PyUnicode_Check(item)) {
8806         Py_DECREF(item);
8807         return -1;
8808     }
8809 
8810     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8811         Py_DECREF(item);
8812         return -1;
8813     }
8814 
8815     Py_DECREF(item);
8816     return 1;
8817 }
8818 
8819 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8820 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8821                               Py_UCS1 *translate)
8822 {
8823     PyObject *item = NULL;
8824     int ret = 0;
8825 
8826     if (charmaptranslate_lookup(ch, mapping, &item)) {
8827         return -1;
8828     }
8829 
8830     if (item == Py_None) {
8831         /* deletion */
8832         translate[ch] = 0xfe;
8833     }
8834     else if (item == NULL) {
8835         /* not found => default to 1:1 mapping */
8836         translate[ch] = ch;
8837         return 1;
8838     }
8839     else if (PyLong_Check(item)) {
8840         long replace = PyLong_AS_LONG(item);
8841         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8842            used it */
8843         if (127 < replace) {
8844             /* invalid character or character outside ASCII:
8845                skip the fast translate */
8846             goto exit;
8847         }
8848         translate[ch] = (Py_UCS1)replace;
8849     }
8850     else if (PyUnicode_Check(item)) {
8851         Py_UCS4 replace;
8852 
8853         if (PyUnicode_READY(item) == -1) {
8854             Py_DECREF(item);
8855             return -1;
8856         }
8857         if (PyUnicode_GET_LENGTH(item) != 1)
8858             goto exit;
8859 
8860         replace = PyUnicode_READ_CHAR(item, 0);
8861         if (replace > 127)
8862             goto exit;
8863         translate[ch] = (Py_UCS1)replace;
8864     }
8865     else {
8866         /* not None, NULL, long or unicode */
8867         goto exit;
8868     }
8869     ret = 1;
8870 
8871   exit:
8872     Py_DECREF(item);
8873     return ret;
8874 }
8875 
8876 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8877    was translated into writer, return 0 if the input string was partially
8878    translated into writer, raise an exception and return -1 on error. */
8879 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8880 unicode_fast_translate(PyObject *input, PyObject *mapping,
8881                        _PyUnicodeWriter *writer, int ignore,
8882                        Py_ssize_t *input_pos)
8883 {
8884     Py_UCS1 ascii_table[128], ch, ch2;
8885     Py_ssize_t len;
8886     Py_UCS1 *in, *end, *out;
8887     int res = 0;
8888 
8889     len = PyUnicode_GET_LENGTH(input);
8890 
8891     memset(ascii_table, 0xff, 128);
8892 
8893     in = PyUnicode_1BYTE_DATA(input);
8894     end = in + len;
8895 
8896     assert(PyUnicode_IS_ASCII(writer->buffer));
8897     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8898     out = PyUnicode_1BYTE_DATA(writer->buffer);
8899 
8900     for (; in < end; in++) {
8901         ch = *in;
8902         ch2 = ascii_table[ch];
8903         if (ch2 == 0xff) {
8904             int translate = unicode_fast_translate_lookup(mapping, ch,
8905                                                           ascii_table);
8906             if (translate < 0)
8907                 return -1;
8908             if (translate == 0)
8909                 goto exit;
8910             ch2 = ascii_table[ch];
8911         }
8912         if (ch2 == 0xfe) {
8913             if (ignore)
8914                 continue;
8915             goto exit;
8916         }
8917         assert(ch2 < 128);
8918         *out = ch2;
8919         out++;
8920     }
8921     res = 1;
8922 
8923 exit:
8924     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8925     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8926     return res;
8927 }
8928 
8929 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8930 _PyUnicode_TranslateCharmap(PyObject *input,
8931                             PyObject *mapping,
8932                             const char *errors)
8933 {
8934     /* input object */
8935     char *data;
8936     Py_ssize_t size, i;
8937     int kind;
8938     /* output buffer */
8939     _PyUnicodeWriter writer;
8940     /* error handler */
8941     const char *reason = "character maps to <undefined>";
8942     PyObject *errorHandler = NULL;
8943     PyObject *exc = NULL;
8944     int ignore;
8945     int res;
8946 
8947     if (mapping == NULL) {
8948         PyErr_BadArgument();
8949         return NULL;
8950     }
8951 
8952     if (PyUnicode_READY(input) == -1)
8953         return NULL;
8954     data = (char*)PyUnicode_DATA(input);
8955     kind = PyUnicode_KIND(input);
8956     size = PyUnicode_GET_LENGTH(input);
8957 
8958     if (size == 0)
8959         return PyUnicode_FromObject(input);
8960 
8961     /* allocate enough for a simple 1:1 translation without
8962        replacements, if we need more, we'll resize */
8963     _PyUnicodeWriter_Init(&writer);
8964     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8965         goto onError;
8966 
8967     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8968 
8969     if (PyUnicode_READY(input) == -1)
8970         return NULL;
8971     if (PyUnicode_IS_ASCII(input)) {
8972         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8973         if (res < 0) {
8974             _PyUnicodeWriter_Dealloc(&writer);
8975             return NULL;
8976         }
8977         if (res == 1)
8978             return _PyUnicodeWriter_Finish(&writer);
8979     }
8980     else {
8981         i = 0;
8982     }
8983 
8984     while (i<size) {
8985         /* try to encode it */
8986         int translate;
8987         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8988         Py_ssize_t newpos;
8989         /* startpos for collecting untranslatable chars */
8990         Py_ssize_t collstart;
8991         Py_ssize_t collend;
8992         Py_UCS4 ch;
8993 
8994         ch = PyUnicode_READ(kind, data, i);
8995         translate = charmaptranslate_output(ch, mapping, &writer);
8996         if (translate < 0)
8997             goto onError;
8998 
8999         if (translate != 0) {
9000             /* it worked => adjust input pointer */
9001             ++i;
9002             continue;
9003         }
9004 
9005         /* untranslatable character */
9006         collstart = i;
9007         collend = i+1;
9008 
9009         /* find all untranslatable characters */
9010         while (collend < size) {
9011             PyObject *x;
9012             ch = PyUnicode_READ(kind, data, collend);
9013             if (charmaptranslate_lookup(ch, mapping, &x))
9014                 goto onError;
9015             Py_XDECREF(x);
9016             if (x != Py_None)
9017                 break;
9018             ++collend;
9019         }
9020 
9021         if (ignore) {
9022             i = collend;
9023         }
9024         else {
9025             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9026                                                              reason, input, &exc,
9027                                                              collstart, collend, &newpos);
9028             if (repunicode == NULL)
9029                 goto onError;
9030             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9031                 Py_DECREF(repunicode);
9032                 goto onError;
9033             }
9034             Py_DECREF(repunicode);
9035             i = newpos;
9036         }
9037     }
9038     Py_XDECREF(exc);
9039     Py_XDECREF(errorHandler);
9040     return _PyUnicodeWriter_Finish(&writer);
9041 
9042   onError:
9043     _PyUnicodeWriter_Dealloc(&writer);
9044     Py_XDECREF(exc);
9045     Py_XDECREF(errorHandler);
9046     return NULL;
9047 }
9048 
9049 /* Deprecated. Use PyUnicode_Translate instead. */
9050 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9051 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9052                            Py_ssize_t size,
9053                            PyObject *mapping,
9054                            const char *errors)
9055 {
9056     PyObject *result;
9057     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9058     if (!unicode)
9059         return NULL;
9060     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9061     Py_DECREF(unicode);
9062     return result;
9063 }
9064 
9065 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9066 PyUnicode_Translate(PyObject *str,
9067                     PyObject *mapping,
9068                     const char *errors)
9069 {
9070     if (ensure_unicode(str) < 0)
9071         return NULL;
9072     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9073 }
9074 
9075 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9076 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9077 {
9078     if (!PyUnicode_Check(unicode)) {
9079         PyErr_BadInternalCall();
9080         return NULL;
9081     }
9082     if (PyUnicode_READY(unicode) == -1)
9083         return NULL;
9084     if (PyUnicode_IS_ASCII(unicode)) {
9085         /* If the string is already ASCII, just return the same string */
9086         Py_INCREF(unicode);
9087         return unicode;
9088     }
9089 
9090     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9091     PyObject *result = PyUnicode_New(len, 127);
9092     if (result == NULL) {
9093         return NULL;
9094     }
9095 
9096     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9097     int kind = PyUnicode_KIND(unicode);
9098     const void *data = PyUnicode_DATA(unicode);
9099     Py_ssize_t i;
9100     for (i = 0; i < len; ++i) {
9101         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9102         if (ch < 127) {
9103             out[i] = ch;
9104         }
9105         else if (Py_UNICODE_ISSPACE(ch)) {
9106             out[i] = ' ';
9107         }
9108         else {
9109             int decimal = Py_UNICODE_TODECIMAL(ch);
9110             if (decimal < 0) {
9111                 out[i] = '?';
9112                 out[i+1] = '\0';
9113                 _PyUnicode_LENGTH(result) = i + 1;
9114                 break;
9115             }
9116             out[i] = '0' + decimal;
9117         }
9118     }
9119 
9120     assert(_PyUnicode_CheckConsistency(result, 1));
9121     return result;
9122 }
9123 
9124 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9125 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9126                                   Py_ssize_t length)
9127 {
9128     PyObject *decimal;
9129     Py_ssize_t i;
9130     Py_UCS4 maxchar;
9131     enum PyUnicode_Kind kind;
9132     void *data;
9133 
9134     maxchar = 127;
9135     for (i = 0; i < length; i++) {
9136         Py_UCS4 ch = s[i];
9137         if (ch > 127) {
9138             int decimal = Py_UNICODE_TODECIMAL(ch);
9139             if (decimal >= 0)
9140                 ch = '0' + decimal;
9141             maxchar = Py_MAX(maxchar, ch);
9142         }
9143     }
9144 
9145     /* Copy to a new string */
9146     decimal = PyUnicode_New(length, maxchar);
9147     if (decimal == NULL)
9148         return decimal;
9149     kind = PyUnicode_KIND(decimal);
9150     data = PyUnicode_DATA(decimal);
9151     /* Iterate over code points */
9152     for (i = 0; i < length; i++) {
9153         Py_UCS4 ch = s[i];
9154         if (ch > 127) {
9155             int decimal = Py_UNICODE_TODECIMAL(ch);
9156             if (decimal >= 0)
9157                 ch = '0' + decimal;
9158         }
9159         PyUnicode_WRITE(kind, data, i, ch);
9160     }
9161     return unicode_result(decimal);
9162 }
9163 /* --- Decimal Encoder ---------------------------------------------------- */
9164 
9165 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9166 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9167                         Py_ssize_t length,
9168                         char *output,
9169                         const char *errors)
9170 {
9171     PyObject *unicode;
9172     Py_ssize_t i;
9173     enum PyUnicode_Kind kind;
9174     void *data;
9175 
9176     if (output == NULL) {
9177         PyErr_BadArgument();
9178         return -1;
9179     }
9180 
9181     unicode = PyUnicode_FromWideChar(s, length);
9182     if (unicode == NULL)
9183         return -1;
9184 
9185     kind = PyUnicode_KIND(unicode);
9186     data = PyUnicode_DATA(unicode);
9187 
9188     for (i=0; i < length; ) {
9189         PyObject *exc;
9190         Py_UCS4 ch;
9191         int decimal;
9192         Py_ssize_t startpos;
9193 
9194         ch = PyUnicode_READ(kind, data, i);
9195 
9196         if (Py_UNICODE_ISSPACE(ch)) {
9197             *output++ = ' ';
9198             i++;
9199             continue;
9200         }
9201         decimal = Py_UNICODE_TODECIMAL(ch);
9202         if (decimal >= 0) {
9203             *output++ = '0' + decimal;
9204             i++;
9205             continue;
9206         }
9207         if (0 < ch && ch < 256) {
9208             *output++ = (char)ch;
9209             i++;
9210             continue;
9211         }
9212 
9213         startpos = i;
9214         exc = NULL;
9215         raise_encode_exception(&exc, "decimal", unicode,
9216                                startpos, startpos+1,
9217                                "invalid decimal Unicode string");
9218         Py_XDECREF(exc);
9219         Py_DECREF(unicode);
9220         return -1;
9221     }
9222     /* 0-terminate the output string */
9223     *output++ = '\0';
9224     Py_DECREF(unicode);
9225     return 0;
9226 }
9227 
9228 /* --- Helpers ------------------------------------------------------------ */
9229 
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` from above; a negative `end` or `start` is
   taken relative to `len` and then clamped to 0.  `start` is NOT clamped
   to `len`; callers detect an empty slice by comparing end - start.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so none of them should have side effects.
   The body is wrapped in do { } while (0) so that the macro behaves like
   a single statement (e.g. as the body of an unbraced if/else); it must
   be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9244 
9245 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9246 any_find_slice(PyObject* s1, PyObject* s2,
9247                Py_ssize_t start,
9248                Py_ssize_t end,
9249                int direction)
9250 {
9251     int kind1, kind2;
9252     void *buf1, *buf2;
9253     Py_ssize_t len1, len2, result;
9254 
9255     kind1 = PyUnicode_KIND(s1);
9256     kind2 = PyUnicode_KIND(s2);
9257     if (kind1 < kind2)
9258         return -1;
9259 
9260     len1 = PyUnicode_GET_LENGTH(s1);
9261     len2 = PyUnicode_GET_LENGTH(s2);
9262     ADJUST_INDICES(start, end, len1);
9263     if (end - start < len2)
9264         return -1;
9265 
9266     buf1 = PyUnicode_DATA(s1);
9267     buf2 = PyUnicode_DATA(s2);
9268     if (len2 == 1) {
9269         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9270         result = findchar((const char *)buf1 + kind1*start,
9271                           kind1, end - start, ch, direction);
9272         if (result == -1)
9273             return -1;
9274         else
9275             return start + result;
9276     }
9277 
9278     if (kind2 != kind1) {
9279         buf2 = _PyUnicode_AsKind(s2, kind1);
9280         if (!buf2)
9281             return -2;
9282     }
9283 
9284     if (direction > 0) {
9285         switch (kind1) {
9286         case PyUnicode_1BYTE_KIND:
9287             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9288                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9289             else
9290                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9291             break;
9292         case PyUnicode_2BYTE_KIND:
9293             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9294             break;
9295         case PyUnicode_4BYTE_KIND:
9296             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9297             break;
9298         default:
9299             Py_UNREACHABLE();
9300         }
9301     }
9302     else {
9303         switch (kind1) {
9304         case PyUnicode_1BYTE_KIND:
9305             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9306                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307             else
9308                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309             break;
9310         case PyUnicode_2BYTE_KIND:
9311             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9312             break;
9313         case PyUnicode_4BYTE_KIND:
9314             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9315             break;
9316         default:
9317             Py_UNREACHABLE();
9318         }
9319     }
9320 
9321     if (kind2 != kind1)
9322         PyMem_Free(buf2);
9323 
9324     return result;
9325 }
9326 
9327 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9328 #include "stringlib/localeutil.h"
9329 
9330 /**
9331  * InsertThousandsGrouping:
9332  * @writer: Unicode writer.
9333  * @n_buffer: Number of characters in @buffer.
9334  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9335  * @d_pos: Start of digits string.
9336  * @n_digits: The number of digits in the string, in which we want
9337  *            to put the grouping chars.
9338  * @min_width: The minimum width of the digits in the output string.
9339  *             Output will be zero-padded on the left to fill.
9340  * @grouping: see definition in localeconv().
9341  * @thousands_sep: see definition in localeconv().
9342  *
9343  * There are 2 modes: counting and filling. If @writer is NULL,
9344  *  we are in counting mode, else filling mode.
9345  * If counting, the required buffer size is returned.
9346  * If filling, we know the buffer will be large enough, so we don't
9347  *  need to pass in the buffer size.
9348  * Inserts thousand grouping characters (as defined by grouping and
9349  *  thousands_sep) into @writer.
9350  *
9351  * Return value: -1 on error, number of characters otherwise.
9352  **/
9353 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9354 _PyUnicode_InsertThousandsGrouping(
9355     _PyUnicodeWriter *writer,
9356     Py_ssize_t n_buffer,
9357     PyObject *digits,
9358     Py_ssize_t d_pos,
9359     Py_ssize_t n_digits,
9360     Py_ssize_t min_width,
9361     const char *grouping,
9362     PyObject *thousands_sep,
9363     Py_UCS4 *maxchar)
9364 {
9365     min_width = Py_MAX(0, min_width);
9366     if (writer) {
9367         assert(digits != NULL);
9368         assert(maxchar == NULL);
9369     }
9370     else {
9371         assert(digits == NULL);
9372         assert(maxchar != NULL);
9373     }
9374     assert(0 <= d_pos);
9375     assert(0 <= n_digits);
9376     assert(grouping != NULL);
9377 
9378     if (digits != NULL) {
9379         if (PyUnicode_READY(digits) == -1) {
9380             return -1;
9381         }
9382     }
9383     if (PyUnicode_READY(thousands_sep) == -1) {
9384         return -1;
9385     }
9386 
9387     Py_ssize_t count = 0;
9388     Py_ssize_t n_zeros;
9389     int loop_broken = 0;
9390     int use_separator = 0; /* First time through, don't append the
9391                               separator. They only go between
9392                               groups. */
9393     Py_ssize_t buffer_pos;
9394     Py_ssize_t digits_pos;
9395     Py_ssize_t len;
9396     Py_ssize_t n_chars;
9397     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9398                                         be looked at */
9399     /* A generator that returns all of the grouping widths, until it
9400        returns 0. */
9401     GroupGenerator groupgen;
9402     GroupGenerator_init(&groupgen, grouping);
9403     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9404 
9405     /* if digits are not grouped, thousands separator
9406        should be an empty string */
9407     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9408 
9409     digits_pos = d_pos + n_digits;
9410     if (writer) {
9411         buffer_pos = writer->pos + n_buffer;
9412         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9413         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9414     }
9415     else {
9416         buffer_pos = n_buffer;
9417     }
9418 
9419     if (!writer) {
9420         *maxchar = 127;
9421     }
9422 
9423     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9424         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9425         n_zeros = Py_MAX(0, len - remaining);
9426         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9427 
9428         /* Use n_zero zero's and n_chars chars */
9429 
9430         /* Count only, don't do anything. */
9431         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9432 
9433         /* Copy into the writer. */
9434         InsertThousandsGrouping_fill(writer, &buffer_pos,
9435                                      digits, &digits_pos,
9436                                      n_chars, n_zeros,
9437                                      use_separator ? thousands_sep : NULL,
9438                                      thousands_sep_len, maxchar);
9439 
9440         /* Use a separator next time. */
9441         use_separator = 1;
9442 
9443         remaining -= n_chars;
9444         min_width -= len;
9445 
9446         if (remaining <= 0 && min_width <= 0) {
9447             loop_broken = 1;
9448             break;
9449         }
9450         min_width -= thousands_sep_len;
9451     }
9452     if (!loop_broken) {
9453         /* We left the loop without using a break statement. */
9454 
9455         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9456         n_zeros = Py_MAX(0, len - remaining);
9457         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9458 
9459         /* Use n_zero zero's and n_chars chars */
9460         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9461 
9462         /* Copy into the writer. */
9463         InsertThousandsGrouping_fill(writer, &buffer_pos,
9464                                      digits, &digits_pos,
9465                                      n_chars, n_zeros,
9466                                      use_separator ? thousands_sep : NULL,
9467                                      thousands_sep_len, maxchar);
9468     }
9469     return count;
9470 }
9471 
9472 
9473 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9474 PyUnicode_Count(PyObject *str,
9475                 PyObject *substr,
9476                 Py_ssize_t start,
9477                 Py_ssize_t end)
9478 {
9479     Py_ssize_t result;
9480     int kind1, kind2;
9481     void *buf1 = NULL, *buf2 = NULL;
9482     Py_ssize_t len1, len2;
9483 
9484     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9485         return -1;
9486 
9487     kind1 = PyUnicode_KIND(str);
9488     kind2 = PyUnicode_KIND(substr);
9489     if (kind1 < kind2)
9490         return 0;
9491 
9492     len1 = PyUnicode_GET_LENGTH(str);
9493     len2 = PyUnicode_GET_LENGTH(substr);
9494     ADJUST_INDICES(start, end, len1);
9495     if (end - start < len2)
9496         return 0;
9497 
9498     buf1 = PyUnicode_DATA(str);
9499     buf2 = PyUnicode_DATA(substr);
9500     if (kind2 != kind1) {
9501         buf2 = _PyUnicode_AsKind(substr, kind1);
9502         if (!buf2)
9503             goto onError;
9504     }
9505 
9506     switch (kind1) {
9507     case PyUnicode_1BYTE_KIND:
9508         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9509             result = asciilib_count(
9510                 ((Py_UCS1*)buf1) + start, end - start,
9511                 buf2, len2, PY_SSIZE_T_MAX
9512                 );
9513         else
9514             result = ucs1lib_count(
9515                 ((Py_UCS1*)buf1) + start, end - start,
9516                 buf2, len2, PY_SSIZE_T_MAX
9517                 );
9518         break;
9519     case PyUnicode_2BYTE_KIND:
9520         result = ucs2lib_count(
9521             ((Py_UCS2*)buf1) + start, end - start,
9522             buf2, len2, PY_SSIZE_T_MAX
9523             );
9524         break;
9525     case PyUnicode_4BYTE_KIND:
9526         result = ucs4lib_count(
9527             ((Py_UCS4*)buf1) + start, end - start,
9528             buf2, len2, PY_SSIZE_T_MAX
9529             );
9530         break;
9531     default:
9532         Py_UNREACHABLE();
9533     }
9534 
9535     if (kind2 != kind1)
9536         PyMem_Free(buf2);
9537 
9538     return result;
9539   onError:
9540     if (kind2 != kind1 && buf2)
9541         PyMem_Free(buf2);
9542     return -1;
9543 }
9544 
9545 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9546 PyUnicode_Find(PyObject *str,
9547                PyObject *substr,
9548                Py_ssize_t start,
9549                Py_ssize_t end,
9550                int direction)
9551 {
9552     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9553         return -2;
9554 
9555     return any_find_slice(str, substr, start, end, direction);
9556 }
9557 
9558 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9559 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9560                    Py_ssize_t start, Py_ssize_t end,
9561                    int direction)
9562 {
9563     int kind;
9564     Py_ssize_t len, result;
9565     if (PyUnicode_READY(str) == -1)
9566         return -2;
9567     len = PyUnicode_GET_LENGTH(str);
9568     ADJUST_INDICES(start, end, len);
9569     if (end - start < 1)
9570         return -1;
9571     kind = PyUnicode_KIND(str);
9572     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9573                       kind, end-start, ch, direction);
9574     if (result == -1)
9575         return -1;
9576     else
9577         return start + result;
9578 }
9579 
9580 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9581 tailmatch(PyObject *self,
9582           PyObject *substring,
9583           Py_ssize_t start,
9584           Py_ssize_t end,
9585           int direction)
9586 {
9587     int kind_self;
9588     int kind_sub;
9589     void *data_self;
9590     void *data_sub;
9591     Py_ssize_t offset;
9592     Py_ssize_t i;
9593     Py_ssize_t end_sub;
9594 
9595     if (PyUnicode_READY(self) == -1 ||
9596         PyUnicode_READY(substring) == -1)
9597         return -1;
9598 
9599     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9600     end -= PyUnicode_GET_LENGTH(substring);
9601     if (end < start)
9602         return 0;
9603 
9604     if (PyUnicode_GET_LENGTH(substring) == 0)
9605         return 1;
9606 
9607     kind_self = PyUnicode_KIND(self);
9608     data_self = PyUnicode_DATA(self);
9609     kind_sub = PyUnicode_KIND(substring);
9610     data_sub = PyUnicode_DATA(substring);
9611     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9612 
9613     if (direction > 0)
9614         offset = end;
9615     else
9616         offset = start;
9617 
9618     if (PyUnicode_READ(kind_self, data_self, offset) ==
9619         PyUnicode_READ(kind_sub, data_sub, 0) &&
9620         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9621         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9622         /* If both are of the same kind, memcmp is sufficient */
9623         if (kind_self == kind_sub) {
9624             return ! memcmp((char *)data_self +
9625                                 (offset * PyUnicode_KIND(substring)),
9626                             data_sub,
9627                             PyUnicode_GET_LENGTH(substring) *
9628                                 PyUnicode_KIND(substring));
9629         }
9630         /* otherwise we have to compare each character by first accessing it */
9631         else {
9632             /* We do not need to compare 0 and len(substring)-1 because
9633                the if statement above ensured already that they are equal
9634                when we end up here. */
9635             for (i = 1; i < end_sub; ++i) {
9636                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9637                     PyUnicode_READ(kind_sub, data_sub, i))
9638                     return 0;
9639             }
9640             return 1;
9641         }
9642     }
9643 
9644     return 0;
9645 }
9646 
9647 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9648 PyUnicode_Tailmatch(PyObject *str,
9649                     PyObject *substr,
9650                     Py_ssize_t start,
9651                     Py_ssize_t end,
9652                     int direction)
9653 {
9654     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9655         return -1;
9656 
9657     return tailmatch(str, substr, start, end, direction);
9658 }
9659 
9660 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9661 ascii_upper_or_lower(PyObject *self, int lower)
9662 {
9663     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9664     char *resdata, *data = PyUnicode_DATA(self);
9665     PyObject *res;
9666 
9667     res = PyUnicode_New(len, 127);
9668     if (res == NULL)
9669         return NULL;
9670     resdata = PyUnicode_DATA(res);
9671     if (lower)
9672         _Py_bytes_lower(resdata, data, len);
9673     else
9674         _Py_bytes_upper(resdata, data, len);
9675     return res;
9676 }
9677 
9678 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9679 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9680 {
9681     Py_ssize_t j;
9682     int final_sigma;
9683     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9684     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9685 
9686      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9687 
9688     where ! is a negation and \p{xxx} is a character with property xxx.
9689     */
9690     for (j = i - 1; j >= 0; j--) {
9691         c = PyUnicode_READ(kind, data, j);
9692         if (!_PyUnicode_IsCaseIgnorable(c))
9693             break;
9694     }
9695     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9696     if (final_sigma) {
9697         for (j = i + 1; j < length; j++) {
9698             c = PyUnicode_READ(kind, data, j);
9699             if (!_PyUnicode_IsCaseIgnorable(c))
9700                 break;
9701         }
9702         final_sigma = j == length || !_PyUnicode_IsCased(c);
9703     }
9704     return (final_sigma) ? 0x3C2 : 0x3C3;
9705 }
9706 
9707 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9708 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9709            Py_UCS4 c, Py_UCS4 *mapped)
9710 {
9711     /* Obscure special case. */
9712     if (c == 0x3A3) {
9713         mapped[0] = handle_capital_sigma(kind, data, length, i);
9714         return 1;
9715     }
9716     return _PyUnicode_ToLowerFull(c, mapped);
9717 }
9718 
9719 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9720 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9721 {
9722     Py_ssize_t i, k = 0;
9723     int n_res, j;
9724     Py_UCS4 c, mapped[3];
9725 
9726     c = PyUnicode_READ(kind, data, 0);
9727     n_res = _PyUnicode_ToUpperFull(c, mapped);
9728     for (j = 0; j < n_res; j++) {
9729         *maxchar = Py_MAX(*maxchar, mapped[j]);
9730         res[k++] = mapped[j];
9731     }
9732     for (i = 1; i < length; i++) {
9733         c = PyUnicode_READ(kind, data, i);
9734         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9735         for (j = 0; j < n_res; j++) {
9736             *maxchar = Py_MAX(*maxchar, mapped[j]);
9737             res[k++] = mapped[j];
9738         }
9739     }
9740     return k;
9741 }
9742 
9743 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9744 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9745     Py_ssize_t i, k = 0;
9746 
9747     for (i = 0; i < length; i++) {
9748         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9749         int n_res, j;
9750         if (Py_UNICODE_ISUPPER(c)) {
9751             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9752         }
9753         else if (Py_UNICODE_ISLOWER(c)) {
9754             n_res = _PyUnicode_ToUpperFull(c, mapped);
9755         }
9756         else {
9757             n_res = 1;
9758             mapped[0] = c;
9759         }
9760         for (j = 0; j < n_res; j++) {
9761             *maxchar = Py_MAX(*maxchar, mapped[j]);
9762             res[k++] = mapped[j];
9763         }
9764     }
9765     return k;
9766 }
9767 
9768 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9769 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9770                   Py_UCS4 *maxchar, int lower)
9771 {
9772     Py_ssize_t i, k = 0;
9773 
9774     for (i = 0; i < length; i++) {
9775         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776         int n_res, j;
9777         if (lower)
9778             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779         else
9780             n_res = _PyUnicode_ToUpperFull(c, mapped);
9781         for (j = 0; j < n_res; j++) {
9782             *maxchar = Py_MAX(*maxchar, mapped[j]);
9783             res[k++] = mapped[j];
9784         }
9785     }
9786     return k;
9787 }
9788 
9789 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9790 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791 {
9792     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9793 }
9794 
9795 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9796 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797 {
9798     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9799 }
9800 
9801 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9802 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9803 {
9804     Py_ssize_t i, k = 0;
9805 
9806     for (i = 0; i < length; i++) {
9807         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808         Py_UCS4 mapped[3];
9809         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9810         for (j = 0; j < n_res; j++) {
9811             *maxchar = Py_MAX(*maxchar, mapped[j]);
9812             res[k++] = mapped[j];
9813         }
9814     }
9815     return k;
9816 }
9817 
9818 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9819 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820 {
9821     Py_ssize_t i, k = 0;
9822     int previous_is_cased;
9823 
9824     previous_is_cased = 0;
9825     for (i = 0; i < length; i++) {
9826         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827         Py_UCS4 mapped[3];
9828         int n_res, j;
9829 
9830         if (previous_is_cased)
9831             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9832         else
9833             n_res = _PyUnicode_ToTitleFull(c, mapped);
9834 
9835         for (j = 0; j < n_res; j++) {
9836             *maxchar = Py_MAX(*maxchar, mapped[j]);
9837             res[k++] = mapped[j];
9838         }
9839 
9840         previous_is_cased = _PyUnicode_IsCased(c);
9841     }
9842     return k;
9843 }
9844 
9845 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9846 case_operation(PyObject *self,
9847                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9848 {
9849     PyObject *res = NULL;
9850     Py_ssize_t length, newlength = 0;
9851     int kind, outkind;
9852     void *data, *outdata;
9853     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9854 
9855     assert(PyUnicode_IS_READY(self));
9856 
9857     kind = PyUnicode_KIND(self);
9858     data = PyUnicode_DATA(self);
9859     length = PyUnicode_GET_LENGTH(self);
9860     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9861         PyErr_SetString(PyExc_OverflowError, "string is too long");
9862         return NULL;
9863     }
9864     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9865     if (tmp == NULL)
9866         return PyErr_NoMemory();
9867     newlength = perform(kind, data, length, tmp, &maxchar);
9868     res = PyUnicode_New(newlength, maxchar);
9869     if (res == NULL)
9870         goto leave;
9871     tmpend = tmp + newlength;
9872     outdata = PyUnicode_DATA(res);
9873     outkind = PyUnicode_KIND(res);
9874     switch (outkind) {
9875     case PyUnicode_1BYTE_KIND:
9876         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9877         break;
9878     case PyUnicode_2BYTE_KIND:
9879         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9880         break;
9881     case PyUnicode_4BYTE_KIND:
9882         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9883         break;
9884     default:
9885         Py_UNREACHABLE();
9886     }
9887   leave:
9888     PyMem_FREE(tmp);
9889     return res;
9890 }
9891 
9892 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9893 PyUnicode_Join(PyObject *separator, PyObject *seq)
9894 {
9895     PyObject *res;
9896     PyObject *fseq;
9897     Py_ssize_t seqlen;
9898     PyObject **items;
9899 
9900     fseq = PySequence_Fast(seq, "can only join an iterable");
9901     if (fseq == NULL) {
9902         return NULL;
9903     }
9904 
9905     /* NOTE: the following code can't call back into Python code,
9906      * so we are sure that fseq won't be mutated.
9907      */
9908 
9909     items = PySequence_Fast_ITEMS(fseq);
9910     seqlen = PySequence_Fast_GET_SIZE(fseq);
9911     res = _PyUnicode_JoinArray(separator, items, seqlen);
9912     Py_DECREF(fseq);
9913     return res;
9914 }
9915 
9916 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9917 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9918 {
9919     PyObject *res = NULL; /* the result */
9920     PyObject *sep = NULL;
9921     Py_ssize_t seplen;
9922     PyObject *item;
9923     Py_ssize_t sz, i, res_offset;
9924     Py_UCS4 maxchar;
9925     Py_UCS4 item_maxchar;
9926     int use_memcpy;
9927     unsigned char *res_data = NULL, *sep_data = NULL;
9928     PyObject *last_obj;
9929     unsigned int kind = 0;
9930 
9931     /* If empty sequence, return u"". */
9932     if (seqlen == 0) {
9933         _Py_RETURN_UNICODE_EMPTY();
9934     }
9935 
9936     /* If singleton sequence with an exact Unicode, return that. */
9937     last_obj = NULL;
9938     if (seqlen == 1) {
9939         if (PyUnicode_CheckExact(items[0])) {
9940             res = items[0];
9941             Py_INCREF(res);
9942             return res;
9943         }
9944         seplen = 0;
9945         maxchar = 0;
9946     }
9947     else {
9948         /* Set up sep and seplen */
9949         if (separator == NULL) {
9950             /* fall back to a blank space separator */
9951             sep = PyUnicode_FromOrdinal(' ');
9952             if (!sep)
9953                 goto onError;
9954             seplen = 1;
9955             maxchar = 32;
9956         }
9957         else {
9958             if (!PyUnicode_Check(separator)) {
9959                 PyErr_Format(PyExc_TypeError,
9960                              "separator: expected str instance,"
9961                              " %.80s found",
9962                              Py_TYPE(separator)->tp_name);
9963                 goto onError;
9964             }
9965             if (PyUnicode_READY(separator))
9966                 goto onError;
9967             sep = separator;
9968             seplen = PyUnicode_GET_LENGTH(separator);
9969             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9970             /* inc refcount to keep this code path symmetric with the
9971                above case of a blank separator */
9972             Py_INCREF(sep);
9973         }
9974         last_obj = sep;
9975     }
9976 
9977     /* There are at least two things to join, or else we have a subclass
9978      * of str in the sequence.
9979      * Do a pre-pass to figure out the total amount of space we'll
9980      * need (sz), and see whether all argument are strings.
9981      */
9982     sz = 0;
9983 #ifdef Py_DEBUG
9984     use_memcpy = 0;
9985 #else
9986     use_memcpy = 1;
9987 #endif
9988     for (i = 0; i < seqlen; i++) {
9989         size_t add_sz;
9990         item = items[i];
9991         if (!PyUnicode_Check(item)) {
9992             PyErr_Format(PyExc_TypeError,
9993                          "sequence item %zd: expected str instance,"
9994                          " %.80s found",
9995                          i, Py_TYPE(item)->tp_name);
9996             goto onError;
9997         }
9998         if (PyUnicode_READY(item) == -1)
9999             goto onError;
10000         add_sz = PyUnicode_GET_LENGTH(item);
10001         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10002         maxchar = Py_MAX(maxchar, item_maxchar);
10003         if (i != 0) {
10004             add_sz += seplen;
10005         }
10006         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10007             PyErr_SetString(PyExc_OverflowError,
10008                             "join() result is too long for a Python string");
10009             goto onError;
10010         }
10011         sz += add_sz;
10012         if (use_memcpy && last_obj != NULL) {
10013             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10014                 use_memcpy = 0;
10015         }
10016         last_obj = item;
10017     }
10018 
10019     res = PyUnicode_New(sz, maxchar);
10020     if (res == NULL)
10021         goto onError;
10022 
10023     /* Catenate everything. */
10024 #ifdef Py_DEBUG
10025     use_memcpy = 0;
10026 #else
10027     if (use_memcpy) {
10028         res_data = PyUnicode_1BYTE_DATA(res);
10029         kind = PyUnicode_KIND(res);
10030         if (seplen != 0)
10031             sep_data = PyUnicode_1BYTE_DATA(sep);
10032     }
10033 #endif
10034     if (use_memcpy) {
10035         for (i = 0; i < seqlen; ++i) {
10036             Py_ssize_t itemlen;
10037             item = items[i];
10038 
10039             /* Copy item, and maybe the separator. */
10040             if (i && seplen != 0) {
10041                 memcpy(res_data,
10042                           sep_data,
10043                           kind * seplen);
10044                 res_data += kind * seplen;
10045             }
10046 
10047             itemlen = PyUnicode_GET_LENGTH(item);
10048             if (itemlen != 0) {
10049                 memcpy(res_data,
10050                           PyUnicode_DATA(item),
10051                           kind * itemlen);
10052                 res_data += kind * itemlen;
10053             }
10054         }
10055         assert(res_data == PyUnicode_1BYTE_DATA(res)
10056                            + kind * PyUnicode_GET_LENGTH(res));
10057     }
10058     else {
10059         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10060             Py_ssize_t itemlen;
10061             item = items[i];
10062 
10063             /* Copy item, and maybe the separator. */
10064             if (i && seplen != 0) {
10065                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10066                 res_offset += seplen;
10067             }
10068 
10069             itemlen = PyUnicode_GET_LENGTH(item);
10070             if (itemlen != 0) {
10071                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10072                 res_offset += itemlen;
10073             }
10074         }
10075         assert(res_offset == PyUnicode_GET_LENGTH(res));
10076     }
10077 
10078     Py_XDECREF(sep);
10079     assert(_PyUnicode_CheckConsistency(res, 1));
10080     return res;
10081 
10082   onError:
10083     Py_XDECREF(sep);
10084     Py_XDECREF(res);
10085     return NULL;
10086 }
10087 
10088 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10089 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10090                     Py_UCS4 fill_char)
10091 {
10092     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10093     void *data = PyUnicode_DATA(unicode);
10094     assert(PyUnicode_IS_READY(unicode));
10095     assert(unicode_modifiable(unicode));
10096     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10097     assert(start >= 0);
10098     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10099     FILL(kind, data, fill_char, start, length);
10100 }
10101 
10102 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10103 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10104                Py_UCS4 fill_char)
10105 {
10106     Py_ssize_t maxlen;
10107 
10108     if (!PyUnicode_Check(unicode)) {
10109         PyErr_BadInternalCall();
10110         return -1;
10111     }
10112     if (PyUnicode_READY(unicode) == -1)
10113         return -1;
10114     if (unicode_check_modifiable(unicode))
10115         return -1;
10116 
10117     if (start < 0) {
10118         PyErr_SetString(PyExc_IndexError, "string index out of range");
10119         return -1;
10120     }
10121     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10122         PyErr_SetString(PyExc_ValueError,
10123                          "fill character is bigger than "
10124                          "the string maximum character");
10125         return -1;
10126     }
10127 
10128     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10129     length = Py_MIN(maxlen, length);
10130     if (length <= 0)
10131         return 0;
10132 
10133     _PyUnicode_FastFill(unicode, start, length, fill_char);
10134     return length;
10135 }
10136 
10137 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10138 pad(PyObject *self,
10139     Py_ssize_t left,
10140     Py_ssize_t right,
10141     Py_UCS4 fill)
10142 {
10143     PyObject *u;
10144     Py_UCS4 maxchar;
10145     int kind;
10146     void *data;
10147 
10148     if (left < 0)
10149         left = 0;
10150     if (right < 0)
10151         right = 0;
10152 
10153     if (left == 0 && right == 0)
10154         return unicode_result_unchanged(self);
10155 
10156     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10157         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10158         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10159         return NULL;
10160     }
10161     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10162     maxchar = Py_MAX(maxchar, fill);
10163     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10164     if (!u)
10165         return NULL;
10166 
10167     kind = PyUnicode_KIND(u);
10168     data = PyUnicode_DATA(u);
10169     if (left)
10170         FILL(kind, data, fill, 0, left);
10171     if (right)
10172         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10173     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10174     assert(_PyUnicode_CheckConsistency(u, 1));
10175     return u;
10176 }
10177 
10178 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10179 PyUnicode_Splitlines(PyObject *string, int keepends)
10180 {
10181     PyObject *list;
10182 
10183     if (ensure_unicode(string) < 0)
10184         return NULL;
10185 
10186     switch (PyUnicode_KIND(string)) {
10187     case PyUnicode_1BYTE_KIND:
10188         if (PyUnicode_IS_ASCII(string))
10189             list = asciilib_splitlines(
10190                 string, PyUnicode_1BYTE_DATA(string),
10191                 PyUnicode_GET_LENGTH(string), keepends);
10192         else
10193             list = ucs1lib_splitlines(
10194                 string, PyUnicode_1BYTE_DATA(string),
10195                 PyUnicode_GET_LENGTH(string), keepends);
10196         break;
10197     case PyUnicode_2BYTE_KIND:
10198         list = ucs2lib_splitlines(
10199             string, PyUnicode_2BYTE_DATA(string),
10200             PyUnicode_GET_LENGTH(string), keepends);
10201         break;
10202     case PyUnicode_4BYTE_KIND:
10203         list = ucs4lib_splitlines(
10204             string, PyUnicode_4BYTE_DATA(string),
10205             PyUnicode_GET_LENGTH(string), keepends);
10206         break;
10207     default:
10208         Py_UNREACHABLE();
10209     }
10210     return list;
10211 }
10212 
10213 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10214 split(PyObject *self,
10215       PyObject *substring,
10216       Py_ssize_t maxcount)
10217 {
10218     int kind1, kind2;
10219     void *buf1, *buf2;
10220     Py_ssize_t len1, len2;
10221     PyObject* out;
10222 
10223     if (maxcount < 0)
10224         maxcount = PY_SSIZE_T_MAX;
10225 
10226     if (PyUnicode_READY(self) == -1)
10227         return NULL;
10228 
10229     if (substring == NULL)
10230         switch (PyUnicode_KIND(self)) {
10231         case PyUnicode_1BYTE_KIND:
10232             if (PyUnicode_IS_ASCII(self))
10233                 return asciilib_split_whitespace(
10234                     self,  PyUnicode_1BYTE_DATA(self),
10235                     PyUnicode_GET_LENGTH(self), maxcount
10236                     );
10237             else
10238                 return ucs1lib_split_whitespace(
10239                     self,  PyUnicode_1BYTE_DATA(self),
10240                     PyUnicode_GET_LENGTH(self), maxcount
10241                     );
10242         case PyUnicode_2BYTE_KIND:
10243             return ucs2lib_split_whitespace(
10244                 self,  PyUnicode_2BYTE_DATA(self),
10245                 PyUnicode_GET_LENGTH(self), maxcount
10246                 );
10247         case PyUnicode_4BYTE_KIND:
10248             return ucs4lib_split_whitespace(
10249                 self,  PyUnicode_4BYTE_DATA(self),
10250                 PyUnicode_GET_LENGTH(self), maxcount
10251                 );
10252         default:
10253             Py_UNREACHABLE();
10254         }
10255 
10256     if (PyUnicode_READY(substring) == -1)
10257         return NULL;
10258 
10259     kind1 = PyUnicode_KIND(self);
10260     kind2 = PyUnicode_KIND(substring);
10261     len1 = PyUnicode_GET_LENGTH(self);
10262     len2 = PyUnicode_GET_LENGTH(substring);
10263     if (kind1 < kind2 || len1 < len2) {
10264         out = PyList_New(1);
10265         if (out == NULL)
10266             return NULL;
10267         Py_INCREF(self);
10268         PyList_SET_ITEM(out, 0, self);
10269         return out;
10270     }
10271     buf1 = PyUnicode_DATA(self);
10272     buf2 = PyUnicode_DATA(substring);
10273     if (kind2 != kind1) {
10274         buf2 = _PyUnicode_AsKind(substring, kind1);
10275         if (!buf2)
10276             return NULL;
10277     }
10278 
10279     switch (kind1) {
10280     case PyUnicode_1BYTE_KIND:
10281         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10282             out = asciilib_split(
10283                 self,  buf1, len1, buf2, len2, maxcount);
10284         else
10285             out = ucs1lib_split(
10286                 self,  buf1, len1, buf2, len2, maxcount);
10287         break;
10288     case PyUnicode_2BYTE_KIND:
10289         out = ucs2lib_split(
10290             self,  buf1, len1, buf2, len2, maxcount);
10291         break;
10292     case PyUnicode_4BYTE_KIND:
10293         out = ucs4lib_split(
10294             self,  buf1, len1, buf2, len2, maxcount);
10295         break;
10296     default:
10297         out = NULL;
10298     }
10299     if (kind2 != kind1)
10300         PyMem_Free(buf2);
10301     return out;
10302 }
10303 
10304 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10305 rsplit(PyObject *self,
10306        PyObject *substring,
10307        Py_ssize_t maxcount)
10308 {
10309     int kind1, kind2;
10310     void *buf1, *buf2;
10311     Py_ssize_t len1, len2;
10312     PyObject* out;
10313 
10314     if (maxcount < 0)
10315         maxcount = PY_SSIZE_T_MAX;
10316 
10317     if (PyUnicode_READY(self) == -1)
10318         return NULL;
10319 
10320     if (substring == NULL)
10321         switch (PyUnicode_KIND(self)) {
10322         case PyUnicode_1BYTE_KIND:
10323             if (PyUnicode_IS_ASCII(self))
10324                 return asciilib_rsplit_whitespace(
10325                     self,  PyUnicode_1BYTE_DATA(self),
10326                     PyUnicode_GET_LENGTH(self), maxcount
10327                     );
10328             else
10329                 return ucs1lib_rsplit_whitespace(
10330                     self,  PyUnicode_1BYTE_DATA(self),
10331                     PyUnicode_GET_LENGTH(self), maxcount
10332                     );
10333         case PyUnicode_2BYTE_KIND:
10334             return ucs2lib_rsplit_whitespace(
10335                 self,  PyUnicode_2BYTE_DATA(self),
10336                 PyUnicode_GET_LENGTH(self), maxcount
10337                 );
10338         case PyUnicode_4BYTE_KIND:
10339             return ucs4lib_rsplit_whitespace(
10340                 self,  PyUnicode_4BYTE_DATA(self),
10341                 PyUnicode_GET_LENGTH(self), maxcount
10342                 );
10343         default:
10344             Py_UNREACHABLE();
10345         }
10346 
10347     if (PyUnicode_READY(substring) == -1)
10348         return NULL;
10349 
10350     kind1 = PyUnicode_KIND(self);
10351     kind2 = PyUnicode_KIND(substring);
10352     len1 = PyUnicode_GET_LENGTH(self);
10353     len2 = PyUnicode_GET_LENGTH(substring);
10354     if (kind1 < kind2 || len1 < len2) {
10355         out = PyList_New(1);
10356         if (out == NULL)
10357             return NULL;
10358         Py_INCREF(self);
10359         PyList_SET_ITEM(out, 0, self);
10360         return out;
10361     }
10362     buf1 = PyUnicode_DATA(self);
10363     buf2 = PyUnicode_DATA(substring);
10364     if (kind2 != kind1) {
10365         buf2 = _PyUnicode_AsKind(substring, kind1);
10366         if (!buf2)
10367             return NULL;
10368     }
10369 
10370     switch (kind1) {
10371     case PyUnicode_1BYTE_KIND:
10372         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10373             out = asciilib_rsplit(
10374                 self,  buf1, len1, buf2, len2, maxcount);
10375         else
10376             out = ucs1lib_rsplit(
10377                 self,  buf1, len1, buf2, len2, maxcount);
10378         break;
10379     case PyUnicode_2BYTE_KIND:
10380         out = ucs2lib_rsplit(
10381             self,  buf1, len1, buf2, len2, maxcount);
10382         break;
10383     case PyUnicode_4BYTE_KIND:
10384         out = ucs4lib_rsplit(
10385             self,  buf1, len1, buf2, len2, maxcount);
10386         break;
10387     default:
10388         out = NULL;
10389     }
10390     if (kind2 != kind1)
10391         PyMem_Free(buf2);
10392     return out;
10393 }
10394 
10395 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10396 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10397             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10398 {
10399     switch (kind) {
10400     case PyUnicode_1BYTE_KIND:
10401         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10402             return asciilib_find(buf1, len1, buf2, len2, offset);
10403         else
10404             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10405     case PyUnicode_2BYTE_KIND:
10406         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10407     case PyUnicode_4BYTE_KIND:
10408         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10409     }
10410     Py_UNREACHABLE();
10411 }
10412 
10413 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10414 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10415              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10416 {
10417     switch (kind) {
10418     case PyUnicode_1BYTE_KIND:
10419         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10420             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10421         else
10422             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10423     case PyUnicode_2BYTE_KIND:
10424         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10425     case PyUnicode_4BYTE_KIND:
10426         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10427     }
10428     Py_UNREACHABLE();
10429 }
10430 
10431 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10432 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10433                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10434 {
10435     int kind = PyUnicode_KIND(u);
10436     void *data = PyUnicode_DATA(u);
10437     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10438     if (kind == PyUnicode_1BYTE_KIND) {
10439         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10440                                       (Py_UCS1 *)data + len,
10441                                       u1, u2, maxcount);
10442     }
10443     else if (kind == PyUnicode_2BYTE_KIND) {
10444         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10445                                       (Py_UCS2 *)data + len,
10446                                       u1, u2, maxcount);
10447     }
10448     else {
10449         assert(kind == PyUnicode_4BYTE_KIND);
10450         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10451                                       (Py_UCS4 *)data + len,
10452                                       u1, u2, maxcount);
10453     }
10454 }
10455 
10456 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10457 replace(PyObject *self, PyObject *str1,
10458         PyObject *str2, Py_ssize_t maxcount)
10459 {
10460     PyObject *u;
10461     char *sbuf = PyUnicode_DATA(self);
10462     char *buf1 = PyUnicode_DATA(str1);
10463     char *buf2 = PyUnicode_DATA(str2);
10464     int srelease = 0, release1 = 0, release2 = 0;
10465     int skind = PyUnicode_KIND(self);
10466     int kind1 = PyUnicode_KIND(str1);
10467     int kind2 = PyUnicode_KIND(str2);
10468     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10469     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10470     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10471     int mayshrink;
10472     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10473 
10474     if (maxcount < 0)
10475         maxcount = PY_SSIZE_T_MAX;
10476     else if (maxcount == 0 || slen == 0)
10477         goto nothing;
10478 
10479     if (str1 == str2)
10480         goto nothing;
10481 
10482     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10483     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10484     if (maxchar < maxchar_str1)
10485         /* substring too wide to be present */
10486         goto nothing;
10487     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10488     /* Replacing str1 with str2 may cause a maxchar reduction in the
10489        result string. */
10490     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10491     maxchar = Py_MAX(maxchar, maxchar_str2);
10492 
10493     if (len1 == len2) {
10494         /* same length */
10495         if (len1 == 0)
10496             goto nothing;
10497         if (len1 == 1) {
10498             /* replace characters */
10499             Py_UCS4 u1, u2;
10500             Py_ssize_t pos;
10501 
10502             u1 = PyUnicode_READ(kind1, buf1, 0);
10503             pos = findchar(sbuf, skind, slen, u1, 1);
10504             if (pos < 0)
10505                 goto nothing;
10506             u2 = PyUnicode_READ(kind2, buf2, 0);
10507             u = PyUnicode_New(slen, maxchar);
10508             if (!u)
10509                 goto error;
10510 
10511             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10512             replace_1char_inplace(u, pos, u1, u2, maxcount);
10513         }
10514         else {
10515             int rkind = skind;
10516             char *res;
10517             Py_ssize_t i;
10518 
10519             if (kind1 < rkind) {
10520                 /* widen substring */
10521                 buf1 = _PyUnicode_AsKind(str1, rkind);
10522                 if (!buf1) goto error;
10523                 release1 = 1;
10524             }
10525             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10526             if (i < 0)
10527                 goto nothing;
10528             if (rkind > kind2) {
10529                 /* widen replacement */
10530                 buf2 = _PyUnicode_AsKind(str2, rkind);
10531                 if (!buf2) goto error;
10532                 release2 = 1;
10533             }
10534             else if (rkind < kind2) {
10535                 /* widen self and buf1 */
10536                 rkind = kind2;
10537                 if (release1) PyMem_Free(buf1);
10538                 release1 = 0;
10539                 sbuf = _PyUnicode_AsKind(self, rkind);
10540                 if (!sbuf) goto error;
10541                 srelease = 1;
10542                 buf1 = _PyUnicode_AsKind(str1, rkind);
10543                 if (!buf1) goto error;
10544                 release1 = 1;
10545             }
10546             u = PyUnicode_New(slen, maxchar);
10547             if (!u)
10548                 goto error;
10549             assert(PyUnicode_KIND(u) == rkind);
10550             res = PyUnicode_DATA(u);
10551 
10552             memcpy(res, sbuf, rkind * slen);
10553             /* change everything in-place, starting with this one */
10554             memcpy(res + rkind * i,
10555                    buf2,
10556                    rkind * len2);
10557             i += len1;
10558 
10559             while ( --maxcount > 0) {
10560                 i = anylib_find(rkind, self,
10561                                 sbuf+rkind*i, slen-i,
10562                                 str1, buf1, len1, i);
10563                 if (i == -1)
10564                     break;
10565                 memcpy(res + rkind * i,
10566                        buf2,
10567                        rkind * len2);
10568                 i += len1;
10569             }
10570         }
10571     }
10572     else {
10573         Py_ssize_t n, i, j, ires;
10574         Py_ssize_t new_size;
10575         int rkind = skind;
10576         char *res;
10577 
10578         if (kind1 < rkind) {
10579             /* widen substring */
10580             buf1 = _PyUnicode_AsKind(str1, rkind);
10581             if (!buf1) goto error;
10582             release1 = 1;
10583         }
10584         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10585         if (n == 0)
10586             goto nothing;
10587         if (kind2 < rkind) {
10588             /* widen replacement */
10589             buf2 = _PyUnicode_AsKind(str2, rkind);
10590             if (!buf2) goto error;
10591             release2 = 1;
10592         }
10593         else if (kind2 > rkind) {
10594             /* widen self and buf1 */
10595             rkind = kind2;
10596             sbuf = _PyUnicode_AsKind(self, rkind);
10597             if (!sbuf) goto error;
10598             srelease = 1;
10599             if (release1) PyMem_Free(buf1);
10600             release1 = 0;
10601             buf1 = _PyUnicode_AsKind(str1, rkind);
10602             if (!buf1) goto error;
10603             release1 = 1;
10604         }
10605         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10606            PyUnicode_GET_LENGTH(str1))); */
10607         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10608                 PyErr_SetString(PyExc_OverflowError,
10609                                 "replace string is too long");
10610                 goto error;
10611         }
10612         new_size = slen + n * (len2 - len1);
10613         if (new_size == 0) {
10614             _Py_INCREF_UNICODE_EMPTY();
10615             if (!unicode_empty)
10616                 goto error;
10617             u = unicode_empty;
10618             goto done;
10619         }
10620         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10621             PyErr_SetString(PyExc_OverflowError,
10622                             "replace string is too long");
10623             goto error;
10624         }
10625         u = PyUnicode_New(new_size, maxchar);
10626         if (!u)
10627             goto error;
10628         assert(PyUnicode_KIND(u) == rkind);
10629         res = PyUnicode_DATA(u);
10630         ires = i = 0;
10631         if (len1 > 0) {
10632             while (n-- > 0) {
10633                 /* look for next match */
10634                 j = anylib_find(rkind, self,
10635                                 sbuf + rkind * i, slen-i,
10636                                 str1, buf1, len1, i);
10637                 if (j == -1)
10638                     break;
10639                 else if (j > i) {
10640                     /* copy unchanged part [i:j] */
10641                     memcpy(res + rkind * ires,
10642                            sbuf + rkind * i,
10643                            rkind * (j-i));
10644                     ires += j - i;
10645                 }
10646                 /* copy substitution string */
10647                 if (len2 > 0) {
10648                     memcpy(res + rkind * ires,
10649                            buf2,
10650                            rkind * len2);
10651                     ires += len2;
10652                 }
10653                 i = j + len1;
10654             }
10655             if (i < slen)
10656                 /* copy tail [i:] */
10657                 memcpy(res + rkind * ires,
10658                        sbuf + rkind * i,
10659                        rkind * (slen-i));
10660         }
10661         else {
10662             /* interleave */
10663             while (n > 0) {
10664                 memcpy(res + rkind * ires,
10665                        buf2,
10666                        rkind * len2);
10667                 ires += len2;
10668                 if (--n <= 0)
10669                     break;
10670                 memcpy(res + rkind * ires,
10671                        sbuf + rkind * i,
10672                        rkind);
10673                 ires++;
10674                 i++;
10675             }
10676             memcpy(res + rkind * ires,
10677                    sbuf + rkind * i,
10678                    rkind * (slen-i));
10679         }
10680     }
10681 
10682     if (mayshrink) {
10683         unicode_adjust_maxchar(&u);
10684         if (u == NULL)
10685             goto error;
10686     }
10687 
10688   done:
10689     if (srelease)
10690         PyMem_FREE(sbuf);
10691     if (release1)
10692         PyMem_FREE(buf1);
10693     if (release2)
10694         PyMem_FREE(buf2);
10695     assert(_PyUnicode_CheckConsistency(u, 1));
10696     return u;
10697 
10698   nothing:
10699     /* nothing to replace; return original string (when possible) */
10700     if (srelease)
10701         PyMem_FREE(sbuf);
10702     if (release1)
10703         PyMem_FREE(buf1);
10704     if (release2)
10705         PyMem_FREE(buf2);
10706     return unicode_result_unchanged(self);
10707 
10708   error:
10709     if (srelease && sbuf)
10710         PyMem_FREE(sbuf);
10711     if (release1 && buf1)
10712         PyMem_FREE(buf1);
10713     if (release2 && buf2)
10714         PyMem_FREE(buf2);
10715     return NULL;
10716 }
10717 
10718 /* --- Unicode Object Methods --------------------------------------------- */
10719 
10720 /*[clinic input]
10721 str.title as unicode_title
10722 
10723 Return a version of the string where each word is titlecased.
10724 
10725 More specifically, words start with uppercased characters and all remaining
10726 cased characters have lower case.
10727 [clinic start generated code]*/
10728 
10729 static PyObject *
unicode_title_impl(PyObject * self)10730 unicode_title_impl(PyObject *self)
10731 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10732 {
10733     if (PyUnicode_READY(self) == -1)
10734         return NULL;
10735     return case_operation(self, do_title);
10736 }
10737 
10738 /*[clinic input]
10739 str.capitalize as unicode_capitalize
10740 
10741 Return a capitalized version of the string.
10742 
10743 More specifically, make the first character have upper case and the rest lower
10744 case.
10745 [clinic start generated code]*/
10746 
10747 static PyObject *
unicode_capitalize_impl(PyObject * self)10748 unicode_capitalize_impl(PyObject *self)
10749 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10750 {
10751     if (PyUnicode_READY(self) == -1)
10752         return NULL;
10753     if (PyUnicode_GET_LENGTH(self) == 0)
10754         return unicode_result_unchanged(self);
10755     return case_operation(self, do_capitalize);
10756 }
10757 
10758 /*[clinic input]
10759 str.casefold as unicode_casefold
10760 
10761 Return a version of the string suitable for caseless comparisons.
10762 [clinic start generated code]*/
10763 
10764 static PyObject *
unicode_casefold_impl(PyObject * self)10765 unicode_casefold_impl(PyObject *self)
10766 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10767 {
10768     if (PyUnicode_READY(self) == -1)
10769         return NULL;
10770     if (PyUnicode_IS_ASCII(self))
10771         return ascii_upper_or_lower(self, 1);
10772     return case_operation(self, do_casefold);
10773 }
10774 
10775 
10776 /* Argument converter. Accepts a single Unicode character. */
10777 
10778 static int
convert_uc(PyObject * obj,void * addr)10779 convert_uc(PyObject *obj, void *addr)
10780 {
10781     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10782 
10783     if (!PyUnicode_Check(obj)) {
10784         PyErr_Format(PyExc_TypeError,
10785                      "The fill character must be a unicode character, "
10786                      "not %.100s", Py_TYPE(obj)->tp_name);
10787         return 0;
10788     }
10789     if (PyUnicode_READY(obj) < 0)
10790         return 0;
10791     if (PyUnicode_GET_LENGTH(obj) != 1) {
10792         PyErr_SetString(PyExc_TypeError,
10793                         "The fill character must be exactly one character long");
10794         return 0;
10795     }
10796     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10797     return 1;
10798 }
10799 
10800 /*[clinic input]
10801 str.center as unicode_center
10802 
10803     width: Py_ssize_t
10804     fillchar: Py_UCS4 = ' '
10805     /
10806 
10807 Return a centered string of length width.
10808 
10809 Padding is done using the specified fill character (default is a space).
10810 [clinic start generated code]*/
10811 
10812 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10813 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10814 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10815 {
10816     Py_ssize_t marg, left;
10817 
10818     if (PyUnicode_READY(self) == -1)
10819         return NULL;
10820 
10821     if (PyUnicode_GET_LENGTH(self) >= width)
10822         return unicode_result_unchanged(self);
10823 
10824     marg = width - PyUnicode_GET_LENGTH(self);
10825     left = marg / 2 + (marg & width & 1);
10826 
10827     return pad(self, left, marg - left, fillchar);
10828 }
10829 
10830 /* This function assumes that str1 and str2 are readied by the caller. */
10831 
10832 static int
unicode_compare(PyObject * str1,PyObject * str2)10833 unicode_compare(PyObject *str1, PyObject *str2)
10834 {
10835 #define COMPARE(TYPE1, TYPE2) \
10836     do { \
10837         TYPE1* p1 = (TYPE1 *)data1; \
10838         TYPE2* p2 = (TYPE2 *)data2; \
10839         TYPE1* end = p1 + len; \
10840         Py_UCS4 c1, c2; \
10841         for (; p1 != end; p1++, p2++) { \
10842             c1 = *p1; \
10843             c2 = *p2; \
10844             if (c1 != c2) \
10845                 return (c1 < c2) ? -1 : 1; \
10846         } \
10847     } \
10848     while (0)
10849 
10850     int kind1, kind2;
10851     void *data1, *data2;
10852     Py_ssize_t len1, len2, len;
10853 
10854     kind1 = PyUnicode_KIND(str1);
10855     kind2 = PyUnicode_KIND(str2);
10856     data1 = PyUnicode_DATA(str1);
10857     data2 = PyUnicode_DATA(str2);
10858     len1 = PyUnicode_GET_LENGTH(str1);
10859     len2 = PyUnicode_GET_LENGTH(str2);
10860     len = Py_MIN(len1, len2);
10861 
10862     switch(kind1) {
10863     case PyUnicode_1BYTE_KIND:
10864     {
10865         switch(kind2) {
10866         case PyUnicode_1BYTE_KIND:
10867         {
10868             int cmp = memcmp(data1, data2, len);
10869             /* normalize result of memcmp() into the range [-1; 1] */
10870             if (cmp < 0)
10871                 return -1;
10872             if (cmp > 0)
10873                 return 1;
10874             break;
10875         }
10876         case PyUnicode_2BYTE_KIND:
10877             COMPARE(Py_UCS1, Py_UCS2);
10878             break;
10879         case PyUnicode_4BYTE_KIND:
10880             COMPARE(Py_UCS1, Py_UCS4);
10881             break;
10882         default:
10883             Py_UNREACHABLE();
10884         }
10885         break;
10886     }
10887     case PyUnicode_2BYTE_KIND:
10888     {
10889         switch(kind2) {
10890         case PyUnicode_1BYTE_KIND:
10891             COMPARE(Py_UCS2, Py_UCS1);
10892             break;
10893         case PyUnicode_2BYTE_KIND:
10894         {
10895             COMPARE(Py_UCS2, Py_UCS2);
10896             break;
10897         }
10898         case PyUnicode_4BYTE_KIND:
10899             COMPARE(Py_UCS2, Py_UCS4);
10900             break;
10901         default:
10902             Py_UNREACHABLE();
10903         }
10904         break;
10905     }
10906     case PyUnicode_4BYTE_KIND:
10907     {
10908         switch(kind2) {
10909         case PyUnicode_1BYTE_KIND:
10910             COMPARE(Py_UCS4, Py_UCS1);
10911             break;
10912         case PyUnicode_2BYTE_KIND:
10913             COMPARE(Py_UCS4, Py_UCS2);
10914             break;
10915         case PyUnicode_4BYTE_KIND:
10916         {
10917 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10918             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10919             /* normalize result of wmemcmp() into the range [-1; 1] */
10920             if (cmp < 0)
10921                 return -1;
10922             if (cmp > 0)
10923                 return 1;
10924 #else
10925             COMPARE(Py_UCS4, Py_UCS4);
10926 #endif
10927             break;
10928         }
10929         default:
10930             Py_UNREACHABLE();
10931         }
10932         break;
10933     }
10934     default:
10935         Py_UNREACHABLE();
10936     }
10937 
10938     if (len1 == len2)
10939         return 0;
10940     if (len1 < len2)
10941         return -1;
10942     else
10943         return 1;
10944 
10945 #undef COMPARE
10946 }
10947 
10948 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10949 unicode_compare_eq(PyObject *str1, PyObject *str2)
10950 {
10951     int kind;
10952     void *data1, *data2;
10953     Py_ssize_t len;
10954     int cmp;
10955 
10956     len = PyUnicode_GET_LENGTH(str1);
10957     if (PyUnicode_GET_LENGTH(str2) != len)
10958         return 0;
10959     kind = PyUnicode_KIND(str1);
10960     if (PyUnicode_KIND(str2) != kind)
10961         return 0;
10962     data1 = PyUnicode_DATA(str1);
10963     data2 = PyUnicode_DATA(str2);
10964 
10965     cmp = memcmp(data1, data2, len * kind);
10966     return (cmp == 0);
10967 }
10968 
10969 
10970 int
PyUnicode_Compare(PyObject * left,PyObject * right)10971 PyUnicode_Compare(PyObject *left, PyObject *right)
10972 {
10973     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10974         if (PyUnicode_READY(left) == -1 ||
10975             PyUnicode_READY(right) == -1)
10976             return -1;
10977 
10978         /* a string is equal to itself */
10979         if (left == right)
10980             return 0;
10981 
10982         return unicode_compare(left, right);
10983     }
10984     PyErr_Format(PyExc_TypeError,
10985                  "Can't compare %.100s and %.100s",
10986                  left->ob_type->tp_name,
10987                  right->ob_type->tp_name);
10988     return -1;
10989 }
10990 
10991 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)10992 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10993 {
10994     Py_ssize_t i;
10995     int kind;
10996     Py_UCS4 chr;
10997     const unsigned char *ustr = (const unsigned char *)str;
10998 
10999     assert(_PyUnicode_CHECK(uni));
11000     if (!PyUnicode_IS_READY(uni)) {
11001         const wchar_t *ws = _PyUnicode_WSTR(uni);
11002         /* Compare Unicode string and source character set string */
11003         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11004             if (chr != ustr[i])
11005                 return (chr < ustr[i]) ? -1 : 1;
11006         }
11007         /* This check keeps Python strings that end in '\0' from comparing equal
11008          to C strings identical up to that point. */
11009         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11010             return 1; /* uni is longer */
11011         if (ustr[i])
11012             return -1; /* str is longer */
11013         return 0;
11014     }
11015     kind = PyUnicode_KIND(uni);
11016     if (kind == PyUnicode_1BYTE_KIND) {
11017         const void *data = PyUnicode_1BYTE_DATA(uni);
11018         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11019         size_t len, len2 = strlen(str);
11020         int cmp;
11021 
11022         len = Py_MIN(len1, len2);
11023         cmp = memcmp(data, str, len);
11024         if (cmp != 0) {
11025             if (cmp < 0)
11026                 return -1;
11027             else
11028                 return 1;
11029         }
11030         if (len1 > len2)
11031             return 1; /* uni is longer */
11032         if (len1 < len2)
11033             return -1; /* str is longer */
11034         return 0;
11035     }
11036     else {
11037         void *data = PyUnicode_DATA(uni);
11038         /* Compare Unicode string and source character set string */
11039         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11040             if (chr != (unsigned char)str[i])
11041                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11042         /* This check keeps Python strings that end in '\0' from comparing equal
11043          to C strings identical up to that point. */
11044         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11045             return 1; /* uni is longer */
11046         if (str[i])
11047             return -1; /* str is longer */
11048         return 0;
11049     }
11050 }
11051 
11052 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11053 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11054 {
11055     size_t i, len;
11056     const wchar_t *p;
11057     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11058     if (strlen(str) != len)
11059         return 0;
11060     p = _PyUnicode_WSTR(unicode);
11061     assert(p);
11062     for (i = 0; i < len; i++) {
11063         unsigned char c = (unsigned char)str[i];
11064         if (c >= 128 || p[i] != (wchar_t)c)
11065             return 0;
11066     }
11067     return 1;
11068 }
11069 
11070 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11071 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11072 {
11073     size_t len;
11074     assert(_PyUnicode_CHECK(unicode));
11075     assert(str);
11076 #ifndef NDEBUG
11077     for (const char *p = str; *p; p++) {
11078         assert((unsigned char)*p < 128);
11079     }
11080 #endif
11081     if (PyUnicode_READY(unicode) == -1) {
11082         /* Memory error or bad data */
11083         PyErr_Clear();
11084         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11085     }
11086     if (!PyUnicode_IS_ASCII(unicode))
11087         return 0;
11088     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11089     return strlen(str) == len &&
11090            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11091 }
11092 
11093 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11094 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11095 {
11096     PyObject *right_uni;
11097     Py_hash_t hash;
11098 
11099     assert(_PyUnicode_CHECK(left));
11100     assert(right->string);
11101 #ifndef NDEBUG
11102     for (const char *p = right->string; *p; p++) {
11103         assert((unsigned char)*p < 128);
11104     }
11105 #endif
11106 
11107     if (PyUnicode_READY(left) == -1) {
11108         /* memory error or bad data */
11109         PyErr_Clear();
11110         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11111     }
11112 
11113     if (!PyUnicode_IS_ASCII(left))
11114         return 0;
11115 
11116     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11117     if (right_uni == NULL) {
11118         /* memory error or bad data */
11119         PyErr_Clear();
11120         return _PyUnicode_EqualToASCIIString(left, right->string);
11121     }
11122 
11123     if (left == right_uni)
11124         return 1;
11125 
11126     if (PyUnicode_CHECK_INTERNED(left))
11127         return 0;
11128 
11129     assert(_PyUnicode_HASH(right_uni) != -1);
11130     hash = _PyUnicode_HASH(left);
11131     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11132         return 0;
11133 
11134     return unicode_compare_eq(left, right_uni);
11135 }
11136 
11137 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11138 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11139 {
11140     int result;
11141 
11142     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11143         Py_RETURN_NOTIMPLEMENTED;
11144 
11145     if (PyUnicode_READY(left) == -1 ||
11146         PyUnicode_READY(right) == -1)
11147         return NULL;
11148 
11149     if (left == right) {
11150         switch (op) {
11151         case Py_EQ:
11152         case Py_LE:
11153         case Py_GE:
11154             /* a string is equal to itself */
11155             Py_RETURN_TRUE;
11156         case Py_NE:
11157         case Py_LT:
11158         case Py_GT:
11159             Py_RETURN_FALSE;
11160         default:
11161             PyErr_BadArgument();
11162             return NULL;
11163         }
11164     }
11165     else if (op == Py_EQ || op == Py_NE) {
11166         result = unicode_compare_eq(left, right);
11167         result ^= (op == Py_NE);
11168         return PyBool_FromLong(result);
11169     }
11170     else {
11171         result = unicode_compare(left, right);
11172         Py_RETURN_RICHCOMPARE(result, 0, op);
11173     }
11174 }
11175 
11176 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11177 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11178 {
11179     return unicode_eq(aa, bb);
11180 }
11181 
11182 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11183 PyUnicode_Contains(PyObject *str, PyObject *substr)
11184 {
11185     int kind1, kind2;
11186     void *buf1, *buf2;
11187     Py_ssize_t len1, len2;
11188     int result;
11189 
11190     if (!PyUnicode_Check(substr)) {
11191         PyErr_Format(PyExc_TypeError,
11192                      "'in <string>' requires string as left operand, not %.100s",
11193                      Py_TYPE(substr)->tp_name);
11194         return -1;
11195     }
11196     if (PyUnicode_READY(substr) == -1)
11197         return -1;
11198     if (ensure_unicode(str) < 0)
11199         return -1;
11200 
11201     kind1 = PyUnicode_KIND(str);
11202     kind2 = PyUnicode_KIND(substr);
11203     if (kind1 < kind2)
11204         return 0;
11205     len1 = PyUnicode_GET_LENGTH(str);
11206     len2 = PyUnicode_GET_LENGTH(substr);
11207     if (len1 < len2)
11208         return 0;
11209     buf1 = PyUnicode_DATA(str);
11210     buf2 = PyUnicode_DATA(substr);
11211     if (len2 == 1) {
11212         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11213         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11214         return result;
11215     }
11216     if (kind2 != kind1) {
11217         buf2 = _PyUnicode_AsKind(substr, kind1);
11218         if (!buf2)
11219             return -1;
11220     }
11221 
11222     switch (kind1) {
11223     case PyUnicode_1BYTE_KIND:
11224         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11225         break;
11226     case PyUnicode_2BYTE_KIND:
11227         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11228         break;
11229     case PyUnicode_4BYTE_KIND:
11230         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11231         break;
11232     default:
11233         Py_UNREACHABLE();
11234     }
11235 
11236     if (kind2 != kind1)
11237         PyMem_Free(buf2);
11238 
11239     return result;
11240 }
11241 
11242 /* Concat to string or Unicode object giving a new Unicode object. */
11243 
11244 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11245 PyUnicode_Concat(PyObject *left, PyObject *right)
11246 {
11247     PyObject *result;
11248     Py_UCS4 maxchar, maxchar2;
11249     Py_ssize_t left_len, right_len, new_len;
11250 
11251     if (ensure_unicode(left) < 0)
11252         return NULL;
11253 
11254     if (!PyUnicode_Check(right)) {
11255         PyErr_Format(PyExc_TypeError,
11256                      "can only concatenate str (not \"%.200s\") to str",
11257                      right->ob_type->tp_name);
11258         return NULL;
11259     }
11260     if (PyUnicode_READY(right) < 0)
11261         return NULL;
11262 
11263     /* Shortcuts */
11264     if (left == unicode_empty)
11265         return PyUnicode_FromObject(right);
11266     if (right == unicode_empty)
11267         return PyUnicode_FromObject(left);
11268 
11269     left_len = PyUnicode_GET_LENGTH(left);
11270     right_len = PyUnicode_GET_LENGTH(right);
11271     if (left_len > PY_SSIZE_T_MAX - right_len) {
11272         PyErr_SetString(PyExc_OverflowError,
11273                         "strings are too large to concat");
11274         return NULL;
11275     }
11276     new_len = left_len + right_len;
11277 
11278     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11279     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11280     maxchar = Py_MAX(maxchar, maxchar2);
11281 
11282     /* Concat the two Unicode strings */
11283     result = PyUnicode_New(new_len, maxchar);
11284     if (result == NULL)
11285         return NULL;
11286     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11287     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11288     assert(_PyUnicode_CheckConsistency(result, 1));
11289     return result;
11290 }
11291 
11292 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11293 PyUnicode_Append(PyObject **p_left, PyObject *right)
11294 {
11295     PyObject *left, *res;
11296     Py_UCS4 maxchar, maxchar2;
11297     Py_ssize_t left_len, right_len, new_len;
11298 
11299     if (p_left == NULL) {
11300         if (!PyErr_Occurred())
11301             PyErr_BadInternalCall();
11302         return;
11303     }
11304     left = *p_left;
11305     if (right == NULL || left == NULL
11306         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11307         if (!PyErr_Occurred())
11308             PyErr_BadInternalCall();
11309         goto error;
11310     }
11311 
11312     if (PyUnicode_READY(left) == -1)
11313         goto error;
11314     if (PyUnicode_READY(right) == -1)
11315         goto error;
11316 
11317     /* Shortcuts */
11318     if (left == unicode_empty) {
11319         Py_DECREF(left);
11320         Py_INCREF(right);
11321         *p_left = right;
11322         return;
11323     }
11324     if (right == unicode_empty)
11325         return;
11326 
11327     left_len = PyUnicode_GET_LENGTH(left);
11328     right_len = PyUnicode_GET_LENGTH(right);
11329     if (left_len > PY_SSIZE_T_MAX - right_len) {
11330         PyErr_SetString(PyExc_OverflowError,
11331                         "strings are too large to concat");
11332         goto error;
11333     }
11334     new_len = left_len + right_len;
11335 
11336     if (unicode_modifiable(left)
11337         && PyUnicode_CheckExact(right)
11338         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11339         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11340            to change the structure size, but characters are stored just after
11341            the structure, and so it requires to move all characters which is
11342            not so different than duplicating the string. */
11343         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11344     {
11345         /* append inplace */
11346         if (unicode_resize(p_left, new_len) != 0)
11347             goto error;
11348 
11349         /* copy 'right' into the newly allocated area of 'left' */
11350         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11351     }
11352     else {
11353         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11354         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11355         maxchar = Py_MAX(maxchar, maxchar2);
11356 
11357         /* Concat the two Unicode strings */
11358         res = PyUnicode_New(new_len, maxchar);
11359         if (res == NULL)
11360             goto error;
11361         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11362         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11363         Py_DECREF(left);
11364         *p_left = res;
11365     }
11366     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11367     return;
11368 
11369 error:
11370     Py_CLEAR(*p_left);
11371 }
11372 
11373 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11374 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11375 {
11376     PyUnicode_Append(pleft, right);
11377     Py_XDECREF(right);
11378 }
11379 
11380 /*
11381 Wraps stringlib_parse_args_finds() and additionally ensures that the
11382 first argument is a unicode object.
11383 */
11384 
11385 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11386 parse_args_finds_unicode(const char * function_name, PyObject *args,
11387                          PyObject **substring,
11388                          Py_ssize_t *start, Py_ssize_t *end)
11389 {
11390     if(stringlib_parse_args_finds(function_name, args, substring,
11391                                   start, end)) {
11392         if (ensure_unicode(*substring) < 0)
11393             return 0;
11394         return 1;
11395     }
11396     return 0;
11397 }
11398 
11399 PyDoc_STRVAR(count__doc__,
11400              "S.count(sub[, start[, end]]) -> int\n\
11401 \n\
11402 Return the number of non-overlapping occurrences of substring sub in\n\
11403 string S[start:end].  Optional arguments start and end are\n\
11404 interpreted as in slice notation.");
11405 
11406 static PyObject *
unicode_count(PyObject * self,PyObject * args)11407 unicode_count(PyObject *self, PyObject *args)
11408 {
11409     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11410     Py_ssize_t start = 0;
11411     Py_ssize_t end = PY_SSIZE_T_MAX;
11412     PyObject *result;
11413     int kind1, kind2;
11414     void *buf1, *buf2;
11415     Py_ssize_t len1, len2, iresult;
11416 
11417     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11418         return NULL;
11419 
11420     kind1 = PyUnicode_KIND(self);
11421     kind2 = PyUnicode_KIND(substring);
11422     if (kind1 < kind2)
11423         return PyLong_FromLong(0);
11424 
11425     len1 = PyUnicode_GET_LENGTH(self);
11426     len2 = PyUnicode_GET_LENGTH(substring);
11427     ADJUST_INDICES(start, end, len1);
11428     if (end - start < len2)
11429         return PyLong_FromLong(0);
11430 
11431     buf1 = PyUnicode_DATA(self);
11432     buf2 = PyUnicode_DATA(substring);
11433     if (kind2 != kind1) {
11434         buf2 = _PyUnicode_AsKind(substring, kind1);
11435         if (!buf2)
11436             return NULL;
11437     }
11438     switch (kind1) {
11439     case PyUnicode_1BYTE_KIND:
11440         iresult = ucs1lib_count(
11441             ((Py_UCS1*)buf1) + start, end - start,
11442             buf2, len2, PY_SSIZE_T_MAX
11443             );
11444         break;
11445     case PyUnicode_2BYTE_KIND:
11446         iresult = ucs2lib_count(
11447             ((Py_UCS2*)buf1) + start, end - start,
11448             buf2, len2, PY_SSIZE_T_MAX
11449             );
11450         break;
11451     case PyUnicode_4BYTE_KIND:
11452         iresult = ucs4lib_count(
11453             ((Py_UCS4*)buf1) + start, end - start,
11454             buf2, len2, PY_SSIZE_T_MAX
11455             );
11456         break;
11457     default:
11458         Py_UNREACHABLE();
11459     }
11460 
11461     result = PyLong_FromSsize_t(iresult);
11462 
11463     if (kind2 != kind1)
11464         PyMem_Free(buf2);
11465 
11466     return result;
11467 }
11468 
11469 /*[clinic input]
11470 str.encode as unicode_encode
11471 
11472     encoding: str(c_default="NULL") = 'utf-8'
11473         The encoding in which to encode the string.
11474     errors: str(c_default="NULL") = 'strict'
11475         The error handling scheme to use for encoding errors.
11476         The default is 'strict' meaning that encoding errors raise a
11477         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11478         'xmlcharrefreplace' as well as any other name registered with
11479         codecs.register_error that can handle UnicodeEncodeErrors.
11480 
11481 Encode the string using the codec registered for encoding.
11482 [clinic start generated code]*/
11483 
11484 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11485 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11486 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11487 {
11488     return PyUnicode_AsEncodedString(self, encoding, errors);
11489 }
11490 
11491 /*[clinic input]
11492 str.expandtabs as unicode_expandtabs
11493 
11494     tabsize: int = 8
11495 
11496 Return a copy where all tab characters are expanded using spaces.
11497 
11498 If tabsize is not given, a tab size of 8 characters is assumed.
11499 [clinic start generated code]*/
11500 
11501 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11502 unicode_expandtabs_impl(PyObject *self, int tabsize)
11503 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11504 {
11505     Py_ssize_t i, j, line_pos, src_len, incr;
11506     Py_UCS4 ch;
11507     PyObject *u;
11508     void *src_data, *dest_data;
11509     int kind;
11510     int found;
11511 
11512     if (PyUnicode_READY(self) == -1)
11513         return NULL;
11514 
11515     /* First pass: determine size of output string */
11516     src_len = PyUnicode_GET_LENGTH(self);
11517     i = j = line_pos = 0;
11518     kind = PyUnicode_KIND(self);
11519     src_data = PyUnicode_DATA(self);
11520     found = 0;
11521     for (; i < src_len; i++) {
11522         ch = PyUnicode_READ(kind, src_data, i);
11523         if (ch == '\t') {
11524             found = 1;
11525             if (tabsize > 0) {
11526                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11527                 if (j > PY_SSIZE_T_MAX - incr)
11528                     goto overflow;
11529                 line_pos += incr;
11530                 j += incr;
11531             }
11532         }
11533         else {
11534             if (j > PY_SSIZE_T_MAX - 1)
11535                 goto overflow;
11536             line_pos++;
11537             j++;
11538             if (ch == '\n' || ch == '\r')
11539                 line_pos = 0;
11540         }
11541     }
11542     if (!found)
11543         return unicode_result_unchanged(self);
11544 
11545     /* Second pass: create output string and fill it */
11546     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11547     if (!u)
11548         return NULL;
11549     dest_data = PyUnicode_DATA(u);
11550 
11551     i = j = line_pos = 0;
11552 
11553     for (; i < src_len; i++) {
11554         ch = PyUnicode_READ(kind, src_data, i);
11555         if (ch == '\t') {
11556             if (tabsize > 0) {
11557                 incr = tabsize - (line_pos % tabsize);
11558                 line_pos += incr;
11559                 FILL(kind, dest_data, ' ', j, incr);
11560                 j += incr;
11561             }
11562         }
11563         else {
11564             line_pos++;
11565             PyUnicode_WRITE(kind, dest_data, j, ch);
11566             j++;
11567             if (ch == '\n' || ch == '\r')
11568                 line_pos = 0;
11569         }
11570     }
11571     assert (j == PyUnicode_GET_LENGTH(u));
11572     return unicode_result(u);
11573 
11574   overflow:
11575     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11576     return NULL;
11577 }
11578 
11579 PyDoc_STRVAR(find__doc__,
11580              "S.find(sub[, start[, end]]) -> int\n\
11581 \n\
11582 Return the lowest index in S where substring sub is found,\n\
11583 such that sub is contained within S[start:end].  Optional\n\
11584 arguments start and end are interpreted as in slice notation.\n\
11585 \n\
11586 Return -1 on failure.");
11587 
11588 static PyObject *
unicode_find(PyObject * self,PyObject * args)11589 unicode_find(PyObject *self, PyObject *args)
11590 {
11591     /* initialize variables to prevent gcc warning */
11592     PyObject *substring = NULL;
11593     Py_ssize_t start = 0;
11594     Py_ssize_t end = 0;
11595     Py_ssize_t result;
11596 
11597     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11598         return NULL;
11599 
11600     if (PyUnicode_READY(self) == -1)
11601         return NULL;
11602 
11603     result = any_find_slice(self, substring, start, end, 1);
11604 
11605     if (result == -2)
11606         return NULL;
11607 
11608     return PyLong_FromSsize_t(result);
11609 }
11610 
11611 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11612 unicode_getitem(PyObject *self, Py_ssize_t index)
11613 {
11614     void *data;
11615     enum PyUnicode_Kind kind;
11616     Py_UCS4 ch;
11617 
11618     if (!PyUnicode_Check(self)) {
11619         PyErr_BadArgument();
11620         return NULL;
11621     }
11622     if (PyUnicode_READY(self) == -1) {
11623         return NULL;
11624     }
11625     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11626         PyErr_SetString(PyExc_IndexError, "string index out of range");
11627         return NULL;
11628     }
11629     kind = PyUnicode_KIND(self);
11630     data = PyUnicode_DATA(self);
11631     ch = PyUnicode_READ(kind, data, index);
11632     return unicode_char(ch);
11633 }
11634 
11635 /* Believe it or not, this produces the same value for ASCII strings
11636    as bytes_hash(). */
11637 static Py_hash_t
unicode_hash(PyObject * self)11638 unicode_hash(PyObject *self)
11639 {
11640     Py_ssize_t len;
11641     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11642 
11643 #ifdef Py_DEBUG
11644     assert(_Py_HashSecret_Initialized);
11645 #endif
11646     if (_PyUnicode_HASH(self) != -1)
11647         return _PyUnicode_HASH(self);
11648     if (PyUnicode_READY(self) == -1)
11649         return -1;
11650     len = PyUnicode_GET_LENGTH(self);
11651     /*
11652       We make the hash of the empty string be 0, rather than using
11653       (prefix ^ suffix), since this slightly obfuscates the hash secret
11654     */
11655     if (len == 0) {
11656         _PyUnicode_HASH(self) = 0;
11657         return 0;
11658     }
11659     x = _Py_HashBytes(PyUnicode_DATA(self),
11660                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11661     _PyUnicode_HASH(self) = x;
11662     return x;
11663 }
11664 
11665 PyDoc_STRVAR(index__doc__,
11666              "S.index(sub[, start[, end]]) -> int\n\
11667 \n\
11668 Return the lowest index in S where substring sub is found, \n\
11669 such that sub is contained within S[start:end].  Optional\n\
11670 arguments start and end are interpreted as in slice notation.\n\
11671 \n\
11672 Raises ValueError when the substring is not found.");
11673 
11674 static PyObject *
unicode_index(PyObject * self,PyObject * args)11675 unicode_index(PyObject *self, PyObject *args)
11676 {
11677     /* initialize variables to prevent gcc warning */
11678     Py_ssize_t result;
11679     PyObject *substring = NULL;
11680     Py_ssize_t start = 0;
11681     Py_ssize_t end = 0;
11682 
11683     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11684         return NULL;
11685 
11686     if (PyUnicode_READY(self) == -1)
11687         return NULL;
11688 
11689     result = any_find_slice(self, substring, start, end, 1);
11690 
11691     if (result == -2)
11692         return NULL;
11693 
11694     if (result < 0) {
11695         PyErr_SetString(PyExc_ValueError, "substring not found");
11696         return NULL;
11697     }
11698 
11699     return PyLong_FromSsize_t(result);
11700 }
11701 
11702 /*[clinic input]
11703 str.isascii as unicode_isascii
11704 
11705 Return True if all characters in the string are ASCII, False otherwise.
11706 
11707 ASCII characters have code points in the range U+0000-U+007F.
11708 Empty string is ASCII too.
11709 [clinic start generated code]*/
11710 
11711 static PyObject *
unicode_isascii_impl(PyObject * self)11712 unicode_isascii_impl(PyObject *self)
11713 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11714 {
11715     if (PyUnicode_READY(self) == -1) {
11716         return NULL;
11717     }
11718     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11719 }
11720 
11721 /*[clinic input]
11722 str.islower as unicode_islower
11723 
11724 Return True if the string is a lowercase string, False otherwise.
11725 
11726 A string is lowercase if all cased characters in the string are lowercase and
11727 there is at least one cased character in the string.
11728 [clinic start generated code]*/
11729 
11730 static PyObject *
unicode_islower_impl(PyObject * self)11731 unicode_islower_impl(PyObject *self)
11732 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11733 {
11734     Py_ssize_t i, length;
11735     int kind;
11736     void *data;
11737     int cased;
11738 
11739     if (PyUnicode_READY(self) == -1)
11740         return NULL;
11741     length = PyUnicode_GET_LENGTH(self);
11742     kind = PyUnicode_KIND(self);
11743     data = PyUnicode_DATA(self);
11744 
11745     /* Shortcut for single character strings */
11746     if (length == 1)
11747         return PyBool_FromLong(
11748             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11749 
11750     /* Special case for empty strings */
11751     if (length == 0)
11752         Py_RETURN_FALSE;
11753 
11754     cased = 0;
11755     for (i = 0; i < length; i++) {
11756         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11757 
11758         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11759             Py_RETURN_FALSE;
11760         else if (!cased && Py_UNICODE_ISLOWER(ch))
11761             cased = 1;
11762     }
11763     return PyBool_FromLong(cased);
11764 }
11765 
11766 /*[clinic input]
11767 str.isupper as unicode_isupper
11768 
11769 Return True if the string is an uppercase string, False otherwise.
11770 
11771 A string is uppercase if all cased characters in the string are uppercase and
11772 there is at least one cased character in the string.
11773 [clinic start generated code]*/
11774 
11775 static PyObject *
unicode_isupper_impl(PyObject * self)11776 unicode_isupper_impl(PyObject *self)
11777 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11778 {
11779     Py_ssize_t i, length;
11780     int kind;
11781     void *data;
11782     int cased;
11783 
11784     if (PyUnicode_READY(self) == -1)
11785         return NULL;
11786     length = PyUnicode_GET_LENGTH(self);
11787     kind = PyUnicode_KIND(self);
11788     data = PyUnicode_DATA(self);
11789 
11790     /* Shortcut for single character strings */
11791     if (length == 1)
11792         return PyBool_FromLong(
11793             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11794 
11795     /* Special case for empty strings */
11796     if (length == 0)
11797         Py_RETURN_FALSE;
11798 
11799     cased = 0;
11800     for (i = 0; i < length; i++) {
11801         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11802 
11803         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11804             Py_RETURN_FALSE;
11805         else if (!cased && Py_UNICODE_ISUPPER(ch))
11806             cased = 1;
11807     }
11808     return PyBool_FromLong(cased);
11809 }
11810 
11811 /*[clinic input]
11812 str.istitle as unicode_istitle
11813 
11814 Return True if the string is a title-cased string, False otherwise.
11815 
11816 In a title-cased string, upper- and title-case characters may only
11817 follow uncased characters and lowercase characters only cased ones.
11818 [clinic start generated code]*/
11819 
11820 static PyObject *
unicode_istitle_impl(PyObject * self)11821 unicode_istitle_impl(PyObject *self)
11822 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11823 {
11824     Py_ssize_t i, length;
11825     int kind;
11826     void *data;
11827     int cased, previous_is_cased;
11828 
11829     if (PyUnicode_READY(self) == -1)
11830         return NULL;
11831     length = PyUnicode_GET_LENGTH(self);
11832     kind = PyUnicode_KIND(self);
11833     data = PyUnicode_DATA(self);
11834 
11835     /* Shortcut for single character strings */
11836     if (length == 1) {
11837         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11838         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11839                                (Py_UNICODE_ISUPPER(ch) != 0));
11840     }
11841 
11842     /* Special case for empty strings */
11843     if (length == 0)
11844         Py_RETURN_FALSE;
11845 
11846     cased = 0;
11847     previous_is_cased = 0;
11848     for (i = 0; i < length; i++) {
11849         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11850 
11851         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11852             if (previous_is_cased)
11853                 Py_RETURN_FALSE;
11854             previous_is_cased = 1;
11855             cased = 1;
11856         }
11857         else if (Py_UNICODE_ISLOWER(ch)) {
11858             if (!previous_is_cased)
11859                 Py_RETURN_FALSE;
11860             previous_is_cased = 1;
11861             cased = 1;
11862         }
11863         else
11864             previous_is_cased = 0;
11865     }
11866     return PyBool_FromLong(cased);
11867 }
11868 
11869 /*[clinic input]
11870 str.isspace as unicode_isspace
11871 
11872 Return True if the string is a whitespace string, False otherwise.
11873 
11874 A string is whitespace if all characters in the string are whitespace and there
11875 is at least one character in the string.
11876 [clinic start generated code]*/
11877 
11878 static PyObject *
unicode_isspace_impl(PyObject * self)11879 unicode_isspace_impl(PyObject *self)
11880 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11881 {
11882     Py_ssize_t i, length;
11883     int kind;
11884     void *data;
11885 
11886     if (PyUnicode_READY(self) == -1)
11887         return NULL;
11888     length = PyUnicode_GET_LENGTH(self);
11889     kind = PyUnicode_KIND(self);
11890     data = PyUnicode_DATA(self);
11891 
11892     /* Shortcut for single character strings */
11893     if (length == 1)
11894         return PyBool_FromLong(
11895             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11896 
11897     /* Special case for empty strings */
11898     if (length == 0)
11899         Py_RETURN_FALSE;
11900 
11901     for (i = 0; i < length; i++) {
11902         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11903         if (!Py_UNICODE_ISSPACE(ch))
11904             Py_RETURN_FALSE;
11905     }
11906     Py_RETURN_TRUE;
11907 }
11908 
11909 /*[clinic input]
11910 str.isalpha as unicode_isalpha
11911 
11912 Return True if the string is an alphabetic string, False otherwise.
11913 
11914 A string is alphabetic if all characters in the string are alphabetic and there
11915 is at least one character in the string.
11916 [clinic start generated code]*/
11917 
11918 static PyObject *
unicode_isalpha_impl(PyObject * self)11919 unicode_isalpha_impl(PyObject *self)
11920 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11921 {
11922     Py_ssize_t i, length;
11923     int kind;
11924     void *data;
11925 
11926     if (PyUnicode_READY(self) == -1)
11927         return NULL;
11928     length = PyUnicode_GET_LENGTH(self);
11929     kind = PyUnicode_KIND(self);
11930     data = PyUnicode_DATA(self);
11931 
11932     /* Shortcut for single character strings */
11933     if (length == 1)
11934         return PyBool_FromLong(
11935             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11936 
11937     /* Special case for empty strings */
11938     if (length == 0)
11939         Py_RETURN_FALSE;
11940 
11941     for (i = 0; i < length; i++) {
11942         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11943             Py_RETURN_FALSE;
11944     }
11945     Py_RETURN_TRUE;
11946 }
11947 
11948 /*[clinic input]
11949 str.isalnum as unicode_isalnum
11950 
11951 Return True if the string is an alpha-numeric string, False otherwise.
11952 
11953 A string is alpha-numeric if all characters in the string are alpha-numeric and
11954 there is at least one character in the string.
11955 [clinic start generated code]*/
11956 
11957 static PyObject *
unicode_isalnum_impl(PyObject * self)11958 unicode_isalnum_impl(PyObject *self)
11959 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11960 {
11961     int kind;
11962     void *data;
11963     Py_ssize_t len, i;
11964 
11965     if (PyUnicode_READY(self) == -1)
11966         return NULL;
11967 
11968     kind = PyUnicode_KIND(self);
11969     data = PyUnicode_DATA(self);
11970     len = PyUnicode_GET_LENGTH(self);
11971 
11972     /* Shortcut for single character strings */
11973     if (len == 1) {
11974         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11975         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11976     }
11977 
11978     /* Special case for empty strings */
11979     if (len == 0)
11980         Py_RETURN_FALSE;
11981 
11982     for (i = 0; i < len; i++) {
11983         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11984         if (!Py_UNICODE_ISALNUM(ch))
11985             Py_RETURN_FALSE;
11986     }
11987     Py_RETURN_TRUE;
11988 }
11989 
11990 /*[clinic input]
11991 str.isdecimal as unicode_isdecimal
11992 
11993 Return True if the string is a decimal string, False otherwise.
11994 
11995 A string is a decimal string if all characters in the string are decimal and
11996 there is at least one character in the string.
11997 [clinic start generated code]*/
11998 
11999 static PyObject *
unicode_isdecimal_impl(PyObject * self)12000 unicode_isdecimal_impl(PyObject *self)
12001 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12002 {
12003     Py_ssize_t i, length;
12004     int kind;
12005     void *data;
12006 
12007     if (PyUnicode_READY(self) == -1)
12008         return NULL;
12009     length = PyUnicode_GET_LENGTH(self);
12010     kind = PyUnicode_KIND(self);
12011     data = PyUnicode_DATA(self);
12012 
12013     /* Shortcut for single character strings */
12014     if (length == 1)
12015         return PyBool_FromLong(
12016             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12017 
12018     /* Special case for empty strings */
12019     if (length == 0)
12020         Py_RETURN_FALSE;
12021 
12022     for (i = 0; i < length; i++) {
12023         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12024             Py_RETURN_FALSE;
12025     }
12026     Py_RETURN_TRUE;
12027 }
12028 
12029 /*[clinic input]
12030 str.isdigit as unicode_isdigit
12031 
12032 Return True if the string is a digit string, False otherwise.
12033 
12034 A string is a digit string if all characters in the string are digits and there
12035 is at least one character in the string.
12036 [clinic start generated code]*/
12037 
12038 static PyObject *
unicode_isdigit_impl(PyObject * self)12039 unicode_isdigit_impl(PyObject *self)
12040 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12041 {
12042     Py_ssize_t i, length;
12043     int kind;
12044     void *data;
12045 
12046     if (PyUnicode_READY(self) == -1)
12047         return NULL;
12048     length = PyUnicode_GET_LENGTH(self);
12049     kind = PyUnicode_KIND(self);
12050     data = PyUnicode_DATA(self);
12051 
12052     /* Shortcut for single character strings */
12053     if (length == 1) {
12054         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12055         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12056     }
12057 
12058     /* Special case for empty strings */
12059     if (length == 0)
12060         Py_RETURN_FALSE;
12061 
12062     for (i = 0; i < length; i++) {
12063         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12064             Py_RETURN_FALSE;
12065     }
12066     Py_RETURN_TRUE;
12067 }
12068 
12069 /*[clinic input]
12070 str.isnumeric as unicode_isnumeric
12071 
12072 Return True if the string is a numeric string, False otherwise.
12073 
12074 A string is numeric if all characters in the string are numeric and there is at
12075 least one character in the string.
12076 [clinic start generated code]*/
12077 
12078 static PyObject *
unicode_isnumeric_impl(PyObject * self)12079 unicode_isnumeric_impl(PyObject *self)
12080 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12081 {
12082     Py_ssize_t i, length;
12083     int kind;
12084     void *data;
12085 
12086     if (PyUnicode_READY(self) == -1)
12087         return NULL;
12088     length = PyUnicode_GET_LENGTH(self);
12089     kind = PyUnicode_KIND(self);
12090     data = PyUnicode_DATA(self);
12091 
12092     /* Shortcut for single character strings */
12093     if (length == 1)
12094         return PyBool_FromLong(
12095             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12096 
12097     /* Special case for empty strings */
12098     if (length == 0)
12099         Py_RETURN_FALSE;
12100 
12101     for (i = 0; i < length; i++) {
12102         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12103             Py_RETURN_FALSE;
12104     }
12105     Py_RETURN_TRUE;
12106 }
12107 
12108 int
PyUnicode_IsIdentifier(PyObject * self)12109 PyUnicode_IsIdentifier(PyObject *self)
12110 {
12111     int kind;
12112     void *data;
12113     Py_ssize_t i;
12114     Py_UCS4 first;
12115 
12116     if (PyUnicode_READY(self) == -1) {
12117         Py_FatalError("identifier not ready");
12118         return 0;
12119     }
12120 
12121     /* Special case for empty strings */
12122     if (PyUnicode_GET_LENGTH(self) == 0)
12123         return 0;
12124     kind = PyUnicode_KIND(self);
12125     data = PyUnicode_DATA(self);
12126 
12127     /* PEP 3131 says that the first character must be in
12128        XID_Start and subsequent characters in XID_Continue,
12129        and for the ASCII range, the 2.x rules apply (i.e
12130        start with letters and underscore, continue with
12131        letters, digits, underscore). However, given the current
12132        definition of XID_Start and XID_Continue, it is sufficient
12133        to check just for these, except that _ must be allowed
12134        as starting an identifier.  */
12135     first = PyUnicode_READ(kind, data, 0);
12136     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12137         return 0;
12138 
12139     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12140         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12141             return 0;
12142     return 1;
12143 }
12144 
12145 /*[clinic input]
12146 str.isidentifier as unicode_isidentifier
12147 
12148 Return True if the string is a valid Python identifier, False otherwise.
12149 
12150 Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12151 "class".
12152 [clinic start generated code]*/
12153 
12154 static PyObject *
unicode_isidentifier_impl(PyObject * self)12155 unicode_isidentifier_impl(PyObject *self)
12156 /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
12157 {
12158     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12159 }
12160 
12161 /*[clinic input]
12162 str.isprintable as unicode_isprintable
12163 
12164 Return True if the string is printable, False otherwise.
12165 
12166 A string is printable if all of its characters are considered printable in
12167 repr() or if it is empty.
12168 [clinic start generated code]*/
12169 
12170 static PyObject *
unicode_isprintable_impl(PyObject * self)12171 unicode_isprintable_impl(PyObject *self)
12172 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12173 {
12174     Py_ssize_t i, length;
12175     int kind;
12176     void *data;
12177 
12178     if (PyUnicode_READY(self) == -1)
12179         return NULL;
12180     length = PyUnicode_GET_LENGTH(self);
12181     kind = PyUnicode_KIND(self);
12182     data = PyUnicode_DATA(self);
12183 
12184     /* Shortcut for single character strings */
12185     if (length == 1)
12186         return PyBool_FromLong(
12187             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12188 
12189     for (i = 0; i < length; i++) {
12190         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12191             Py_RETURN_FALSE;
12192         }
12193     }
12194     Py_RETURN_TRUE;
12195 }
12196 
12197 /*[clinic input]
12198 str.join as unicode_join
12199 
12200     iterable: object
12201     /
12202 
12203 Concatenate any number of strings.
12204 
12205 The string whose method is called is inserted in between each given string.
12206 The result is returned as a new string.
12207 
12208 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12209 [clinic start generated code]*/
12210 
12211 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12212 unicode_join(PyObject *self, PyObject *iterable)
12213 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12214 {
12215     return PyUnicode_Join(self, iterable);
12216 }
12217 
12218 static Py_ssize_t
unicode_length(PyObject * self)12219 unicode_length(PyObject *self)
12220 {
12221     if (PyUnicode_READY(self) == -1)
12222         return -1;
12223     return PyUnicode_GET_LENGTH(self);
12224 }
12225 
12226 /*[clinic input]
12227 str.ljust as unicode_ljust
12228 
12229     width: Py_ssize_t
12230     fillchar: Py_UCS4 = ' '
12231     /
12232 
12233 Return a left-justified string of length width.
12234 
12235 Padding is done using the specified fill character (default is a space).
12236 [clinic start generated code]*/
12237 
12238 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12239 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12240 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12241 {
12242     if (PyUnicode_READY(self) == -1)
12243         return NULL;
12244 
12245     if (PyUnicode_GET_LENGTH(self) >= width)
12246         return unicode_result_unchanged(self);
12247 
12248     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12249 }
12250 
12251 /*[clinic input]
12252 str.lower as unicode_lower
12253 
12254 Return a copy of the string converted to lowercase.
12255 [clinic start generated code]*/
12256 
12257 static PyObject *
unicode_lower_impl(PyObject * self)12258 unicode_lower_impl(PyObject *self)
12259 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12260 {
12261     if (PyUnicode_READY(self) == -1)
12262         return NULL;
12263     if (PyUnicode_IS_ASCII(self))
12264         return ascii_upper_or_lower(self, 1);
12265     return case_operation(self, do_lower);
12266 }
12267 
12268 #define LEFTSTRIP 0
12269 #define RIGHTSTRIP 1
12270 #define BOTHSTRIP 2
12271 
12272 /* Arrays indexed by above */
12273 static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12274 
12275 #define STRIPNAME(i) (stripfuncnames[i])
12276 
12277 /* externally visible for str.strip(unicode) */
12278 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12279 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12280 {
12281     void *data;
12282     int kind;
12283     Py_ssize_t i, j, len;
12284     BLOOM_MASK sepmask;
12285     Py_ssize_t seplen;
12286 
12287     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12288         return NULL;
12289 
12290     kind = PyUnicode_KIND(self);
12291     data = PyUnicode_DATA(self);
12292     len = PyUnicode_GET_LENGTH(self);
12293     seplen = PyUnicode_GET_LENGTH(sepobj);
12294     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12295                               PyUnicode_DATA(sepobj),
12296                               seplen);
12297 
12298     i = 0;
12299     if (striptype != RIGHTSTRIP) {
12300         while (i < len) {
12301             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12302             if (!BLOOM(sepmask, ch))
12303                 break;
12304             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12305                 break;
12306             i++;
12307         }
12308     }
12309 
12310     j = len;
12311     if (striptype != LEFTSTRIP) {
12312         j--;
12313         while (j >= i) {
12314             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12315             if (!BLOOM(sepmask, ch))
12316                 break;
12317             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12318                 break;
12319             j--;
12320         }
12321 
12322         j++;
12323     }
12324 
12325     return PyUnicode_Substring(self, i, j);
12326 }
12327 
12328 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12329 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12330 {
12331     unsigned char *data;
12332     int kind;
12333     Py_ssize_t length;
12334 
12335     if (PyUnicode_READY(self) == -1)
12336         return NULL;
12337 
12338     length = PyUnicode_GET_LENGTH(self);
12339     end = Py_MIN(end, length);
12340 
12341     if (start == 0 && end == length)
12342         return unicode_result_unchanged(self);
12343 
12344     if (start < 0 || end < 0) {
12345         PyErr_SetString(PyExc_IndexError, "string index out of range");
12346         return NULL;
12347     }
12348     if (start >= length || end < start)
12349         _Py_RETURN_UNICODE_EMPTY();
12350 
12351     length = end - start;
12352     if (PyUnicode_IS_ASCII(self)) {
12353         data = PyUnicode_1BYTE_DATA(self);
12354         return _PyUnicode_FromASCII((char*)(data + start), length);
12355     }
12356     else {
12357         kind = PyUnicode_KIND(self);
12358         data = PyUnicode_1BYTE_DATA(self);
12359         return PyUnicode_FromKindAndData(kind,
12360                                          data + kind * start,
12361                                          length);
12362     }
12363 }
12364 
12365 static PyObject *
do_strip(PyObject * self,int striptype)12366 do_strip(PyObject *self, int striptype)
12367 {
12368     Py_ssize_t len, i, j;
12369 
12370     if (PyUnicode_READY(self) == -1)
12371         return NULL;
12372 
12373     len = PyUnicode_GET_LENGTH(self);
12374 
12375     if (PyUnicode_IS_ASCII(self)) {
12376         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12377 
12378         i = 0;
12379         if (striptype != RIGHTSTRIP) {
12380             while (i < len) {
12381                 Py_UCS1 ch = data[i];
12382                 if (!_Py_ascii_whitespace[ch])
12383                     break;
12384                 i++;
12385             }
12386         }
12387 
12388         j = len;
12389         if (striptype != LEFTSTRIP) {
12390             j--;
12391             while (j >= i) {
12392                 Py_UCS1 ch = data[j];
12393                 if (!_Py_ascii_whitespace[ch])
12394                     break;
12395                 j--;
12396             }
12397             j++;
12398         }
12399     }
12400     else {
12401         int kind = PyUnicode_KIND(self);
12402         void *data = PyUnicode_DATA(self);
12403 
12404         i = 0;
12405         if (striptype != RIGHTSTRIP) {
12406             while (i < len) {
12407                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12408                 if (!Py_UNICODE_ISSPACE(ch))
12409                     break;
12410                 i++;
12411             }
12412         }
12413 
12414         j = len;
12415         if (striptype != LEFTSTRIP) {
12416             j--;
12417             while (j >= i) {
12418                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12419                 if (!Py_UNICODE_ISSPACE(ch))
12420                     break;
12421                 j--;
12422             }
12423             j++;
12424         }
12425     }
12426 
12427     return PyUnicode_Substring(self, i, j);
12428 }
12429 
12430 
12431 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12432 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12433 {
12434     if (sep != NULL && sep != Py_None) {
12435         if (PyUnicode_Check(sep))
12436             return _PyUnicode_XStrip(self, striptype, sep);
12437         else {
12438             PyErr_Format(PyExc_TypeError,
12439                          "%s arg must be None or str",
12440                          STRIPNAME(striptype));
12441             return NULL;
12442         }
12443     }
12444 
12445     return do_strip(self, striptype);
12446 }
12447 
12448 
12449 /*[clinic input]
12450 str.strip as unicode_strip
12451 
12452     chars: object = None
12453     /
12454 
Return a copy of the string with leading and trailing whitespace removed.
12456 
12457 If chars is given and not None, remove characters in chars instead.
12458 [clinic start generated code]*/
12459 
12460 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12461 unicode_strip_impl(PyObject *self, PyObject *chars)
12462 /*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
12463 {
12464     return do_argstrip(self, BOTHSTRIP, chars);
12465 }
12466 
12467 
12468 /*[clinic input]
12469 str.lstrip as unicode_lstrip
12470 
12471     chars: object = NULL
12472     /
12473 
12474 Return a copy of the string with leading whitespace removed.
12475 
12476 If chars is given and not None, remove characters in chars instead.
12477 [clinic start generated code]*/
12478 
12479 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12480 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12481 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12482 {
12483     return do_argstrip(self, LEFTSTRIP, chars);
12484 }
12485 
12486 
12487 /*[clinic input]
12488 str.rstrip as unicode_rstrip
12489 
12490     chars: object = NULL
12491     /
12492 
12493 Return a copy of the string with trailing whitespace removed.
12494 
12495 If chars is given and not None, remove characters in chars instead.
12496 [clinic start generated code]*/
12497 
12498 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12499 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12500 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12501 {
12502     return do_argstrip(self, RIGHTSTRIP, chars);
12503 }
12504 
12505 
12506 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12507 unicode_repeat(PyObject *str, Py_ssize_t len)
12508 {
12509     PyObject *u;
12510     Py_ssize_t nchars, n;
12511 
12512     if (len < 1)
12513         _Py_RETURN_UNICODE_EMPTY();
12514 
12515     /* no repeat, return original string */
12516     if (len == 1)
12517         return unicode_result_unchanged(str);
12518 
12519     if (PyUnicode_READY(str) == -1)
12520         return NULL;
12521 
12522     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12523         PyErr_SetString(PyExc_OverflowError,
12524                         "repeated string is too long");
12525         return NULL;
12526     }
12527     nchars = len * PyUnicode_GET_LENGTH(str);
12528 
12529     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12530     if (!u)
12531         return NULL;
12532     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12533 
12534     if (PyUnicode_GET_LENGTH(str) == 1) {
12535         const int kind = PyUnicode_KIND(str);
12536         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12537         if (kind == PyUnicode_1BYTE_KIND) {
12538             void *to = PyUnicode_DATA(u);
12539             memset(to, (unsigned char)fill_char, len);
12540         }
12541         else if (kind == PyUnicode_2BYTE_KIND) {
12542             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12543             for (n = 0; n < len; ++n)
12544                 ucs2[n] = fill_char;
12545         } else {
12546             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12547             assert(kind == PyUnicode_4BYTE_KIND);
12548             for (n = 0; n < len; ++n)
12549                 ucs4[n] = fill_char;
12550         }
12551     }
12552     else {
12553         /* number of characters copied this far */
12554         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12555         const Py_ssize_t char_size = PyUnicode_KIND(str);
12556         char *to = (char *) PyUnicode_DATA(u);
12557         memcpy(to, PyUnicode_DATA(str),
12558                   PyUnicode_GET_LENGTH(str) * char_size);
12559         while (done < nchars) {
12560             n = (done <= nchars-done) ? done : nchars-done;
12561             memcpy(to + (done * char_size), to, n * char_size);
12562             done += n;
12563         }
12564     }
12565 
12566     assert(_PyUnicode_CheckConsistency(u, 1));
12567     return u;
12568 }
12569 
12570 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12571 PyUnicode_Replace(PyObject *str,
12572                   PyObject *substr,
12573                   PyObject *replstr,
12574                   Py_ssize_t maxcount)
12575 {
12576     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12577             ensure_unicode(replstr) < 0)
12578         return NULL;
12579     return replace(str, substr, replstr, maxcount);
12580 }
12581 
12582 /*[clinic input]
12583 str.replace as unicode_replace
12584 
12585     old: unicode
12586     new: unicode
12587     count: Py_ssize_t = -1
12588         Maximum number of occurrences to replace.
12589         -1 (the default value) means replace all occurrences.
12590     /
12591 
12592 Return a copy with all occurrences of substring old replaced by new.
12593 
12594 If the optional argument count is given, only the first count occurrences are
12595 replaced.
12596 [clinic start generated code]*/
12597 
12598 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12599 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12600                      Py_ssize_t count)
12601 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12602 {
12603     if (PyUnicode_READY(self) == -1)
12604         return NULL;
12605     return replace(self, old, new, count);
12606 }
12607 
12608 static PyObject *
unicode_repr(PyObject * unicode)12609 unicode_repr(PyObject *unicode)
12610 {
12611     PyObject *repr;
12612     Py_ssize_t isize;
12613     Py_ssize_t osize, squote, dquote, i, o;
12614     Py_UCS4 max, quote;
12615     int ikind, okind, unchanged;
12616     void *idata, *odata;
12617 
12618     if (PyUnicode_READY(unicode) == -1)
12619         return NULL;
12620 
12621     isize = PyUnicode_GET_LENGTH(unicode);
12622     idata = PyUnicode_DATA(unicode);
12623 
12624     /* Compute length of output, quote characters, and
12625        maximum character */
12626     osize = 0;
12627     max = 127;
12628     squote = dquote = 0;
12629     ikind = PyUnicode_KIND(unicode);
12630     for (i = 0; i < isize; i++) {
12631         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12632         Py_ssize_t incr = 1;
12633         switch (ch) {
12634         case '\'': squote++; break;
12635         case '"':  dquote++; break;
12636         case '\\': case '\t': case '\r': case '\n':
12637             incr = 2;
12638             break;
12639         default:
12640             /* Fast-path ASCII */
12641             if (ch < ' ' || ch == 0x7f)
12642                 incr = 4; /* \xHH */
12643             else if (ch < 0x7f)
12644                 ;
12645             else if (Py_UNICODE_ISPRINTABLE(ch))
12646                 max = ch > max ? ch : max;
12647             else if (ch < 0x100)
12648                 incr = 4; /* \xHH */
12649             else if (ch < 0x10000)
12650                 incr = 6; /* \uHHHH */
12651             else
12652                 incr = 10; /* \uHHHHHHHH */
12653         }
12654         if (osize > PY_SSIZE_T_MAX - incr) {
12655             PyErr_SetString(PyExc_OverflowError,
12656                             "string is too long to generate repr");
12657             return NULL;
12658         }
12659         osize += incr;
12660     }
12661 
12662     quote = '\'';
12663     unchanged = (osize == isize);
12664     if (squote) {
12665         unchanged = 0;
12666         if (dquote)
12667             /* Both squote and dquote present. Use squote,
12668                and escape them */
12669             osize += squote;
12670         else
12671             quote = '"';
12672     }
12673     osize += 2;   /* quotes */
12674 
12675     repr = PyUnicode_New(osize, max);
12676     if (repr == NULL)
12677         return NULL;
12678     okind = PyUnicode_KIND(repr);
12679     odata = PyUnicode_DATA(repr);
12680 
12681     PyUnicode_WRITE(okind, odata, 0, quote);
12682     PyUnicode_WRITE(okind, odata, osize-1, quote);
12683     if (unchanged) {
12684         _PyUnicode_FastCopyCharacters(repr, 1,
12685                                       unicode, 0,
12686                                       isize);
12687     }
12688     else {
12689         for (i = 0, o = 1; i < isize; i++) {
12690             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12691 
12692             /* Escape quotes and backslashes */
12693             if ((ch == quote) || (ch == '\\')) {
12694                 PyUnicode_WRITE(okind, odata, o++, '\\');
12695                 PyUnicode_WRITE(okind, odata, o++, ch);
12696                 continue;
12697             }
12698 
12699             /* Map special whitespace to '\t', \n', '\r' */
12700             if (ch == '\t') {
12701                 PyUnicode_WRITE(okind, odata, o++, '\\');
12702                 PyUnicode_WRITE(okind, odata, o++, 't');
12703             }
12704             else if (ch == '\n') {
12705                 PyUnicode_WRITE(okind, odata, o++, '\\');
12706                 PyUnicode_WRITE(okind, odata, o++, 'n');
12707             }
12708             else if (ch == '\r') {
12709                 PyUnicode_WRITE(okind, odata, o++, '\\');
12710                 PyUnicode_WRITE(okind, odata, o++, 'r');
12711             }
12712 
12713             /* Map non-printable US ASCII to '\xhh' */
12714             else if (ch < ' ' || ch == 0x7F) {
12715                 PyUnicode_WRITE(okind, odata, o++, '\\');
12716                 PyUnicode_WRITE(okind, odata, o++, 'x');
12717                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12718                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12719             }
12720 
12721             /* Copy ASCII characters as-is */
12722             else if (ch < 0x7F) {
12723                 PyUnicode_WRITE(okind, odata, o++, ch);
12724             }
12725 
12726             /* Non-ASCII characters */
12727             else {
12728                 /* Map Unicode whitespace and control characters
12729                    (categories Z* and C* except ASCII space)
12730                 */
12731                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12732                     PyUnicode_WRITE(okind, odata, o++, '\\');
12733                     /* Map 8-bit characters to '\xhh' */
12734                     if (ch <= 0xff) {
12735                         PyUnicode_WRITE(okind, odata, o++, 'x');
12736                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12737                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12738                     }
12739                     /* Map 16-bit characters to '\uxxxx' */
12740                     else if (ch <= 0xffff) {
12741                         PyUnicode_WRITE(okind, odata, o++, 'u');
12742                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12743                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12744                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12745                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12746                     }
12747                     /* Map 21-bit characters to '\U00xxxxxx' */
12748                     else {
12749                         PyUnicode_WRITE(okind, odata, o++, 'U');
12750                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12751                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12752                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12753                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12754                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12755                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12756                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12757                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12758                     }
12759                 }
12760                 /* Copy characters as-is */
12761                 else {
12762                     PyUnicode_WRITE(okind, odata, o++, ch);
12763                 }
12764             }
12765         }
12766     }
12767     /* Closing quote already added at the beginning */
12768     assert(_PyUnicode_CheckConsistency(repr, 1));
12769     return repr;
12770 }
12771 
12772 PyDoc_STRVAR(rfind__doc__,
12773              "S.rfind(sub[, start[, end]]) -> int\n\
12774 \n\
12775 Return the highest index in S where substring sub is found,\n\
12776 such that sub is contained within S[start:end].  Optional\n\
12777 arguments start and end are interpreted as in slice notation.\n\
12778 \n\
12779 Return -1 on failure.");
12780 
12781 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12782 unicode_rfind(PyObject *self, PyObject *args)
12783 {
12784     /* initialize variables to prevent gcc warning */
12785     PyObject *substring = NULL;
12786     Py_ssize_t start = 0;
12787     Py_ssize_t end = 0;
12788     Py_ssize_t result;
12789 
12790     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12791         return NULL;
12792 
12793     if (PyUnicode_READY(self) == -1)
12794         return NULL;
12795 
12796     result = any_find_slice(self, substring, start, end, -1);
12797 
12798     if (result == -2)
12799         return NULL;
12800 
12801     return PyLong_FromSsize_t(result);
12802 }
12803 
12804 PyDoc_STRVAR(rindex__doc__,
12805              "S.rindex(sub[, start[, end]]) -> int\n\
12806 \n\
12807 Return the highest index in S where substring sub is found,\n\
12808 such that sub is contained within S[start:end].  Optional\n\
12809 arguments start and end are interpreted as in slice notation.\n\
12810 \n\
12811 Raises ValueError when the substring is not found.");
12812 
12813 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12814 unicode_rindex(PyObject *self, PyObject *args)
12815 {
12816     /* initialize variables to prevent gcc warning */
12817     PyObject *substring = NULL;
12818     Py_ssize_t start = 0;
12819     Py_ssize_t end = 0;
12820     Py_ssize_t result;
12821 
12822     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12823         return NULL;
12824 
12825     if (PyUnicode_READY(self) == -1)
12826         return NULL;
12827 
12828     result = any_find_slice(self, substring, start, end, -1);
12829 
12830     if (result == -2)
12831         return NULL;
12832 
12833     if (result < 0) {
12834         PyErr_SetString(PyExc_ValueError, "substring not found");
12835         return NULL;
12836     }
12837 
12838     return PyLong_FromSsize_t(result);
12839 }
12840 
12841 /*[clinic input]
12842 str.rjust as unicode_rjust
12843 
12844     width: Py_ssize_t
12845     fillchar: Py_UCS4 = ' '
12846     /
12847 
12848 Return a right-justified string of length width.
12849 
12850 Padding is done using the specified fill character (default is a space).
12851 [clinic start generated code]*/
12852 
12853 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12854 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12855 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12856 {
12857     if (PyUnicode_READY(self) == -1)
12858         return NULL;
12859 
12860     if (PyUnicode_GET_LENGTH(self) >= width)
12861         return unicode_result_unchanged(self);
12862 
12863     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12864 }
12865 
12866 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12867 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12868 {
12869     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12870         return NULL;
12871 
12872     return split(s, sep, maxsplit);
12873 }
12874 
12875 /*[clinic input]
12876 str.split as unicode_split
12877 
12878     sep: object = None
        The delimiter used to split the string.
12880         None (the default value) means split according to any whitespace,
12881         and discard empty strings from the result.
12882     maxsplit: Py_ssize_t = -1
12883         Maximum number of splits to do.
12884         -1 (the default value) means no limit.
12885 
12886 Return a list of the words in the string, using sep as the delimiter string.
12887 [clinic start generated code]*/
12888 
12889 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12890 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12891 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12892 {
12893     if (sep == Py_None)
12894         return split(self, NULL, maxsplit);
12895     if (PyUnicode_Check(sep))
12896         return split(self, sep, maxsplit);
12897 
12898     PyErr_Format(PyExc_TypeError,
12899                  "must be str or None, not %.100s",
12900                  Py_TYPE(sep)->tp_name);
12901     return NULL;
12902 }
12903 
12904 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12905 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12906 {
12907     PyObject* out;
12908     int kind1, kind2;
12909     void *buf1, *buf2;
12910     Py_ssize_t len1, len2;
12911 
12912     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12913         return NULL;
12914 
12915     kind1 = PyUnicode_KIND(str_obj);
12916     kind2 = PyUnicode_KIND(sep_obj);
12917     len1 = PyUnicode_GET_LENGTH(str_obj);
12918     len2 = PyUnicode_GET_LENGTH(sep_obj);
12919     if (kind1 < kind2 || len1 < len2) {
12920         _Py_INCREF_UNICODE_EMPTY();
12921         if (!unicode_empty)
12922             out = NULL;
12923         else {
12924             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12925             Py_DECREF(unicode_empty);
12926         }
12927         return out;
12928     }
12929     buf1 = PyUnicode_DATA(str_obj);
12930     buf2 = PyUnicode_DATA(sep_obj);
12931     if (kind2 != kind1) {
12932         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12933         if (!buf2)
12934             return NULL;
12935     }
12936 
12937     switch (kind1) {
12938     case PyUnicode_1BYTE_KIND:
12939         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12940             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941         else
12942             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943         break;
12944     case PyUnicode_2BYTE_KIND:
12945         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12946         break;
12947     case PyUnicode_4BYTE_KIND:
12948         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12949         break;
12950     default:
12951         Py_UNREACHABLE();
12952     }
12953 
12954     if (kind2 != kind1)
12955         PyMem_Free(buf2);
12956 
12957     return out;
12958 }
12959 
12960 
12961 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12962 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12963 {
12964     PyObject* out;
12965     int kind1, kind2;
12966     void *buf1, *buf2;
12967     Py_ssize_t len1, len2;
12968 
12969     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12970         return NULL;
12971 
12972     kind1 = PyUnicode_KIND(str_obj);
12973     kind2 = PyUnicode_KIND(sep_obj);
12974     len1 = PyUnicode_GET_LENGTH(str_obj);
12975     len2 = PyUnicode_GET_LENGTH(sep_obj);
12976     if (kind1 < kind2 || len1 < len2) {
12977         _Py_INCREF_UNICODE_EMPTY();
12978         if (!unicode_empty)
12979             out = NULL;
12980         else {
12981             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12982             Py_DECREF(unicode_empty);
12983         }
12984         return out;
12985     }
12986     buf1 = PyUnicode_DATA(str_obj);
12987     buf2 = PyUnicode_DATA(sep_obj);
12988     if (kind2 != kind1) {
12989         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12990         if (!buf2)
12991             return NULL;
12992     }
12993 
12994     switch (kind1) {
12995     case PyUnicode_1BYTE_KIND:
12996         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12997             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12998         else
12999             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13000         break;
13001     case PyUnicode_2BYTE_KIND:
13002         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13003         break;
13004     case PyUnicode_4BYTE_KIND:
13005         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13006         break;
13007     default:
13008         Py_UNREACHABLE();
13009     }
13010 
13011     if (kind2 != kind1)
13012         PyMem_Free(buf2);
13013 
13014     return out;
13015 }
13016 
13017 /*[clinic input]
13018 str.partition as unicode_partition
13019 
13020     sep: object
13021     /
13022 
13023 Partition the string into three parts using the given separator.
13024 
13025 This will search for the separator in the string.  If the separator is found,
13026 returns a 3-tuple containing the part before the separator, the separator
13027 itself, and the part after it.
13028 
13029 If the separator is not found, returns a 3-tuple containing the original string
13030 and two empty strings.
13031 [clinic start generated code]*/
13032 
13033 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13034 unicode_partition(PyObject *self, PyObject *sep)
13035 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13036 {
13037     return PyUnicode_Partition(self, sep);
13038 }
13039 
13040 /*[clinic input]
13041 str.rpartition as unicode_rpartition = str.partition
13042 
13043 Partition the string into three parts using the given separator.
13044 
13045 This will search for the separator in the string, starting at the end. If
13046 the separator is found, returns a 3-tuple containing the part before the
13047 separator, the separator itself, and the part after it.
13048 
13049 If the separator is not found, returns a 3-tuple containing two empty strings
13050 and the original string.
13051 [clinic start generated code]*/
13052 
13053 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13054 unicode_rpartition(PyObject *self, PyObject *sep)
13055 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13056 {
13057     return PyUnicode_RPartition(self, sep);
13058 }
13059 
13060 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13061 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13062 {
13063     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13064         return NULL;
13065 
13066     return rsplit(s, sep, maxsplit);
13067 }
13068 
13069 /*[clinic input]
13070 str.rsplit as unicode_rsplit = str.split
13071 
13072 Return a list of the words in the string, using sep as the delimiter string.
13073 
13074 Splits are done starting at the end of the string and working to the front.
13075 [clinic start generated code]*/
13076 
13077 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13078 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13079 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13080 {
13081     if (sep == Py_None)
13082         return rsplit(self, NULL, maxsplit);
13083     if (PyUnicode_Check(sep))
13084         return rsplit(self, sep, maxsplit);
13085 
13086     PyErr_Format(PyExc_TypeError,
13087                  "must be str or None, not %.100s",
13088                  Py_TYPE(sep)->tp_name);
13089     return NULL;
13090 }
13091 
13092 /*[clinic input]
13093 str.splitlines as unicode_splitlines
13094 
13095     keepends: bool(accept={int}) = False
13096 
13097 Return a list of the lines in the string, breaking at line boundaries.
13098 
13099 Line breaks are not included in the resulting list unless keepends is given and
13100 true.
13101 [clinic start generated code]*/
13102 
13103 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13104 unicode_splitlines_impl(PyObject *self, int keepends)
13105 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13106 {
13107     return PyUnicode_Splitlines(self, keepends);
13108 }
13109 
13110 static
unicode_str(PyObject * self)13111 PyObject *unicode_str(PyObject *self)
13112 {
13113     return unicode_result_unchanged(self);
13114 }
13115 
13116 /*[clinic input]
13117 str.swapcase as unicode_swapcase
13118 
13119 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13120 [clinic start generated code]*/
13121 
13122 static PyObject *
unicode_swapcase_impl(PyObject * self)13123 unicode_swapcase_impl(PyObject *self)
13124 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13125 {
13126     if (PyUnicode_READY(self) == -1)
13127         return NULL;
13128     return case_operation(self, do_swapcase);
13129 }
13130 
13131 /*[clinic input]
13132 
13133 @staticmethod
13134 str.maketrans as unicode_maketrans
13135 
13136   x: object
13137 
13138   y: unicode=NULL
13139 
13140   z: unicode=NULL
13141 
13142   /
13143 
13144 Return a translation table usable for str.translate().
13145 
13146 If there is only one argument, it must be a dictionary mapping Unicode
13147 ordinals (integers) or characters to Unicode ordinals, strings or None.
13148 Character keys will be then converted to ordinals.
13149 If there are two arguments, they must be strings of equal length, and
13150 in the resulting dictionary, each character in x will be mapped to the
13151 character at the same position in y. If there is a third argument, it
13152 must be a string, whose characters will be mapped to None in the result.
13153 [clinic start generated code]*/
13154 
13155 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13156 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13157 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13158 {
13159     PyObject *new = NULL, *key, *value;
13160     Py_ssize_t i = 0;
13161     int res;
13162 
13163     new = PyDict_New();
13164     if (!new)
13165         return NULL;
13166     if (y != NULL) {
13167         int x_kind, y_kind, z_kind;
13168         void *x_data, *y_data, *z_data;
13169 
13170         /* x must be a string too, of equal length */
13171         if (!PyUnicode_Check(x)) {
13172             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13173                             "be a string if there is a second argument");
13174             goto err;
13175         }
13176         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13177             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13178                             "arguments must have equal length");
13179             goto err;
13180         }
13181         /* create entries for translating chars in x to those in y */
13182         x_kind = PyUnicode_KIND(x);
13183         y_kind = PyUnicode_KIND(y);
13184         x_data = PyUnicode_DATA(x);
13185         y_data = PyUnicode_DATA(y);
13186         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13187             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13188             if (!key)
13189                 goto err;
13190             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13191             if (!value) {
13192                 Py_DECREF(key);
13193                 goto err;
13194             }
13195             res = PyDict_SetItem(new, key, value);
13196             Py_DECREF(key);
13197             Py_DECREF(value);
13198             if (res < 0)
13199                 goto err;
13200         }
13201         /* create entries for deleting chars in z */
13202         if (z != NULL) {
13203             z_kind = PyUnicode_KIND(z);
13204             z_data = PyUnicode_DATA(z);
13205             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13206                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13207                 if (!key)
13208                     goto err;
13209                 res = PyDict_SetItem(new, key, Py_None);
13210                 Py_DECREF(key);
13211                 if (res < 0)
13212                     goto err;
13213             }
13214         }
13215     } else {
13216         int kind;
13217         void *data;
13218 
13219         /* x must be a dict */
13220         if (!PyDict_CheckExact(x)) {
13221             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13222                             "to maketrans it must be a dict");
13223             goto err;
13224         }
13225         /* copy entries into the new dict, converting string keys to int keys */
13226         while (PyDict_Next(x, &i, &key, &value)) {
13227             if (PyUnicode_Check(key)) {
13228                 /* convert string keys to integer keys */
13229                 PyObject *newkey;
13230                 if (PyUnicode_GET_LENGTH(key) != 1) {
13231                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13232                                     "table must be of length 1");
13233                     goto err;
13234                 }
13235                 kind = PyUnicode_KIND(key);
13236                 data = PyUnicode_DATA(key);
13237                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13238                 if (!newkey)
13239                     goto err;
13240                 res = PyDict_SetItem(new, newkey, value);
13241                 Py_DECREF(newkey);
13242                 if (res < 0)
13243                     goto err;
13244             } else if (PyLong_Check(key)) {
13245                 /* just keep integer keys */
13246                 if (PyDict_SetItem(new, key, value) < 0)
13247                     goto err;
13248             } else {
13249                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13250                                 "be strings or integers");
13251                 goto err;
13252             }
13253         }
13254     }
13255     return new;
13256   err:
13257     Py_DECREF(new);
13258     return NULL;
13259 }
13260 
13261 /*[clinic input]
13262 str.translate as unicode_translate
13263 
13264     table: object
13265         Translation table, which must be a mapping of Unicode ordinals to
13266         Unicode ordinals, strings, or None.
13267     /
13268 
13269 Replace each character in the string using the given translation table.
13270 
13271 The table must implement lookup/indexing via __getitem__, for instance a
13272 dictionary or list.  If this operation raises LookupError, the character is
13273 left untouched.  Characters mapped to None are deleted.
13274 [clinic start generated code]*/
13275 
13276 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13277 unicode_translate(PyObject *self, PyObject *table)
13278 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13279 {
13280     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13281 }
13282 
13283 /*[clinic input]
13284 str.upper as unicode_upper
13285 
13286 Return a copy of the string converted to uppercase.
13287 [clinic start generated code]*/
13288 
13289 static PyObject *
unicode_upper_impl(PyObject * self)13290 unicode_upper_impl(PyObject *self)
13291 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13292 {
13293     if (PyUnicode_READY(self) == -1)
13294         return NULL;
13295     if (PyUnicode_IS_ASCII(self))
13296         return ascii_upper_or_lower(self, 0);
13297     return case_operation(self, do_upper);
13298 }
13299 
13300 /*[clinic input]
13301 str.zfill as unicode_zfill
13302 
13303     width: Py_ssize_t
13304     /
13305 
13306 Pad a numeric string with zeros on the left, to fill a field of the given width.
13307 
13308 The string is never truncated.
13309 [clinic start generated code]*/
13310 
13311 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13312 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13313 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13314 {
13315     Py_ssize_t fill;
13316     PyObject *u;
13317     int kind;
13318     void *data;
13319     Py_UCS4 chr;
13320 
13321     if (PyUnicode_READY(self) == -1)
13322         return NULL;
13323 
13324     if (PyUnicode_GET_LENGTH(self) >= width)
13325         return unicode_result_unchanged(self);
13326 
13327     fill = width - PyUnicode_GET_LENGTH(self);
13328 
13329     u = pad(self, fill, 0, '0');
13330 
13331     if (u == NULL)
13332         return NULL;
13333 
13334     kind = PyUnicode_KIND(u);
13335     data = PyUnicode_DATA(u);
13336     chr = PyUnicode_READ(kind, data, fill);
13337 
13338     if (chr == '+' || chr == '-') {
13339         /* move sign to beginning of string */
13340         PyUnicode_WRITE(kind, data, 0, chr);
13341         PyUnicode_WRITE(kind, data, fill, '0');
13342     }
13343 
13344     assert(_PyUnicode_CheckConsistency(u, 1));
13345     return u;
13346 }
13347 
13348 #if 0
13349 static PyObject *
13350 unicode__decimal2ascii(PyObject *self)
13351 {
13352     return PyUnicode_TransformDecimalAndSpaceToASCII(self);
13353 }
13354 #endif
13355 
13356 PyDoc_STRVAR(startswith__doc__,
13357              "S.startswith(prefix[, start[, end]]) -> bool\n\
13358 \n\
13359 Return True if S starts with the specified prefix, False otherwise.\n\
13360 With optional start, test S beginning at that position.\n\
13361 With optional end, stop comparing S at that position.\n\
13362 prefix can also be a tuple of strings to try.");
13363 
13364 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13365 unicode_startswith(PyObject *self,
13366                    PyObject *args)
13367 {
13368     PyObject *subobj;
13369     PyObject *substring;
13370     Py_ssize_t start = 0;
13371     Py_ssize_t end = PY_SSIZE_T_MAX;
13372     int result;
13373 
13374     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13375         return NULL;
13376     if (PyTuple_Check(subobj)) {
13377         Py_ssize_t i;
13378         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13379             substring = PyTuple_GET_ITEM(subobj, i);
13380             if (!PyUnicode_Check(substring)) {
13381                 PyErr_Format(PyExc_TypeError,
13382                              "tuple for startswith must only contain str, "
13383                              "not %.100s",
13384                              Py_TYPE(substring)->tp_name);
13385                 return NULL;
13386             }
13387             result = tailmatch(self, substring, start, end, -1);
13388             if (result == -1)
13389                 return NULL;
13390             if (result) {
13391                 Py_RETURN_TRUE;
13392             }
13393         }
13394         /* nothing matched */
13395         Py_RETURN_FALSE;
13396     }
13397     if (!PyUnicode_Check(subobj)) {
13398         PyErr_Format(PyExc_TypeError,
13399                      "startswith first arg must be str or "
13400                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13401         return NULL;
13402     }
13403     result = tailmatch(self, subobj, start, end, -1);
13404     if (result == -1)
13405         return NULL;
13406     return PyBool_FromLong(result);
13407 }
13408 
13409 
13410 PyDoc_STRVAR(endswith__doc__,
13411              "S.endswith(suffix[, start[, end]]) -> bool\n\
13412 \n\
13413 Return True if S ends with the specified suffix, False otherwise.\n\
13414 With optional start, test S beginning at that position.\n\
13415 With optional end, stop comparing S at that position.\n\
13416 suffix can also be a tuple of strings to try.");
13417 
13418 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13419 unicode_endswith(PyObject *self,
13420                  PyObject *args)
13421 {
13422     PyObject *subobj;
13423     PyObject *substring;
13424     Py_ssize_t start = 0;
13425     Py_ssize_t end = PY_SSIZE_T_MAX;
13426     int result;
13427 
13428     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13429         return NULL;
13430     if (PyTuple_Check(subobj)) {
13431         Py_ssize_t i;
13432         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13433             substring = PyTuple_GET_ITEM(subobj, i);
13434             if (!PyUnicode_Check(substring)) {
13435                 PyErr_Format(PyExc_TypeError,
13436                              "tuple for endswith must only contain str, "
13437                              "not %.100s",
13438                              Py_TYPE(substring)->tp_name);
13439                 return NULL;
13440             }
13441             result = tailmatch(self, substring, start, end, +1);
13442             if (result == -1)
13443                 return NULL;
13444             if (result) {
13445                 Py_RETURN_TRUE;
13446             }
13447         }
13448         Py_RETURN_FALSE;
13449     }
13450     if (!PyUnicode_Check(subobj)) {
13451         PyErr_Format(PyExc_TypeError,
13452                      "endswith first arg must be str or "
13453                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13454         return NULL;
13455     }
13456     result = tailmatch(self, subobj, start, end, +1);
13457     if (result == -1)
13458         return NULL;
13459     return PyBool_FromLong(result);
13460 }
13461 
13462 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13463 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13464 {
13465     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13466     writer->data = PyUnicode_DATA(writer->buffer);
13467 
13468     if (!writer->readonly) {
13469         writer->kind = PyUnicode_KIND(writer->buffer);
13470         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13471     }
13472     else {
13473         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13474            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13475         writer->kind = PyUnicode_WCHAR_KIND;
13476         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13477 
13478         /* Copy-on-write mode: set buffer size to 0 so
13479          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13480          * next write. */
13481         writer->size = 0;
13482     }
13483 }
13484 
13485 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13486 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13487 {
13488     memset(writer, 0, sizeof(*writer));
13489 
13490     /* ASCII is the bare minimum */
13491     writer->min_char = 127;
13492 
13493     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13494        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13495     writer->kind = PyUnicode_WCHAR_KIND;
13496     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13497 }
13498 
13499 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13500 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13501                                  Py_ssize_t length, Py_UCS4 maxchar)
13502 {
13503     Py_ssize_t newlen;
13504     PyObject *newbuffer;
13505 
13506     assert(maxchar <= MAX_UNICODE);
13507 
13508     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13509     assert((maxchar > writer->maxchar && length >= 0)
13510            || length > 0);
13511 
13512     if (length > PY_SSIZE_T_MAX - writer->pos) {
13513         PyErr_NoMemory();
13514         return -1;
13515     }
13516     newlen = writer->pos + length;
13517 
13518     maxchar = Py_MAX(maxchar, writer->min_char);
13519 
13520     if (writer->buffer == NULL) {
13521         assert(!writer->readonly);
13522         if (writer->overallocate
13523             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13524             /* overallocate to limit the number of realloc() */
13525             newlen += newlen / OVERALLOCATE_FACTOR;
13526         }
13527         if (newlen < writer->min_length)
13528             newlen = writer->min_length;
13529 
13530         writer->buffer = PyUnicode_New(newlen, maxchar);
13531         if (writer->buffer == NULL)
13532             return -1;
13533     }
13534     else if (newlen > writer->size) {
13535         if (writer->overallocate
13536             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13537             /* overallocate to limit the number of realloc() */
13538             newlen += newlen / OVERALLOCATE_FACTOR;
13539         }
13540         if (newlen < writer->min_length)
13541             newlen = writer->min_length;
13542 
13543         if (maxchar > writer->maxchar || writer->readonly) {
13544             /* resize + widen */
13545             maxchar = Py_MAX(maxchar, writer->maxchar);
13546             newbuffer = PyUnicode_New(newlen, maxchar);
13547             if (newbuffer == NULL)
13548                 return -1;
13549             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13550                                           writer->buffer, 0, writer->pos);
13551             Py_DECREF(writer->buffer);
13552             writer->readonly = 0;
13553         }
13554         else {
13555             newbuffer = resize_compact(writer->buffer, newlen);
13556             if (newbuffer == NULL)
13557                 return -1;
13558         }
13559         writer->buffer = newbuffer;
13560     }
13561     else if (maxchar > writer->maxchar) {
13562         assert(!writer->readonly);
13563         newbuffer = PyUnicode_New(writer->size, maxchar);
13564         if (newbuffer == NULL)
13565             return -1;
13566         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13567                                       writer->buffer, 0, writer->pos);
13568         Py_SETREF(writer->buffer, newbuffer);
13569     }
13570     _PyUnicodeWriter_Update(writer);
13571     return 0;
13572 
13573 #undef OVERALLOCATE_FACTOR
13574 }
13575 
13576 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13577 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13578                                      enum PyUnicode_Kind kind)
13579 {
13580     Py_UCS4 maxchar;
13581 
13582     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13583     assert(writer->kind < kind);
13584 
13585     switch (kind)
13586     {
13587     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13588     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13589     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13590     default:
13591         Py_UNREACHABLE();
13592     }
13593 
13594     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13595 }
13596 
13597 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13598 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13599 {
13600     assert(ch <= MAX_UNICODE);
13601     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13602         return -1;
13603     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13604     writer->pos++;
13605     return 0;
13606 }
13607 
13608 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13609 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13610 {
13611     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13612 }
13613 
13614 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13615 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13616 {
13617     Py_UCS4 maxchar;
13618     Py_ssize_t len;
13619 
13620     if (PyUnicode_READY(str) == -1)
13621         return -1;
13622     len = PyUnicode_GET_LENGTH(str);
13623     if (len == 0)
13624         return 0;
13625     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13626     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13627         if (writer->buffer == NULL && !writer->overallocate) {
13628             assert(_PyUnicode_CheckConsistency(str, 1));
13629             writer->readonly = 1;
13630             Py_INCREF(str);
13631             writer->buffer = str;
13632             _PyUnicodeWriter_Update(writer);
13633             writer->pos += len;
13634             return 0;
13635         }
13636         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13637             return -1;
13638     }
13639     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13640                                   str, 0, len);
13641     writer->pos += len;
13642     return 0;
13643 }
13644 
13645 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13646 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13647                                 Py_ssize_t start, Py_ssize_t end)
13648 {
13649     Py_UCS4 maxchar;
13650     Py_ssize_t len;
13651 
13652     if (PyUnicode_READY(str) == -1)
13653         return -1;
13654 
13655     assert(0 <= start);
13656     assert(end <= PyUnicode_GET_LENGTH(str));
13657     assert(start <= end);
13658 
13659     if (end == 0)
13660         return 0;
13661 
13662     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13663         return _PyUnicodeWriter_WriteStr(writer, str);
13664 
13665     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13666         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13667     else
13668         maxchar = writer->maxchar;
13669     len = end - start;
13670 
13671     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13672         return -1;
13673 
13674     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13675                                   str, start, len);
13676     writer->pos += len;
13677     return 0;
13678 }
13679 
13680 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13681 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13682                                   const char *ascii, Py_ssize_t len)
13683 {
13684     if (len == -1)
13685         len = strlen(ascii);
13686 
13687     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13688 
13689     if (writer->buffer == NULL && !writer->overallocate) {
13690         PyObject *str;
13691 
13692         str = _PyUnicode_FromASCII(ascii, len);
13693         if (str == NULL)
13694             return -1;
13695 
13696         writer->readonly = 1;
13697         writer->buffer = str;
13698         _PyUnicodeWriter_Update(writer);
13699         writer->pos += len;
13700         return 0;
13701     }
13702 
13703     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13704         return -1;
13705 
13706     switch (writer->kind)
13707     {
13708     case PyUnicode_1BYTE_KIND:
13709     {
13710         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13711         Py_UCS1 *data = writer->data;
13712 
13713         memcpy(data + writer->pos, str, len);
13714         break;
13715     }
13716     case PyUnicode_2BYTE_KIND:
13717     {
13718         _PyUnicode_CONVERT_BYTES(
13719             Py_UCS1, Py_UCS2,
13720             ascii, ascii + len,
13721             (Py_UCS2 *)writer->data + writer->pos);
13722         break;
13723     }
13724     case PyUnicode_4BYTE_KIND:
13725     {
13726         _PyUnicode_CONVERT_BYTES(
13727             Py_UCS1, Py_UCS4,
13728             ascii, ascii + len,
13729             (Py_UCS4 *)writer->data + writer->pos);
13730         break;
13731     }
13732     default:
13733         Py_UNREACHABLE();
13734     }
13735 
13736     writer->pos += len;
13737     return 0;
13738 }
13739 
13740 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13741 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13742                                    const char *str, Py_ssize_t len)
13743 {
13744     Py_UCS4 maxchar;
13745 
13746     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13747     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13748         return -1;
13749     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13750     writer->pos += len;
13751     return 0;
13752 }
13753 
13754 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13755 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13756 {
13757     PyObject *str;
13758 
13759     if (writer->pos == 0) {
13760         Py_CLEAR(writer->buffer);
13761         _Py_RETURN_UNICODE_EMPTY();
13762     }
13763 
13764     str = writer->buffer;
13765     writer->buffer = NULL;
13766 
13767     if (writer->readonly) {
13768         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13769         return str;
13770     }
13771 
13772     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13773         PyObject *str2;
13774         str2 = resize_compact(str, writer->pos);
13775         if (str2 == NULL) {
13776             Py_DECREF(str);
13777             return NULL;
13778         }
13779         str = str2;
13780     }
13781 
13782     assert(_PyUnicode_CheckConsistency(str, 1));
13783     return unicode_result_ready(str);
13784 }
13785 
13786 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13787 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13788 {
13789     Py_CLEAR(writer->buffer);
13790 }
13791 
13792 #include "stringlib/unicode_format.h"
13793 
13794 PyDoc_STRVAR(format__doc__,
13795              "S.format(*args, **kwargs) -> str\n\
13796 \n\
13797 Return a formatted version of S, using substitutions from args and kwargs.\n\
13798 The substitutions are identified by braces ('{' and '}').");
13799 
13800 PyDoc_STRVAR(format_map__doc__,
13801              "S.format_map(mapping) -> str\n\
13802 \n\
13803 Return a formatted version of S, using substitutions from mapping.\n\
13804 The substitutions are identified by braces ('{' and '}').");
13805 
13806 /*[clinic input]
13807 str.__format__ as unicode___format__
13808 
13809     format_spec: unicode
13810     /
13811 
13812 Return a formatted version of the string as described by format_spec.
13813 [clinic start generated code]*/
13814 
13815 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13816 unicode___format___impl(PyObject *self, PyObject *format_spec)
13817 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13818 {
13819     _PyUnicodeWriter writer;
13820     int ret;
13821 
13822     if (PyUnicode_READY(self) == -1)
13823         return NULL;
13824     _PyUnicodeWriter_Init(&writer);
13825     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13826                                           self, format_spec, 0,
13827                                           PyUnicode_GET_LENGTH(format_spec));
13828     if (ret == -1) {
13829         _PyUnicodeWriter_Dealloc(&writer);
13830         return NULL;
13831     }
13832     return _PyUnicodeWriter_Finish(&writer);
13833 }
13834 
13835 /*[clinic input]
13836 str.__sizeof__ as unicode_sizeof
13837 
13838 Return the size of the string in memory, in bytes.
13839 [clinic start generated code]*/
13840 
13841 static PyObject *
unicode_sizeof_impl(PyObject * self)13842 unicode_sizeof_impl(PyObject *self)
13843 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13844 {
13845     Py_ssize_t size;
13846 
13847     /* If it's a compact object, account for base structure +
13848        character data. */
13849     if (PyUnicode_IS_COMPACT_ASCII(self))
13850         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13851     else if (PyUnicode_IS_COMPACT(self))
13852         size = sizeof(PyCompactUnicodeObject) +
13853             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13854     else {
13855         /* If it is a two-block object, account for base object, and
13856            for character block if present. */
13857         size = sizeof(PyUnicodeObject);
13858         if (_PyUnicode_DATA_ANY(self))
13859             size += (PyUnicode_GET_LENGTH(self) + 1) *
13860                 PyUnicode_KIND(self);
13861     }
13862     /* If the wstr pointer is present, account for it unless it is shared
13863        with the data pointer. Check if the data is not shared. */
13864     if (_PyUnicode_HAS_WSTR_MEMORY(self))
13865         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13866     if (_PyUnicode_HAS_UTF8_MEMORY(self))
13867         size += PyUnicode_UTF8_LENGTH(self) + 1;
13868 
13869     return PyLong_FromSsize_t(size);
13870 }
13871 
13872 static PyObject *
unicode_getnewargs(PyObject * v)13873 unicode_getnewargs(PyObject *v)
13874 {
13875     PyObject *copy = _PyUnicode_Copy(v);
13876     if (!copy)
13877         return NULL;
13878     return Py_BuildValue("(N)", copy);
13879 }
13880 
13881 static PyMethodDef unicode_methods[] = {
13882     UNICODE_ENCODE_METHODDEF
13883     UNICODE_REPLACE_METHODDEF
13884     UNICODE_SPLIT_METHODDEF
13885     UNICODE_RSPLIT_METHODDEF
13886     UNICODE_JOIN_METHODDEF
13887     UNICODE_CAPITALIZE_METHODDEF
13888     UNICODE_CASEFOLD_METHODDEF
13889     UNICODE_TITLE_METHODDEF
13890     UNICODE_CENTER_METHODDEF
13891     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13892     UNICODE_EXPANDTABS_METHODDEF
13893     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13894     UNICODE_PARTITION_METHODDEF
13895     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13896     UNICODE_LJUST_METHODDEF
13897     UNICODE_LOWER_METHODDEF
13898     UNICODE_LSTRIP_METHODDEF
13899     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13900     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13901     UNICODE_RJUST_METHODDEF
13902     UNICODE_RSTRIP_METHODDEF
13903     UNICODE_RPARTITION_METHODDEF
13904     UNICODE_SPLITLINES_METHODDEF
13905     UNICODE_STRIP_METHODDEF
13906     UNICODE_SWAPCASE_METHODDEF
13907     UNICODE_TRANSLATE_METHODDEF
13908     UNICODE_UPPER_METHODDEF
13909     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13910     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13911     UNICODE_ISASCII_METHODDEF
13912     UNICODE_ISLOWER_METHODDEF
13913     UNICODE_ISUPPER_METHODDEF
13914     UNICODE_ISTITLE_METHODDEF
13915     UNICODE_ISSPACE_METHODDEF
13916     UNICODE_ISDECIMAL_METHODDEF
13917     UNICODE_ISDIGIT_METHODDEF
13918     UNICODE_ISNUMERIC_METHODDEF
13919     UNICODE_ISALPHA_METHODDEF
13920     UNICODE_ISALNUM_METHODDEF
13921     UNICODE_ISIDENTIFIER_METHODDEF
13922     UNICODE_ISPRINTABLE_METHODDEF
13923     UNICODE_ZFILL_METHODDEF
13924     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13925     {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13926     UNICODE___FORMAT___METHODDEF
13927     UNICODE_MAKETRANS_METHODDEF
13928     UNICODE_SIZEOF_METHODDEF
13929 #if 0
13930     /* These methods are just used for debugging the implementation. */
13931     {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13932 #endif
13933 
13934     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13935     {NULL, NULL}
13936 };
13937 
13938 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13939 unicode_mod(PyObject *v, PyObject *w)
13940 {
13941     if (!PyUnicode_Check(v))
13942         Py_RETURN_NOTIMPLEMENTED;
13943     return PyUnicode_Format(v, w);
13944 }
13945 
13946 static PyNumberMethods unicode_as_number = {
13947     0,              /*nb_add*/
13948     0,              /*nb_subtract*/
13949     0,              /*nb_multiply*/
13950     unicode_mod,            /*nb_remainder*/
13951 };
13952 
13953 static PySequenceMethods unicode_as_sequence = {
13954     (lenfunc) unicode_length,       /* sq_length */
13955     PyUnicode_Concat,           /* sq_concat */
13956     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13957     (ssizeargfunc) unicode_getitem,     /* sq_item */
13958     0,                  /* sq_slice */
13959     0,                  /* sq_ass_item */
13960     0,                  /* sq_ass_slice */
13961     PyUnicode_Contains,         /* sq_contains */
13962 };
13963 
13964 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13965 unicode_subscript(PyObject* self, PyObject* item)
13966 {
13967     if (PyUnicode_READY(self) == -1)
13968         return NULL;
13969 
13970     if (PyIndex_Check(item)) {
13971         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13972         if (i == -1 && PyErr_Occurred())
13973             return NULL;
13974         if (i < 0)
13975             i += PyUnicode_GET_LENGTH(self);
13976         return unicode_getitem(self, i);
13977     } else if (PySlice_Check(item)) {
13978         Py_ssize_t start, stop, step, slicelength, cur, i;
13979         PyObject *result;
13980         void *src_data, *dest_data;
13981         int src_kind, dest_kind;
13982         Py_UCS4 ch, max_char, kind_limit;
13983 
13984         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13985             return NULL;
13986         }
13987         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13988                                             &start, &stop, step);
13989 
13990         if (slicelength <= 0) {
13991             _Py_RETURN_UNICODE_EMPTY();
13992         } else if (start == 0 && step == 1 &&
13993                    slicelength == PyUnicode_GET_LENGTH(self)) {
13994             return unicode_result_unchanged(self);
13995         } else if (step == 1) {
13996             return PyUnicode_Substring(self,
13997                                        start, start + slicelength);
13998         }
13999         /* General case */
14000         src_kind = PyUnicode_KIND(self);
14001         src_data = PyUnicode_DATA(self);
14002         if (!PyUnicode_IS_ASCII(self)) {
14003             kind_limit = kind_maxchar_limit(src_kind);
14004             max_char = 0;
14005             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14006                 ch = PyUnicode_READ(src_kind, src_data, cur);
14007                 if (ch > max_char) {
14008                     max_char = ch;
14009                     if (max_char >= kind_limit)
14010                         break;
14011                 }
14012             }
14013         }
14014         else
14015             max_char = 127;
14016         result = PyUnicode_New(slicelength, max_char);
14017         if (result == NULL)
14018             return NULL;
14019         dest_kind = PyUnicode_KIND(result);
14020         dest_data = PyUnicode_DATA(result);
14021 
14022         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14023             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14024             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14025         }
14026         assert(_PyUnicode_CheckConsistency(result, 1));
14027         return result;
14028     } else {
14029         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14030         return NULL;
14031     }
14032 }
14033 
14034 static PyMappingMethods unicode_as_mapping = {
14035     (lenfunc)unicode_length,        /* mp_length */
14036     (binaryfunc)unicode_subscript,  /* mp_subscript */
14037     (objobjargproc)0,           /* mp_ass_subscript */
14038 };
14039 
14040 
14041 /* Helpers for PyUnicode_Format() */
14042 
/* Working state shared by the helpers that implement PyUnicode_Format(). */
struct unicode_formatter_t {
    /* Object the next value is fetched from: the caller's args, or the
       value looked up in dict for the most recent "%(name)" key. */
    PyObject *args;
    /* Nonzero when the formatter holds its own reference to args (set after
       a "%(name)" lookup stores the result of PyObject_GetItem()). */
    int args_owned;
    /* arglen: number of items when args is a tuple, -1 when args is used as
       a single value.
       argidx: index of the next item to fetch; it starts at -2 in the
       single-value case so exactly one fetch passes argidx < arglen. */
    Py_ssize_t arglen, argidx;
    /* The caller's args when it is usable as a mapping, otherwise NULL. */
    PyObject *dict;

    /* Kind of the format string's character data. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: countdown of format characters, decremented as they are
       consumed; a negative value means the format string is exhausted.
       fmtpos: index of the next format character to read. */
    Py_ssize_t fmtcnt, fmtpos;
    /* Character data of the format string. */
    void *fmtdata;
    /* The format string object itself. */
    PyObject *fmtstr;

    /* Accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14056 
/* Description of one "%..." conversion specification while it is being
   parsed and formatted. */
struct unicode_format_arg_t {
    /* Most recently read format character; once parsing succeeds this is
       the conversion character (for example 's' or 'd'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Minimum field width, or -1 when none was given. */
    Py_ssize_t width;
    /* Precision, or -1 when none was given. */
    int prec;
    /* Set to 1 for integer and float conversions that go through the
       padding stage, which then applies sign handling to the text. */
    int sign;
};
14064 
14065 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14066 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14067 {
14068     Py_ssize_t argidx = ctx->argidx;
14069 
14070     if (argidx < ctx->arglen) {
14071         ctx->argidx++;
14072         if (ctx->arglen < 0)
14073             return ctx->args;
14074         else
14075             return PyTuple_GetItem(ctx->args, argidx);
14076     }
14077     PyErr_SetString(PyExc_TypeError,
14078                     "not enough arguments for format string");
14079     return NULL;
14080 }
14081 
14082 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14083 
14084 /* Format a float into the writer if the writer is not NULL, or into *p_output
14085    otherwise.
14086 
14087    Return 0 on success, raise an exception and return -1 on error. */
14088 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14089 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14090             PyObject **p_output,
14091             _PyUnicodeWriter *writer)
14092 {
14093     char *p;
14094     double x;
14095     Py_ssize_t len;
14096     int prec;
14097     int dtoa_flags;
14098 
14099     x = PyFloat_AsDouble(v);
14100     if (x == -1.0 && PyErr_Occurred())
14101         return -1;
14102 
14103     prec = arg->prec;
14104     if (prec < 0)
14105         prec = 6;
14106 
14107     if (arg->flags & F_ALT)
14108         dtoa_flags = Py_DTSF_ALT;
14109     else
14110         dtoa_flags = 0;
14111     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14112     if (p == NULL)
14113         return -1;
14114     len = strlen(p);
14115     if (writer) {
14116         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14117             PyMem_Free(p);
14118             return -1;
14119         }
14120     }
14121     else
14122         *p_output = _PyUnicode_FromASCII(p, len);
14123     PyMem_Free(p);
14124     return 0;
14125 }
14126 
/* _PyUnicode_FormatLong() renders an int for the format codes d, i, u, o,
 * x and X, honouring the alternate-form flag and a minimum digit count.
 * Return value:  a new reference to a str object, or NULL with an
 *     exception set on error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is kept only for o, x and X conversions when alt is
 *         nonzero.  For X, letters (including the prefix) are upper-cased.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must satisfy PyLong_Check()
 * alt          nonzero to keep the base prefix (the F_ALT flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * The string returned by PyNumber_ToBase() is edited in place where
 * possible, which is why it must be the only reference to that object.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep numnondigits + prec (at most 3 + prec) within the range of int */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* two prefix characters ("0o") precede the digits */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* two prefix characters ("0x") precede the digits */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the first two characters; if the number is negative, the
           '-' is rewritten over the last skipped prefix character so it
           directly precedes the digits again. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: sign/prefix,
           then the zero fill, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        /* From here on result is the bytes object, not a str. */
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in a bytes object, or no longer starts at the
       beginning of the str's data, build a fresh str from it.  Otherwise
       only the length may have changed, so resize in place. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14272 
14273 /* Format an integer or a float as an integer.
14274  * Return 1 if the number has been formatted into the writer,
14275  *        0 if the number has been formatted into *p_output
14276  *       -1 and raise an exception on error */
14277 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14278 mainformatlong(PyObject *v,
14279                struct unicode_format_arg_t *arg,
14280                PyObject **p_output,
14281                _PyUnicodeWriter *writer)
14282 {
14283     PyObject *iobj, *res;
14284     char type = (char)arg->ch;
14285 
14286     if (!PyNumber_Check(v))
14287         goto wrongtype;
14288 
14289     /* make sure number is a type of integer for o, x, and X */
14290     if (!PyLong_Check(v)) {
14291         if (type == 'o' || type == 'x' || type == 'X') {
14292             iobj = PyNumber_Index(v);
14293             if (iobj == NULL) {
14294                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14295                     goto wrongtype;
14296                 return -1;
14297             }
14298         }
14299         else {
14300             iobj = PyNumber_Long(v);
14301             if (iobj == NULL ) {
14302                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14303                     goto wrongtype;
14304                 return -1;
14305             }
14306         }
14307         assert(PyLong_Check(iobj));
14308     }
14309     else {
14310         iobj = v;
14311         Py_INCREF(iobj);
14312     }
14313 
14314     if (PyLong_CheckExact(v)
14315         && arg->width == -1 && arg->prec == -1
14316         && !(arg->flags & (F_SIGN | F_BLANK))
14317         && type != 'X')
14318     {
14319         /* Fast path */
14320         int alternate = arg->flags & F_ALT;
14321         int base;
14322 
14323         switch(type)
14324         {
14325             default:
14326                 Py_UNREACHABLE();
14327             case 'd':
14328             case 'i':
14329             case 'u':
14330                 base = 10;
14331                 break;
14332             case 'o':
14333                 base = 8;
14334                 break;
14335             case 'x':
14336             case 'X':
14337                 base = 16;
14338                 break;
14339         }
14340 
14341         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14342             Py_DECREF(iobj);
14343             return -1;
14344         }
14345         Py_DECREF(iobj);
14346         return 1;
14347     }
14348 
14349     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14350     Py_DECREF(iobj);
14351     if (res == NULL)
14352         return -1;
14353     *p_output = res;
14354     return 0;
14355 
14356 wrongtype:
14357     switch(type)
14358     {
14359         case 'o':
14360         case 'x':
14361         case 'X':
14362             PyErr_Format(PyExc_TypeError,
14363                     "%%%c format: an integer is required, "
14364                     "not %.200s",
14365                     type, Py_TYPE(v)->tp_name);
14366             break;
14367         default:
14368             PyErr_Format(PyExc_TypeError,
14369                     "%%%c format: a number is required, "
14370                     "not %.200s",
14371                     type, Py_TYPE(v)->tp_name);
14372             break;
14373     }
14374     return -1;
14375 }
14376 
14377 static Py_UCS4
formatchar(PyObject * v)14378 formatchar(PyObject *v)
14379 {
14380     /* presume that the buffer is at least 3 characters long */
14381     if (PyUnicode_Check(v)) {
14382         if (PyUnicode_GET_LENGTH(v) == 1) {
14383             return PyUnicode_READ_CHAR(v, 0);
14384         }
14385         goto onError;
14386     }
14387     else {
14388         PyObject *iobj;
14389         long x;
14390         /* make sure number is a type of integer */
14391         if (!PyLong_Check(v)) {
14392             iobj = PyNumber_Index(v);
14393             if (iobj == NULL) {
14394                 goto onError;
14395             }
14396             x = PyLong_AsLong(iobj);
14397             Py_DECREF(iobj);
14398         }
14399         else {
14400             x = PyLong_AsLong(v);
14401         }
14402         if (x == -1 && PyErr_Occurred())
14403             goto onError;
14404 
14405         if (x < 0 || x > MAX_UNICODE) {
14406             PyErr_SetString(PyExc_OverflowError,
14407                             "%c arg not in range(0x110000)");
14408             return (Py_UCS4) -1;
14409         }
14410 
14411         return (Py_UCS4) x;
14412     }
14413 
14414   onError:
14415     PyErr_SetString(PyExc_TypeError,
14416                     "%c requires int or char");
14417     return (Py_UCS4) -1;
14418 }
14419 
14420 /* Parse options of an argument: flags, width, precision.
14421    Handle also "%(name)" syntax.
14422 
14423    Return 0 if the argument has been formatted into arg->str.
14424    Return 1 if the argument has been written into ctx->writer,
14425    Raise an exception and return -1 on error. */
14426 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14427 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14428                          struct unicode_format_arg_t *arg)
14429 {
14430 #define FORMAT_READ(ctx) \
14431         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14432 
14433     PyObject *v;
14434 
14435     if (arg->ch == '(') {
14436         /* Get argument value from a dictionary. Example: "%(name)s". */
14437         Py_ssize_t keystart;
14438         Py_ssize_t keylen;
14439         PyObject *key;
14440         int pcount = 1;
14441 
14442         if (ctx->dict == NULL) {
14443             PyErr_SetString(PyExc_TypeError,
14444                             "format requires a mapping");
14445             return -1;
14446         }
14447         ++ctx->fmtpos;
14448         --ctx->fmtcnt;
14449         keystart = ctx->fmtpos;
14450         /* Skip over balanced parentheses */
14451         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14452             arg->ch = FORMAT_READ(ctx);
14453             if (arg->ch == ')')
14454                 --pcount;
14455             else if (arg->ch == '(')
14456                 ++pcount;
14457             ctx->fmtpos++;
14458         }
14459         keylen = ctx->fmtpos - keystart - 1;
14460         if (ctx->fmtcnt < 0 || pcount > 0) {
14461             PyErr_SetString(PyExc_ValueError,
14462                             "incomplete format key");
14463             return -1;
14464         }
14465         key = PyUnicode_Substring(ctx->fmtstr,
14466                                   keystart, keystart + keylen);
14467         if (key == NULL)
14468             return -1;
14469         if (ctx->args_owned) {
14470             ctx->args_owned = 0;
14471             Py_DECREF(ctx->args);
14472         }
14473         ctx->args = PyObject_GetItem(ctx->dict, key);
14474         Py_DECREF(key);
14475         if (ctx->args == NULL)
14476             return -1;
14477         ctx->args_owned = 1;
14478         ctx->arglen = -1;
14479         ctx->argidx = -2;
14480     }
14481 
14482     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14483     while (--ctx->fmtcnt >= 0) {
14484         arg->ch = FORMAT_READ(ctx);
14485         ctx->fmtpos++;
14486         switch (arg->ch) {
14487         case '-': arg->flags |= F_LJUST; continue;
14488         case '+': arg->flags |= F_SIGN; continue;
14489         case ' ': arg->flags |= F_BLANK; continue;
14490         case '#': arg->flags |= F_ALT; continue;
14491         case '0': arg->flags |= F_ZERO; continue;
14492         }
14493         break;
14494     }
14495 
14496     /* Parse width. Example: "%10s" => width=10 */
14497     if (arg->ch == '*') {
14498         v = unicode_format_getnextarg(ctx);
14499         if (v == NULL)
14500             return -1;
14501         if (!PyLong_Check(v)) {
14502             PyErr_SetString(PyExc_TypeError,
14503                             "* wants int");
14504             return -1;
14505         }
14506         arg->width = PyLong_AsSsize_t(v);
14507         if (arg->width == -1 && PyErr_Occurred())
14508             return -1;
14509         if (arg->width < 0) {
14510             arg->flags |= F_LJUST;
14511             arg->width = -arg->width;
14512         }
14513         if (--ctx->fmtcnt >= 0) {
14514             arg->ch = FORMAT_READ(ctx);
14515             ctx->fmtpos++;
14516         }
14517     }
14518     else if (arg->ch >= '0' && arg->ch <= '9') {
14519         arg->width = arg->ch - '0';
14520         while (--ctx->fmtcnt >= 0) {
14521             arg->ch = FORMAT_READ(ctx);
14522             ctx->fmtpos++;
14523             if (arg->ch < '0' || arg->ch > '9')
14524                 break;
14525             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14526                mixing signed and unsigned comparison. Since arg->ch is between
14527                '0' and '9', casting to int is safe. */
14528             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14529                 PyErr_SetString(PyExc_ValueError,
14530                                 "width too big");
14531                 return -1;
14532             }
14533             arg->width = arg->width*10 + (arg->ch - '0');
14534         }
14535     }
14536 
14537     /* Parse precision. Example: "%.3f" => prec=3 */
14538     if (arg->ch == '.') {
14539         arg->prec = 0;
14540         if (--ctx->fmtcnt >= 0) {
14541             arg->ch = FORMAT_READ(ctx);
14542             ctx->fmtpos++;
14543         }
14544         if (arg->ch == '*') {
14545             v = unicode_format_getnextarg(ctx);
14546             if (v == NULL)
14547                 return -1;
14548             if (!PyLong_Check(v)) {
14549                 PyErr_SetString(PyExc_TypeError,
14550                                 "* wants int");
14551                 return -1;
14552             }
14553             arg->prec = _PyLong_AsInt(v);
14554             if (arg->prec == -1 && PyErr_Occurred())
14555                 return -1;
14556             if (arg->prec < 0)
14557                 arg->prec = 0;
14558             if (--ctx->fmtcnt >= 0) {
14559                 arg->ch = FORMAT_READ(ctx);
14560                 ctx->fmtpos++;
14561             }
14562         }
14563         else if (arg->ch >= '0' && arg->ch <= '9') {
14564             arg->prec = arg->ch - '0';
14565             while (--ctx->fmtcnt >= 0) {
14566                 arg->ch = FORMAT_READ(ctx);
14567                 ctx->fmtpos++;
14568                 if (arg->ch < '0' || arg->ch > '9')
14569                     break;
14570                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14571                     PyErr_SetString(PyExc_ValueError,
14572                                     "precision too big");
14573                     return -1;
14574                 }
14575                 arg->prec = arg->prec*10 + (arg->ch - '0');
14576             }
14577         }
14578     }
14579 
14580     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14581     if (ctx->fmtcnt >= 0) {
14582         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14583             if (--ctx->fmtcnt >= 0) {
14584                 arg->ch = FORMAT_READ(ctx);
14585                 ctx->fmtpos++;
14586             }
14587         }
14588     }
14589     if (ctx->fmtcnt < 0) {
14590         PyErr_SetString(PyExc_ValueError,
14591                         "incomplete format");
14592         return -1;
14593     }
14594     return 0;
14595 
14596 #undef FORMAT_READ
14597 }
14598 
14599 /* Format one argument. Supported conversion specifiers:
14600 
14601    - "s", "r", "a": any type
14602    - "i", "d", "u": int or float
14603    - "o", "x", "X": int
14604    - "e", "E", "f", "F", "g", "G": float
14605    - "c": int or str (1 character)
14606 
14607    When possible, the output is written directly into the Unicode writer
14608    (ctx->writer). A string is created when padding is required.
14609 
14610    Return 0 if the argument has been formatted into *p_str,
14611           1 if the argument has been written into ctx->writer,
14612          -1 on error. */
14613 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14614 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14615                           struct unicode_format_arg_t *arg,
14616                           PyObject **p_str)
14617 {
14618     PyObject *v;
14619     _PyUnicodeWriter *writer = &ctx->writer;
14620 
14621     if (ctx->fmtcnt == 0)
14622         ctx->writer.overallocate = 0;
14623 
14624     v = unicode_format_getnextarg(ctx);
14625     if (v == NULL)
14626         return -1;
14627 
14628 
14629     switch (arg->ch) {
14630     case 's':
14631     case 'r':
14632     case 'a':
14633         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14634             /* Fast path */
14635             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14636                 return -1;
14637             return 1;
14638         }
14639 
14640         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14641             *p_str = v;
14642             Py_INCREF(*p_str);
14643         }
14644         else {
14645             if (arg->ch == 's')
14646                 *p_str = PyObject_Str(v);
14647             else if (arg->ch == 'r')
14648                 *p_str = PyObject_Repr(v);
14649             else
14650                 *p_str = PyObject_ASCII(v);
14651         }
14652         break;
14653 
14654     case 'i':
14655     case 'd':
14656     case 'u':
14657     case 'o':
14658     case 'x':
14659     case 'X':
14660     {
14661         int ret = mainformatlong(v, arg, p_str, writer);
14662         if (ret != 0)
14663             return ret;
14664         arg->sign = 1;
14665         break;
14666     }
14667 
14668     case 'e':
14669     case 'E':
14670     case 'f':
14671     case 'F':
14672     case 'g':
14673     case 'G':
14674         if (arg->width == -1 && arg->prec == -1
14675             && !(arg->flags & (F_SIGN | F_BLANK)))
14676         {
14677             /* Fast path */
14678             if (formatfloat(v, arg, NULL, writer) == -1)
14679                 return -1;
14680             return 1;
14681         }
14682 
14683         arg->sign = 1;
14684         if (formatfloat(v, arg, p_str, NULL) == -1)
14685             return -1;
14686         break;
14687 
14688     case 'c':
14689     {
14690         Py_UCS4 ch = formatchar(v);
14691         if (ch == (Py_UCS4) -1)
14692             return -1;
14693         if (arg->width == -1 && arg->prec == -1) {
14694             /* Fast path */
14695             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14696                 return -1;
14697             return 1;
14698         }
14699         *p_str = PyUnicode_FromOrdinal(ch);
14700         break;
14701     }
14702 
14703     default:
14704         PyErr_Format(PyExc_ValueError,
14705                      "unsupported format character '%c' (0x%x) "
14706                      "at index %zd",
14707                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14708                      (int)arg->ch,
14709                      ctx->fmtpos - 1);
14710         return -1;
14711     }
14712     if (*p_str == NULL)
14713         return -1;
14714     assert (PyUnicode_Check(*p_str));
14715     return 0;
14716 }
14717 
/* Write the already formatted string str into ctx->writer, applying the
   options in arg: truncation to the precision for "s", "r" and "a", sign
   handling when arg->sign is set, the numeric prefix for the alternate form
   of "x", "X" and "o", and padding up to arg->width.

   arg->width and arg->sign are used as scratch values and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when sign handling is active. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no sign flag: copy str unchanged. */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Take the sign out of the text; it is written separately. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* Nothing to write for the sign. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character only if left padding will be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot for the sign when the text already fills the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses up one column of the field width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is accounted for separately from the digits. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Left padding done: prevents the right-padding step below. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        /* Right padding always uses spaces, never the zero fill. */
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14872 
14873 /* Helper of PyUnicode_Format(): format one arg.
14874    Return 0 on success, raise an exception and return -1 on error. */
14875 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14876 unicode_format_arg(struct unicode_formatter_t *ctx)
14877 {
14878     struct unicode_format_arg_t arg;
14879     PyObject *str;
14880     int ret;
14881 
14882     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14883     if (arg.ch == '%') {
14884         ctx->fmtpos++;
14885         ctx->fmtcnt--;
14886         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14887             return -1;
14888         return 0;
14889     }
14890     arg.flags = 0;
14891     arg.width = -1;
14892     arg.prec = -1;
14893     arg.sign = 0;
14894     str = NULL;
14895 
14896     ret = unicode_format_arg_parse(ctx, &arg);
14897     if (ret == -1)
14898         return -1;
14899 
14900     ret = unicode_format_arg_format(ctx, &arg, &str);
14901     if (ret == -1)
14902         return -1;
14903 
14904     if (ret != 1) {
14905         ret = unicode_format_arg_output(ctx, &arg, str);
14906         Py_DECREF(str);
14907         if (ret == -1)
14908             return -1;
14909     }
14910 
14911     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14912         PyErr_SetString(PyExc_TypeError,
14913                         "not all arguments converted during string formatting");
14914         return -1;
14915     }
14916     return 0;
14917 }
14918 
14919 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14920 PyUnicode_Format(PyObject *format, PyObject *args)
14921 {
14922     struct unicode_formatter_t ctx;
14923 
14924     if (format == NULL || args == NULL) {
14925         PyErr_BadInternalCall();
14926         return NULL;
14927     }
14928 
14929     if (ensure_unicode(format) < 0)
14930         return NULL;
14931 
14932     ctx.fmtstr = format;
14933     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14934     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14935     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14936     ctx.fmtpos = 0;
14937 
14938     _PyUnicodeWriter_Init(&ctx.writer);
14939     ctx.writer.min_length = ctx.fmtcnt + 100;
14940     ctx.writer.overallocate = 1;
14941 
14942     if (PyTuple_Check(args)) {
14943         ctx.arglen = PyTuple_Size(args);
14944         ctx.argidx = 0;
14945     }
14946     else {
14947         ctx.arglen = -1;
14948         ctx.argidx = -2;
14949     }
14950     ctx.args_owned = 0;
14951     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14952         ctx.dict = args;
14953     else
14954         ctx.dict = NULL;
14955     ctx.args = args;
14956 
14957     while (--ctx.fmtcnt >= 0) {
14958         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14959             Py_ssize_t nonfmtpos;
14960 
14961             nonfmtpos = ctx.fmtpos++;
14962             while (ctx.fmtcnt >= 0 &&
14963                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14964                 ctx.fmtpos++;
14965                 ctx.fmtcnt--;
14966             }
14967             if (ctx.fmtcnt < 0) {
14968                 ctx.fmtpos--;
14969                 ctx.writer.overallocate = 0;
14970             }
14971 
14972             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14973                                                 nonfmtpos, ctx.fmtpos) < 0)
14974                 goto onError;
14975         }
14976         else {
14977             ctx.fmtpos++;
14978             if (unicode_format_arg(&ctx) == -1)
14979                 goto onError;
14980         }
14981     }
14982 
14983     if (ctx.argidx < ctx.arglen && !ctx.dict) {
14984         PyErr_SetString(PyExc_TypeError,
14985                         "not all arguments converted during string formatting");
14986         goto onError;
14987     }
14988 
14989     if (ctx.args_owned) {
14990         Py_DECREF(ctx.args);
14991     }
14992     return _PyUnicodeWriter_Finish(&ctx.writer);
14993 
14994   onError:
14995     _PyUnicodeWriter_Dealloc(&ctx.writer);
14996     if (ctx.args_owned) {
14997         Py_DECREF(ctx.args);
14998     }
14999     return NULL;
15000 }
15001 
15002 static PyObject *
15003 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15004 
15005 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15006 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15007 {
15008     PyObject *x = NULL;
15009     static char *kwlist[] = {"object", "encoding", "errors", 0};
15010     char *encoding = NULL;
15011     char *errors = NULL;
15012 
15013     if (type != &PyUnicode_Type)
15014         return unicode_subtype_new(type, args, kwds);
15015     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15016                                      kwlist, &x, &encoding, &errors))
15017         return NULL;
15018     if (x == NULL)
15019         _Py_RETURN_UNICODE_EMPTY();
15020     if (encoding == NULL && errors == NULL)
15021         return PyObject_Str(x);
15022     else
15023         return PyUnicode_FromEncodedObject(x, encoding, errors);
15024 }
15025 
15026 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15027 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15028 {
15029     PyObject *unicode, *self;
15030     Py_ssize_t length, char_size;
15031     int share_wstr, share_utf8;
15032     unsigned int kind;
15033     void *data;
15034 
15035     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15036 
15037     unicode = unicode_new(&PyUnicode_Type, args, kwds);
15038     if (unicode == NULL)
15039         return NULL;
15040     assert(_PyUnicode_CHECK(unicode));
15041     if (PyUnicode_READY(unicode) == -1) {
15042         Py_DECREF(unicode);
15043         return NULL;
15044     }
15045 
15046     self = type->tp_alloc(type, 0);
15047     if (self == NULL) {
15048         Py_DECREF(unicode);
15049         return NULL;
15050     }
15051     kind = PyUnicode_KIND(unicode);
15052     length = PyUnicode_GET_LENGTH(unicode);
15053 
15054     _PyUnicode_LENGTH(self) = length;
15055 #ifdef Py_DEBUG
15056     _PyUnicode_HASH(self) = -1;
15057 #else
15058     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15059 #endif
15060     _PyUnicode_STATE(self).interned = 0;
15061     _PyUnicode_STATE(self).kind = kind;
15062     _PyUnicode_STATE(self).compact = 0;
15063     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15064     _PyUnicode_STATE(self).ready = 1;
15065     _PyUnicode_WSTR(self) = NULL;
15066     _PyUnicode_UTF8_LENGTH(self) = 0;
15067     _PyUnicode_UTF8(self) = NULL;
15068     _PyUnicode_WSTR_LENGTH(self) = 0;
15069     _PyUnicode_DATA_ANY(self) = NULL;
15070 
15071     share_utf8 = 0;
15072     share_wstr = 0;
15073     if (kind == PyUnicode_1BYTE_KIND) {
15074         char_size = 1;
15075         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15076             share_utf8 = 1;
15077     }
15078     else if (kind == PyUnicode_2BYTE_KIND) {
15079         char_size = 2;
15080         if (sizeof(wchar_t) == 2)
15081             share_wstr = 1;
15082     }
15083     else {
15084         assert(kind == PyUnicode_4BYTE_KIND);
15085         char_size = 4;
15086         if (sizeof(wchar_t) == 4)
15087             share_wstr = 1;
15088     }
15089 
15090     /* Ensure we won't overflow the length. */
15091     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15092         PyErr_NoMemory();
15093         goto onError;
15094     }
15095     data = PyObject_MALLOC((length + 1) * char_size);
15096     if (data == NULL) {
15097         PyErr_NoMemory();
15098         goto onError;
15099     }
15100 
15101     _PyUnicode_DATA_ANY(self) = data;
15102     if (share_utf8) {
15103         _PyUnicode_UTF8_LENGTH(self) = length;
15104         _PyUnicode_UTF8(self) = data;
15105     }
15106     if (share_wstr) {
15107         _PyUnicode_WSTR_LENGTH(self) = length;
15108         _PyUnicode_WSTR(self) = (wchar_t *)data;
15109     }
15110 
15111     memcpy(data, PyUnicode_DATA(unicode),
15112               kind * (length + 1));
15113     assert(_PyUnicode_CheckConsistency(self, 1));
15114 #ifdef Py_DEBUG
15115     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15116 #endif
15117     Py_DECREF(unicode);
15118     return self;
15119 
15120 onError:
15121     Py_DECREF(unicode);
15122     Py_DECREF(self);
15123     return NULL;
15124 }
15125 
15126 PyDoc_STRVAR(unicode_doc,
15127 "str(object='') -> str\n\
15128 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15129 \n\
15130 Create a new string object from the given object. If encoding or\n\
15131 errors is specified, then the object must expose a data buffer\n\
15132 that will be decoded using the given encoding and error handler.\n\
15133 Otherwise, returns the result of object.__str__() (if defined)\n\
15134 or repr(object).\n\
15135 encoding defaults to sys.getdefaultencoding().\n\
15136 errors defaults to 'strict'.");
15137 
15138 static PyObject *unicode_iter(PyObject *seq);
15139 
15140 PyTypeObject PyUnicode_Type = {
15141     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15142     "str",              /* tp_name */
15143     sizeof(PyUnicodeObject),        /* tp_size */
15144     0,                  /* tp_itemsize */
15145     /* Slots */
15146     (destructor)unicode_dealloc,    /* tp_dealloc */
15147     0,                  /* tp_print */
15148     0,                  /* tp_getattr */
15149     0,                  /* tp_setattr */
15150     0,                  /* tp_reserved */
15151     unicode_repr,           /* tp_repr */
15152     &unicode_as_number,         /* tp_as_number */
15153     &unicode_as_sequence,       /* tp_as_sequence */
15154     &unicode_as_mapping,        /* tp_as_mapping */
15155     (hashfunc) unicode_hash,        /* tp_hash*/
15156     0,                  /* tp_call*/
15157     (reprfunc) unicode_str,     /* tp_str */
15158     PyObject_GenericGetAttr,        /* tp_getattro */
15159     0,                  /* tp_setattro */
15160     0,                  /* tp_as_buffer */
15161     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15162     Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
15163     unicode_doc,            /* tp_doc */
15164     0,                  /* tp_traverse */
15165     0,                  /* tp_clear */
15166     PyUnicode_RichCompare,      /* tp_richcompare */
15167     0,                  /* tp_weaklistoffset */
15168     unicode_iter,           /* tp_iter */
15169     0,                  /* tp_iternext */
15170     unicode_methods,            /* tp_methods */
15171     0,                  /* tp_members */
15172     0,                  /* tp_getset */
15173     &PyBaseObject_Type,         /* tp_base */
15174     0,                  /* tp_dict */
15175     0,                  /* tp_descr_get */
15176     0,                  /* tp_descr_set */
15177     0,                  /* tp_dictoffset */
15178     0,                  /* tp_init */
15179     0,                  /* tp_alloc */
15180     unicode_new,            /* tp_new */
15181     PyObject_Del,           /* tp_free */
15182 };
15183 
15184 /* Initialize the Unicode implementation */
15185 
_PyUnicode_Init(void)15186 int _PyUnicode_Init(void)
15187 {
15188     /* XXX - move this array to unicodectype.c ? */
15189     Py_UCS2 linebreak[] = {
15190         0x000A, /* LINE FEED */
15191         0x000D, /* CARRIAGE RETURN */
15192         0x001C, /* FILE SEPARATOR */
15193         0x001D, /* GROUP SEPARATOR */
15194         0x001E, /* RECORD SEPARATOR */
15195         0x0085, /* NEXT LINE */
15196         0x2028, /* LINE SEPARATOR */
15197         0x2029, /* PARAGRAPH SEPARATOR */
15198     };
15199 
15200     /* Init the implementation */
15201     _Py_INCREF_UNICODE_EMPTY();
15202     if (!unicode_empty)
15203         Py_FatalError("Can't create empty string");
15204     Py_DECREF(unicode_empty);
15205 
15206     if (PyType_Ready(&PyUnicode_Type) < 0)
15207         Py_FatalError("Can't initialize 'unicode'");
15208 
15209     /* initialize the linebreak bloom filter */
15210     bloom_linebreak = make_bloom_mask(
15211         PyUnicode_2BYTE_KIND, linebreak,
15212         Py_ARRAY_LENGTH(linebreak));
15213 
15214     if (PyType_Ready(&EncodingMapType) < 0)
15215          Py_FatalError("Can't initialize encoding map type");
15216 
15217     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15218         Py_FatalError("Can't initialize field name iterator type");
15219 
15220     if (PyType_Ready(&PyFormatterIter_Type) < 0)
15221         Py_FatalError("Can't initialize formatter iter type");
15222 
15223     return 0;
15224 }
15225 
15226 /* Finalize the Unicode implementation */
15227 
/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15233 
15234 void
_PyUnicode_Fini(void)15235 _PyUnicode_Fini(void)
15236 {
15237     int i;
15238 
15239     Py_CLEAR(unicode_empty);
15240 
15241     for (i = 0; i < 256; i++)
15242         Py_CLEAR(unicode_latin1[i]);
15243     _PyUnicode_ClearStaticStrings();
15244     (void)PyUnicode_ClearFreeList();
15245 }
15246 
15247 void
PyUnicode_InternInPlace(PyObject ** p)15248 PyUnicode_InternInPlace(PyObject **p)
15249 {
15250     PyObject *s = *p;
15251     PyObject *t;
15252 #ifdef Py_DEBUG
15253     assert(s != NULL);
15254     assert(_PyUnicode_CHECK(s));
15255 #else
15256     if (s == NULL || !PyUnicode_Check(s))
15257         return;
15258 #endif
15259     /* If it's a subclass, we don't really know what putting
15260        it in the interned dict might do. */
15261     if (!PyUnicode_CheckExact(s))
15262         return;
15263     if (PyUnicode_CHECK_INTERNED(s))
15264         return;
15265     if (interned == NULL) {
15266         interned = PyDict_New();
15267         if (interned == NULL) {
15268             PyErr_Clear(); /* Don't leave an exception */
15269             return;
15270         }
15271     }
15272     Py_ALLOW_RECURSION
15273     t = PyDict_SetDefault(interned, s, s);
15274     Py_END_ALLOW_RECURSION
15275     if (t == NULL) {
15276         PyErr_Clear();
15277         return;
15278     }
15279     if (t != s) {
15280         Py_INCREF(t);
15281         Py_SETREF(*p, t);
15282         return;
15283     }
15284     /* The two references in interned are not counted by refcnt.
15285        The deallocator will take care of this */
15286     Py_REFCNT(s) -= 2;
15287     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15288 }
15289 
15290 void
PyUnicode_InternImmortal(PyObject ** p)15291 PyUnicode_InternImmortal(PyObject **p)
15292 {
15293     PyUnicode_InternInPlace(p);
15294     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15295         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15296         Py_INCREF(*p);
15297     }
15298 }
15299 
15300 PyObject *
PyUnicode_InternFromString(const char * cp)15301 PyUnicode_InternFromString(const char *cp)
15302 {
15303     PyObject *s = PyUnicode_FromString(cp);
15304     if (s == NULL)
15305         return NULL;
15306     PyUnicode_InternInPlace(&s);
15307     return s;
15308 }
15309 
15310 void
_Py_ReleaseInternedUnicodeStrings(void)15311 _Py_ReleaseInternedUnicodeStrings(void)
15312 {
15313     PyObject *keys;
15314     PyObject *s;
15315     Py_ssize_t i, n;
15316     Py_ssize_t immortal_size = 0, mortal_size = 0;
15317 
15318     if (interned == NULL || !PyDict_Check(interned))
15319         return;
15320     keys = PyDict_Keys(interned);
15321     if (keys == NULL || !PyList_Check(keys)) {
15322         PyErr_Clear();
15323         return;
15324     }
15325 
15326     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15327        detector, interned unicode strings are not forcibly deallocated;
15328        rather, we give them their stolen references back, and then clear
15329        and DECREF the interned dict. */
15330 
15331     n = PyList_GET_SIZE(keys);
15332     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15333             n);
15334     for (i = 0; i < n; i++) {
15335         s = PyList_GET_ITEM(keys, i);
15336         if (PyUnicode_READY(s) == -1) {
15337             Py_UNREACHABLE();
15338         }
15339         switch (PyUnicode_CHECK_INTERNED(s)) {
15340         case SSTATE_NOT_INTERNED:
15341             /* XXX Shouldn't happen */
15342             break;
15343         case SSTATE_INTERNED_IMMORTAL:
15344             Py_REFCNT(s) += 1;
15345             immortal_size += PyUnicode_GET_LENGTH(s);
15346             break;
15347         case SSTATE_INTERNED_MORTAL:
15348             Py_REFCNT(s) += 2;
15349             mortal_size += PyUnicode_GET_LENGTH(s);
15350             break;
15351         default:
15352             Py_FatalError("Inconsistent interned string state.");
15353         }
15354         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15355     }
15356     fprintf(stderr, "total size of all interned strings: "
15357             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15358             "mortal/immortal\n", mortal_size, immortal_size);
15359     Py_DECREF(keys);
15360     PyDict_Clear(interned);
15361     Py_CLEAR(interned);
15362 }
15363 
15364 
15365 /********************* Unicode Iterator **************************/
15366 
15367 typedef struct {
15368     PyObject_HEAD
15369     Py_ssize_t it_index;
15370     PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15371 } unicodeiterobject;
15372 
15373 static void
unicodeiter_dealloc(unicodeiterobject * it)15374 unicodeiter_dealloc(unicodeiterobject *it)
15375 {
15376     _PyObject_GC_UNTRACK(it);
15377     Py_XDECREF(it->it_seq);
15378     PyObject_GC_Del(it);
15379 }
15380 
15381 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15382 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15383 {
15384     Py_VISIT(it->it_seq);
15385     return 0;
15386 }
15387 
15388 static PyObject *
unicodeiter_next(unicodeiterobject * it)15389 unicodeiter_next(unicodeiterobject *it)
15390 {
15391     PyObject *seq, *item;
15392 
15393     assert(it != NULL);
15394     seq = it->it_seq;
15395     if (seq == NULL)
15396         return NULL;
15397     assert(_PyUnicode_CHECK(seq));
15398 
15399     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15400         int kind = PyUnicode_KIND(seq);
15401         void *data = PyUnicode_DATA(seq);
15402         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15403         item = PyUnicode_FromOrdinal(chr);
15404         if (item != NULL)
15405             ++it->it_index;
15406         return item;
15407     }
15408 
15409     it->it_seq = NULL;
15410     Py_DECREF(seq);
15411     return NULL;
15412 }
15413 
15414 static PyObject *
unicodeiter_len(unicodeiterobject * it)15415 unicodeiter_len(unicodeiterobject *it)
15416 {
15417     Py_ssize_t len = 0;
15418     if (it->it_seq)
15419         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15420     return PyLong_FromSsize_t(len);
15421 }
15422 
15423 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15424 
15425 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15426 unicodeiter_reduce(unicodeiterobject *it)
15427 {
15428     if (it->it_seq != NULL) {
15429         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15430                              it->it_seq, it->it_index);
15431     } else {
15432         PyObject *u = (PyObject *)_PyUnicode_New(0);
15433         if (u == NULL)
15434             return NULL;
15435         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15436     }
15437 }
15438 
15439 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15440 
15441 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15442 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15443 {
15444     Py_ssize_t index = PyLong_AsSsize_t(state);
15445     if (index == -1 && PyErr_Occurred())
15446         return NULL;
15447     if (it->it_seq != NULL) {
15448         if (index < 0)
15449             index = 0;
15450         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15451             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15452         it->it_index = index;
15453     }
15454     Py_RETURN_NONE;
15455 }
15456 
15457 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15458 
15459 static PyMethodDef unicodeiter_methods[] = {
15460     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15461      length_hint_doc},
15462     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15463      reduce_doc},
15464     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15465      setstate_doc},
15466     {NULL,      NULL}       /* sentinel */
15467 };
15468 
15469 PyTypeObject PyUnicodeIter_Type = {
15470     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15471     "str_iterator",         /* tp_name */
15472     sizeof(unicodeiterobject),      /* tp_basicsize */
15473     0,                  /* tp_itemsize */
15474     /* methods */
15475     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15476     0,                  /* tp_print */
15477     0,                  /* tp_getattr */
15478     0,                  /* tp_setattr */
15479     0,                  /* tp_reserved */
15480     0,                  /* tp_repr */
15481     0,                  /* tp_as_number */
15482     0,                  /* tp_as_sequence */
15483     0,                  /* tp_as_mapping */
15484     0,                  /* tp_hash */
15485     0,                  /* tp_call */
15486     0,                  /* tp_str */
15487     PyObject_GenericGetAttr,        /* tp_getattro */
15488     0,                  /* tp_setattro */
15489     0,                  /* tp_as_buffer */
15490     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15491     0,                  /* tp_doc */
15492     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15493     0,                  /* tp_clear */
15494     0,                  /* tp_richcompare */
15495     0,                  /* tp_weaklistoffset */
15496     PyObject_SelfIter,          /* tp_iter */
15497     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15498     unicodeiter_methods,            /* tp_methods */
15499     0,
15500 };
15501 
15502 static PyObject *
unicode_iter(PyObject * seq)15503 unicode_iter(PyObject *seq)
15504 {
15505     unicodeiterobject *it;
15506 
15507     if (!PyUnicode_Check(seq)) {
15508         PyErr_BadInternalCall();
15509         return NULL;
15510     }
15511     if (PyUnicode_READY(seq) == -1)
15512         return NULL;
15513     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15514     if (it == NULL)
15515         return NULL;
15516     it->it_index = 0;
15517     Py_INCREF(seq);
15518     it->it_seq = seq;
15519     _PyObject_GC_TRACK(it);
15520     return (PyObject *)it;
15521 }
15522 
15523 
15524 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15525 Py_UNICODE_strlen(const Py_UNICODE *u)
15526 {
15527     return wcslen(u);
15528 }
15529 
15530 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15531 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15532 {
15533     Py_UNICODE *u = s1;
15534     while ((*u++ = *s2++));
15535     return s1;
15536 }
15537 
15538 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15539 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15540 {
15541     Py_UNICODE *u = s1;
15542     while ((*u++ = *s2++))
15543         if (n-- == 0)
15544             break;
15545     return s1;
15546 }
15547 
15548 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15549 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15550 {
15551     Py_UNICODE *u1 = s1;
15552     u1 += wcslen(u1);
15553     while ((*u1++ = *s2++));
15554     return s1;
15555 }
15556 
15557 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15558 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15559 {
15560     while (*s1 && *s2 && *s1 == *s2)
15561         s1++, s2++;
15562     if (*s1 && *s2)
15563         return (*s1 < *s2) ? -1 : +1;
15564     if (*s1)
15565         return 1;
15566     if (*s2)
15567         return -1;
15568     return 0;
15569 }
15570 
15571 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15572 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15573 {
15574     Py_UNICODE u1, u2;
15575     for (; n != 0; n--) {
15576         u1 = *s1;
15577         u2 = *s2;
15578         if (u1 != u2)
15579             return (u1 < u2) ? -1 : +1;
15580         if (u1 == '\0')
15581             return 0;
15582         s1++;
15583         s2++;
15584     }
15585     return 0;
15586 }
15587 
15588 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15589 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15590 {
15591     const Py_UNICODE *p;
15592     for (p = s; *p; p++)
15593         if (*p == c)
15594             return (Py_UNICODE*)p;
15595     return NULL;
15596 }
15597 
15598 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15599 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15600 {
15601     const Py_UNICODE *p;
15602     p = s + wcslen(s);
15603     while (p != s) {
15604         p--;
15605         if (*p == c)
15606             return (Py_UNICODE*)p;
15607     }
15608     return NULL;
15609 }
15610 
15611 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15612 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15613 {
15614     Py_UNICODE *u, *copy;
15615     Py_ssize_t len, size;
15616 
15617     if (!PyUnicode_Check(unicode)) {
15618         PyErr_BadArgument();
15619         return NULL;
15620     }
15621     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15622     if (u == NULL)
15623         return NULL;
15624     /* Ensure we won't overflow the size. */
15625     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15626         PyErr_NoMemory();
15627         return NULL;
15628     }
15629     size = len + 1; /* copy the null character */
15630     size *= sizeof(Py_UNICODE);
15631     copy = PyMem_Malloc(size);
15632     if (copy == NULL) {
15633         PyErr_NoMemory();
15634         return NULL;
15635     }
15636     memcpy(copy, u, size);
15637     return copy;
15638 }
15639 
15640 /* A _string module, to export formatter_parser and formatter_field_name_split
15641    to the string.Formatter class implemented in Python. */
15642 
15643 static PyMethodDef _string_methods[] = {
15644     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15645      METH_O, PyDoc_STR("split the argument as a field name")},
15646     {"formatter_parser", (PyCFunction) formatter_parser,
15647      METH_O, PyDoc_STR("parse the argument as a format string")},
15648     {NULL, NULL}
15649 };
15650 
15651 static struct PyModuleDef _string_module = {
15652     PyModuleDef_HEAD_INIT,
15653     "_string",
15654     PyDoc_STR("string helper module"),
15655     0,
15656     _string_methods,
15657     NULL,
15658     NULL,
15659     NULL,
15660     NULL
15661 };
15662 
15663 PyMODINIT_FUNC
PyInit__string(void)15664 PyInit__string(void)
15665 {
15666     return PyModule_Create(&_string_module);
15667 }
15668 
15669 
15670 #ifdef __cplusplus
15671 }
15672 #endif
15673