• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2  
3  Unicode implementation based on original code by Fredrik Lundh,
4  modified by Marc-Andre Lemburg <mal@lemburg.com>.
5  
6  Major speed upgrades to the method implementations at the Reykjavik
7  NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8  
9  Copyright (c) Corporation for National Research Initiatives.
10  
11  --------------------------------------------------------------------
12  The original string type implementation is:
13  
14    Copyright (c) 1999 by Secret Labs AB
15    Copyright (c) 1999 by Fredrik Lundh
16  
17  By obtaining, using, and/or copying this software and/or its
18  associated documentation, you agree that you have read, understood,
19  and will comply with the following terms and conditions:
20  
21  Permission to use, copy, modify, and distribute this software and its
22  associated documentation for any purpose and without fee is hereby
23  granted, provided that the above copyright notice appears in all
24  copies, and that both that copyright notice and this permission notice
25  appear in supporting documentation, and that the name of Secret Labs
26  AB or the author not be used in advertising or publicity pertaining to
27  distribution of the software without specific, written prior
28  permission.
29  
30  SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31  THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32  FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33  ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36  OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37  --------------------------------------------------------------------
38  
39  */
40  
41  #define PY_SSIZE_T_CLEAN
42  #include "Python.h"
43  #include "pycore_abstract.h"      // _PyIndex_Check()
44  #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45  #include "pycore_bytes_methods.h" // _Py_bytes_lower()
46  #include "pycore_format.h"        // F_LJUST
47  #include "pycore_initconfig.h"    // _PyStatus_OK()
48  #include "pycore_interp.h"        // PyInterpreterState.fs_codec
49  #include "pycore_object.h"        // _PyObject_GC_TRACK()
50  #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
51  #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
52  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
53  #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
54  #include "stringlib/eq.h"         // unicode_eq()
55  
56  #ifdef MS_WINDOWS
57  #include <windows.h>
58  #endif
59  
60  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61  #include "pycore_fileutils.h"     // _Py_LocaleUsesNonUnicodeWchar()
62  #endif
63  
64  /* Uncomment to display statistics on interned strings at exit
65     in _PyUnicode_ClearInterned(). */
66  /* #define INTERNED_STATS 1 */
67  
68  
69  /*[clinic input]
70  class str "PyObject *" "&PyUnicode_Type"
71  [clinic start generated code]*/
72  /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73  
74  /*[python input]
75  class Py_UCS4_converter(CConverter):
76      type = 'Py_UCS4'
77      converter = 'convert_uc'
78  
79      def converter_init(self):
80          if self.default is not unspecified:
81              self.c_default = ascii(self.default)
82              if len(self.c_default) > 4 or self.c_default[0] != "'":
83                  self.c_default = hex(ord(self.default))
84  
85  [python start generated code]*/
86  /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
87  
88  /* --- Globals ------------------------------------------------------------
89  
90  NOTE: In the interpreter's initialization phase, some globals are currently
91        initialized dynamically as needed. In the process Unicode objects may
92        be created before the Unicode type is ready.
93  
94  */
95  
96  
97  #ifdef __cplusplus
98  extern "C" {
99  #endif
100  
101  // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
102  // The value must be the same in fileutils.c.
103  #define MAX_UNICODE 0x10ffff
104  
/* Sanity check applied by the checked accessor macros below: the full
   _PyUnicode_CheckConsistency() in Py_DEBUG builds, a plain
   PyUnicode_Check() otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data: for a compact ASCII string this is
   the memory right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: the string length for a compact ASCII string,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr field. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* The argument is parenthesized before each cast so that an expression
   argument (e.g. pointer arithmetic) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
152  
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* true if the utf8 pointer is the same pointer as the character data
   (must not be used on compact ASCII strings) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same pointer as the character data.
   Both sides use the macro argument; the expansion does not depend on any
   variable name at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
181  
/* Copy a run of characters while converting each one from `from_type` to
   `to_type` (both must be valid type names).  `begin` and `end` delimit
   the source characters and are treated as "const from_type *"; `to` is
   the destination buffer, treated as "to_type *".  Characters are
   converted four at a time, then the remaining tail one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                              \
        to_type *_cb_dst = (to_type *)(to);                           \
        const from_type *_cb_src = (const from_type *)(begin);        \
        const from_type *_cb_stop = (const from_type *)(end);         \
        Py_ssize_t _cb_count = (_cb_stop) - (_cb_src);                \
        const from_type *_cb_block_stop =                             \
            _cb_src + _Py_SIZE_ROUND_DOWN(_cb_count, 4);              \
        for (; _cb_src < (_cb_block_stop);                            \
               _cb_src += 4, _cb_dst += 4) {                          \
            _cb_dst[0] = (to_type) _cb_src[0];                        \
            _cb_dst[1] = (to_type) _cb_src[1];                        \
            _cb_dst[2] = (to_type) _cb_src[2];                        \
            _cb_dst[3] = (to_type) _cb_src[3];                        \
        }                                                             \
        for (; _cb_src < (_cb_stop); ++_cb_src, ++_cb_dst) {          \
            *_cb_dst = (to_type) *_cb_src;                            \
        }                                                             \
    } while (0)
205  
206  #ifdef MS_WINDOWS
207     /* On Windows, overallocate by 50% is the best factor */
208  #  define OVERALLOCATE_FACTOR 2
209  #else
210     /* On Linux, overallocate by 25% is the best factor */
211  #  define OVERALLOCATE_FACTOR 4
212  #endif
213  
214  /* bpo-40521: Interned strings are shared by all interpreters. */
215  #ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
216  #  define INTERNED_STRINGS
217  #endif
218  
219  /* This dictionary holds all interned unicode strings.  Note that references
220     to strings in this dictionary are *not* counted in the string's ob_refcnt.
221     When the interned string reaches a refcnt of 0 the string deallocation
222     function will delete the reference from this dictionary.
223  
224     Another way to look at this is to say that the actual reference
225     count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
226  */
227  #ifdef INTERNED_STRINGS
228  static PyObject *interned = NULL;
229  #endif
230  
231  static struct _Py_unicode_state*
get_unicode_state(void)232  get_unicode_state(void)
233  {
234      PyInterpreterState *interp = _PyInterpreterState_GET();
235      return &interp->unicode;
236  }
237  
238  
239  // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)240  static inline PyObject* unicode_get_empty(void)
241  {
242      struct _Py_unicode_state *state = get_unicode_state();
243      // unicode_get_empty() must not be called before _PyUnicode_Init()
244      // or after _PyUnicode_Fini()
245      assert(state->empty_string != NULL);
246      return state->empty_string;
247  }
248  
249  
250  // Return a strong reference to the empty string singleton.
unicode_new_empty(void)251  static inline PyObject* unicode_new_empty(void)
252  {
253      PyObject *empty = unicode_get_empty();
254      Py_INCREF(empty);
255      return empty;
256  }
257  
258  #define _Py_RETURN_UNICODE_EMPTY()   \
259      do {                             \
260          return unicode_new_empty();  \
261      } while (0)
262  
263  static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)264  unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265               Py_ssize_t start, Py_ssize_t length)
266  {
267      assert(0 <= start);
268      assert(kind != PyUnicode_WCHAR_KIND);
269      switch (kind) {
270      case PyUnicode_1BYTE_KIND: {
271          assert(value <= 0xff);
272          Py_UCS1 ch = (unsigned char)value;
273          Py_UCS1 *to = (Py_UCS1 *)data + start;
274          memset(to, ch, length);
275          break;
276      }
277      case PyUnicode_2BYTE_KIND: {
278          assert(value <= 0xffff);
279          Py_UCS2 ch = (Py_UCS2)value;
280          Py_UCS2 *to = (Py_UCS2 *)data + start;
281          const Py_UCS2 *end = to + length;
282          for (; to < end; ++to) *to = ch;
283          break;
284      }
285      case PyUnicode_4BYTE_KIND: {
286          assert(value <= MAX_UNICODE);
287          Py_UCS4 ch = value;
288          Py_UCS4 * to = (Py_UCS4 *)data + start;
289          const Py_UCS4 *end = to + length;
290          for (; to < end; ++to) *to = ch;
291          break;
292      }
293      default: Py_UNREACHABLE();
294      }
295  }
296  
297  
298  /* Forward declaration */
299  static inline int
300  _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
301  static inline void
302  _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
303  static PyObject *
304  unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
305                      const char *errors);
306  static PyObject *
307  unicode_decode_utf8(const char *s, Py_ssize_t size,
308                      _Py_error_handler error_handler, const char *errors,
309                      Py_ssize_t *consumed);
310  
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (16 per row); an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
341  
342  /* forward */
343  static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
344  static PyObject* get_latin1_char(unsigned char ch);
345  static int unicode_modifiable(PyObject *unicode);
346  
347  
348  static PyObject *
349  _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
350  static PyObject *
351  _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
352  static PyObject *
353  _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
354  
355  static PyObject *
356  unicode_encode_call_errorhandler(const char *errors,
357         PyObject **errorHandler,const char *encoding, const char *reason,
358         PyObject *unicode, PyObject **exceptionObject,
359         Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
360  
361  static void
362  raise_encode_exception(PyObject **exceptionObject,
363                         const char *encoding,
364                         PyObject *unicode,
365                         Py_ssize_t startpos, Py_ssize_t endpos,
366                         const char *reason);
367  
/* Line break lookup for ASCII code points (16 entries per row);
   an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
395  
396  static int convert_uc(PyObject *obj, void *addr);
397  
398  #include "clinic/unicodeobject.c.h"
399  
400  _Py_error_handler
_Py_GetErrorHandler(const char * errors)401  _Py_GetErrorHandler(const char *errors)
402  {
403      if (errors == NULL || strcmp(errors, "strict") == 0) {
404          return _Py_ERROR_STRICT;
405      }
406      if (strcmp(errors, "surrogateescape") == 0) {
407          return _Py_ERROR_SURROGATEESCAPE;
408      }
409      if (strcmp(errors, "replace") == 0) {
410          return _Py_ERROR_REPLACE;
411      }
412      if (strcmp(errors, "ignore") == 0) {
413          return _Py_ERROR_IGNORE;
414      }
415      if (strcmp(errors, "backslashreplace") == 0) {
416          return _Py_ERROR_BACKSLASHREPLACE;
417      }
418      if (strcmp(errors, "surrogatepass") == 0) {
419          return _Py_ERROR_SURROGATEPASS;
420      }
421      if (strcmp(errors, "xmlcharrefreplace") == 0) {
422          return _Py_ERROR_XMLCHARREFREPLACE;
423      }
424      return _Py_ERROR_OTHER;
425  }
426  
427  
428  static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)429  get_error_handler_wide(const wchar_t *errors)
430  {
431      if (errors == NULL || wcscmp(errors, L"strict") == 0) {
432          return _Py_ERROR_STRICT;
433      }
434      if (wcscmp(errors, L"surrogateescape") == 0) {
435          return _Py_ERROR_SURROGATEESCAPE;
436      }
437      if (wcscmp(errors, L"replace") == 0) {
438          return _Py_ERROR_REPLACE;
439      }
440      if (wcscmp(errors, L"ignore") == 0) {
441          return _Py_ERROR_IGNORE;
442      }
443      if (wcscmp(errors, L"backslashreplace") == 0) {
444          return _Py_ERROR_BACKSLASHREPLACE;
445      }
446      if (wcscmp(errors, L"surrogatepass") == 0) {
447          return _Py_ERROR_SURROGATEPASS;
448      }
449      if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
450          return _Py_ERROR_XMLCHARREFREPLACE;
451      }
452      return _Py_ERROR_OTHER;
453  }
454  
455  
456  static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)457  unicode_check_encoding_errors(const char *encoding, const char *errors)
458  {
459      if (encoding == NULL && errors == NULL) {
460          return 0;
461      }
462  
463      PyInterpreterState *interp = _PyInterpreterState_GET();
464  #ifndef Py_DEBUG
465      /* In release mode, only check in development mode (-X dev) */
466      if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
467          return 0;
468      }
469  #else
470      /* Always check in debug mode */
471  #endif
472  
473      /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
474         codec registry is ready: before_PyUnicode_InitEncodings() is called. */
475      if (!interp->unicode.fs_codec.encoding) {
476          return 0;
477      }
478  
479      /* Disable checks during Python finalization. For example, it allows to
480         call _PyObject_Dump() during finalization for debugging purpose. */
481      if (interp->finalizing) {
482          return 0;
483      }
484  
485      if (encoding != NULL) {
486          PyObject *handler = _PyCodec_Lookup(encoding);
487          if (handler == NULL) {
488              return -1;
489          }
490          Py_DECREF(handler);
491      }
492  
493      if (errors != NULL) {
494          PyObject *handler = PyCodec_LookupError(errors);
495          if (handler == NULL) {
496              return -1;
497          }
498          Py_DECREF(handler);
499      }
500      return 0;
501  }
502  
503  
/* Check the internal invariants of the str object `op`.

   Each invariant is tested with the local CHECK() macro, which calls
   _PyObject_ASSERT_FAILED_MSG() with the stringified expression when the
   expression is false.  If `check_content` is non-zero (and the string is
   not of the wchar kind), the characters are scanned as well to verify
   that the storage kind matches the largest character: an O(n) operation.

   Returns 1 after all checks have been executed. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII string: always 1-byte kind and ready */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters are located right after
               the PyCompactUnicodeObject header */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* not compact: the characters are reached through data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* wchar kind: not ready, only wstr is set */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                /* ready, not compact */
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                /* utf8 is the character data itself exactly when the
                   string is ASCII */
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr is the character data itself exactly when the kind has the
           same width as wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        /* find the largest character of the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall in the range of the kind */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the slot just past the last character must hold 0 */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
623  
624  
625  static PyObject*
unicode_result_wchar(PyObject * unicode)626  unicode_result_wchar(PyObject *unicode)
627  {
628  #ifndef Py_DEBUG
629      Py_ssize_t len;
630  
631      len = _PyUnicode_WSTR_LENGTH(unicode);
632      if (len == 0) {
633          Py_DECREF(unicode);
634          _Py_RETURN_UNICODE_EMPTY();
635      }
636  
637      if (len == 1) {
638          wchar_t ch = _PyUnicode_WSTR(unicode)[0];
639          if ((Py_UCS4)ch < 256) {
640              Py_DECREF(unicode);
641              return get_latin1_char((unsigned char)ch);
642          }
643      }
644  
645      if (_PyUnicode_Ready(unicode) < 0) {
646          Py_DECREF(unicode);
647          return NULL;
648      }
649  #else
650      assert(Py_REFCNT(unicode) == 1);
651  
652      /* don't make the result ready in debug mode to ensure that the caller
653         makes the string ready before using it */
654      assert(_PyUnicode_CheckConsistency(unicode, 1));
655  #endif
656      return unicode;
657  }
658  
659  static PyObject*
unicode_result_ready(PyObject * unicode)660  unicode_result_ready(PyObject *unicode)
661  {
662      Py_ssize_t length;
663  
664      length = PyUnicode_GET_LENGTH(unicode);
665      if (length == 0) {
666          PyObject *empty = unicode_get_empty();
667          if (unicode != empty) {
668              Py_DECREF(unicode);
669              Py_INCREF(empty);
670          }
671          return empty;
672      }
673  
674      if (length == 1) {
675          int kind = PyUnicode_KIND(unicode);
676          if (kind == PyUnicode_1BYTE_KIND) {
677              const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
678              Py_UCS1 ch = data[0];
679              struct _Py_unicode_state *state = get_unicode_state();
680              PyObject *latin1_char = state->latin1[ch];
681              if (latin1_char != NULL) {
682                  if (unicode != latin1_char) {
683                      Py_INCREF(latin1_char);
684                      Py_DECREF(unicode);
685                  }
686                  return latin1_char;
687              }
688              else {
689                  assert(_PyUnicode_CheckConsistency(unicode, 1));
690                  Py_INCREF(unicode);
691                  state->latin1[ch] = unicode;
692                  return unicode;
693              }
694          }
695          else {
696              assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
697          }
698      }
699  
700      assert(_PyUnicode_CheckConsistency(unicode, 1));
701      return unicode;
702  }
703  
704  static PyObject*
unicode_result(PyObject * unicode)705  unicode_result(PyObject *unicode)
706  {
707      assert(_PyUnicode_CHECK(unicode));
708      if (PyUnicode_IS_READY(unicode))
709          return unicode_result_ready(unicode);
710      else
711          return unicode_result_wchar(unicode);
712  }
713  
714  static PyObject*
unicode_result_unchanged(PyObject * unicode)715  unicode_result_unchanged(PyObject *unicode)
716  {
717      if (PyUnicode_CheckExact(unicode)) {
718          if (PyUnicode_READY(unicode) == -1)
719              return NULL;
720          Py_INCREF(unicode);
721          return unicode;
722      }
723      else
724          /* Subtype -- return genuine unicode string with the same value. */
725          return _PyUnicode_Copy(unicode);
726  }
727  
728  /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
729     ASCII, Latin1, UTF-8, etc. */
730  static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)731  backslashreplace(_PyBytesWriter *writer, char *str,
732                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
733  {
734      Py_ssize_t size, i;
735      Py_UCS4 ch;
736      enum PyUnicode_Kind kind;
737      const void *data;
738  
739      assert(PyUnicode_IS_READY(unicode));
740      kind = PyUnicode_KIND(unicode);
741      data = PyUnicode_DATA(unicode);
742  
743      size = 0;
744      /* determine replacement size */
745      for (i = collstart; i < collend; ++i) {
746          Py_ssize_t incr;
747  
748          ch = PyUnicode_READ(kind, data, i);
749          if (ch < 0x100)
750              incr = 2+2;
751          else if (ch < 0x10000)
752              incr = 2+4;
753          else {
754              assert(ch <= MAX_UNICODE);
755              incr = 2+8;
756          }
757          if (size > PY_SSIZE_T_MAX - incr) {
758              PyErr_SetString(PyExc_OverflowError,
759                              "encoded result is too long for a Python string");
760              return NULL;
761          }
762          size += incr;
763      }
764  
765      str = _PyBytesWriter_Prepare(writer, str, size);
766      if (str == NULL)
767          return NULL;
768  
769      /* generate replacement */
770      for (i = collstart; i < collend; ++i) {
771          ch = PyUnicode_READ(kind, data, i);
772          *str++ = '\\';
773          if (ch >= 0x00010000) {
774              *str++ = 'U';
775              *str++ = Py_hexdigits[(ch>>28)&0xf];
776              *str++ = Py_hexdigits[(ch>>24)&0xf];
777              *str++ = Py_hexdigits[(ch>>20)&0xf];
778              *str++ = Py_hexdigits[(ch>>16)&0xf];
779              *str++ = Py_hexdigits[(ch>>12)&0xf];
780              *str++ = Py_hexdigits[(ch>>8)&0xf];
781          }
782          else if (ch >= 0x100) {
783              *str++ = 'u';
784              *str++ = Py_hexdigits[(ch>>12)&0xf];
785              *str++ = Py_hexdigits[(ch>>8)&0xf];
786          }
787          else
788              *str++ = 'x';
789          *str++ = Py_hexdigits[(ch>>4)&0xf];
790          *str++ = Py_hexdigits[ch&0xf];
791      }
792      return str;
793  }
794  
795  /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
796     ASCII, Latin1, UTF-8, etc. */
797  static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)798  xmlcharrefreplace(_PyBytesWriter *writer, char *str,
799                    PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
800  {
801      Py_ssize_t size, i;
802      Py_UCS4 ch;
803      enum PyUnicode_Kind kind;
804      const void *data;
805  
806      assert(PyUnicode_IS_READY(unicode));
807      kind = PyUnicode_KIND(unicode);
808      data = PyUnicode_DATA(unicode);
809  
810      size = 0;
811      /* determine replacement size */
812      for (i = collstart; i < collend; ++i) {
813          Py_ssize_t incr;
814  
815          ch = PyUnicode_READ(kind, data, i);
816          if (ch < 10)
817              incr = 2+1+1;
818          else if (ch < 100)
819              incr = 2+2+1;
820          else if (ch < 1000)
821              incr = 2+3+1;
822          else if (ch < 10000)
823              incr = 2+4+1;
824          else if (ch < 100000)
825              incr = 2+5+1;
826          else if (ch < 1000000)
827              incr = 2+6+1;
828          else {
829              assert(ch <= MAX_UNICODE);
830              incr = 2+7+1;
831          }
832          if (size > PY_SSIZE_T_MAX - incr) {
833              PyErr_SetString(PyExc_OverflowError,
834                              "encoded result is too long for a Python string");
835              return NULL;
836          }
837          size += incr;
838      }
839  
840      str = _PyBytesWriter_Prepare(writer, str, size);
841      if (str == NULL)
842          return NULL;
843  
844      /* generate replacement */
845      for (i = collstart; i < collend; ++i) {
846          size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
847          if (size < 0) {
848              return NULL;
849          }
850          str += size;
851      }
852      return str;
853  }
854  
855  /* --- Bloom Filters ----------------------------------------------------- */
856  
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
860  
861  /* the linebreak mask is set up by _PyUnicode_Init() below */
862  
/* Number of bits used in a bloom mask: the largest of 128, 64 or 32 that
   does not exceed LONG_BIT.  The build fails if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Initialised with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero
   for any character until a narrower mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch (ch modulo BLOOM_WIDTH) is set in
   mask. */
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test.  Characters below 128 are looked up directly in
   ascii_linebreak[]; other characters are first screened through
   bloom_linebreak and only then passed to Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
882  
883  static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)884  make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
885  {
886  #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
887      do {                                               \
888          TYPE *data = (TYPE *)PTR;                      \
889          TYPE *end = data + LEN;                        \
890          Py_UCS4 ch;                                    \
891          for (; data != end; data++) {                  \
892              ch = *data;                                \
893              MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894          }                                              \
895          break;                                         \
896      } while (0)
897  
898      /* calculate simple bloom-style bitmask for a given unicode string */
899  
900      BLOOM_MASK mask;
901  
902      mask = 0;
903      switch (kind) {
904      case PyUnicode_1BYTE_KIND:
905          BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906          break;
907      case PyUnicode_2BYTE_KIND:
908          BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909          break;
910      case PyUnicode_4BYTE_KIND:
911          BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912          break;
913      default:
914          Py_UNREACHABLE();
915      }
916      return mask;
917  
918  #undef BLOOM_UPDATE
919  }
920  
921  static int
ensure_unicode(PyObject * obj)922  ensure_unicode(PyObject *obj)
923  {
924      if (!PyUnicode_Check(obj)) {
925          PyErr_Format(PyExc_TypeError,
926                       "must be str, not %.100s",
927                       Py_TYPE(obj)->tp_name);
928          return -1;
929      }
930      return PyUnicode_READY(obj);
931  }
932  
933  /* Compilation of templated routines */
934  
935  #define STRINGLIB_GET_EMPTY() unicode_get_empty()
936  
937  #include "stringlib/asciilib.h"
938  #include "stringlib/fastsearch.h"
939  #include "stringlib/partition.h"
940  #include "stringlib/split.h"
941  #include "stringlib/count.h"
942  #include "stringlib/find.h"
943  #include "stringlib/find_max_char.h"
944  #include "stringlib/undef.h"
945  
946  #include "stringlib/ucs1lib.h"
947  #include "stringlib/fastsearch.h"
948  #include "stringlib/partition.h"
949  #include "stringlib/split.h"
950  #include "stringlib/count.h"
951  #include "stringlib/find.h"
952  #include "stringlib/replace.h"
953  #include "stringlib/find_max_char.h"
954  #include "stringlib/undef.h"
955  
956  #include "stringlib/ucs2lib.h"
957  #include "stringlib/fastsearch.h"
958  #include "stringlib/partition.h"
959  #include "stringlib/split.h"
960  #include "stringlib/count.h"
961  #include "stringlib/find.h"
962  #include "stringlib/replace.h"
963  #include "stringlib/find_max_char.h"
964  #include "stringlib/undef.h"
965  
966  #include "stringlib/ucs4lib.h"
967  #include "stringlib/fastsearch.h"
968  #include "stringlib/partition.h"
969  #include "stringlib/split.h"
970  #include "stringlib/count.h"
971  #include "stringlib/find.h"
972  #include "stringlib/replace.h"
973  #include "stringlib/find_max_char.h"
974  #include "stringlib/undef.h"
975  
976  _Py_COMP_DIAG_PUSH
977  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
978  #include "stringlib/unicodedefs.h"
979  #include "stringlib/fastsearch.h"
980  #include "stringlib/count.h"
981  #include "stringlib/find.h"
982  #include "stringlib/undef.h"
983  _Py_COMP_DIAG_POP
984  
985  #undef STRINGLIB_GET_EMPTY
986  
987  /* --- Unicode Object ----------------------------------------------------- */
988  
989  static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)990  findchar(const void *s, int kind,
991           Py_ssize_t size, Py_UCS4 ch,
992           int direction)
993  {
994      switch (kind) {
995      case PyUnicode_1BYTE_KIND:
996          if ((Py_UCS1) ch != ch)
997              return -1;
998          if (direction > 0)
999              return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000          else
1001              return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002      case PyUnicode_2BYTE_KIND:
1003          if ((Py_UCS2) ch != ch)
1004              return -1;
1005          if (direction > 0)
1006              return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007          else
1008              return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009      case PyUnicode_4BYTE_KIND:
1010          if (direction > 0)
1011              return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012          else
1013              return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014      default:
1015          Py_UNREACHABLE();
1016      }
1017  }
1018  
1019  #ifdef Py_DEBUG
1020  /* Fill the data of a Unicode string with invalid characters to detect bugs
1021     earlier.
1022  
1023     _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1024     ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1025     invalid character in Unicode 6.0. */
1026  static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1027  unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1028  {
1029      int kind = PyUnicode_KIND(unicode);
1030      Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1031      Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1032      if (length <= old_length)
1033          return;
1034      memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1035  }
1036  #endif
1037  
/* Resize a compact string so that it holds `length` characters plus the
   null terminator.

   The whole object is reallocated with PyObject_Realloc(), so it may move:
   the caller must use the returned pointer instead of `unicode`.

   Returns the (possibly relocated) object, or NULL with MemoryError set if
   the requested size would overflow Py_ssize_t or the reallocation fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is used as the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Read before reallocating: the wstr pointer is stale afterwards. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Reject lengths for which struct_size + (length + 1) * char_size
       would overflow. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop the UTF-8 representation when _PyUnicode_HAS_UTF8_MEMORY()
       reports that the object holds one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* Undo the reference bookkeeping of the object before reallocating it;
       _Py_NewReference() is called again below on both outcomes. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* The original object is still valid: register it again. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the new location. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* A separate wstr buffer no longer matches the new length. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the null terminator after the last character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1108  
/* Resize a non-compact string in place to `length` characters.

   The object itself is not moved; only its data buffer and/or its wstr
   buffer are reallocated.  The caller must hold the only reference (this is
   asserted).

   Returns 0 on success, -1 with MemoryError set if the new size would
   overflow or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* The kind value is used as the size in bytes of one character. */
        char_size = PyUnicode_KIND(unicode);
        /* Remember the sharing state now: the pointers compared by these
           macros are replaced after the reallocation. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Reject lengths for which (length + 1) * char_size overflows. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Drop a UTF-8 buffer that is not shared with the data buffer. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point the representations that alias the data buffer. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the null terminator after the last character. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separate wstr buffer also has to be resized. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* Reached for a string that is not ready, or a ready string that owns
       a separate wstr buffer: resize the wstr buffer. */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1187  
1188  static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1189  resize_copy(PyObject *unicode, Py_ssize_t length)
1190  {
1191      Py_ssize_t copy_length;
1192      if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193          PyObject *copy;
1194  
1195          assert(PyUnicode_IS_READY(unicode));
1196  
1197          copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198          if (copy == NULL)
1199              return NULL;
1200  
1201          copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202          _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203          return copy;
1204      }
1205      else {
1206          PyObject *w;
1207  
1208          w = (PyObject*)_PyUnicode_New(length);
1209          if (w == NULL)
1210              return NULL;
1211          copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212          copy_length = Py_MIN(copy_length, length);
1213          memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214                    copy_length * sizeof(wchar_t));
1215          return w;
1216      }
1217  }
1218  
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
1222  
1223     XXX This allocator could further be enhanced by assuring that the
1224     free list never reduces its size below 1.
1225  
1226  */
1227  
1228  static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1229  _PyUnicode_New(Py_ssize_t length)
1230  {
1231      PyUnicodeObject *unicode;
1232      size_t new_size;
1233  
1234      /* Optimization for empty strings */
1235      if (length == 0) {
1236          return (PyUnicodeObject *)unicode_new_empty();
1237      }
1238  
1239      /* Ensure we won't overflow the size. */
1240      if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1241          return (PyUnicodeObject *)PyErr_NoMemory();
1242      }
1243      if (length < 0) {
1244          PyErr_SetString(PyExc_SystemError,
1245                          "Negative size passed to _PyUnicode_New");
1246          return NULL;
1247      }
1248  
1249      unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1250      if (unicode == NULL)
1251          return NULL;
1252      new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1253  
1254      _PyUnicode_WSTR_LENGTH(unicode) = length;
1255      _PyUnicode_HASH(unicode) = -1;
1256      _PyUnicode_STATE(unicode).interned = 0;
1257      _PyUnicode_STATE(unicode).kind = 0;
1258      _PyUnicode_STATE(unicode).compact = 0;
1259      _PyUnicode_STATE(unicode).ready = 0;
1260      _PyUnicode_STATE(unicode).ascii = 0;
1261      _PyUnicode_DATA_ANY(unicode) = NULL;
1262      _PyUnicode_LENGTH(unicode) = 0;
1263      _PyUnicode_UTF8(unicode) = NULL;
1264      _PyUnicode_UTF8_LENGTH(unicode) = 0;
1265  
1266      _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1267      if (!_PyUnicode_WSTR(unicode)) {
1268          Py_DECREF(unicode);
1269          PyErr_NoMemory();
1270          return NULL;
1271      }
1272  
1273      /* Initialize the first element to guard against cases where
1274       * the caller fails before initializing str -- unicode_resize()
1275       * reads str[0], and the Keep-Alive optimization can keep memory
1276       * allocated for str alive across a call to unicode_dealloc(unicode).
1277       * We don't want unicode_resize to read uninitialized memory in
1278       * that case.
1279       */
1280      _PyUnicode_WSTR(unicode)[0] = 0;
1281      _PyUnicode_WSTR(unicode)[length] = 0;
1282  
1283      assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1284      return unicode;
1285  }
1286  
1287  static const char*
unicode_kind_name(PyObject * unicode)1288  unicode_kind_name(PyObject *unicode)
1289  {
1290      /* don't check consistency: unicode_kind_name() is called from
1291         _PyUnicode_Dump() */
1292      if (!PyUnicode_IS_COMPACT(unicode))
1293      {
1294          if (!PyUnicode_IS_READY(unicode))
1295              return "wstr";
1296          switch (PyUnicode_KIND(unicode))
1297          {
1298          case PyUnicode_1BYTE_KIND:
1299              if (PyUnicode_IS_ASCII(unicode))
1300                  return "legacy ascii";
1301              else
1302                  return "legacy latin1";
1303          case PyUnicode_2BYTE_KIND:
1304              return "legacy UCS2";
1305          case PyUnicode_4BYTE_KIND:
1306              return "legacy UCS4";
1307          default:
1308              return "<legacy invalid kind>";
1309          }
1310      }
1311      assert(PyUnicode_IS_READY(unicode));
1312      switch (PyUnicode_KIND(unicode)) {
1313      case PyUnicode_1BYTE_KIND:
1314          if (PyUnicode_IS_ASCII(unicode))
1315              return "ascii";
1316          else
1317              return "latin1";
1318      case PyUnicode_2BYTE_KIND:
1319          return "UCS2";
1320      case PyUnicode_4BYTE_KIND:
1321          return "UCS4";
1322      default:
1323          return "<invalid compact kind>";
1324      }
1325  }
1326  
1327  #ifdef Py_DEBUG
1328  /* Functions wrapping macros for use in debugger */
/* Debugger helper: function wrapper around the PyUnicode_UTF8() macro. */
const char *_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1333  
/* Debugger helper: function wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
const void *_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
_PyUnicode_data(void * unicode_raw)1338  const void *_PyUnicode_data(void *unicode_raw) {
1339      PyObject *unicode = _PyObject_CAST(unicode_raw);
1340      printf("obj %p\n", (void*)unicode);
1341      printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1342      printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1343      printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1344      printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1345      printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1346      return PyUnicode_DATA(unicode);
1347  }
1348  
1349  void
_PyUnicode_Dump(PyObject * op)1350  _PyUnicode_Dump(PyObject *op)
1351  {
1352      PyASCIIObject *ascii = (PyASCIIObject *)op;
1353      PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1354      PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1355      const void *data;
1356  
1357      if (ascii->state.compact)
1358      {
1359          if (ascii->state.ascii)
1360              data = (ascii + 1);
1361          else
1362              data = (compact + 1);
1363      }
1364      else
1365          data = unicode->data.any;
1366      printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1367  
1368      if (ascii->wstr == data)
1369          printf("shared ");
1370      printf("wstr=%p", (void *)ascii->wstr);
1371  
1372      if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1373          printf(" (%zu), ", compact->wstr_length);
1374          if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1375              printf("shared ");
1376          }
1377          printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1378      }
1379      printf(", data=%p\n", data);
1380  }
1381  #endif
1382  
1383  static int
unicode_create_empty_string_singleton(struct _Py_unicode_state * state)1384  unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1385  {
1386      // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1387      // optimized to always use state->empty_string without having to check if
1388      // it is NULL or not.
1389      PyObject *empty = PyUnicode_New(1, 0);
1390      if (empty == NULL) {
1391          return -1;
1392      }
1393      PyUnicode_1BYTE_DATA(empty)[0] = 0;
1394      _PyUnicode_LENGTH(empty) = 0;
1395      assert(_PyUnicode_CheckConsistency(empty, 1));
1396  
1397      assert(state->empty_string == NULL);
1398      state->empty_string = empty;
1399      return 0;
1400  }
1401  
1402  
/* Create a new compact string object with room for `size` characters whose
   code points do not exceed `maxchar`.

   The storage kind is chosen from maxchar: 1 byte per character below 256
   (flagged as ASCII below 128), 2 bytes below 65536, 4 bytes otherwise.
   The header, the characters and the null terminator are allocated as a
   single memory block.  Only the terminator is initialised here (debug
   builds also fill the characters with 0xff bytes).

   A size of 0 returns the object given by unicode_new_empty().  Returns
   NULL with SystemError set if maxchar is greater than MAX_UNICODE or size
   is negative, and NULL with MemoryError set if the size is too large or
   the allocation fails. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    /* Optimization for empty strings */
    if (size == 0) {
        return unicode_new_empty();
    }

    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    /* Select the kind, the character size and the header size. */
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        /* ASCII strings use the smaller PyASCIIObject header. */
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        /* When wchar_t has the same width as a character, the wstr pointer
           is set to the character data below. */
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
    if (obj == NULL) {
        return PyErr_NoMemory();
    }
    _PyObject_Init(obj, &PyUnicode_Type);

    unicode = (PyCompactUnicodeObject *)obj;
    /* The characters start right after the header in use. */
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* Only the fields of the PyASCIIObject header are set here. */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            /* wstr aliases the character data */
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
#ifdef Py_DEBUG
    /* Mark all characters as invalid until the caller writes them. */
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1514  
1515  #if SIZEOF_WCHAR_T == 2
1516  /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1517     will decode surrogate pairs, the other conversions are implemented as macros
1518     for efficiency.
1519  
1520     This function assumes that unicode can hold one more code point than wstr
1521     characters for a terminating null character. */
1522  static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1523  unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1524                                PyObject *unicode)
1525  {
1526      const wchar_t *iter;
1527      Py_UCS4 *ucs4_out;
1528  
1529      assert(unicode != NULL);
1530      assert(_PyUnicode_CHECK(unicode));
1531      assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1532      ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1533  
1534      for (iter = begin; iter < end; ) {
1535          assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1536                             _PyUnicode_GET_LENGTH(unicode)));
1537          if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1538              && (iter+1) < end
1539              && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1540          {
1541              *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1542              iter += 2;
1543          }
1544          else {
1545              *ucs4_out++ = *iter;
1546              iter++;
1547          }
1548      }
1549      assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1550                          _PyUnicode_GET_LENGTH(unicode)));
1551  
1552  }
1553  #endif
1554  
1555  static int
unicode_check_modifiable(PyObject * unicode)1556  unicode_check_modifiable(PyObject *unicode)
1557  {
1558      if (!unicode_modifiable(unicode)) {
1559          PyErr_SetString(PyExc_SystemError,
1560                          "Cannot modify a string currently used");
1561          return -1;
1562      }
1563      return 0;
1564  }
1565  
1566  static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1567  _copy_characters(PyObject *to, Py_ssize_t to_start,
1568                   PyObject *from, Py_ssize_t from_start,
1569                   Py_ssize_t how_many, int check_maxchar)
1570  {
1571      unsigned int from_kind, to_kind;
1572      const void *from_data;
1573      void *to_data;
1574  
1575      assert(0 <= how_many);
1576      assert(0 <= from_start);
1577      assert(0 <= to_start);
1578      assert(PyUnicode_Check(from));
1579      assert(PyUnicode_IS_READY(from));
1580      assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1581  
1582      assert(PyUnicode_Check(to));
1583      assert(PyUnicode_IS_READY(to));
1584      assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1585  
1586      if (how_many == 0)
1587          return 0;
1588  
1589      from_kind = PyUnicode_KIND(from);
1590      from_data = PyUnicode_DATA(from);
1591      to_kind = PyUnicode_KIND(to);
1592      to_data = PyUnicode_DATA(to);
1593  
1594  #ifdef Py_DEBUG
1595      if (!check_maxchar
1596          && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1597      {
1598          Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1599          Py_UCS4 ch;
1600          Py_ssize_t i;
1601          for (i=0; i < how_many; i++) {
1602              ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1603              assert(ch <= to_maxchar);
1604          }
1605      }
1606  #endif
1607  
1608      if (from_kind == to_kind) {
1609          if (check_maxchar
1610              && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1611          {
1612              /* Writing Latin-1 characters into an ASCII string requires to
1613                 check that all written characters are pure ASCII */
1614              Py_UCS4 max_char;
1615              max_char = ucs1lib_find_max_char(from_data,
1616                                               (const Py_UCS1*)from_data + how_many);
1617              if (max_char >= 128)
1618                  return -1;
1619          }
1620          memcpy((char*)to_data + to_kind * to_start,
1621                    (const char*)from_data + from_kind * from_start,
1622                    to_kind * how_many);
1623      }
1624      else if (from_kind == PyUnicode_1BYTE_KIND
1625               && to_kind == PyUnicode_2BYTE_KIND)
1626      {
1627          _PyUnicode_CONVERT_BYTES(
1628              Py_UCS1, Py_UCS2,
1629              PyUnicode_1BYTE_DATA(from) + from_start,
1630              PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1631              PyUnicode_2BYTE_DATA(to) + to_start
1632              );
1633      }
1634      else if (from_kind == PyUnicode_1BYTE_KIND
1635               && to_kind == PyUnicode_4BYTE_KIND)
1636      {
1637          _PyUnicode_CONVERT_BYTES(
1638              Py_UCS1, Py_UCS4,
1639              PyUnicode_1BYTE_DATA(from) + from_start,
1640              PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1641              PyUnicode_4BYTE_DATA(to) + to_start
1642              );
1643      }
1644      else if (from_kind == PyUnicode_2BYTE_KIND
1645               && to_kind == PyUnicode_4BYTE_KIND)
1646      {
1647          _PyUnicode_CONVERT_BYTES(
1648              Py_UCS2, Py_UCS4,
1649              PyUnicode_2BYTE_DATA(from) + from_start,
1650              PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1651              PyUnicode_4BYTE_DATA(to) + to_start
1652              );
1653      }
1654      else {
1655          assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1656  
1657          if (!check_maxchar) {
1658              if (from_kind == PyUnicode_2BYTE_KIND
1659                  && to_kind == PyUnicode_1BYTE_KIND)
1660              {
1661                  _PyUnicode_CONVERT_BYTES(
1662                      Py_UCS2, Py_UCS1,
1663                      PyUnicode_2BYTE_DATA(from) + from_start,
1664                      PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1665                      PyUnicode_1BYTE_DATA(to) + to_start
1666                      );
1667              }
1668              else if (from_kind == PyUnicode_4BYTE_KIND
1669                       && to_kind == PyUnicode_1BYTE_KIND)
1670              {
1671                  _PyUnicode_CONVERT_BYTES(
1672                      Py_UCS4, Py_UCS1,
1673                      PyUnicode_4BYTE_DATA(from) + from_start,
1674                      PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1675                      PyUnicode_1BYTE_DATA(to) + to_start
1676                      );
1677              }
1678              else if (from_kind == PyUnicode_4BYTE_KIND
1679                       && to_kind == PyUnicode_2BYTE_KIND)
1680              {
1681                  _PyUnicode_CONVERT_BYTES(
1682                      Py_UCS4, Py_UCS2,
1683                      PyUnicode_4BYTE_DATA(from) + from_start,
1684                      PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1685                      PyUnicode_2BYTE_DATA(to) + to_start
1686                      );
1687              }
1688              else {
1689                  Py_UNREACHABLE();
1690              }
1691          }
1692          else {
1693              const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1694              Py_UCS4 ch;
1695              Py_ssize_t i;
1696  
1697              for (i=0; i < how_many; i++) {
1698                  ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1699                  if (ch > to_maxchar)
1700                      return -1;
1701                  PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1702              }
1703          }
1704      }
1705      return 0;
1706  }
1707  
1708  void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1709  _PyUnicode_FastCopyCharacters(
1710      PyObject *to, Py_ssize_t to_start,
1711      PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1712  {
1713      (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1714  }
1715  
1716  Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1717  PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1718                           PyObject *from, Py_ssize_t from_start,
1719                           Py_ssize_t how_many)
1720  {
1721      int err;
1722  
1723      if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1724          PyErr_BadInternalCall();
1725          return -1;
1726      }
1727  
1728      if (PyUnicode_READY(from) == -1)
1729          return -1;
1730      if (PyUnicode_READY(to) == -1)
1731          return -1;
1732  
1733      if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1734          PyErr_SetString(PyExc_IndexError, "string index out of range");
1735          return -1;
1736      }
1737      if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1738          PyErr_SetString(PyExc_IndexError, "string index out of range");
1739          return -1;
1740      }
1741      if (how_many < 0) {
1742          PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1743          return -1;
1744      }
1745      how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1746      if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1747          PyErr_Format(PyExc_SystemError,
1748                       "Cannot write %zi characters at %zi "
1749                       "in a string of %zi characters",
1750                       how_many, to_start, PyUnicode_GET_LENGTH(to));
1751          return -1;
1752      }
1753  
1754      if (how_many == 0)
1755          return 0;
1756  
1757      if (unicode_check_modifiable(to))
1758          return -1;
1759  
1760      err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1761      if (err) {
1762          PyErr_Format(PyExc_SystemError,
1763                       "Cannot copy %s characters "
1764                       "into a string of %s characters",
1765                       unicode_kind_name(from),
1766                       unicode_kind_name(to));
1767          return -1;
1768      }
1769      return how_many;
1770  }
1771  
1772  /* Find the maximum code point and count the number of surrogate pairs so a
1773     correct string length can be computed before converting a string to UCS4.
1774     This function counts single surrogates as a character and not as a pair.
1775  
1776     Return 0 on success, or -1 on error. */
1777  static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1778  find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1779                          Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1780  {
1781      const wchar_t *iter;
1782      Py_UCS4 ch;
1783  
1784      assert(num_surrogates != NULL && maxchar != NULL);
1785      *num_surrogates = 0;
1786      *maxchar = 0;
1787  
1788      for (iter = begin; iter < end; ) {
1789  #if SIZEOF_WCHAR_T == 2
1790          if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1791              && (iter+1) < end
1792              && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1793          {
1794              ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1795              ++(*num_surrogates);
1796              iter += 2;
1797          }
1798          else
1799  #endif
1800          {
1801              ch = *iter;
1802              iter++;
1803          }
1804          if (ch > *maxchar) {
1805              *maxchar = ch;
1806              if (*maxchar > MAX_UNICODE) {
1807                  PyErr_Format(PyExc_ValueError,
1808                               "character U+%x is not in range [U+0000; U+%x]",
1809                               ch, MAX_UNICODE);
1810                  return -1;
1811              }
1812          }
1813      }
1814      return 0;
1815  }
1816  
1817  int
_PyUnicode_Ready(PyObject * unicode)1818  _PyUnicode_Ready(PyObject *unicode)
1819  {
1820      wchar_t *end;
1821      Py_UCS4 maxchar = 0;
1822      Py_ssize_t num_surrogates;
1823  #if SIZEOF_WCHAR_T == 2
1824      Py_ssize_t length_wo_surrogates;
1825  #endif
1826  
1827      /* _PyUnicode_Ready() is only intended for old-style API usage where
1828         strings were created using _PyObject_New() and where no canonical
1829         representation (the str field) has been set yet aka strings
1830         which are not yet ready. */
1831      assert(_PyUnicode_CHECK(unicode));
1832      assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1833      assert(_PyUnicode_WSTR(unicode) != NULL);
1834      assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1835      assert(_PyUnicode_UTF8(unicode) == NULL);
1836      /* Actually, it should neither be interned nor be anything else: */
1837      assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1838  
1839      end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1840      if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1841                                  &maxchar, &num_surrogates) == -1)
1842          return -1;
1843  
1844      if (maxchar < 256) {
1845          _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1846          if (!_PyUnicode_DATA_ANY(unicode)) {
1847              PyErr_NoMemory();
1848              return -1;
1849          }
1850          _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1851                                  _PyUnicode_WSTR(unicode), end,
1852                                  PyUnicode_1BYTE_DATA(unicode));
1853          PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1854          _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855          _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1856          if (maxchar < 128) {
1857              _PyUnicode_STATE(unicode).ascii = 1;
1858              _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1859              _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860          }
1861          else {
1862              _PyUnicode_STATE(unicode).ascii = 0;
1863              _PyUnicode_UTF8(unicode) = NULL;
1864              _PyUnicode_UTF8_LENGTH(unicode) = 0;
1865          }
1866          PyObject_Free(_PyUnicode_WSTR(unicode));
1867          _PyUnicode_WSTR(unicode) = NULL;
1868          _PyUnicode_WSTR_LENGTH(unicode) = 0;
1869      }
1870      /* In this case we might have to convert down from 4-byte native
1871         wchar_t to 2-byte unicode. */
1872      else if (maxchar < 65536) {
1873          assert(num_surrogates == 0 &&
1874                 "FindMaxCharAndNumSurrogatePairs() messed up");
1875  
1876  #if SIZEOF_WCHAR_T == 2
1877          /* We can share representations and are done. */
1878          _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1879          PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880          _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881          _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882          _PyUnicode_UTF8(unicode) = NULL;
1883          _PyUnicode_UTF8_LENGTH(unicode) = 0;
1884  #else
1885          /* sizeof(wchar_t) == 4 */
1886          _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1887              2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1888          if (!_PyUnicode_DATA_ANY(unicode)) {
1889              PyErr_NoMemory();
1890              return -1;
1891          }
1892          _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1893                                  _PyUnicode_WSTR(unicode), end,
1894                                  PyUnicode_2BYTE_DATA(unicode));
1895          PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1896          _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897          _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1898          _PyUnicode_UTF8(unicode) = NULL;
1899          _PyUnicode_UTF8_LENGTH(unicode) = 0;
1900          PyObject_Free(_PyUnicode_WSTR(unicode));
1901          _PyUnicode_WSTR(unicode) = NULL;
1902          _PyUnicode_WSTR_LENGTH(unicode) = 0;
1903  #endif
1904      }
1905      /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1906      else {
1907  #if SIZEOF_WCHAR_T == 2
1908          /* in case the native representation is 2-bytes, we need to allocate a
1909             new normalized 4-byte version. */
1910          length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1911          if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1912              PyErr_NoMemory();
1913              return -1;
1914          }
1915          _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1916          if (!_PyUnicode_DATA_ANY(unicode)) {
1917              PyErr_NoMemory();
1918              return -1;
1919          }
1920          _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1921          _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922          _PyUnicode_UTF8(unicode) = NULL;
1923          _PyUnicode_UTF8_LENGTH(unicode) = 0;
1924          /* unicode_convert_wchar_to_ucs4() requires a ready string */
1925          _PyUnicode_STATE(unicode).ready = 1;
1926          unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1927          PyObject_Free(_PyUnicode_WSTR(unicode));
1928          _PyUnicode_WSTR(unicode) = NULL;
1929          _PyUnicode_WSTR_LENGTH(unicode) = 0;
1930  #else
1931          assert(num_surrogates == 0);
1932  
1933          _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1934          _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1935          _PyUnicode_UTF8(unicode) = NULL;
1936          _PyUnicode_UTF8_LENGTH(unicode) = 0;
1937          _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1938  #endif
1939          PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1940      }
1941      _PyUnicode_STATE(unicode).ready = 1;
1942      assert(_PyUnicode_CheckConsistency(unicode, 1));
1943      return 0;
1944  }
1945  
1946  static void
unicode_dealloc(PyObject * unicode)1947  unicode_dealloc(PyObject *unicode)
1948  {
1949      switch (PyUnicode_CHECK_INTERNED(unicode)) {
1950      case SSTATE_NOT_INTERNED:
1951          break;
1952  
1953      case SSTATE_INTERNED_MORTAL:
1954      {
1955  #ifdef INTERNED_STRINGS
1956          /* Revive the dead object temporarily. PyDict_DelItem() removes two
1957             references (key and value) which were ignored by
1958             PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1959             to prevent calling unicode_dealloc() again. Adjust refcnt after
1960             PyDict_DelItem(). */
1961          assert(Py_REFCNT(unicode) == 0);
1962          Py_SET_REFCNT(unicode, 3);
1963          if (PyDict_DelItem(interned, unicode) != 0) {
1964              _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1965                                        NULL);
1966          }
1967          assert(Py_REFCNT(unicode) == 1);
1968          Py_SET_REFCNT(unicode, 0);
1969  #endif
1970          break;
1971      }
1972  
1973      case SSTATE_INTERNED_IMMORTAL:
1974          _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1975          break;
1976  
1977      default:
1978          Py_UNREACHABLE();
1979      }
1980  
1981      if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1982          PyObject_Free(_PyUnicode_WSTR(unicode));
1983      }
1984      if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1985          PyObject_Free(_PyUnicode_UTF8(unicode));
1986      }
1987      if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1988          PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1989      }
1990  
1991      Py_TYPE(unicode)->tp_free(unicode);
1992  }
1993  
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the per-interpreter empty
   string or one of the cached single-character latin1 strings, else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (unicode == state->empty_string) {
        return 1;
    }

    /* Only a canonical (non-wstr) string of length one can be a cached
       latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2013  
2014  static int
unicode_modifiable(PyObject * unicode)2015  unicode_modifiable(PyObject *unicode)
2016  {
2017      assert(_PyUnicode_CHECK(unicode));
2018      if (Py_REFCNT(unicode) != 1)
2019          return 0;
2020      if (_PyUnicode_HASH(unicode) != -1)
2021          return 0;
2022      if (PyUnicode_CHECK_INTERNED(unicode))
2023          return 0;
2024      if (!PyUnicode_CheckExact(unicode))
2025          return 0;
2026  #ifdef Py_DEBUG
2027      /* singleton refcount is greater than 1 */
2028      assert(!unicode_is_singleton(unicode));
2029  #endif
2030      return 1;
2031  }
2032  
2033  static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)2034  unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2035  {
2036      PyObject *unicode;
2037      Py_ssize_t old_length;
2038  
2039      assert(p_unicode != NULL);
2040      unicode = *p_unicode;
2041  
2042      assert(unicode != NULL);
2043      assert(PyUnicode_Check(unicode));
2044      assert(0 <= length);
2045  
2046      if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2047          old_length = PyUnicode_WSTR_LENGTH(unicode);
2048      else
2049          old_length = PyUnicode_GET_LENGTH(unicode);
2050      if (old_length == length)
2051          return 0;
2052  
2053      if (length == 0) {
2054          PyObject *empty = unicode_new_empty();
2055          Py_SETREF(*p_unicode, empty);
2056          return 0;
2057      }
2058  
2059      if (!unicode_modifiable(unicode)) {
2060          PyObject *copy = resize_copy(unicode, length);
2061          if (copy == NULL)
2062              return -1;
2063          Py_SETREF(*p_unicode, copy);
2064          return 0;
2065      }
2066  
2067      if (PyUnicode_IS_COMPACT(unicode)) {
2068          PyObject *new_unicode = resize_compact(unicode, length);
2069          if (new_unicode == NULL)
2070              return -1;
2071          *p_unicode = new_unicode;
2072          return 0;
2073      }
2074      return resize_inplace(unicode, length);
2075  }
2076  
2077  int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2078  PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2079  {
2080      PyObject *unicode;
2081      if (p_unicode == NULL) {
2082          PyErr_BadInternalCall();
2083          return -1;
2084      }
2085      unicode = *p_unicode;
2086      if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2087      {
2088          PyErr_BadInternalCall();
2089          return -1;
2090      }
2091      return unicode_resize(p_unicode, length);
2092  }
2093  
2094  /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2095  
2096     WARNING: The function doesn't copy the terminating null character and
2097     doesn't check the maximum character (may write a latin1 character in an
2098     ASCII string). */
2099  static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2100  unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2101                     const char *str, Py_ssize_t len)
2102  {
2103      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2104      const void *data = PyUnicode_DATA(unicode);
2105      const char *end = str + len;
2106  
2107      assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2108      switch (kind) {
2109      case PyUnicode_1BYTE_KIND: {
2110  #ifdef Py_DEBUG
2111          if (PyUnicode_IS_ASCII(unicode)) {
2112              Py_UCS4 maxchar = ucs1lib_find_max_char(
2113                  (const Py_UCS1*)str,
2114                  (const Py_UCS1*)str + len);
2115              assert(maxchar < 128);
2116          }
2117  #endif
2118          memcpy((char *) data + index, str, len);
2119          break;
2120      }
2121      case PyUnicode_2BYTE_KIND: {
2122          Py_UCS2 *start = (Py_UCS2 *)data + index;
2123          Py_UCS2 *ucs2 = start;
2124  
2125          for (; str < end; ++ucs2, ++str)
2126              *ucs2 = (Py_UCS2)*str;
2127  
2128          assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2129          break;
2130      }
2131      case PyUnicode_4BYTE_KIND: {
2132          Py_UCS4 *start = (Py_UCS4 *)data + index;
2133          Py_UCS4 *ucs4 = start;
2134  
2135          for (; str < end; ++ucs4, ++str)
2136              *ucs4 = (Py_UCS4)*str;
2137  
2138          assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2139          break;
2140      }
2141      default:
2142          Py_UNREACHABLE();
2143      }
2144  }
2145  
2146  static PyObject*
get_latin1_char(Py_UCS1 ch)2147  get_latin1_char(Py_UCS1 ch)
2148  {
2149      struct _Py_unicode_state *state = get_unicode_state();
2150  
2151      PyObject *unicode = state->latin1[ch];
2152      if (unicode) {
2153          Py_INCREF(unicode);
2154          return unicode;
2155      }
2156  
2157      unicode = PyUnicode_New(1, ch);
2158      if (!unicode) {
2159          return NULL;
2160      }
2161  
2162      PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2163      assert(_PyUnicode_CheckConsistency(unicode, 1));
2164  
2165      Py_INCREF(unicode);
2166      state->latin1[ch] = unicode;
2167      return unicode;
2168  }
2169  
2170  static PyObject*
unicode_char(Py_UCS4 ch)2171  unicode_char(Py_UCS4 ch)
2172  {
2173      PyObject *unicode;
2174  
2175      assert(ch <= MAX_UNICODE);
2176  
2177      if (ch < 256) {
2178          return get_latin1_char(ch);
2179      }
2180  
2181      unicode = PyUnicode_New(1, ch);
2182      if (unicode == NULL)
2183          return NULL;
2184  
2185      assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2186      if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2187          PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2188      } else {
2189          assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2190          PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2191      }
2192      assert(_PyUnicode_CheckConsistency(unicode, 1));
2193      return unicode;
2194  }
2195  
2196  PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2197  PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2198  {
2199      if (u == NULL) {
2200          if (size > 0) {
2201              if (PyErr_WarnEx(PyExc_DeprecationWarning,
2202                      "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2203                      "use PyUnicode_New() instead", 1) < 0) {
2204                  return NULL;
2205              }
2206          }
2207          return (PyObject*)_PyUnicode_New(size);
2208      }
2209  
2210      if (size < 0) {
2211          PyErr_BadInternalCall();
2212          return NULL;
2213      }
2214  
2215      return PyUnicode_FromWideChar(u, size);
2216  }
2217  
2218  PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2219  PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2220  {
2221      PyObject *unicode;
2222      Py_UCS4 maxchar = 0;
2223      Py_ssize_t num_surrogates;
2224  
2225      if (u == NULL && size != 0) {
2226          PyErr_BadInternalCall();
2227          return NULL;
2228      }
2229  
2230      if (size == -1) {
2231          size = wcslen(u);
2232      }
2233  
2234      /* If the Unicode data is known at construction time, we can apply
2235         some optimizations which share commonly used objects. */
2236  
2237      /* Optimization for empty strings */
2238      if (size == 0)
2239          _Py_RETURN_UNICODE_EMPTY();
2240  
2241  #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2242      /* Oracle Solaris uses non-Unicode internal wchar_t form for
2243         non-Unicode locales and hence needs conversion to UCS-4 first. */
2244      if (_Py_LocaleUsesNonUnicodeWchar()) {
2245          wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2246          if (!converted) {
2247              return NULL;
2248          }
2249          PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2250          PyMem_Free(converted);
2251          return unicode;
2252      }
2253  #endif
2254  
2255      /* Single character Unicode objects in the Latin-1 range are
2256         shared when using this constructor */
2257      if (size == 1 && (Py_UCS4)*u < 256)
2258          return get_latin1_char((unsigned char)*u);
2259  
2260      /* If not empty and not single character, copy the Unicode data
2261         into the new object */
2262      if (find_maxchar_surrogates(u, u + size,
2263                                  &maxchar, &num_surrogates) == -1)
2264          return NULL;
2265  
2266      unicode = PyUnicode_New(size - num_surrogates, maxchar);
2267      if (!unicode)
2268          return NULL;
2269  
2270      switch (PyUnicode_KIND(unicode)) {
2271      case PyUnicode_1BYTE_KIND:
2272          _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2273                                  u, u + size, PyUnicode_1BYTE_DATA(unicode));
2274          break;
2275      case PyUnicode_2BYTE_KIND:
2276  #if Py_UNICODE_SIZE == 2
2277          memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2278  #else
2279          _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2280                                  u, u + size, PyUnicode_2BYTE_DATA(unicode));
2281  #endif
2282          break;
2283      case PyUnicode_4BYTE_KIND:
2284  #if SIZEOF_WCHAR_T == 2
2285          /* This is the only case which has to process surrogates, thus
2286             a simple copy loop is not enough and we need a function. */
2287          unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2288  #else
2289          assert(num_surrogates == 0);
2290          memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2291  #endif
2292          break;
2293      default:
2294          Py_UNREACHABLE();
2295      }
2296  
2297      return unicode_result(unicode);
2298  }
2299  
2300  PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2301  PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2302  {
2303      if (size < 0) {
2304          PyErr_SetString(PyExc_SystemError,
2305                          "Negative size passed to PyUnicode_FromStringAndSize");
2306          return NULL;
2307      }
2308      if (u != NULL) {
2309          return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2310      }
2311      else {
2312          if (size > 0) {
2313              if (PyErr_WarnEx(PyExc_DeprecationWarning,
2314                      "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2315                      "use PyUnicode_New() instead", 1) < 0) {
2316                  return NULL;
2317              }
2318          }
2319          return (PyObject *)_PyUnicode_New(size);
2320      }
2321  }
2322  
2323  PyObject *
PyUnicode_FromString(const char * u)2324  PyUnicode_FromString(const char *u)
2325  {
2326      size_t size = strlen(u);
2327      if (size > PY_SSIZE_T_MAX) {
2328          PyErr_SetString(PyExc_OverflowError, "input too long");
2329          return NULL;
2330      }
2331      return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2332  }
2333  
2334  
2335  PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2336  _PyUnicode_FromId(_Py_Identifier *id)
2337  {
2338      PyInterpreterState *interp = _PyInterpreterState_GET();
2339      struct _Py_unicode_ids *ids = &interp->unicode.ids;
2340  
2341      Py_ssize_t index = _Py_atomic_size_get(&id->index);
2342      if (index < 0) {
2343          struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2344  
2345          PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2346          // Check again to detect concurrent access. Another thread can have
2347          // initialized the index while this thread waited for the lock.
2348          index = _Py_atomic_size_get(&id->index);
2349          if (index < 0) {
2350              assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2351              index = rt_ids->next_index;
2352              rt_ids->next_index++;
2353              _Py_atomic_size_set(&id->index, index);
2354          }
2355          PyThread_release_lock(rt_ids->lock);
2356      }
2357      assert(index >= 0);
2358  
2359      PyObject *obj;
2360      if (index < ids->size) {
2361          obj = ids->array[index];
2362          if (obj) {
2363              // Return a borrowed reference
2364              return obj;
2365          }
2366      }
2367  
2368      obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2369                                         NULL, NULL);
2370      if (!obj) {
2371          return NULL;
2372      }
2373      PyUnicode_InternInPlace(&obj);
2374  
2375      if (index >= ids->size) {
2376          // Overallocate to reduce the number of realloc
2377          Py_ssize_t new_size = Py_MAX(index * 2, 16);
2378          Py_ssize_t item_size = sizeof(ids->array[0]);
2379          PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2380          if (new_array == NULL) {
2381              PyErr_NoMemory();
2382              return NULL;
2383          }
2384          memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2385          ids->array = new_array;
2386          ids->size = new_size;
2387      }
2388  
2389      // The array stores a strong reference
2390      ids->array[index] = obj;
2391  
2392      // Return a borrowed reference
2393      return obj;
2394  }
2395  
2396  
2397  static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2398  unicode_clear_identifiers(struct _Py_unicode_state *state)
2399  {
2400      struct _Py_unicode_ids *ids = &state->ids;
2401      for (Py_ssize_t i=0; i < ids->size; i++) {
2402          Py_XDECREF(ids->array[i]);
2403      }
2404      ids->size = 0;
2405      PyMem_Free(ids->array);
2406      ids->array = NULL;
2407      // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2408      // after Py_Finalize().
2409  }
2410  
2411  
2412  /* Internal function, doesn't check maximum character */
2413  
2414  PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2415  _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2416  {
2417      const unsigned char *s = (const unsigned char *)buffer;
2418      PyObject *unicode;
2419      if (size == 1) {
2420  #ifdef Py_DEBUG
2421          assert((unsigned char)s[0] < 128);
2422  #endif
2423          return get_latin1_char(s[0]);
2424      }
2425      unicode = PyUnicode_New(size, 127);
2426      if (!unicode)
2427          return NULL;
2428      memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2429      assert(_PyUnicode_CheckConsistency(unicode, 1));
2430      return unicode;
2431  }
2432  
2433  static Py_UCS4
kind_maxchar_limit(unsigned int kind)2434  kind_maxchar_limit(unsigned int kind)
2435  {
2436      switch (kind) {
2437      case PyUnicode_1BYTE_KIND:
2438          return 0x80;
2439      case PyUnicode_2BYTE_KIND:
2440          return 0x100;
2441      case PyUnicode_4BYTE_KIND:
2442          return 0x10000;
2443      default:
2444          Py_UNREACHABLE();
2445      }
2446  }
2447  
2448  static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2449  _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2450  {
2451      PyObject *res;
2452      unsigned char max_char;
2453  
2454      if (size == 0) {
2455          _Py_RETURN_UNICODE_EMPTY();
2456      }
2457      assert(size > 0);
2458      if (size == 1) {
2459          return get_latin1_char(u[0]);
2460      }
2461  
2462      max_char = ucs1lib_find_max_char(u, u + size);
2463      res = PyUnicode_New(size, max_char);
2464      if (!res)
2465          return NULL;
2466      memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2467      assert(_PyUnicode_CheckConsistency(res, 1));
2468      return res;
2469  }
2470  
2471  static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2472  _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2473  {
2474      PyObject *res;
2475      Py_UCS2 max_char;
2476  
2477      if (size == 0)
2478          _Py_RETURN_UNICODE_EMPTY();
2479      assert(size > 0);
2480      if (size == 1)
2481          return unicode_char(u[0]);
2482  
2483      max_char = ucs2lib_find_max_char(u, u + size);
2484      res = PyUnicode_New(size, max_char);
2485      if (!res)
2486          return NULL;
2487      if (max_char >= 256)
2488          memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2489      else {
2490          _PyUnicode_CONVERT_BYTES(
2491              Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2492      }
2493      assert(_PyUnicode_CheckConsistency(res, 1));
2494      return res;
2495  }
2496  
2497  static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2498  _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2499  {
2500      PyObject *res;
2501      Py_UCS4 max_char;
2502  
2503      if (size == 0)
2504          _Py_RETURN_UNICODE_EMPTY();
2505      assert(size > 0);
2506      if (size == 1)
2507          return unicode_char(u[0]);
2508  
2509      max_char = ucs4lib_find_max_char(u, u + size);
2510      res = PyUnicode_New(size, max_char);
2511      if (!res)
2512          return NULL;
2513      if (max_char < 256)
2514          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2515                                   PyUnicode_1BYTE_DATA(res));
2516      else if (max_char < 0x10000)
2517          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2518                                   PyUnicode_2BYTE_DATA(res));
2519      else
2520          memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2521      assert(_PyUnicode_CheckConsistency(res, 1));
2522      return res;
2523  }
2524  
2525  PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2526  PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2527  {
2528      if (size < 0) {
2529          PyErr_SetString(PyExc_ValueError, "size must be positive");
2530          return NULL;
2531      }
2532      switch (kind) {
2533      case PyUnicode_1BYTE_KIND:
2534          return _PyUnicode_FromUCS1(buffer, size);
2535      case PyUnicode_2BYTE_KIND:
2536          return _PyUnicode_FromUCS2(buffer, size);
2537      case PyUnicode_4BYTE_KIND:
2538          return _PyUnicode_FromUCS4(buffer, size);
2539      default:
2540          PyErr_SetString(PyExc_SystemError, "invalid kind");
2541          return NULL;
2542      }
2543  }
2544  
2545  Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2546  _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2547  {
2548      enum PyUnicode_Kind kind;
2549      const void *startptr, *endptr;
2550  
2551      assert(PyUnicode_IS_READY(unicode));
2552      assert(0 <= start);
2553      assert(end <= PyUnicode_GET_LENGTH(unicode));
2554      assert(start <= end);
2555  
2556      if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2557          return PyUnicode_MAX_CHAR_VALUE(unicode);
2558  
2559      if (start == end)
2560          return 127;
2561  
2562      if (PyUnicode_IS_ASCII(unicode))
2563          return 127;
2564  
2565      kind = PyUnicode_KIND(unicode);
2566      startptr = PyUnicode_DATA(unicode);
2567      endptr = (char *)startptr + end * kind;
2568      startptr = (char *)startptr + start * kind;
2569      switch(kind) {
2570      case PyUnicode_1BYTE_KIND:
2571          return ucs1lib_find_max_char(startptr, endptr);
2572      case PyUnicode_2BYTE_KIND:
2573          return ucs2lib_find_max_char(startptr, endptr);
2574      case PyUnicode_4BYTE_KIND:
2575          return ucs4lib_find_max_char(startptr, endptr);
2576      default:
2577          Py_UNREACHABLE();
2578      }
2579  }
2580  
2581  /* Ensure that a string uses the most efficient storage, if it is not the
2582     case: create a new string with of the right kind. Write NULL into *p_unicode
2583     on error. */
2584  static void
unicode_adjust_maxchar(PyObject ** p_unicode)2585  unicode_adjust_maxchar(PyObject **p_unicode)
2586  {
2587      PyObject *unicode, *copy;
2588      Py_UCS4 max_char;
2589      Py_ssize_t len;
2590      unsigned int kind;
2591  
2592      assert(p_unicode != NULL);
2593      unicode = *p_unicode;
2594      assert(PyUnicode_IS_READY(unicode));
2595      if (PyUnicode_IS_ASCII(unicode))
2596          return;
2597  
2598      len = PyUnicode_GET_LENGTH(unicode);
2599      kind = PyUnicode_KIND(unicode);
2600      if (kind == PyUnicode_1BYTE_KIND) {
2601          const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2602          max_char = ucs1lib_find_max_char(u, u + len);
2603          if (max_char >= 128)
2604              return;
2605      }
2606      else if (kind == PyUnicode_2BYTE_KIND) {
2607          const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2608          max_char = ucs2lib_find_max_char(u, u + len);
2609          if (max_char >= 256)
2610              return;
2611      }
2612      else if (kind == PyUnicode_4BYTE_KIND) {
2613          const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2614          max_char = ucs4lib_find_max_char(u, u + len);
2615          if (max_char >= 0x10000)
2616              return;
2617      }
2618      else
2619          Py_UNREACHABLE();
2620  
2621      copy = PyUnicode_New(len, max_char);
2622      if (copy != NULL)
2623          _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2624      Py_DECREF(unicode);
2625      *p_unicode = copy;
2626  }
2627  
2628  PyObject*
_PyUnicode_Copy(PyObject * unicode)2629  _PyUnicode_Copy(PyObject *unicode)
2630  {
2631      Py_ssize_t length;
2632      PyObject *copy;
2633  
2634      if (!PyUnicode_Check(unicode)) {
2635          PyErr_BadInternalCall();
2636          return NULL;
2637      }
2638      if (PyUnicode_READY(unicode) == -1)
2639          return NULL;
2640  
2641      length = PyUnicode_GET_LENGTH(unicode);
2642      copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2643      if (!copy)
2644          return NULL;
2645      assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2646  
2647      memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2648                length * PyUnicode_KIND(unicode));
2649      assert(_PyUnicode_CheckConsistency(copy, 1));
2650      return copy;
2651  }
2652  
2653  
2654  /* Widen Unicode objects to larger buffers. Don't write terminating null
2655     character. Return NULL on error. */
2656  
2657  static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2658  unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2659  {
2660      void *result;
2661  
2662      assert(skind < kind);
2663      switch (kind) {
2664      case PyUnicode_2BYTE_KIND:
2665          result = PyMem_New(Py_UCS2, len);
2666          if (!result)
2667              return PyErr_NoMemory();
2668          assert(skind == PyUnicode_1BYTE_KIND);
2669          _PyUnicode_CONVERT_BYTES(
2670              Py_UCS1, Py_UCS2,
2671              (const Py_UCS1 *)data,
2672              ((const Py_UCS1 *)data) + len,
2673              result);
2674          return result;
2675      case PyUnicode_4BYTE_KIND:
2676          result = PyMem_New(Py_UCS4, len);
2677          if (!result)
2678              return PyErr_NoMemory();
2679          if (skind == PyUnicode_2BYTE_KIND) {
2680              _PyUnicode_CONVERT_BYTES(
2681                  Py_UCS2, Py_UCS4,
2682                  (const Py_UCS2 *)data,
2683                  ((const Py_UCS2 *)data) + len,
2684                  result);
2685          }
2686          else {
2687              assert(skind == PyUnicode_1BYTE_KIND);
2688              _PyUnicode_CONVERT_BYTES(
2689                  Py_UCS1, Py_UCS4,
2690                  (const Py_UCS1 *)data,
2691                  ((const Py_UCS1 *)data) + len,
2692                  result);
2693          }
2694          return result;
2695      default:
2696          Py_UNREACHABLE();
2697          return NULL;
2698      }
2699  }
2700  
2701  static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2702  as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2703          int copy_null)
2704  {
2705      int kind;
2706      const void *data;
2707      Py_ssize_t len, targetlen;
2708      if (PyUnicode_READY(string) == -1)
2709          return NULL;
2710      kind = PyUnicode_KIND(string);
2711      data = PyUnicode_DATA(string);
2712      len = PyUnicode_GET_LENGTH(string);
2713      targetlen = len;
2714      if (copy_null)
2715          targetlen++;
2716      if (!target) {
2717          target = PyMem_New(Py_UCS4, targetlen);
2718          if (!target) {
2719              PyErr_NoMemory();
2720              return NULL;
2721          }
2722      }
2723      else {
2724          if (targetsize < targetlen) {
2725              PyErr_Format(PyExc_SystemError,
2726                           "string is longer than the buffer");
2727              if (copy_null && 0 < targetsize)
2728                  target[0] = 0;
2729              return NULL;
2730          }
2731      }
2732      if (kind == PyUnicode_1BYTE_KIND) {
2733          const Py_UCS1 *start = (const Py_UCS1 *) data;
2734          _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2735      }
2736      else if (kind == PyUnicode_2BYTE_KIND) {
2737          const Py_UCS2 *start = (const Py_UCS2 *) data;
2738          _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2739      }
2740      else if (kind == PyUnicode_4BYTE_KIND) {
2741          memcpy(target, data, len * sizeof(Py_UCS4));
2742      }
2743      else {
2744          Py_UNREACHABLE();
2745      }
2746      if (copy_null)
2747          target[len] = 0;
2748      return target;
2749  }
2750  
2751  Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2752  PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2753                   int copy_null)
2754  {
2755      if (target == NULL || targetsize < 0) {
2756          PyErr_BadInternalCall();
2757          return NULL;
2758      }
2759      return as_ucs4(string, target, targetsize, copy_null);
2760  }
2761  
2762  Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2763  PyUnicode_AsUCS4Copy(PyObject *string)
2764  {
2765      return as_ucs4(string, NULL, 0, 1);
2766  }
2767  
/* Size of the scratch buffers used to format %lld-style integers and %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256).  For an 8-byte long long
   this evaluates to 21, which holds the 20 characters of
   "-9223372036854775808" or "18446744073709551615" plus a terminating
   null. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2772  
2773  static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2774  unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2775                               Py_ssize_t width, Py_ssize_t precision)
2776  {
2777      Py_ssize_t length, fill, arglen;
2778      Py_UCS4 maxchar;
2779  
2780      if (PyUnicode_READY(str) == -1)
2781          return -1;
2782  
2783      length = PyUnicode_GET_LENGTH(str);
2784      if ((precision == -1 || precision >= length)
2785          && width <= length)
2786          return _PyUnicodeWriter_WriteStr(writer, str);
2787  
2788      if (precision != -1)
2789          length = Py_MIN(precision, length);
2790  
2791      arglen = Py_MAX(length, width);
2792      if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2793          maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2794      else
2795          maxchar = writer->maxchar;
2796  
2797      if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2798          return -1;
2799  
2800      if (width > length) {
2801          fill = width - length;
2802          if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2803              return -1;
2804          writer->pos += fill;
2805      }
2806  
2807      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2808                                    str, 0, length);
2809      writer->pos += length;
2810      return 0;
2811  }
2812  
2813  static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2814  unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2815                                Py_ssize_t width, Py_ssize_t precision)
2816  {
2817      /* UTF-8 */
2818      Py_ssize_t length;
2819      PyObject *unicode;
2820      int res;
2821  
2822      if (precision == -1) {
2823          length = strlen(str);
2824      }
2825      else {
2826          length = 0;
2827          while (length < precision && str[length]) {
2828              length++;
2829          }
2830      }
2831      unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2832      if (unicode == NULL)
2833          return -1;
2834  
2835      res = unicode_fromformat_write_str(writer, unicode, width, -1);
2836      Py_DECREF(unicode);
2837      return res;
2838  }
2839  
2840  static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2841  unicode_fromformat_arg(_PyUnicodeWriter *writer,
2842                         const char *f, va_list *vargs)
2843  {
2844      const char *p;
2845      Py_ssize_t len;
2846      int zeropad;
2847      Py_ssize_t width;
2848      Py_ssize_t precision;
2849      int longflag;
2850      int longlongflag;
2851      int size_tflag;
2852      Py_ssize_t fill;
2853  
2854      p = f;
2855      f++;
2856      zeropad = 0;
2857      if (*f == '0') {
2858          zeropad = 1;
2859          f++;
2860      }
2861  
2862      /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2863      width = -1;
2864      if (Py_ISDIGIT((unsigned)*f)) {
2865          width = *f - '0';
2866          f++;
2867          while (Py_ISDIGIT((unsigned)*f)) {
2868              if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2869                  PyErr_SetString(PyExc_ValueError,
2870                                  "width too big");
2871                  return NULL;
2872              }
2873              width = (width * 10) + (*f - '0');
2874              f++;
2875          }
2876      }
2877      precision = -1;
2878      if (*f == '.') {
2879          f++;
2880          if (Py_ISDIGIT((unsigned)*f)) {
2881              precision = (*f - '0');
2882              f++;
2883              while (Py_ISDIGIT((unsigned)*f)) {
2884                  if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2885                      PyErr_SetString(PyExc_ValueError,
2886                                      "precision too big");
2887                      return NULL;
2888                  }
2889                  precision = (precision * 10) + (*f - '0');
2890                  f++;
2891              }
2892          }
2893          if (*f == '%') {
2894              /* "%.3%s" => f points to "3" */
2895              f--;
2896          }
2897      }
2898      if (*f == '\0') {
2899          /* bogus format "%.123" => go backward, f points to "3" */
2900          f--;
2901      }
2902  
2903      /* Handle %ld, %lu, %lld and %llu. */
2904      longflag = 0;
2905      longlongflag = 0;
2906      size_tflag = 0;
2907      if (*f == 'l') {
2908          if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2909              longflag = 1;
2910              ++f;
2911          }
2912          else if (f[1] == 'l' &&
2913                   (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2914              longlongflag = 1;
2915              f += 2;
2916          }
2917      }
2918      /* handle the size_t flag. */
2919      else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2920          size_tflag = 1;
2921          ++f;
2922      }
2923  
2924      if (f[1] == '\0')
2925          writer->overallocate = 0;
2926  
2927      switch (*f) {
2928      case 'c':
2929      {
2930          int ordinal = va_arg(*vargs, int);
2931          if (ordinal < 0 || ordinal > MAX_UNICODE) {
2932              PyErr_SetString(PyExc_OverflowError,
2933                              "character argument not in range(0x110000)");
2934              return NULL;
2935          }
2936          if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2937              return NULL;
2938          break;
2939      }
2940  
2941      case 'i':
2942      case 'd':
2943      case 'u':
2944      case 'x':
2945      {
2946          /* used by sprintf */
2947          char buffer[MAX_LONG_LONG_CHARS];
2948          Py_ssize_t arglen;
2949  
2950          if (*f == 'u') {
2951              if (longflag) {
2952                  len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2953              }
2954              else if (longlongflag) {
2955                  len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2956              }
2957              else if (size_tflag) {
2958                  len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2959              }
2960              else {
2961                  len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2962              }
2963          }
2964          else if (*f == 'x') {
2965              len = sprintf(buffer, "%x", va_arg(*vargs, int));
2966          }
2967          else {
2968              if (longflag) {
2969                  len = sprintf(buffer, "%li", va_arg(*vargs, long));
2970              }
2971              else if (longlongflag) {
2972                  len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2973              }
2974              else if (size_tflag) {
2975                  len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2976              }
2977              else {
2978                  len = sprintf(buffer, "%i", va_arg(*vargs, int));
2979              }
2980          }
2981          assert(len >= 0);
2982  
2983          if (precision < len)
2984              precision = len;
2985  
2986          arglen = Py_MAX(precision, width);
2987          if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2988              return NULL;
2989  
2990          if (width > precision) {
2991              Py_UCS4 fillchar;
2992              fill = width - precision;
2993              fillchar = zeropad?'0':' ';
2994              if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2995                  return NULL;
2996              writer->pos += fill;
2997          }
2998          if (precision > len) {
2999              fill = precision - len;
3000              if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
3001                  return NULL;
3002              writer->pos += fill;
3003          }
3004  
3005          if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
3006              return NULL;
3007          break;
3008      }
3009  
3010      case 'p':
3011      {
3012          char number[MAX_LONG_LONG_CHARS];
3013  
3014          len = sprintf(number, "%p", va_arg(*vargs, void*));
3015          assert(len >= 0);
3016  
3017          /* %p is ill-defined:  ensure leading 0x. */
3018          if (number[1] == 'X')
3019              number[1] = 'x';
3020          else if (number[1] != 'x') {
3021              memmove(number + 2, number,
3022                      strlen(number) + 1);
3023              number[0] = '0';
3024              number[1] = 'x';
3025              len += 2;
3026          }
3027  
3028          if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3029              return NULL;
3030          break;
3031      }
3032  
3033      case 's':
3034      {
3035          /* UTF-8 */
3036          const char *s = va_arg(*vargs, const char*);
3037          if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
3038              return NULL;
3039          break;
3040      }
3041  
3042      case 'U':
3043      {
3044          PyObject *obj = va_arg(*vargs, PyObject *);
3045          assert(obj && _PyUnicode_CHECK(obj));
3046  
3047          if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3048              return NULL;
3049          break;
3050      }
3051  
3052      case 'V':
3053      {
3054          PyObject *obj = va_arg(*vargs, PyObject *);
3055          const char *str = va_arg(*vargs, const char *);
3056          if (obj) {
3057              assert(_PyUnicode_CHECK(obj));
3058              if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
3059                  return NULL;
3060          }
3061          else {
3062              assert(str != NULL);
3063              if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
3064                  return NULL;
3065          }
3066          break;
3067      }
3068  
3069      case 'S':
3070      {
3071          PyObject *obj = va_arg(*vargs, PyObject *);
3072          PyObject *str;
3073          assert(obj);
3074          str = PyObject_Str(obj);
3075          if (!str)
3076              return NULL;
3077          if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
3078              Py_DECREF(str);
3079              return NULL;
3080          }
3081          Py_DECREF(str);
3082          break;
3083      }
3084  
3085      case 'R':
3086      {
3087          PyObject *obj = va_arg(*vargs, PyObject *);
3088          PyObject *repr;
3089          assert(obj);
3090          repr = PyObject_Repr(obj);
3091          if (!repr)
3092              return NULL;
3093          if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
3094              Py_DECREF(repr);
3095              return NULL;
3096          }
3097          Py_DECREF(repr);
3098          break;
3099      }
3100  
3101      case 'A':
3102      {
3103          PyObject *obj = va_arg(*vargs, PyObject *);
3104          PyObject *ascii;
3105          assert(obj);
3106          ascii = PyObject_ASCII(obj);
3107          if (!ascii)
3108              return NULL;
3109          if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
3110              Py_DECREF(ascii);
3111              return NULL;
3112          }
3113          Py_DECREF(ascii);
3114          break;
3115      }
3116  
3117      case '%':
3118          if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
3119              return NULL;
3120          break;
3121  
3122      default:
3123          /* if we stumble upon an unknown formatting code, copy the rest
3124             of the format string to the output string. (we cannot just
3125             skip the code, since there's no way to know what's in the
3126             argument list) */
3127          len = strlen(p);
3128          if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
3129              return NULL;
3130          f = p+len;
3131          return f;
3132      }
3133  
3134      f++;
3135      return f;
3136  }
3137  
3138  PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3139  PyUnicode_FromFormatV(const char *format, va_list vargs)
3140  {
3141      va_list vargs2;
3142      const char *f;
3143      _PyUnicodeWriter writer;
3144  
3145      _PyUnicodeWriter_Init(&writer);
3146      writer.min_length = strlen(format) + 100;
3147      writer.overallocate = 1;
3148  
3149      // Copy varags to be able to pass a reference to a subfunction.
3150      va_copy(vargs2, vargs);
3151  
3152      for (f = format; *f; ) {
3153          if (*f == '%') {
3154              f = unicode_fromformat_arg(&writer, f, &vargs2);
3155              if (f == NULL)
3156                  goto fail;
3157          }
3158          else {
3159              const char *p;
3160              Py_ssize_t len;
3161  
3162              p = f;
3163              do
3164              {
3165                  if ((unsigned char)*p > 127) {
3166                      PyErr_Format(PyExc_ValueError,
3167                          "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3168                          "string, got a non-ASCII byte: 0x%02x",
3169                          (unsigned char)*p);
3170                      goto fail;
3171                  }
3172                  p++;
3173              }
3174              while (*p != '\0' && *p != '%');
3175              len = p - f;
3176  
3177              if (*p == '\0')
3178                  writer.overallocate = 0;
3179  
3180              if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3181                  goto fail;
3182  
3183              f = p;
3184          }
3185      }
3186      va_end(vargs2);
3187      return _PyUnicodeWriter_Finish(&writer);
3188  
3189    fail:
3190      va_end(vargs2);
3191      _PyUnicodeWriter_Dealloc(&writer);
3192      return NULL;
3193  }
3194  
3195  PyObject *
PyUnicode_FromFormat(const char * format,...)3196  PyUnicode_FromFormat(const char *format, ...)
3197  {
3198      PyObject* ret;
3199      va_list vargs;
3200  
3201  #ifdef HAVE_STDARG_PROTOTYPES
3202      va_start(vargs, format);
3203  #else
3204      va_start(vargs);
3205  #endif
3206      ret = PyUnicode_FromFormatV(format, vargs);
3207      va_end(vargs);
3208      return ret;
3209  }
3210  
3211  static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3212  unicode_get_widechar_size(PyObject *unicode)
3213  {
3214      Py_ssize_t res;
3215  
3216      assert(unicode != NULL);
3217      assert(_PyUnicode_CHECK(unicode));
3218  
3219  #if USE_UNICODE_WCHAR_CACHE
3220      if (_PyUnicode_WSTR(unicode) != NULL) {
3221          return PyUnicode_WSTR_LENGTH(unicode);
3222      }
3223  #endif /* USE_UNICODE_WCHAR_CACHE */
3224      assert(PyUnicode_IS_READY(unicode));
3225  
3226      res = _PyUnicode_LENGTH(unicode);
3227  #if SIZEOF_WCHAR_T == 2
3228      if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3229          const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3230          const Py_UCS4 *end = s + res;
3231          for (; s < end; ++s) {
3232              if (*s > 0xFFFF) {
3233                  ++res;
3234              }
3235          }
3236      }
3237  #endif
3238      return res;
3239  }
3240  
3241  static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3242  unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3243  {
3244      assert(unicode != NULL);
3245      assert(_PyUnicode_CHECK(unicode));
3246  
3247  #if USE_UNICODE_WCHAR_CACHE
3248      const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3249      if (wstr != NULL) {
3250          memcpy(w, wstr, size * sizeof(wchar_t));
3251          return;
3252      }
3253  #else /* USE_UNICODE_WCHAR_CACHE */
3254      if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3255          memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3256          return;
3257      }
3258  #endif /* USE_UNICODE_WCHAR_CACHE */
3259      assert(PyUnicode_IS_READY(unicode));
3260  
3261      if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3262          const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3263          for (; size--; ++s, ++w) {
3264              *w = *s;
3265          }
3266      }
3267      else {
3268  #if SIZEOF_WCHAR_T == 4
3269          assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3270          const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3271          for (; size--; ++s, ++w) {
3272              *w = *s;
3273          }
3274  #else
3275          assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3276          const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3277          for (; size--; ++s, ++w) {
3278              Py_UCS4 ch = *s;
3279              if (ch > 0xFFFF) {
3280                  assert(ch <= MAX_UNICODE);
3281                  /* encode surrogate pair in this case */
3282                  *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3283                  if (!size--)
3284                      break;
3285                  *w = Py_UNICODE_LOW_SURROGATE(ch);
3286              }
3287              else {
3288                  *w = ch;
3289              }
3290          }
3291  #endif
3292      }
3293  }
3294  
3295  #ifdef HAVE_WCHAR_H
3296  
3297  /* Convert a Unicode object to a wide character string.
3298  
3299     - If w is NULL: return the number of wide characters (including the null
3300       character) required to convert the unicode object. Ignore size argument.
3301  
3302     - Otherwise: return the number of wide characters (excluding the null
3303       character) written into w. Write at most size wide characters (including
3304       the null character). */
3305  Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3306  PyUnicode_AsWideChar(PyObject *unicode,
3307                       wchar_t *w,
3308                       Py_ssize_t size)
3309  {
3310      Py_ssize_t res;
3311  
3312      if (unicode == NULL) {
3313          PyErr_BadInternalCall();
3314          return -1;
3315      }
3316      if (!PyUnicode_Check(unicode)) {
3317          PyErr_BadArgument();
3318          return -1;
3319      }
3320  
3321      res = unicode_get_widechar_size(unicode);
3322      if (w == NULL) {
3323          return res + 1;
3324      }
3325  
3326      if (size > res) {
3327          size = res + 1;
3328      }
3329      else {
3330          res = size;
3331      }
3332      unicode_copy_as_widechar(unicode, w, size);
3333  
3334  #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3335      /* Oracle Solaris uses non-Unicode internal wchar_t form for
3336         non-Unicode locales and hence needs conversion first. */
3337      if (_Py_LocaleUsesNonUnicodeWchar()) {
3338          if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3339              return -1;
3340          }
3341      }
3342  #endif
3343  
3344      return res;
3345  }
3346  
3347  wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3348  PyUnicode_AsWideCharString(PyObject *unicode,
3349                             Py_ssize_t *size)
3350  {
3351      wchar_t *buffer;
3352      Py_ssize_t buflen;
3353  
3354      if (unicode == NULL) {
3355          PyErr_BadInternalCall();
3356          return NULL;
3357      }
3358      if (!PyUnicode_Check(unicode)) {
3359          PyErr_BadArgument();
3360          return NULL;
3361      }
3362  
3363      buflen = unicode_get_widechar_size(unicode);
3364      buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3365      if (buffer == NULL) {
3366          PyErr_NoMemory();
3367          return NULL;
3368      }
3369      unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3370  
3371  #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3372      /* Oracle Solaris uses non-Unicode internal wchar_t form for
3373         non-Unicode locales and hence needs conversion first. */
3374      if (_Py_LocaleUsesNonUnicodeWchar()) {
3375          if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3376              return NULL;
3377          }
3378      }
3379  #endif
3380  
3381      if (size != NULL) {
3382          *size = buflen;
3383      }
3384      else if (wcslen(buffer) != (size_t)buflen) {
3385          PyMem_Free(buffer);
3386          PyErr_SetString(PyExc_ValueError,
3387                          "embedded null character");
3388          return NULL;
3389      }
3390      return buffer;
3391  }
3392  
3393  #endif /* HAVE_WCHAR_H */
3394  
3395  int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3396  _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3397  {
3398      wchar_t **p = (wchar_t **)ptr;
3399      if (obj == NULL) {
3400  #if !USE_UNICODE_WCHAR_CACHE
3401          PyMem_Free(*p);
3402  #endif /* USE_UNICODE_WCHAR_CACHE */
3403          *p = NULL;
3404          return 1;
3405      }
3406      if (PyUnicode_Check(obj)) {
3407  #if USE_UNICODE_WCHAR_CACHE
3408          *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3409          if (*p == NULL) {
3410              return 0;
3411          }
3412          return 1;
3413  #else /* USE_UNICODE_WCHAR_CACHE */
3414          *p = PyUnicode_AsWideCharString(obj, NULL);
3415          if (*p == NULL) {
3416              return 0;
3417          }
3418          return Py_CLEANUP_SUPPORTED;
3419  #endif /* USE_UNICODE_WCHAR_CACHE */
3420      }
3421      PyErr_Format(PyExc_TypeError,
3422                   "argument must be str, not %.50s",
3423                   Py_TYPE(obj)->tp_name);
3424      return 0;
3425  }
3426  
3427  int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3428  _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3429  {
3430      wchar_t **p = (wchar_t **)ptr;
3431      if (obj == NULL) {
3432  #if !USE_UNICODE_WCHAR_CACHE
3433          PyMem_Free(*p);
3434  #endif /* USE_UNICODE_WCHAR_CACHE */
3435          *p = NULL;
3436          return 1;
3437      }
3438      if (obj == Py_None) {
3439          *p = NULL;
3440          return 1;
3441      }
3442      if (PyUnicode_Check(obj)) {
3443  #if USE_UNICODE_WCHAR_CACHE
3444          *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3445          if (*p == NULL) {
3446              return 0;
3447          }
3448          return 1;
3449  #else /* USE_UNICODE_WCHAR_CACHE */
3450          *p = PyUnicode_AsWideCharString(obj, NULL);
3451          if (*p == NULL) {
3452              return 0;
3453          }
3454          return Py_CLEANUP_SUPPORTED;
3455  #endif /* USE_UNICODE_WCHAR_CACHE */
3456      }
3457      PyErr_Format(PyExc_TypeError,
3458                   "argument must be str or None, not %.50s",
3459                   Py_TYPE(obj)->tp_name);
3460      return 0;
3461  }
3462  
3463  PyObject *
PyUnicode_FromOrdinal(int ordinal)3464  PyUnicode_FromOrdinal(int ordinal)
3465  {
3466      if (ordinal < 0 || ordinal > MAX_UNICODE) {
3467          PyErr_SetString(PyExc_ValueError,
3468                          "chr() arg not in range(0x110000)");
3469          return NULL;
3470      }
3471  
3472      return unicode_char((Py_UCS4)ordinal);
3473  }
3474  
3475  PyObject *
PyUnicode_FromObject(PyObject * obj)3476  PyUnicode_FromObject(PyObject *obj)
3477  {
3478      /* XXX Perhaps we should make this API an alias of
3479         PyObject_Str() instead ?! */
3480      if (PyUnicode_CheckExact(obj)) {
3481          if (PyUnicode_READY(obj) == -1)
3482              return NULL;
3483          Py_INCREF(obj);
3484          return obj;
3485      }
3486      if (PyUnicode_Check(obj)) {
3487          /* For a Unicode subtype that's not a Unicode object,
3488             return a true Unicode object with the same data. */
3489          return _PyUnicode_Copy(obj);
3490      }
3491      PyErr_Format(PyExc_TypeError,
3492                   "Can't convert '%.100s' object to str implicitly",
3493                   Py_TYPE(obj)->tp_name);
3494      return NULL;
3495  }
3496  
3497  PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3498  PyUnicode_FromEncodedObject(PyObject *obj,
3499                              const char *encoding,
3500                              const char *errors)
3501  {
3502      Py_buffer buffer;
3503      PyObject *v;
3504  
3505      if (obj == NULL) {
3506          PyErr_BadInternalCall();
3507          return NULL;
3508      }
3509  
3510      /* Decoding bytes objects is the most common case and should be fast */
3511      if (PyBytes_Check(obj)) {
3512          if (PyBytes_GET_SIZE(obj) == 0) {
3513              if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514                  return NULL;
3515              }
3516              _Py_RETURN_UNICODE_EMPTY();
3517          }
3518          return PyUnicode_Decode(
3519                  PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3520                  encoding, errors);
3521      }
3522  
3523      if (PyUnicode_Check(obj)) {
3524          PyErr_SetString(PyExc_TypeError,
3525                          "decoding str is not supported");
3526          return NULL;
3527      }
3528  
3529      /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3530      if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3531          PyErr_Format(PyExc_TypeError,
3532                       "decoding to str: need a bytes-like object, %.80s found",
3533                       Py_TYPE(obj)->tp_name);
3534          return NULL;
3535      }
3536  
3537      if (buffer.len == 0) {
3538          PyBuffer_Release(&buffer);
3539          if (unicode_check_encoding_errors(encoding, errors) < 0) {
3540              return NULL;
3541          }
3542          _Py_RETURN_UNICODE_EMPTY();
3543      }
3544  
3545      v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3546      PyBuffer_Release(&buffer);
3547      return v;
3548  }
3549  
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Each
   run of other characters that sits between two copied characters becomes a
   single '_'; such runs at the start or end of the name are dropped.

   Return 1 on success, with `lower` NUL-terminated.  Return 0 on error:
   lower_len is 0, or the normalized name does not fit in lower_len-1
   characters.  On error the content of `lower` is not NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminator.  Without this guard,
           lower_len - 1 would wrap around and l_end would point far
           outside the buffer. */
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the final NUL */
    punct = 0;

    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }

    *l = '\0';
    return 1;
}
3598  
3599  PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3600  PyUnicode_Decode(const char *s,
3601                   Py_ssize_t size,
3602                   const char *encoding,
3603                   const char *errors)
3604  {
3605      PyObject *buffer = NULL, *unicode;
3606      Py_buffer info;
3607      char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3608  
3609      if (unicode_check_encoding_errors(encoding, errors) < 0) {
3610          return NULL;
3611      }
3612  
3613      if (size == 0) {
3614          _Py_RETURN_UNICODE_EMPTY();
3615      }
3616  
3617      if (encoding == NULL) {
3618          return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3619      }
3620  
3621      /* Shortcuts for common default encodings */
3622      if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3623          char *lower = buflower;
3624  
3625          /* Fast paths */
3626          if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3627              lower += 3;
3628              if (*lower == '_') {
3629                  /* Match "utf8" and "utf_8" */
3630                  lower++;
3631              }
3632  
3633              if (lower[0] == '8' && lower[1] == 0) {
3634                  return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3635              }
3636              else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3637                  return PyUnicode_DecodeUTF16(s, size, errors, 0);
3638              }
3639              else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3640                  return PyUnicode_DecodeUTF32(s, size, errors, 0);
3641              }
3642          }
3643          else {
3644              if (strcmp(lower, "ascii") == 0
3645                  || strcmp(lower, "us_ascii") == 0) {
3646                  return PyUnicode_DecodeASCII(s, size, errors);
3647              }
3648      #ifdef MS_WINDOWS
3649              else if (strcmp(lower, "mbcs") == 0) {
3650                  return PyUnicode_DecodeMBCS(s, size, errors);
3651              }
3652      #endif
3653              else if (strcmp(lower, "latin1") == 0
3654                       || strcmp(lower, "latin_1") == 0
3655                       || strcmp(lower, "iso_8859_1") == 0
3656                       || strcmp(lower, "iso8859_1") == 0) {
3657                  return PyUnicode_DecodeLatin1(s, size, errors);
3658              }
3659          }
3660      }
3661  
3662      /* Decode via the codec registry */
3663      buffer = NULL;
3664      if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3665          goto onError;
3666      buffer = PyMemoryView_FromBuffer(&info);
3667      if (buffer == NULL)
3668          goto onError;
3669      unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3670      if (unicode == NULL)
3671          goto onError;
3672      if (!PyUnicode_Check(unicode)) {
3673          PyErr_Format(PyExc_TypeError,
3674                       "'%.400s' decoder returned '%.400s' instead of 'str'; "
3675                       "use codecs.decode() to decode to arbitrary types",
3676                       encoding,
3677                       Py_TYPE(unicode)->tp_name);
3678          Py_DECREF(unicode);
3679          goto onError;
3680      }
3681      Py_DECREF(buffer);
3682      return unicode_result(unicode);
3683  
3684    onError:
3685      Py_XDECREF(buffer);
3686      return NULL;
3687  }
3688  
3689  PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3690  PyUnicode_AsDecodedObject(PyObject *unicode,
3691                            const char *encoding,
3692                            const char *errors)
3693  {
3694      if (!PyUnicode_Check(unicode)) {
3695          PyErr_BadArgument();
3696          return NULL;
3697      }
3698  
3699      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3700                       "PyUnicode_AsDecodedObject() is deprecated; "
3701                       "use PyCodec_Decode() to decode from str", 1) < 0)
3702          return NULL;
3703  
3704      if (encoding == NULL)
3705          encoding = PyUnicode_GetDefaultEncoding();
3706  
3707      /* Decode via the codec registry */
3708      return PyCodec_Decode(unicode, encoding, errors);
3709  }
3710  
3711  PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3712  PyUnicode_AsDecodedUnicode(PyObject *unicode,
3713                             const char *encoding,
3714                             const char *errors)
3715  {
3716      PyObject *v;
3717  
3718      if (!PyUnicode_Check(unicode)) {
3719          PyErr_BadArgument();
3720          goto onError;
3721      }
3722  
3723      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3724                       "PyUnicode_AsDecodedUnicode() is deprecated; "
3725                       "use PyCodec_Decode() to decode from str to str", 1) < 0)
3726          return NULL;
3727  
3728      if (encoding == NULL)
3729          encoding = PyUnicode_GetDefaultEncoding();
3730  
3731      /* Decode via the codec registry */
3732      v = PyCodec_Decode(unicode, encoding, errors);
3733      if (v == NULL)
3734          goto onError;
3735      if (!PyUnicode_Check(v)) {
3736          PyErr_Format(PyExc_TypeError,
3737                       "'%.400s' decoder returned '%.400s' instead of 'str'; "
3738                       "use codecs.decode() to decode to arbitrary types",
3739                       encoding,
3740                       Py_TYPE(unicode)->tp_name);
3741          Py_DECREF(v);
3742          goto onError;
3743      }
3744      return unicode_result(v);
3745  
3746    onError:
3747      return NULL;
3748  }
3749  
3750  PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3751  PyUnicode_Encode(const Py_UNICODE *s,
3752                   Py_ssize_t size,
3753                   const char *encoding,
3754                   const char *errors)
3755  {
3756      PyObject *v, *unicode;
3757  
3758      unicode = PyUnicode_FromWideChar(s, size);
3759      if (unicode == NULL)
3760          return NULL;
3761      v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3762      Py_DECREF(unicode);
3763      return v;
3764  }
3765  
3766  PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3767  PyUnicode_AsEncodedObject(PyObject *unicode,
3768                            const char *encoding,
3769                            const char *errors)
3770  {
3771      PyObject *v;
3772  
3773      if (!PyUnicode_Check(unicode)) {
3774          PyErr_BadArgument();
3775          goto onError;
3776      }
3777  
3778      if (PyErr_WarnEx(PyExc_DeprecationWarning,
3779                       "PyUnicode_AsEncodedObject() is deprecated; "
3780                       "use PyUnicode_AsEncodedString() to encode from str to bytes "
3781                       "or PyCodec_Encode() for generic encoding", 1) < 0)
3782          return NULL;
3783  
3784      if (encoding == NULL)
3785          encoding = PyUnicode_GetDefaultEncoding();
3786  
3787      /* Encode via the codec registry */
3788      v = PyCodec_Encode(unicode, encoding, errors);
3789      if (v == NULL)
3790          goto onError;
3791      return v;
3792  
3793    onError:
3794      return NULL;
3795  }
3796  
3797  
3798  static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3799  unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3800                        int current_locale)
3801  {
3802      Py_ssize_t wlen;
3803      wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3804      if (wstr == NULL) {
3805          return NULL;
3806      }
3807  
3808      if ((size_t)wlen != wcslen(wstr)) {
3809          PyErr_SetString(PyExc_ValueError, "embedded null character");
3810          PyMem_Free(wstr);
3811          return NULL;
3812      }
3813  
3814      char *str;
3815      size_t error_pos;
3816      const char *reason;
3817      int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3818                                   current_locale, error_handler);
3819      PyMem_Free(wstr);
3820  
3821      if (res != 0) {
3822          if (res == -2) {
3823              PyObject *exc;
3824              exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3825                      "locale", unicode,
3826                      (Py_ssize_t)error_pos,
3827                      (Py_ssize_t)(error_pos+1),
3828                      reason);
3829              if (exc != NULL) {
3830                  PyCodec_StrictErrors(exc);
3831                  Py_DECREF(exc);
3832              }
3833          }
3834          else if (res == -3) {
3835              PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3836          }
3837          else {
3838              PyErr_NoMemory();
3839          }
3840          return NULL;
3841      }
3842  
3843      PyObject *bytes = PyBytes_FromString(str);
3844      PyMem_RawFree(str);
3845      return bytes;
3846  }
3847  
3848  PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3849  PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3850  {
3851      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3852      return unicode_encode_locale(unicode, error_handler, 1);
3853  }
3854  
3855  PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3856  PyUnicode_EncodeFSDefault(PyObject *unicode)
3857  {
3858      PyInterpreterState *interp = _PyInterpreterState_GET();
3859      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3860      if (fs_codec->utf8) {
3861          return unicode_encode_utf8(unicode,
3862                                     fs_codec->error_handler,
3863                                     fs_codec->errors);
3864      }
3865  #ifndef _Py_FORCE_UTF8_FS_ENCODING
3866      else if (fs_codec->encoding) {
3867          return PyUnicode_AsEncodedString(unicode,
3868                                           fs_codec->encoding,
3869                                           fs_codec->errors);
3870      }
3871  #endif
3872      else {
3873          /* Before _PyUnicode_InitEncodings() is called, the Python codec
3874             machinery is not ready and so cannot be used:
3875             use wcstombs() in this case. */
3876          const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3877          const wchar_t *filesystem_errors = config->filesystem_errors;
3878          assert(filesystem_errors != NULL);
3879          _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3880          assert(errors != _Py_ERROR_UNKNOWN);
3881  #ifdef _Py_FORCE_UTF8_FS_ENCODING
3882          return unicode_encode_utf8(unicode, errors, NULL);
3883  #else
3884          return unicode_encode_locale(unicode, errors, 0);
3885  #endif
3886      }
3887  }
3888  
3889  PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3890  PyUnicode_AsEncodedString(PyObject *unicode,
3891                            const char *encoding,
3892                            const char *errors)
3893  {
3894      PyObject *v;
3895      char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3896  
3897      if (!PyUnicode_Check(unicode)) {
3898          PyErr_BadArgument();
3899          return NULL;
3900      }
3901  
3902      if (unicode_check_encoding_errors(encoding, errors) < 0) {
3903          return NULL;
3904      }
3905  
3906      if (encoding == NULL) {
3907          return _PyUnicode_AsUTF8String(unicode, errors);
3908      }
3909  
3910      /* Shortcuts for common default encodings */
3911      if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3912          char *lower = buflower;
3913  
3914          /* Fast paths */
3915          if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3916              lower += 3;
3917              if (*lower == '_') {
3918                  /* Match "utf8" and "utf_8" */
3919                  lower++;
3920              }
3921  
3922              if (lower[0] == '8' && lower[1] == 0) {
3923                  return _PyUnicode_AsUTF8String(unicode, errors);
3924              }
3925              else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3926                  return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3927              }
3928              else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3929                  return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3930              }
3931          }
3932          else {
3933              if (strcmp(lower, "ascii") == 0
3934                  || strcmp(lower, "us_ascii") == 0) {
3935                  return _PyUnicode_AsASCIIString(unicode, errors);
3936              }
3937  #ifdef MS_WINDOWS
3938              else if (strcmp(lower, "mbcs") == 0) {
3939                  return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3940              }
3941  #endif
3942              else if (strcmp(lower, "latin1") == 0 ||
3943                       strcmp(lower, "latin_1") == 0 ||
3944                       strcmp(lower, "iso_8859_1") == 0 ||
3945                       strcmp(lower, "iso8859_1") == 0) {
3946                  return _PyUnicode_AsLatin1String(unicode, errors);
3947              }
3948          }
3949      }
3950  
3951      /* Encode via the codec registry */
3952      v = _PyCodec_EncodeText(unicode, encoding, errors);
3953      if (v == NULL)
3954          return NULL;
3955  
3956      /* The normal path */
3957      if (PyBytes_Check(v))
3958          return v;
3959  
3960      /* If the codec returns a buffer, raise a warning and convert to bytes */
3961      if (PyByteArray_Check(v)) {
3962          int error;
3963          PyObject *b;
3964  
3965          error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3966              "encoder %s returned bytearray instead of bytes; "
3967              "use codecs.encode() to encode to arbitrary types",
3968              encoding);
3969          if (error) {
3970              Py_DECREF(v);
3971              return NULL;
3972          }
3973  
3974          b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3975                                        PyByteArray_GET_SIZE(v));
3976          Py_DECREF(v);
3977          return b;
3978      }
3979  
3980      PyErr_Format(PyExc_TypeError,
3981                   "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3982                   "use codecs.encode() to encode to arbitrary types",
3983                   encoding,
3984                   Py_TYPE(v)->tp_name);
3985      Py_DECREF(v);
3986      return NULL;
3987  }
3988  
3989  PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3990  PyUnicode_AsEncodedUnicode(PyObject *unicode,
3991                             const char *encoding,
3992                             const char *errors)
3993  {
3994      PyObject *v;
3995  
3996      if (!PyUnicode_Check(unicode)) {
3997          PyErr_BadArgument();
3998          goto onError;
3999      }
4000  
4001      if (PyErr_WarnEx(PyExc_DeprecationWarning,
4002                       "PyUnicode_AsEncodedUnicode() is deprecated; "
4003                       "use PyCodec_Encode() to encode from str to str", 1) < 0)
4004          return NULL;
4005  
4006      if (encoding == NULL)
4007          encoding = PyUnicode_GetDefaultEncoding();
4008  
4009      /* Encode via the codec registry */
4010      v = PyCodec_Encode(unicode, encoding, errors);
4011      if (v == NULL)
4012          goto onError;
4013      if (!PyUnicode_Check(v)) {
4014          PyErr_Format(PyExc_TypeError,
4015                       "'%.400s' encoder returned '%.400s' instead of 'str'; "
4016                       "use codecs.encode() to encode to arbitrary types",
4017                       encoding,
4018                       Py_TYPE(v)->tp_name);
4019          Py_DECREF(v);
4020          goto onError;
4021      }
4022      return v;
4023  
4024    onError:
4025      return NULL;
4026  }
4027  
4028  static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)4029  unicode_decode_locale(const char *str, Py_ssize_t len,
4030                        _Py_error_handler errors, int current_locale)
4031  {
4032      if (str[len] != '\0' || (size_t)len != strlen(str))  {
4033          PyErr_SetString(PyExc_ValueError, "embedded null byte");
4034          return NULL;
4035      }
4036  
4037      wchar_t *wstr;
4038      size_t wlen;
4039      const char *reason;
4040      int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4041                                   current_locale, errors);
4042      if (res != 0) {
4043          if (res == -2) {
4044              PyObject *exc;
4045              exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4046                                          "locale", str, len,
4047                                          (Py_ssize_t)wlen,
4048                                          (Py_ssize_t)(wlen + 1),
4049                                          reason);
4050              if (exc != NULL) {
4051                  PyCodec_StrictErrors(exc);
4052                  Py_DECREF(exc);
4053              }
4054          }
4055          else if (res == -3) {
4056              PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4057          }
4058          else {
4059              PyErr_NoMemory();
4060          }
4061          return NULL;
4062      }
4063  
4064      PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4065      PyMem_RawFree(wstr);
4066      return unicode;
4067  }
4068  
4069  PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4070  PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4071                                const char *errors)
4072  {
4073      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4074      return unicode_decode_locale(str, len, error_handler, 1);
4075  }
4076  
4077  PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4078  PyUnicode_DecodeLocale(const char *str, const char *errors)
4079  {
4080      Py_ssize_t size = (Py_ssize_t)strlen(str);
4081      _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4082      return unicode_decode_locale(str, size, error_handler, 1);
4083  }
4084  
4085  
4086  PyObject*
PyUnicode_DecodeFSDefault(const char * s)4087  PyUnicode_DecodeFSDefault(const char *s) {
4088      Py_ssize_t size = (Py_ssize_t)strlen(s);
4089      return PyUnicode_DecodeFSDefaultAndSize(s, size);
4090  }
4091  
4092  PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4093  PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4094  {
4095      PyInterpreterState *interp = _PyInterpreterState_GET();
4096      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4097      if (fs_codec->utf8) {
4098          return unicode_decode_utf8(s, size,
4099                                     fs_codec->error_handler,
4100                                     fs_codec->errors,
4101                                     NULL);
4102      }
4103  #ifndef _Py_FORCE_UTF8_FS_ENCODING
4104      else if (fs_codec->encoding) {
4105          return PyUnicode_Decode(s, size,
4106                                  fs_codec->encoding,
4107                                  fs_codec->errors);
4108      }
4109  #endif
4110      else {
4111          /* Before _PyUnicode_InitEncodings() is called, the Python codec
4112             machinery is not ready and so cannot be used:
4113             use mbstowcs() in this case. */
4114          const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4115          const wchar_t *filesystem_errors = config->filesystem_errors;
4116          assert(filesystem_errors != NULL);
4117          _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4118          assert(errors != _Py_ERROR_UNKNOWN);
4119  #ifdef _Py_FORCE_UTF8_FS_ENCODING
4120          return unicode_decode_utf8(s, size, errors, NULL, NULL);
4121  #else
4122          return unicode_decode_locale(s, size, errors, 0);
4123  #endif
4124      }
4125  }
4126  
4127  
4128  int
PyUnicode_FSConverter(PyObject * arg,void * addr)4129  PyUnicode_FSConverter(PyObject* arg, void* addr)
4130  {
4131      PyObject *path = NULL;
4132      PyObject *output = NULL;
4133      Py_ssize_t size;
4134      const char *data;
4135      if (arg == NULL) {
4136          Py_DECREF(*(PyObject**)addr);
4137          *(PyObject**)addr = NULL;
4138          return 1;
4139      }
4140      path = PyOS_FSPath(arg);
4141      if (path == NULL) {
4142          return 0;
4143      }
4144      if (PyBytes_Check(path)) {
4145          output = path;
4146      }
4147      else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4148          output = PyUnicode_EncodeFSDefault(path);
4149          Py_DECREF(path);
4150          if (!output) {
4151              return 0;
4152          }
4153          assert(PyBytes_Check(output));
4154      }
4155  
4156      size = PyBytes_GET_SIZE(output);
4157      data = PyBytes_AS_STRING(output);
4158      if ((size_t)size != strlen(data)) {
4159          PyErr_SetString(PyExc_ValueError, "embedded null byte");
4160          Py_DECREF(output);
4161          return 0;
4162      }
4163      *(PyObject**)addr = output;
4164      return Py_CLEANUP_SUPPORTED;
4165  }
4166  
4167  
4168  int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4169  PyUnicode_FSDecoder(PyObject* arg, void* addr)
4170  {
4171      int is_buffer = 0;
4172      PyObject *path = NULL;
4173      PyObject *output = NULL;
4174      if (arg == NULL) {
4175          Py_DECREF(*(PyObject**)addr);
4176          *(PyObject**)addr = NULL;
4177          return 1;
4178      }
4179  
4180      is_buffer = PyObject_CheckBuffer(arg);
4181      if (!is_buffer) {
4182          path = PyOS_FSPath(arg);
4183          if (path == NULL) {
4184              return 0;
4185          }
4186      }
4187      else {
4188          path = arg;
4189          Py_INCREF(arg);
4190      }
4191  
4192      if (PyUnicode_Check(path)) {
4193          output = path;
4194      }
4195      else if (PyBytes_Check(path) || is_buffer) {
4196          PyObject *path_bytes = NULL;
4197  
4198          if (!PyBytes_Check(path) &&
4199              PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4200              "path should be string, bytes, or os.PathLike, not %.200s",
4201              Py_TYPE(arg)->tp_name)) {
4202                  Py_DECREF(path);
4203              return 0;
4204          }
4205          path_bytes = PyBytes_FromObject(path);
4206          Py_DECREF(path);
4207          if (!path_bytes) {
4208              return 0;
4209          }
4210          output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4211                                                    PyBytes_GET_SIZE(path_bytes));
4212          Py_DECREF(path_bytes);
4213          if (!output) {
4214              return 0;
4215          }
4216      }
4217      else {
4218          PyErr_Format(PyExc_TypeError,
4219                       "path should be string, bytes, or os.PathLike, not %.200s",
4220                       Py_TYPE(arg)->tp_name);
4221          Py_DECREF(path);
4222          return 0;
4223      }
4224      if (PyUnicode_READY(output) == -1) {
4225          Py_DECREF(output);
4226          return 0;
4227      }
4228      if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4229                   PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4230          PyErr_SetString(PyExc_ValueError, "embedded null character");
4231          Py_DECREF(output);
4232          return 0;
4233      }
4234      *(PyObject**)addr = output;
4235      return Py_CLEANUP_SUPPORTED;
4236  }
4237  
4238  
4239  static int unicode_fill_utf8(PyObject *unicode);
4240  
4241  const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4242  PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4243  {
4244      if (!PyUnicode_Check(unicode)) {
4245          PyErr_BadArgument();
4246          return NULL;
4247      }
4248      if (PyUnicode_READY(unicode) == -1)
4249          return NULL;
4250  
4251      if (PyUnicode_UTF8(unicode) == NULL) {
4252          if (unicode_fill_utf8(unicode) == -1) {
4253              return NULL;
4254          }
4255      }
4256  
4257      if (psize)
4258          *psize = PyUnicode_UTF8_LENGTH(unicode);
4259      return PyUnicode_UTF8(unicode);
4260  }
4261  
4262  const char *
PyUnicode_AsUTF8(PyObject * unicode)4263  PyUnicode_AsUTF8(PyObject *unicode)
4264  {
4265      return PyUnicode_AsUTF8AndSize(unicode, NULL);
4266  }
4267  
4268  Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4269  PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4270  {
4271      if (!PyUnicode_Check(unicode)) {
4272          PyErr_BadArgument();
4273          return NULL;
4274      }
4275      Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4276      if (w == NULL) {
4277          /* Non-ASCII compact unicode object */
4278          assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4279          assert(PyUnicode_IS_READY(unicode));
4280  
4281          Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4282          if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4283              PyErr_NoMemory();
4284              return NULL;
4285          }
4286          w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4287          if (w == NULL) {
4288              PyErr_NoMemory();
4289              return NULL;
4290          }
4291          unicode_copy_as_widechar(unicode, w, wlen + 1);
4292          _PyUnicode_WSTR(unicode) = w;
4293          if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4294              _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4295          }
4296      }
4297      if (size != NULL)
4298          *size = PyUnicode_WSTR_LENGTH(unicode);
4299      return w;
4300  }
4301  
4302  /* Deprecated APIs */
4303  
4304  _Py_COMP_DIAG_PUSH
4305  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4306  
4307  Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4308  PyUnicode_AsUnicode(PyObject *unicode)
4309  {
4310      return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4311  }
4312  
4313  const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4314  _PyUnicode_AsUnicode(PyObject *unicode)
4315  {
4316      Py_ssize_t size;
4317      const Py_UNICODE *wstr;
4318  
4319      wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4320      if (wstr && wcslen(wstr) != (size_t)size) {
4321          PyErr_SetString(PyExc_ValueError, "embedded null character");
4322          return NULL;
4323      }
4324      return wstr;
4325  }
4326  
4327  
4328  Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4329  PyUnicode_GetSize(PyObject *unicode)
4330  {
4331      if (!PyUnicode_Check(unicode)) {
4332          PyErr_BadArgument();
4333          goto onError;
4334      }
4335      if (_PyUnicode_WSTR(unicode) == NULL) {
4336          if (PyUnicode_AsUnicode(unicode) == NULL)
4337              goto onError;
4338      }
4339      return PyUnicode_WSTR_LENGTH(unicode);
4340  
4341    onError:
4342      return -1;
4343  }
4344  
4345  _Py_COMP_DIAG_POP
4346  
4347  Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4348  PyUnicode_GetLength(PyObject *unicode)
4349  {
4350      if (!PyUnicode_Check(unicode)) {
4351          PyErr_BadArgument();
4352          return -1;
4353      }
4354      if (PyUnicode_READY(unicode) == -1)
4355          return -1;
4356      return PyUnicode_GET_LENGTH(unicode);
4357  }
4358  
4359  Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4360  PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4361  {
4362      const void *data;
4363      int kind;
4364  
4365      if (!PyUnicode_Check(unicode)) {
4366          PyErr_BadArgument();
4367          return (Py_UCS4)-1;
4368      }
4369      if (PyUnicode_READY(unicode) == -1) {
4370          return (Py_UCS4)-1;
4371      }
4372      if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4373          PyErr_SetString(PyExc_IndexError, "string index out of range");
4374          return (Py_UCS4)-1;
4375      }
4376      data = PyUnicode_DATA(unicode);
4377      kind = PyUnicode_KIND(unicode);
4378      return PyUnicode_READ(kind, data, index);
4379  }
4380  
4381  int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4382  PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4383  {
4384      if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4385          PyErr_BadArgument();
4386          return -1;
4387      }
4388      assert(PyUnicode_IS_READY(unicode));
4389      if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4390          PyErr_SetString(PyExc_IndexError, "string index out of range");
4391          return -1;
4392      }
4393      if (unicode_check_modifiable(unicode))
4394          return -1;
4395      if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4396          PyErr_SetString(PyExc_ValueError, "character out of range");
4397          return -1;
4398      }
4399      PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4400                      index, ch);
4401      return 0;
4402  }
4403  
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4409  
4410  /* create or adjust a UnicodeDecodeError */
4411  static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4412  make_decode_exception(PyObject **exceptionObject,
4413                        const char *encoding,
4414                        const char *input, Py_ssize_t length,
4415                        Py_ssize_t startpos, Py_ssize_t endpos,
4416                        const char *reason)
4417  {
4418      if (*exceptionObject == NULL) {
4419          *exceptionObject = PyUnicodeDecodeError_Create(
4420              encoding, input, length, startpos, endpos, reason);
4421      }
4422      else {
4423          if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4424              goto onError;
4425          if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4426              goto onError;
4427          if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4428              goto onError;
4429      }
4430      return;
4431  
4432  onError:
4433      Py_CLEAR(*exceptionObject);
4434  }
4435  
4436  #ifdef MS_WINDOWS
4437  static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4438  widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4439  {
4440      if (newsize > *size) {
4441          wchar_t *newbuf = *buf;
4442          if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4443              PyErr_NoMemory();
4444              return -1;
4445          }
4446          *buf = newbuf;
4447      }
4448      *size = newsize;
4449      return 0;
4450  }
4451  
4452  /* error handling callback helper:
4453     build arguments, call the callback and check the arguments,
4454     if no exception occurred, copy the replacement to the output
4455     and adjust various state variables.
4456     return 0 on success, -1 on error
4457  */
4458  
/* Invoke the decoding error handler and append its replacement text to a
   growable wchar_t output buffer.

   errors / errorHandler: name of the error handler and a cache for the
       callable; the callable is looked up with PyCodec_LookupError() when
       *errorHandler is still NULL.
   encoding, reason: passed on to make_decode_exception().
   input / inend: start and end of the input bytes; both are re-read from
       the exception object after the callback has run.
   startinpos / endinpos: offsets of the offending range; *endinpos is set
       to the resume position returned by the handler.
   exceptionObject: cached UnicodeDecodeError, created or updated here.
   inptr: set to the input position where decoding should resume.
   buf / bufsize / outpos: output buffer, its capacity and the current
       write position; the buffer is grown with widechar_resize().

   Returns 0 on success and -1 on error. */
static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format.  The text after the "Un;" prefix doubles
       as the TypeError message below (see &argparse[3]). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up once; later calls reuse *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    /* make_decode_exception() reports failure by leaving NULL behind. */
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the 3-character "Un;" format prefix to get the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative resume position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Determine the length of the replacement in wchar_t units. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    /* NOTE(review): presumably the size query above counts the trailing
       null, hence the decrement -- confirm against the API docs. */
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is guarded so that requiredsize cannot overflow. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current capacity when doubling
           cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Copy the replacement to the current output position.  The return
       value of this second call is not checked. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through into onError */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4570  #endif   /* MS_WINDOWS */
4571  
/* Invoke the decoding error handler and append its replacement string to
   a _PyUnicodeWriter.

   errors / errorHandler: name of the error handler and a cache for the
       callable; the callable is looked up with PyCodec_LookupError() when
       *errorHandler is still NULL.
   encoding, reason: passed on to make_decode_exception().
   input / inend: start and end of the input bytes; both are re-read from
       the exception object after the callback has run.
   startinpos / endinpos: offsets of the offending range; *endinpos is set
       to the resume position returned by the handler.
   exceptionObject: cached UnicodeDecodeError, created or updated here.
   inptr: set to the input position where decoding should resume.
   writer: receives the replacement; its min_length is raised when the
       replacement or the remaining input needs more room.

   Returns 0 on success and -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format.  The text after the "Un;" prefix doubles
       as the TypeError message below (see &argparse[3]). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once; later calls reuse *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    /* make_decode_exception() reports failure by leaving NULL behind. */
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* Skip the 3-character "Un;" format prefix to get the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that followed the offending range, measured
       on the input as it was before the callback could replace it. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative resume position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    /* Only a replacement longer than one character raises min_length. */
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input is left after the resume position than was left before
       the callback ran (the input was replaced or the position moved
       backwards). */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        /* Reserve the space still missing up to min_length, wide enough
           for the replacement's largest character. */
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4671  
4672  /* --- UTF-7 Codec -------------------------------------------------------- */
4673  
4674  /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4675  
4676  /* Three simple macros defining base-64. */
4677  
4678  /* Is c a base-64 character? */
4679  
/* True when c belongs to the base-64 alphabet: A-Z, a-z, 0-9, '+', '/'. */
#define IS_BASE64(c)                                    \
    ((c) == '+' || (c) == '/' ||                        \
     ('0' <= (c) && (c) <= '9') ||                      \
     ('A' <= (c) && (c) <= 'Z') ||                      \
     ('a' <= (c) && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value.  The result is only
   meaningful when IS_BASE64(c) holds: every character that is not a
   letter, a digit or '+' is treated as '/'. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4706  
4707  /* The UTF-7 encoder treats ASCII characters differently according to
4708   * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4709   * the above).  See RFC2152.  This array identifies these different
4710   * sets:
4711   * 0 : "Set D"
4712   *     alphanumeric and '(),-./:?
4713   * 1 : "Set O"
4714   *     !"#$%&*;<=>@[]^_`{|}
4715   * 2 : "whitespace"
4716   *     ht nl cr sp
4717   * 3 : special (must be base64 encoded)
4718   *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4719   */
4720  
/* Category of each ASCII code point, indexed by the code point itself.
   The table is only ever read, so it is declared const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in the range 1..127 to qualify; the range test comes first
 * so the table is never indexed out of bounds.  directO and directWS
 * are parenthesized so that any expression can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4752  
4753  PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4754  PyUnicode_DecodeUTF7(const char *s,
4755                       Py_ssize_t size,
4756                       const char *errors)
4757  {
4758      return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4759  }
4760  
4761  /* The decoder.  The only state we preserve is our read position,
4762   * i.e. how many characters we have consumed.  So if we end in the
4763   * middle of a shift sequence we have to back off the read position
4764   * and the output to the beginning of the sequence, otherwise we lose
4765   * all the shift state (seen bits, number of bits seen, high
4766   * surrogate). */
4767  
/* Decode `size` bytes of UTF-7 data starting at `s`.

   s, size:   the input bytes.
   errors:    error handler name, passed on to
              unicode_decode_call_errorhandler_writer().
   consumed:  when not NULL, receives the number of input bytes actually
              consumed; an unfinished shift sequence at the end of the
              input is then left unconsumed instead of being reported as
              an error.

   Returns the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                  /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;         /* writer.pos when the section began */
    unsigned int base64bits = 0;      /* number of pending bits in the buffer */
    unsigned long base64buffer = 0;   /* pending base-64 bits */
    Py_UCS4 surrogate = 0;            /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below has moved s back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                /* startinpos still holds the offset of the '+' that opened
                   this section, so the errors below cover the section from
                   its '+' up to and including the terminating byte. */
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written only when the
                   terminating byte decodes as itself; otherwise it is
                   dropped. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                /* '+' followed by something that is neither '-' nor a
                   base-64 character */
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* a byte above 127 outside of a base-64 section */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Common error exit: s already points past the offending byte.
           The handler may change starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler asked to resume before the end of the input:
               jump back into the decoding loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Back the read position off to the '+' that opened the
               unfinished shift sequence. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart characters
                   only, instead of finishing the writer.
                   NOTE(review): presumably needed because characters
                   written during the unfinished sequence may have widened
                   the writer beyond what the kept prefix requires --
                   confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4964  
4965  
/* Encode the str object `str` as UTF-7 and return a new bytes object.

   base64SetO:        when non-zero, "Set O" characters are base-64 encoded
                      instead of being written as themselves.
   base64WhiteSpace:  when non-zero, whitespace characters are base-64
                      encoded instead of being written as themselves.
   errors:            not referenced by this function.

   Returns NULL on error. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                  /* non-zero inside a base-64 section */
    Py_ssize_t i;
    unsigned int base64bits = 0;      /* number of bits not yet written out */
    unsigned long base64buffer = 0;   /* bit accumulator */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* The output buffer is sized at 8 bytes per input character and is
       trimmed to the bytes actually written at the end. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    /* left-align the remaining bits in a final 6-bit group */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a base-64 section and encode ch inside it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch to the bit accumulator as one 16-bit unit, or as a
           surrogate pair (two units) for ch >= 0x10000, writing out every
           complete 6-bit group. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush any left-over bits, then close a base-64 section that is
       still open with '-'. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* Trim the bytes object to the number of bytes written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
5067  PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)5068  PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5069                       Py_ssize_t size,
5070                       int base64SetO,
5071                       int base64WhiteSpace,
5072                       const char *errors)
5073  {
5074      PyObject *result;
5075      PyObject *tmp = PyUnicode_FromWideChar(s, size);
5076      if (tmp == NULL)
5077          return NULL;
5078      result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
5079                                     base64WhiteSpace, errors);
5080      Py_DECREF(tmp);
5081      return result;
5082  }
5083  
5084  #undef IS_BASE64
5085  #undef FROM_BASE64
5086  #undef TO_BASE64
5087  #undef DECODE_DIRECT
5088  #undef ENCODE_DIRECT
5089  
5090  /* --- UTF-8 Codec -------------------------------------------------------- */
5091  
5092  PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5093  PyUnicode_DecodeUTF8(const char *s,
5094                       Py_ssize_t size,
5095                       const char *errors)
5096  {
5097      return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5098  }
5099  
5100  #include "stringlib/asciilib.h"
5101  #include "stringlib/codecs.h"
5102  #include "stringlib/undef.h"
5103  
5104  #include "stringlib/ucs1lib.h"
5105  #include "stringlib/codecs.h"
5106  #include "stringlib/undef.h"
5107  
5108  #include "stringlib/ucs2lib.h"
5109  #include "stringlib/codecs.h"
5110  #include "stringlib/undef.h"
5111  
5112  #include "stringlib/ucs4lib.h"
5113  #include "stringlib/codecs.h"
5114  #include "stringlib/undef.h"
5115  
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.

   Every byte of the mask has only its top bit (0x80) set, so
   `value & ASCII_CHAR_MASK` is non-zero exactly when at least one byte of
   `value` is 0x80 or above.  The constant is chosen to match the width
   of size_t; any width other than 4 or 8 bytes is a compile-time error. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5125  
/* Copy the leading run of ASCII bytes (values below 0x80) from
   [start, end) into dest and return the length of that run.

   Copying stops at the first byte with its top bit set, or at `end`.
   dest must have room for end - start bytes in the worst case; when
   SIZEOF_SIZE_T <= SIZEOF_VOID_P it is also asserted to be aligned for
   size_t. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    /* When the source is aligned as well, check and copy one size_t word
       at a time, then finish the tail byte by byte. */
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            /* stop at the first word that contains a non-ASCII byte */
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* Remaining bytes: the partial last word, or the ASCII bytes that
           precede the non-ASCII byte in the word that stopped the loop. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* General path: first find the end of the ASCII run (word at a time
       whenever p is aligned), then copy the whole run with one memcpy(). */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            /* the word scan consumed the whole input */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
5179  
5180  static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5181  unicode_decode_utf8(const char *s, Py_ssize_t size,
5182                      _Py_error_handler error_handler, const char *errors,
5183                      Py_ssize_t *consumed)
5184  {
5185      if (size == 0) {
5186          if (consumed)
5187              *consumed = 0;
5188          _Py_RETURN_UNICODE_EMPTY();
5189      }
5190  
5191      /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5192      if (size == 1 && (unsigned char)s[0] < 128) {
5193          if (consumed) {
5194              *consumed = 1;
5195          }
5196          return get_latin1_char((unsigned char)s[0]);
5197      }
5198  
5199      const char *starts = s;
5200      const char *end = s + size;
5201  
5202      // fast path: try ASCII string.
5203      PyObject *u = PyUnicode_New(size, 127);
5204      if (u == NULL) {
5205          return NULL;
5206      }
5207      s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5208      if (s == end) {
5209          return u;
5210      }
5211  
5212      // Use _PyUnicodeWriter after fast path is failed.
5213      _PyUnicodeWriter writer;
5214      _PyUnicodeWriter_InitWithBuffer(&writer, u);
5215      writer.pos = s - starts;
5216  
5217      Py_ssize_t startinpos, endinpos;
5218      const char *errmsg = "";
5219      PyObject *error_handler_obj = NULL;
5220      PyObject *exc = NULL;
5221  
5222      while (s < end) {
5223          Py_UCS4 ch;
5224          int kind = writer.kind;
5225  
5226          if (kind == PyUnicode_1BYTE_KIND) {
5227              if (PyUnicode_IS_ASCII(writer.buffer))
5228                  ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5229              else
5230                  ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5231          } else if (kind == PyUnicode_2BYTE_KIND) {
5232              ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5233          } else {
5234              assert(kind == PyUnicode_4BYTE_KIND);
5235              ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5236          }
5237  
5238          switch (ch) {
5239          case 0:
5240              if (s == end || consumed)
5241                  goto End;
5242              errmsg = "unexpected end of data";
5243              startinpos = s - starts;
5244              endinpos = end - starts;
5245              break;
5246          case 1:
5247              errmsg = "invalid start byte";
5248              startinpos = s - starts;
5249              endinpos = startinpos + 1;
5250              break;
5251          case 2:
5252              if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5253                  && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5254              {
5255                  /* Truncated surrogate code in range D800-DFFF */
5256                  goto End;
5257              }
5258              /* fall through */
5259          case 3:
5260          case 4:
5261              errmsg = "invalid continuation byte";
5262              startinpos = s - starts;
5263              endinpos = startinpos + ch - 1;
5264              break;
5265          default:
5266              if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5267                  goto onError;
5268              continue;
5269          }
5270  
5271          if (error_handler == _Py_ERROR_UNKNOWN)
5272              error_handler = _Py_GetErrorHandler(errors);
5273  
5274          switch (error_handler) {
5275          case _Py_ERROR_IGNORE:
5276              s += (endinpos - startinpos);
5277              break;
5278  
5279          case _Py_ERROR_REPLACE:
5280              if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5281                  goto onError;
5282              s += (endinpos - startinpos);
5283              break;
5284  
5285          case _Py_ERROR_SURROGATEESCAPE:
5286          {
5287              Py_ssize_t i;
5288  
5289              if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5290                  goto onError;
5291              for (i=startinpos; i<endinpos; i++) {
5292                  ch = (Py_UCS4)(unsigned char)(starts[i]);
5293                  PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5294                                  ch + 0xdc00);
5295                  writer.pos++;
5296              }
5297              s += (endinpos - startinpos);
5298              break;
5299          }
5300  
5301          default:
5302              if (unicode_decode_call_errorhandler_writer(
5303                      errors, &error_handler_obj,
5304                      "utf-8", errmsg,
5305                      &starts, &end, &startinpos, &endinpos, &exc, &s,
5306                      &writer))
5307                  goto onError;
5308          }
5309      }
5310  
5311  End:
5312      if (consumed)
5313          *consumed = s - starts;
5314  
5315      Py_XDECREF(error_handler_obj);
5316      Py_XDECREF(exc);
5317      return _PyUnicodeWriter_Finish(&writer);
5318  
5319  onError:
5320      Py_XDECREF(error_handler_obj);
5321      Py_XDECREF(exc);
5322      _PyUnicodeWriter_Dealloc(&writer);
5323      return NULL;
5324  }
5325  
5326  
5327  PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5328  PyUnicode_DecodeUTF8Stateful(const char *s,
5329                               Py_ssize_t size,
5330                               const char *errors,
5331                               Py_ssize_t *consumed)
5332  {
5333      return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5334  }
5335  
5336  
5337  /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5338     non-zero, use strict error handler otherwise.
5339  
5340     On success, write a pointer to a newly allocated wide character string into
5341     *wstr (use PyMem_RawFree() to free the memory) and write the output length
5342     (in number of wchar_t units) into *wlen (if wlen is set).
5343  
5344     On memory allocation failure, return -1.
5345  
5346     On decoding error (if surrogateescape is zero), return -2. If wlen is
5347     non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5348     is not NULL, write the decoding error message into *reason. */
5349  int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5350  _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5351                   const char **reason, _Py_error_handler errors)
5352  {
5353      const char *orig_s = s;
5354      const char *e;
5355      wchar_t *unicode;
5356      Py_ssize_t outpos;
5357  
5358      int surrogateescape = 0;
5359      int surrogatepass = 0;
5360      switch (errors)
5361      {
5362      case _Py_ERROR_STRICT:
5363          break;
5364      case _Py_ERROR_SURROGATEESCAPE:
5365          surrogateescape = 1;
5366          break;
5367      case _Py_ERROR_SURROGATEPASS:
5368          surrogatepass = 1;
5369          break;
5370      default:
5371          return -3;
5372      }
5373  
5374      /* Note: size will always be longer than the resulting Unicode
5375         character count */
5376      if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5377          return -1;
5378      }
5379  
5380      unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5381      if (!unicode) {
5382          return -1;
5383      }
5384  
5385      /* Unpack UTF-8 encoded data */
5386      e = s + size;
5387      outpos = 0;
5388      while (s < e) {
5389          Py_UCS4 ch;
5390  #if SIZEOF_WCHAR_T == 4
5391          ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5392  #else
5393          ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5394  #endif
5395          if (ch > 0xFF) {
5396  #if SIZEOF_WCHAR_T == 4
5397              Py_UNREACHABLE();
5398  #else
5399              assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5400              /* write a surrogate pair */
5401              unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5402              unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5403  #endif
5404          }
5405          else {
5406              if (!ch && s == e) {
5407                  break;
5408              }
5409  
5410              if (surrogateescape) {
5411                  unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5412              }
5413              else {
5414                  /* Is it a valid three-byte code? */
5415                  if (surrogatepass
5416                      && (e - s) >= 3
5417                      && (s[0] & 0xf0) == 0xe0
5418                      && (s[1] & 0xc0) == 0x80
5419                      && (s[2] & 0xc0) == 0x80)
5420                  {
5421                      ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5422                      s += 3;
5423                      unicode[outpos++] = ch;
5424                  }
5425                  else {
5426                      PyMem_RawFree(unicode );
5427                      if (reason != NULL) {
5428                          switch (ch) {
5429                          case 0:
5430                              *reason = "unexpected end of data";
5431                              break;
5432                          case 1:
5433                              *reason = "invalid start byte";
5434                              break;
5435                          /* 2, 3, 4 */
5436                          default:
5437                              *reason = "invalid continuation byte";
5438                              break;
5439                          }
5440                      }
5441                      if (wlen != NULL) {
5442                          *wlen = s - orig_s;
5443                      }
5444                      return -2;
5445                  }
5446              }
5447          }
5448      }
5449      unicode[outpos] = L'\0';
5450      if (wlen) {
5451          *wlen = outpos;
5452      }
5453      *wstr = unicode;
5454      return 0;
5455  }
5456  
5457  
5458  wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5459  _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5460                                 size_t *wlen)
5461  {
5462      wchar_t *wstr;
5463      int res = _Py_DecodeUTF8Ex(arg, arglen,
5464                                 &wstr, wlen,
5465                                 NULL, _Py_ERROR_SURROGATEESCAPE);
5466      if (res != 0) {
5467          /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5468          assert(res != -3);
5469          if (wlen) {
5470              *wlen = (size_t)res;
5471          }
5472          return NULL;
5473      }
5474      return wstr;
5475  }
5476  
5477  
5478  /* UTF-8 encoder using the surrogateescape error handler .
5479  
5480     On success, return 0 and write the newly allocated character string (use
5481     PyMem_Free() to free the memory) into *str.
5482  
5483     On encoding failure, return -2 and write the position of the invalid
5484     surrogate character into *error_pos (if error_pos is set) and the decoding
5485     error message into *reason (if reason is set).
5486  
5487     On memory allocation failure, return -1. */
5488  int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5489  _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5490                   const char **reason, int raw_malloc, _Py_error_handler errors)
5491  {
5492      const Py_ssize_t max_char_size = 4;
5493      Py_ssize_t len = wcslen(text);
5494  
5495      assert(len >= 0);
5496  
5497      int surrogateescape = 0;
5498      int surrogatepass = 0;
5499      switch (errors)
5500      {
5501      case _Py_ERROR_STRICT:
5502          break;
5503      case _Py_ERROR_SURROGATEESCAPE:
5504          surrogateescape = 1;
5505          break;
5506      case _Py_ERROR_SURROGATEPASS:
5507          surrogatepass = 1;
5508          break;
5509      default:
5510          return -3;
5511      }
5512  
5513      if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5514          return -1;
5515      }
5516      char *bytes;
5517      if (raw_malloc) {
5518          bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5519      }
5520      else {
5521          bytes = PyMem_Malloc((len + 1) * max_char_size);
5522      }
5523      if (bytes == NULL) {
5524          return -1;
5525      }
5526  
5527      char *p = bytes;
5528      Py_ssize_t i;
5529      for (i = 0; i < len; ) {
5530          Py_ssize_t ch_pos = i;
5531          Py_UCS4 ch = text[i];
5532          i++;
5533  #if Py_UNICODE_SIZE == 2
5534          if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5535              && i < len
5536              && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5537          {
5538              ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5539              i++;
5540          }
5541  #endif
5542  
5543          if (ch < 0x80) {
5544              /* Encode ASCII */
5545              *p++ = (char) ch;
5546  
5547          }
5548          else if (ch < 0x0800) {
5549              /* Encode Latin-1 */
5550              *p++ = (char)(0xc0 | (ch >> 6));
5551              *p++ = (char)(0x80 | (ch & 0x3f));
5552          }
5553          else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5554              /* surrogateescape error handler */
5555              if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5556                  if (error_pos != NULL) {
5557                      *error_pos = (size_t)ch_pos;
5558                  }
5559                  if (reason != NULL) {
5560                      *reason = "encoding error";
5561                  }
5562                  if (raw_malloc) {
5563                      PyMem_RawFree(bytes);
5564                  }
5565                  else {
5566                      PyMem_Free(bytes);
5567                  }
5568                  return -2;
5569              }
5570              *p++ = (char)(ch & 0xff);
5571          }
5572          else if (ch < 0x10000) {
5573              *p++ = (char)(0xe0 | (ch >> 12));
5574              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5575              *p++ = (char)(0x80 | (ch & 0x3f));
5576          }
5577          else {  /* ch >= 0x10000 */
5578              assert(ch <= MAX_UNICODE);
5579              /* Encode UCS4 Unicode ordinals */
5580              *p++ = (char)(0xf0 | (ch >> 18));
5581              *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5582              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5583              *p++ = (char)(0x80 | (ch & 0x3f));
5584          }
5585      }
5586      *p++ = '\0';
5587  
5588      size_t final_size = (p - bytes);
5589      char *bytes2;
5590      if (raw_malloc) {
5591          bytes2 = PyMem_RawRealloc(bytes, final_size);
5592      }
5593      else {
5594          bytes2 = PyMem_Realloc(bytes, final_size);
5595      }
5596      if (bytes2 == NULL) {
5597          if (error_pos != NULL) {
5598              *error_pos = (size_t)-1;
5599          }
5600          if (raw_malloc) {
5601              PyMem_RawFree(bytes);
5602          }
5603          else {
5604              PyMem_Free(bytes);
5605          }
5606          return -1;
5607      }
5608      *str = bytes2;
5609      return 0;
5610  }
5611  
5612  
5613  /* Primary internal function which creates utf8 encoded bytes objects.
5614  
5615     Allocation strategy:  if the string is short, convert into a stack buffer
5616     and allocate exactly as much space needed at the end.  Else allocate the
5617     maximum possible needed (4 result bytes per Unicode character), and return
5618     the excess memory at the end.
5619  */
5620  static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5621  unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5622                      const char *errors)
5623  {
5624      if (!PyUnicode_Check(unicode)) {
5625          PyErr_BadArgument();
5626          return NULL;
5627      }
5628  
5629      if (PyUnicode_READY(unicode) == -1)
5630          return NULL;
5631  
5632      if (PyUnicode_UTF8(unicode))
5633          return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5634                                           PyUnicode_UTF8_LENGTH(unicode));
5635  
5636      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5637      const void *data = PyUnicode_DATA(unicode);
5638      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5639  
5640      _PyBytesWriter writer;
5641      char *end;
5642  
5643      switch (kind) {
5644      default:
5645          Py_UNREACHABLE();
5646      case PyUnicode_1BYTE_KIND:
5647          /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5648          assert(!PyUnicode_IS_ASCII(unicode));
5649          end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5650          break;
5651      case PyUnicode_2BYTE_KIND:
5652          end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5653          break;
5654      case PyUnicode_4BYTE_KIND:
5655          end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5656          break;
5657      }
5658  
5659      if (end == NULL) {
5660          _PyBytesWriter_Dealloc(&writer);
5661          return NULL;
5662      }
5663      return _PyBytesWriter_Finish(&writer, end);
5664  }
5665  
5666  static int
unicode_fill_utf8(PyObject * unicode)5667  unicode_fill_utf8(PyObject *unicode)
5668  {
5669      /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5670      assert(!PyUnicode_IS_ASCII(unicode));
5671  
5672      enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5673      const void *data = PyUnicode_DATA(unicode);
5674      Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5675  
5676      _PyBytesWriter writer;
5677      char *end;
5678  
5679      switch (kind) {
5680      default:
5681          Py_UNREACHABLE();
5682      case PyUnicode_1BYTE_KIND:
5683          end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5684                                     _Py_ERROR_STRICT, NULL);
5685          break;
5686      case PyUnicode_2BYTE_KIND:
5687          end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5688                                     _Py_ERROR_STRICT, NULL);
5689          break;
5690      case PyUnicode_4BYTE_KIND:
5691          end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5692                                     _Py_ERROR_STRICT, NULL);
5693          break;
5694      }
5695      if (end == NULL) {
5696          _PyBytesWriter_Dealloc(&writer);
5697          return -1;
5698      }
5699  
5700      const char *start = writer.use_small_buffer ? writer.small_buffer :
5701                      PyBytes_AS_STRING(writer.buffer);
5702      Py_ssize_t len = end - start;
5703  
5704      char *cache = PyObject_Malloc(len + 1);
5705      if (cache == NULL) {
5706          _PyBytesWriter_Dealloc(&writer);
5707          PyErr_NoMemory();
5708          return -1;
5709      }
5710      _PyUnicode_UTF8(unicode) = cache;
5711      _PyUnicode_UTF8_LENGTH(unicode) = len;
5712      memcpy(cache, start, len);
5713      cache[len] = '\0';
5714      _PyBytesWriter_Dealloc(&writer);
5715      return 0;
5716  }
5717  
5718  PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5719  _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5720  {
5721      return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5722  }
5723  
5724  
5725  PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5726  PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5727                       Py_ssize_t size,
5728                       const char *errors)
5729  {
5730      PyObject *v, *unicode;
5731  
5732      unicode = PyUnicode_FromWideChar(s, size);
5733      if (unicode == NULL)
5734          return NULL;
5735      v = _PyUnicode_AsUTF8String(unicode, errors);
5736      Py_DECREF(unicode);
5737      return v;
5738  }
5739  
5740  PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5741  PyUnicode_AsUTF8String(PyObject *unicode)
5742  {
5743      return _PyUnicode_AsUTF8String(unicode, NULL);
5744  }
5745  
/* --- UTF-32 Codec ------------------------------------------------------- */
5747  
5748  PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5749  PyUnicode_DecodeUTF32(const char *s,
5750                        Py_ssize_t size,
5751                        const char *errors,
5752                        int *byteorder)
5753  {
5754      return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5755  }
5756  
5757  PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5758  PyUnicode_DecodeUTF32Stateful(const char *s,
5759                                Py_ssize_t size,
5760                                const char *errors,
5761                                int *byteorder,
5762                                Py_ssize_t *consumed)
5763  {
5764      const char *starts = s;
5765      Py_ssize_t startinpos;
5766      Py_ssize_t endinpos;
5767      _PyUnicodeWriter writer;
5768      const unsigned char *q, *e;
5769      int le, bo = 0;       /* assume native ordering by default */
5770      const char *encoding;
5771      const char *errmsg = "";
5772      PyObject *errorHandler = NULL;
5773      PyObject *exc = NULL;
5774  
5775      q = (const unsigned char *)s;
5776      e = q + size;
5777  
5778      if (byteorder)
5779          bo = *byteorder;
5780  
5781      /* Check for BOM marks (U+FEFF) in the input and adjust current
5782         byte order setting accordingly. In native mode, the leading BOM
5783         mark is skipped, in all other modes, it is copied to the output
5784         stream as-is (giving a ZWNBSP character). */
5785      if (bo == 0 && size >= 4) {
5786          Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5787          if (bom == 0x0000FEFF) {
5788              bo = -1;
5789              q += 4;
5790          }
5791          else if (bom == 0xFFFE0000) {
5792              bo = 1;
5793              q += 4;
5794          }
5795          if (byteorder)
5796              *byteorder = bo;
5797      }
5798  
5799      if (q == e) {
5800          if (consumed)
5801              *consumed = size;
5802          _Py_RETURN_UNICODE_EMPTY();
5803      }
5804  
5805  #ifdef WORDS_BIGENDIAN
5806      le = bo < 0;
5807  #else
5808      le = bo <= 0;
5809  #endif
5810      encoding = le ? "utf-32-le" : "utf-32-be";
5811  
5812      _PyUnicodeWriter_Init(&writer);
5813      writer.min_length = (e - q + 3) / 4;
5814      if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5815          goto onError;
5816  
5817      while (1) {
5818          Py_UCS4 ch = 0;
5819          Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5820  
5821          if (e - q >= 4) {
5822              enum PyUnicode_Kind kind = writer.kind;
5823              void *data = writer.data;
5824              const unsigned char *last = e - 4;
5825              Py_ssize_t pos = writer.pos;
5826              if (le) {
5827                  do {
5828                      ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5829                      if (ch > maxch)
5830                          break;
5831                      if (kind != PyUnicode_1BYTE_KIND &&
5832                          Py_UNICODE_IS_SURROGATE(ch))
5833                          break;
5834                      PyUnicode_WRITE(kind, data, pos++, ch);
5835                      q += 4;
5836                  } while (q <= last);
5837              }
5838              else {
5839                  do {
5840                      ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5841                      if (ch > maxch)
5842                          break;
5843                      if (kind != PyUnicode_1BYTE_KIND &&
5844                          Py_UNICODE_IS_SURROGATE(ch))
5845                          break;
5846                      PyUnicode_WRITE(kind, data, pos++, ch);
5847                      q += 4;
5848                  } while (q <= last);
5849              }
5850              writer.pos = pos;
5851          }
5852  
5853          if (Py_UNICODE_IS_SURROGATE(ch)) {
5854              errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5855              startinpos = ((const char *)q) - starts;
5856              endinpos = startinpos + 4;
5857          }
5858          else if (ch <= maxch) {
5859              if (q == e || consumed)
5860                  break;
5861              /* remaining bytes at the end? (size should be divisible by 4) */
5862              errmsg = "truncated data";
5863              startinpos = ((const char *)q) - starts;
5864              endinpos = ((const char *)e) - starts;
5865          }
5866          else {
5867              if (ch < 0x110000) {
5868                  if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5869                      goto onError;
5870                  q += 4;
5871                  continue;
5872              }
5873              errmsg = "code point not in range(0x110000)";
5874              startinpos = ((const char *)q) - starts;
5875              endinpos = startinpos + 4;
5876          }
5877  
5878          /* The remaining input chars are ignored if the callback
5879             chooses to skip the input */
5880          if (unicode_decode_call_errorhandler_writer(
5881                  errors, &errorHandler,
5882                  encoding, errmsg,
5883                  &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5884                  &writer))
5885              goto onError;
5886      }
5887  
5888      if (consumed)
5889          *consumed = (const char *)q-starts;
5890  
5891      Py_XDECREF(errorHandler);
5892      Py_XDECREF(exc);
5893      return _PyUnicodeWriter_Finish(&writer);
5894  
5895    onError:
5896      _PyUnicodeWriter_Dealloc(&writer);
5897      Py_XDECREF(errorHandler);
5898      Py_XDECREF(exc);
5899      return NULL;
5900  }
5901  
5902  PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5903  _PyUnicode_EncodeUTF32(PyObject *str,
5904                         const char *errors,
5905                         int byteorder)
5906  {
5907      enum PyUnicode_Kind kind;
5908      const void *data;
5909      Py_ssize_t len;
5910      PyObject *v;
5911      uint32_t *out;
5912  #if PY_LITTLE_ENDIAN
5913      int native_ordering = byteorder <= 0;
5914  #else
5915      int native_ordering = byteorder >= 0;
5916  #endif
5917      const char *encoding;
5918      Py_ssize_t nsize, pos;
5919      PyObject *errorHandler = NULL;
5920      PyObject *exc = NULL;
5921      PyObject *rep = NULL;
5922  
5923      if (!PyUnicode_Check(str)) {
5924          PyErr_BadArgument();
5925          return NULL;
5926      }
5927      if (PyUnicode_READY(str) == -1)
5928          return NULL;
5929      kind = PyUnicode_KIND(str);
5930      data = PyUnicode_DATA(str);
5931      len = PyUnicode_GET_LENGTH(str);
5932  
5933      if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5934          return PyErr_NoMemory();
5935      nsize = len + (byteorder == 0);
5936      v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5937      if (v == NULL)
5938          return NULL;
5939  
5940      /* output buffer is 4-bytes aligned */
5941      assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5942      out = (uint32_t *)PyBytes_AS_STRING(v);
5943      if (byteorder == 0)
5944          *out++ = 0xFEFF;
5945      if (len == 0)
5946          goto done;
5947  
5948      if (byteorder == -1)
5949          encoding = "utf-32-le";
5950      else if (byteorder == 1)
5951          encoding = "utf-32-be";
5952      else
5953          encoding = "utf-32";
5954  
5955      if (kind == PyUnicode_1BYTE_KIND) {
5956          ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5957          goto done;
5958      }
5959  
5960      pos = 0;
5961      while (pos < len) {
5962          Py_ssize_t repsize, moreunits;
5963  
5964          if (kind == PyUnicode_2BYTE_KIND) {
5965              pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5966                                          &out, native_ordering);
5967          }
5968          else {
5969              assert(kind == PyUnicode_4BYTE_KIND);
5970              pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5971                                          &out, native_ordering);
5972          }
5973          if (pos == len)
5974              break;
5975  
5976          rep = unicode_encode_call_errorhandler(
5977                  errors, &errorHandler,
5978                  encoding, "surrogates not allowed",
5979                  str, &exc, pos, pos + 1, &pos);
5980          if (!rep)
5981              goto error;
5982  
5983          if (PyBytes_Check(rep)) {
5984              repsize = PyBytes_GET_SIZE(rep);
5985              if (repsize & 3) {
5986                  raise_encode_exception(&exc, encoding,
5987                                         str, pos - 1, pos,
5988                                         "surrogates not allowed");
5989                  goto error;
5990              }
5991              moreunits = repsize / 4;
5992          }
5993          else {
5994              assert(PyUnicode_Check(rep));
5995              if (PyUnicode_READY(rep) < 0)
5996                  goto error;
5997              moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5998              if (!PyUnicode_IS_ASCII(rep)) {
5999                  raise_encode_exception(&exc, encoding,
6000                                         str, pos - 1, pos,
6001                                         "surrogates not allowed");
6002                  goto error;
6003              }
6004          }
6005  
6006          /* four bytes are reserved for each surrogate */
6007          if (moreunits > 1) {
6008              Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6009              if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
6010                  /* integer overflow */
6011                  PyErr_NoMemory();
6012                  goto error;
6013              }
6014              if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
6015                  goto error;
6016              out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
6017          }
6018  
6019          if (PyBytes_Check(rep)) {
6020              memcpy(out, PyBytes_AS_STRING(rep), repsize);
6021              out += moreunits;
6022          } else /* rep is unicode */ {
6023              assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6024              ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6025                                   &out, native_ordering);
6026          }
6027  
6028          Py_CLEAR(rep);
6029      }
6030  
6031      /* Cut back to size actually needed. This is necessary for, for example,
6032         encoding of a string containing isolated surrogates and the 'ignore'
6033         handler is used. */
6034      nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6035      if (nsize != PyBytes_GET_SIZE(v))
6036        _PyBytes_Resize(&v, nsize);
6037      Py_XDECREF(errorHandler);
6038      Py_XDECREF(exc);
6039    done:
6040      return v;
6041    error:
6042      Py_XDECREF(rep);
6043      Py_XDECREF(errorHandler);
6044      Py_XDECREF(exc);
6045      Py_XDECREF(v);
6046      return NULL;
6047  }
6048  
6049  PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6050  PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6051                        Py_ssize_t size,
6052                        const char *errors,
6053                        int byteorder)
6054  {
6055      PyObject *result;
6056      PyObject *tmp = PyUnicode_FromWideChar(s, size);
6057      if (tmp == NULL)
6058          return NULL;
6059      result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6060      Py_DECREF(tmp);
6061      return result;
6062  }
6063  
6064  PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)6065  PyUnicode_AsUTF32String(PyObject *unicode)
6066  {
6067      return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6068  }
6069  
6070  /* --- UTF-16 Codec ------------------------------------------------------- */
6071  
6072  PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)6073  PyUnicode_DecodeUTF16(const char *s,
6074                        Py_ssize_t size,
6075                        const char *errors,
6076                        int *byteorder)
6077  {
6078      return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6079  }
6080  
6081  PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)6082  PyUnicode_DecodeUTF16Stateful(const char *s,
6083                                Py_ssize_t size,
6084                                const char *errors,
6085                                int *byteorder,
6086                                Py_ssize_t *consumed)
6087  {
6088      const char *starts = s;
6089      Py_ssize_t startinpos;
6090      Py_ssize_t endinpos;
6091      _PyUnicodeWriter writer;
6092      const unsigned char *q, *e;
6093      int bo = 0;       /* assume native ordering by default */
6094      int native_ordering;
6095      const char *errmsg = "";
6096      PyObject *errorHandler = NULL;
6097      PyObject *exc = NULL;
6098      const char *encoding;
6099  
6100      q = (const unsigned char *)s;
6101      e = q + size;
6102  
6103      if (byteorder)
6104          bo = *byteorder;
6105  
6106      /* Check for BOM marks (U+FEFF) in the input and adjust current
6107         byte order setting accordingly. In native mode, the leading BOM
6108         mark is skipped, in all other modes, it is copied to the output
6109         stream as-is (giving a ZWNBSP character). */
6110      if (bo == 0 && size >= 2) {
6111          const Py_UCS4 bom = (q[1] << 8) | q[0];
6112          if (bom == 0xFEFF) {
6113              q += 2;
6114              bo = -1;
6115          }
6116          else if (bom == 0xFFFE) {
6117              q += 2;
6118              bo = 1;
6119          }
6120          if (byteorder)
6121              *byteorder = bo;
6122      }
6123  
6124      if (q == e) {
6125          if (consumed)
6126              *consumed = size;
6127          _Py_RETURN_UNICODE_EMPTY();
6128      }
6129  
6130  #if PY_LITTLE_ENDIAN
6131      native_ordering = bo <= 0;
6132      encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6133  #else
6134      native_ordering = bo >= 0;
6135      encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6136  #endif
6137  
6138      /* Note: size will always be longer than the resulting Unicode
6139         character count normally.  Error handler will take care of
6140         resizing when needed. */
6141      _PyUnicodeWriter_Init(&writer);
6142      writer.min_length = (e - q + 1) / 2;
6143      if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6144          goto onError;
6145  
6146      while (1) {
6147          Py_UCS4 ch = 0;
6148          if (e - q >= 2) {
6149              int kind = writer.kind;
6150              if (kind == PyUnicode_1BYTE_KIND) {
6151                  if (PyUnicode_IS_ASCII(writer.buffer))
6152                      ch = asciilib_utf16_decode(&q, e,
6153                              (Py_UCS1*)writer.data, &writer.pos,
6154                              native_ordering);
6155                  else
6156                      ch = ucs1lib_utf16_decode(&q, e,
6157                              (Py_UCS1*)writer.data, &writer.pos,
6158                              native_ordering);
6159              } else if (kind == PyUnicode_2BYTE_KIND) {
6160                  ch = ucs2lib_utf16_decode(&q, e,
6161                          (Py_UCS2*)writer.data, &writer.pos,
6162                          native_ordering);
6163              } else {
6164                  assert(kind == PyUnicode_4BYTE_KIND);
6165                  ch = ucs4lib_utf16_decode(&q, e,
6166                          (Py_UCS4*)writer.data, &writer.pos,
6167                          native_ordering);
6168              }
6169          }
6170  
6171          switch (ch)
6172          {
6173          case 0:
6174              /* remaining byte at the end? (size should be even) */
6175              if (q == e || consumed)
6176                  goto End;
6177              errmsg = "truncated data";
6178              startinpos = ((const char *)q) - starts;
6179              endinpos = ((const char *)e) - starts;
6180              break;
6181              /* The remaining input chars are ignored if the callback
6182                 chooses to skip the input */
6183          case 1:
6184              q -= 2;
6185              if (consumed)
6186                  goto End;
6187              errmsg = "unexpected end of data";
6188              startinpos = ((const char *)q) - starts;
6189              endinpos = ((const char *)e) - starts;
6190              break;
6191          case 2:
6192              errmsg = "illegal encoding";
6193              startinpos = ((const char *)q) - 2 - starts;
6194              endinpos = startinpos + 2;
6195              break;
6196          case 3:
6197              errmsg = "illegal UTF-16 surrogate";
6198              startinpos = ((const char *)q) - 4 - starts;
6199              endinpos = startinpos + 2;
6200              break;
6201          default:
6202              if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6203                  goto onError;
6204              continue;
6205          }
6206  
6207          if (unicode_decode_call_errorhandler_writer(
6208                  errors,
6209                  &errorHandler,
6210                  encoding, errmsg,
6211                  &starts,
6212                  (const char **)&e,
6213                  &startinpos,
6214                  &endinpos,
6215                  &exc,
6216                  (const char **)&q,
6217                  &writer))
6218              goto onError;
6219      }
6220  
6221  End:
6222      if (consumed)
6223          *consumed = (const char *)q-starts;
6224  
6225      Py_XDECREF(errorHandler);
6226      Py_XDECREF(exc);
6227      return _PyUnicodeWriter_Finish(&writer);
6228  
6229    onError:
6230      _PyUnicodeWriter_Dealloc(&writer);
6231      Py_XDECREF(errorHandler);
6232      Py_XDECREF(exc);
6233      return NULL;
6234  }
6235  
6236  PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6237  _PyUnicode_EncodeUTF16(PyObject *str,
6238                         const char *errors,
6239                         int byteorder)
6240  {
6241      enum PyUnicode_Kind kind;
6242      const void *data;
6243      Py_ssize_t len;
6244      PyObject *v;
6245      unsigned short *out;
6246      Py_ssize_t pairs;
6247  #if PY_BIG_ENDIAN
6248      int native_ordering = byteorder >= 0;
6249  #else
6250      int native_ordering = byteorder <= 0;
6251  #endif
6252      const char *encoding;
6253      Py_ssize_t nsize, pos;
6254      PyObject *errorHandler = NULL;
6255      PyObject *exc = NULL;
6256      PyObject *rep = NULL;
6257  
6258      if (!PyUnicode_Check(str)) {
6259          PyErr_BadArgument();
6260          return NULL;
6261      }
6262      if (PyUnicode_READY(str) == -1)
6263          return NULL;
6264      kind = PyUnicode_KIND(str);
6265      data = PyUnicode_DATA(str);
6266      len = PyUnicode_GET_LENGTH(str);
6267  
6268      pairs = 0;
6269      if (kind == PyUnicode_4BYTE_KIND) {
6270          const Py_UCS4 *in = (const Py_UCS4 *)data;
6271          const Py_UCS4 *end = in + len;
6272          while (in < end) {
6273              if (*in++ >= 0x10000) {
6274                  pairs++;
6275              }
6276          }
6277      }
6278      if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6279          return PyErr_NoMemory();
6280      }
6281      nsize = len + pairs + (byteorder == 0);
6282      v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6283      if (v == NULL) {
6284          return NULL;
6285      }
6286  
6287      /* output buffer is 2-bytes aligned */
6288      assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6289      out = (unsigned short *)PyBytes_AS_STRING(v);
6290      if (byteorder == 0) {
6291          *out++ = 0xFEFF;
6292      }
6293      if (len == 0) {
6294          goto done;
6295      }
6296  
6297      if (kind == PyUnicode_1BYTE_KIND) {
6298          ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6299          goto done;
6300      }
6301  
6302      if (byteorder < 0) {
6303          encoding = "utf-16-le";
6304      }
6305      else if (byteorder > 0) {
6306          encoding = "utf-16-be";
6307      }
6308      else {
6309          encoding = "utf-16";
6310      }
6311  
6312      pos = 0;
6313      while (pos < len) {
6314          Py_ssize_t repsize, moreunits;
6315  
6316          if (kind == PyUnicode_2BYTE_KIND) {
6317              pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6318                                          &out, native_ordering);
6319          }
6320          else {
6321              assert(kind == PyUnicode_4BYTE_KIND);
6322              pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6323                                          &out, native_ordering);
6324          }
6325          if (pos == len)
6326              break;
6327  
6328          rep = unicode_encode_call_errorhandler(
6329                  errors, &errorHandler,
6330                  encoding, "surrogates not allowed",
6331                  str, &exc, pos, pos + 1, &pos);
6332          if (!rep)
6333              goto error;
6334  
6335          if (PyBytes_Check(rep)) {
6336              repsize = PyBytes_GET_SIZE(rep);
6337              if (repsize & 1) {
6338                  raise_encode_exception(&exc, encoding,
6339                                         str, pos - 1, pos,
6340                                         "surrogates not allowed");
6341                  goto error;
6342              }
6343              moreunits = repsize / 2;
6344          }
6345          else {
6346              assert(PyUnicode_Check(rep));
6347              if (PyUnicode_READY(rep) < 0)
6348                  goto error;
6349              moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6350              if (!PyUnicode_IS_ASCII(rep)) {
6351                  raise_encode_exception(&exc, encoding,
6352                                         str, pos - 1, pos,
6353                                         "surrogates not allowed");
6354                  goto error;
6355              }
6356          }
6357  
6358          /* two bytes are reserved for each surrogate */
6359          if (moreunits > 1) {
6360              Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6361              if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6362                  /* integer overflow */
6363                  PyErr_NoMemory();
6364                  goto error;
6365              }
6366              if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6367                  goto error;
6368              out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6369          }
6370  
6371          if (PyBytes_Check(rep)) {
6372              memcpy(out, PyBytes_AS_STRING(rep), repsize);
6373              out += moreunits;
6374          } else /* rep is unicode */ {
6375              assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6376              ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6377                                   &out, native_ordering);
6378          }
6379  
6380          Py_CLEAR(rep);
6381      }
6382  
6383      /* Cut back to size actually needed. This is necessary for, for example,
6384      encoding of a string containing isolated surrogates and the 'ignore' handler
6385      is used. */
6386      nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6387      if (nsize != PyBytes_GET_SIZE(v))
6388        _PyBytes_Resize(&v, nsize);
6389      Py_XDECREF(errorHandler);
6390      Py_XDECREF(exc);
6391    done:
6392      return v;
6393    error:
6394      Py_XDECREF(rep);
6395      Py_XDECREF(errorHandler);
6396      Py_XDECREF(exc);
6397      Py_XDECREF(v);
6398      return NULL;
6399  #undef STORECHAR
6400  }
6401  
6402  PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6403  PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6404                        Py_ssize_t size,
6405                        const char *errors,
6406                        int byteorder)
6407  {
6408      PyObject *result;
6409      PyObject *tmp = PyUnicode_FromWideChar(s, size);
6410      if (tmp == NULL)
6411          return NULL;
6412      result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6413      Py_DECREF(tmp);
6414      return result;
6415  }
6416  
6417  PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6418  PyUnicode_AsUTF16String(PyObject *unicode)
6419  {
6420      return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6421  }
6422  
6423  /* --- Unicode Escape Codec ----------------------------------------------- */
6424  
/* Character-name lookup API exported by the unicodedata module as a
   capsule.  It starts out NULL and is imported on demand the first time
   a \N{...} escape has to be resolved. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6426  
/* Decode `size` bytes of "unicode-escape" encoded data starting at `s`.

   errors                Name of the error handler, passed on to
                         unicode_decode_call_errorhandler_writer().
   consumed              May be NULL.  When non-NULL, an escape sequence
                         that is cut off by the end of the input is not an
                         error: decoding stops and *consumed receives the
                         offset of the backslash that starts it.  Apart
                         from that case and the empty-input case (which
                         stores 0), *consumed is NOT written, so callers
                         must initialise it themselves.
   first_invalid_escape  Must not be NULL.  Set to NULL on entry and then
                         to the address of the character following the
                         backslash of the first unrecognised escape, if
                         any.  Such escapes are copied to the output
                         unchanged (backslash included).

   Returns a new str object, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               Py_ssize_t *consumed,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Store an ASCII character directly into the preallocated writer buffer.
   No resizing or widening is attempted; the asserts document that room
   is expected to be available. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store an arbitrary character: directly when it fits the writer's current
   maxchar, otherwise through _PyUnicodeWriter_WriteCharInline(), jumping
   to onError if that fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash; used for error reporting and as the
           resume point stored in *consumed for incomplete escapes. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto incomplete;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* \x escapes */
        /* backslash followed by a newline produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        /* one to three octal digits; the first one is already in c */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits into ch.  Running out
               of input is "incomplete"; a non-hex character is an error
               reported with the "truncated ..." message set above. */
            for (ch = 0; count; ++s, --count) {
                if (s >= end) {
                    goto incomplete;
                }
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    goto error;
                }
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_capi == NULL) {
                /* load the unicode data module */
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_capi == NULL) {
                    /* Replaces whatever exception the import raised. */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s >= end) {
                goto incomplete;
            }
            if (*s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                if (s >= end) {
                    goto incomplete;
                }
                namelen = s - start;
                /* an empty name ("\N{}") keeps the "malformed" message */
                if (namelen) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: remember the first one for the caller
               and copy backslash + character through unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

        /* The input ended inside an escape sequence.  In incremental mode
           (consumed != NULL) report where the escape started and stop;
           otherwise fall through and treat it as an ordinary error. */
      incomplete:
        if (consumed) {
            *consumed = startinpos;
            break;
        }
        /* Hand the range [startinpos, s) to the error handler.  The empty
           statement after the label is needed because a declaration
           follows directly. */
      error:;
        Py_ssize_t endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6664  
6665  PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6666  _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6667                                Py_ssize_t size,
6668                                const char *errors,
6669                                Py_ssize_t *consumed)
6670  {
6671      const char *first_invalid_escape;
6672      PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6673                                                        consumed,
6674                                                        &first_invalid_escape);
6675      if (result == NULL)
6676          return NULL;
6677      if (first_invalid_escape != NULL) {
6678          if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6679                               "invalid escape sequence '\\%c'",
6680                               (unsigned char)*first_invalid_escape) < 0) {
6681              Py_DECREF(result);
6682              return NULL;
6683          }
6684      }
6685      return result;
6686  }
6687  
6688  PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6689  PyUnicode_DecodeUnicodeEscape(const char *s,
6690                                Py_ssize_t size,
6691                                const char *errors)
6692  {
6693      return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6694  }
6695  
6696  /* Return a Unicode-Escape string version of the Unicode object. */
6697  
6698  PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6699  PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6700  {
6701      Py_ssize_t i, len;
6702      PyObject *repr;
6703      char *p;
6704      enum PyUnicode_Kind kind;
6705      const void *data;
6706      Py_ssize_t expandsize;
6707  
6708      /* Initial allocation is based on the longest-possible character
6709         escape.
6710  
6711         For UCS1 strings it's '\xxx', 4 bytes per source character.
6712         For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6713         For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6714      */
6715  
6716      if (!PyUnicode_Check(unicode)) {
6717          PyErr_BadArgument();
6718          return NULL;
6719      }
6720      if (PyUnicode_READY(unicode) == -1) {
6721          return NULL;
6722      }
6723  
6724      len = PyUnicode_GET_LENGTH(unicode);
6725      if (len == 0) {
6726          return PyBytes_FromStringAndSize(NULL, 0);
6727      }
6728  
6729      kind = PyUnicode_KIND(unicode);
6730      data = PyUnicode_DATA(unicode);
6731      /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6732         bytes, and 1 byte characters 4. */
6733      expandsize = kind * 2 + 2;
6734      if (len > PY_SSIZE_T_MAX / expandsize) {
6735          return PyErr_NoMemory();
6736      }
6737      repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6738      if (repr == NULL) {
6739          return NULL;
6740      }
6741  
6742      p = PyBytes_AS_STRING(repr);
6743      for (i = 0; i < len; i++) {
6744          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6745  
6746          /* U+0000-U+00ff range */
6747          if (ch < 0x100) {
6748              if (ch >= ' ' && ch < 127) {
6749                  if (ch != '\\') {
6750                      /* Copy printable US ASCII as-is */
6751                      *p++ = (char) ch;
6752                  }
6753                  /* Escape backslashes */
6754                  else {
6755                      *p++ = '\\';
6756                      *p++ = '\\';
6757                  }
6758              }
6759  
6760              /* Map special whitespace to '\t', \n', '\r' */
6761              else if (ch == '\t') {
6762                  *p++ = '\\';
6763                  *p++ = 't';
6764              }
6765              else if (ch == '\n') {
6766                  *p++ = '\\';
6767                  *p++ = 'n';
6768              }
6769              else if (ch == '\r') {
6770                  *p++ = '\\';
6771                  *p++ = 'r';
6772              }
6773  
6774              /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6775              else {
6776                  *p++ = '\\';
6777                  *p++ = 'x';
6778                  *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6779                  *p++ = Py_hexdigits[ch & 0x000F];
6780              }
6781          }
6782          /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6783          else if (ch < 0x10000) {
6784              *p++ = '\\';
6785              *p++ = 'u';
6786              *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6787              *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6788              *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6789              *p++ = Py_hexdigits[ch & 0x000F];
6790          }
6791          /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6792          else {
6793  
6794              /* Make sure that the first two digits are zero */
6795              assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6796              *p++ = '\\';
6797              *p++ = 'U';
6798              *p++ = '0';
6799              *p++ = '0';
6800              *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6801              *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6802              *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6803              *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6804              *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6805              *p++ = Py_hexdigits[ch & 0x0000000F];
6806          }
6807      }
6808  
6809      assert(p - PyBytes_AS_STRING(repr) > 0);
6810      if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6811          return NULL;
6812      }
6813      return repr;
6814  }
6815  
6816  PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6817  PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6818                                Py_ssize_t size)
6819  {
6820      PyObject *result;
6821      PyObject *tmp = PyUnicode_FromWideChar(s, size);
6822      if (tmp == NULL) {
6823          return NULL;
6824      }
6825  
6826      result = PyUnicode_AsUnicodeEscapeString(tmp);
6827      Py_DECREF(tmp);
6828      return result;
6829  }
6830  
6831  /* --- Raw Unicode Escape Codec ------------------------------------------- */
6832  
6833  PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6834  _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6835                                            Py_ssize_t size,
6836                                            const char *errors,
6837                                            Py_ssize_t *consumed)
6838  {
6839      const char *starts = s;
6840      _PyUnicodeWriter writer;
6841      const char *end;
6842      PyObject *errorHandler = NULL;
6843      PyObject *exc = NULL;
6844  
6845      if (size == 0) {
6846          if (consumed) {
6847              *consumed = 0;
6848          }
6849          _Py_RETURN_UNICODE_EMPTY();
6850      }
6851  
6852      /* Escaped strings will always be longer than the resulting
6853         Unicode string, so we start with size here and then reduce the
6854         length after conversion to the true value. (But decoding error
6855         handler might have to resize the string) */
6856      _PyUnicodeWriter_Init(&writer);
6857      writer.min_length = size;
6858      if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6859          goto onError;
6860      }
6861  
6862      end = s + size;
6863      while (s < end) {
6864          unsigned char c = (unsigned char) *s++;
6865          Py_UCS4 ch;
6866          int count;
6867          const char *message;
6868  
6869  #define WRITE_CHAR(ch)                                                        \
6870              do {                                                              \
6871                  if (ch <= writer.maxchar) {                                   \
6872                      assert(writer.pos < writer.size);                         \
6873                      PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6874                  }                                                             \
6875                  else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6876                      goto onError;                                             \
6877                  }                                                             \
6878              } while(0)
6879  
6880          /* Non-escape characters are interpreted as Unicode ordinals */
6881          if (c != '\\' || (s >= end && !consumed)) {
6882              WRITE_CHAR(c);
6883              continue;
6884          }
6885  
6886          Py_ssize_t startinpos = s - starts - 1;
6887          /* \ - Escapes */
6888          if (s >= end) {
6889              assert(consumed);
6890              // Set message to silent compiler warning.
6891              // Actually it is never used.
6892              message = "\\ at end of string";
6893              goto incomplete;
6894          }
6895  
6896          c = (unsigned char) *s++;
6897          if (c == 'u') {
6898              count = 4;
6899              message = "truncated \\uXXXX escape";
6900          }
6901          else if (c == 'U') {
6902              count = 8;
6903              message = "truncated \\UXXXXXXXX escape";
6904          }
6905          else {
6906              assert(writer.pos < writer.size);
6907              PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6908              WRITE_CHAR(c);
6909              continue;
6910          }
6911  
6912          /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6913          for (ch = 0; count; ++s, --count) {
6914              if (s >= end) {
6915                  goto incomplete;
6916              }
6917              c = (unsigned char)*s;
6918              ch <<= 4;
6919              if (c >= '0' && c <= '9') {
6920                  ch += c - '0';
6921              }
6922              else if (c >= 'a' && c <= 'f') {
6923                  ch += c - ('a' - 10);
6924              }
6925              else if (c >= 'A' && c <= 'F') {
6926                  ch += c - ('A' - 10);
6927              }
6928              else {
6929                  goto error;
6930              }
6931          }
6932          if (ch > MAX_UNICODE) {
6933              message = "\\Uxxxxxxxx out of range";
6934              goto error;
6935          }
6936          WRITE_CHAR(ch);
6937          continue;
6938  
6939        incomplete:
6940          if (consumed) {
6941              *consumed = startinpos;
6942              break;
6943          }
6944        error:;
6945          Py_ssize_t endinpos = s-starts;
6946          writer.min_length = end - s + writer.pos;
6947          if (unicode_decode_call_errorhandler_writer(
6948                  errors, &errorHandler,
6949                  "rawunicodeescape", message,
6950                  &starts, &end, &startinpos, &endinpos, &exc, &s,
6951                  &writer)) {
6952              goto onError;
6953          }
6954          assert(end - s <= writer.size - writer.pos);
6955  
6956  #undef WRITE_CHAR
6957      }
6958      Py_XDECREF(errorHandler);
6959      Py_XDECREF(exc);
6960      return _PyUnicodeWriter_Finish(&writer);
6961  
6962    onError:
6963      _PyUnicodeWriter_Dealloc(&writer);
6964      Py_XDECREF(errorHandler);
6965      Py_XDECREF(exc);
6966      return NULL;
6967  }
6968  
6969  PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6970  PyUnicode_DecodeRawUnicodeEscape(const char *s,
6971                                   Py_ssize_t size,
6972                                   const char *errors)
6973  {
6974      return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6975  }
6976  
6977  
6978  PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6979  PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6980  {
6981      PyObject *repr;
6982      char *p;
6983      Py_ssize_t expandsize, pos;
6984      int kind;
6985      const void *data;
6986      Py_ssize_t len;
6987  
6988      if (!PyUnicode_Check(unicode)) {
6989          PyErr_BadArgument();
6990          return NULL;
6991      }
6992      if (PyUnicode_READY(unicode) == -1) {
6993          return NULL;
6994      }
6995      kind = PyUnicode_KIND(unicode);
6996      data = PyUnicode_DATA(unicode);
6997      len = PyUnicode_GET_LENGTH(unicode);
6998      if (kind == PyUnicode_1BYTE_KIND) {
6999          return PyBytes_FromStringAndSize(data, len);
7000      }
7001  
7002      /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7003         bytes, and 1 byte characters 4. */
7004      expandsize = kind * 2 + 2;
7005  
7006      if (len > PY_SSIZE_T_MAX / expandsize) {
7007          return PyErr_NoMemory();
7008      }
7009      repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
7010      if (repr == NULL) {
7011          return NULL;
7012      }
7013      if (len == 0) {
7014          return repr;
7015      }
7016  
7017      p = PyBytes_AS_STRING(repr);
7018      for (pos = 0; pos < len; pos++) {
7019          Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7020  
7021          /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7022          if (ch < 0x100) {
7023              *p++ = (char) ch;
7024          }
7025          /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7026          else if (ch < 0x10000) {
7027              *p++ = '\\';
7028              *p++ = 'u';
7029              *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7030              *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7031              *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7032              *p++ = Py_hexdigits[ch & 15];
7033          }
7034          /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7035          else {
7036              assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7037              *p++ = '\\';
7038              *p++ = 'U';
7039              *p++ = '0';
7040              *p++ = '0';
7041              *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7042              *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7043              *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7044              *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7045              *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7046              *p++ = Py_hexdigits[ch & 15];
7047          }
7048      }
7049  
7050      assert(p > PyBytes_AS_STRING(repr));
7051      if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7052          return NULL;
7053      }
7054      return repr;
7055  }
7056  
7057  PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)7058  PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7059                                   Py_ssize_t size)
7060  {
7061      PyObject *result;
7062      PyObject *tmp = PyUnicode_FromWideChar(s, size);
7063      if (tmp == NULL)
7064          return NULL;
7065      result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7066      Py_DECREF(tmp);
7067      return result;
7068  }
7069  
7070  /* --- Latin-1 Codec ------------------------------------------------------ */
7071  
7072  PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)7073  PyUnicode_DecodeLatin1(const char *s,
7074                         Py_ssize_t size,
7075                         const char *errors)
7076  {
7077      /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7078      return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7079  }
7080  
7081  /* create or adjust a UnicodeEncodeError */
7082  static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7083  make_encode_exception(PyObject **exceptionObject,
7084                        const char *encoding,
7085                        PyObject *unicode,
7086                        Py_ssize_t startpos, Py_ssize_t endpos,
7087                        const char *reason)
7088  {
7089      if (*exceptionObject == NULL) {
7090          *exceptionObject = PyObject_CallFunction(
7091              PyExc_UnicodeEncodeError, "sOnns",
7092              encoding, unicode, startpos, endpos, reason);
7093      }
7094      else {
7095          if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7096              goto onError;
7097          if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7098              goto onError;
7099          if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7100              goto onError;
7101          return;
7102        onError:
7103          Py_CLEAR(*exceptionObject);
7104      }
7105  }
7106  
7107  /* raises a UnicodeEncodeError */
7108  static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7109  raise_encode_exception(PyObject **exceptionObject,
7110                         const char *encoding,
7111                         PyObject *unicode,
7112                         Py_ssize_t startpos, Py_ssize_t endpos,
7113                         const char *reason)
7114  {
7115      make_encode_exception(exceptionObject,
7116                            encoding, unicode, startpos, endpos, reason);
7117      if (*exceptionObject != NULL)
7118          PyCodec_StrictErrors(*exceptionObject);
7119  }
7120  
7121  /* error handling callback helper:
7122     build arguments, call the callback and check the arguments,
7123     put the result into newpos and return the replacement string, which
7124     has to be freed by the caller */
7125  static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)7126  unicode_encode_call_errorhandler(const char *errors,
7127                                   PyObject **errorHandler,
7128                                   const char *encoding, const char *reason,
7129                                   PyObject *unicode, PyObject **exceptionObject,
7130                                   Py_ssize_t startpos, Py_ssize_t endpos,
7131                                   Py_ssize_t *newpos)
7132  {
7133      static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7134      Py_ssize_t len;
7135      PyObject *restuple;
7136      PyObject *resunicode;
7137  
7138      if (*errorHandler == NULL) {
7139          *errorHandler = PyCodec_LookupError(errors);
7140          if (*errorHandler == NULL)
7141              return NULL;
7142      }
7143  
7144      if (PyUnicode_READY(unicode) == -1)
7145          return NULL;
7146      len = PyUnicode_GET_LENGTH(unicode);
7147  
7148      make_encode_exception(exceptionObject,
7149                            encoding, unicode, startpos, endpos, reason);
7150      if (*exceptionObject == NULL)
7151          return NULL;
7152  
7153      restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154      if (restuple == NULL)
7155          return NULL;
7156      if (!PyTuple_Check(restuple)) {
7157          PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158          Py_DECREF(restuple);
7159          return NULL;
7160      }
7161      if (!PyArg_ParseTuple(restuple, argparse,
7162                            &resunicode, newpos)) {
7163          Py_DECREF(restuple);
7164          return NULL;
7165      }
7166      if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167          PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168          Py_DECREF(restuple);
7169          return NULL;
7170      }
7171      if (*newpos<0)
7172          *newpos = len + *newpos;
7173      if (*newpos<0 || *newpos>len) {
7174          PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175          Py_DECREF(restuple);
7176          return NULL;
7177      }
7178      Py_INCREF(resunicode);
7179      Py_DECREF(restuple);
7180      return resunicode;
7181  }
7182  
7183  static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)7184  unicode_encode_ucs1(PyObject *unicode,
7185                      const char *errors,
7186                      const Py_UCS4 limit)
7187  {
7188      /* input state */
7189      Py_ssize_t pos=0, size;
7190      int kind;
7191      const void *data;
7192      /* pointer into the output */
7193      char *str;
7194      const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7195      const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7196      PyObject *error_handler_obj = NULL;
7197      PyObject *exc = NULL;
7198      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7199      PyObject *rep = NULL;
7200      /* output object */
7201      _PyBytesWriter writer;
7202  
7203      if (PyUnicode_READY(unicode) == -1)
7204          return NULL;
7205      size = PyUnicode_GET_LENGTH(unicode);
7206      kind = PyUnicode_KIND(unicode);
7207      data = PyUnicode_DATA(unicode);
7208      /* allocate enough for a simple encoding without
7209         replacements, if we need more, we'll resize */
7210      if (size == 0)
7211          return PyBytes_FromStringAndSize(NULL, 0);
7212  
7213      _PyBytesWriter_Init(&writer);
7214      str = _PyBytesWriter_Alloc(&writer, size);
7215      if (str == NULL)
7216          return NULL;
7217  
7218      while (pos < size) {
7219          Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7220  
7221          /* can we encode this? */
7222          if (ch < limit) {
7223              /* no overflow check, because we know that the space is enough */
7224              *str++ = (char)ch;
7225              ++pos;
7226          }
7227          else {
7228              Py_ssize_t newpos, i;
7229              /* startpos for collecting unencodable chars */
7230              Py_ssize_t collstart = pos;
7231              Py_ssize_t collend = collstart + 1;
7232              /* find all unecodable characters */
7233  
7234              while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7235                  ++collend;
7236  
7237              /* Only overallocate the buffer if it's not the last write */
7238              writer.overallocate = (collend < size);
7239  
7240              /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7241              if (error_handler == _Py_ERROR_UNKNOWN)
7242                  error_handler = _Py_GetErrorHandler(errors);
7243  
7244              switch (error_handler) {
7245              case _Py_ERROR_STRICT:
7246                  raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7247                  goto onError;
7248  
7249              case _Py_ERROR_REPLACE:
7250                  memset(str, '?', collend - collstart);
7251                  str += (collend - collstart);
7252                  /* fall through */
7253              case _Py_ERROR_IGNORE:
7254                  pos = collend;
7255                  break;
7256  
7257              case _Py_ERROR_BACKSLASHREPLACE:
7258                  /* subtract preallocated bytes */
7259                  writer.min_size -= (collend - collstart);
7260                  str = backslashreplace(&writer, str,
7261                                         unicode, collstart, collend);
7262                  if (str == NULL)
7263                      goto onError;
7264                  pos = collend;
7265                  break;
7266  
7267              case _Py_ERROR_XMLCHARREFREPLACE:
7268                  /* subtract preallocated bytes */
7269                  writer.min_size -= (collend - collstart);
7270                  str = xmlcharrefreplace(&writer, str,
7271                                          unicode, collstart, collend);
7272                  if (str == NULL)
7273                      goto onError;
7274                  pos = collend;
7275                  break;
7276  
7277              case _Py_ERROR_SURROGATEESCAPE:
7278                  for (i = collstart; i < collend; ++i) {
7279                      ch = PyUnicode_READ(kind, data, i);
7280                      if (ch < 0xdc80 || 0xdcff < ch) {
7281                          /* Not a UTF-8b surrogate */
7282                          break;
7283                      }
7284                      *str++ = (char)(ch - 0xdc00);
7285                      ++pos;
7286                  }
7287                  if (i >= collend)
7288                      break;
7289                  collstart = pos;
7290                  assert(collstart != collend);
7291                  /* fall through */
7292  
7293              default:
7294                  rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7295                                                         encoding, reason, unicode, &exc,
7296                                                         collstart, collend, &newpos);
7297                  if (rep == NULL)
7298                      goto onError;
7299  
7300                  /* subtract preallocated bytes */
7301                  writer.min_size -= newpos - collstart;
7302  
7303                  if (PyBytes_Check(rep)) {
7304                      /* Directly copy bytes result to output. */
7305                      str = _PyBytesWriter_WriteBytes(&writer, str,
7306                                                      PyBytes_AS_STRING(rep),
7307                                                      PyBytes_GET_SIZE(rep));
7308                  }
7309                  else {
7310                      assert(PyUnicode_Check(rep));
7311  
7312                      if (PyUnicode_READY(rep) < 0)
7313                          goto onError;
7314  
7315                      if (limit == 256 ?
7316                          PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317                          !PyUnicode_IS_ASCII(rep))
7318                      {
7319                          /* Not all characters are smaller than limit */
7320                          raise_encode_exception(&exc, encoding, unicode,
7321                                                 collstart, collend, reason);
7322                          goto onError;
7323                      }
7324                      assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325                      str = _PyBytesWriter_WriteBytes(&writer, str,
7326                                                      PyUnicode_DATA(rep),
7327                                                      PyUnicode_GET_LENGTH(rep));
7328                  }
7329                  if (str == NULL)
7330                      goto onError;
7331  
7332                  pos = newpos;
7333                  Py_CLEAR(rep);
7334              }
7335  
7336              /* If overallocation was disabled, ensure that it was the last
7337                 write. Otherwise, we missed an optimization */
7338              assert(writer.overallocate || pos == size);
7339          }
7340      }
7341  
7342      Py_XDECREF(error_handler_obj);
7343      Py_XDECREF(exc);
7344      return _PyBytesWriter_Finish(&writer, str);
7345  
7346    onError:
7347      Py_XDECREF(rep);
7348      _PyBytesWriter_Dealloc(&writer);
7349      Py_XDECREF(error_handler_obj);
7350      Py_XDECREF(exc);
7351      return NULL;
7352  }
7353  
7354  /* Deprecated */
7355  PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7356  PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7357                         Py_ssize_t size,
7358                         const char *errors)
7359  {
7360      PyObject *result;
7361      PyObject *unicode = PyUnicode_FromWideChar(p, size);
7362      if (unicode == NULL)
7363          return NULL;
7364      result = unicode_encode_ucs1(unicode, errors, 256);
7365      Py_DECREF(unicode);
7366      return result;
7367  }
7368  
7369  PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7370  _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7371  {
7372      if (!PyUnicode_Check(unicode)) {
7373          PyErr_BadArgument();
7374          return NULL;
7375      }
7376      if (PyUnicode_READY(unicode) == -1)
7377          return NULL;
7378      /* Fast path: if it is a one-byte string, construct
7379         bytes object directly. */
7380      if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7381          return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7382                                           PyUnicode_GET_LENGTH(unicode));
7383      /* Non-Latin-1 characters present. Defer to above function to
7384         raise the exception. */
7385      return unicode_encode_ucs1(unicode, errors, 256);
7386  }
7387  
7388  PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7389  PyUnicode_AsLatin1String(PyObject *unicode)
7390  {
7391      return _PyUnicode_AsLatin1String(unicode, NULL);
7392  }
7393  
7394  /* --- 7-bit ASCII Codec -------------------------------------------------- */
7395  
7396  PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7397  PyUnicode_DecodeASCII(const char *s,
7398                        Py_ssize_t size,
7399                        const char *errors)
7400  {
7401      const char *starts = s;
7402      const char *e = s + size;
7403      PyObject *error_handler_obj = NULL;
7404      PyObject *exc = NULL;
7405      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7406  
7407      if (size == 0)
7408          _Py_RETURN_UNICODE_EMPTY();
7409  
7410      /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7411      if (size == 1 && (unsigned char)s[0] < 128) {
7412          return get_latin1_char((unsigned char)s[0]);
7413      }
7414  
7415      // Shortcut for simple case
7416      PyObject *u = PyUnicode_New(size, 127);
7417      if (u == NULL) {
7418          return NULL;
7419      }
7420      Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7421      if (outpos == size) {
7422          return u;
7423      }
7424  
7425      _PyUnicodeWriter writer;
7426      _PyUnicodeWriter_InitWithBuffer(&writer, u);
7427      writer.pos = outpos;
7428  
7429      s += outpos;
7430      int kind = writer.kind;
7431      void *data = writer.data;
7432      Py_ssize_t startinpos, endinpos;
7433  
7434      while (s < e) {
7435          unsigned char c = (unsigned char)*s;
7436          if (c < 128) {
7437              PyUnicode_WRITE(kind, data, writer.pos, c);
7438              writer.pos++;
7439              ++s;
7440              continue;
7441          }
7442  
7443          /* byte outsize range 0x00..0x7f: call the error handler */
7444  
7445          if (error_handler == _Py_ERROR_UNKNOWN)
7446              error_handler = _Py_GetErrorHandler(errors);
7447  
7448          switch (error_handler)
7449          {
7450          case _Py_ERROR_REPLACE:
7451          case _Py_ERROR_SURROGATEESCAPE:
7452              /* Fast-path: the error handler only writes one character,
7453                 but we may switch to UCS2 at the first write */
7454              if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7455                  goto onError;
7456              kind = writer.kind;
7457              data = writer.data;
7458  
7459              if (error_handler == _Py_ERROR_REPLACE)
7460                  PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7461              else
7462                  PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7463              writer.pos++;
7464              ++s;
7465              break;
7466  
7467          case _Py_ERROR_IGNORE:
7468              ++s;
7469              break;
7470  
7471          default:
7472              startinpos = s-starts;
7473              endinpos = startinpos + 1;
7474              if (unicode_decode_call_errorhandler_writer(
7475                      errors, &error_handler_obj,
7476                      "ascii", "ordinal not in range(128)",
7477                      &starts, &e, &startinpos, &endinpos, &exc, &s,
7478                      &writer))
7479                  goto onError;
7480              kind = writer.kind;
7481              data = writer.data;
7482          }
7483      }
7484      Py_XDECREF(error_handler_obj);
7485      Py_XDECREF(exc);
7486      return _PyUnicodeWriter_Finish(&writer);
7487  
7488    onError:
7489      _PyUnicodeWriter_Dealloc(&writer);
7490      Py_XDECREF(error_handler_obj);
7491      Py_XDECREF(exc);
7492      return NULL;
7493  }
7494  
7495  /* Deprecated */
7496  PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7497  PyUnicode_EncodeASCII(const Py_UNICODE *p,
7498                        Py_ssize_t size,
7499                        const char *errors)
7500  {
7501      PyObject *result;
7502      PyObject *unicode = PyUnicode_FromWideChar(p, size);
7503      if (unicode == NULL)
7504          return NULL;
7505      result = unicode_encode_ucs1(unicode, errors, 128);
7506      Py_DECREF(unicode);
7507      return result;
7508  }
7509  
7510  PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7511  _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7512  {
7513      if (!PyUnicode_Check(unicode)) {
7514          PyErr_BadArgument();
7515          return NULL;
7516      }
7517      if (PyUnicode_READY(unicode) == -1)
7518          return NULL;
7519      /* Fast path: if it is an ASCII-only string, construct bytes object
7520         directly. Else defer to above function to raise the exception. */
7521      if (PyUnicode_IS_ASCII(unicode))
7522          return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7523                                           PyUnicode_GET_LENGTH(unicode));
7524      return unicode_encode_ucs1(unicode, errors, 128);
7525  }
7526  
7527  PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7528  PyUnicode_AsASCIIString(PyObject *unicode)
7529  {
7530      return _PyUnicode_AsASCIIString(unicode, NULL);
7531  }
7532  
7533  #ifdef MS_WINDOWS
7534  
7535  /* --- MBCS codecs for Windows -------------------------------------------- */
7536  
7537  #if SIZEOF_INT < SIZEOF_SIZE_T
7538  #define NEED_RETRY
7539  #endif
7540  
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7545  #define DECODING_CHUNK_SIZE (INT_MAX/4)
7546  
7547  #ifndef WC_ERR_INVALID_CHARS
7548  #  define WC_ERR_INVALID_CHARS 0x0080
7549  #endif
7550  
7551  static const char*
code_page_name(UINT code_page,PyObject ** obj)7552  code_page_name(UINT code_page, PyObject **obj)
7553  {
7554      *obj = NULL;
7555      if (code_page == CP_ACP)
7556          return "mbcs";
7557      if (code_page == CP_UTF7)
7558          return "CP_UTF7";
7559      if (code_page == CP_UTF8)
7560          return "CP_UTF8";
7561  
7562      *obj = PyBytes_FromFormat("cp%u", code_page);
7563      if (*obj == NULL)
7564          return NULL;
7565      return PyBytes_AS_STRING(*obj);
7566  }
7567  
7568  static DWORD
decode_code_page_flags(UINT code_page)7569  decode_code_page_flags(UINT code_page)
7570  {
7571      if (code_page == CP_UTF7) {
7572          /* The CP_UTF7 decoder only supports flags=0 */
7573          return 0;
7574      }
7575      else
7576          return MB_ERR_INVALID_CHARS;
7577  }
7578  
7579  /*
7580   * Decode a byte string from a Windows code page into unicode object in strict
7581   * mode.
7582   *
7583   * Returns consumed size if succeed, returns -2 on decode error, or raise an
7584   * OSError and returns -1 on other error.
7585   */
7586  static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7587  decode_code_page_strict(UINT code_page,
7588                          wchar_t **buf,
7589                          Py_ssize_t *bufsize,
7590                          const char *in,
7591                          int insize)
7592  {
7593      DWORD flags = MB_ERR_INVALID_CHARS;
7594      wchar_t *out;
7595      DWORD outsize;
7596  
7597      /* First get the size of the result */
7598      assert(insize > 0);
7599      while ((outsize = MultiByteToWideChar(code_page, flags,
7600                                            in, insize, NULL, 0)) <= 0)
7601      {
7602          if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7603              goto error;
7604          }
7605          /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7606          flags = 0;
7607      }
7608  
7609      /* Extend a wchar_t* buffer */
7610      Py_ssize_t n = *bufsize;   /* Get the current length */
7611      if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7612          return -1;
7613      }
7614      out = *buf + n;
7615  
7616      /* Do the conversion */
7617      outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7618      if (outsize <= 0)
7619          goto error;
7620      return insize;
7621  
7622  error:
7623      if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7624          return -2;
7625      PyErr_SetFromWindowsErr(0);
7626      return -1;
7627  }
7628  
7629  /*
7630   * Decode a byte string from a code page into unicode object with an error
7631   * handler.
7632   *
7633   * Returns consumed size if succeed, or raise an OSError or
7634   * UnicodeDecodeError exception and returns -1 on error.
7635   */
7636  static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7637  decode_code_page_errors(UINT code_page,
7638                          wchar_t **buf,
7639                          Py_ssize_t *bufsize,
7640                          const char *in, const int size,
7641                          const char *errors, int final)
7642  {
7643      const char *startin = in;
7644      const char *endin = in + size;
7645      DWORD flags = MB_ERR_INVALID_CHARS;
7646      /* Ideally, we should get reason from FormatMessage. This is the Windows
7647         2000 English version of the message. */
7648      const char *reason = "No mapping for the Unicode character exists "
7649                           "in the target code page.";
7650      /* each step cannot decode more than 1 character, but a character can be
7651         represented as a surrogate pair */
7652      wchar_t buffer[2], *out;
7653      int insize;
7654      Py_ssize_t outsize;
7655      PyObject *errorHandler = NULL;
7656      PyObject *exc = NULL;
7657      PyObject *encoding_obj = NULL;
7658      const char *encoding;
7659      DWORD err;
7660      int ret = -1;
7661  
7662      assert(size > 0);
7663  
7664      encoding = code_page_name(code_page, &encoding_obj);
7665      if (encoding == NULL)
7666          return -1;
7667  
7668      if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7669          /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7670             UnicodeDecodeError. */
7671          make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7672          if (exc != NULL) {
7673              PyCodec_StrictErrors(exc);
7674              Py_CLEAR(exc);
7675          }
7676          goto error;
7677      }
7678  
7679      /* Extend a wchar_t* buffer */
7680      Py_ssize_t n = *bufsize;   /* Get the current length */
7681      if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7682          PyErr_NoMemory();
7683          goto error;
7684      }
7685      if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7686          goto error;
7687      }
7688      out = *buf + n;
7689  
7690      /* Decode the byte string character per character */
7691      while (in < endin)
7692      {
7693          /* Decode a character */
7694          insize = 1;
7695          do
7696          {
7697              outsize = MultiByteToWideChar(code_page, flags,
7698                                            in, insize,
7699                                            buffer, Py_ARRAY_LENGTH(buffer));
7700              if (outsize > 0)
7701                  break;
7702              err = GetLastError();
7703              if (err == ERROR_INVALID_FLAGS && flags) {
7704                  /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7705                  flags = 0;
7706                  continue;
7707              }
7708              if (err != ERROR_NO_UNICODE_TRANSLATION
7709                  && err != ERROR_INSUFFICIENT_BUFFER)
7710              {
7711                  PyErr_SetFromWindowsErr(0);
7712                  goto error;
7713              }
7714              insize++;
7715          }
7716          /* 4=maximum length of a UTF-8 sequence */
7717          while (insize <= 4 && (in + insize) <= endin);
7718  
7719          if (outsize <= 0) {
7720              Py_ssize_t startinpos, endinpos, outpos;
7721  
7722              /* last character in partial decode? */
7723              if (in + insize >= endin && !final)
7724                  break;
7725  
7726              startinpos = in - startin;
7727              endinpos = startinpos + 1;
7728              outpos = out - *buf;
7729              if (unicode_decode_call_errorhandler_wchar(
7730                      errors, &errorHandler,
7731                      encoding, reason,
7732                      &startin, &endin, &startinpos, &endinpos, &exc, &in,
7733                      buf, bufsize, &outpos))
7734              {
7735                  goto error;
7736              }
7737              out = *buf + outpos;
7738          }
7739          else {
7740              in += insize;
7741              memcpy(out, buffer, outsize * sizeof(wchar_t));
7742              out += outsize;
7743          }
7744      }
7745  
7746      /* Shrink the buffer */
7747      assert(out - *buf <= *bufsize);
7748      *bufsize = out - *buf;
7749      /* (in - startin) <= size and size is an int */
7750      ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7751  
7752  error:
7753      Py_XDECREF(encoding_obj);
7754      Py_XDECREF(errorHandler);
7755      Py_XDECREF(exc);
7756      return ret;
7757  }
7758  
7759  static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7760  decode_code_page_stateful(int code_page,
7761                            const char *s, Py_ssize_t size,
7762                            const char *errors, Py_ssize_t *consumed)
7763  {
7764      wchar_t *buf = NULL;
7765      Py_ssize_t bufsize = 0;
7766      int chunk_size, final, converted, done;
7767  
7768      if (code_page < 0) {
7769          PyErr_SetString(PyExc_ValueError, "invalid code page number");
7770          return NULL;
7771      }
7772      if (size < 0) {
7773          PyErr_BadInternalCall();
7774          return NULL;
7775      }
7776  
7777      if (consumed)
7778          *consumed = 0;
7779  
7780      do
7781      {
7782  #ifdef NEED_RETRY
7783          if (size > DECODING_CHUNK_SIZE) {
7784              chunk_size = DECODING_CHUNK_SIZE;
7785              final = 0;
7786              done = 0;
7787          }
7788          else
7789  #endif
7790          {
7791              chunk_size = (int)size;
7792              final = (consumed == NULL);
7793              done = 1;
7794          }
7795  
7796          if (chunk_size == 0 && done) {
7797              if (buf != NULL)
7798                  break;
7799              _Py_RETURN_UNICODE_EMPTY();
7800          }
7801  
7802          converted = decode_code_page_strict(code_page, &buf, &bufsize,
7803                                              s, chunk_size);
7804          if (converted == -2)
7805              converted = decode_code_page_errors(code_page, &buf, &bufsize,
7806                                                  s, chunk_size,
7807                                                  errors, final);
7808          assert(converted != 0 || done);
7809  
7810          if (converted < 0) {
7811              PyMem_Free(buf);
7812              return NULL;
7813          }
7814  
7815          if (consumed)
7816              *consumed += converted;
7817  
7818          s += converted;
7819          size -= converted;
7820      } while (!done);
7821  
7822      PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7823      PyMem_Free(buf);
7824      return v;
7825  }
7826  
7827  PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7828  PyUnicode_DecodeCodePageStateful(int code_page,
7829                                   const char *s,
7830                                   Py_ssize_t size,
7831                                   const char *errors,
7832                                   Py_ssize_t *consumed)
7833  {
7834      return decode_code_page_stateful(code_page, s, size, errors, consumed);
7835  }
7836  
7837  PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7838  PyUnicode_DecodeMBCSStateful(const char *s,
7839                               Py_ssize_t size,
7840                               const char *errors,
7841                               Py_ssize_t *consumed)
7842  {
7843      return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7844  }
7845  
7846  PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7847  PyUnicode_DecodeMBCS(const char *s,
7848                       Py_ssize_t size,
7849                       const char *errors)
7850  {
7851      return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7852  }
7853  
7854  static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7855  encode_code_page_flags(UINT code_page, const char *errors)
7856  {
7857      if (code_page == CP_UTF8) {
7858          return WC_ERR_INVALID_CHARS;
7859      }
7860      else if (code_page == CP_UTF7) {
7861          /* CP_UTF7 only supports flags=0 */
7862          return 0;
7863      }
7864      else {
7865          if (errors != NULL && strcmp(errors, "replace") == 0)
7866              return 0;
7867          else
7868              return WC_NO_BEST_FIT_CHARS;
7869      }
7870  }
7871  
7872  /*
7873   * Encode a Unicode string to a Windows code page into a byte string in strict
7874   * mode.
7875   *
7876   * Returns consumed characters if succeed, returns -2 on encode error, or raise
7877   * an OSError and returns -1 on other error.
7878   */
7879  static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7880  encode_code_page_strict(UINT code_page, PyObject **outbytes,
7881                          PyObject *unicode, Py_ssize_t offset, int len,
7882                          const char* errors)
7883  {
7884      BOOL usedDefaultChar = FALSE;
7885      BOOL *pusedDefaultChar = &usedDefaultChar;
7886      int outsize;
7887      wchar_t *p;
7888      Py_ssize_t size;
7889      const DWORD flags = encode_code_page_flags(code_page, NULL);
7890      char *out;
7891      /* Create a substring so that we can get the UTF-16 representation
7892         of just the slice under consideration. */
7893      PyObject *substring;
7894      int ret = -1;
7895  
7896      assert(len > 0);
7897  
7898      if (code_page != CP_UTF8 && code_page != CP_UTF7)
7899          pusedDefaultChar = &usedDefaultChar;
7900      else
7901          pusedDefaultChar = NULL;
7902  
7903      substring = PyUnicode_Substring(unicode, offset, offset+len);
7904      if (substring == NULL)
7905          return -1;
7906  #if USE_UNICODE_WCHAR_CACHE
7907  _Py_COMP_DIAG_PUSH
7908  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7909      p = PyUnicode_AsUnicodeAndSize(substring, &size);
7910      if (p == NULL) {
7911          Py_DECREF(substring);
7912          return -1;
7913      }
7914  _Py_COMP_DIAG_POP
7915  #else /* USE_UNICODE_WCHAR_CACHE */
7916      p = PyUnicode_AsWideCharString(substring, &size);
7917      Py_CLEAR(substring);
7918      if (p == NULL) {
7919          return -1;
7920      }
7921  #endif /* USE_UNICODE_WCHAR_CACHE */
7922      assert(size <= INT_MAX);
7923  
7924      /* First get the size of the result */
7925      outsize = WideCharToMultiByte(code_page, flags,
7926                                    p, (int)size,
7927                                    NULL, 0,
7928                                    NULL, pusedDefaultChar);
7929      if (outsize <= 0)
7930          goto error;
7931      /* If we used a default char, then we failed! */
7932      if (pusedDefaultChar && *pusedDefaultChar) {
7933          ret = -2;
7934          goto done;
7935      }
7936  
7937      if (*outbytes == NULL) {
7938          /* Create string object */
7939          *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7940          if (*outbytes == NULL) {
7941              goto done;
7942          }
7943          out = PyBytes_AS_STRING(*outbytes);
7944      }
7945      else {
7946          /* Extend string object */
7947          const Py_ssize_t n = PyBytes_Size(*outbytes);
7948          if (outsize > PY_SSIZE_T_MAX - n) {
7949              PyErr_NoMemory();
7950              goto done;
7951          }
7952          if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7953              goto done;
7954          }
7955          out = PyBytes_AS_STRING(*outbytes) + n;
7956      }
7957  
7958      /* Do the conversion */
7959      outsize = WideCharToMultiByte(code_page, flags,
7960                                    p, (int)size,
7961                                    out, outsize,
7962                                    NULL, pusedDefaultChar);
7963      if (outsize <= 0)
7964          goto error;
7965      if (pusedDefaultChar && *pusedDefaultChar) {
7966          ret = -2;
7967          goto done;
7968      }
7969      ret = 0;
7970  
7971  done:
7972  #if USE_UNICODE_WCHAR_CACHE
7973      Py_DECREF(substring);
7974  #else /* USE_UNICODE_WCHAR_CACHE */
7975      PyMem_Free(p);
7976  #endif /* USE_UNICODE_WCHAR_CACHE */
7977      return ret;
7978  
7979  error:
7980      if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7981          ret = -2;
7982          goto done;
7983      }
7984      PyErr_SetFromWindowsErr(0);
7985      goto done;
7986  }
7987  
7988  /*
7989   * Encode a Unicode string to a Windows code page into a byte string using an
7990   * error handler.
7991   *
7992   * Returns consumed characters if succeed, or raise an OSError and returns
7993   * -1 on other error.
7994   */
7995  static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7996  encode_code_page_errors(UINT code_page, PyObject **outbytes,
7997                          PyObject *unicode, Py_ssize_t unicode_offset,
7998                          Py_ssize_t insize, const char* errors)
7999  {
8000      const DWORD flags = encode_code_page_flags(code_page, errors);
8001      Py_ssize_t pos = unicode_offset;
8002      Py_ssize_t endin = unicode_offset + insize;
8003      /* Ideally, we should get reason from FormatMessage. This is the Windows
8004         2000 English version of the message. */
8005      const char *reason = "invalid character";
8006      /* 4=maximum length of a UTF-8 sequence */
8007      char buffer[4];
8008      BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8009      Py_ssize_t outsize;
8010      char *out;
8011      PyObject *errorHandler = NULL;
8012      PyObject *exc = NULL;
8013      PyObject *encoding_obj = NULL;
8014      const char *encoding;
8015      Py_ssize_t newpos, newoutsize;
8016      PyObject *rep;
8017      int ret = -1;
8018  
8019      assert(insize > 0);
8020  
8021      encoding = code_page_name(code_page, &encoding_obj);
8022      if (encoding == NULL)
8023          return -1;
8024  
8025      if (errors == NULL || strcmp(errors, "strict") == 0) {
8026          /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8027             then we raise a UnicodeEncodeError. */
8028          make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8029          if (exc != NULL) {
8030              PyCodec_StrictErrors(exc);
8031              Py_DECREF(exc);
8032          }
8033          Py_XDECREF(encoding_obj);
8034          return -1;
8035      }
8036  
8037      if (code_page != CP_UTF8 && code_page != CP_UTF7)
8038          pusedDefaultChar = &usedDefaultChar;
8039      else
8040          pusedDefaultChar = NULL;
8041  
8042      if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8043          PyErr_NoMemory();
8044          goto error;
8045      }
8046      outsize = insize * Py_ARRAY_LENGTH(buffer);
8047  
8048      if (*outbytes == NULL) {
8049          /* Create string object */
8050          *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8051          if (*outbytes == NULL)
8052              goto error;
8053          out = PyBytes_AS_STRING(*outbytes);
8054      }
8055      else {
8056          /* Extend string object */
8057          Py_ssize_t n = PyBytes_Size(*outbytes);
8058          if (n > PY_SSIZE_T_MAX - outsize) {
8059              PyErr_NoMemory();
8060              goto error;
8061          }
8062          if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8063              goto error;
8064          out = PyBytes_AS_STRING(*outbytes) + n;
8065      }
8066  
8067      /* Encode the string character per character */
8068      while (pos < endin)
8069      {
8070          Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8071          wchar_t chars[2];
8072          int charsize;
8073          if (ch < 0x10000) {
8074              chars[0] = (wchar_t)ch;
8075              charsize = 1;
8076          }
8077          else {
8078              chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8079              chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8080              charsize = 2;
8081          }
8082  
8083          outsize = WideCharToMultiByte(code_page, flags,
8084                                        chars, charsize,
8085                                        buffer, Py_ARRAY_LENGTH(buffer),
8086                                        NULL, pusedDefaultChar);
8087          if (outsize > 0) {
8088              if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8089              {
8090                  pos++;
8091                  memcpy(out, buffer, outsize);
8092                  out += outsize;
8093                  continue;
8094              }
8095          }
8096          else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8097              PyErr_SetFromWindowsErr(0);
8098              goto error;
8099          }
8100  
8101          rep = unicode_encode_call_errorhandler(
8102                    errors, &errorHandler, encoding, reason,
8103                    unicode, &exc,
8104                    pos, pos + 1, &newpos);
8105          if (rep == NULL)
8106              goto error;
8107          pos = newpos;
8108  
8109          if (PyBytes_Check(rep)) {
8110              outsize = PyBytes_GET_SIZE(rep);
8111              if (outsize != 1) {
8112                  Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8113                  newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8114                  if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8115                      Py_DECREF(rep);
8116                      goto error;
8117                  }
8118                  out = PyBytes_AS_STRING(*outbytes) + offset;
8119              }
8120              memcpy(out, PyBytes_AS_STRING(rep), outsize);
8121              out += outsize;
8122          }
8123          else {
8124              Py_ssize_t i;
8125              enum PyUnicode_Kind kind;
8126              const void *data;
8127  
8128              if (PyUnicode_READY(rep) == -1) {
8129                  Py_DECREF(rep);
8130                  goto error;
8131              }
8132  
8133              outsize = PyUnicode_GET_LENGTH(rep);
8134              if (outsize != 1) {
8135                  Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8136                  newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8137                  if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8138                      Py_DECREF(rep);
8139                      goto error;
8140                  }
8141                  out = PyBytes_AS_STRING(*outbytes) + offset;
8142              }
8143              kind = PyUnicode_KIND(rep);
8144              data = PyUnicode_DATA(rep);
8145              for (i=0; i < outsize; i++) {
8146                  Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8147                  if (ch > 127) {
8148                      raise_encode_exception(&exc,
8149                          encoding, unicode,
8150                          pos, pos + 1,
8151                          "unable to encode error handler result to ASCII");
8152                      Py_DECREF(rep);
8153                      goto error;
8154                  }
8155                  *out = (unsigned char)ch;
8156                  out++;
8157              }
8158          }
8159          Py_DECREF(rep);
8160      }
8161      /* write a NUL byte */
8162      *out = 0;
8163      outsize = out - PyBytes_AS_STRING(*outbytes);
8164      assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8165      if (_PyBytes_Resize(outbytes, outsize) < 0)
8166          goto error;
8167      ret = 0;
8168  
8169  error:
8170      Py_XDECREF(encoding_obj);
8171      Py_XDECREF(errorHandler);
8172      Py_XDECREF(exc);
8173      return ret;
8174  }
8175  
8176  static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8177  encode_code_page(int code_page,
8178                   PyObject *unicode,
8179                   const char *errors)
8180  {
8181      Py_ssize_t len;
8182      PyObject *outbytes = NULL;
8183      Py_ssize_t offset;
8184      int chunk_len, ret, done;
8185  
8186      if (!PyUnicode_Check(unicode)) {
8187          PyErr_BadArgument();
8188          return NULL;
8189      }
8190  
8191      if (PyUnicode_READY(unicode) == -1)
8192          return NULL;
8193      len = PyUnicode_GET_LENGTH(unicode);
8194  
8195      if (code_page < 0) {
8196          PyErr_SetString(PyExc_ValueError, "invalid code page number");
8197          return NULL;
8198      }
8199  
8200      if (len == 0)
8201          return PyBytes_FromStringAndSize(NULL, 0);
8202  
8203      offset = 0;
8204      do
8205      {
8206  #ifdef NEED_RETRY
8207          if (len > DECODING_CHUNK_SIZE) {
8208              chunk_len = DECODING_CHUNK_SIZE;
8209              done = 0;
8210          }
8211          else
8212  #endif
8213          {
8214              chunk_len = (int)len;
8215              done = 1;
8216          }
8217  
8218          ret = encode_code_page_strict(code_page, &outbytes,
8219                                        unicode, offset, chunk_len,
8220                                        errors);
8221          if (ret == -2)
8222              ret = encode_code_page_errors(code_page, &outbytes,
8223                                            unicode, offset,
8224                                            chunk_len, errors);
8225          if (ret < 0) {
8226              Py_XDECREF(outbytes);
8227              return NULL;
8228          }
8229  
8230          offset += chunk_len;
8231          len -= chunk_len;
8232      } while (!done);
8233  
8234      return outbytes;
8235  }
8236  
8237  PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)8238  PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8239                       Py_ssize_t size,
8240                       const char *errors)
8241  {
8242      PyObject *unicode, *res;
8243      unicode = PyUnicode_FromWideChar(p, size);
8244      if (unicode == NULL)
8245          return NULL;
8246      res = encode_code_page(CP_ACP, unicode, errors);
8247      Py_DECREF(unicode);
8248      return res;
8249  }
8250  
8251  PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8252  PyUnicode_EncodeCodePage(int code_page,
8253                           PyObject *unicode,
8254                           const char *errors)
8255  {
8256      return encode_code_page(code_page, unicode, errors);
8257  }
8258  
8259  PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8260  PyUnicode_AsMBCSString(PyObject *unicode)
8261  {
8262      return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8263  }
8264  
8265  #undef NEED_RETRY
8266  
8267  #endif /* MS_WINDOWS */
8268  
8269  /* --- Character Mapping Codec -------------------------------------------- */
8270  
8271  static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8272  charmap_decode_string(const char *s,
8273                        Py_ssize_t size,
8274                        PyObject *mapping,
8275                        const char *errors,
8276                        _PyUnicodeWriter *writer)
8277  {
8278      const char *starts = s;
8279      const char *e;
8280      Py_ssize_t startinpos, endinpos;
8281      PyObject *errorHandler = NULL, *exc = NULL;
8282      Py_ssize_t maplen;
8283      enum PyUnicode_Kind mapkind;
8284      const void *mapdata;
8285      Py_UCS4 x;
8286      unsigned char ch;
8287  
8288      if (PyUnicode_READY(mapping) == -1)
8289          return -1;
8290  
8291      maplen = PyUnicode_GET_LENGTH(mapping);
8292      mapdata = PyUnicode_DATA(mapping);
8293      mapkind = PyUnicode_KIND(mapping);
8294  
8295      e = s + size;
8296  
8297      if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8298          /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8299           * is disabled in encoding aliases, latin1 is preferred because
8300           * its implementation is faster. */
8301          const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8302          Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8303          Py_UCS4 maxchar = writer->maxchar;
8304  
8305          assert (writer->kind == PyUnicode_1BYTE_KIND);
8306          while (s < e) {
8307              ch = *s;
8308              x = mapdata_ucs1[ch];
8309              if (x > maxchar) {
8310                  if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8311                      goto onError;
8312                  maxchar = writer->maxchar;
8313                  outdata = (Py_UCS1 *)writer->data;
8314              }
8315              outdata[writer->pos] = x;
8316              writer->pos++;
8317              ++s;
8318          }
8319          return 0;
8320      }
8321  
8322      while (s < e) {
8323          if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8324              enum PyUnicode_Kind outkind = writer->kind;
8325              const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8326              if (outkind == PyUnicode_1BYTE_KIND) {
8327                  Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8328                  Py_UCS4 maxchar = writer->maxchar;
8329                  while (s < e) {
8330                      ch = *s;
8331                      x = mapdata_ucs2[ch];
8332                      if (x > maxchar)
8333                          goto Error;
8334                      outdata[writer->pos] = x;
8335                      writer->pos++;
8336                      ++s;
8337                  }
8338                  break;
8339              }
8340              else if (outkind == PyUnicode_2BYTE_KIND) {
8341                  Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8342                  while (s < e) {
8343                      ch = *s;
8344                      x = mapdata_ucs2[ch];
8345                      if (x == 0xFFFE)
8346                          goto Error;
8347                      outdata[writer->pos] = x;
8348                      writer->pos++;
8349                      ++s;
8350                  }
8351                  break;
8352              }
8353          }
8354          ch = *s;
8355  
8356          if (ch < maplen)
8357              x = PyUnicode_READ(mapkind, mapdata, ch);
8358          else
8359              x = 0xfffe; /* invalid value */
8360  Error:
8361          if (x == 0xfffe)
8362          {
8363              /* undefined mapping */
8364              startinpos = s-starts;
8365              endinpos = startinpos+1;
8366              if (unicode_decode_call_errorhandler_writer(
8367                      errors, &errorHandler,
8368                      "charmap", "character maps to <undefined>",
8369                      &starts, &e, &startinpos, &endinpos, &exc, &s,
8370                      writer)) {
8371                  goto onError;
8372              }
8373              continue;
8374          }
8375  
8376          if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8377              goto onError;
8378          ++s;
8379      }
8380      Py_XDECREF(errorHandler);
8381      Py_XDECREF(exc);
8382      return 0;
8383  
8384  onError:
8385      Py_XDECREF(errorHandler);
8386      Py_XDECREF(exc);
8387      return -1;
8388  }
8389  
8390  static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8391  charmap_decode_mapping(const char *s,
8392                         Py_ssize_t size,
8393                         PyObject *mapping,
8394                         const char *errors,
8395                         _PyUnicodeWriter *writer)
8396  {
8397      const char *starts = s;
8398      const char *e;
8399      Py_ssize_t startinpos, endinpos;
8400      PyObject *errorHandler = NULL, *exc = NULL;
8401      unsigned char ch;
8402      PyObject *key, *item = NULL;
8403  
8404      e = s + size;
8405  
8406      while (s < e) {
8407          ch = *s;
8408  
8409          /* Get mapping (char ordinal -> integer, Unicode char or None) */
8410          key = PyLong_FromLong((long)ch);
8411          if (key == NULL)
8412              goto onError;
8413  
8414          item = PyObject_GetItem(mapping, key);
8415          Py_DECREF(key);
8416          if (item == NULL) {
8417              if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8418                  /* No mapping found means: mapping is undefined. */
8419                  PyErr_Clear();
8420                  goto Undefined;
8421              } else
8422                  goto onError;
8423          }
8424  
8425          /* Apply mapping */
8426          if (item == Py_None)
8427              goto Undefined;
8428          if (PyLong_Check(item)) {
8429              long value = PyLong_AS_LONG(item);
8430              if (value == 0xFFFE)
8431                  goto Undefined;
8432              if (value < 0 || value > MAX_UNICODE) {
8433                  PyErr_Format(PyExc_TypeError,
8434                               "character mapping must be in range(0x%x)",
8435                               (unsigned long)MAX_UNICODE + 1);
8436                  goto onError;
8437              }
8438  
8439              if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8440                  goto onError;
8441          }
8442          else if (PyUnicode_Check(item)) {
8443              if (PyUnicode_READY(item) == -1)
8444                  goto onError;
8445              if (PyUnicode_GET_LENGTH(item) == 1) {
8446                  Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8447                  if (value == 0xFFFE)
8448                      goto Undefined;
8449                  if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8450                      goto onError;
8451              }
8452              else {
8453                  writer->overallocate = 1;
8454                  if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8455                      goto onError;
8456              }
8457          }
8458          else {
8459              /* wrong return value */
8460              PyErr_SetString(PyExc_TypeError,
8461                              "character mapping must return integer, None or str");
8462              goto onError;
8463          }
8464          Py_CLEAR(item);
8465          ++s;
8466          continue;
8467  
8468  Undefined:
8469          /* undefined mapping */
8470          Py_CLEAR(item);
8471          startinpos = s-starts;
8472          endinpos = startinpos+1;
8473          if (unicode_decode_call_errorhandler_writer(
8474                  errors, &errorHandler,
8475                  "charmap", "character maps to <undefined>",
8476                  &starts, &e, &startinpos, &endinpos, &exc, &s,
8477                  writer)) {
8478              goto onError;
8479          }
8480      }
8481      Py_XDECREF(errorHandler);
8482      Py_XDECREF(exc);
8483      return 0;
8484  
8485  onError:
8486      Py_XDECREF(item);
8487      Py_XDECREF(errorHandler);
8488      Py_XDECREF(exc);
8489      return -1;
8490  }
8491  
8492  PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8493  PyUnicode_DecodeCharmap(const char *s,
8494                          Py_ssize_t size,
8495                          PyObject *mapping,
8496                          const char *errors)
8497  {
8498      _PyUnicodeWriter writer;
8499  
8500      /* Default to Latin-1 */
8501      if (mapping == NULL)
8502          return PyUnicode_DecodeLatin1(s, size, errors);
8503  
8504      if (size == 0)
8505          _Py_RETURN_UNICODE_EMPTY();
8506      _PyUnicodeWriter_Init(&writer);
8507      writer.min_length = size;
8508      if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8509          goto onError;
8510  
8511      if (PyUnicode_CheckExact(mapping)) {
8512          if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8513              goto onError;
8514      }
8515      else {
8516          if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8517              goto onError;
8518      }
8519      return _PyUnicodeWriter_Finish(&writer);
8520  
8521    onError:
8522      _PyUnicodeWriter_Dealloc(&writer);
8523      return NULL;
8524  }
8525  
8526  /* Charmap encoding: the lookup table */
8527  
8528  struct encoding_map {
8529      PyObject_HEAD
8530      unsigned char level1[32];
8531      int count2, count3;
8532      unsigned char level23[1];
8533  };
8534  
8535  static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8536  encoding_map_size(PyObject *obj, PyObject* args)
8537  {
8538      struct encoding_map *map = (struct encoding_map*)obj;
8539      return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8540                             128*map->count3);
8541  }
8542  
8543  static PyMethodDef encoding_map_methods[] = {
8544      {"size", encoding_map_size, METH_NOARGS,
8545       PyDoc_STR("Return the size (in bytes) of this object") },
8546      { 0 }
8547  };
8548  
8549  static PyTypeObject EncodingMapType = {
8550      PyVarObject_HEAD_INIT(NULL, 0)
8551      "EncodingMap",          /*tp_name*/
8552      sizeof(struct encoding_map),   /*tp_basicsize*/
8553      0,                      /*tp_itemsize*/
8554      /* methods */
8555      0,                      /*tp_dealloc*/
8556      0,                      /*tp_vectorcall_offset*/
8557      0,                      /*tp_getattr*/
8558      0,                      /*tp_setattr*/
8559      0,                      /*tp_as_async*/
8560      0,                      /*tp_repr*/
8561      0,                      /*tp_as_number*/
8562      0,                      /*tp_as_sequence*/
8563      0,                      /*tp_as_mapping*/
8564      0,                      /*tp_hash*/
8565      0,                      /*tp_call*/
8566      0,                      /*tp_str*/
8567      0,                      /*tp_getattro*/
8568      0,                      /*tp_setattro*/
8569      0,                      /*tp_as_buffer*/
8570      Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8571      0,                      /*tp_doc*/
8572      0,                      /*tp_traverse*/
8573      0,                      /*tp_clear*/
8574      0,                      /*tp_richcompare*/
8575      0,                      /*tp_weaklistoffset*/
8576      0,                      /*tp_iter*/
8577      0,                      /*tp_iternext*/
8578      encoding_map_methods,   /*tp_methods*/
8579      0,                      /*tp_members*/
8580      0,                      /*tp_getset*/
8581      0,                      /*tp_base*/
8582      0,                      /*tp_dict*/
8583      0,                      /*tp_descr_get*/
8584      0,                      /*tp_descr_set*/
8585      0,                      /*tp_dictoffset*/
8586      0,                      /*tp_init*/
8587      0,                      /*tp_alloc*/
8588      0,                      /*tp_new*/
8589      0,                      /*tp_free*/
8590      0,                      /*tp_is_gc*/
8591  };
8592  
8593  PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8594  PyUnicode_BuildEncodingMap(PyObject* string)
8595  {
8596      PyObject *result;
8597      struct encoding_map *mresult;
8598      int i;
8599      int need_dict = 0;
8600      unsigned char level1[32];
8601      unsigned char level2[512];
8602      unsigned char *mlevel1, *mlevel2, *mlevel3;
8603      int count2 = 0, count3 = 0;
8604      int kind;
8605      const void *data;
8606      Py_ssize_t length;
8607      Py_UCS4 ch;
8608  
8609      if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8610          PyErr_BadArgument();
8611          return NULL;
8612      }
8613      kind = PyUnicode_KIND(string);
8614      data = PyUnicode_DATA(string);
8615      length = PyUnicode_GET_LENGTH(string);
8616      length = Py_MIN(length, 256);
8617      memset(level1, 0xFF, sizeof level1);
8618      memset(level2, 0xFF, sizeof level2);
8619  
8620      /* If there isn't a one-to-one mapping of NULL to \0,
8621         or if there are non-BMP characters, we need to use
8622         a mapping dictionary. */
8623      if (PyUnicode_READ(kind, data, 0) != 0)
8624          need_dict = 1;
8625      for (i = 1; i < length; i++) {
8626          int l1, l2;
8627          ch = PyUnicode_READ(kind, data, i);
8628          if (ch == 0 || ch > 0xFFFF) {
8629              need_dict = 1;
8630              break;
8631          }
8632          if (ch == 0xFFFE)
8633              /* unmapped character */
8634              continue;
8635          l1 = ch >> 11;
8636          l2 = ch >> 7;
8637          if (level1[l1] == 0xFF)
8638              level1[l1] = count2++;
8639          if (level2[l2] == 0xFF)
8640              level2[l2] = count3++;
8641      }
8642  
8643      if (count2 >= 0xFF || count3 >= 0xFF)
8644          need_dict = 1;
8645  
8646      if (need_dict) {
8647          PyObject *result = PyDict_New();
8648          PyObject *key, *value;
8649          if (!result)
8650              return NULL;
8651          for (i = 0; i < length; i++) {
8652              key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8653              value = PyLong_FromLong(i);
8654              if (!key || !value)
8655                  goto failed1;
8656              if (PyDict_SetItem(result, key, value) == -1)
8657                  goto failed1;
8658              Py_DECREF(key);
8659              Py_DECREF(value);
8660          }
8661          return result;
8662        failed1:
8663          Py_XDECREF(key);
8664          Py_XDECREF(value);
8665          Py_DECREF(result);
8666          return NULL;
8667      }
8668  
8669      /* Create a three-level trie */
8670      result = PyObject_Malloc(sizeof(struct encoding_map) +
8671                               16*count2 + 128*count3 - 1);
8672      if (!result) {
8673          return PyErr_NoMemory();
8674      }
8675  
8676      _PyObject_Init(result, &EncodingMapType);
8677      mresult = (struct encoding_map*)result;
8678      mresult->count2 = count2;
8679      mresult->count3 = count3;
8680      mlevel1 = mresult->level1;
8681      mlevel2 = mresult->level23;
8682      mlevel3 = mresult->level23 + 16*count2;
8683      memcpy(mlevel1, level1, 32);
8684      memset(mlevel2, 0xFF, 16*count2);
8685      memset(mlevel3, 0, 128*count3);
8686      count3 = 0;
8687      for (i = 1; i < length; i++) {
8688          int o1, o2, o3, i2, i3;
8689          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8690          if (ch == 0xFFFE)
8691              /* unmapped character */
8692              continue;
8693          o1 = ch>>11;
8694          o2 = (ch>>7) & 0xF;
8695          i2 = 16*mlevel1[o1] + o2;
8696          if (mlevel2[i2] == 0xFF)
8697              mlevel2[i2] = count3++;
8698          o3 = ch & 0x7F;
8699          i3 = 128*mlevel2[i2] + o3;
8700          mlevel3[i3] = i;
8701      }
8702      return result;
8703  }
8704  
8705  static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8706  encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8707  {
8708      struct encoding_map *map = (struct encoding_map*)mapping;
8709      int l1 = c>>11;
8710      int l2 = (c>>7) & 0xF;
8711      int l3 = c & 0x7F;
8712      int i;
8713  
8714      if (c > 0xFFFF)
8715          return -1;
8716      if (c == 0)
8717          return 0;
8718      /* level 1*/
8719      i = map->level1[l1];
8720      if (i == 0xFF) {
8721          return -1;
8722      }
8723      /* level 2*/
8724      i = map->level23[16*i+l2];
8725      if (i == 0xFF) {
8726          return -1;
8727      }
8728      /* level 3 */
8729      i = map->level23[16*map->count2 + 128*i + l3];
8730      if (i == 0) {
8731          return -1;
8732      }
8733      return i;
8734  }
8735  
8736  /* Lookup the character ch in the mapping. If the character
8737     can't be found, Py_None is returned (or NULL, if another
8738     error occurred). */
8739  static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8740  charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8741  {
8742      PyObject *w = PyLong_FromLong((long)c);
8743      PyObject *x;
8744  
8745      if (w == NULL)
8746          return NULL;
8747      x = PyObject_GetItem(mapping, w);
8748      Py_DECREF(w);
8749      if (x == NULL) {
8750          if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8751              /* No mapping found means: mapping is undefined. */
8752              PyErr_Clear();
8753              Py_RETURN_NONE;
8754          } else
8755              return NULL;
8756      }
8757      else if (x == Py_None)
8758          return x;
8759      else if (PyLong_Check(x)) {
8760          long value = PyLong_AS_LONG(x);
8761          if (value < 0 || value > 255) {
8762              PyErr_SetString(PyExc_TypeError,
8763                              "character mapping must be in range(256)");
8764              Py_DECREF(x);
8765              return NULL;
8766          }
8767          return x;
8768      }
8769      else if (PyBytes_Check(x))
8770          return x;
8771      else {
8772          /* wrong return value */
8773          PyErr_Format(PyExc_TypeError,
8774                       "character mapping must return integer, bytes or None, not %.400s",
8775                       Py_TYPE(x)->tp_name);
8776          Py_DECREF(x);
8777          return NULL;
8778      }
8779  }
8780  
8781  static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8782  charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8783  {
8784      Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8785      /* exponentially overallocate to minimize reallocations */
8786      if (requiredsize < 2*outsize)
8787          requiredsize = 2*outsize;
8788      if (_PyBytes_Resize(outobj, requiredsize))
8789          return -1;
8790      return 0;
8791  }
8792  
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION = 2   /* the lookup or the buffer resize failed */
} charmapencode_result;
8796  /* lookup the character, put the result in the output string and adjust
8797     various state variables. Resize the output bytes object if not enough
8798     space is available. Return a new reference to the object that
8799     was put in the output buffer, or Py_None, if the mapping was undefined
8800     (in which case no character was written) or NULL, if a
8801     reallocation error occurred. The caller must decref the result */
8802  static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8803  charmapencode_output(Py_UCS4 c, PyObject *mapping,
8804                       PyObject **outobj, Py_ssize_t *outpos)
8805  {
8806      PyObject *rep;
8807      char *outstart;
8808      Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8809  
8810      if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8811          int res = encoding_map_lookup(c, mapping);
8812          Py_ssize_t requiredsize = *outpos+1;
8813          if (res == -1)
8814              return enc_FAILED;
8815          if (outsize<requiredsize)
8816              if (charmapencode_resize(outobj, outpos, requiredsize))
8817                  return enc_EXCEPTION;
8818          outstart = PyBytes_AS_STRING(*outobj);
8819          outstart[(*outpos)++] = (char)res;
8820          return enc_SUCCESS;
8821      }
8822  
8823      rep = charmapencode_lookup(c, mapping);
8824      if (rep==NULL)
8825          return enc_EXCEPTION;
8826      else if (rep==Py_None) {
8827          Py_DECREF(rep);
8828          return enc_FAILED;
8829      } else {
8830          if (PyLong_Check(rep)) {
8831              Py_ssize_t requiredsize = *outpos+1;
8832              if (outsize<requiredsize)
8833                  if (charmapencode_resize(outobj, outpos, requiredsize)) {
8834                      Py_DECREF(rep);
8835                      return enc_EXCEPTION;
8836                  }
8837              outstart = PyBytes_AS_STRING(*outobj);
8838              outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8839          }
8840          else {
8841              const char *repchars = PyBytes_AS_STRING(rep);
8842              Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8843              Py_ssize_t requiredsize = *outpos+repsize;
8844              if (outsize<requiredsize)
8845                  if (charmapencode_resize(outobj, outpos, requiredsize)) {
8846                      Py_DECREF(rep);
8847                      return enc_EXCEPTION;
8848                  }
8849              outstart = PyBytes_AS_STRING(*outobj);
8850              memcpy(outstart + *outpos, repchars, repsize);
8851              *outpos += repsize;
8852          }
8853      }
8854      Py_DECREF(rep);
8855      return enc_SUCCESS;
8856  }
8857  
/* Handle an unencodable character found by _PyUnicode_EncodeCharmap().

   On entry *inpos is the index of the first unencodable character of
   `unicode`.  The function first extends that position into a run
   [collstartpos, collendpos) of consecutive characters that `mapping`
   cannot encode, then applies the error handler selected by `errors`
   (cached in *error_handler / *error_handler_obj across calls).

   Any replacement output is appended to the bytes object *res at *respos
   through charmapencode_output().  On success *inpos is set to the position
   at which the caller continues: collendpos for the replace, ignore and
   xmlcharrefreplace handlers, or the position returned by a custom handler.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: advance collendpos until a character
       that the mapping can encode (or the end of the string) is reached */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* trie mapping: -1 means "no entry" */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: Py_None means "no entry", NULL means error */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* emit one '?' per collected character; fail if the mapping
           cannot encode '?' either */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: one "&#N;" reference (N = decimal code
           point) per collected character, encoded through the mapping
           character by character */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other handler: call it and use what it returns (despite its
           name, repunicode may be a bytes or a str object) */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: encode the returned str through the
           mapping, one character at a time */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                /* the replacement itself is unencodable */
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
9005  
9006  PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)9007  _PyUnicode_EncodeCharmap(PyObject *unicode,
9008                           PyObject *mapping,
9009                           const char *errors)
9010  {
9011      /* output object */
9012      PyObject *res = NULL;
9013      /* current input position */
9014      Py_ssize_t inpos = 0;
9015      Py_ssize_t size;
9016      /* current output position */
9017      Py_ssize_t respos = 0;
9018      PyObject *error_handler_obj = NULL;
9019      PyObject *exc = NULL;
9020      _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9021      const void *data;
9022      int kind;
9023  
9024      if (PyUnicode_READY(unicode) == -1)
9025          return NULL;
9026      size = PyUnicode_GET_LENGTH(unicode);
9027      data = PyUnicode_DATA(unicode);
9028      kind = PyUnicode_KIND(unicode);
9029  
9030      /* Default to Latin-1 */
9031      if (mapping == NULL)
9032          return unicode_encode_ucs1(unicode, errors, 256);
9033  
9034      /* allocate enough for a simple encoding without
9035         replacements, if we need more, we'll resize */
9036      res = PyBytes_FromStringAndSize(NULL, size);
9037      if (res == NULL)
9038          goto onError;
9039      if (size == 0)
9040          return res;
9041  
9042      while (inpos<size) {
9043          Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9044          /* try to encode it */
9045          charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
9046          if (x==enc_EXCEPTION) /* error */
9047              goto onError;
9048          if (x==enc_FAILED) { /* unencodable character */
9049              if (charmap_encoding_error(unicode, &inpos, mapping,
9050                                         &exc,
9051                                         &error_handler, &error_handler_obj, errors,
9052                                         &res, &respos)) {
9053                  goto onError;
9054              }
9055          }
9056          else
9057              /* done with this character => adjust input position */
9058              ++inpos;
9059      }
9060  
9061      /* Resize if we allocated to much */
9062      if (respos<PyBytes_GET_SIZE(res))
9063          if (_PyBytes_Resize(&res, respos) < 0)
9064              goto onError;
9065  
9066      Py_XDECREF(exc);
9067      Py_XDECREF(error_handler_obj);
9068      return res;
9069  
9070    onError:
9071      Py_XDECREF(res);
9072      Py_XDECREF(exc);
9073      Py_XDECREF(error_handler_obj);
9074      return NULL;
9075  }
9076  
9077  /* Deprecated */
9078  PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9079  PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9080                          Py_ssize_t size,
9081                          PyObject *mapping,
9082                          const char *errors)
9083  {
9084      PyObject *result;
9085      PyObject *unicode = PyUnicode_FromWideChar(p, size);
9086      if (unicode == NULL)
9087          return NULL;
9088      result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9089      Py_DECREF(unicode);
9090      return result;
9091  }
9092  
9093  PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)9094  PyUnicode_AsCharmapString(PyObject *unicode,
9095                            PyObject *mapping)
9096  {
9097      if (!PyUnicode_Check(unicode) || mapping == NULL) {
9098          PyErr_BadArgument();
9099          return NULL;
9100      }
9101      return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9102  }
9103  
9104  /* create or adjust a UnicodeTranslateError */
9105  static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)9106  make_translate_exception(PyObject **exceptionObject,
9107                           PyObject *unicode,
9108                           Py_ssize_t startpos, Py_ssize_t endpos,
9109                           const char *reason)
9110  {
9111      if (*exceptionObject == NULL) {
9112          *exceptionObject = _PyUnicodeTranslateError_Create(
9113              unicode, startpos, endpos, reason);
9114      }
9115      else {
9116          if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9117              goto onError;
9118          if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9119              goto onError;
9120          if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9121              goto onError;
9122          return;
9123        onError:
9124          Py_CLEAR(*exceptionObject);
9125      }
9126  }
9127  
9128  /* error handling callback helper:
9129     build arguments, call the callback and check the arguments,
9130     put the result into newpos and return the replacement string, which
9131     has to be freed by the caller */
9132  static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)9133  unicode_translate_call_errorhandler(const char *errors,
9134                                      PyObject **errorHandler,
9135                                      const char *reason,
9136                                      PyObject *unicode, PyObject **exceptionObject,
9137                                      Py_ssize_t startpos, Py_ssize_t endpos,
9138                                      Py_ssize_t *newpos)
9139  {
9140      static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9141  
9142      Py_ssize_t i_newpos;
9143      PyObject *restuple;
9144      PyObject *resunicode;
9145  
9146      if (*errorHandler == NULL) {
9147          *errorHandler = PyCodec_LookupError(errors);
9148          if (*errorHandler == NULL)
9149              return NULL;
9150      }
9151  
9152      make_translate_exception(exceptionObject,
9153                               unicode, startpos, endpos, reason);
9154      if (*exceptionObject == NULL)
9155          return NULL;
9156  
9157      restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9158      if (restuple == NULL)
9159          return NULL;
9160      if (!PyTuple_Check(restuple)) {
9161          PyErr_SetString(PyExc_TypeError, &argparse[3]);
9162          Py_DECREF(restuple);
9163          return NULL;
9164      }
9165      if (!PyArg_ParseTuple(restuple, argparse,
9166                            &resunicode, &i_newpos)) {
9167          Py_DECREF(restuple);
9168          return NULL;
9169      }
9170      if (i_newpos<0)
9171          *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9172      else
9173          *newpos = i_newpos;
9174      if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9175          PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9176          Py_DECREF(restuple);
9177          return NULL;
9178      }
9179      Py_INCREF(resunicode);
9180      Py_DECREF(restuple);
9181      return resunicode;
9182  }
9183  
9184  /* Lookup the character ch in the mapping and put the result in result,
9185     which must be decrefed by the caller.
9186     Return 0 on success, -1 on error */
9187  static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)9188  charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9189  {
9190      PyObject *w = PyLong_FromLong((long)c);
9191      PyObject *x;
9192  
9193      if (w == NULL)
9194          return -1;
9195      x = PyObject_GetItem(mapping, w);
9196      Py_DECREF(w);
9197      if (x == NULL) {
9198          if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9199              /* No mapping found means: use 1:1 mapping. */
9200              PyErr_Clear();
9201              *result = NULL;
9202              return 0;
9203          } else
9204              return -1;
9205      }
9206      else if (x == Py_None) {
9207          *result = x;
9208          return 0;
9209      }
9210      else if (PyLong_Check(x)) {
9211          long value = PyLong_AS_LONG(x);
9212          if (value < 0 || value > MAX_UNICODE) {
9213              PyErr_Format(PyExc_ValueError,
9214                           "character mapping must be in range(0x%x)",
9215                           MAX_UNICODE+1);
9216              Py_DECREF(x);
9217              return -1;
9218          }
9219          *result = x;
9220          return 0;
9221      }
9222      else if (PyUnicode_Check(x)) {
9223          *result = x;
9224          return 0;
9225      }
9226      else {
9227          /* wrong return value */
9228          PyErr_SetString(PyExc_TypeError,
9229                          "character mapping must return integer, None or str");
9230          Py_DECREF(x);
9231          return -1;
9232      }
9233  }
9234  
9235  /* lookup the character, write the result into the writer.
9236     Return 1 if the result was written into the writer, return 0 if the mapping
9237     was undefined, raise an exception return -1 on error. */
9238  static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9239  charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9240                          _PyUnicodeWriter *writer)
9241  {
9242      PyObject *item;
9243  
9244      if (charmaptranslate_lookup(ch, mapping, &item))
9245          return -1;
9246  
9247      if (item == NULL) {
9248          /* not found => default to 1:1 mapping */
9249          if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9250              return -1;
9251          }
9252          return 1;
9253      }
9254  
9255      if (item == Py_None) {
9256          Py_DECREF(item);
9257          return 0;
9258      }
9259  
9260      if (PyLong_Check(item)) {
9261          long ch = (Py_UCS4)PyLong_AS_LONG(item);
9262          /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9263             used it */
9264          if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9265              Py_DECREF(item);
9266              return -1;
9267          }
9268          Py_DECREF(item);
9269          return 1;
9270      }
9271  
9272      if (!PyUnicode_Check(item)) {
9273          Py_DECREF(item);
9274          return -1;
9275      }
9276  
9277      if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9278          Py_DECREF(item);
9279          return -1;
9280      }
9281  
9282      Py_DECREF(item);
9283      return 1;
9284  }
9285  
9286  static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9287  unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9288                                Py_UCS1 *translate)
9289  {
9290      PyObject *item = NULL;
9291      int ret = 0;
9292  
9293      if (charmaptranslate_lookup(ch, mapping, &item)) {
9294          return -1;
9295      }
9296  
9297      if (item == Py_None) {
9298          /* deletion */
9299          translate[ch] = 0xfe;
9300      }
9301      else if (item == NULL) {
9302          /* not found => default to 1:1 mapping */
9303          translate[ch] = ch;
9304          return 1;
9305      }
9306      else if (PyLong_Check(item)) {
9307          long replace = PyLong_AS_LONG(item);
9308          /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9309             used it */
9310          if (127 < replace) {
9311              /* invalid character or character outside ASCII:
9312                 skip the fast translate */
9313              goto exit;
9314          }
9315          translate[ch] = (Py_UCS1)replace;
9316      }
9317      else if (PyUnicode_Check(item)) {
9318          Py_UCS4 replace;
9319  
9320          if (PyUnicode_READY(item) == -1) {
9321              Py_DECREF(item);
9322              return -1;
9323          }
9324          if (PyUnicode_GET_LENGTH(item) != 1)
9325              goto exit;
9326  
9327          replace = PyUnicode_READ_CHAR(item, 0);
9328          if (replace > 127)
9329              goto exit;
9330          translate[ch] = (Py_UCS1)replace;
9331      }
9332      else {
9333          /* not None, NULL, long or unicode */
9334          goto exit;
9335      }
9336      ret = 1;
9337  
9338    exit:
9339      Py_DECREF(item);
9340      return ret;
9341  }
9342  
9343  /* Fast path for ascii => ascii translation. Return 1 if the whole string
9344     was translated into writer, return 0 if the input string was partially
9345     translated into writer, raise an exception and return -1 on error. */
9346  static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9347  unicode_fast_translate(PyObject *input, PyObject *mapping,
9348                         _PyUnicodeWriter *writer, int ignore,
9349                         Py_ssize_t *input_pos)
9350  {
9351      Py_UCS1 ascii_table[128], ch, ch2;
9352      Py_ssize_t len;
9353      const Py_UCS1 *in, *end;
9354      Py_UCS1 *out;
9355      int res = 0;
9356  
9357      len = PyUnicode_GET_LENGTH(input);
9358  
9359      memset(ascii_table, 0xff, 128);
9360  
9361      in = PyUnicode_1BYTE_DATA(input);
9362      end = in + len;
9363  
9364      assert(PyUnicode_IS_ASCII(writer->buffer));
9365      assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9366      out = PyUnicode_1BYTE_DATA(writer->buffer);
9367  
9368      for (; in < end; in++) {
9369          ch = *in;
9370          ch2 = ascii_table[ch];
9371          if (ch2 == 0xff) {
9372              int translate = unicode_fast_translate_lookup(mapping, ch,
9373                                                            ascii_table);
9374              if (translate < 0)
9375                  return -1;
9376              if (translate == 0)
9377                  goto exit;
9378              ch2 = ascii_table[ch];
9379          }
9380          if (ch2 == 0xfe) {
9381              if (ignore)
9382                  continue;
9383              goto exit;
9384          }
9385          assert(ch2 < 128);
9386          *out = ch2;
9387          out++;
9388      }
9389      res = 1;
9390  
9391  exit:
9392      writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9393      *input_pos = in - PyUnicode_1BYTE_DATA(input);
9394      return res;
9395  }
9396  
9397  static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9398  _PyUnicode_TranslateCharmap(PyObject *input,
9399                              PyObject *mapping,
9400                              const char *errors)
9401  {
9402      /* input object */
9403      const void *data;
9404      Py_ssize_t size, i;
9405      int kind;
9406      /* output buffer */
9407      _PyUnicodeWriter writer;
9408      /* error handler */
9409      const char *reason = "character maps to <undefined>";
9410      PyObject *errorHandler = NULL;
9411      PyObject *exc = NULL;
9412      int ignore;
9413      int res;
9414  
9415      if (mapping == NULL) {
9416          PyErr_BadArgument();
9417          return NULL;
9418      }
9419  
9420      if (PyUnicode_READY(input) == -1)
9421          return NULL;
9422      data = PyUnicode_DATA(input);
9423      kind = PyUnicode_KIND(input);
9424      size = PyUnicode_GET_LENGTH(input);
9425  
9426      if (size == 0)
9427          return PyUnicode_FromObject(input);
9428  
9429      /* allocate enough for a simple 1:1 translation without
9430         replacements, if we need more, we'll resize */
9431      _PyUnicodeWriter_Init(&writer);
9432      if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9433          goto onError;
9434  
9435      ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9436  
9437      if (PyUnicode_READY(input) == -1)
9438          return NULL;
9439      if (PyUnicode_IS_ASCII(input)) {
9440          res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9441          if (res < 0) {
9442              _PyUnicodeWriter_Dealloc(&writer);
9443              return NULL;
9444          }
9445          if (res == 1)
9446              return _PyUnicodeWriter_Finish(&writer);
9447      }
9448      else {
9449          i = 0;
9450      }
9451  
9452      while (i<size) {
9453          /* try to encode it */
9454          int translate;
9455          PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9456          Py_ssize_t newpos;
9457          /* startpos for collecting untranslatable chars */
9458          Py_ssize_t collstart;
9459          Py_ssize_t collend;
9460          Py_UCS4 ch;
9461  
9462          ch = PyUnicode_READ(kind, data, i);
9463          translate = charmaptranslate_output(ch, mapping, &writer);
9464          if (translate < 0)
9465              goto onError;
9466  
9467          if (translate != 0) {
9468              /* it worked => adjust input pointer */
9469              ++i;
9470              continue;
9471          }
9472  
9473          /* untranslatable character */
9474          collstart = i;
9475          collend = i+1;
9476  
9477          /* find all untranslatable characters */
9478          while (collend < size) {
9479              PyObject *x;
9480              ch = PyUnicode_READ(kind, data, collend);
9481              if (charmaptranslate_lookup(ch, mapping, &x))
9482                  goto onError;
9483              Py_XDECREF(x);
9484              if (x != Py_None)
9485                  break;
9486              ++collend;
9487          }
9488  
9489          if (ignore) {
9490              i = collend;
9491          }
9492          else {
9493              repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9494                                                               reason, input, &exc,
9495                                                               collstart, collend, &newpos);
9496              if (repunicode == NULL)
9497                  goto onError;
9498              if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9499                  Py_DECREF(repunicode);
9500                  goto onError;
9501              }
9502              Py_DECREF(repunicode);
9503              i = newpos;
9504          }
9505      }
9506      Py_XDECREF(exc);
9507      Py_XDECREF(errorHandler);
9508      return _PyUnicodeWriter_Finish(&writer);
9509  
9510    onError:
9511      _PyUnicodeWriter_Dealloc(&writer);
9512      Py_XDECREF(exc);
9513      Py_XDECREF(errorHandler);
9514      return NULL;
9515  }
9516  
9517  /* Deprecated. Use PyUnicode_Translate instead. */
9518  PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9519  PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9520                             Py_ssize_t size,
9521                             PyObject *mapping,
9522                             const char *errors)
9523  {
9524      PyObject *result;
9525      PyObject *unicode = PyUnicode_FromWideChar(p, size);
9526      if (!unicode)
9527          return NULL;
9528      result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9529      Py_DECREF(unicode);
9530      return result;
9531  }
9532  
9533  PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9534  PyUnicode_Translate(PyObject *str,
9535                      PyObject *mapping,
9536                      const char *errors)
9537  {
9538      if (ensure_unicode(str) < 0)
9539          return NULL;
9540      return _PyUnicode_TranslateCharmap(str, mapping, errors);
9541  }
9542  
9543  PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9544  _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9545  {
9546      if (!PyUnicode_Check(unicode)) {
9547          PyErr_BadInternalCall();
9548          return NULL;
9549      }
9550      if (PyUnicode_READY(unicode) == -1)
9551          return NULL;
9552      if (PyUnicode_IS_ASCII(unicode)) {
9553          /* If the string is already ASCII, just return the same string */
9554          Py_INCREF(unicode);
9555          return unicode;
9556      }
9557  
9558      Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9559      PyObject *result = PyUnicode_New(len, 127);
9560      if (result == NULL) {
9561          return NULL;
9562      }
9563  
9564      Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9565      int kind = PyUnicode_KIND(unicode);
9566      const void *data = PyUnicode_DATA(unicode);
9567      Py_ssize_t i;
9568      for (i = 0; i < len; ++i) {
9569          Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9570          if (ch < 127) {
9571              out[i] = ch;
9572          }
9573          else if (Py_UNICODE_ISSPACE(ch)) {
9574              out[i] = ' ';
9575          }
9576          else {
9577              int decimal = Py_UNICODE_TODECIMAL(ch);
9578              if (decimal < 0) {
9579                  out[i] = '?';
9580                  out[i+1] = '\0';
9581                  _PyUnicode_LENGTH(result) = i + 1;
9582                  break;
9583              }
9584              out[i] = '0' + decimal;
9585          }
9586      }
9587  
9588      assert(_PyUnicode_CheckConsistency(result, 1));
9589      return result;
9590  }
9591  
9592  PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9593  PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9594                                    Py_ssize_t length)
9595  {
9596      PyObject *decimal;
9597      Py_ssize_t i;
9598      Py_UCS4 maxchar;
9599      enum PyUnicode_Kind kind;
9600      const void *data;
9601  
9602      maxchar = 127;
9603      for (i = 0; i < length; i++) {
9604          Py_UCS4 ch = s[i];
9605          if (ch > 127) {
9606              int decimal = Py_UNICODE_TODECIMAL(ch);
9607              if (decimal >= 0)
9608                  ch = '0' + decimal;
9609              maxchar = Py_MAX(maxchar, ch);
9610          }
9611      }
9612  
9613      /* Copy to a new string */
9614      decimal = PyUnicode_New(length, maxchar);
9615      if (decimal == NULL)
9616          return decimal;
9617      kind = PyUnicode_KIND(decimal);
9618      data = PyUnicode_DATA(decimal);
9619      /* Iterate over code points */
9620      for (i = 0; i < length; i++) {
9621          Py_UCS4 ch = s[i];
9622          if (ch > 127) {
9623              int decimal = Py_UNICODE_TODECIMAL(ch);
9624              if (decimal >= 0)
9625                  ch = '0' + decimal;
9626          }
9627          PyUnicode_WRITE(kind, data, i, ch);
9628      }
9629      return unicode_result(decimal);
9630  }
9631  /* --- Decimal Encoder ---------------------------------------------------- */
9632  
9633  int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9634  PyUnicode_EncodeDecimal(Py_UNICODE *s,
9635                          Py_ssize_t length,
9636                          char *output,
9637                          const char *errors)
9638  {
9639      PyObject *unicode;
9640      Py_ssize_t i;
9641      enum PyUnicode_Kind kind;
9642      const void *data;
9643  
9644      if (output == NULL) {
9645          PyErr_BadArgument();
9646          return -1;
9647      }
9648  
9649      unicode = PyUnicode_FromWideChar(s, length);
9650      if (unicode == NULL)
9651          return -1;
9652  
9653      kind = PyUnicode_KIND(unicode);
9654      data = PyUnicode_DATA(unicode);
9655  
9656      for (i=0; i < length; ) {
9657          PyObject *exc;
9658          Py_UCS4 ch;
9659          int decimal;
9660          Py_ssize_t startpos;
9661  
9662          ch = PyUnicode_READ(kind, data, i);
9663  
9664          if (Py_UNICODE_ISSPACE(ch)) {
9665              *output++ = ' ';
9666              i++;
9667              continue;
9668          }
9669          decimal = Py_UNICODE_TODECIMAL(ch);
9670          if (decimal >= 0) {
9671              *output++ = '0' + decimal;
9672              i++;
9673              continue;
9674          }
9675          if (0 < ch && ch < 256) {
9676              *output++ = (char)ch;
9677              i++;
9678              continue;
9679          }
9680  
9681          startpos = i;
9682          exc = NULL;
9683          raise_encode_exception(&exc, "decimal", unicode,
9684                                 startpos, startpos+1,
9685                                 "invalid decimal Unicode string");
9686          Py_XDECREF(exc);
9687          Py_DECREF(unicode);
9688          return -1;
9689      }
9690      /* 0-terminate the output string */
9691      *output++ = '\0';
9692      Py_DECREF(unicode);
9693      return 0;
9694  }
9695  
9696  /* --- Helpers ------------------------------------------------------------ */
9697  
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is clamped to len;
   - a negative end is counted from len and clamped to 0;
   - a negative start is counted from len and clamped to 0.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so none of them may have side effects.

   The body is wrapped in do { ... } while (0) so that an invocation
   followed by ';' forms exactly one statement and can safely be used as
   the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9712  
9713  static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9714  any_find_slice(PyObject* s1, PyObject* s2,
9715                 Py_ssize_t start,
9716                 Py_ssize_t end,
9717                 int direction)
9718  {
9719      int kind1, kind2;
9720      const void *buf1, *buf2;
9721      Py_ssize_t len1, len2, result;
9722  
9723      kind1 = PyUnicode_KIND(s1);
9724      kind2 = PyUnicode_KIND(s2);
9725      if (kind1 < kind2)
9726          return -1;
9727  
9728      len1 = PyUnicode_GET_LENGTH(s1);
9729      len2 = PyUnicode_GET_LENGTH(s2);
9730      ADJUST_INDICES(start, end, len1);
9731      if (end - start < len2)
9732          return -1;
9733  
9734      buf1 = PyUnicode_DATA(s1);
9735      buf2 = PyUnicode_DATA(s2);
9736      if (len2 == 1) {
9737          Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9738          result = findchar((const char *)buf1 + kind1*start,
9739                            kind1, end - start, ch, direction);
9740          if (result == -1)
9741              return -1;
9742          else
9743              return start + result;
9744      }
9745  
9746      if (kind2 != kind1) {
9747          buf2 = unicode_askind(kind2, buf2, len2, kind1);
9748          if (!buf2)
9749              return -2;
9750      }
9751  
9752      if (direction > 0) {
9753          switch (kind1) {
9754          case PyUnicode_1BYTE_KIND:
9755              if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9756                  result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9757              else
9758                  result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9759              break;
9760          case PyUnicode_2BYTE_KIND:
9761              result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9762              break;
9763          case PyUnicode_4BYTE_KIND:
9764              result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9765              break;
9766          default:
9767              Py_UNREACHABLE();
9768          }
9769      }
9770      else {
9771          switch (kind1) {
9772          case PyUnicode_1BYTE_KIND:
9773              if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9774                  result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9775              else
9776                  result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9777              break;
9778          case PyUnicode_2BYTE_KIND:
9779              result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9780              break;
9781          case PyUnicode_4BYTE_KIND:
9782              result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9783              break;
9784          default:
9785              Py_UNREACHABLE();
9786          }
9787      }
9788  
9789      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9790      if (kind2 != kind1)
9791          PyMem_Free((void *)buf2);
9792  
9793      return result;
9794  }
9795  
9796  /* _PyUnicode_InsertThousandsGrouping() helper functions */
9797  #include "stringlib/localeutil.h"
9798  
9799  /**
9800   * InsertThousandsGrouping:
9801   * @writer: Unicode writer.
9802   * @n_buffer: Number of characters in @buffer.
9803   * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9804   * @d_pos: Start of digits string.
9805   * @n_digits: The number of digits in the string, in which we want
9806   *            to put the grouping chars.
9807   * @min_width: The minimum width of the digits in the output string.
9808   *             Output will be zero-padded on the left to fill.
9809   * @grouping: see definition in localeconv().
9810   * @thousands_sep: see definition in localeconv().
9811   *
9812   * There are 2 modes: counting and filling. If @writer is NULL,
9813   *  we are in counting mode, else filling mode.
9814   * If counting, the required buffer size is returned.
9815   * If filling, we know the buffer will be large enough, so we don't
9816   *  need to pass in the buffer size.
9817   * Inserts thousand grouping characters (as defined by grouping and
9818   *  thousands_sep) into @writer.
9819   *
9820   * Return value: -1 on error, number of characters otherwise.
9821   **/
9822  Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9823  _PyUnicode_InsertThousandsGrouping(
9824      _PyUnicodeWriter *writer,
9825      Py_ssize_t n_buffer,
9826      PyObject *digits,
9827      Py_ssize_t d_pos,
9828      Py_ssize_t n_digits,
9829      Py_ssize_t min_width,
9830      const char *grouping,
9831      PyObject *thousands_sep,
9832      Py_UCS4 *maxchar)
9833  {
9834      min_width = Py_MAX(0, min_width);
9835      if (writer) {
9836          assert(digits != NULL);
9837          assert(maxchar == NULL);
9838      }
9839      else {
9840          assert(digits == NULL);
9841          assert(maxchar != NULL);
9842      }
9843      assert(0 <= d_pos);
9844      assert(0 <= n_digits);
9845      assert(grouping != NULL);
9846  
9847      if (digits != NULL) {
9848          if (PyUnicode_READY(digits) == -1) {
9849              return -1;
9850          }
9851      }
9852      if (PyUnicode_READY(thousands_sep) == -1) {
9853          return -1;
9854      }
9855  
9856      Py_ssize_t count = 0;
9857      Py_ssize_t n_zeros;
9858      int loop_broken = 0;
9859      int use_separator = 0; /* First time through, don't append the
9860                                separator. They only go between
9861                                groups. */
9862      Py_ssize_t buffer_pos;
9863      Py_ssize_t digits_pos;
9864      Py_ssize_t len;
9865      Py_ssize_t n_chars;
9866      Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9867                                          be looked at */
9868      /* A generator that returns all of the grouping widths, until it
9869         returns 0. */
9870      GroupGenerator groupgen;
9871      GroupGenerator_init(&groupgen, grouping);
9872      const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9873  
9874      /* if digits are not grouped, thousands separator
9875         should be an empty string */
9876      assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9877  
9878      digits_pos = d_pos + n_digits;
9879      if (writer) {
9880          buffer_pos = writer->pos + n_buffer;
9881          assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9882          assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9883      }
9884      else {
9885          buffer_pos = n_buffer;
9886      }
9887  
9888      if (!writer) {
9889          *maxchar = 127;
9890      }
9891  
9892      while ((len = GroupGenerator_next(&groupgen)) > 0) {
9893          len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9894          n_zeros = Py_MAX(0, len - remaining);
9895          n_chars = Py_MAX(0, Py_MIN(remaining, len));
9896  
9897          /* Use n_zero zero's and n_chars chars */
9898  
9899          /* Count only, don't do anything. */
9900          count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9901  
9902          /* Copy into the writer. */
9903          InsertThousandsGrouping_fill(writer, &buffer_pos,
9904                                       digits, &digits_pos,
9905                                       n_chars, n_zeros,
9906                                       use_separator ? thousands_sep : NULL,
9907                                       thousands_sep_len, maxchar);
9908  
9909          /* Use a separator next time. */
9910          use_separator = 1;
9911  
9912          remaining -= n_chars;
9913          min_width -= len;
9914  
9915          if (remaining <= 0 && min_width <= 0) {
9916              loop_broken = 1;
9917              break;
9918          }
9919          min_width -= thousands_sep_len;
9920      }
9921      if (!loop_broken) {
9922          /* We left the loop without using a break statement. */
9923  
9924          len = Py_MAX(Py_MAX(remaining, min_width), 1);
9925          n_zeros = Py_MAX(0, len - remaining);
9926          n_chars = Py_MAX(0, Py_MIN(remaining, len));
9927  
9928          /* Use n_zero zero's and n_chars chars */
9929          count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9930  
9931          /* Copy into the writer. */
9932          InsertThousandsGrouping_fill(writer, &buffer_pos,
9933                                       digits, &digits_pos,
9934                                       n_chars, n_zeros,
9935                                       use_separator ? thousands_sep : NULL,
9936                                       thousands_sep_len, maxchar);
9937      }
9938      return count;
9939  }
9940  
9941  
9942  Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9943  PyUnicode_Count(PyObject *str,
9944                  PyObject *substr,
9945                  Py_ssize_t start,
9946                  Py_ssize_t end)
9947  {
9948      Py_ssize_t result;
9949      int kind1, kind2;
9950      const void *buf1 = NULL, *buf2 = NULL;
9951      Py_ssize_t len1, len2;
9952  
9953      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9954          return -1;
9955  
9956      kind1 = PyUnicode_KIND(str);
9957      kind2 = PyUnicode_KIND(substr);
9958      if (kind1 < kind2)
9959          return 0;
9960  
9961      len1 = PyUnicode_GET_LENGTH(str);
9962      len2 = PyUnicode_GET_LENGTH(substr);
9963      ADJUST_INDICES(start, end, len1);
9964      if (end - start < len2)
9965          return 0;
9966  
9967      buf1 = PyUnicode_DATA(str);
9968      buf2 = PyUnicode_DATA(substr);
9969      if (kind2 != kind1) {
9970          buf2 = unicode_askind(kind2, buf2, len2, kind1);
9971          if (!buf2)
9972              goto onError;
9973      }
9974  
9975      switch (kind1) {
9976      case PyUnicode_1BYTE_KIND:
9977          if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9978              result = asciilib_count(
9979                  ((const Py_UCS1*)buf1) + start, end - start,
9980                  buf2, len2, PY_SSIZE_T_MAX
9981                  );
9982          else
9983              result = ucs1lib_count(
9984                  ((const Py_UCS1*)buf1) + start, end - start,
9985                  buf2, len2, PY_SSIZE_T_MAX
9986                  );
9987          break;
9988      case PyUnicode_2BYTE_KIND:
9989          result = ucs2lib_count(
9990              ((const Py_UCS2*)buf1) + start, end - start,
9991              buf2, len2, PY_SSIZE_T_MAX
9992              );
9993          break;
9994      case PyUnicode_4BYTE_KIND:
9995          result = ucs4lib_count(
9996              ((const Py_UCS4*)buf1) + start, end - start,
9997              buf2, len2, PY_SSIZE_T_MAX
9998              );
9999          break;
10000      default:
10001          Py_UNREACHABLE();
10002      }
10003  
10004      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10005      if (kind2 != kind1)
10006          PyMem_Free((void *)buf2);
10007  
10008      return result;
10009    onError:
10010      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10011      if (kind2 != kind1)
10012          PyMem_Free((void *)buf2);
10013      return -1;
10014  }
10015  
10016  Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10017  PyUnicode_Find(PyObject *str,
10018                 PyObject *substr,
10019                 Py_ssize_t start,
10020                 Py_ssize_t end,
10021                 int direction)
10022  {
10023      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10024          return -2;
10025  
10026      return any_find_slice(str, substr, start, end, direction);
10027  }
10028  
10029  Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)10030  PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10031                     Py_ssize_t start, Py_ssize_t end,
10032                     int direction)
10033  {
10034      int kind;
10035      Py_ssize_t len, result;
10036      if (PyUnicode_READY(str) == -1)
10037          return -2;
10038      len = PyUnicode_GET_LENGTH(str);
10039      ADJUST_INDICES(start, end, len);
10040      if (end - start < 1)
10041          return -1;
10042      kind = PyUnicode_KIND(str);
10043      result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10044                        kind, end-start, ch, direction);
10045      if (result == -1)
10046          return -1;
10047      else
10048          return start + result;
10049  }
10050  
10051  static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)10052  tailmatch(PyObject *self,
10053            PyObject *substring,
10054            Py_ssize_t start,
10055            Py_ssize_t end,
10056            int direction)
10057  {
10058      int kind_self;
10059      int kind_sub;
10060      const void *data_self;
10061      const void *data_sub;
10062      Py_ssize_t offset;
10063      Py_ssize_t i;
10064      Py_ssize_t end_sub;
10065  
10066      if (PyUnicode_READY(self) == -1 ||
10067          PyUnicode_READY(substring) == -1)
10068          return -1;
10069  
10070      ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10071      end -= PyUnicode_GET_LENGTH(substring);
10072      if (end < start)
10073          return 0;
10074  
10075      if (PyUnicode_GET_LENGTH(substring) == 0)
10076          return 1;
10077  
10078      kind_self = PyUnicode_KIND(self);
10079      data_self = PyUnicode_DATA(self);
10080      kind_sub = PyUnicode_KIND(substring);
10081      data_sub = PyUnicode_DATA(substring);
10082      end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10083  
10084      if (direction > 0)
10085          offset = end;
10086      else
10087          offset = start;
10088  
10089      if (PyUnicode_READ(kind_self, data_self, offset) ==
10090          PyUnicode_READ(kind_sub, data_sub, 0) &&
10091          PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10092          PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10093          /* If both are of the same kind, memcmp is sufficient */
10094          if (kind_self == kind_sub) {
10095              return ! memcmp((char *)data_self +
10096                                  (offset * PyUnicode_KIND(substring)),
10097                              data_sub,
10098                              PyUnicode_GET_LENGTH(substring) *
10099                                  PyUnicode_KIND(substring));
10100          }
10101          /* otherwise we have to compare each character by first accessing it */
10102          else {
10103              /* We do not need to compare 0 and len(substring)-1 because
10104                 the if statement above ensured already that they are equal
10105                 when we end up here. */
10106              for (i = 1; i < end_sub; ++i) {
10107                  if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10108                      PyUnicode_READ(kind_sub, data_sub, i))
10109                      return 0;
10110              }
10111              return 1;
10112          }
10113      }
10114  
10115      return 0;
10116  }
10117  
10118  Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10119  PyUnicode_Tailmatch(PyObject *str,
10120                      PyObject *substr,
10121                      Py_ssize_t start,
10122                      Py_ssize_t end,
10123                      int direction)
10124  {
10125      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10126          return -1;
10127  
10128      return tailmatch(str, substr, start, end, direction);
10129  }
10130  
10131  static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)10132  ascii_upper_or_lower(PyObject *self, int lower)
10133  {
10134      Py_ssize_t len = PyUnicode_GET_LENGTH(self);
10135      const char *data = PyUnicode_DATA(self);
10136      char *resdata;
10137      PyObject *res;
10138  
10139      res = PyUnicode_New(len, 127);
10140      if (res == NULL)
10141          return NULL;
10142      resdata = PyUnicode_DATA(res);
10143      if (lower)
10144          _Py_bytes_lower(resdata, data, len);
10145      else
10146          _Py_bytes_upper(resdata, data, len);
10147      return res;
10148  }
10149  
10150  static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)10151  handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10152  {
10153      Py_ssize_t j;
10154      int final_sigma;
10155      Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
10156      /* U+03A3 is in the Final_Sigma context when, it is found like this:
10157  
10158       \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10159  
10160      where ! is a negation and \p{xxx} is a character with property xxx.
10161      */
10162      for (j = i - 1; j >= 0; j--) {
10163          c = PyUnicode_READ(kind, data, j);
10164          if (!_PyUnicode_IsCaseIgnorable(c))
10165              break;
10166      }
10167      final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10168      if (final_sigma) {
10169          for (j = i + 1; j < length; j++) {
10170              c = PyUnicode_READ(kind, data, j);
10171              if (!_PyUnicode_IsCaseIgnorable(c))
10172                  break;
10173          }
10174          final_sigma = j == length || !_PyUnicode_IsCased(c);
10175      }
10176      return (final_sigma) ? 0x3C2 : 0x3C3;
10177  }
10178  
10179  static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)10180  lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10181             Py_UCS4 c, Py_UCS4 *mapped)
10182  {
10183      /* Obscure special case. */
10184      if (c == 0x3A3) {
10185          mapped[0] = handle_capital_sigma(kind, data, length, i);
10186          return 1;
10187      }
10188      return _PyUnicode_ToLowerFull(c, mapped);
10189  }
10190  
10191  static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10192  do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10193  {
10194      Py_ssize_t i, k = 0;
10195      int n_res, j;
10196      Py_UCS4 c, mapped[3];
10197  
10198      c = PyUnicode_READ(kind, data, 0);
10199      n_res = _PyUnicode_ToTitleFull(c, mapped);
10200      for (j = 0; j < n_res; j++) {
10201          *maxchar = Py_MAX(*maxchar, mapped[j]);
10202          res[k++] = mapped[j];
10203      }
10204      for (i = 1; i < length; i++) {
10205          c = PyUnicode_READ(kind, data, i);
10206          n_res = lower_ucs4(kind, data, length, i, c, mapped);
10207          for (j = 0; j < n_res; j++) {
10208              *maxchar = Py_MAX(*maxchar, mapped[j]);
10209              res[k++] = mapped[j];
10210          }
10211      }
10212      return k;
10213  }
10214  
10215  static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10216  do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10217      Py_ssize_t i, k = 0;
10218  
10219      for (i = 0; i < length; i++) {
10220          Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10221          int n_res, j;
10222          if (Py_UNICODE_ISUPPER(c)) {
10223              n_res = lower_ucs4(kind, data, length, i, c, mapped);
10224          }
10225          else if (Py_UNICODE_ISLOWER(c)) {
10226              n_res = _PyUnicode_ToUpperFull(c, mapped);
10227          }
10228          else {
10229              n_res = 1;
10230              mapped[0] = c;
10231          }
10232          for (j = 0; j < n_res; j++) {
10233              *maxchar = Py_MAX(*maxchar, mapped[j]);
10234              res[k++] = mapped[j];
10235          }
10236      }
10237      return k;
10238  }
10239  
10240  static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)10241  do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10242                    Py_UCS4 *maxchar, int lower)
10243  {
10244      Py_ssize_t i, k = 0;
10245  
10246      for (i = 0; i < length; i++) {
10247          Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10248          int n_res, j;
10249          if (lower)
10250              n_res = lower_ucs4(kind, data, length, i, c, mapped);
10251          else
10252              n_res = _PyUnicode_ToUpperFull(c, mapped);
10253          for (j = 0; j < n_res; j++) {
10254              *maxchar = Py_MAX(*maxchar, mapped[j]);
10255              res[k++] = mapped[j];
10256          }
10257      }
10258      return k;
10259  }
10260  
10261  static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10262  do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10263  {
10264      return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10265  }
10266  
10267  static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10268  do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10269  {
10270      return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10271  }
10272  
10273  static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10274  do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10275  {
10276      Py_ssize_t i, k = 0;
10277  
10278      for (i = 0; i < length; i++) {
10279          Py_UCS4 c = PyUnicode_READ(kind, data, i);
10280          Py_UCS4 mapped[3];
10281          int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10282          for (j = 0; j < n_res; j++) {
10283              *maxchar = Py_MAX(*maxchar, mapped[j]);
10284              res[k++] = mapped[j];
10285          }
10286      }
10287      return k;
10288  }
10289  
10290  static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10291  do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10292  {
10293      Py_ssize_t i, k = 0;
10294      int previous_is_cased;
10295  
10296      previous_is_cased = 0;
10297      for (i = 0; i < length; i++) {
10298          const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10299          Py_UCS4 mapped[3];
10300          int n_res, j;
10301  
10302          if (previous_is_cased)
10303              n_res = lower_ucs4(kind, data, length, i, c, mapped);
10304          else
10305              n_res = _PyUnicode_ToTitleFull(c, mapped);
10306  
10307          for (j = 0; j < n_res; j++) {
10308              *maxchar = Py_MAX(*maxchar, mapped[j]);
10309              res[k++] = mapped[j];
10310          }
10311  
10312          previous_is_cased = _PyUnicode_IsCased(c);
10313      }
10314      return k;
10315  }
10316  
10317  static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10318  case_operation(PyObject *self,
10319                 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10320  {
10321      PyObject *res = NULL;
10322      Py_ssize_t length, newlength = 0;
10323      int kind, outkind;
10324      const void *data;
10325      void *outdata;
10326      Py_UCS4 maxchar = 0, *tmp, *tmpend;
10327  
10328      assert(PyUnicode_IS_READY(self));
10329  
10330      kind = PyUnicode_KIND(self);
10331      data = PyUnicode_DATA(self);
10332      length = PyUnicode_GET_LENGTH(self);
10333      if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10334          PyErr_SetString(PyExc_OverflowError, "string is too long");
10335          return NULL;
10336      }
10337      tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10338      if (tmp == NULL)
10339          return PyErr_NoMemory();
10340      newlength = perform(kind, data, length, tmp, &maxchar);
10341      res = PyUnicode_New(newlength, maxchar);
10342      if (res == NULL)
10343          goto leave;
10344      tmpend = tmp + newlength;
10345      outdata = PyUnicode_DATA(res);
10346      outkind = PyUnicode_KIND(res);
10347      switch (outkind) {
10348      case PyUnicode_1BYTE_KIND:
10349          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10350          break;
10351      case PyUnicode_2BYTE_KIND:
10352          _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10353          break;
10354      case PyUnicode_4BYTE_KIND:
10355          memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10356          break;
10357      default:
10358          Py_UNREACHABLE();
10359      }
10360    leave:
10361      PyMem_Free(tmp);
10362      return res;
10363  }
10364  
10365  PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10366  PyUnicode_Join(PyObject *separator, PyObject *seq)
10367  {
10368      PyObject *res;
10369      PyObject *fseq;
10370      Py_ssize_t seqlen;
10371      PyObject **items;
10372  
10373      fseq = PySequence_Fast(seq, "can only join an iterable");
10374      if (fseq == NULL) {
10375          return NULL;
10376      }
10377  
10378      /* NOTE: the following code can't call back into Python code,
10379       * so we are sure that fseq won't be mutated.
10380       */
10381  
10382      items = PySequence_Fast_ITEMS(fseq);
10383      seqlen = PySequence_Fast_GET_SIZE(fseq);
10384      res = _PyUnicode_JoinArray(separator, items, seqlen);
10385      Py_DECREF(fseq);
10386      return res;
10387  }
10388  
10389  PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10390  _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10391  {
10392      PyObject *res = NULL; /* the result */
10393      PyObject *sep = NULL;
10394      Py_ssize_t seplen;
10395      PyObject *item;
10396      Py_ssize_t sz, i, res_offset;
10397      Py_UCS4 maxchar;
10398      Py_UCS4 item_maxchar;
10399      int use_memcpy;
10400      unsigned char *res_data = NULL, *sep_data = NULL;
10401      PyObject *last_obj;
10402      unsigned int kind = 0;
10403  
10404      /* If empty sequence, return u"". */
10405      if (seqlen == 0) {
10406          _Py_RETURN_UNICODE_EMPTY();
10407      }
10408  
10409      /* If singleton sequence with an exact Unicode, return that. */
10410      last_obj = NULL;
10411      if (seqlen == 1) {
10412          if (PyUnicode_CheckExact(items[0])) {
10413              res = items[0];
10414              Py_INCREF(res);
10415              return res;
10416          }
10417          seplen = 0;
10418          maxchar = 0;
10419      }
10420      else {
10421          /* Set up sep and seplen */
10422          if (separator == NULL) {
10423              /* fall back to a blank space separator */
10424              sep = PyUnicode_FromOrdinal(' ');
10425              if (!sep)
10426                  goto onError;
10427              seplen = 1;
10428              maxchar = 32;
10429          }
10430          else {
10431              if (!PyUnicode_Check(separator)) {
10432                  PyErr_Format(PyExc_TypeError,
10433                               "separator: expected str instance,"
10434                               " %.80s found",
10435                               Py_TYPE(separator)->tp_name);
10436                  goto onError;
10437              }
10438              if (PyUnicode_READY(separator))
10439                  goto onError;
10440              sep = separator;
10441              seplen = PyUnicode_GET_LENGTH(separator);
10442              maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10443              /* inc refcount to keep this code path symmetric with the
10444                 above case of a blank separator */
10445              Py_INCREF(sep);
10446          }
10447          last_obj = sep;
10448      }
10449  
10450      /* There are at least two things to join, or else we have a subclass
10451       * of str in the sequence.
10452       * Do a pre-pass to figure out the total amount of space we'll
10453       * need (sz), and see whether all argument are strings.
10454       */
10455      sz = 0;
10456  #ifdef Py_DEBUG
10457      use_memcpy = 0;
10458  #else
10459      use_memcpy = 1;
10460  #endif
10461      for (i = 0; i < seqlen; i++) {
10462          size_t add_sz;
10463          item = items[i];
10464          if (!PyUnicode_Check(item)) {
10465              PyErr_Format(PyExc_TypeError,
10466                           "sequence item %zd: expected str instance,"
10467                           " %.80s found",
10468                           i, Py_TYPE(item)->tp_name);
10469              goto onError;
10470          }
10471          if (PyUnicode_READY(item) == -1)
10472              goto onError;
10473          add_sz = PyUnicode_GET_LENGTH(item);
10474          item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10475          maxchar = Py_MAX(maxchar, item_maxchar);
10476          if (i != 0) {
10477              add_sz += seplen;
10478          }
10479          if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10480              PyErr_SetString(PyExc_OverflowError,
10481                              "join() result is too long for a Python string");
10482              goto onError;
10483          }
10484          sz += add_sz;
10485          if (use_memcpy && last_obj != NULL) {
10486              if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10487                  use_memcpy = 0;
10488          }
10489          last_obj = item;
10490      }
10491  
10492      res = PyUnicode_New(sz, maxchar);
10493      if (res == NULL)
10494          goto onError;
10495  
10496      /* Catenate everything. */
10497  #ifdef Py_DEBUG
10498      use_memcpy = 0;
10499  #else
10500      if (use_memcpy) {
10501          res_data = PyUnicode_1BYTE_DATA(res);
10502          kind = PyUnicode_KIND(res);
10503          if (seplen != 0)
10504              sep_data = PyUnicode_1BYTE_DATA(sep);
10505      }
10506  #endif
10507      if (use_memcpy) {
10508          for (i = 0; i < seqlen; ++i) {
10509              Py_ssize_t itemlen;
10510              item = items[i];
10511  
10512              /* Copy item, and maybe the separator. */
10513              if (i && seplen != 0) {
10514                  memcpy(res_data,
10515                            sep_data,
10516                            kind * seplen);
10517                  res_data += kind * seplen;
10518              }
10519  
10520              itemlen = PyUnicode_GET_LENGTH(item);
10521              if (itemlen != 0) {
10522                  memcpy(res_data,
10523                            PyUnicode_DATA(item),
10524                            kind * itemlen);
10525                  res_data += kind * itemlen;
10526              }
10527          }
10528          assert(res_data == PyUnicode_1BYTE_DATA(res)
10529                             + kind * PyUnicode_GET_LENGTH(res));
10530      }
10531      else {
10532          for (i = 0, res_offset = 0; i < seqlen; ++i) {
10533              Py_ssize_t itemlen;
10534              item = items[i];
10535  
10536              /* Copy item, and maybe the separator. */
10537              if (i && seplen != 0) {
10538                  _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10539                  res_offset += seplen;
10540              }
10541  
10542              itemlen = PyUnicode_GET_LENGTH(item);
10543              if (itemlen != 0) {
10544                  _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10545                  res_offset += itemlen;
10546              }
10547          }
10548          assert(res_offset == PyUnicode_GET_LENGTH(res));
10549      }
10550  
10551      Py_XDECREF(sep);
10552      assert(_PyUnicode_CheckConsistency(res, 1));
10553      return res;
10554  
10555    onError:
10556      Py_XDECREF(sep);
10557      Py_XDECREF(res);
10558      return NULL;
10559  }
10560  
10561  void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10562  _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10563                      Py_UCS4 fill_char)
10564  {
10565      const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10566      void *data = PyUnicode_DATA(unicode);
10567      assert(PyUnicode_IS_READY(unicode));
10568      assert(unicode_modifiable(unicode));
10569      assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10570      assert(start >= 0);
10571      assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10572      unicode_fill(kind, data, fill_char, start, length);
10573  }
10574  
10575  Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10576  PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10577                 Py_UCS4 fill_char)
10578  {
10579      Py_ssize_t maxlen;
10580  
10581      if (!PyUnicode_Check(unicode)) {
10582          PyErr_BadInternalCall();
10583          return -1;
10584      }
10585      if (PyUnicode_READY(unicode) == -1)
10586          return -1;
10587      if (unicode_check_modifiable(unicode))
10588          return -1;
10589  
10590      if (start < 0) {
10591          PyErr_SetString(PyExc_IndexError, "string index out of range");
10592          return -1;
10593      }
10594      if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10595          PyErr_SetString(PyExc_ValueError,
10596                           "fill character is bigger than "
10597                           "the string maximum character");
10598          return -1;
10599      }
10600  
10601      maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10602      length = Py_MIN(maxlen, length);
10603      if (length <= 0)
10604          return 0;
10605  
10606      _PyUnicode_FastFill(unicode, start, length, fill_char);
10607      return length;
10608  }
10609  
10610  static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10611  pad(PyObject *self,
10612      Py_ssize_t left,
10613      Py_ssize_t right,
10614      Py_UCS4 fill)
10615  {
10616      PyObject *u;
10617      Py_UCS4 maxchar;
10618      int kind;
10619      void *data;
10620  
10621      if (left < 0)
10622          left = 0;
10623      if (right < 0)
10624          right = 0;
10625  
10626      if (left == 0 && right == 0)
10627          return unicode_result_unchanged(self);
10628  
10629      if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10630          right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10631          PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10632          return NULL;
10633      }
10634      maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10635      maxchar = Py_MAX(maxchar, fill);
10636      u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10637      if (!u)
10638          return NULL;
10639  
10640      kind = PyUnicode_KIND(u);
10641      data = PyUnicode_DATA(u);
10642      if (left)
10643          unicode_fill(kind, data, fill, 0, left);
10644      if (right)
10645          unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10646      _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10647      assert(_PyUnicode_CheckConsistency(u, 1));
10648      return u;
10649  }
10650  
10651  PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10652  PyUnicode_Splitlines(PyObject *string, int keepends)
10653  {
10654      PyObject *list;
10655  
10656      if (ensure_unicode(string) < 0)
10657          return NULL;
10658  
10659      switch (PyUnicode_KIND(string)) {
10660      case PyUnicode_1BYTE_KIND:
10661          if (PyUnicode_IS_ASCII(string))
10662              list = asciilib_splitlines(
10663                  string, PyUnicode_1BYTE_DATA(string),
10664                  PyUnicode_GET_LENGTH(string), keepends);
10665          else
10666              list = ucs1lib_splitlines(
10667                  string, PyUnicode_1BYTE_DATA(string),
10668                  PyUnicode_GET_LENGTH(string), keepends);
10669          break;
10670      case PyUnicode_2BYTE_KIND:
10671          list = ucs2lib_splitlines(
10672              string, PyUnicode_2BYTE_DATA(string),
10673              PyUnicode_GET_LENGTH(string), keepends);
10674          break;
10675      case PyUnicode_4BYTE_KIND:
10676          list = ucs4lib_splitlines(
10677              string, PyUnicode_4BYTE_DATA(string),
10678              PyUnicode_GET_LENGTH(string), keepends);
10679          break;
10680      default:
10681          Py_UNREACHABLE();
10682      }
10683      return list;
10684  }
10685  
10686  static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10687  split(PyObject *self,
10688        PyObject *substring,
10689        Py_ssize_t maxcount)
10690  {
10691      int kind1, kind2;
10692      const void *buf1, *buf2;
10693      Py_ssize_t len1, len2;
10694      PyObject* out;
10695  
10696      if (maxcount < 0)
10697          maxcount = PY_SSIZE_T_MAX;
10698  
10699      if (PyUnicode_READY(self) == -1)
10700          return NULL;
10701  
10702      if (substring == NULL)
10703          switch (PyUnicode_KIND(self)) {
10704          case PyUnicode_1BYTE_KIND:
10705              if (PyUnicode_IS_ASCII(self))
10706                  return asciilib_split_whitespace(
10707                      self,  PyUnicode_1BYTE_DATA(self),
10708                      PyUnicode_GET_LENGTH(self), maxcount
10709                      );
10710              else
10711                  return ucs1lib_split_whitespace(
10712                      self,  PyUnicode_1BYTE_DATA(self),
10713                      PyUnicode_GET_LENGTH(self), maxcount
10714                      );
10715          case PyUnicode_2BYTE_KIND:
10716              return ucs2lib_split_whitespace(
10717                  self,  PyUnicode_2BYTE_DATA(self),
10718                  PyUnicode_GET_LENGTH(self), maxcount
10719                  );
10720          case PyUnicode_4BYTE_KIND:
10721              return ucs4lib_split_whitespace(
10722                  self,  PyUnicode_4BYTE_DATA(self),
10723                  PyUnicode_GET_LENGTH(self), maxcount
10724                  );
10725          default:
10726              Py_UNREACHABLE();
10727          }
10728  
10729      if (PyUnicode_READY(substring) == -1)
10730          return NULL;
10731  
10732      kind1 = PyUnicode_KIND(self);
10733      kind2 = PyUnicode_KIND(substring);
10734      len1 = PyUnicode_GET_LENGTH(self);
10735      len2 = PyUnicode_GET_LENGTH(substring);
10736      if (kind1 < kind2 || len1 < len2) {
10737          out = PyList_New(1);
10738          if (out == NULL)
10739              return NULL;
10740          Py_INCREF(self);
10741          PyList_SET_ITEM(out, 0, self);
10742          return out;
10743      }
10744      buf1 = PyUnicode_DATA(self);
10745      buf2 = PyUnicode_DATA(substring);
10746      if (kind2 != kind1) {
10747          buf2 = unicode_askind(kind2, buf2, len2, kind1);
10748          if (!buf2)
10749              return NULL;
10750      }
10751  
10752      switch (kind1) {
10753      case PyUnicode_1BYTE_KIND:
10754          if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10755              out = asciilib_split(
10756                  self,  buf1, len1, buf2, len2, maxcount);
10757          else
10758              out = ucs1lib_split(
10759                  self,  buf1, len1, buf2, len2, maxcount);
10760          break;
10761      case PyUnicode_2BYTE_KIND:
10762          out = ucs2lib_split(
10763              self,  buf1, len1, buf2, len2, maxcount);
10764          break;
10765      case PyUnicode_4BYTE_KIND:
10766          out = ucs4lib_split(
10767              self,  buf1, len1, buf2, len2, maxcount);
10768          break;
10769      default:
10770          out = NULL;
10771      }
10772      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10773      if (kind2 != kind1)
10774          PyMem_Free((void *)buf2);
10775      return out;
10776  }
10777  
10778  static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10779  rsplit(PyObject *self,
10780         PyObject *substring,
10781         Py_ssize_t maxcount)
10782  {
10783      int kind1, kind2;
10784      const void *buf1, *buf2;
10785      Py_ssize_t len1, len2;
10786      PyObject* out;
10787  
10788      if (maxcount < 0)
10789          maxcount = PY_SSIZE_T_MAX;
10790  
10791      if (PyUnicode_READY(self) == -1)
10792          return NULL;
10793  
10794      if (substring == NULL)
10795          switch (PyUnicode_KIND(self)) {
10796          case PyUnicode_1BYTE_KIND:
10797              if (PyUnicode_IS_ASCII(self))
10798                  return asciilib_rsplit_whitespace(
10799                      self,  PyUnicode_1BYTE_DATA(self),
10800                      PyUnicode_GET_LENGTH(self), maxcount
10801                      );
10802              else
10803                  return ucs1lib_rsplit_whitespace(
10804                      self,  PyUnicode_1BYTE_DATA(self),
10805                      PyUnicode_GET_LENGTH(self), maxcount
10806                      );
10807          case PyUnicode_2BYTE_KIND:
10808              return ucs2lib_rsplit_whitespace(
10809                  self,  PyUnicode_2BYTE_DATA(self),
10810                  PyUnicode_GET_LENGTH(self), maxcount
10811                  );
10812          case PyUnicode_4BYTE_KIND:
10813              return ucs4lib_rsplit_whitespace(
10814                  self,  PyUnicode_4BYTE_DATA(self),
10815                  PyUnicode_GET_LENGTH(self), maxcount
10816                  );
10817          default:
10818              Py_UNREACHABLE();
10819          }
10820  
10821      if (PyUnicode_READY(substring) == -1)
10822          return NULL;
10823  
10824      kind1 = PyUnicode_KIND(self);
10825      kind2 = PyUnicode_KIND(substring);
10826      len1 = PyUnicode_GET_LENGTH(self);
10827      len2 = PyUnicode_GET_LENGTH(substring);
10828      if (kind1 < kind2 || len1 < len2) {
10829          out = PyList_New(1);
10830          if (out == NULL)
10831              return NULL;
10832          Py_INCREF(self);
10833          PyList_SET_ITEM(out, 0, self);
10834          return out;
10835      }
10836      buf1 = PyUnicode_DATA(self);
10837      buf2 = PyUnicode_DATA(substring);
10838      if (kind2 != kind1) {
10839          buf2 = unicode_askind(kind2, buf2, len2, kind1);
10840          if (!buf2)
10841              return NULL;
10842      }
10843  
10844      switch (kind1) {
10845      case PyUnicode_1BYTE_KIND:
10846          if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10847              out = asciilib_rsplit(
10848                  self,  buf1, len1, buf2, len2, maxcount);
10849          else
10850              out = ucs1lib_rsplit(
10851                  self,  buf1, len1, buf2, len2, maxcount);
10852          break;
10853      case PyUnicode_2BYTE_KIND:
10854          out = ucs2lib_rsplit(
10855              self,  buf1, len1, buf2, len2, maxcount);
10856          break;
10857      case PyUnicode_4BYTE_KIND:
10858          out = ucs4lib_rsplit(
10859              self,  buf1, len1, buf2, len2, maxcount);
10860          break;
10861      default:
10862          out = NULL;
10863      }
10864      assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10865      if (kind2 != kind1)
10866          PyMem_Free((void *)buf2);
10867      return out;
10868  }
10869  
10870  static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10871  anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10872              PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10873  {
10874      switch (kind) {
10875      case PyUnicode_1BYTE_KIND:
10876          if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10877              return asciilib_find(buf1, len1, buf2, len2, offset);
10878          else
10879              return ucs1lib_find(buf1, len1, buf2, len2, offset);
10880      case PyUnicode_2BYTE_KIND:
10881          return ucs2lib_find(buf1, len1, buf2, len2, offset);
10882      case PyUnicode_4BYTE_KIND:
10883          return ucs4lib_find(buf1, len1, buf2, len2, offset);
10884      }
10885      Py_UNREACHABLE();
10886  }
10887  
10888  static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10889  anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10890               PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10891  {
10892      switch (kind) {
10893      case PyUnicode_1BYTE_KIND:
10894          if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10895              return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10896          else
10897              return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10898      case PyUnicode_2BYTE_KIND:
10899          return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10900      case PyUnicode_4BYTE_KIND:
10901          return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10902      }
10903      Py_UNREACHABLE();
10904  }
10905  
10906  static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10907  replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10908                        Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10909  {
10910      int kind = PyUnicode_KIND(u);
10911      void *data = PyUnicode_DATA(u);
10912      Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10913      if (kind == PyUnicode_1BYTE_KIND) {
10914          ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10915                                        (Py_UCS1 *)data + len,
10916                                        u1, u2, maxcount);
10917      }
10918      else if (kind == PyUnicode_2BYTE_KIND) {
10919          ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10920                                        (Py_UCS2 *)data + len,
10921                                        u1, u2, maxcount);
10922      }
10923      else {
10924          assert(kind == PyUnicode_4BYTE_KIND);
10925          ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10926                                        (Py_UCS4 *)data + len,
10927                                        u1, u2, maxcount);
10928      }
10929  }
10930  
10931  static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10932  replace(PyObject *self, PyObject *str1,
10933          PyObject *str2, Py_ssize_t maxcount)
10934  {
10935      PyObject *u;
10936      const char *sbuf = PyUnicode_DATA(self);
10937      const void *buf1 = PyUnicode_DATA(str1);
10938      const void *buf2 = PyUnicode_DATA(str2);
10939      int srelease = 0, release1 = 0, release2 = 0;
10940      int skind = PyUnicode_KIND(self);
10941      int kind1 = PyUnicode_KIND(str1);
10942      int kind2 = PyUnicode_KIND(str2);
10943      Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10944      Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10945      Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10946      int mayshrink;
10947      Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10948  
10949      if (slen < len1)
10950          goto nothing;
10951  
10952      if (maxcount < 0)
10953          maxcount = PY_SSIZE_T_MAX;
10954      else if (maxcount == 0)
10955          goto nothing;
10956  
10957      if (str1 == str2)
10958          goto nothing;
10959  
10960      maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10961      maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10962      if (maxchar < maxchar_str1)
10963          /* substring too wide to be present */
10964          goto nothing;
10965      maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10966      /* Replacing str1 with str2 may cause a maxchar reduction in the
10967         result string. */
10968      mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10969      maxchar = Py_MAX(maxchar, maxchar_str2);
10970  
10971      if (len1 == len2) {
10972          /* same length */
10973          if (len1 == 0)
10974              goto nothing;
10975          if (len1 == 1) {
10976              /* replace characters */
10977              Py_UCS4 u1, u2;
10978              Py_ssize_t pos;
10979  
10980              u1 = PyUnicode_READ(kind1, buf1, 0);
10981              pos = findchar(sbuf, skind, slen, u1, 1);
10982              if (pos < 0)
10983                  goto nothing;
10984              u2 = PyUnicode_READ(kind2, buf2, 0);
10985              u = PyUnicode_New(slen, maxchar);
10986              if (!u)
10987                  goto error;
10988  
10989              _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10990              replace_1char_inplace(u, pos, u1, u2, maxcount);
10991          }
10992          else {
10993              int rkind = skind;
10994              char *res;
10995              Py_ssize_t i;
10996  
10997              if (kind1 < rkind) {
10998                  /* widen substring */
10999                  buf1 = unicode_askind(kind1, buf1, len1, rkind);
11000                  if (!buf1) goto error;
11001                  release1 = 1;
11002              }
11003              i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
11004              if (i < 0)
11005                  goto nothing;
11006              if (rkind > kind2) {
11007                  /* widen replacement */
11008                  buf2 = unicode_askind(kind2, buf2, len2, rkind);
11009                  if (!buf2) goto error;
11010                  release2 = 1;
11011              }
11012              else if (rkind < kind2) {
11013                  /* widen self and buf1 */
11014                  rkind = kind2;
11015                  if (release1) {
11016                      assert(buf1 != PyUnicode_DATA(str1));
11017                      PyMem_Free((void *)buf1);
11018                      buf1 = PyUnicode_DATA(str1);
11019                      release1 = 0;
11020                  }
11021                  sbuf = unicode_askind(skind, sbuf, slen, rkind);
11022                  if (!sbuf) goto error;
11023                  srelease = 1;
11024                  buf1 = unicode_askind(kind1, buf1, len1, rkind);
11025                  if (!buf1) goto error;
11026                  release1 = 1;
11027              }
11028              u = PyUnicode_New(slen, maxchar);
11029              if (!u)
11030                  goto error;
11031              assert(PyUnicode_KIND(u) == rkind);
11032              res = PyUnicode_DATA(u);
11033  
11034              memcpy(res, sbuf, rkind * slen);
11035              /* change everything in-place, starting with this one */
11036              memcpy(res + rkind * i,
11037                     buf2,
11038                     rkind * len2);
11039              i += len1;
11040  
11041              while ( --maxcount > 0) {
11042                  i = anylib_find(rkind, self,
11043                                  sbuf+rkind*i, slen-i,
11044                                  str1, buf1, len1, i);
11045                  if (i == -1)
11046                      break;
11047                  memcpy(res + rkind * i,
11048                         buf2,
11049                         rkind * len2);
11050                  i += len1;
11051              }
11052          }
11053      }
11054      else {
11055          Py_ssize_t n, i, j, ires;
11056          Py_ssize_t new_size;
11057          int rkind = skind;
11058          char *res;
11059  
11060          if (kind1 < rkind) {
11061              /* widen substring */
11062              buf1 = unicode_askind(kind1, buf1, len1, rkind);
11063              if (!buf1) goto error;
11064              release1 = 1;
11065          }
11066          n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
11067          if (n == 0)
11068              goto nothing;
11069          if (kind2 < rkind) {
11070              /* widen replacement */
11071              buf2 = unicode_askind(kind2, buf2, len2, rkind);
11072              if (!buf2) goto error;
11073              release2 = 1;
11074          }
11075          else if (kind2 > rkind) {
11076              /* widen self and buf1 */
11077              rkind = kind2;
11078              sbuf = unicode_askind(skind, sbuf, slen, rkind);
11079              if (!sbuf) goto error;
11080              srelease = 1;
11081              if (release1) {
11082                  assert(buf1 != PyUnicode_DATA(str1));
11083                  PyMem_Free((void *)buf1);
11084                  buf1 = PyUnicode_DATA(str1);
11085                  release1 = 0;
11086              }
11087              buf1 = unicode_askind(kind1, buf1, len1, rkind);
11088              if (!buf1) goto error;
11089              release1 = 1;
11090          }
11091          /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11092             PyUnicode_GET_LENGTH(str1)); */
11093          if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
11094                  PyErr_SetString(PyExc_OverflowError,
11095                                  "replace string is too long");
11096                  goto error;
11097          }
11098          new_size = slen + n * (len2 - len1);
11099          if (new_size == 0) {
11100              u = unicode_new_empty();
11101              goto done;
11102          }
11103          if (new_size > (PY_SSIZE_T_MAX / rkind)) {
11104              PyErr_SetString(PyExc_OverflowError,
11105                              "replace string is too long");
11106              goto error;
11107          }
11108          u = PyUnicode_New(new_size, maxchar);
11109          if (!u)
11110              goto error;
11111          assert(PyUnicode_KIND(u) == rkind);
11112          res = PyUnicode_DATA(u);
11113          ires = i = 0;
11114          if (len1 > 0) {
11115              while (n-- > 0) {
11116                  /* look for next match */
11117                  j = anylib_find(rkind, self,
11118                                  sbuf + rkind * i, slen-i,
11119                                  str1, buf1, len1, i);
11120                  if (j == -1)
11121                      break;
11122                  else if (j > i) {
11123                      /* copy unchanged part [i:j] */
11124                      memcpy(res + rkind * ires,
11125                             sbuf + rkind * i,
11126                             rkind * (j-i));
11127                      ires += j - i;
11128                  }
11129                  /* copy substitution string */
11130                  if (len2 > 0) {
11131                      memcpy(res + rkind * ires,
11132                             buf2,
11133                             rkind * len2);
11134                      ires += len2;
11135                  }
11136                  i = j + len1;
11137              }
11138              if (i < slen)
11139                  /* copy tail [i:] */
11140                  memcpy(res + rkind * ires,
11141                         sbuf + rkind * i,
11142                         rkind * (slen-i));
11143          }
11144          else {
11145              /* interleave */
11146              while (n > 0) {
11147                  memcpy(res + rkind * ires,
11148                         buf2,
11149                         rkind * len2);
11150                  ires += len2;
11151                  if (--n <= 0)
11152                      break;
11153                  memcpy(res + rkind * ires,
11154                         sbuf + rkind * i,
11155                         rkind);
11156                  ires++;
11157                  i++;
11158              }
11159              memcpy(res + rkind * ires,
11160                     sbuf + rkind * i,
11161                     rkind * (slen-i));
11162          }
11163      }
11164  
11165      if (mayshrink) {
11166          unicode_adjust_maxchar(&u);
11167          if (u == NULL)
11168              goto error;
11169      }
11170  
11171    done:
11172      assert(srelease == (sbuf != PyUnicode_DATA(self)));
11173      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11174      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11175      if (srelease)
11176          PyMem_Free((void *)sbuf);
11177      if (release1)
11178          PyMem_Free((void *)buf1);
11179      if (release2)
11180          PyMem_Free((void *)buf2);
11181      assert(_PyUnicode_CheckConsistency(u, 1));
11182      return u;
11183  
11184    nothing:
11185      /* nothing to replace; return original string (when possible) */
11186      assert(srelease == (sbuf != PyUnicode_DATA(self)));
11187      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11188      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11189      if (srelease)
11190          PyMem_Free((void *)sbuf);
11191      if (release1)
11192          PyMem_Free((void *)buf1);
11193      if (release2)
11194          PyMem_Free((void *)buf2);
11195      return unicode_result_unchanged(self);
11196  
11197    error:
11198      assert(srelease == (sbuf != PyUnicode_DATA(self)));
11199      assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11200      assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11201      if (srelease)
11202          PyMem_Free((void *)sbuf);
11203      if (release1)
11204          PyMem_Free((void *)buf1);
11205      if (release2)
11206          PyMem_Free((void *)buf2);
11207      return NULL;
11208  }
11209  
11210  /* --- Unicode Object Methods --------------------------------------------- */
11211  
11212  /*[clinic input]
11213  str.title as unicode_title
11214  
11215  Return a version of the string where each word is titlecased.
11216  
11217  More specifically, words start with uppercased characters and all remaining
11218  cased characters have lower case.
11219  [clinic start generated code]*/
11220  
11221  static PyObject *
unicode_title_impl(PyObject * self)11222  unicode_title_impl(PyObject *self)
11223  /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
11224  {
11225      if (PyUnicode_READY(self) == -1)
11226          return NULL;
11227      return case_operation(self, do_title);
11228  }
11229  
11230  /*[clinic input]
11231  str.capitalize as unicode_capitalize
11232  
11233  Return a capitalized version of the string.
11234  
11235  More specifically, make the first character have upper case and the rest lower
11236  case.
11237  [clinic start generated code]*/
11238  
11239  static PyObject *
unicode_capitalize_impl(PyObject * self)11240  unicode_capitalize_impl(PyObject *self)
11241  /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
11242  {
11243      if (PyUnicode_READY(self) == -1)
11244          return NULL;
11245      if (PyUnicode_GET_LENGTH(self) == 0)
11246          return unicode_result_unchanged(self);
11247      return case_operation(self, do_capitalize);
11248  }
11249  
11250  /*[clinic input]
11251  str.casefold as unicode_casefold
11252  
11253  Return a version of the string suitable for caseless comparisons.
11254  [clinic start generated code]*/
11255  
11256  static PyObject *
unicode_casefold_impl(PyObject * self)11257  unicode_casefold_impl(PyObject *self)
11258  /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
11259  {
11260      if (PyUnicode_READY(self) == -1)
11261          return NULL;
11262      if (PyUnicode_IS_ASCII(self))
11263          return ascii_upper_or_lower(self, 1);
11264      return case_operation(self, do_casefold);
11265  }
11266  
11267  
11268  /* Argument converter. Accepts a single Unicode character. */
11269  
11270  static int
convert_uc(PyObject * obj,void * addr)11271  convert_uc(PyObject *obj, void *addr)
11272  {
11273      Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11274  
11275      if (!PyUnicode_Check(obj)) {
11276          PyErr_Format(PyExc_TypeError,
11277                       "The fill character must be a unicode character, "
11278                       "not %.100s", Py_TYPE(obj)->tp_name);
11279          return 0;
11280      }
11281      if (PyUnicode_READY(obj) < 0)
11282          return 0;
11283      if (PyUnicode_GET_LENGTH(obj) != 1) {
11284          PyErr_SetString(PyExc_TypeError,
11285                          "The fill character must be exactly one character long");
11286          return 0;
11287      }
11288      *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11289      return 1;
11290  }
11291  
11292  /*[clinic input]
11293  str.center as unicode_center
11294  
11295      width: Py_ssize_t
11296      fillchar: Py_UCS4 = ' '
11297      /
11298  
11299  Return a centered string of length width.
11300  
11301  Padding is done using the specified fill character (default is a space).
11302  [clinic start generated code]*/
11303  
11304  static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11305  unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11306  /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11307  {
11308      Py_ssize_t marg, left;
11309  
11310      if (PyUnicode_READY(self) == -1)
11311          return NULL;
11312  
11313      if (PyUnicode_GET_LENGTH(self) >= width)
11314          return unicode_result_unchanged(self);
11315  
11316      marg = width - PyUnicode_GET_LENGTH(self);
11317      left = marg / 2 + (marg & width & 1);
11318  
11319      return pad(self, left, marg - left, fillchar);
11320  }
11321  
11322  /* This function assumes that str1 and str2 are readied by the caller. */
11323  
11324  static int
unicode_compare(PyObject * str1,PyObject * str2)11325  unicode_compare(PyObject *str1, PyObject *str2)
11326  {
11327  #define COMPARE(TYPE1, TYPE2) \
11328      do { \
11329          TYPE1* p1 = (TYPE1 *)data1; \
11330          TYPE2* p2 = (TYPE2 *)data2; \
11331          TYPE1* end = p1 + len; \
11332          Py_UCS4 c1, c2; \
11333          for (; p1 != end; p1++, p2++) { \
11334              c1 = *p1; \
11335              c2 = *p2; \
11336              if (c1 != c2) \
11337                  return (c1 < c2) ? -1 : 1; \
11338          } \
11339      } \
11340      while (0)
11341  
11342      int kind1, kind2;
11343      const void *data1, *data2;
11344      Py_ssize_t len1, len2, len;
11345  
11346      kind1 = PyUnicode_KIND(str1);
11347      kind2 = PyUnicode_KIND(str2);
11348      data1 = PyUnicode_DATA(str1);
11349      data2 = PyUnicode_DATA(str2);
11350      len1 = PyUnicode_GET_LENGTH(str1);
11351      len2 = PyUnicode_GET_LENGTH(str2);
11352      len = Py_MIN(len1, len2);
11353  
11354      switch(kind1) {
11355      case PyUnicode_1BYTE_KIND:
11356      {
11357          switch(kind2) {
11358          case PyUnicode_1BYTE_KIND:
11359          {
11360              int cmp = memcmp(data1, data2, len);
11361              /* normalize result of memcmp() into the range [-1; 1] */
11362              if (cmp < 0)
11363                  return -1;
11364              if (cmp > 0)
11365                  return 1;
11366              break;
11367          }
11368          case PyUnicode_2BYTE_KIND:
11369              COMPARE(Py_UCS1, Py_UCS2);
11370              break;
11371          case PyUnicode_4BYTE_KIND:
11372              COMPARE(Py_UCS1, Py_UCS4);
11373              break;
11374          default:
11375              Py_UNREACHABLE();
11376          }
11377          break;
11378      }
11379      case PyUnicode_2BYTE_KIND:
11380      {
11381          switch(kind2) {
11382          case PyUnicode_1BYTE_KIND:
11383              COMPARE(Py_UCS2, Py_UCS1);
11384              break;
11385          case PyUnicode_2BYTE_KIND:
11386          {
11387              COMPARE(Py_UCS2, Py_UCS2);
11388              break;
11389          }
11390          case PyUnicode_4BYTE_KIND:
11391              COMPARE(Py_UCS2, Py_UCS4);
11392              break;
11393          default:
11394              Py_UNREACHABLE();
11395          }
11396          break;
11397      }
11398      case PyUnicode_4BYTE_KIND:
11399      {
11400          switch(kind2) {
11401          case PyUnicode_1BYTE_KIND:
11402              COMPARE(Py_UCS4, Py_UCS1);
11403              break;
11404          case PyUnicode_2BYTE_KIND:
11405              COMPARE(Py_UCS4, Py_UCS2);
11406              break;
11407          case PyUnicode_4BYTE_KIND:
11408          {
11409  #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11410              int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11411              /* normalize result of wmemcmp() into the range [-1; 1] */
11412              if (cmp < 0)
11413                  return -1;
11414              if (cmp > 0)
11415                  return 1;
11416  #else
11417              COMPARE(Py_UCS4, Py_UCS4);
11418  #endif
11419              break;
11420          }
11421          default:
11422              Py_UNREACHABLE();
11423          }
11424          break;
11425      }
11426      default:
11427          Py_UNREACHABLE();
11428      }
11429  
11430      if (len1 == len2)
11431          return 0;
11432      if (len1 < len2)
11433          return -1;
11434      else
11435          return 1;
11436  
11437  #undef COMPARE
11438  }
11439  
11440  static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11441  unicode_compare_eq(PyObject *str1, PyObject *str2)
11442  {
11443      int kind;
11444      const void *data1, *data2;
11445      Py_ssize_t len;
11446      int cmp;
11447  
11448      len = PyUnicode_GET_LENGTH(str1);
11449      if (PyUnicode_GET_LENGTH(str2) != len)
11450          return 0;
11451      kind = PyUnicode_KIND(str1);
11452      if (PyUnicode_KIND(str2) != kind)
11453          return 0;
11454      data1 = PyUnicode_DATA(str1);
11455      data2 = PyUnicode_DATA(str2);
11456  
11457      cmp = memcmp(data1, data2, len * kind);
11458      return (cmp == 0);
11459  }
11460  
11461  
11462  int
PyUnicode_Compare(PyObject * left,PyObject * right)11463  PyUnicode_Compare(PyObject *left, PyObject *right)
11464  {
11465      if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11466          if (PyUnicode_READY(left) == -1 ||
11467              PyUnicode_READY(right) == -1)
11468              return -1;
11469  
11470          /* a string is equal to itself */
11471          if (left == right)
11472              return 0;
11473  
11474          return unicode_compare(left, right);
11475      }
11476      PyErr_Format(PyExc_TypeError,
11477                   "Can't compare %.100s and %.100s",
11478                   Py_TYPE(left)->tp_name,
11479                   Py_TYPE(right)->tp_name);
11480      return -1;
11481  }
11482  
11483  int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11484  PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11485  {
11486      Py_ssize_t i;
11487      int kind;
11488      Py_UCS4 chr;
11489      const unsigned char *ustr = (const unsigned char *)str;
11490  
11491      assert(_PyUnicode_CHECK(uni));
11492      if (!PyUnicode_IS_READY(uni)) {
11493          const wchar_t *ws = _PyUnicode_WSTR(uni);
11494          /* Compare Unicode string and source character set string */
11495          for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11496              if (chr != ustr[i])
11497                  return (chr < ustr[i]) ? -1 : 1;
11498          }
11499          /* This check keeps Python strings that end in '\0' from comparing equal
11500           to C strings identical up to that point. */
11501          if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11502              return 1; /* uni is longer */
11503          if (ustr[i])
11504              return -1; /* str is longer */
11505          return 0;
11506      }
11507      kind = PyUnicode_KIND(uni);
11508      if (kind == PyUnicode_1BYTE_KIND) {
11509          const void *data = PyUnicode_1BYTE_DATA(uni);
11510          size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11511          size_t len, len2 = strlen(str);
11512          int cmp;
11513  
11514          len = Py_MIN(len1, len2);
11515          cmp = memcmp(data, str, len);
11516          if (cmp != 0) {
11517              if (cmp < 0)
11518                  return -1;
11519              else
11520                  return 1;
11521          }
11522          if (len1 > len2)
11523              return 1; /* uni is longer */
11524          if (len1 < len2)
11525              return -1; /* str is longer */
11526          return 0;
11527      }
11528      else {
11529          const void *data = PyUnicode_DATA(uni);
11530          /* Compare Unicode string and source character set string */
11531          for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11532              if (chr != (unsigned char)str[i])
11533                  return (chr < (unsigned char)(str[i])) ? -1 : 1;
11534          /* This check keeps Python strings that end in '\0' from comparing equal
11535           to C strings identical up to that point. */
11536          if (PyUnicode_GET_LENGTH(uni) != i || chr)
11537              return 1; /* uni is longer */
11538          if (str[i])
11539              return -1; /* str is longer */
11540          return 0;
11541      }
11542  }
11543  
11544  static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11545  non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11546  {
11547      size_t i, len;
11548      const wchar_t *p;
11549      len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11550      if (strlen(str) != len)
11551          return 0;
11552      p = _PyUnicode_WSTR(unicode);
11553      assert(p);
11554      for (i = 0; i < len; i++) {
11555          unsigned char c = (unsigned char)str[i];
11556          if (c >= 128 || p[i] != (wchar_t)c)
11557              return 0;
11558      }
11559      return 1;
11560  }
11561  
11562  int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11563  _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11564  {
11565      size_t len;
11566      assert(_PyUnicode_CHECK(unicode));
11567      assert(str);
11568  #ifndef NDEBUG
11569      for (const char *p = str; *p; p++) {
11570          assert((unsigned char)*p < 128);
11571      }
11572  #endif
11573      if (PyUnicode_READY(unicode) == -1) {
11574          /* Memory error or bad data */
11575          PyErr_Clear();
11576          return non_ready_unicode_equal_to_ascii_string(unicode, str);
11577      }
11578      if (!PyUnicode_IS_ASCII(unicode))
11579          return 0;
11580      len = (size_t)PyUnicode_GET_LENGTH(unicode);
11581      return strlen(str) == len &&
11582             memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11583  }
11584  
11585  int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11586  _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11587  {
11588      PyObject *right_uni;
11589  
11590      assert(_PyUnicode_CHECK(left));
11591      assert(right->string);
11592  #ifndef NDEBUG
11593      for (const char *p = right->string; *p; p++) {
11594          assert((unsigned char)*p < 128);
11595      }
11596  #endif
11597  
11598      if (PyUnicode_READY(left) == -1) {
11599          /* memory error or bad data */
11600          PyErr_Clear();
11601          return non_ready_unicode_equal_to_ascii_string(left, right->string);
11602      }
11603  
11604      if (!PyUnicode_IS_ASCII(left))
11605          return 0;
11606  
11607      right_uni = _PyUnicode_FromId(right);       /* borrowed */
11608      if (right_uni == NULL) {
11609          /* memory error or bad data */
11610          PyErr_Clear();
11611          return _PyUnicode_EqualToASCIIString(left, right->string);
11612      }
11613  
11614      if (left == right_uni)
11615          return 1;
11616  
11617      if (PyUnicode_CHECK_INTERNED(left))
11618          return 0;
11619  
11620  #ifdef INTERNED_STRINGS
11621      assert(_PyUnicode_HASH(right_uni) != -1);
11622      Py_hash_t hash = _PyUnicode_HASH(left);
11623      if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11624          return 0;
11625      }
11626  #endif
11627  
11628      return unicode_compare_eq(left, right_uni);
11629  }
11630  
11631  PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11632  PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11633  {
11634      int result;
11635  
11636      if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11637          Py_RETURN_NOTIMPLEMENTED;
11638  
11639      if (PyUnicode_READY(left) == -1 ||
11640          PyUnicode_READY(right) == -1)
11641          return NULL;
11642  
11643      if (left == right) {
11644          switch (op) {
11645          case Py_EQ:
11646          case Py_LE:
11647          case Py_GE:
11648              /* a string is equal to itself */
11649              Py_RETURN_TRUE;
11650          case Py_NE:
11651          case Py_LT:
11652          case Py_GT:
11653              Py_RETURN_FALSE;
11654          default:
11655              PyErr_BadArgument();
11656              return NULL;
11657          }
11658      }
11659      else if (op == Py_EQ || op == Py_NE) {
11660          result = unicode_compare_eq(left, right);
11661          result ^= (op == Py_NE);
11662          return PyBool_FromLong(result);
11663      }
11664      else {
11665          result = unicode_compare(left, right);
11666          Py_RETURN_RICHCOMPARE(result, 0, op);
11667      }
11668  }
11669  
11670  int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11671  _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11672  {
11673      return unicode_eq(aa, bb);
11674  }
11675  
11676  int
PyUnicode_Contains(PyObject * str,PyObject * substr)11677  PyUnicode_Contains(PyObject *str, PyObject *substr)
11678  {
11679      int kind1, kind2;
11680      const void *buf1, *buf2;
11681      Py_ssize_t len1, len2;
11682      int result;
11683  
11684      if (!PyUnicode_Check(substr)) {
11685          PyErr_Format(PyExc_TypeError,
11686                       "'in <string>' requires string as left operand, not %.100s",
11687                       Py_TYPE(substr)->tp_name);
11688          return -1;
11689      }
11690      if (PyUnicode_READY(substr) == -1)
11691          return -1;
11692      if (ensure_unicode(str) < 0)
11693          return -1;
11694  
11695      kind1 = PyUnicode_KIND(str);
11696      kind2 = PyUnicode_KIND(substr);
11697      if (kind1 < kind2)
11698          return 0;
11699      len1 = PyUnicode_GET_LENGTH(str);
11700      len2 = PyUnicode_GET_LENGTH(substr);
11701      if (len1 < len2)
11702          return 0;
11703      buf1 = PyUnicode_DATA(str);
11704      buf2 = PyUnicode_DATA(substr);
11705      if (len2 == 1) {
11706          Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11707          result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11708          return result;
11709      }
11710      if (kind2 != kind1) {
11711          buf2 = unicode_askind(kind2, buf2, len2, kind1);
11712          if (!buf2)
11713              return -1;
11714      }
11715  
11716      switch (kind1) {
11717      case PyUnicode_1BYTE_KIND:
11718          result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11719          break;
11720      case PyUnicode_2BYTE_KIND:
11721          result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11722          break;
11723      case PyUnicode_4BYTE_KIND:
11724          result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11725          break;
11726      default:
11727          Py_UNREACHABLE();
11728      }
11729  
11730      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11731      if (kind2 != kind1)
11732          PyMem_Free((void *)buf2);
11733  
11734      return result;
11735  }
11736  
11737  /* Concat to string or Unicode object giving a new Unicode object. */
11738  
11739  PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11740  PyUnicode_Concat(PyObject *left, PyObject *right)
11741  {
11742      PyObject *result;
11743      Py_UCS4 maxchar, maxchar2;
11744      Py_ssize_t left_len, right_len, new_len;
11745  
11746      if (ensure_unicode(left) < 0)
11747          return NULL;
11748  
11749      if (!PyUnicode_Check(right)) {
11750          PyErr_Format(PyExc_TypeError,
11751                       "can only concatenate str (not \"%.200s\") to str",
11752                       Py_TYPE(right)->tp_name);
11753          return NULL;
11754      }
11755      if (PyUnicode_READY(right) < 0)
11756          return NULL;
11757  
11758      /* Shortcuts */
11759      PyObject *empty = unicode_get_empty();  // Borrowed reference
11760      if (left == empty) {
11761          return PyUnicode_FromObject(right);
11762      }
11763      if (right == empty) {
11764          return PyUnicode_FromObject(left);
11765      }
11766  
11767      left_len = PyUnicode_GET_LENGTH(left);
11768      right_len = PyUnicode_GET_LENGTH(right);
11769      if (left_len > PY_SSIZE_T_MAX - right_len) {
11770          PyErr_SetString(PyExc_OverflowError,
11771                          "strings are too large to concat");
11772          return NULL;
11773      }
11774      new_len = left_len + right_len;
11775  
11776      maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11777      maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11778      maxchar = Py_MAX(maxchar, maxchar2);
11779  
11780      /* Concat the two Unicode strings */
11781      result = PyUnicode_New(new_len, maxchar);
11782      if (result == NULL)
11783          return NULL;
11784      _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11785      _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11786      assert(_PyUnicode_CheckConsistency(result, 1));
11787      return result;
11788  }
11789  
11790  void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11791  PyUnicode_Append(PyObject **p_left, PyObject *right)
11792  {
11793      PyObject *left, *res;
11794      Py_UCS4 maxchar, maxchar2;
11795      Py_ssize_t left_len, right_len, new_len;
11796  
11797      if (p_left == NULL) {
11798          if (!PyErr_Occurred())
11799              PyErr_BadInternalCall();
11800          return;
11801      }
11802      left = *p_left;
11803      if (right == NULL || left == NULL
11804          || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11805          if (!PyErr_Occurred())
11806              PyErr_BadInternalCall();
11807          goto error;
11808      }
11809  
11810      if (PyUnicode_READY(left) == -1)
11811          goto error;
11812      if (PyUnicode_READY(right) == -1)
11813          goto error;
11814  
11815      /* Shortcuts */
11816      PyObject *empty = unicode_get_empty();  // Borrowed reference
11817      if (left == empty) {
11818          Py_DECREF(left);
11819          Py_INCREF(right);
11820          *p_left = right;
11821          return;
11822      }
11823      if (right == empty) {
11824          return;
11825      }
11826  
11827      left_len = PyUnicode_GET_LENGTH(left);
11828      right_len = PyUnicode_GET_LENGTH(right);
11829      if (left_len > PY_SSIZE_T_MAX - right_len) {
11830          PyErr_SetString(PyExc_OverflowError,
11831                          "strings are too large to concat");
11832          goto error;
11833      }
11834      new_len = left_len + right_len;
11835  
11836      if (unicode_modifiable(left)
11837          && PyUnicode_CheckExact(right)
11838          && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11839          /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11840             to change the structure size, but characters are stored just after
11841             the structure, and so it requires to move all characters which is
11842             not so different than duplicating the string. */
11843          && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11844      {
11845          /* append inplace */
11846          if (unicode_resize(p_left, new_len) != 0)
11847              goto error;
11848  
11849          /* copy 'right' into the newly allocated area of 'left' */
11850          _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11851      }
11852      else {
11853          maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11854          maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11855          maxchar = Py_MAX(maxchar, maxchar2);
11856  
11857          /* Concat the two Unicode strings */
11858          res = PyUnicode_New(new_len, maxchar);
11859          if (res == NULL)
11860              goto error;
11861          _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11862          _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11863          Py_DECREF(left);
11864          *p_left = res;
11865      }
11866      assert(_PyUnicode_CheckConsistency(*p_left, 1));
11867      return;
11868  
11869  error:
11870      Py_CLEAR(*p_left);
11871  }
11872  
11873  void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11874  PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11875  {
11876      PyUnicode_Append(pleft, right);
11877      Py_XDECREF(right);
11878  }
11879  
11880  /*
11881  Wraps stringlib_parse_args_finds() and additionally ensures that the
11882  first argument is a unicode object.
11883  */
11884  
11885  static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11886  parse_args_finds_unicode(const char * function_name, PyObject *args,
11887                           PyObject **substring,
11888                           Py_ssize_t *start, Py_ssize_t *end)
11889  {
11890      if(stringlib_parse_args_finds(function_name, args, substring,
11891                                    start, end)) {
11892          if (ensure_unicode(*substring) < 0)
11893              return 0;
11894          return 1;
11895      }
11896      return 0;
11897  }
11898  
11899  PyDoc_STRVAR(count__doc__,
11900               "S.count(sub[, start[, end]]) -> int\n\
11901  \n\
11902  Return the number of non-overlapping occurrences of substring sub in\n\
11903  string S[start:end].  Optional arguments start and end are\n\
11904  interpreted as in slice notation.");
11905  
11906  static PyObject *
unicode_count(PyObject * self,PyObject * args)11907  unicode_count(PyObject *self, PyObject *args)
11908  {
11909      PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11910      Py_ssize_t start = 0;
11911      Py_ssize_t end = PY_SSIZE_T_MAX;
11912      PyObject *result;
11913      int kind1, kind2;
11914      const void *buf1, *buf2;
11915      Py_ssize_t len1, len2, iresult;
11916  
11917      if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11918          return NULL;
11919  
11920      kind1 = PyUnicode_KIND(self);
11921      kind2 = PyUnicode_KIND(substring);
11922      if (kind1 < kind2)
11923          return PyLong_FromLong(0);
11924  
11925      len1 = PyUnicode_GET_LENGTH(self);
11926      len2 = PyUnicode_GET_LENGTH(substring);
11927      ADJUST_INDICES(start, end, len1);
11928      if (end - start < len2)
11929          return PyLong_FromLong(0);
11930  
11931      buf1 = PyUnicode_DATA(self);
11932      buf2 = PyUnicode_DATA(substring);
11933      if (kind2 != kind1) {
11934          buf2 = unicode_askind(kind2, buf2, len2, kind1);
11935          if (!buf2)
11936              return NULL;
11937      }
11938      switch (kind1) {
11939      case PyUnicode_1BYTE_KIND:
11940          iresult = ucs1lib_count(
11941              ((const Py_UCS1*)buf1) + start, end - start,
11942              buf2, len2, PY_SSIZE_T_MAX
11943              );
11944          break;
11945      case PyUnicode_2BYTE_KIND:
11946          iresult = ucs2lib_count(
11947              ((const Py_UCS2*)buf1) + start, end - start,
11948              buf2, len2, PY_SSIZE_T_MAX
11949              );
11950          break;
11951      case PyUnicode_4BYTE_KIND:
11952          iresult = ucs4lib_count(
11953              ((const Py_UCS4*)buf1) + start, end - start,
11954              buf2, len2, PY_SSIZE_T_MAX
11955              );
11956          break;
11957      default:
11958          Py_UNREACHABLE();
11959      }
11960  
11961      result = PyLong_FromSsize_t(iresult);
11962  
11963      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11964      if (kind2 != kind1)
11965          PyMem_Free((void *)buf2);
11966  
11967      return result;
11968  }
11969  
11970  /*[clinic input]
11971  str.encode as unicode_encode
11972  
11973      encoding: str(c_default="NULL") = 'utf-8'
11974          The encoding in which to encode the string.
11975      errors: str(c_default="NULL") = 'strict'
11976          The error handling scheme to use for encoding errors.
11977          The default is 'strict' meaning that encoding errors raise a
11978          UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11979          'xmlcharrefreplace' as well as any other name registered with
11980          codecs.register_error that can handle UnicodeEncodeErrors.
11981  
11982  Encode the string using the codec registered for encoding.
11983  [clinic start generated code]*/
11984  
11985  static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11986  unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11987  /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11988  {
11989      return PyUnicode_AsEncodedString(self, encoding, errors);
11990  }
11991  
11992  /*[clinic input]
11993  str.expandtabs as unicode_expandtabs
11994  
11995      tabsize: int = 8
11996  
11997  Return a copy where all tab characters are expanded using spaces.
11998  
11999  If tabsize is not given, a tab size of 8 characters is assumed.
12000  [clinic start generated code]*/
12001  
12002  static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)12003  unicode_expandtabs_impl(PyObject *self, int tabsize)
12004  /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
12005  {
12006      Py_ssize_t i, j, line_pos, src_len, incr;
12007      Py_UCS4 ch;
12008      PyObject *u;
12009      const void *src_data;
12010      void *dest_data;
12011      int kind;
12012      int found;
12013  
12014      if (PyUnicode_READY(self) == -1)
12015          return NULL;
12016  
12017      /* First pass: determine size of output string */
12018      src_len = PyUnicode_GET_LENGTH(self);
12019      i = j = line_pos = 0;
12020      kind = PyUnicode_KIND(self);
12021      src_data = PyUnicode_DATA(self);
12022      found = 0;
12023      for (; i < src_len; i++) {
12024          ch = PyUnicode_READ(kind, src_data, i);
12025          if (ch == '\t') {
12026              found = 1;
12027              if (tabsize > 0) {
12028                  incr = tabsize - (line_pos % tabsize); /* cannot overflow */
12029                  if (j > PY_SSIZE_T_MAX - incr)
12030                      goto overflow;
12031                  line_pos += incr;
12032                  j += incr;
12033              }
12034          }
12035          else {
12036              if (j > PY_SSIZE_T_MAX - 1)
12037                  goto overflow;
12038              line_pos++;
12039              j++;
12040              if (ch == '\n' || ch == '\r')
12041                  line_pos = 0;
12042          }
12043      }
12044      if (!found)
12045          return unicode_result_unchanged(self);
12046  
12047      /* Second pass: create output string and fill it */
12048      u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
12049      if (!u)
12050          return NULL;
12051      dest_data = PyUnicode_DATA(u);
12052  
12053      i = j = line_pos = 0;
12054  
12055      for (; i < src_len; i++) {
12056          ch = PyUnicode_READ(kind, src_data, i);
12057          if (ch == '\t') {
12058              if (tabsize > 0) {
12059                  incr = tabsize - (line_pos % tabsize);
12060                  line_pos += incr;
12061                  unicode_fill(kind, dest_data, ' ', j, incr);
12062                  j += incr;
12063              }
12064          }
12065          else {
12066              line_pos++;
12067              PyUnicode_WRITE(kind, dest_data, j, ch);
12068              j++;
12069              if (ch == '\n' || ch == '\r')
12070                  line_pos = 0;
12071          }
12072      }
12073      assert (j == PyUnicode_GET_LENGTH(u));
12074      return unicode_result(u);
12075  
12076    overflow:
12077      PyErr_SetString(PyExc_OverflowError, "new string is too long");
12078      return NULL;
12079  }
12080  
12081  PyDoc_STRVAR(find__doc__,
12082               "S.find(sub[, start[, end]]) -> int\n\
12083  \n\
12084  Return the lowest index in S where substring sub is found,\n\
12085  such that sub is contained within S[start:end].  Optional\n\
12086  arguments start and end are interpreted as in slice notation.\n\
12087  \n\
12088  Return -1 on failure.");
12089  
12090  static PyObject *
unicode_find(PyObject * self,PyObject * args)12091  unicode_find(PyObject *self, PyObject *args)
12092  {
12093      /* initialize variables to prevent gcc warning */
12094      PyObject *substring = NULL;
12095      Py_ssize_t start = 0;
12096      Py_ssize_t end = 0;
12097      Py_ssize_t result;
12098  
12099      if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
12100          return NULL;
12101  
12102      if (PyUnicode_READY(self) == -1)
12103          return NULL;
12104  
12105      result = any_find_slice(self, substring, start, end, 1);
12106  
12107      if (result == -2)
12108          return NULL;
12109  
12110      return PyLong_FromSsize_t(result);
12111  }
12112  
12113  static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)12114  unicode_getitem(PyObject *self, Py_ssize_t index)
12115  {
12116      const void *data;
12117      enum PyUnicode_Kind kind;
12118      Py_UCS4 ch;
12119  
12120      if (!PyUnicode_Check(self)) {
12121          PyErr_BadArgument();
12122          return NULL;
12123      }
12124      if (PyUnicode_READY(self) == -1) {
12125          return NULL;
12126      }
12127      if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12128          PyErr_SetString(PyExc_IndexError, "string index out of range");
12129          return NULL;
12130      }
12131      kind = PyUnicode_KIND(self);
12132      data = PyUnicode_DATA(self);
12133      ch = PyUnicode_READ(kind, data, index);
12134      return unicode_char(ch);
12135  }
12136  
12137  /* Believe it or not, this produces the same value for ASCII strings
12138     as bytes_hash(). */
12139  static Py_hash_t
unicode_hash(PyObject * self)12140  unicode_hash(PyObject *self)
12141  {
12142      Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
12143  
12144  #ifdef Py_DEBUG
12145      assert(_Py_HashSecret_Initialized);
12146  #endif
12147      if (_PyUnicode_HASH(self) != -1)
12148          return _PyUnicode_HASH(self);
12149      if (PyUnicode_READY(self) == -1)
12150          return -1;
12151  
12152      x = _Py_HashBytes(PyUnicode_DATA(self),
12153                        PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
12154      _PyUnicode_HASH(self) = x;
12155      return x;
12156  }
12157  
12158  PyDoc_STRVAR(index__doc__,
12159               "S.index(sub[, start[, end]]) -> int\n\
12160  \n\
12161  Return the lowest index in S where substring sub is found,\n\
12162  such that sub is contained within S[start:end].  Optional\n\
12163  arguments start and end are interpreted as in slice notation.\n\
12164  \n\
12165  Raises ValueError when the substring is not found.");
12166  
12167  static PyObject *
unicode_index(PyObject * self,PyObject * args)12168  unicode_index(PyObject *self, PyObject *args)
12169  {
12170      /* initialize variables to prevent gcc warning */
12171      Py_ssize_t result;
12172      PyObject *substring = NULL;
12173      Py_ssize_t start = 0;
12174      Py_ssize_t end = 0;
12175  
12176      if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
12177          return NULL;
12178  
12179      if (PyUnicode_READY(self) == -1)
12180          return NULL;
12181  
12182      result = any_find_slice(self, substring, start, end, 1);
12183  
12184      if (result == -2)
12185          return NULL;
12186  
12187      if (result < 0) {
12188          PyErr_SetString(PyExc_ValueError, "substring not found");
12189          return NULL;
12190      }
12191  
12192      return PyLong_FromSsize_t(result);
12193  }
12194  
12195  /*[clinic input]
12196  str.isascii as unicode_isascii
12197  
12198  Return True if all characters in the string are ASCII, False otherwise.
12199  
12200  ASCII characters have code points in the range U+0000-U+007F.
12201  Empty string is ASCII too.
12202  [clinic start generated code]*/
12203  
12204  static PyObject *
unicode_isascii_impl(PyObject * self)12205  unicode_isascii_impl(PyObject *self)
12206  /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12207  {
12208      if (PyUnicode_READY(self) == -1) {
12209          return NULL;
12210      }
12211      return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12212  }
12213  
12214  /*[clinic input]
12215  str.islower as unicode_islower
12216  
12217  Return True if the string is a lowercase string, False otherwise.
12218  
12219  A string is lowercase if all cased characters in the string are lowercase and
12220  there is at least one cased character in the string.
12221  [clinic start generated code]*/
12222  
12223  static PyObject *
unicode_islower_impl(PyObject * self)12224  unicode_islower_impl(PyObject *self)
12225  /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
12226  {
12227      Py_ssize_t i, length;
12228      int kind;
12229      const void *data;
12230      int cased;
12231  
12232      if (PyUnicode_READY(self) == -1)
12233          return NULL;
12234      length = PyUnicode_GET_LENGTH(self);
12235      kind = PyUnicode_KIND(self);
12236      data = PyUnicode_DATA(self);
12237  
12238      /* Shortcut for single character strings */
12239      if (length == 1)
12240          return PyBool_FromLong(
12241              Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
12242  
12243      /* Special case for empty strings */
12244      if (length == 0)
12245          Py_RETURN_FALSE;
12246  
12247      cased = 0;
12248      for (i = 0; i < length; i++) {
12249          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12250  
12251          if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
12252              Py_RETURN_FALSE;
12253          else if (!cased && Py_UNICODE_ISLOWER(ch))
12254              cased = 1;
12255      }
12256      return PyBool_FromLong(cased);
12257  }
12258  
12259  /*[clinic input]
12260  str.isupper as unicode_isupper
12261  
12262  Return True if the string is an uppercase string, False otherwise.
12263  
12264  A string is uppercase if all cased characters in the string are uppercase and
12265  there is at least one cased character in the string.
12266  [clinic start generated code]*/
12267  
12268  static PyObject *
unicode_isupper_impl(PyObject * self)12269  unicode_isupper_impl(PyObject *self)
12270  /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
12271  {
12272      Py_ssize_t i, length;
12273      int kind;
12274      const void *data;
12275      int cased;
12276  
12277      if (PyUnicode_READY(self) == -1)
12278          return NULL;
12279      length = PyUnicode_GET_LENGTH(self);
12280      kind = PyUnicode_KIND(self);
12281      data = PyUnicode_DATA(self);
12282  
12283      /* Shortcut for single character strings */
12284      if (length == 1)
12285          return PyBool_FromLong(
12286              Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12287  
12288      /* Special case for empty strings */
12289      if (length == 0)
12290          Py_RETURN_FALSE;
12291  
12292      cased = 0;
12293      for (i = 0; i < length; i++) {
12294          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12295  
12296          if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12297              Py_RETURN_FALSE;
12298          else if (!cased && Py_UNICODE_ISUPPER(ch))
12299              cased = 1;
12300      }
12301      return PyBool_FromLong(cased);
12302  }
12303  
12304  /*[clinic input]
12305  str.istitle as unicode_istitle
12306  
12307  Return True if the string is a title-cased string, False otherwise.
12308  
12309  In a title-cased string, upper- and title-case characters may only
12310  follow uncased characters and lowercase characters only cased ones.
12311  [clinic start generated code]*/
12312  
12313  static PyObject *
unicode_istitle_impl(PyObject * self)12314  unicode_istitle_impl(PyObject *self)
12315  /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12316  {
12317      Py_ssize_t i, length;
12318      int kind;
12319      const void *data;
12320      int cased, previous_is_cased;
12321  
12322      if (PyUnicode_READY(self) == -1)
12323          return NULL;
12324      length = PyUnicode_GET_LENGTH(self);
12325      kind = PyUnicode_KIND(self);
12326      data = PyUnicode_DATA(self);
12327  
12328      /* Shortcut for single character strings */
12329      if (length == 1) {
12330          Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12331          return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12332                                 (Py_UNICODE_ISUPPER(ch) != 0));
12333      }
12334  
12335      /* Special case for empty strings */
12336      if (length == 0)
12337          Py_RETURN_FALSE;
12338  
12339      cased = 0;
12340      previous_is_cased = 0;
12341      for (i = 0; i < length; i++) {
12342          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12343  
12344          if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12345              if (previous_is_cased)
12346                  Py_RETURN_FALSE;
12347              previous_is_cased = 1;
12348              cased = 1;
12349          }
12350          else if (Py_UNICODE_ISLOWER(ch)) {
12351              if (!previous_is_cased)
12352                  Py_RETURN_FALSE;
12353              previous_is_cased = 1;
12354              cased = 1;
12355          }
12356          else
12357              previous_is_cased = 0;
12358      }
12359      return PyBool_FromLong(cased);
12360  }
12361  
12362  /*[clinic input]
12363  str.isspace as unicode_isspace
12364  
12365  Return True if the string is a whitespace string, False otherwise.
12366  
12367  A string is whitespace if all characters in the string are whitespace and there
12368  is at least one character in the string.
12369  [clinic start generated code]*/
12370  
12371  static PyObject *
unicode_isspace_impl(PyObject * self)12372  unicode_isspace_impl(PyObject *self)
12373  /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12374  {
12375      Py_ssize_t i, length;
12376      int kind;
12377      const void *data;
12378  
12379      if (PyUnicode_READY(self) == -1)
12380          return NULL;
12381      length = PyUnicode_GET_LENGTH(self);
12382      kind = PyUnicode_KIND(self);
12383      data = PyUnicode_DATA(self);
12384  
12385      /* Shortcut for single character strings */
12386      if (length == 1)
12387          return PyBool_FromLong(
12388              Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12389  
12390      /* Special case for empty strings */
12391      if (length == 0)
12392          Py_RETURN_FALSE;
12393  
12394      for (i = 0; i < length; i++) {
12395          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12396          if (!Py_UNICODE_ISSPACE(ch))
12397              Py_RETURN_FALSE;
12398      }
12399      Py_RETURN_TRUE;
12400  }
12401  
12402  /*[clinic input]
12403  str.isalpha as unicode_isalpha
12404  
12405  Return True if the string is an alphabetic string, False otherwise.
12406  
12407  A string is alphabetic if all characters in the string are alphabetic and there
12408  is at least one character in the string.
12409  [clinic start generated code]*/
12410  
12411  static PyObject *
unicode_isalpha_impl(PyObject * self)12412  unicode_isalpha_impl(PyObject *self)
12413  /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12414  {
12415      Py_ssize_t i, length;
12416      int kind;
12417      const void *data;
12418  
12419      if (PyUnicode_READY(self) == -1)
12420          return NULL;
12421      length = PyUnicode_GET_LENGTH(self);
12422      kind = PyUnicode_KIND(self);
12423      data = PyUnicode_DATA(self);
12424  
12425      /* Shortcut for single character strings */
12426      if (length == 1)
12427          return PyBool_FromLong(
12428              Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12429  
12430      /* Special case for empty strings */
12431      if (length == 0)
12432          Py_RETURN_FALSE;
12433  
12434      for (i = 0; i < length; i++) {
12435          if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12436              Py_RETURN_FALSE;
12437      }
12438      Py_RETURN_TRUE;
12439  }
12440  
12441  /*[clinic input]
12442  str.isalnum as unicode_isalnum
12443  
12444  Return True if the string is an alpha-numeric string, False otherwise.
12445  
12446  A string is alpha-numeric if all characters in the string are alpha-numeric and
12447  there is at least one character in the string.
12448  [clinic start generated code]*/
12449  
12450  static PyObject *
unicode_isalnum_impl(PyObject * self)12451  unicode_isalnum_impl(PyObject *self)
12452  /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12453  {
12454      int kind;
12455      const void *data;
12456      Py_ssize_t len, i;
12457  
12458      if (PyUnicode_READY(self) == -1)
12459          return NULL;
12460  
12461      kind = PyUnicode_KIND(self);
12462      data = PyUnicode_DATA(self);
12463      len = PyUnicode_GET_LENGTH(self);
12464  
12465      /* Shortcut for single character strings */
12466      if (len == 1) {
12467          const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12468          return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12469      }
12470  
12471      /* Special case for empty strings */
12472      if (len == 0)
12473          Py_RETURN_FALSE;
12474  
12475      for (i = 0; i < len; i++) {
12476          const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12477          if (!Py_UNICODE_ISALNUM(ch))
12478              Py_RETURN_FALSE;
12479      }
12480      Py_RETURN_TRUE;
12481  }
12482  
12483  /*[clinic input]
12484  str.isdecimal as unicode_isdecimal
12485  
12486  Return True if the string is a decimal string, False otherwise.
12487  
12488  A string is a decimal string if all characters in the string are decimal and
12489  there is at least one character in the string.
12490  [clinic start generated code]*/
12491  
12492  static PyObject *
unicode_isdecimal_impl(PyObject * self)12493  unicode_isdecimal_impl(PyObject *self)
12494  /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12495  {
12496      Py_ssize_t i, length;
12497      int kind;
12498      const void *data;
12499  
12500      if (PyUnicode_READY(self) == -1)
12501          return NULL;
12502      length = PyUnicode_GET_LENGTH(self);
12503      kind = PyUnicode_KIND(self);
12504      data = PyUnicode_DATA(self);
12505  
12506      /* Shortcut for single character strings */
12507      if (length == 1)
12508          return PyBool_FromLong(
12509              Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12510  
12511      /* Special case for empty strings */
12512      if (length == 0)
12513          Py_RETURN_FALSE;
12514  
12515      for (i = 0; i < length; i++) {
12516          if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12517              Py_RETURN_FALSE;
12518      }
12519      Py_RETURN_TRUE;
12520  }
12521  
12522  /*[clinic input]
12523  str.isdigit as unicode_isdigit
12524  
12525  Return True if the string is a digit string, False otherwise.
12526  
12527  A string is a digit string if all characters in the string are digits and there
12528  is at least one character in the string.
12529  [clinic start generated code]*/
12530  
12531  static PyObject *
unicode_isdigit_impl(PyObject * self)12532  unicode_isdigit_impl(PyObject *self)
12533  /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12534  {
12535      Py_ssize_t i, length;
12536      int kind;
12537      const void *data;
12538  
12539      if (PyUnicode_READY(self) == -1)
12540          return NULL;
12541      length = PyUnicode_GET_LENGTH(self);
12542      kind = PyUnicode_KIND(self);
12543      data = PyUnicode_DATA(self);
12544  
12545      /* Shortcut for single character strings */
12546      if (length == 1) {
12547          const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12548          return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12549      }
12550  
12551      /* Special case for empty strings */
12552      if (length == 0)
12553          Py_RETURN_FALSE;
12554  
12555      for (i = 0; i < length; i++) {
12556          if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12557              Py_RETURN_FALSE;
12558      }
12559      Py_RETURN_TRUE;
12560  }
12561  
12562  /*[clinic input]
12563  str.isnumeric as unicode_isnumeric
12564  
12565  Return True if the string is a numeric string, False otherwise.
12566  
12567  A string is numeric if all characters in the string are numeric and there is at
12568  least one character in the string.
12569  [clinic start generated code]*/
12570  
12571  static PyObject *
unicode_isnumeric_impl(PyObject * self)12572  unicode_isnumeric_impl(PyObject *self)
12573  /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12574  {
12575      Py_ssize_t i, length;
12576      int kind;
12577      const void *data;
12578  
12579      if (PyUnicode_READY(self) == -1)
12580          return NULL;
12581      length = PyUnicode_GET_LENGTH(self);
12582      kind = PyUnicode_KIND(self);
12583      data = PyUnicode_DATA(self);
12584  
12585      /* Shortcut for single character strings */
12586      if (length == 1)
12587          return PyBool_FromLong(
12588              Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12589  
12590      /* Special case for empty strings */
12591      if (length == 0)
12592          Py_RETURN_FALSE;
12593  
12594      for (i = 0; i < length; i++) {
12595          if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12596              Py_RETURN_FALSE;
12597      }
12598      Py_RETURN_TRUE;
12599  }
12600  
12601  Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12602  _PyUnicode_ScanIdentifier(PyObject *self)
12603  {
12604      Py_ssize_t i;
12605      if (PyUnicode_READY(self) == -1)
12606          return -1;
12607  
12608      Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12609      if (len == 0) {
12610          /* an empty string is not a valid identifier */
12611          return 0;
12612      }
12613  
12614      int kind = PyUnicode_KIND(self);
12615      const void *data = PyUnicode_DATA(self);
12616      Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12617      /* PEP 3131 says that the first character must be in
12618         XID_Start and subsequent characters in XID_Continue,
12619         and for the ASCII range, the 2.x rules apply (i.e
12620         start with letters and underscore, continue with
12621         letters, digits, underscore). However, given the current
12622         definition of XID_Start and XID_Continue, it is sufficient
12623         to check just for these, except that _ must be allowed
12624         as starting an identifier.  */
12625      if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12626          return 0;
12627      }
12628  
12629      for (i = 1; i < len; i++) {
12630          ch = PyUnicode_READ(kind, data, i);
12631          if (!_PyUnicode_IsXidContinue(ch)) {
12632              return i;
12633          }
12634      }
12635      return i;
12636  }
12637  
12638  int
PyUnicode_IsIdentifier(PyObject * self)12639  PyUnicode_IsIdentifier(PyObject *self)
12640  {
12641      if (PyUnicode_IS_READY(self)) {
12642          Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12643          Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12644          /* an empty string is not a valid identifier */
12645          return len && i == len;
12646      }
12647      else {
12648  _Py_COMP_DIAG_PUSH
12649  _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12650          Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12651          if (len == 0) {
12652              /* an empty string is not a valid identifier */
12653              return 0;
12654          }
12655  
12656          const wchar_t *wstr = _PyUnicode_WSTR(self);
12657          Py_UCS4 ch = wstr[i++];
12658  #if SIZEOF_WCHAR_T == 2
12659          if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12660              && i < len
12661              && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12662          {
12663              ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12664              i++;
12665          }
12666  #endif
12667          if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12668              return 0;
12669          }
12670  
12671          while (i < len) {
12672              ch = wstr[i++];
12673  #if SIZEOF_WCHAR_T == 2
12674              if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12675                  && i < len
12676                  && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12677              {
12678                  ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12679                  i++;
12680              }
12681  #endif
12682              if (!_PyUnicode_IsXidContinue(ch)) {
12683                  return 0;
12684              }
12685          }
12686          return 1;
12687  _Py_COMP_DIAG_POP
12688      }
12689  }
12690  
12691  /*[clinic input]
12692  str.isidentifier as unicode_isidentifier
12693  
12694  Return True if the string is a valid Python identifier, False otherwise.
12695  
12696  Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12697  such as "def" or "class".
12698  [clinic start generated code]*/
12699  
12700  static PyObject *
unicode_isidentifier_impl(PyObject * self)12701  unicode_isidentifier_impl(PyObject *self)
12702  /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12703  {
12704      return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12705  }
12706  
12707  /*[clinic input]
12708  str.isprintable as unicode_isprintable
12709  
12710  Return True if the string is printable, False otherwise.
12711  
12712  A string is printable if all of its characters are considered printable in
12713  repr() or if it is empty.
12714  [clinic start generated code]*/
12715  
12716  static PyObject *
unicode_isprintable_impl(PyObject * self)12717  unicode_isprintable_impl(PyObject *self)
12718  /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12719  {
12720      Py_ssize_t i, length;
12721      int kind;
12722      const void *data;
12723  
12724      if (PyUnicode_READY(self) == -1)
12725          return NULL;
12726      length = PyUnicode_GET_LENGTH(self);
12727      kind = PyUnicode_KIND(self);
12728      data = PyUnicode_DATA(self);
12729  
12730      /* Shortcut for single character strings */
12731      if (length == 1)
12732          return PyBool_FromLong(
12733              Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12734  
12735      for (i = 0; i < length; i++) {
12736          if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12737              Py_RETURN_FALSE;
12738          }
12739      }
12740      Py_RETURN_TRUE;
12741  }
12742  
12743  /*[clinic input]
12744  str.join as unicode_join
12745  
12746      iterable: object
12747      /
12748  
12749  Concatenate any number of strings.
12750  
12751  The string whose method is called is inserted in between each given string.
12752  The result is returned as a new string.
12753  
12754  Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12755  [clinic start generated code]*/
12756  
12757  static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12758  unicode_join(PyObject *self, PyObject *iterable)
12759  /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12760  {
12761      return PyUnicode_Join(self, iterable);
12762  }
12763  
12764  static Py_ssize_t
unicode_length(PyObject * self)12765  unicode_length(PyObject *self)
12766  {
12767      if (PyUnicode_READY(self) == -1)
12768          return -1;
12769      return PyUnicode_GET_LENGTH(self);
12770  }
12771  
12772  /*[clinic input]
12773  str.ljust as unicode_ljust
12774  
12775      width: Py_ssize_t
12776      fillchar: Py_UCS4 = ' '
12777      /
12778  
12779  Return a left-justified string of length width.
12780  
12781  Padding is done using the specified fill character (default is a space).
12782  [clinic start generated code]*/
12783  
12784  static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12785  unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12786  /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12787  {
12788      if (PyUnicode_READY(self) == -1)
12789          return NULL;
12790  
12791      if (PyUnicode_GET_LENGTH(self) >= width)
12792          return unicode_result_unchanged(self);
12793  
12794      return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12795  }
12796  
12797  /*[clinic input]
12798  str.lower as unicode_lower
12799  
12800  Return a copy of the string converted to lowercase.
12801  [clinic start generated code]*/
12802  
12803  static PyObject *
unicode_lower_impl(PyObject * self)12804  unicode_lower_impl(PyObject *self)
12805  /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12806  {
12807      if (PyUnicode_READY(self) == -1)
12808          return NULL;
12809      if (PyUnicode_IS_ASCII(self))
12810          return ascii_upper_or_lower(self, 1);
12811      return case_operation(self, do_lower);
12812  }
12813  
/* Strip modes.  The values double as indices into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the characters and
   the pointers are const, so the table itself cannot be modified. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
12822  
12823  /* externally visible for str.strip(unicode) */
12824  PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12825  _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12826  {
12827      const void *data;
12828      int kind;
12829      Py_ssize_t i, j, len;
12830      BLOOM_MASK sepmask;
12831      Py_ssize_t seplen;
12832  
12833      if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12834          return NULL;
12835  
12836      kind = PyUnicode_KIND(self);
12837      data = PyUnicode_DATA(self);
12838      len = PyUnicode_GET_LENGTH(self);
12839      seplen = PyUnicode_GET_LENGTH(sepobj);
12840      sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12841                                PyUnicode_DATA(sepobj),
12842                                seplen);
12843  
12844      i = 0;
12845      if (striptype != RIGHTSTRIP) {
12846          while (i < len) {
12847              Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12848              if (!BLOOM(sepmask, ch))
12849                  break;
12850              if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12851                  break;
12852              i++;
12853          }
12854      }
12855  
12856      j = len;
12857      if (striptype != LEFTSTRIP) {
12858          j--;
12859          while (j >= i) {
12860              Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12861              if (!BLOOM(sepmask, ch))
12862                  break;
12863              if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12864                  break;
12865              j--;
12866          }
12867  
12868          j++;
12869      }
12870  
12871      return PyUnicode_Substring(self, i, j);
12872  }
12873  
12874  PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12875  PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12876  {
12877      const unsigned char *data;
12878      int kind;
12879      Py_ssize_t length;
12880  
12881      if (PyUnicode_READY(self) == -1)
12882          return NULL;
12883  
12884      length = PyUnicode_GET_LENGTH(self);
12885      end = Py_MIN(end, length);
12886  
12887      if (start == 0 && end == length)
12888          return unicode_result_unchanged(self);
12889  
12890      if (start < 0 || end < 0) {
12891          PyErr_SetString(PyExc_IndexError, "string index out of range");
12892          return NULL;
12893      }
12894      if (start >= length || end < start)
12895          _Py_RETURN_UNICODE_EMPTY();
12896  
12897      length = end - start;
12898      if (PyUnicode_IS_ASCII(self)) {
12899          data = PyUnicode_1BYTE_DATA(self);
12900          return _PyUnicode_FromASCII((const char*)(data + start), length);
12901      }
12902      else {
12903          kind = PyUnicode_KIND(self);
12904          data = PyUnicode_1BYTE_DATA(self);
12905          return PyUnicode_FromKindAndData(kind,
12906                                           data + kind * start,
12907                                           length);
12908      }
12909  }
12910  
12911  static PyObject *
do_strip(PyObject * self,int striptype)12912  do_strip(PyObject *self, int striptype)
12913  {
12914      Py_ssize_t len, i, j;
12915  
12916      if (PyUnicode_READY(self) == -1)
12917          return NULL;
12918  
12919      len = PyUnicode_GET_LENGTH(self);
12920  
12921      if (PyUnicode_IS_ASCII(self)) {
12922          const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12923  
12924          i = 0;
12925          if (striptype != RIGHTSTRIP) {
12926              while (i < len) {
12927                  Py_UCS1 ch = data[i];
12928                  if (!_Py_ascii_whitespace[ch])
12929                      break;
12930                  i++;
12931              }
12932          }
12933  
12934          j = len;
12935          if (striptype != LEFTSTRIP) {
12936              j--;
12937              while (j >= i) {
12938                  Py_UCS1 ch = data[j];
12939                  if (!_Py_ascii_whitespace[ch])
12940                      break;
12941                  j--;
12942              }
12943              j++;
12944          }
12945      }
12946      else {
12947          int kind = PyUnicode_KIND(self);
12948          const void *data = PyUnicode_DATA(self);
12949  
12950          i = 0;
12951          if (striptype != RIGHTSTRIP) {
12952              while (i < len) {
12953                  Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12954                  if (!Py_UNICODE_ISSPACE(ch))
12955                      break;
12956                  i++;
12957              }
12958          }
12959  
12960          j = len;
12961          if (striptype != LEFTSTRIP) {
12962              j--;
12963              while (j >= i) {
12964                  Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12965                  if (!Py_UNICODE_ISSPACE(ch))
12966                      break;
12967                  j--;
12968              }
12969              j++;
12970          }
12971      }
12972  
12973      return PyUnicode_Substring(self, i, j);
12974  }
12975  
12976  
12977  static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12978  do_argstrip(PyObject *self, int striptype, PyObject *sep)
12979  {
12980      if (sep != Py_None) {
12981          if (PyUnicode_Check(sep))
12982              return _PyUnicode_XStrip(self, striptype, sep);
12983          else {
12984              PyErr_Format(PyExc_TypeError,
12985                           "%s arg must be None or str",
12986                           STRIPNAME(striptype));
12987              return NULL;
12988          }
12989      }
12990  
12991      return do_strip(self, striptype);
12992  }
12993  
12994  
12995  /*[clinic input]
12996  str.strip as unicode_strip
12997  
12998      chars: object = None
12999      /
13000  
13001  Return a copy of the string with leading and trailing whitespace removed.
13002  
13003  If chars is given and not None, remove characters in chars instead.
13004  [clinic start generated code]*/
13005  
13006  static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)13007  unicode_strip_impl(PyObject *self, PyObject *chars)
13008  /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
13009  {
13010      return do_argstrip(self, BOTHSTRIP, chars);
13011  }
13012  
13013  
13014  /*[clinic input]
13015  str.lstrip as unicode_lstrip
13016  
13017      chars: object = None
13018      /
13019  
13020  Return a copy of the string with leading whitespace removed.
13021  
13022  If chars is given and not None, remove characters in chars instead.
13023  [clinic start generated code]*/
13024  
13025  static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)13026  unicode_lstrip_impl(PyObject *self, PyObject *chars)
13027  /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
13028  {
13029      return do_argstrip(self, LEFTSTRIP, chars);
13030  }
13031  
13032  
13033  /*[clinic input]
13034  str.rstrip as unicode_rstrip
13035  
13036      chars: object = None
13037      /
13038  
13039  Return a copy of the string with trailing whitespace removed.
13040  
13041  If chars is given and not None, remove characters in chars instead.
13042  [clinic start generated code]*/
13043  
13044  static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)13045  unicode_rstrip_impl(PyObject *self, PyObject *chars)
13046  /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
13047  {
13048      return do_argstrip(self, RIGHTSTRIP, chars);
13049  }
13050  
13051  
13052  static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)13053  unicode_repeat(PyObject *str, Py_ssize_t len)
13054  {
13055      PyObject *u;
13056      Py_ssize_t nchars, n;
13057  
13058      if (len < 1)
13059          _Py_RETURN_UNICODE_EMPTY();
13060  
13061      /* no repeat, return original string */
13062      if (len == 1)
13063          return unicode_result_unchanged(str);
13064  
13065      if (PyUnicode_READY(str) == -1)
13066          return NULL;
13067  
13068      if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
13069          PyErr_SetString(PyExc_OverflowError,
13070                          "repeated string is too long");
13071          return NULL;
13072      }
13073      nchars = len * PyUnicode_GET_LENGTH(str);
13074  
13075      u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
13076      if (!u)
13077          return NULL;
13078      assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
13079  
13080      if (PyUnicode_GET_LENGTH(str) == 1) {
13081          int kind = PyUnicode_KIND(str);
13082          Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
13083          if (kind == PyUnicode_1BYTE_KIND) {
13084              void *to = PyUnicode_DATA(u);
13085              memset(to, (unsigned char)fill_char, len);
13086          }
13087          else if (kind == PyUnicode_2BYTE_KIND) {
13088              Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
13089              for (n = 0; n < len; ++n)
13090                  ucs2[n] = fill_char;
13091          } else {
13092              Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13093              assert(kind == PyUnicode_4BYTE_KIND);
13094              for (n = 0; n < len; ++n)
13095                  ucs4[n] = fill_char;
13096          }
13097      }
13098      else {
13099          /* number of characters copied this far */
13100          Py_ssize_t done = PyUnicode_GET_LENGTH(str);
13101          Py_ssize_t char_size = PyUnicode_KIND(str);
13102          char *to = (char *) PyUnicode_DATA(u);
13103          memcpy(to, PyUnicode_DATA(str),
13104                    PyUnicode_GET_LENGTH(str) * char_size);
13105          while (done < nchars) {
13106              n = (done <= nchars-done) ? done : nchars-done;
13107              memcpy(to + (done * char_size), to, n * char_size);
13108              done += n;
13109          }
13110      }
13111  
13112      assert(_PyUnicode_CheckConsistency(u, 1));
13113      return u;
13114  }
13115  
13116  PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)13117  PyUnicode_Replace(PyObject *str,
13118                    PyObject *substr,
13119                    PyObject *replstr,
13120                    Py_ssize_t maxcount)
13121  {
13122      if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13123              ensure_unicode(replstr) < 0)
13124          return NULL;
13125      return replace(str, substr, replstr, maxcount);
13126  }
13127  
13128  /*[clinic input]
13129  str.replace as unicode_replace
13130  
13131      old: unicode
13132      new: unicode
13133      count: Py_ssize_t = -1
13134          Maximum number of occurrences to replace.
13135          -1 (the default value) means replace all occurrences.
13136      /
13137  
13138  Return a copy with all occurrences of substring old replaced by new.
13139  
13140  If the optional argument count is given, only the first count occurrences are
13141  replaced.
13142  [clinic start generated code]*/
13143  
13144  static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)13145  unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13146                       Py_ssize_t count)
13147  /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
13148  {
13149      if (PyUnicode_READY(self) == -1)
13150          return NULL;
13151      return replace(self, old, new, count);
13152  }
13153  
13154  /*[clinic input]
13155  str.removeprefix as unicode_removeprefix
13156  
13157      prefix: unicode
13158      /
13159  
13160  Return a str with the given prefix string removed if present.
13161  
13162  If the string starts with the prefix string, return string[len(prefix):].
13163  Otherwise, return a copy of the original string.
13164  [clinic start generated code]*/
13165  
13166  static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)13167  unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13168  /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13169  {
13170      int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13171      if (match == -1) {
13172          return NULL;
13173      }
13174      if (match) {
13175          return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13176                                     PyUnicode_GET_LENGTH(self));
13177      }
13178      return unicode_result_unchanged(self);
13179  }
13180  
13181  /*[clinic input]
13182  str.removesuffix as unicode_removesuffix
13183  
13184      suffix: unicode
13185      /
13186  
13187  Return a str with the given suffix string removed if present.
13188  
13189  If the string ends with the suffix string and that suffix is not empty,
13190  return string[:-len(suffix)]. Otherwise, return a copy of the original
13191  string.
13192  [clinic start generated code]*/
13193  
13194  static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)13195  unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13196  /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13197  {
13198      int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13199      if (match == -1) {
13200          return NULL;
13201      }
13202      if (match) {
13203          return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13204                                              - PyUnicode_GET_LENGTH(suffix));
13205      }
13206      return unicode_result_unchanged(self);
13207  }
13208  
13209  static PyObject *
unicode_repr(PyObject * unicode)13210  unicode_repr(PyObject *unicode)
13211  {
13212      PyObject *repr;
13213      Py_ssize_t isize;
13214      Py_ssize_t osize, squote, dquote, i, o;
13215      Py_UCS4 max, quote;
13216      int ikind, okind, unchanged;
13217      const void *idata;
13218      void *odata;
13219  
13220      if (PyUnicode_READY(unicode) == -1)
13221          return NULL;
13222  
13223      isize = PyUnicode_GET_LENGTH(unicode);
13224      idata = PyUnicode_DATA(unicode);
13225  
13226      /* Compute length of output, quote characters, and
13227         maximum character */
13228      osize = 0;
13229      max = 127;
13230      squote = dquote = 0;
13231      ikind = PyUnicode_KIND(unicode);
13232      for (i = 0; i < isize; i++) {
13233          Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13234          Py_ssize_t incr = 1;
13235          switch (ch) {
13236          case '\'': squote++; break;
13237          case '"':  dquote++; break;
13238          case '\\': case '\t': case '\r': case '\n':
13239              incr = 2;
13240              break;
13241          default:
13242              /* Fast-path ASCII */
13243              if (ch < ' ' || ch == 0x7f)
13244                  incr = 4; /* \xHH */
13245              else if (ch < 0x7f)
13246                  ;
13247              else if (Py_UNICODE_ISPRINTABLE(ch))
13248                  max = ch > max ? ch : max;
13249              else if (ch < 0x100)
13250                  incr = 4; /* \xHH */
13251              else if (ch < 0x10000)
13252                  incr = 6; /* \uHHHH */
13253              else
13254                  incr = 10; /* \uHHHHHHHH */
13255          }
13256          if (osize > PY_SSIZE_T_MAX - incr) {
13257              PyErr_SetString(PyExc_OverflowError,
13258                              "string is too long to generate repr");
13259              return NULL;
13260          }
13261          osize += incr;
13262      }
13263  
13264      quote = '\'';
13265      unchanged = (osize == isize);
13266      if (squote) {
13267          unchanged = 0;
13268          if (dquote)
13269              /* Both squote and dquote present. Use squote,
13270                 and escape them */
13271              osize += squote;
13272          else
13273              quote = '"';
13274      }
13275      osize += 2;   /* quotes */
13276  
13277      repr = PyUnicode_New(osize, max);
13278      if (repr == NULL)
13279          return NULL;
13280      okind = PyUnicode_KIND(repr);
13281      odata = PyUnicode_DATA(repr);
13282  
13283      PyUnicode_WRITE(okind, odata, 0, quote);
13284      PyUnicode_WRITE(okind, odata, osize-1, quote);
13285      if (unchanged) {
13286          _PyUnicode_FastCopyCharacters(repr, 1,
13287                                        unicode, 0,
13288                                        isize);
13289      }
13290      else {
13291          for (i = 0, o = 1; i < isize; i++) {
13292              Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13293  
13294              /* Escape quotes and backslashes */
13295              if ((ch == quote) || (ch == '\\')) {
13296                  PyUnicode_WRITE(okind, odata, o++, '\\');
13297                  PyUnicode_WRITE(okind, odata, o++, ch);
13298                  continue;
13299              }
13300  
13301              /* Map special whitespace to '\t', \n', '\r' */
13302              if (ch == '\t') {
13303                  PyUnicode_WRITE(okind, odata, o++, '\\');
13304                  PyUnicode_WRITE(okind, odata, o++, 't');
13305              }
13306              else if (ch == '\n') {
13307                  PyUnicode_WRITE(okind, odata, o++, '\\');
13308                  PyUnicode_WRITE(okind, odata, o++, 'n');
13309              }
13310              else if (ch == '\r') {
13311                  PyUnicode_WRITE(okind, odata, o++, '\\');
13312                  PyUnicode_WRITE(okind, odata, o++, 'r');
13313              }
13314  
13315              /* Map non-printable US ASCII to '\xhh' */
13316              else if (ch < ' ' || ch == 0x7F) {
13317                  PyUnicode_WRITE(okind, odata, o++, '\\');
13318                  PyUnicode_WRITE(okind, odata, o++, 'x');
13319                  PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13320                  PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13321              }
13322  
13323              /* Copy ASCII characters as-is */
13324              else if (ch < 0x7F) {
13325                  PyUnicode_WRITE(okind, odata, o++, ch);
13326              }
13327  
13328              /* Non-ASCII characters */
13329              else {
13330                  /* Map Unicode whitespace and control characters
13331                     (categories Z* and C* except ASCII space)
13332                  */
13333                  if (!Py_UNICODE_ISPRINTABLE(ch)) {
13334                      PyUnicode_WRITE(okind, odata, o++, '\\');
13335                      /* Map 8-bit characters to '\xhh' */
13336                      if (ch <= 0xff) {
13337                          PyUnicode_WRITE(okind, odata, o++, 'x');
13338                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13339                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13340                      }
13341                      /* Map 16-bit characters to '\uxxxx' */
13342                      else if (ch <= 0xffff) {
13343                          PyUnicode_WRITE(okind, odata, o++, 'u');
13344                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13345                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13346                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13347                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13348                      }
13349                      /* Map 21-bit characters to '\U00xxxxxx' */
13350                      else {
13351                          PyUnicode_WRITE(okind, odata, o++, 'U');
13352                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13353                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13354                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13355                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13356                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13357                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13358                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13359                          PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13360                      }
13361                  }
13362                  /* Copy characters as-is */
13363                  else {
13364                      PyUnicode_WRITE(okind, odata, o++, ch);
13365                  }
13366              }
13367          }
13368      }
13369      /* Closing quote already added at the beginning */
13370      assert(_PyUnicode_CheckConsistency(repr, 1));
13371      return repr;
13372  }
13373  
13374  PyDoc_STRVAR(rfind__doc__,
13375               "S.rfind(sub[, start[, end]]) -> int\n\
13376  \n\
13377  Return the highest index in S where substring sub is found,\n\
13378  such that sub is contained within S[start:end].  Optional\n\
13379  arguments start and end are interpreted as in slice notation.\n\
13380  \n\
13381  Return -1 on failure.");
13382  
13383  static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13384  unicode_rfind(PyObject *self, PyObject *args)
13385  {
13386      /* initialize variables to prevent gcc warning */
13387      PyObject *substring = NULL;
13388      Py_ssize_t start = 0;
13389      Py_ssize_t end = 0;
13390      Py_ssize_t result;
13391  
13392      if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13393          return NULL;
13394  
13395      if (PyUnicode_READY(self) == -1)
13396          return NULL;
13397  
13398      result = any_find_slice(self, substring, start, end, -1);
13399  
13400      if (result == -2)
13401          return NULL;
13402  
13403      return PyLong_FromSsize_t(result);
13404  }
13405  
13406  PyDoc_STRVAR(rindex__doc__,
13407               "S.rindex(sub[, start[, end]]) -> int\n\
13408  \n\
13409  Return the highest index in S where substring sub is found,\n\
13410  such that sub is contained within S[start:end].  Optional\n\
13411  arguments start and end are interpreted as in slice notation.\n\
13412  \n\
13413  Raises ValueError when the substring is not found.");
13414  
13415  static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13416  unicode_rindex(PyObject *self, PyObject *args)
13417  {
13418      /* initialize variables to prevent gcc warning */
13419      PyObject *substring = NULL;
13420      Py_ssize_t start = 0;
13421      Py_ssize_t end = 0;
13422      Py_ssize_t result;
13423  
13424      if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13425          return NULL;
13426  
13427      if (PyUnicode_READY(self) == -1)
13428          return NULL;
13429  
13430      result = any_find_slice(self, substring, start, end, -1);
13431  
13432      if (result == -2)
13433          return NULL;
13434  
13435      if (result < 0) {
13436          PyErr_SetString(PyExc_ValueError, "substring not found");
13437          return NULL;
13438      }
13439  
13440      return PyLong_FromSsize_t(result);
13441  }
13442  
13443  /*[clinic input]
13444  str.rjust as unicode_rjust
13445  
13446      width: Py_ssize_t
13447      fillchar: Py_UCS4 = ' '
13448      /
13449  
13450  Return a right-justified string of length width.
13451  
13452  Padding is done using the specified fill character (default is a space).
13453  [clinic start generated code]*/
13454  
13455  static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13456  unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13457  /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13458  {
13459      if (PyUnicode_READY(self) == -1)
13460          return NULL;
13461  
13462      if (PyUnicode_GET_LENGTH(self) >= width)
13463          return unicode_result_unchanged(self);
13464  
13465      return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13466  }
13467  
13468  PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13469  PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13470  {
13471      if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13472          return NULL;
13473  
13474      return split(s, sep, maxsplit);
13475  }
13476  
13477  /*[clinic input]
13478  str.split as unicode_split
13479  
13480      sep: object = None
13481          The delimiter according which to split the string.
13482          None (the default value) means split according to any whitespace,
13483          and discard empty strings from the result.
13484      maxsplit: Py_ssize_t = -1
13485          Maximum number of splits to do.
13486          -1 (the default value) means no limit.
13487  
13488  Return a list of the words in the string, using sep as the delimiter string.
13489  [clinic start generated code]*/
13490  
13491  static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13492  unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13493  /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
13494  {
13495      if (sep == Py_None)
13496          return split(self, NULL, maxsplit);
13497      if (PyUnicode_Check(sep))
13498          return split(self, sep, maxsplit);
13499  
13500      PyErr_Format(PyExc_TypeError,
13501                   "must be str or None, not %.100s",
13502                   Py_TYPE(sep)->tp_name);
13503      return NULL;
13504  }
13505  
13506  PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13507  PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13508  {
13509      PyObject* out;
13510      int kind1, kind2;
13511      const void *buf1, *buf2;
13512      Py_ssize_t len1, len2;
13513  
13514      if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13515          return NULL;
13516  
13517      kind1 = PyUnicode_KIND(str_obj);
13518      kind2 = PyUnicode_KIND(sep_obj);
13519      len1 = PyUnicode_GET_LENGTH(str_obj);
13520      len2 = PyUnicode_GET_LENGTH(sep_obj);
13521      if (kind1 < kind2 || len1 < len2) {
13522          PyObject *empty = unicode_get_empty();  // Borrowed reference
13523          return PyTuple_Pack(3, str_obj, empty, empty);
13524      }
13525      buf1 = PyUnicode_DATA(str_obj);
13526      buf2 = PyUnicode_DATA(sep_obj);
13527      if (kind2 != kind1) {
13528          buf2 = unicode_askind(kind2, buf2, len2, kind1);
13529          if (!buf2)
13530              return NULL;
13531      }
13532  
13533      switch (kind1) {
13534      case PyUnicode_1BYTE_KIND:
13535          if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13536              out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13537          else
13538              out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13539          break;
13540      case PyUnicode_2BYTE_KIND:
13541          out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13542          break;
13543      case PyUnicode_4BYTE_KIND:
13544          out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13545          break;
13546      default:
13547          Py_UNREACHABLE();
13548      }
13549  
13550      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13551      if (kind2 != kind1)
13552          PyMem_Free((void *)buf2);
13553  
13554      return out;
13555  }
13556  
13557  
13558  PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13559  PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13560  {
13561      PyObject* out;
13562      int kind1, kind2;
13563      const void *buf1, *buf2;
13564      Py_ssize_t len1, len2;
13565  
13566      if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13567          return NULL;
13568  
13569      kind1 = PyUnicode_KIND(str_obj);
13570      kind2 = PyUnicode_KIND(sep_obj);
13571      len1 = PyUnicode_GET_LENGTH(str_obj);
13572      len2 = PyUnicode_GET_LENGTH(sep_obj);
13573      if (kind1 < kind2 || len1 < len2) {
13574          PyObject *empty = unicode_get_empty();  // Borrowed reference
13575          return PyTuple_Pack(3, empty, empty, str_obj);
13576      }
13577      buf1 = PyUnicode_DATA(str_obj);
13578      buf2 = PyUnicode_DATA(sep_obj);
13579      if (kind2 != kind1) {
13580          buf2 = unicode_askind(kind2, buf2, len2, kind1);
13581          if (!buf2)
13582              return NULL;
13583      }
13584  
13585      switch (kind1) {
13586      case PyUnicode_1BYTE_KIND:
13587          if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13588              out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13589          else
13590              out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13591          break;
13592      case PyUnicode_2BYTE_KIND:
13593          out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13594          break;
13595      case PyUnicode_4BYTE_KIND:
13596          out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13597          break;
13598      default:
13599          Py_UNREACHABLE();
13600      }
13601  
13602      assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13603      if (kind2 != kind1)
13604          PyMem_Free((void *)buf2);
13605  
13606      return out;
13607  }
13608  
13609  /*[clinic input]
13610  str.partition as unicode_partition
13611  
13612      sep: object
13613      /
13614  
13615  Partition the string into three parts using the given separator.
13616  
13617  This will search for the separator in the string.  If the separator is found,
13618  returns a 3-tuple containing the part before the separator, the separator
13619  itself, and the part after it.
13620  
13621  If the separator is not found, returns a 3-tuple containing the original string
13622  and two empty strings.
13623  [clinic start generated code]*/
13624  
13625  static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13626  unicode_partition(PyObject *self, PyObject *sep)
13627  /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13628  {
13629      return PyUnicode_Partition(self, sep);
13630  }
13631  
13632  /*[clinic input]
13633  str.rpartition as unicode_rpartition = str.partition
13634  
13635  Partition the string into three parts using the given separator.
13636  
13637  This will search for the separator in the string, starting at the end. If
13638  the separator is found, returns a 3-tuple containing the part before the
13639  separator, the separator itself, and the part after it.
13640  
13641  If the separator is not found, returns a 3-tuple containing two empty strings
13642  and the original string.
13643  [clinic start generated code]*/
13644  
13645  static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13646  unicode_rpartition(PyObject *self, PyObject *sep)
13647  /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13648  {
13649      return PyUnicode_RPartition(self, sep);
13650  }
13651  
13652  PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13653  PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13654  {
13655      if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13656          return NULL;
13657  
13658      return rsplit(s, sep, maxsplit);
13659  }
13660  
13661  /*[clinic input]
13662  str.rsplit as unicode_rsplit = str.split
13663  
13664  Return a list of the words in the string, using sep as the delimiter string.
13665  
13666  Splits are done starting at the end of the string and working to the front.
13667  [clinic start generated code]*/
13668  
13669  static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13670  unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13671  /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13672  {
13673      if (sep == Py_None)
13674          return rsplit(self, NULL, maxsplit);
13675      if (PyUnicode_Check(sep))
13676          return rsplit(self, sep, maxsplit);
13677  
13678      PyErr_Format(PyExc_TypeError,
13679                   "must be str or None, not %.100s",
13680                   Py_TYPE(sep)->tp_name);
13681      return NULL;
13682  }
13683  
13684  /*[clinic input]
13685  str.splitlines as unicode_splitlines
13686  
13687      keepends: bool(accept={int}) = False
13688  
13689  Return a list of the lines in the string, breaking at line boundaries.
13690  
13691  Line breaks are not included in the resulting list unless keepends is given and
13692  true.
13693  [clinic start generated code]*/
13694  
13695  static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13696  unicode_splitlines_impl(PyObject *self, int keepends)
13697  /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13698  {
13699      return PyUnicode_Splitlines(self, keepends);
13700  }
13701  
13702  static
unicode_str(PyObject * self)13703  PyObject *unicode_str(PyObject *self)
13704  {
13705      return unicode_result_unchanged(self);
13706  }
13707  
13708  /*[clinic input]
13709  str.swapcase as unicode_swapcase
13710  
13711  Convert uppercase characters to lowercase and lowercase characters to uppercase.
13712  [clinic start generated code]*/
13713  
13714  static PyObject *
unicode_swapcase_impl(PyObject * self)13715  unicode_swapcase_impl(PyObject *self)
13716  /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13717  {
13718      if (PyUnicode_READY(self) == -1)
13719          return NULL;
13720      return case_operation(self, do_swapcase);
13721  }
13722  
13723  /*[clinic input]
13724  
13725  @staticmethod
13726  str.maketrans as unicode_maketrans
13727  
13728    x: object
13729  
13730    y: unicode=NULL
13731  
13732    z: unicode=NULL
13733  
13734    /
13735  
13736  Return a translation table usable for str.translate().
13737  
13738  If there is only one argument, it must be a dictionary mapping Unicode
13739  ordinals (integers) or characters to Unicode ordinals, strings or None.
13740  Character keys will be then converted to ordinals.
13741  If there are two arguments, they must be strings of equal length, and
13742  in the resulting dictionary, each character in x will be mapped to the
13743  character at the same position in y. If there is a third argument, it
13744  must be a string, whose characters will be mapped to None in the result.
13745  [clinic start generated code]*/
13746  
13747  static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13748  unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13749  /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13750  {
13751      PyObject *new = NULL, *key, *value;
13752      Py_ssize_t i = 0;
13753      int res;
13754  
13755      new = PyDict_New();
13756      if (!new)
13757          return NULL;
13758      if (y != NULL) {
13759          int x_kind, y_kind, z_kind;
13760          const void *x_data, *y_data, *z_data;
13761  
13762          /* x must be a string too, of equal length */
13763          if (!PyUnicode_Check(x)) {
13764              PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13765                              "be a string if there is a second argument");
13766              goto err;
13767          }
13768          if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13769              PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13770                              "arguments must have equal length");
13771              goto err;
13772          }
13773          /* create entries for translating chars in x to those in y */
13774          x_kind = PyUnicode_KIND(x);
13775          y_kind = PyUnicode_KIND(y);
13776          x_data = PyUnicode_DATA(x);
13777          y_data = PyUnicode_DATA(y);
13778          for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13779              key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13780              if (!key)
13781                  goto err;
13782              value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13783              if (!value) {
13784                  Py_DECREF(key);
13785                  goto err;
13786              }
13787              res = PyDict_SetItem(new, key, value);
13788              Py_DECREF(key);
13789              Py_DECREF(value);
13790              if (res < 0)
13791                  goto err;
13792          }
13793          /* create entries for deleting chars in z */
13794          if (z != NULL) {
13795              z_kind = PyUnicode_KIND(z);
13796              z_data = PyUnicode_DATA(z);
13797              for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13798                  key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13799                  if (!key)
13800                      goto err;
13801                  res = PyDict_SetItem(new, key, Py_None);
13802                  Py_DECREF(key);
13803                  if (res < 0)
13804                      goto err;
13805              }
13806          }
13807      } else {
13808          int kind;
13809          const void *data;
13810  
13811          /* x must be a dict */
13812          if (!PyDict_CheckExact(x)) {
13813              PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13814                              "to maketrans it must be a dict");
13815              goto err;
13816          }
13817          /* copy entries into the new dict, converting string keys to int keys */
13818          while (PyDict_Next(x, &i, &key, &value)) {
13819              if (PyUnicode_Check(key)) {
13820                  /* convert string keys to integer keys */
13821                  PyObject *newkey;
13822                  if (PyUnicode_GET_LENGTH(key) != 1) {
13823                      PyErr_SetString(PyExc_ValueError, "string keys in translate "
13824                                      "table must be of length 1");
13825                      goto err;
13826                  }
13827                  kind = PyUnicode_KIND(key);
13828                  data = PyUnicode_DATA(key);
13829                  newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13830                  if (!newkey)
13831                      goto err;
13832                  res = PyDict_SetItem(new, newkey, value);
13833                  Py_DECREF(newkey);
13834                  if (res < 0)
13835                      goto err;
13836              } else if (PyLong_Check(key)) {
13837                  /* just keep integer keys */
13838                  if (PyDict_SetItem(new, key, value) < 0)
13839                      goto err;
13840              } else {
13841                  PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13842                                  "be strings or integers");
13843                  goto err;
13844              }
13845          }
13846      }
13847      return new;
13848    err:
13849      Py_DECREF(new);
13850      return NULL;
13851  }
13852  
13853  /*[clinic input]
13854  str.translate as unicode_translate
13855  
13856      table: object
13857          Translation table, which must be a mapping of Unicode ordinals to
13858          Unicode ordinals, strings, or None.
13859      /
13860  
13861  Replace each character in the string using the given translation table.
13862  
13863  The table must implement lookup/indexing via __getitem__, for instance a
13864  dictionary or list.  If this operation raises LookupError, the character is
13865  left untouched.  Characters mapped to None are deleted.
13866  [clinic start generated code]*/
13867  
13868  static PyObject *
unicode_translate(PyObject * self,PyObject * table)13869  unicode_translate(PyObject *self, PyObject *table)
13870  /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13871  {
13872      return _PyUnicode_TranslateCharmap(self, table, "ignore");
13873  }
13874  
13875  /*[clinic input]
13876  str.upper as unicode_upper
13877  
13878  Return a copy of the string converted to uppercase.
13879  [clinic start generated code]*/
13880  
13881  static PyObject *
unicode_upper_impl(PyObject * self)13882  unicode_upper_impl(PyObject *self)
13883  /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13884  {
13885      if (PyUnicode_READY(self) == -1)
13886          return NULL;
13887      if (PyUnicode_IS_ASCII(self))
13888          return ascii_upper_or_lower(self, 0);
13889      return case_operation(self, do_upper);
13890  }
13891  
13892  /*[clinic input]
13893  str.zfill as unicode_zfill
13894  
13895      width: Py_ssize_t
13896      /
13897  
13898  Pad a numeric string with zeros on the left, to fill a field of the given width.
13899  
13900  The string is never truncated.
13901  [clinic start generated code]*/
13902  
13903  static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13904  unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13905  /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13906  {
13907      Py_ssize_t fill;
13908      PyObject *u;
13909      int kind;
13910      const void *data;
13911      Py_UCS4 chr;
13912  
13913      if (PyUnicode_READY(self) == -1)
13914          return NULL;
13915  
13916      if (PyUnicode_GET_LENGTH(self) >= width)
13917          return unicode_result_unchanged(self);
13918  
13919      fill = width - PyUnicode_GET_LENGTH(self);
13920  
13921      u = pad(self, fill, 0, '0');
13922  
13923      if (u == NULL)
13924          return NULL;
13925  
13926      kind = PyUnicode_KIND(u);
13927      data = PyUnicode_DATA(u);
13928      chr = PyUnicode_READ(kind, data, fill);
13929  
13930      if (chr == '+' || chr == '-') {
13931          /* move sign to beginning of string */
13932          PyUnicode_WRITE(kind, data, 0, chr);
13933          PyUnicode_WRITE(kind, data, fill, '0');
13934      }
13935  
13936      assert(_PyUnicode_CheckConsistency(u, 1));
13937      return u;
13938  }
13939  
#if 0
/* Compiled out.  When enabled, forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13947  
13948  PyDoc_STRVAR(startswith__doc__,
13949               "S.startswith(prefix[, start[, end]]) -> bool\n\
13950  \n\
13951  Return True if S starts with the specified prefix, False otherwise.\n\
13952  With optional start, test S beginning at that position.\n\
13953  With optional end, stop comparing S at that position.\n\
13954  prefix can also be a tuple of strings to try.");
13955  
13956  static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13957  unicode_startswith(PyObject *self,
13958                     PyObject *args)
13959  {
13960      PyObject *subobj;
13961      PyObject *substring;
13962      Py_ssize_t start = 0;
13963      Py_ssize_t end = PY_SSIZE_T_MAX;
13964      int result;
13965  
13966      if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13967          return NULL;
13968      if (PyTuple_Check(subobj)) {
13969          Py_ssize_t i;
13970          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13971              substring = PyTuple_GET_ITEM(subobj, i);
13972              if (!PyUnicode_Check(substring)) {
13973                  PyErr_Format(PyExc_TypeError,
13974                               "tuple for startswith must only contain str, "
13975                               "not %.100s",
13976                               Py_TYPE(substring)->tp_name);
13977                  return NULL;
13978              }
13979              result = tailmatch(self, substring, start, end, -1);
13980              if (result == -1)
13981                  return NULL;
13982              if (result) {
13983                  Py_RETURN_TRUE;
13984              }
13985          }
13986          /* nothing matched */
13987          Py_RETURN_FALSE;
13988      }
13989      if (!PyUnicode_Check(subobj)) {
13990          PyErr_Format(PyExc_TypeError,
13991                       "startswith first arg must be str or "
13992                       "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13993          return NULL;
13994      }
13995      result = tailmatch(self, subobj, start, end, -1);
13996      if (result == -1)
13997          return NULL;
13998      return PyBool_FromLong(result);
13999  }
14000  
14001  
14002  PyDoc_STRVAR(endswith__doc__,
14003               "S.endswith(suffix[, start[, end]]) -> bool\n\
14004  \n\
14005  Return True if S ends with the specified suffix, False otherwise.\n\
14006  With optional start, test S beginning at that position.\n\
14007  With optional end, stop comparing S at that position.\n\
14008  suffix can also be a tuple of strings to try.");
14009  
14010  static PyObject *
unicode_endswith(PyObject * self,PyObject * args)14011  unicode_endswith(PyObject *self,
14012                   PyObject *args)
14013  {
14014      PyObject *subobj;
14015      PyObject *substring;
14016      Py_ssize_t start = 0;
14017      Py_ssize_t end = PY_SSIZE_T_MAX;
14018      int result;
14019  
14020      if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
14021          return NULL;
14022      if (PyTuple_Check(subobj)) {
14023          Py_ssize_t i;
14024          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
14025              substring = PyTuple_GET_ITEM(subobj, i);
14026              if (!PyUnicode_Check(substring)) {
14027                  PyErr_Format(PyExc_TypeError,
14028                               "tuple for endswith must only contain str, "
14029                               "not %.100s",
14030                               Py_TYPE(substring)->tp_name);
14031                  return NULL;
14032              }
14033              result = tailmatch(self, substring, start, end, +1);
14034              if (result == -1)
14035                  return NULL;
14036              if (result) {
14037                  Py_RETURN_TRUE;
14038              }
14039          }
14040          Py_RETURN_FALSE;
14041      }
14042      if (!PyUnicode_Check(subobj)) {
14043          PyErr_Format(PyExc_TypeError,
14044                       "endswith first arg must be str or "
14045                       "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14046          return NULL;
14047      }
14048      result = tailmatch(self, subobj, start, end, +1);
14049      if (result == -1)
14050          return NULL;
14051      return PyBool_FromLong(result);
14052  }
14053  
14054  static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)14055  _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
14056  {
14057      writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14058      writer->data = PyUnicode_DATA(writer->buffer);
14059  
14060      if (!writer->readonly) {
14061          writer->kind = PyUnicode_KIND(writer->buffer);
14062          writer->size = PyUnicode_GET_LENGTH(writer->buffer);
14063      }
14064      else {
14065          /* use a value smaller than PyUnicode_1BYTE_KIND() so
14066             _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14067          writer->kind = PyUnicode_WCHAR_KIND;
14068          assert(writer->kind <= PyUnicode_1BYTE_KIND);
14069  
14070          /* Copy-on-write mode: set buffer size to 0 so
14071           * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14072           * next write. */
14073          writer->size = 0;
14074      }
14075  }
14076  
14077  void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)14078  _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
14079  {
14080      memset(writer, 0, sizeof(*writer));
14081  
14082      /* ASCII is the bare minimum */
14083      writer->min_char = 127;
14084  
14085      /* use a value smaller than PyUnicode_1BYTE_KIND() so
14086         _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14087      writer->kind = PyUnicode_WCHAR_KIND;
14088      assert(writer->kind <= PyUnicode_1BYTE_KIND);
14089  }
14090  
14091  // Initialize _PyUnicodeWriter with initial buffer
14092  static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)14093  _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14094  {
14095      memset(writer, 0, sizeof(*writer));
14096      writer->buffer = buffer;
14097      _PyUnicodeWriter_Update(writer);
14098      writer->min_length = writer->size;
14099  }
14100  
14101  int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)14102  _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14103                                   Py_ssize_t length, Py_UCS4 maxchar)
14104  {
14105      Py_ssize_t newlen;
14106      PyObject *newbuffer;
14107  
14108      assert(maxchar <= MAX_UNICODE);
14109  
14110      /* ensure that the _PyUnicodeWriter_Prepare macro was used */
14111      assert((maxchar > writer->maxchar && length >= 0)
14112             || length > 0);
14113  
14114      if (length > PY_SSIZE_T_MAX - writer->pos) {
14115          PyErr_NoMemory();
14116          return -1;
14117      }
14118      newlen = writer->pos + length;
14119  
14120      maxchar = Py_MAX(maxchar, writer->min_char);
14121  
14122      if (writer->buffer == NULL) {
14123          assert(!writer->readonly);
14124          if (writer->overallocate
14125              && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14126              /* overallocate to limit the number of realloc() */
14127              newlen += newlen / OVERALLOCATE_FACTOR;
14128          }
14129          if (newlen < writer->min_length)
14130              newlen = writer->min_length;
14131  
14132          writer->buffer = PyUnicode_New(newlen, maxchar);
14133          if (writer->buffer == NULL)
14134              return -1;
14135      }
14136      else if (newlen > writer->size) {
14137          if (writer->overallocate
14138              && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14139              /* overallocate to limit the number of realloc() */
14140              newlen += newlen / OVERALLOCATE_FACTOR;
14141          }
14142          if (newlen < writer->min_length)
14143              newlen = writer->min_length;
14144  
14145          if (maxchar > writer->maxchar || writer->readonly) {
14146              /* resize + widen */
14147              maxchar = Py_MAX(maxchar, writer->maxchar);
14148              newbuffer = PyUnicode_New(newlen, maxchar);
14149              if (newbuffer == NULL)
14150                  return -1;
14151              _PyUnicode_FastCopyCharacters(newbuffer, 0,
14152                                            writer->buffer, 0, writer->pos);
14153              Py_DECREF(writer->buffer);
14154              writer->readonly = 0;
14155          }
14156          else {
14157              newbuffer = resize_compact(writer->buffer, newlen);
14158              if (newbuffer == NULL)
14159                  return -1;
14160          }
14161          writer->buffer = newbuffer;
14162      }
14163      else if (maxchar > writer->maxchar) {
14164          assert(!writer->readonly);
14165          newbuffer = PyUnicode_New(writer->size, maxchar);
14166          if (newbuffer == NULL)
14167              return -1;
14168          _PyUnicode_FastCopyCharacters(newbuffer, 0,
14169                                        writer->buffer, 0, writer->pos);
14170          Py_SETREF(writer->buffer, newbuffer);
14171      }
14172      _PyUnicodeWriter_Update(writer);
14173      return 0;
14174  
14175  #undef OVERALLOCATE_FACTOR
14176  }
14177  
14178  int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)14179  _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14180                                       enum PyUnicode_Kind kind)
14181  {
14182      Py_UCS4 maxchar;
14183  
14184      /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14185      assert(writer->kind < kind);
14186  
14187      switch (kind)
14188      {
14189      case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14190      case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14191      case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
14192      default:
14193          Py_UNREACHABLE();
14194      }
14195  
14196      return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14197  }
14198  
14199  static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)14200  _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
14201  {
14202      assert(ch <= MAX_UNICODE);
14203      if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14204          return -1;
14205      PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14206      writer->pos++;
14207      return 0;
14208  }
14209  
14210  int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)14211  _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14212  {
14213      return _PyUnicodeWriter_WriteCharInline(writer, ch);
14214  }
14215  
14216  int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)14217  _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14218  {
14219      Py_UCS4 maxchar;
14220      Py_ssize_t len;
14221  
14222      if (PyUnicode_READY(str) == -1)
14223          return -1;
14224      len = PyUnicode_GET_LENGTH(str);
14225      if (len == 0)
14226          return 0;
14227      maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14228      if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
14229          if (writer->buffer == NULL && !writer->overallocate) {
14230              assert(_PyUnicode_CheckConsistency(str, 1));
14231              writer->readonly = 1;
14232              Py_INCREF(str);
14233              writer->buffer = str;
14234              _PyUnicodeWriter_Update(writer);
14235              writer->pos += len;
14236              return 0;
14237          }
14238          if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14239              return -1;
14240      }
14241      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14242                                    str, 0, len);
14243      writer->pos += len;
14244      return 0;
14245  }
14246  
14247  int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)14248  _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14249                                  Py_ssize_t start, Py_ssize_t end)
14250  {
14251      Py_UCS4 maxchar;
14252      Py_ssize_t len;
14253  
14254      if (PyUnicode_READY(str) == -1)
14255          return -1;
14256  
14257      assert(0 <= start);
14258      assert(end <= PyUnicode_GET_LENGTH(str));
14259      assert(start <= end);
14260  
14261      if (end == 0)
14262          return 0;
14263  
14264      if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14265          return _PyUnicodeWriter_WriteStr(writer, str);
14266  
14267      if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14268          maxchar = _PyUnicode_FindMaxChar(str, start, end);
14269      else
14270          maxchar = writer->maxchar;
14271      len = end - start;
14272  
14273      if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14274          return -1;
14275  
14276      _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14277                                    str, start, len);
14278      writer->pos += len;
14279      return 0;
14280  }
14281  
14282  int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)14283  _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14284                                    const char *ascii, Py_ssize_t len)
14285  {
14286      if (len == -1)
14287          len = strlen(ascii);
14288  
14289      assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14290  
14291      if (writer->buffer == NULL && !writer->overallocate) {
14292          PyObject *str;
14293  
14294          str = _PyUnicode_FromASCII(ascii, len);
14295          if (str == NULL)
14296              return -1;
14297  
14298          writer->readonly = 1;
14299          writer->buffer = str;
14300          _PyUnicodeWriter_Update(writer);
14301          writer->pos += len;
14302          return 0;
14303      }
14304  
14305      if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14306          return -1;
14307  
14308      switch (writer->kind)
14309      {
14310      case PyUnicode_1BYTE_KIND:
14311      {
14312          const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14313          Py_UCS1 *data = writer->data;
14314  
14315          memcpy(data + writer->pos, str, len);
14316          break;
14317      }
14318      case PyUnicode_2BYTE_KIND:
14319      {
14320          _PyUnicode_CONVERT_BYTES(
14321              Py_UCS1, Py_UCS2,
14322              ascii, ascii + len,
14323              (Py_UCS2 *)writer->data + writer->pos);
14324          break;
14325      }
14326      case PyUnicode_4BYTE_KIND:
14327      {
14328          _PyUnicode_CONVERT_BYTES(
14329              Py_UCS1, Py_UCS4,
14330              ascii, ascii + len,
14331              (Py_UCS4 *)writer->data + writer->pos);
14332          break;
14333      }
14334      default:
14335          Py_UNREACHABLE();
14336      }
14337  
14338      writer->pos += len;
14339      return 0;
14340  }
14341  
14342  int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14343  _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14344                                     const char *str, Py_ssize_t len)
14345  {
14346      Py_UCS4 maxchar;
14347  
14348      maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14349      if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14350          return -1;
14351      unicode_write_cstr(writer->buffer, writer->pos, str, len);
14352      writer->pos += len;
14353      return 0;
14354  }
14355  
14356  PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14357  _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14358  {
14359      PyObject *str;
14360  
14361      if (writer->pos == 0) {
14362          Py_CLEAR(writer->buffer);
14363          _Py_RETURN_UNICODE_EMPTY();
14364      }
14365  
14366      str = writer->buffer;
14367      writer->buffer = NULL;
14368  
14369      if (writer->readonly) {
14370          assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14371          return str;
14372      }
14373  
14374      if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14375          PyObject *str2;
14376          str2 = resize_compact(str, writer->pos);
14377          if (str2 == NULL) {
14378              Py_DECREF(str);
14379              return NULL;
14380          }
14381          str = str2;
14382      }
14383  
14384      assert(_PyUnicode_CheckConsistency(str, 1));
14385      return unicode_result_ready(str);
14386  }
14387  
14388  void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14389  _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14390  {
14391      Py_CLEAR(writer->buffer);
14392  }
14393  
14394  #include "stringlib/unicode_format.h"
14395  
14396  PyDoc_STRVAR(format__doc__,
14397               "S.format(*args, **kwargs) -> str\n\
14398  \n\
14399  Return a formatted version of S, using substitutions from args and kwargs.\n\
14400  The substitutions are identified by braces ('{' and '}').");
14401  
14402  PyDoc_STRVAR(format_map__doc__,
14403               "S.format_map(mapping) -> str\n\
14404  \n\
14405  Return a formatted version of S, using substitutions from mapping.\n\
14406  The substitutions are identified by braces ('{' and '}').");
14407  
14408  /*[clinic input]
14409  str.__format__ as unicode___format__
14410  
14411      format_spec: unicode
14412      /
14413  
14414  Return a formatted version of the string as described by format_spec.
14415  [clinic start generated code]*/
14416  
14417  static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14418  unicode___format___impl(PyObject *self, PyObject *format_spec)
14419  /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14420  {
14421      _PyUnicodeWriter writer;
14422      int ret;
14423  
14424      if (PyUnicode_READY(self) == -1)
14425          return NULL;
14426      _PyUnicodeWriter_Init(&writer);
14427      ret = _PyUnicode_FormatAdvancedWriter(&writer,
14428                                            self, format_spec, 0,
14429                                            PyUnicode_GET_LENGTH(format_spec));
14430      if (ret == -1) {
14431          _PyUnicodeWriter_Dealloc(&writer);
14432          return NULL;
14433      }
14434      return _PyUnicodeWriter_Finish(&writer);
14435  }
14436  
14437  /*[clinic input]
14438  str.__sizeof__ as unicode_sizeof
14439  
14440  Return the size of the string in memory, in bytes.
14441  [clinic start generated code]*/
14442  
14443  static PyObject *
unicode_sizeof_impl(PyObject * self)14444  unicode_sizeof_impl(PyObject *self)
14445  /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14446  {
14447      Py_ssize_t size;
14448  
14449      /* If it's a compact object, account for base structure +
14450         character data. */
14451      if (PyUnicode_IS_COMPACT_ASCII(self))
14452          size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14453      else if (PyUnicode_IS_COMPACT(self))
14454          size = sizeof(PyCompactUnicodeObject) +
14455              (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14456      else {
14457          /* If it is a two-block object, account for base object, and
14458             for character block if present. */
14459          size = sizeof(PyUnicodeObject);
14460          if (_PyUnicode_DATA_ANY(self))
14461              size += (PyUnicode_GET_LENGTH(self) + 1) *
14462                  PyUnicode_KIND(self);
14463      }
14464      /* If the wstr pointer is present, account for it unless it is shared
14465         with the data pointer. Check if the data is not shared. */
14466      if (_PyUnicode_HAS_WSTR_MEMORY(self))
14467          size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14468      if (_PyUnicode_HAS_UTF8_MEMORY(self))
14469          size += PyUnicode_UTF8_LENGTH(self) + 1;
14470  
14471      return PyLong_FromSsize_t(size);
14472  }
14473  
14474  static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14475  unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14476  {
14477      PyObject *copy = _PyUnicode_Copy(v);
14478      if (!copy)
14479          return NULL;
14480      return Py_BuildValue("(N)", copy);
14481  }
14482  
/* Method table for the string type.

   The UNICODE_*_METHODDEF entries are macros defined outside this part of
   the file (presumably generated by Argument Clinic -- TODO confirm); each
   one expands to a complete table entry.  The entries written out by hand
   give the Python-level name, the C implementation, the calling-convention
   flags and, where present, the docstring. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() accepts keyword arguments, so its C function has a different
       signature; the cast goes through void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    /* Sentinel entry terminating the table. */
    {NULL, NULL}
};
14541  
14542  static PyObject *
unicode_mod(PyObject * v,PyObject * w)14543  unicode_mod(PyObject *v, PyObject *w)
14544  {
14545      if (!PyUnicode_Check(v))
14546          Py_RETURN_NOTIMPLEMENTED;
14547      return PyUnicode_Format(v, w);
14548  }
14549  
14550  static PyNumberMethods unicode_as_number = {
14551      0,              /*nb_add*/
14552      0,              /*nb_subtract*/
14553      0,              /*nb_multiply*/
14554      unicode_mod,            /*nb_remainder*/
14555  };
14556  
14557  static PySequenceMethods unicode_as_sequence = {
14558      (lenfunc) unicode_length,       /* sq_length */
14559      PyUnicode_Concat,           /* sq_concat */
14560      (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14561      (ssizeargfunc) unicode_getitem,     /* sq_item */
14562      0,                  /* sq_slice */
14563      0,                  /* sq_ass_item */
14564      0,                  /* sq_ass_slice */
14565      PyUnicode_Contains,         /* sq_contains */
14566  };
14567  
14568  static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14569  unicode_subscript(PyObject* self, PyObject* item)
14570  {
14571      if (PyUnicode_READY(self) == -1)
14572          return NULL;
14573  
14574      if (_PyIndex_Check(item)) {
14575          Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14576          if (i == -1 && PyErr_Occurred())
14577              return NULL;
14578          if (i < 0)
14579              i += PyUnicode_GET_LENGTH(self);
14580          return unicode_getitem(self, i);
14581      } else if (PySlice_Check(item)) {
14582          Py_ssize_t start, stop, step, slicelength, i;
14583          size_t cur;
14584          PyObject *result;
14585          const void *src_data;
14586          void *dest_data;
14587          int src_kind, dest_kind;
14588          Py_UCS4 ch, max_char, kind_limit;
14589  
14590          if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14591              return NULL;
14592          }
14593          slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14594                                              &start, &stop, step);
14595  
14596          if (slicelength <= 0) {
14597              _Py_RETURN_UNICODE_EMPTY();
14598          } else if (start == 0 && step == 1 &&
14599                     slicelength == PyUnicode_GET_LENGTH(self)) {
14600              return unicode_result_unchanged(self);
14601          } else if (step == 1) {
14602              return PyUnicode_Substring(self,
14603                                         start, start + slicelength);
14604          }
14605          /* General case */
14606          src_kind = PyUnicode_KIND(self);
14607          src_data = PyUnicode_DATA(self);
14608          if (!PyUnicode_IS_ASCII(self)) {
14609              kind_limit = kind_maxchar_limit(src_kind);
14610              max_char = 0;
14611              for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14612                  ch = PyUnicode_READ(src_kind, src_data, cur);
14613                  if (ch > max_char) {
14614                      max_char = ch;
14615                      if (max_char >= kind_limit)
14616                          break;
14617                  }
14618              }
14619          }
14620          else
14621              max_char = 127;
14622          result = PyUnicode_New(slicelength, max_char);
14623          if (result == NULL)
14624              return NULL;
14625          dest_kind = PyUnicode_KIND(result);
14626          dest_data = PyUnicode_DATA(result);
14627  
14628          for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14629              Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14630              PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14631          }
14632          assert(_PyUnicode_CheckConsistency(result, 1));
14633          return result;
14634      } else {
14635          PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14636          return NULL;
14637      }
14638  }
14639  
14640  static PyMappingMethods unicode_as_mapping = {
14641      (lenfunc)unicode_length,        /* mp_length */
14642      (binaryfunc)unicode_subscript,  /* mp_subscript */
14643      (objobjargproc)0,           /* mp_ass_subscript */
14644  };
14645  
14646  
14647  /* Helpers for PyUnicode_Format() */
14648  
/* State shared by the helpers that implement "%" formatting. */
struct unicode_formatter_t {
    /* Source of argument values: indexed as a tuple when arglen >= 0,
       returned as-is when arglen < 0 (see unicode_format_getnextarg()). */
    PyObject *args;
    /* Non-zero when this structure holds its own reference to args; that
       reference is released before args is replaced by a "%(name)" lookup. */
    int args_owned;
    /* arglen: number of items available in args; a negative value marks
       args as a single value.
       argidx: index of the next item to hand out. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)" lookups; NULL when no mapping is
       available, in which case "%(name)" raises TypeError. */
    PyObject *dict;

    /* Kind and data pointer of the format string, as consumed by
       PyUnicode_READ(). */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: count of format characters left; it is decremented before
       each read and a negative value means the format string ran out.
       fmtpos: index of the next format character to read. */
    Py_ssize_t fmtcnt, fmtpos;
    const void *fmtdata;
    /* The format string object itself ("%(name)" keys are sliced from it). */
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14662  
/* Parsed description of a single "%..." conversion specifier. */
struct unicode_format_arg_t {
    /* Format character read most recently; once parsing has finished this
       is the conversion character ('s', 'd', 'x', ...). */
    Py_UCS4 ch;
    /* Bitwise OR of the F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO bits. */
    int flags;
    /* Field width.  The helpers compare it against -1 to detect "no width
       given" (the initial value is set outside this part of the file). */
    Py_ssize_t width;
    /* Precision; compared against -1 in the same way as width. */
    int prec;
    /* Set to 1 by the integer conversions and by the non-fast-path float
       conversions; read by unicode_format_arg_output() when handling the
       sign and the '0' fill. */
    int sign;
};
14670  
14671  static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14672  unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14673  {
14674      Py_ssize_t argidx = ctx->argidx;
14675  
14676      if (argidx < ctx->arglen) {
14677          ctx->argidx++;
14678          if (ctx->arglen < 0)
14679              return ctx->args;
14680          else
14681              return PyTuple_GetItem(ctx->args, argidx);
14682      }
14683      PyErr_SetString(PyExc_TypeError,
14684                      "not enough arguments for format string");
14685      return NULL;
14686  }
14687  
14688  /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14689  
14690  /* Format a float into the writer if the writer is not NULL, or into *p_output
14691     otherwise.
14692  
14693     Return 0 on success, raise an exception and return -1 on error. */
14694  static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14695  formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14696              PyObject **p_output,
14697              _PyUnicodeWriter *writer)
14698  {
14699      char *p;
14700      double x;
14701      Py_ssize_t len;
14702      int prec;
14703      int dtoa_flags;
14704  
14705      x = PyFloat_AsDouble(v);
14706      if (x == -1.0 && PyErr_Occurred())
14707          return -1;
14708  
14709      prec = arg->prec;
14710      if (prec < 0)
14711          prec = 6;
14712  
14713      if (arg->flags & F_ALT)
14714          dtoa_flags = Py_DTSF_ALT;
14715      else
14716          dtoa_flags = 0;
14717      p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14718      if (p == NULL)
14719          return -1;
14720      len = strlen(p);
14721      if (writer) {
14722          if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14723              PyMem_Free(p);
14724              return -1;
14725          }
14726      }
14727      else
14728          *p_output = _PyUnicode_FromASCII(p, len);
14729      PyMem_Free(p);
14730      return 0;
14731  }
14732  
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 *
 * Return value:  a new reference to a string, or NULL with an exception
 *     set on error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *         when alt is non-zero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 *
 * val          object to be converted; must be an int (checked by assert)
 * alt          non-zero to keep the base prefix (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed.
 *              Values above INT_MAX-3 raise OverflowError.
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  the o, x and X conversions can produce a '-' sign here.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the digits.  For bases 8 and 16 the text starts with a
       two-character base prefix, counted in numnondigits. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below is done in int, so reject longer strings. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless the alternate form was requested */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters.  For a negative number buf now points at
           the second prefix character, which is overwritten with the sign
           so that the text again reads "-digits". */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits. */
    if (prec > numdigits) {
        /* Assemble sign/prefix, zero padding and digits in a temporary
           bytes object, which then replaces result. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If result is now the temporary bytes object, or buf no longer points
       at the start of the string's data (the prefix was skipped), build a
       fresh string from buf.  Otherwise only the length may need fixing. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14878  
14879  /* Format an integer or a float as an integer.
14880   * Return 1 if the number has been formatted into the writer,
14881   *        0 if the number has been formatted into *p_output
14882   *       -1 and raise an exception on error */
14883  static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14884  mainformatlong(PyObject *v,
14885                 struct unicode_format_arg_t *arg,
14886                 PyObject **p_output,
14887                 _PyUnicodeWriter *writer)
14888  {
14889      PyObject *iobj, *res;
14890      char type = (char)arg->ch;
14891  
14892      if (!PyNumber_Check(v))
14893          goto wrongtype;
14894  
14895      /* make sure number is a type of integer for o, x, and X */
14896      if (!PyLong_Check(v)) {
14897          if (type == 'o' || type == 'x' || type == 'X') {
14898              iobj = _PyNumber_Index(v);
14899          }
14900          else {
14901              iobj = PyNumber_Long(v);
14902          }
14903          if (iobj == NULL ) {
14904              if (PyErr_ExceptionMatches(PyExc_TypeError))
14905                  goto wrongtype;
14906              return -1;
14907          }
14908          assert(PyLong_Check(iobj));
14909      }
14910      else {
14911          iobj = v;
14912          Py_INCREF(iobj);
14913      }
14914  
14915      if (PyLong_CheckExact(v)
14916          && arg->width == -1 && arg->prec == -1
14917          && !(arg->flags & (F_SIGN | F_BLANK))
14918          && type != 'X')
14919      {
14920          /* Fast path */
14921          int alternate = arg->flags & F_ALT;
14922          int base;
14923  
14924          switch(type)
14925          {
14926              default:
14927                  Py_UNREACHABLE();
14928              case 'd':
14929              case 'i':
14930              case 'u':
14931                  base = 10;
14932                  break;
14933              case 'o':
14934                  base = 8;
14935                  break;
14936              case 'x':
14937              case 'X':
14938                  base = 16;
14939                  break;
14940          }
14941  
14942          if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14943              Py_DECREF(iobj);
14944              return -1;
14945          }
14946          Py_DECREF(iobj);
14947          return 1;
14948      }
14949  
14950      res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14951      Py_DECREF(iobj);
14952      if (res == NULL)
14953          return -1;
14954      *p_output = res;
14955      return 0;
14956  
14957  wrongtype:
14958      switch(type)
14959      {
14960          case 'o':
14961          case 'x':
14962          case 'X':
14963              PyErr_Format(PyExc_TypeError,
14964                      "%%%c format: an integer is required, "
14965                      "not %.200s",
14966                      type, Py_TYPE(v)->tp_name);
14967              break;
14968          default:
14969              PyErr_Format(PyExc_TypeError,
14970                      "%%%c format: a real number is required, "
14971                      "not %.200s",
14972                      type, Py_TYPE(v)->tp_name);
14973              break;
14974      }
14975      return -1;
14976  }
14977  
14978  static Py_UCS4
formatchar(PyObject * v)14979  formatchar(PyObject *v)
14980  {
14981      /* presume that the buffer is at least 3 characters long */
14982      if (PyUnicode_Check(v)) {
14983          if (PyUnicode_GET_LENGTH(v) == 1) {
14984              return PyUnicode_READ_CHAR(v, 0);
14985          }
14986          goto onError;
14987      }
14988      else {
14989          int overflow;
14990          long x = PyLong_AsLongAndOverflow(v, &overflow);
14991          if (x == -1 && PyErr_Occurred()) {
14992              if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14993                  goto onError;
14994              }
14995              return (Py_UCS4) -1;
14996          }
14997  
14998          if (x < 0 || x > MAX_UNICODE) {
14999              /* this includes an overflow in converting to C long */
15000              PyErr_SetString(PyExc_OverflowError,
15001                              "%c arg not in range(0x110000)");
15002              return (Py_UCS4) -1;
15003          }
15004  
15005          return (Py_UCS4) x;
15006      }
15007  
15008    onError:
15009      PyErr_SetString(PyExc_TypeError,
15010                      "%c requires int or char");
15011      return (Py_UCS4) -1;
15012  }
15013  
15014  /* Parse options of an argument: flags, width, precision.
15015     Handle also "%(name)" syntax.
15016  
15017     Return 0 if the argument has been formatted into arg->str.
15018     Return 1 if the argument has been written into ctx->writer,
15019     Raise an exception and return -1 on error. */
15020  static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)15021  unicode_format_arg_parse(struct unicode_formatter_t *ctx,
15022                           struct unicode_format_arg_t *arg)
15023  {
15024  #define FORMAT_READ(ctx) \
15025          PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
15026  
15027      PyObject *v;
15028  
15029      if (arg->ch == '(') {
15030          /* Get argument value from a dictionary. Example: "%(name)s". */
15031          Py_ssize_t keystart;
15032          Py_ssize_t keylen;
15033          PyObject *key;
15034          int pcount = 1;
15035  
15036          if (ctx->dict == NULL) {
15037              PyErr_SetString(PyExc_TypeError,
15038                              "format requires a mapping");
15039              return -1;
15040          }
15041          ++ctx->fmtpos;
15042          --ctx->fmtcnt;
15043          keystart = ctx->fmtpos;
15044          /* Skip over balanced parentheses */
15045          while (pcount > 0 && --ctx->fmtcnt >= 0) {
15046              arg->ch = FORMAT_READ(ctx);
15047              if (arg->ch == ')')
15048                  --pcount;
15049              else if (arg->ch == '(')
15050                  ++pcount;
15051              ctx->fmtpos++;
15052          }
15053          keylen = ctx->fmtpos - keystart - 1;
15054          if (ctx->fmtcnt < 0 || pcount > 0) {
15055              PyErr_SetString(PyExc_ValueError,
15056                              "incomplete format key");
15057              return -1;
15058          }
15059          key = PyUnicode_Substring(ctx->fmtstr,
15060                                    keystart, keystart + keylen);
15061          if (key == NULL)
15062              return -1;
15063          if (ctx->args_owned) {
15064              ctx->args_owned = 0;
15065              Py_DECREF(ctx->args);
15066          }
15067          ctx->args = PyObject_GetItem(ctx->dict, key);
15068          Py_DECREF(key);
15069          if (ctx->args == NULL)
15070              return -1;
15071          ctx->args_owned = 1;
15072          ctx->arglen = -1;
15073          ctx->argidx = -2;
15074      }
15075  
15076      /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15077      while (--ctx->fmtcnt >= 0) {
15078          arg->ch = FORMAT_READ(ctx);
15079          ctx->fmtpos++;
15080          switch (arg->ch) {
15081          case '-': arg->flags |= F_LJUST; continue;
15082          case '+': arg->flags |= F_SIGN; continue;
15083          case ' ': arg->flags |= F_BLANK; continue;
15084          case '#': arg->flags |= F_ALT; continue;
15085          case '0': arg->flags |= F_ZERO; continue;
15086          }
15087          break;
15088      }
15089  
15090      /* Parse width. Example: "%10s" => width=10 */
15091      if (arg->ch == '*') {
15092          v = unicode_format_getnextarg(ctx);
15093          if (v == NULL)
15094              return -1;
15095          if (!PyLong_Check(v)) {
15096              PyErr_SetString(PyExc_TypeError,
15097                              "* wants int");
15098              return -1;
15099          }
15100          arg->width = PyLong_AsSsize_t(v);
15101          if (arg->width == -1 && PyErr_Occurred())
15102              return -1;
15103          if (arg->width < 0) {
15104              arg->flags |= F_LJUST;
15105              arg->width = -arg->width;
15106          }
15107          if (--ctx->fmtcnt >= 0) {
15108              arg->ch = FORMAT_READ(ctx);
15109              ctx->fmtpos++;
15110          }
15111      }
15112      else if (arg->ch >= '0' && arg->ch <= '9') {
15113          arg->width = arg->ch - '0';
15114          while (--ctx->fmtcnt >= 0) {
15115              arg->ch = FORMAT_READ(ctx);
15116              ctx->fmtpos++;
15117              if (arg->ch < '0' || arg->ch > '9')
15118                  break;
15119              /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15120                 mixing signed and unsigned comparison. Since arg->ch is between
15121                 '0' and '9', casting to int is safe. */
15122              if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15123                  PyErr_SetString(PyExc_ValueError,
15124                                  "width too big");
15125                  return -1;
15126              }
15127              arg->width = arg->width*10 + (arg->ch - '0');
15128          }
15129      }
15130  
15131      /* Parse precision. Example: "%.3f" => prec=3 */
15132      if (arg->ch == '.') {
15133          arg->prec = 0;
15134          if (--ctx->fmtcnt >= 0) {
15135              arg->ch = FORMAT_READ(ctx);
15136              ctx->fmtpos++;
15137          }
15138          if (arg->ch == '*') {
15139              v = unicode_format_getnextarg(ctx);
15140              if (v == NULL)
15141                  return -1;
15142              if (!PyLong_Check(v)) {
15143                  PyErr_SetString(PyExc_TypeError,
15144                                  "* wants int");
15145                  return -1;
15146              }
15147              arg->prec = _PyLong_AsInt(v);
15148              if (arg->prec == -1 && PyErr_Occurred())
15149                  return -1;
15150              if (arg->prec < 0)
15151                  arg->prec = 0;
15152              if (--ctx->fmtcnt >= 0) {
15153                  arg->ch = FORMAT_READ(ctx);
15154                  ctx->fmtpos++;
15155              }
15156          }
15157          else if (arg->ch >= '0' && arg->ch <= '9') {
15158              arg->prec = arg->ch - '0';
15159              while (--ctx->fmtcnt >= 0) {
15160                  arg->ch = FORMAT_READ(ctx);
15161                  ctx->fmtpos++;
15162                  if (arg->ch < '0' || arg->ch > '9')
15163                      break;
15164                  if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15165                      PyErr_SetString(PyExc_ValueError,
15166                                      "precision too big");
15167                      return -1;
15168                  }
15169                  arg->prec = arg->prec*10 + (arg->ch - '0');
15170              }
15171          }
15172      }
15173  
15174      /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15175      if (ctx->fmtcnt >= 0) {
15176          if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15177              if (--ctx->fmtcnt >= 0) {
15178                  arg->ch = FORMAT_READ(ctx);
15179                  ctx->fmtpos++;
15180              }
15181          }
15182      }
15183      if (ctx->fmtcnt < 0) {
15184          PyErr_SetString(PyExc_ValueError,
15185                          "incomplete format");
15186          return -1;
15187      }
15188      return 0;
15189  
15190  #undef FORMAT_READ
15191  }
15192  
15193  /* Format one argument. Supported conversion specifiers:
15194  
15195     - "s", "r", "a": any type
15196     - "i", "d", "u": int or float
15197     - "o", "x", "X": int
15198     - "e", "E", "f", "F", "g", "G": float
15199     - "c": int or str (1 character)
15200  
15201     When possible, the output is written directly into the Unicode writer
15202     (ctx->writer). A string is created when padding is required.
15203  
15204     Return 0 if the argument has been formatted into *p_str,
15205            1 if the argument has been written into ctx->writer,
15206           -1 on error. */
15207  static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)15208  unicode_format_arg_format(struct unicode_formatter_t *ctx,
15209                            struct unicode_format_arg_t *arg,
15210                            PyObject **p_str)
15211  {
15212      PyObject *v;
15213      _PyUnicodeWriter *writer = &ctx->writer;
15214  
15215      if (ctx->fmtcnt == 0)
15216          ctx->writer.overallocate = 0;
15217  
15218      v = unicode_format_getnextarg(ctx);
15219      if (v == NULL)
15220          return -1;
15221  
15222  
15223      switch (arg->ch) {
15224      case 's':
15225      case 'r':
15226      case 'a':
15227          if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15228              /* Fast path */
15229              if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15230                  return -1;
15231              return 1;
15232          }
15233  
15234          if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15235              *p_str = v;
15236              Py_INCREF(*p_str);
15237          }
15238          else {
15239              if (arg->ch == 's')
15240                  *p_str = PyObject_Str(v);
15241              else if (arg->ch == 'r')
15242                  *p_str = PyObject_Repr(v);
15243              else
15244                  *p_str = PyObject_ASCII(v);
15245          }
15246          break;
15247  
15248      case 'i':
15249      case 'd':
15250      case 'u':
15251      case 'o':
15252      case 'x':
15253      case 'X':
15254      {
15255          int ret = mainformatlong(v, arg, p_str, writer);
15256          if (ret != 0)
15257              return ret;
15258          arg->sign = 1;
15259          break;
15260      }
15261  
15262      case 'e':
15263      case 'E':
15264      case 'f':
15265      case 'F':
15266      case 'g':
15267      case 'G':
15268          if (arg->width == -1 && arg->prec == -1
15269              && !(arg->flags & (F_SIGN | F_BLANK)))
15270          {
15271              /* Fast path */
15272              if (formatfloat(v, arg, NULL, writer) == -1)
15273                  return -1;
15274              return 1;
15275          }
15276  
15277          arg->sign = 1;
15278          if (formatfloat(v, arg, p_str, NULL) == -1)
15279              return -1;
15280          break;
15281  
15282      case 'c':
15283      {
15284          Py_UCS4 ch = formatchar(v);
15285          if (ch == (Py_UCS4) -1)
15286              return -1;
15287          if (arg->width == -1 && arg->prec == -1) {
15288              /* Fast path */
15289              if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15290                  return -1;
15291              return 1;
15292          }
15293          *p_str = PyUnicode_FromOrdinal(ch);
15294          break;
15295      }
15296  
15297      default:
15298          PyErr_Format(PyExc_ValueError,
15299                       "unsupported format character '%c' (0x%x) "
15300                       "at index %zd",
15301                       (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15302                       (int)arg->ch,
15303                       ctx->fmtpos - 1);
15304          return -1;
15305      }
15306      if (*p_str == NULL)
15307          return -1;
15308      assert (PyUnicode_Check(*p_str));
15309      return 0;
15310  }
15311  
15312  static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)15313  unicode_format_arg_output(struct unicode_formatter_t *ctx,
15314                            struct unicode_format_arg_t *arg,
15315                            PyObject *str)
15316  {
15317      Py_ssize_t len;
15318      enum PyUnicode_Kind kind;
15319      const void *pbuf;
15320      Py_ssize_t pindex;
15321      Py_UCS4 signchar;
15322      Py_ssize_t buflen;
15323      Py_UCS4 maxchar;
15324      Py_ssize_t sublen;
15325      _PyUnicodeWriter *writer = &ctx->writer;
15326      Py_UCS4 fill;
15327  
15328      fill = ' ';
15329      if (arg->sign && arg->flags & F_ZERO)
15330          fill = '0';
15331  
15332      if (PyUnicode_READY(str) == -1)
15333          return -1;
15334  
15335      len = PyUnicode_GET_LENGTH(str);
15336      if ((arg->width == -1 || arg->width <= len)
15337          && (arg->prec == -1 || arg->prec >= len)
15338          && !(arg->flags & (F_SIGN | F_BLANK)))
15339      {
15340          /* Fast path */
15341          if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15342              return -1;
15343          return 0;
15344      }
15345  
15346      /* Truncate the string for "s", "r" and "a" formats
15347         if the precision is set */
15348      if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15349          if (arg->prec >= 0 && len > arg->prec)
15350              len = arg->prec;
15351      }
15352  
15353      /* Adjust sign and width */
15354      kind = PyUnicode_KIND(str);
15355      pbuf = PyUnicode_DATA(str);
15356      pindex = 0;
15357      signchar = '\0';
15358      if (arg->sign) {
15359          Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15360          if (ch == '-' || ch == '+') {
15361              signchar = ch;
15362              len--;
15363              pindex++;
15364          }
15365          else if (arg->flags & F_SIGN)
15366              signchar = '+';
15367          else if (arg->flags & F_BLANK)
15368              signchar = ' ';
15369          else
15370              arg->sign = 0;
15371      }
15372      if (arg->width < len)
15373          arg->width = len;
15374  
15375      /* Prepare the writer */
15376      maxchar = writer->maxchar;
15377      if (!(arg->flags & F_LJUST)) {
15378          if (arg->sign) {
15379              if ((arg->width-1) > len)
15380                  maxchar = Py_MAX(maxchar, fill);
15381          }
15382          else {
15383              if (arg->width > len)
15384                  maxchar = Py_MAX(maxchar, fill);
15385          }
15386      }
15387      if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15388          Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15389          maxchar = Py_MAX(maxchar, strmaxchar);
15390      }
15391  
15392      buflen = arg->width;
15393      if (arg->sign && len == arg->width)
15394          buflen++;
15395      if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15396          return -1;
15397  
15398      /* Write the sign if needed */
15399      if (arg->sign) {
15400          if (fill != ' ') {
15401              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15402              writer->pos += 1;
15403          }
15404          if (arg->width > len)
15405              arg->width--;
15406      }
15407  
15408      /* Write the numeric prefix for "x", "X" and "o" formats
15409         if the alternate form is used.
15410         For example, write "0x" for the "%#x" format. */
15411      if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15412          assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15413          assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15414          if (fill != ' ') {
15415              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15416              PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15417              writer->pos += 2;
15418              pindex += 2;
15419          }
15420          arg->width -= 2;
15421          if (arg->width < 0)
15422              arg->width = 0;
15423          len -= 2;
15424      }
15425  
15426      /* Pad left with the fill character if needed */
15427      if (arg->width > len && !(arg->flags & F_LJUST)) {
15428          sublen = arg->width - len;
15429          unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15430          writer->pos += sublen;
15431          arg->width = len;
15432      }
15433  
15434      /* If padding with spaces: write sign if needed and/or numeric prefix if
15435         the alternate form is used */
15436      if (fill == ' ') {
15437          if (arg->sign) {
15438              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15439              writer->pos += 1;
15440          }
15441          if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15442              assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15443              assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15444              PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15445              PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15446              writer->pos += 2;
15447              pindex += 2;
15448          }
15449      }
15450  
15451      /* Write characters */
15452      if (len) {
15453          _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15454                                        str, pindex, len);
15455          writer->pos += len;
15456      }
15457  
15458      /* Pad right with the fill character if needed */
15459      if (arg->width > len) {
15460          sublen = arg->width - len;
15461          unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15462          writer->pos += sublen;
15463      }
15464      return 0;
15465  }
15466  
15467  /* Helper of PyUnicode_Format(): format one arg.
15468     Return 0 on success, raise an exception and return -1 on error. */
15469  static int
unicode_format_arg(struct unicode_formatter_t * ctx)15470  unicode_format_arg(struct unicode_formatter_t *ctx)
15471  {
15472      struct unicode_format_arg_t arg;
15473      PyObject *str;
15474      int ret;
15475  
15476      arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15477      if (arg.ch == '%') {
15478          ctx->fmtpos++;
15479          ctx->fmtcnt--;
15480          if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15481              return -1;
15482          return 0;
15483      }
15484      arg.flags = 0;
15485      arg.width = -1;
15486      arg.prec = -1;
15487      arg.sign = 0;
15488      str = NULL;
15489  
15490      ret = unicode_format_arg_parse(ctx, &arg);
15491      if (ret == -1)
15492          return -1;
15493  
15494      ret = unicode_format_arg_format(ctx, &arg, &str);
15495      if (ret == -1)
15496          return -1;
15497  
15498      if (ret != 1) {
15499          ret = unicode_format_arg_output(ctx, &arg, str);
15500          Py_DECREF(str);
15501          if (ret == -1)
15502              return -1;
15503      }
15504  
15505      if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15506          PyErr_SetString(PyExc_TypeError,
15507                          "not all arguments converted during string formatting");
15508          return -1;
15509      }
15510      return 0;
15511  }
15512  
15513  PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15514  PyUnicode_Format(PyObject *format, PyObject *args)
15515  {
15516      struct unicode_formatter_t ctx;
15517  
15518      if (format == NULL || args == NULL) {
15519          PyErr_BadInternalCall();
15520          return NULL;
15521      }
15522  
15523      if (ensure_unicode(format) < 0)
15524          return NULL;
15525  
15526      ctx.fmtstr = format;
15527      ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15528      ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15529      ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15530      ctx.fmtpos = 0;
15531  
15532      _PyUnicodeWriter_Init(&ctx.writer);
15533      ctx.writer.min_length = ctx.fmtcnt + 100;
15534      ctx.writer.overallocate = 1;
15535  
15536      if (PyTuple_Check(args)) {
15537          ctx.arglen = PyTuple_Size(args);
15538          ctx.argidx = 0;
15539      }
15540      else {
15541          ctx.arglen = -1;
15542          ctx.argidx = -2;
15543      }
15544      ctx.args_owned = 0;
15545      if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15546          ctx.dict = args;
15547      else
15548          ctx.dict = NULL;
15549      ctx.args = args;
15550  
15551      while (--ctx.fmtcnt >= 0) {
15552          if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15553              Py_ssize_t nonfmtpos;
15554  
15555              nonfmtpos = ctx.fmtpos++;
15556              while (ctx.fmtcnt >= 0 &&
15557                     PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15558                  ctx.fmtpos++;
15559                  ctx.fmtcnt--;
15560              }
15561              if (ctx.fmtcnt < 0) {
15562                  ctx.fmtpos--;
15563                  ctx.writer.overallocate = 0;
15564              }
15565  
15566              if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15567                                                  nonfmtpos, ctx.fmtpos) < 0)
15568                  goto onError;
15569          }
15570          else {
15571              ctx.fmtpos++;
15572              if (unicode_format_arg(&ctx) == -1)
15573                  goto onError;
15574          }
15575      }
15576  
15577      if (ctx.argidx < ctx.arglen && !ctx.dict) {
15578          PyErr_SetString(PyExc_TypeError,
15579                          "not all arguments converted during string formatting");
15580          goto onError;
15581      }
15582  
15583      if (ctx.args_owned) {
15584          Py_DECREF(ctx.args);
15585      }
15586      return _PyUnicodeWriter_Finish(&ctx.writer);
15587  
15588    onError:
15589      _PyUnicodeWriter_Dealloc(&ctx.writer);
15590      if (ctx.args_owned) {
15591          Py_DECREF(ctx.args);
15592      }
15593      return NULL;
15594  }
15595  
15596  static PyObject *
15597  unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15598  
15599  /*[clinic input]
15600  @classmethod
15601  str.__new__ as unicode_new
15602  
15603      object as x: object = NULL
15604      encoding: str = NULL
15605      errors: str = NULL
15606  
15607  [clinic start generated code]*/
15608  
15609  static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15610  unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15611                   const char *errors)
15612  /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15613  {
15614      PyObject *unicode;
15615      if (x == NULL) {
15616          unicode = unicode_new_empty();
15617      }
15618      else if (encoding == NULL && errors == NULL) {
15619          unicode = PyObject_Str(x);
15620      }
15621      else {
15622          unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15623      }
15624  
15625      if (unicode != NULL && type != &PyUnicode_Type) {
15626          Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15627      }
15628      return unicode;
15629  }
15630  
15631  static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15632  unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15633  {
15634      PyObject *self;
15635      Py_ssize_t length, char_size;
15636      int share_wstr, share_utf8;
15637      unsigned int kind;
15638      void *data;
15639  
15640      assert(PyType_IsSubtype(type, &PyUnicode_Type));
15641      assert(_PyUnicode_CHECK(unicode));
15642      if (PyUnicode_READY(unicode) == -1) {
15643          return NULL;
15644      }
15645  
15646      self = type->tp_alloc(type, 0);
15647      if (self == NULL) {
15648          return NULL;
15649      }
15650      kind = PyUnicode_KIND(unicode);
15651      length = PyUnicode_GET_LENGTH(unicode);
15652  
15653      _PyUnicode_LENGTH(self) = length;
15654  #ifdef Py_DEBUG
15655      _PyUnicode_HASH(self) = -1;
15656  #else
15657      _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15658  #endif
15659      _PyUnicode_STATE(self).interned = 0;
15660      _PyUnicode_STATE(self).kind = kind;
15661      _PyUnicode_STATE(self).compact = 0;
15662      _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15663      _PyUnicode_STATE(self).ready = 1;
15664      _PyUnicode_WSTR(self) = NULL;
15665      _PyUnicode_UTF8_LENGTH(self) = 0;
15666      _PyUnicode_UTF8(self) = NULL;
15667      _PyUnicode_WSTR_LENGTH(self) = 0;
15668      _PyUnicode_DATA_ANY(self) = NULL;
15669  
15670      share_utf8 = 0;
15671      share_wstr = 0;
15672      if (kind == PyUnicode_1BYTE_KIND) {
15673          char_size = 1;
15674          if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15675              share_utf8 = 1;
15676      }
15677      else if (kind == PyUnicode_2BYTE_KIND) {
15678          char_size = 2;
15679          if (sizeof(wchar_t) == 2)
15680              share_wstr = 1;
15681      }
15682      else {
15683          assert(kind == PyUnicode_4BYTE_KIND);
15684          char_size = 4;
15685          if (sizeof(wchar_t) == 4)
15686              share_wstr = 1;
15687      }
15688  
15689      /* Ensure we won't overflow the length. */
15690      if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15691          PyErr_NoMemory();
15692          goto onError;
15693      }
15694      data = PyObject_Malloc((length + 1) * char_size);
15695      if (data == NULL) {
15696          PyErr_NoMemory();
15697          goto onError;
15698      }
15699  
15700      _PyUnicode_DATA_ANY(self) = data;
15701      if (share_utf8) {
15702          _PyUnicode_UTF8_LENGTH(self) = length;
15703          _PyUnicode_UTF8(self) = data;
15704      }
15705      if (share_wstr) {
15706          _PyUnicode_WSTR_LENGTH(self) = length;
15707          _PyUnicode_WSTR(self) = (wchar_t *)data;
15708      }
15709  
15710      memcpy(data, PyUnicode_DATA(unicode),
15711                kind * (length + 1));
15712      assert(_PyUnicode_CheckConsistency(self, 1));
15713  #ifdef Py_DEBUG
15714      _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15715  #endif
15716      return self;
15717  
15718  onError:
15719      Py_DECREF(self);
15720      return NULL;
15721  }
15722  
15723  PyDoc_STRVAR(unicode_doc,
15724  "str(object='') -> str\n\
15725  str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15726  \n\
15727  Create a new string object from the given object. If encoding or\n\
15728  errors is specified, then the object must expose a data buffer\n\
15729  that will be decoded using the given encoding and error handler.\n\
15730  Otherwise, returns the result of object.__str__() (if defined)\n\
15731  or repr(object).\n\
15732  encoding defaults to sys.getdefaultencoding().\n\
15733  errors defaults to 'strict'.");
15734  
15735  static PyObject *unicode_iter(PyObject *seq);
15736  
15737  PyTypeObject PyUnicode_Type = {
15738      PyVarObject_HEAD_INIT(&PyType_Type, 0)
15739      "str",                        /* tp_name */
15740      sizeof(PyUnicodeObject),      /* tp_basicsize */
15741      0,                            /* tp_itemsize */
15742      /* Slots */
15743      (destructor)unicode_dealloc,  /* tp_dealloc */
15744      0,                            /* tp_vectorcall_offset */
15745      0,                            /* tp_getattr */
15746      0,                            /* tp_setattr */
15747      0,                            /* tp_as_async */
15748      unicode_repr,                 /* tp_repr */
15749      &unicode_as_number,           /* tp_as_number */
15750      &unicode_as_sequence,         /* tp_as_sequence */
15751      &unicode_as_mapping,          /* tp_as_mapping */
15752      (hashfunc) unicode_hash,      /* tp_hash*/
15753      0,                            /* tp_call*/
15754      (reprfunc) unicode_str,       /* tp_str */
15755      PyObject_GenericGetAttr,      /* tp_getattro */
15756      0,                            /* tp_setattro */
15757      0,                            /* tp_as_buffer */
15758      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15759          Py_TPFLAGS_UNICODE_SUBCLASS |
15760          _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15761      unicode_doc,                  /* tp_doc */
15762      0,                            /* tp_traverse */
15763      0,                            /* tp_clear */
15764      PyUnicode_RichCompare,        /* tp_richcompare */
15765      0,                            /* tp_weaklistoffset */
15766      unicode_iter,                 /* tp_iter */
15767      0,                            /* tp_iternext */
15768      unicode_methods,              /* tp_methods */
15769      0,                            /* tp_members */
15770      0,                            /* tp_getset */
15771      &PyBaseObject_Type,           /* tp_base */
15772      0,                            /* tp_dict */
15773      0,                            /* tp_descr_get */
15774      0,                            /* tp_descr_set */
15775      0,                            /* tp_dictoffset */
15776      0,                            /* tp_init */
15777      0,                            /* tp_alloc */
15778      unicode_new,                  /* tp_new */
15779      PyObject_Del,                 /* tp_free */
15780  };
15781  
15782  /* Initialize the Unicode implementation */
15783  
15784  PyStatus
_PyUnicode_Init(PyInterpreterState * interp)15785  _PyUnicode_Init(PyInterpreterState *interp)
15786  {
15787      struct _Py_unicode_state *state = &interp->unicode;
15788      if (unicode_create_empty_string_singleton(state) < 0) {
15789          return _PyStatus_NO_MEMORY();
15790      }
15791  
15792      if (_Py_IsMainInterpreter(interp)) {
15793          /* initialize the linebreak bloom filter */
15794          const Py_UCS2 linebreak[] = {
15795              0x000A, /* LINE FEED */
15796              0x000D, /* CARRIAGE RETURN */
15797              0x001C, /* FILE SEPARATOR */
15798              0x001D, /* GROUP SEPARATOR */
15799              0x001E, /* RECORD SEPARATOR */
15800              0x0085, /* NEXT LINE */
15801              0x2028, /* LINE SEPARATOR */
15802              0x2029, /* PARAGRAPH SEPARATOR */
15803          };
15804          bloom_linebreak = make_bloom_mask(
15805              PyUnicode_2BYTE_KIND, linebreak,
15806              Py_ARRAY_LENGTH(linebreak));
15807      }
15808  
15809      return _PyStatus_OK();
15810  }
15811  
15812  
15813  PyStatus
_PyUnicode_InitTypes(void)15814  _PyUnicode_InitTypes(void)
15815  {
15816      if (PyType_Ready(&PyUnicode_Type) < 0) {
15817          return _PyStatus_ERR("Can't initialize unicode type");
15818      }
15819      if (PyType_Ready(&EncodingMapType) < 0) {
15820           return _PyStatus_ERR("Can't initialize encoding map type");
15821      }
15822      if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15823          return _PyStatus_ERR("Can't initialize field name iterator type");
15824      }
15825      if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15826          return _PyStatus_ERR("Can't initialize formatter iter type");
15827      }
15828      return _PyStatus_OK();
15829  }
15830  
15831  
15832  void
PyUnicode_InternInPlace(PyObject ** p)15833  PyUnicode_InternInPlace(PyObject **p)
15834  {
15835      PyObject *s = *p;
15836  #ifdef Py_DEBUG
15837      assert(s != NULL);
15838      assert(_PyUnicode_CHECK(s));
15839  #else
15840      if (s == NULL || !PyUnicode_Check(s)) {
15841          return;
15842      }
15843  #endif
15844  
15845      /* If it's a subclass, we don't really know what putting
15846         it in the interned dict might do. */
15847      if (!PyUnicode_CheckExact(s)) {
15848          return;
15849      }
15850  
15851      if (PyUnicode_CHECK_INTERNED(s)) {
15852          return;
15853      }
15854  
15855  #ifdef INTERNED_STRINGS
15856      if (PyUnicode_READY(s) == -1) {
15857          PyErr_Clear();
15858          return;
15859      }
15860  
15861      if (interned == NULL) {
15862          interned = PyDict_New();
15863          if (interned == NULL) {
15864              PyErr_Clear(); /* Don't leave an exception */
15865              return;
15866          }
15867      }
15868  
15869      PyObject *t = PyDict_SetDefault(interned, s, s);
15870      if (t == NULL) {
15871          PyErr_Clear();
15872          return;
15873      }
15874  
15875      if (t != s) {
15876          Py_INCREF(t);
15877          Py_SETREF(*p, t);
15878          return;
15879      }
15880  
15881      /* The two references in interned dict (key and value) are not counted by
15882         refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15883         this. */
15884      Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15885      _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15886  #else
15887      // PyDict expects that interned strings have their hash
15888      // (PyASCIIObject.hash) already computed.
15889      (void)unicode_hash(s);
15890  #endif
15891  }
15892  
15893  void
PyUnicode_InternImmortal(PyObject ** p)15894  PyUnicode_InternImmortal(PyObject **p)
15895  {
15896      if (PyErr_WarnEx(PyExc_DeprecationWarning,
15897              "PyUnicode_InternImmortal() is deprecated; "
15898              "use PyUnicode_InternInPlace() instead", 1) < 0)
15899      {
15900          // The function has no return value, the exception cannot
15901          // be reported to the caller, so just log it.
15902          PyErr_WriteUnraisable(NULL);
15903      }
15904  
15905      PyUnicode_InternInPlace(p);
15906      if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15907          _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15908          Py_INCREF(*p);
15909      }
15910  }
15911  
15912  PyObject *
PyUnicode_InternFromString(const char * cp)15913  PyUnicode_InternFromString(const char *cp)
15914  {
15915      PyObject *s = PyUnicode_FromString(cp);
15916      if (s == NULL)
15917          return NULL;
15918      PyUnicode_InternInPlace(&s);
15919      return s;
15920  }
15921  
15922  
15923  void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15924  _PyUnicode_ClearInterned(PyInterpreterState *interp)
15925  {
15926      if (!_Py_IsMainInterpreter(interp)) {
15927          // interned dict is shared by all interpreters
15928          return;
15929      }
15930  
15931      if (interned == NULL) {
15932          return;
15933      }
15934      assert(PyDict_CheckExact(interned));
15935  
15936      /* Interned unicode strings are not forcibly deallocated; rather, we give
15937         them their stolen references back, and then clear and DECREF the
15938         interned dict. */
15939  
15940  #ifdef INTERNED_STATS
15941      fprintf(stderr, "releasing %zd interned strings\n",
15942              PyDict_GET_SIZE(interned));
15943  
15944      Py_ssize_t immortal_size = 0, mortal_size = 0;
15945  #endif
15946      Py_ssize_t pos = 0;
15947      PyObject *s, *ignored_value;
15948      while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15949          assert(PyUnicode_IS_READY(s));
15950  
15951          switch (PyUnicode_CHECK_INTERNED(s)) {
15952          case SSTATE_INTERNED_IMMORTAL:
15953              Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15954  #ifdef INTERNED_STATS
15955              immortal_size += PyUnicode_GET_LENGTH(s);
15956  #endif
15957              break;
15958          case SSTATE_INTERNED_MORTAL:
15959              // Restore the two references (key and value) ignored
15960              // by PyUnicode_InternInPlace().
15961              Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15962  #ifdef INTERNED_STATS
15963              mortal_size += PyUnicode_GET_LENGTH(s);
15964  #endif
15965              break;
15966          case SSTATE_NOT_INTERNED:
15967              /* fall through */
15968          default:
15969              Py_UNREACHABLE();
15970          }
15971          _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15972      }
15973  #ifdef INTERNED_STATS
15974      fprintf(stderr,
15975              "total size of all interned strings: %zd/%zd mortal/immortal\n",
15976              mortal_size, immortal_size);
15977  #endif
15978  
15979      PyDict_Clear(interned);
15980      Py_CLEAR(interned);
15981  }
15982  
15983  
15984  /********************* Unicode Iterator **************************/
15985  
15986  typedef struct {
15987      PyObject_HEAD
15988      Py_ssize_t it_index;
15989      PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15990  } unicodeiterobject;
15991  
15992  static void
unicodeiter_dealloc(unicodeiterobject * it)15993  unicodeiter_dealloc(unicodeiterobject *it)
15994  {
15995      _PyObject_GC_UNTRACK(it);
15996      Py_XDECREF(it->it_seq);
15997      PyObject_GC_Del(it);
15998  }
15999  
16000  static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)16001  unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
16002  {
16003      Py_VISIT(it->it_seq);
16004      return 0;
16005  }
16006  
16007  static PyObject *
unicodeiter_next(unicodeiterobject * it)16008  unicodeiter_next(unicodeiterobject *it)
16009  {
16010      PyObject *seq, *item;
16011  
16012      assert(it != NULL);
16013      seq = it->it_seq;
16014      if (seq == NULL)
16015          return NULL;
16016      assert(_PyUnicode_CHECK(seq));
16017  
16018      if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
16019          int kind = PyUnicode_KIND(seq);
16020          const void *data = PyUnicode_DATA(seq);
16021          Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
16022          item = PyUnicode_FromOrdinal(chr);
16023          if (item != NULL)
16024              ++it->it_index;
16025          return item;
16026      }
16027  
16028      it->it_seq = NULL;
16029      Py_DECREF(seq);
16030      return NULL;
16031  }
16032  
16033  static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16034  unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16035  {
16036      Py_ssize_t len = 0;
16037      if (it->it_seq)
16038          len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16039      return PyLong_FromSsize_t(len);
16040  }
16041  
16042  PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16043  
16044  static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16045  unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16046  {
16047      _Py_IDENTIFIER(iter);
16048      if (it->it_seq != NULL) {
16049          return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
16050                               it->it_seq, it->it_index);
16051      } else {
16052          PyObject *u = (PyObject *)_PyUnicode_New(0);
16053          if (u == NULL)
16054              return NULL;
16055          return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
16056      }
16057  }
16058  
16059  PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16060  
16061  static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)16062  unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16063  {
16064      Py_ssize_t index = PyLong_AsSsize_t(state);
16065      if (index == -1 && PyErr_Occurred())
16066          return NULL;
16067      if (it->it_seq != NULL) {
16068          if (index < 0)
16069              index = 0;
16070          else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16071              index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16072          it->it_index = index;
16073      }
16074      Py_RETURN_NONE;
16075  }
16076  
16077  PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16078  
16079  static PyMethodDef unicodeiter_methods[] = {
16080      {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
16081       length_hint_doc},
16082      {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16083       reduce_doc},
16084      {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
16085       setstate_doc},
16086      {NULL,      NULL}       /* sentinel */
16087  };
16088  
16089  PyTypeObject PyUnicodeIter_Type = {
16090      PyVarObject_HEAD_INIT(&PyType_Type, 0)
16091      "str_iterator",         /* tp_name */
16092      sizeof(unicodeiterobject),      /* tp_basicsize */
16093      0,                  /* tp_itemsize */
16094      /* methods */
16095      (destructor)unicodeiter_dealloc,    /* tp_dealloc */
16096      0,                  /* tp_vectorcall_offset */
16097      0,                  /* tp_getattr */
16098      0,                  /* tp_setattr */
16099      0,                  /* tp_as_async */
16100      0,                  /* tp_repr */
16101      0,                  /* tp_as_number */
16102      0,                  /* tp_as_sequence */
16103      0,                  /* tp_as_mapping */
16104      0,                  /* tp_hash */
16105      0,                  /* tp_call */
16106      0,                  /* tp_str */
16107      PyObject_GenericGetAttr,        /* tp_getattro */
16108      0,                  /* tp_setattro */
16109      0,                  /* tp_as_buffer */
16110      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16111      0,                  /* tp_doc */
16112      (traverseproc)unicodeiter_traverse, /* tp_traverse */
16113      0,                  /* tp_clear */
16114      0,                  /* tp_richcompare */
16115      0,                  /* tp_weaklistoffset */
16116      PyObject_SelfIter,          /* tp_iter */
16117      (iternextfunc)unicodeiter_next,     /* tp_iternext */
16118      unicodeiter_methods,            /* tp_methods */
16119      0,
16120  };
16121  
16122  static PyObject *
unicode_iter(PyObject * seq)16123  unicode_iter(PyObject *seq)
16124  {
16125      unicodeiterobject *it;
16126  
16127      if (!PyUnicode_Check(seq)) {
16128          PyErr_BadInternalCall();
16129          return NULL;
16130      }
16131      if (PyUnicode_READY(seq) == -1)
16132          return NULL;
16133      it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16134      if (it == NULL)
16135          return NULL;
16136      it->it_index = 0;
16137      Py_INCREF(seq);
16138      it->it_seq = seq;
16139      _PyObject_GC_TRACK(it);
16140      return (PyObject *)it;
16141  }
16142  
16143  static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)16144  encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16145  {
16146      int res;
16147      res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16148      if (res == -2) {
16149          PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16150          return -1;
16151      }
16152      if (res < 0) {
16153          PyErr_NoMemory();
16154          return -1;
16155      }
16156      return 0;
16157  }
16158  
16159  
16160  static int
config_get_codec_name(wchar_t ** config_encoding)16161  config_get_codec_name(wchar_t **config_encoding)
16162  {
16163      char *encoding;
16164      if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16165          return -1;
16166      }
16167  
16168      PyObject *name_obj = NULL;
16169      PyObject *codec = _PyCodec_Lookup(encoding);
16170      PyMem_RawFree(encoding);
16171  
16172      if (!codec)
16173          goto error;
16174  
16175      name_obj = PyObject_GetAttrString(codec, "name");
16176      Py_CLEAR(codec);
16177      if (!name_obj) {
16178          goto error;
16179      }
16180  
16181      wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16182      Py_DECREF(name_obj);
16183      if (wname == NULL) {
16184          goto error;
16185      }
16186  
16187      wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16188      if (raw_wname == NULL) {
16189          PyMem_Free(wname);
16190          PyErr_NoMemory();
16191          goto error;
16192      }
16193  
16194      PyMem_RawFree(*config_encoding);
16195      *config_encoding = raw_wname;
16196  
16197      PyMem_Free(wname);
16198      return 0;
16199  
16200  error:
16201      Py_XDECREF(codec);
16202      Py_XDECREF(name_obj);
16203      return -1;
16204  }
16205  
16206  
16207  static PyStatus
init_stdio_encoding(PyInterpreterState * interp)16208  init_stdio_encoding(PyInterpreterState *interp)
16209  {
16210      /* Update the stdio encoding to the normalized Python codec name. */
16211      PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16212      if (config_get_codec_name(&config->stdio_encoding) < 0) {
16213          return _PyStatus_ERR("failed to get the Python codec name "
16214                               "of the stdio encoding");
16215      }
16216      return _PyStatus_OK();
16217  }
16218  
16219  
16220  static int
init_fs_codec(PyInterpreterState * interp)16221  init_fs_codec(PyInterpreterState *interp)
16222  {
16223      const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16224  
16225      _Py_error_handler error_handler;
16226      error_handler = get_error_handler_wide(config->filesystem_errors);
16227      if (error_handler == _Py_ERROR_UNKNOWN) {
16228          PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16229          return -1;
16230      }
16231  
16232      char *encoding, *errors;
16233      if (encode_wstr_utf8(config->filesystem_encoding,
16234                           &encoding,
16235                           "filesystem_encoding") < 0) {
16236          return -1;
16237      }
16238  
16239      if (encode_wstr_utf8(config->filesystem_errors,
16240                           &errors,
16241                           "filesystem_errors") < 0) {
16242          PyMem_RawFree(encoding);
16243          return -1;
16244      }
16245  
16246      struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16247      PyMem_RawFree(fs_codec->encoding);
16248      fs_codec->encoding = encoding;
16249      /* encoding has been normalized by init_fs_encoding() */
16250      fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16251      PyMem_RawFree(fs_codec->errors);
16252      fs_codec->errors = errors;
16253      fs_codec->error_handler = error_handler;
16254  
16255  #ifdef _Py_FORCE_UTF8_FS_ENCODING
16256      assert(fs_codec->utf8 == 1);
16257  #endif
16258  
16259      /* At this point, PyUnicode_EncodeFSDefault() and
16260         PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16261         the C implementation of the filesystem encoding. */
16262  
16263      /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16264         global configuration variables. */
16265      if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16266                                    fs_codec->errors) < 0) {
16267          PyErr_NoMemory();
16268          return -1;
16269      }
16270      return 0;
16271  }
16272  
16273  
16274  static PyStatus
init_fs_encoding(PyThreadState * tstate)16275  init_fs_encoding(PyThreadState *tstate)
16276  {
16277      PyInterpreterState *interp = tstate->interp;
16278  
16279      /* Update the filesystem encoding to the normalized Python codec name.
16280         For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16281         (Python codec name). */
16282      PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16283      if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16284          _Py_DumpPathConfig(tstate);
16285          return _PyStatus_ERR("failed to get the Python codec "
16286                               "of the filesystem encoding");
16287      }
16288  
16289      if (init_fs_codec(interp) < 0) {
16290          return _PyStatus_ERR("cannot initialize filesystem codec");
16291      }
16292      return _PyStatus_OK();
16293  }
16294  
16295  
16296  PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16297  _PyUnicode_InitEncodings(PyThreadState *tstate)
16298  {
16299      PyStatus status = init_fs_encoding(tstate);
16300      if (_PyStatus_EXCEPTION(status)) {
16301          return status;
16302      }
16303  
16304      return init_stdio_encoding(tstate->interp);
16305  }
16306  
16307  
16308  static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16309  _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16310  {
16311      PyMem_RawFree(fs_codec->encoding);
16312      fs_codec->encoding = NULL;
16313      fs_codec->utf8 = 0;
16314      PyMem_RawFree(fs_codec->errors);
16315      fs_codec->errors = NULL;
16316      fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16317  }
16318  
16319  
16320  #ifdef MS_WINDOWS
16321  int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)16322  _PyUnicode_EnableLegacyWindowsFSEncoding(void)
16323  {
16324      PyInterpreterState *interp = _PyInterpreterState_GET();
16325      PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
16326  
16327      /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16328      wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16329      wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16330      if (encoding == NULL || errors == NULL) {
16331          PyMem_RawFree(encoding);
16332          PyMem_RawFree(errors);
16333          PyErr_NoMemory();
16334          return -1;
16335      }
16336  
16337      PyMem_RawFree(config->filesystem_encoding);
16338      config->filesystem_encoding = encoding;
16339      PyMem_RawFree(config->filesystem_errors);
16340      config->filesystem_errors = errors;
16341  
16342      return init_fs_codec(interp);
16343  }
16344  #endif
16345  
16346  
16347  void
_PyUnicode_Fini(PyInterpreterState * interp)16348  _PyUnicode_Fini(PyInterpreterState *interp)
16349  {
16350      struct _Py_unicode_state *state = &interp->unicode;
16351  
16352      if (_Py_IsMainInterpreter(interp)) {
16353          // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16354          assert(interned == NULL);
16355      }
16356  
16357      _PyUnicode_FiniEncodings(&state->fs_codec);
16358  
16359      unicode_clear_identifiers(state);
16360  
16361      for (Py_ssize_t i = 0; i < 256; i++) {
16362          Py_CLEAR(state->latin1[i]);
16363      }
16364      Py_CLEAR(state->empty_string);
16365  }
16366  
16367  
16368  /* A _string module, to export formatter_parser and formatter_field_name_split
16369     to the string.Formatter class implemented in Python. */
16370  
16371  static PyMethodDef _string_methods[] = {
16372      {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16373       METH_O, PyDoc_STR("split the argument as a field name")},
16374      {"formatter_parser", (PyCFunction) formatter_parser,
16375       METH_O, PyDoc_STR("parse the argument as a format string")},
16376      {NULL, NULL}
16377  };
16378  
16379  static struct PyModuleDef _string_module = {
16380      PyModuleDef_HEAD_INIT,
16381      .m_name = "_string",
16382      .m_doc = PyDoc_STR("string helper module"),
16383      .m_size = 0,
16384      .m_methods = _string_methods,
16385  };
16386  
16387  PyMODINIT_FUNC
PyInit__string(void)16388  PyInit__string(void)
16389  {
16390      return PyModuleDef_Init(&_string_module);
16391  }
16392  
16393  
16394  #ifdef __cplusplus
16395  }
16396  #endif
16397