• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 
10 Copyright (c) Corporation for National Research Initiatives.
11 
12 --------------------------------------------------------------------
13 The original string type implementation is:
14 
15   Copyright (c) 1999 by Secret Labs AB
16   Copyright (c) 1999 by Fredrik Lundh
17 
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21 
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30 
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39 
40 */
41 
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44 
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
52 /* Limit for the Unicode object free list */
53 
54 #define PyUnicode_MAXFREELIST       1024
55 
/* Limit for the Unicode object free list keep-alive optimization.

   When an object is put on the free list, its character buffer is
   kept allocated only if the string is shorter than this many
   Py_UNICODE characters; longer buffers are released right away.
   This reduces malloc() overhead for small Unicode objects.

   At worst this will result in roughly PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
72 
73 #define KEEPALIVE_SIZE_LIMIT       9
74 
75 /* Endianness switches; defaults to little endian */
76 
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
82 
83 /* --- Globals ------------------------------------------------------------
84 
85 NOTE: In the interpreter's initialization phase, some globals are currently
86       initialized dynamically as needed. In the process Unicode objects may
87       be created before the Unicode type is ready.
88 
89 */
90 
91 
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95 
96 /* Free list for Unicode objects */
97 static PyUnicodeObject *free_list = NULL;
98 static int numfree = 0;
99 
100 /* The empty Unicode object is shared to improve performance. */
101 static PyUnicodeObject *unicode_empty = NULL;
102 
103 #define _Py_RETURN_UNICODE_EMPTY()                      \
104     do {                                                \
105         if (unicode_empty != NULL)                      \
106             Py_INCREF(unicode_empty);                   \
107         else {                                          \
108             unicode_empty = _PyUnicode_New(0);          \
109             if (unicode_empty != NULL)                  \
110                 Py_INCREF(unicode_empty);               \
111         }                                               \
112         return (PyObject *)unicode_empty;               \
113     } while (0)
114 
115 /* Single character Unicode strings in the Latin-1 range are being
116    shared as well. */
117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
118 
119 /* Default encoding to use and assume when NULL is passed as encoding
120    parameter; it is initialized by _PyUnicode_Init().
121 
122    Always use the PyUnicode_SetDefaultEncoding() and
123    PyUnicode_GetDefaultEncoding() APIs to access this global.
124 
125 */
126 static char unicode_default_encoding[100 + 1] = "ascii";
127 
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1
   when the character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158 
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1
   when the character is treated as a line break and 0 otherwise.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186 
187 
188 Py_UNICODE
PyUnicode_GetMax(void)189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192     return 0x10FFFF;
193 #else
194     /* This is actually an illegal character, so it should
195        not be passed to unichr. */
196     return 0xFFFF;
197 #endif
198 }
199 
200 /* --- Bloom Filters ----------------------------------------------------- */
201 
202 /* stuff to implement simple "bloom filters" for Unicode characters.
203    to keep things simple, we use a single bitmask, using the least 5
204    bits from each unicode characters as the bit index. */
205 
206 /* the linebreak mask is set up by Unicode_Init below */
207 
208 #if LONG_BIT >= 128
209 #define BLOOM_WIDTH 128
210 #elif LONG_BIT >= 64
211 #define BLOOM_WIDTH 64
212 #elif LONG_BIT >= 32
213 #define BLOOM_WIDTH 32
214 #else
215 #error "LONG_BIT is smaller than 32"
216 #endif
217 
/* Integer type used to hold a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters.  It
   starts out with every bit set, so until a real mask is stored here
   every character passes the filter and the decision falls through to
   Py_UNICODE_ISLINEBREAK(). */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* BLOOM_ADD(mask, ch): set the bit for character `ch` in the lvalue
   `mask`.
   BLOOM(mask, ch): non-zero if the bit for `ch` is set in `mask`; a
   zero result means `ch` was never added, a non-zero result only means
   it may have been.
   Both arguments are fully parenthesized so that arbitrary expressions
   (e.g. `a | b`) can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if `ch` is a line break.  Characters below 128 are answered
   from the ascii_linebreak table; all others are pre-filtered through
   bloom_linebreak before the full Py_UNICODE_ISLINEBREAK() test.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228 
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231     /* calculate simple bloom-style bitmask for a given unicode string */
232 
233     BLOOM_MASK mask;
234     Py_ssize_t i;
235 
236     mask = 0;
237     for (i = 0; i < len; i++)
238         BLOOM_ADD(mask, ptr[i]);
239 
240     return mask;
241 }
242 
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245     Py_ssize_t i;
246 
247     for (i = 0; i < setlen; i++)
248         if (set[i] == chr)
249             return 1;
250 
251     return 0;
252 }
253 
/* Non-zero if `chr` is one of the `setlen` characters in `set`.  The
   bloom `mask` is consulted first so that the linear scan in
   unicode_member() only runs for characters that may be present.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators; note that `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256 
257 /* --- Unicode Object ----------------------------------------------------- */
258 
259 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)260 int unicode_resize(register PyUnicodeObject *unicode,
261                    Py_ssize_t length)
262 {
263     void *oldstr;
264 
265     /* Shortcut if there's nothing much to do. */
266     if (unicode->length == length)
267         goto reset;
268 
269     /* Resizing shared object (unicode_empty or single character
270        objects) in-place is not allowed. Use PyUnicode_Resize()
271        instead ! */
272 
273     if (unicode == unicode_empty ||
274         (unicode->length == 1 &&
275          unicode->str[0] < 256U &&
276          unicode_latin1[unicode->str[0]] == unicode)) {
277         PyErr_SetString(PyExc_SystemError,
278                         "can't resize shared unicode objects");
279         return -1;
280     }
281 
282     /* We allocate one more byte to make sure the string is Ux0000 terminated.
283        The overallocation is also used by fastsearch, which assumes that it's
284        safe to look at str[length] (without making any assumptions about what
285        it contains). */
286 
287     oldstr = unicode->str;
288     unicode->str = PyObject_REALLOC(unicode->str,
289                                     sizeof(Py_UNICODE) * (length + 1));
290     if (!unicode->str) {
291         unicode->str = (Py_UNICODE *)oldstr;
292         PyErr_NoMemory();
293         return -1;
294     }
295     unicode->str[length] = 0;
296     unicode->length = length;
297 
298   reset:
299     /* Reset the object caches */
300     if (unicode->defenc) {
301         Py_CLEAR(unicode->defenc);
302     }
303     unicode->hash = -1;
304 
305     return 0;
306 }
307 
308 /* We allocate one more byte to make sure the string is
309    Ux0000 terminated; some code relies on that.
310 
311    XXX This allocator could further be enhanced by assuring that the
312    free list never reduces its size below 1.
313 
314 */
315 
316 static
_PyUnicode_New(Py_ssize_t length)317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318 {
319     register PyUnicodeObject *unicode;
320 
321     /* Optimization for empty strings */
322     if (length == 0 && unicode_empty != NULL) {
323         Py_INCREF(unicode_empty);
324         return unicode_empty;
325     }
326 
327     /* Ensure we won't overflow the size. */
328     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329         return (PyUnicodeObject *)PyErr_NoMemory();
330     }
331 
332     /* Unicode freelist & memory allocation */
333     if (free_list) {
334         unicode = free_list;
335         free_list = *(PyUnicodeObject **)unicode;
336         numfree--;
337         if (unicode->str) {
338             /* Keep-Alive optimization: we only upsize the buffer,
339                never downsize it. */
340             if ((unicode->length < length) &&
341                 unicode_resize(unicode, length) < 0) {
342                 PyObject_DEL(unicode->str);
343                 unicode->str = NULL;
344             }
345         }
346         else {
347             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349         }
350         (void)PyObject_INIT(unicode, &PyUnicode_Type);
351     }
352     else {
353         size_t new_size;
354         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355         if (unicode == NULL)
356             return NULL;
357         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359     }
360 
361     if (!unicode->str) {
362         PyErr_NoMemory();
363         goto onError;
364     }
365     /* Initialize the first element to guard against cases where
366      * the caller fails before initializing str -- unicode_resize()
367      * reads str[0], and the Keep-Alive optimization can keep memory
368      * allocated for str alive across a call to unicode_dealloc(unicode).
369      * We don't want unicode_resize to read uninitialized memory in
370      * that case.
371      */
372     unicode->str[0] = 0;
373     unicode->str[length] = 0;
374     unicode->length = length;
375     unicode->hash = -1;
376     unicode->defenc = NULL;
377     return unicode;
378 
379   onError:
380     /* XXX UNREF/NEWREF interface should be more symmetrical */
381     _Py_DEC_REFTOTAL;
382     _Py_ForgetReference((PyObject *)unicode);
383     PyObject_Del(unicode);
384     return NULL;
385 }
386 
387 static
unicode_dealloc(register PyUnicodeObject * unicode)388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390     if (PyUnicode_CheckExact(unicode) &&
391         numfree < PyUnicode_MAXFREELIST) {
392         /* Keep-Alive optimization */
393         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394             PyObject_DEL(unicode->str);
395             unicode->str = NULL;
396             unicode->length = 0;
397         }
398         if (unicode->defenc) {
399             Py_CLEAR(unicode->defenc);
400         }
401         /* Add to free list */
402         *(PyUnicodeObject **)unicode = free_list;
403         free_list = unicode;
404         numfree++;
405     }
406     else {
407         PyObject_DEL(unicode->str);
408         Py_XDECREF(unicode->defenc);
409         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410     }
411 }
412 
413 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416     register PyUnicodeObject *v;
417 
418     /* Argument checks */
419     if (unicode == NULL) {
420         PyErr_BadInternalCall();
421         return -1;
422     }
423     v = *unicode;
424     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425         PyErr_BadInternalCall();
426         return -1;
427     }
428 
429     /* Resizing unicode_empty and single character objects is not
430        possible since these are being shared. We simply return a fresh
431        copy with the same Unicode content. */
432     if (v->length != length &&
433         (v == unicode_empty || v->length == 1)) {
434         PyUnicodeObject *w = _PyUnicode_New(length);
435         if (w == NULL)
436             return -1;
437         Py_UNICODE_COPY(w->str, v->str,
438                         length < v->length ? length : v->length);
439         Py_SETREF(*unicode, w);
440         return 0;
441     }
442 
443     /* Note that we don't have to modify *unicode for unshared Unicode
444        objects, since we can modify them in-place. */
445     return unicode_resize(v, length);
446 }
447 
/* Public entry point for resizing a Unicode object; forwards to
   _PyUnicode_Resize() and returns its result (0 or -1). */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **target = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(target, length);
}
452 
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
454                                 Py_ssize_t size)
455 {
456     PyUnicodeObject *unicode;
457 
458     /* If the Unicode data is known at construction time, we can apply
459        some optimizations which share commonly used objects. */
460     if (u != NULL) {
461 
462         /* Optimization for empty strings */
463         if (size == 0)
464             _Py_RETURN_UNICODE_EMPTY();
465 
466         /* Single character Unicode objects in the Latin-1 range are
467            shared when using this constructor */
468         if (size == 1 && *u < 256) {
469             unicode = unicode_latin1[*u];
470             if (!unicode) {
471                 unicode = _PyUnicode_New(1);
472                 if (!unicode)
473                     return NULL;
474                 unicode->str[0] = *u;
475                 unicode_latin1[*u] = unicode;
476             }
477             Py_INCREF(unicode);
478             return (PyObject *)unicode;
479         }
480     }
481 
482     unicode = _PyUnicode_New(size);
483     if (!unicode)
484         return NULL;
485 
486     /* Copy the Unicode data into the new object */
487     if (u != NULL)
488         Py_UNICODE_COPY(unicode->str, u, size);
489 
490     return (PyObject *)unicode;
491 }
492 
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
494 {
495     PyUnicodeObject *unicode;
496 
497     if (size < 0) {
498         PyErr_SetString(PyExc_SystemError,
499                         "Negative size passed to PyUnicode_FromStringAndSize");
500         return NULL;
501     }
502 
503     /* If the Unicode data is known at construction time, we can apply
504        some optimizations which share commonly used objects.
505        Also, this means the input must be UTF-8, so fall back to the
506        UTF-8 decoder at the end. */
507     if (u != NULL) {
508 
509         /* Optimization for empty strings */
510         if (size == 0)
511             _Py_RETURN_UNICODE_EMPTY();
512 
513         /* Single characters are shared when using this constructor.
514            Restrict to ASCII, since the input must be UTF-8. */
515         if (size == 1 && Py_CHARMASK(*u) < 128) {
516             unicode = unicode_latin1[Py_CHARMASK(*u)];
517             if (!unicode) {
518                 unicode = _PyUnicode_New(1);
519                 if (!unicode)
520                     return NULL;
521                 unicode->str[0] = Py_CHARMASK(*u);
522                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
523             }
524             Py_INCREF(unicode);
525             return (PyObject *)unicode;
526         }
527 
528         return PyUnicode_DecodeUTF8(u, size, NULL);
529     }
530 
531     unicode = _PyUnicode_New(size);
532     if (!unicode)
533         return NULL;
534 
535     return (PyObject *)unicode;
536 }
537 
PyUnicode_FromString(const char * u)538 PyObject *PyUnicode_FromString(const char *u)
539 {
540     size_t size = strlen(u);
541     if (size > PY_SSIZE_T_MAX) {
542         PyErr_SetString(PyExc_OverflowError, "input too long");
543         return NULL;
544     }
545 
546     return PyUnicode_FromStringAndSize(u, size);
547 }
548 
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * On narrow builds the result is a Py_UCS4; on wide builds it is simply
 * the character read from *ptr.
 *
 * Note: the macro advances ptr to the next char, so it has side-effects
 *       (take care when combining it with other macros).
 * Note: on narrow builds the macro may read ptr[1] while ptr < end, i.e.
 *       possibly the element at 'end' itself.
 */

/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
579 
580 #ifdef HAVE_WCHAR_H
581 
582 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583 # define CONVERT_WCHAR_TO_SURROGATES
584 #endif
585 
586 #ifdef CONVERT_WCHAR_TO_SURROGATES
587 
588 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589    to convert from UTF32 to UTF16. */
590 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)591 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592                                  Py_ssize_t size)
593 {
594     PyUnicodeObject *unicode;
595     register Py_ssize_t i;
596     Py_ssize_t alloc;
597     const wchar_t *orig_w;
598 
599     if (w == NULL) {
600         PyErr_BadInternalCall();
601         return NULL;
602     }
603 
604     alloc = size;
605     orig_w = w;
606     for (i = size; i > 0; i--) {
607         if (*w > 0xFFFF)
608             alloc++;
609         w++;
610     }
611     w = orig_w;
612     unicode = _PyUnicode_New(alloc);
613     if (!unicode)
614         return NULL;
615 
616     /* Copy the wchar_t data into the new object */
617     {
618         register Py_UNICODE *u;
619         u = PyUnicode_AS_UNICODE(unicode);
620         for (i = size; i > 0; i--) {
621             if (*w > 0xFFFF) {
622                 wchar_t ordinal = *w++;
623                 ordinal -= 0x10000;
624                 *u++ = 0xD800 | (ordinal >> 10);
625                 *u++ = 0xDC00 | (ordinal & 0x3FF);
626             }
627             else
628                 *u++ = *w++;
629         }
630     }
631     return (PyObject *)unicode;
632 }
633 
634 #else
635 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)636 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
637                                  Py_ssize_t size)
638 {
639     PyUnicodeObject *unicode;
640 
641     if (w == NULL) {
642         PyErr_BadInternalCall();
643         return NULL;
644     }
645 
646     unicode = _PyUnicode_New(size);
647     if (!unicode)
648         return NULL;
649 
650     /* Copy the wchar_t data into the new object */
651 #ifdef HAVE_USABLE_WCHAR_T
652     memcpy(unicode->str, w, size * sizeof(wchar_t));
653 #else
654     {
655         register Py_UNICODE *u;
656         register Py_ssize_t i;
657         u = PyUnicode_AS_UNICODE(unicode);
658         for (i = size; i > 0; i--)
659             *u++ = *w++;
660     }
661 #endif
662 
663     return (PyObject *)unicode;
664 }
665 
666 #endif /* CONVERT_WCHAR_TO_SURROGATES */
667 
668 #undef CONVERT_WCHAR_TO_SURROGATES
669 
670 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)671 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
672 {
673     *fmt++ = '%';
674     if (width) {
675         if (zeropad)
676             *fmt++ = '0';
677         fmt += sprintf(fmt, "%d", width);
678     }
679     if (precision)
680         fmt += sprintf(fmt, ".%d", precision);
681     if (longflag)
682         *fmt++ = 'l';
683     else if (size_tflag) {
684         char *f = PY_FORMAT_SIZE_T;
685         while (*f)
686             *fmt++ = *f++;
687     }
688     *fmt++ = c;
689     *fmt = '\0';
690 }
691 
/* Append the NUL-terminated char string `string` to the output, one
   output element per byte (each byte is taken as an unsigned char).

   The macro relies on two variables of the enclosing scope: `copy`
   (a const char * scratch cursor) and `s` (the output cursor, which
   is advanced past the appended characters).  No terminator is
   written.  The argument is parenthesized so that any expression can
   be passed safely. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; copy++) { \
            *s++ = (unsigned char)*copy; \
        } \
    } while (0)
698 
699 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)700 PyUnicode_FromFormatV(const char *format, va_list vargs)
701 {
702     va_list count;
703     Py_ssize_t callcount = 0;
704     PyObject **callresults = NULL;
705     PyObject **callresult = NULL;
706     Py_ssize_t n = 0;
707     int width = 0;
708     int precision = 0;
709     int zeropad;
710     const char* f;
711     Py_UNICODE *s;
712     PyObject *string;
713     /* used by sprintf */
714     char buffer[21];
715     /* use abuffer instead of buffer, if we need more space
716      * (which can happen if there's a format specifier with width). */
717     char *abuffer = NULL;
718     char *realbuffer;
719     Py_ssize_t abuffersize = 0;
720     char fmt[60]; /* should be enough for %0width.precisionld */
721     const char *copy;
722 
723 #ifdef VA_LIST_IS_ARRAY
724     Py_MEMCPY(count, vargs, sizeof(va_list));
725 #else
726 #ifdef  __va_copy
727     __va_copy(count, vargs);
728 #else
729     count = vargs;
730 #endif
731 #endif
732      /* step 1: count the number of %S/%R/%s format specifications
733       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
734       * objects once during step 3 and put the result in an array) */
735     for (f = format; *f; f++) {
736          if (*f == '%') {
737              f++;
738              while (*f && *f != '%' && !isalpha((unsigned)*f))
739                  f++;
740              if (!*f)
741                  break;
742              if (*f == 's' || *f=='S' || *f=='R')
743                  ++callcount;
744          }
745     }
746     /* step 2: allocate memory for the results of
747      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
748     if (callcount) {
749         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
750         if (!callresults) {
751             PyErr_NoMemory();
752             return NULL;
753         }
754         callresult = callresults;
755     }
756     /* step 3: figure out how large a buffer we need */
757     for (f = format; *f; f++) {
758         if (*f == '%') {
759             const char* p = f++;
760             width = 0;
761             while (isdigit((unsigned)*f))
762                 width = (width*10) + *f++ - '0';
763             precision = 0;
764             if (*f == '.') {
765                 f++;
766                 while (isdigit((unsigned)*f))
767                     precision = (precision*10) + *f++ - '0';
768             }
769 
770             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771              * they don't affect the amount of space we reserve.
772              */
773             if ((*f == 'l' || *f == 'z') &&
774                 (f[1] == 'd' || f[1] == 'u'))
775                 ++f;
776 
777             switch (*f) {
778             case 'c':
779             {
780                 int ordinal = va_arg(count, int);
781 #ifdef Py_UNICODE_WIDE
782                 if (ordinal < 0 || ordinal > 0x10ffff) {
783                     PyErr_SetString(PyExc_OverflowError,
784                                     "%c arg not in range(0x110000) "
785                                     "(wide Python build)");
786                     goto fail;
787                 }
788 #else
789                 if (ordinal < 0 || ordinal > 0xffff) {
790                     PyErr_SetString(PyExc_OverflowError,
791                                     "%c arg not in range(0x10000) "
792                                     "(narrow Python build)");
793                     goto fail;
794                 }
795 #endif
796                 /* fall through... */
797             }
798             case '%':
799                 n++;
800                 break;
801             case 'd': case 'u': case 'i': case 'x':
802                 (void) va_arg(count, int);
803                 if (width < precision)
804                     width = precision;
805                 /* 20 bytes is enough to hold a 64-bit
806                    integer.  Decimal takes the most space.
807                    This isn't enough for octal.
808                    If a width is specified we need more
809                    (which we allocate later). */
810                 if (width < 20)
811                     width = 20;
812                 n += width;
813                 if (abuffersize < width)
814                     abuffersize = width;
815                 break;
816             case 's':
817             {
818                 /* UTF-8 */
819                 const char *s = va_arg(count, const char*);
820                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
821                 if (!str)
822                     goto fail;
823                 n += PyUnicode_GET_SIZE(str);
824                 /* Remember the str and switch to the next slot */
825                 *callresult++ = str;
826                 break;
827             }
828             case 'U':
829             {
830                 PyObject *obj = va_arg(count, PyObject *);
831                 assert(obj && PyUnicode_Check(obj));
832                 n += PyUnicode_GET_SIZE(obj);
833                 break;
834             }
835             case 'V':
836             {
837                 PyObject *obj = va_arg(count, PyObject *);
838                 const char *str = va_arg(count, const char *);
839                 assert(obj || str);
840                 assert(!obj || PyUnicode_Check(obj));
841                 if (obj)
842                     n += PyUnicode_GET_SIZE(obj);
843                 else
844                     n += strlen(str);
845                 break;
846             }
847             case 'S':
848             {
849                 PyObject *obj = va_arg(count, PyObject *);
850                 PyObject *str;
851                 assert(obj);
852                 str = PyObject_Str(obj);
853                 if (!str)
854                     goto fail;
855                 n += PyString_GET_SIZE(str);
856                 /* Remember the str and switch to the next slot */
857                 *callresult++ = str;
858                 break;
859             }
860             case 'R':
861             {
862                 PyObject *obj = va_arg(count, PyObject *);
863                 PyObject *repr;
864                 assert(obj);
865                 repr = PyObject_Repr(obj);
866                 if (!repr)
867                     goto fail;
868                 n += PyUnicode_GET_SIZE(repr);
869                 /* Remember the repr and switch to the next slot */
870                 *callresult++ = repr;
871                 break;
872             }
873             case 'p':
874                 (void) va_arg(count, int);
875                 /* maximum 64-bit pointer representation:
876                  * 0xffffffffffffffff
877                  * so 19 characters is enough.
878                  * XXX I count 18 -- what's the extra for?
879                  */
880                 n += 19;
881                 break;
882             default:
883                 /* if we stumble upon an unknown
884                    formatting code, copy the rest of
885                    the format string to the output
886                    string. (we cannot just skip the
887                    code, since there's no way to know
888                    what's in the argument list) */
889                 n += strlen(p);
890                 goto expand;
891             }
892         } else
893             n++;
894     }
895   expand:
896     if (abuffersize > 20) {
897         /* add 1 for sprintf's trailing null byte */
898         abuffer = PyObject_Malloc(abuffersize + 1);
899         if (!abuffer) {
900             PyErr_NoMemory();
901             goto fail;
902         }
903         realbuffer = abuffer;
904     }
905     else
906         realbuffer = buffer;
907     /* step 4: fill the buffer */
908     /* Since we've analyzed how much space we need for the worst case,
909        we don't have to resize the string.
910        There can be no errors beyond this point. */
911     string = PyUnicode_FromUnicode(NULL, n);
912     if (!string)
913         goto fail;
914 
915     s = PyUnicode_AS_UNICODE(string);
916     callresult = callresults;
917 
918     for (f = format; *f; f++) {
919         if (*f == '%') {
920             const char* p = f++;
921             int longflag = 0;
922             int size_tflag = 0;
923             zeropad = (*f == '0');
924             /* parse the width.precision part */
925             width = 0;
926             while (isdigit((unsigned)*f))
927                 width = (width*10) + *f++ - '0';
928             precision = 0;
929             if (*f == '.') {
930                 f++;
931                 while (isdigit((unsigned)*f))
932                     precision = (precision*10) + *f++ - '0';
933             }
934             /* handle the long flag, but only for %ld and %lu.
935                others can be added when necessary. */
936             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
937                 longflag = 1;
938                 ++f;
939             }
940             /* handle the size_t flag. */
941             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
942                 size_tflag = 1;
943                 ++f;
944             }
945 
946             switch (*f) {
947             case 'c':
948                 *s++ = va_arg(vargs, int);
949                 break;
950             case 'd':
951                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
952                 if (longflag)
953                     sprintf(realbuffer, fmt, va_arg(vargs, long));
954                 else if (size_tflag)
955                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
956                 else
957                     sprintf(realbuffer, fmt, va_arg(vargs, int));
958                 appendstring(realbuffer);
959                 break;
960             case 'u':
961                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
962                 if (longflag)
963                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
964                 else if (size_tflag)
965                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
966                 else
967                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
968                 appendstring(realbuffer);
969                 break;
970             case 'i':
971                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
972                 sprintf(realbuffer, fmt, va_arg(vargs, int));
973                 appendstring(realbuffer);
974                 break;
975             case 'x':
976                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
977                 sprintf(realbuffer, fmt, va_arg(vargs, int));
978                 appendstring(realbuffer);
979                 break;
980             case 's':
981             {
982                 /* unused, since we already have the result */
983                 (void) va_arg(vargs, char *);
984                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
985                                 PyUnicode_GET_SIZE(*callresult));
986                 s += PyUnicode_GET_SIZE(*callresult);
987                 /* We're done with the unicode()/repr() => forget it */
988                 Py_DECREF(*callresult);
989                 /* switch to next unicode()/repr() result */
990                 ++callresult;
991                 break;
992             }
993             case 'U':
994             {
995                 PyObject *obj = va_arg(vargs, PyObject *);
996                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
997                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
998                 s += size;
999                 break;
1000             }
1001             case 'V':
1002             {
1003                 PyObject *obj = va_arg(vargs, PyObject *);
1004                 const char *str = va_arg(vargs, const char *);
1005                 if (obj) {
1006                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1007                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1008                     s += size;
1009                 } else {
1010                     appendstring(str);
1011                 }
1012                 break;
1013             }
1014             case 'S':
1015             case 'R':
1016             {
1017                 const char *str = PyString_AS_STRING(*callresult);
1018                 /* unused, since we already have the result */
1019                 (void) va_arg(vargs, PyObject *);
1020                 appendstring(str);
1021                 /* We're done with the unicode()/repr() => forget it */
1022                 Py_DECREF(*callresult);
1023                 /* switch to next unicode()/repr() result */
1024                 ++callresult;
1025                 break;
1026             }
1027             case 'p':
1028                 sprintf(buffer, "%p", va_arg(vargs, void*));
1029                 /* %p is ill-defined:  ensure leading 0x. */
1030                 if (buffer[1] == 'X')
1031                     buffer[1] = 'x';
1032                 else if (buffer[1] != 'x') {
1033                     memmove(buffer+2, buffer, strlen(buffer)+1);
1034                     buffer[0] = '0';
1035                     buffer[1] = 'x';
1036                 }
1037                 appendstring(buffer);
1038                 break;
1039             case '%':
1040                 *s++ = '%';
1041                 break;
1042             default:
1043                 appendstring(p);
1044                 goto end;
1045             }
1046         } else
1047             *s++ = *f;
1048     }
1049 
1050   end:
1051     if (callresults)
1052         PyObject_Free(callresults);
1053     if (abuffer)
1054         PyObject_Free(abuffer);
1055     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1056     return string;
1057   fail:
1058     if (callresults) {
1059         PyObject **callresult2 = callresults;
1060         while (callresult2 < callresult) {
1061             Py_DECREF(*callresult2);
1062             ++callresult2;
1063         }
1064         PyObject_Free(callresults);
1065     }
1066     if (abuffer)
1067         PyObject_Free(abuffer);
1068     return NULL;
1069 }
1070 
1071 #undef appendstring
1072 
1073 PyObject *
PyUnicode_FromFormat(const char * format,...)1074 PyUnicode_FromFormat(const char *format, ...)
1075 {
1076     PyObject* ret;
1077     va_list vargs;
1078 
1079 #ifdef HAVE_STDARG_PROTOTYPES
1080     va_start(vargs, format);
1081 #else
1082     va_start(vargs);
1083 #endif
1084     ret = PyUnicode_FromFormatV(format, vargs);
1085     va_end(vargs);
1086     return ret;
1087 }
1088 
/* Copy at most `size` elements of `unicode` into the wchar_t buffer `w`.

   When `size` exceeds the string length, one extra element is copied so
   that the 0-termination comes along if possible; that extra element is
   not counted in the return value.

   Returns the number of elements copied (excluding that terminator), or
   -1 after PyErr_BadInternalCall() when `unicode` is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t length;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    length = PyUnicode_GET_SIZE(unicode);

    /* Room to spare: clamp to the string plus its terminator slot. */
    if (size > length)
        size = length + 1;

#ifdef HAVE_USABLE_WCHAR_T
    /* Py_UNICODE and wchar_t share a representation: block copy. */
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        /* Different element types: convert one element at a time. */
        register Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        register Py_ssize_t remaining = size;

        while (remaining > 0) {
            *w++ = *src++;
            remaining--;
        }
    }
#endif

    return (size > length) ? length : size;
}
1119 
1120 #endif
1121 
PyUnicode_FromOrdinal(int ordinal)1122 PyObject *PyUnicode_FromOrdinal(int ordinal)
1123 {
1124     Py_UNICODE s[1];
1125 
1126 #ifdef Py_UNICODE_WIDE
1127     if (ordinal < 0 || ordinal > 0x10ffff) {
1128         PyErr_SetString(PyExc_ValueError,
1129                         "unichr() arg not in range(0x110000) "
1130                         "(wide Python build)");
1131         return NULL;
1132     }
1133 #else
1134     if (ordinal < 0 || ordinal > 0xffff) {
1135         PyErr_SetString(PyExc_ValueError,
1136                         "unichr() arg not in range(0x10000) "
1137                         "(narrow Python build)");
1138         return NULL;
1139     }
1140 #endif
1141 
1142     s[0] = (Py_UNICODE)ordinal;
1143     return PyUnicode_FromUnicode(s, 1);
1144 }
1145 
PyUnicode_FromObject(register PyObject * obj)1146 PyObject *PyUnicode_FromObject(register PyObject *obj)
1147 {
1148     /* XXX Perhaps we should make this API an alias of
1149        PyObject_Unicode() instead ?! */
1150     if (PyUnicode_CheckExact(obj)) {
1151         Py_INCREF(obj);
1152         return obj;
1153     }
1154     if (PyUnicode_Check(obj)) {
1155         /* For a Unicode subtype that's not a Unicode object,
1156            return a true Unicode object with the same data. */
1157         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1158                                      PyUnicode_GET_SIZE(obj));
1159     }
1160     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1161 }
1162 
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1164                                       const char *encoding,
1165                                       const char *errors)
1166 {
1167     const char *s = NULL;
1168     Py_ssize_t len;
1169     PyObject *v;
1170 
1171     if (obj == NULL) {
1172         PyErr_BadInternalCall();
1173         return NULL;
1174     }
1175 
1176 #if 0
1177     /* For b/w compatibility we also accept Unicode objects provided
1178        that no encodings is given and then redirect to
1179        PyObject_Unicode() which then applies the additional logic for
1180        Unicode subclasses.
1181 
1182        NOTE: This API should really only be used for object which
1183        represent *encoded* Unicode !
1184 
1185     */
1186     if (PyUnicode_Check(obj)) {
1187         if (encoding) {
1188             PyErr_SetString(PyExc_TypeError,
1189                             "decoding Unicode is not supported");
1190             return NULL;
1191         }
1192         return PyObject_Unicode(obj);
1193     }
1194 #else
1195     if (PyUnicode_Check(obj)) {
1196         PyErr_SetString(PyExc_TypeError,
1197                         "decoding Unicode is not supported");
1198         return NULL;
1199     }
1200 #endif
1201 
1202     /* Coerce object */
1203     if (PyString_Check(obj)) {
1204         s = PyString_AS_STRING(obj);
1205         len = PyString_GET_SIZE(obj);
1206     }
1207     else if (PyByteArray_Check(obj)) {
1208         /* Python 2.x specific */
1209         PyErr_Format(PyExc_TypeError,
1210                      "decoding bytearray is not supported");
1211         return NULL;
1212     }
1213     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1214         /* Overwrite the error message with something more useful in
1215            case of a TypeError. */
1216         if (PyErr_ExceptionMatches(PyExc_TypeError))
1217             PyErr_Format(PyExc_TypeError,
1218                          "coercing to Unicode: need string or buffer, "
1219                          "%.80s found",
1220                          Py_TYPE(obj)->tp_name);
1221         goto onError;
1222     }
1223 
1224     /* Convert to Unicode */
1225     if (len == 0)
1226         _Py_RETURN_UNICODE_EMPTY();
1227 
1228     v = PyUnicode_Decode(s, len, encoding, errors);
1229     return v;
1230 
1231   onError:
1232     return NULL;
1233 }
1234 
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1235 PyObject *PyUnicode_Decode(const char *s,
1236                            Py_ssize_t size,
1237                            const char *encoding,
1238                            const char *errors)
1239 {
1240     PyObject *buffer = NULL, *unicode;
1241 
1242     if (encoding == NULL)
1243         encoding = PyUnicode_GetDefaultEncoding();
1244 
1245     /* Shortcuts for common default encodings */
1246     if (strcmp(encoding, "utf-8") == 0)
1247         return PyUnicode_DecodeUTF8(s, size, errors);
1248     else if (strcmp(encoding, "latin-1") == 0)
1249         return PyUnicode_DecodeLatin1(s, size, errors);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251     else if (strcmp(encoding, "mbcs") == 0)
1252         return PyUnicode_DecodeMBCS(s, size, errors);
1253 #endif
1254     else if (strcmp(encoding, "ascii") == 0)
1255         return PyUnicode_DecodeASCII(s, size, errors);
1256 
1257     /* Decode via the codec registry */
1258     buffer = PyBuffer_FromMemory((void *)s, size);
1259     if (buffer == NULL)
1260         goto onError;
1261     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
1262     if (unicode == NULL)
1263         goto onError;
1264     if (!PyUnicode_Check(unicode)) {
1265         PyErr_Format(PyExc_TypeError,
1266                      "decoder did not return an unicode object (type=%.400s)",
1267                      Py_TYPE(unicode)->tp_name);
1268         Py_DECREF(unicode);
1269         goto onError;
1270     }
1271     Py_DECREF(buffer);
1272     return unicode;
1273 
1274   onError:
1275     Py_XDECREF(buffer);
1276     return NULL;
1277 }
1278 
/* Run the unicode object `unicode` through the decoder registered for
   `encoding` (the default encoding when NULL) and return whatever
   object the codec produces.

   Fails with NULL when `unicode` is not a unicode object, when the
   Py3k warning about decoding unicode is turned into an error, or when
   the codec itself fails. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry; NULL propagates the codec error. */
    return _PyCodec_DecodeText(unicode, encoding, errors);
}
1305 
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1307                            Py_ssize_t size,
1308                            const char *encoding,
1309                            const char *errors)
1310 {
1311     PyObject *v, *unicode;
1312 
1313     unicode = PyUnicode_FromUnicode(s, size);
1314     if (unicode == NULL)
1315         return NULL;
1316     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1317     Py_DECREF(unicode);
1318     return v;
1319 }
1320 
/* Encode `unicode` with the codec registered for `encoding` (the
   default encoding when NULL) and return whatever object the codec
   produces; unlike PyUnicode_AsEncodedString() the result type is not
   checked here.

   Returns NULL when `unicode` is not a unicode object or the codec
   fails. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL propagates the codec error. */
    return _PyCodec_EncodeText(unicode, encoding, errors);
}
1344 
/* Encode `unicode` to a str object using `encoding` (the default
   encoding when NULL) and the error handling scheme `errors`.

   When `errors` is NULL, "utf-8", "latin-1", "ascii" (and "mbcs" on
   suitable Windows builds) are served by their built-in encoders.  All
   other cases go through the codec registry, whose result must be a
   str object or TypeError is raised.

   Returns NULL when `unicode` is not a unicode object, when the codec
   fails, or when the codec returns a non-str object. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings; only taken when no
       explicit error handler was requested. */
    if (errors == NULL) {
        if (!strcmp(encoding, "utf-8"))
            return PyUnicode_AsUTF8String(unicode);
        if (!strcmp(encoding, "latin-1"))
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (!strcmp(encoding, "mbcs"))
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (!strcmp(encoding, "ascii"))
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    if (PyString_Check(encoded))
        return encoded;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a string object (type=%.400s)",
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
1389 
/* Return `unicode` encoded with the default encoding.

   The object's `defenc` slot acts as a cache: when it is already
   filled its content is returned directly, regardless of `errors`.
   Otherwise the string is encoded via PyUnicode_AsEncodedString() and
   stored in the slot only when `errors` is NULL.

   `unicode` is cast to PyUnicodeObject without a type check.

   NOTE(review): the cached and the uncached (`errors` != NULL) paths
   appear to hand back results with different ownership -- confirm
   against callers before relying on either. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded;

    if (self->defenc)
        return self->defenc;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;

    return encoded;
}
1402 
PyUnicode_AsUnicode(PyObject * unicode)1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1404 {
1405     if (!PyUnicode_Check(unicode)) {
1406         PyErr_BadArgument();
1407         goto onError;
1408     }
1409     return PyUnicode_AS_UNICODE(unicode);
1410 
1411   onError:
1412     return NULL;
1413 }
1414 
PyUnicode_GetSize(PyObject * unicode)1415 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1416 {
1417     if (!PyUnicode_Check(unicode)) {
1418         PyErr_BadArgument();
1419         goto onError;
1420     }
1421     return PyUnicode_GET_SIZE(unicode);
1422 
1423   onError:
1424     return -1;
1425 }
1426 
PyUnicode_GetDefaultEncoding(void)1427 const char *PyUnicode_GetDefaultEncoding(void)
1428 {
1429     return unicode_default_encoding;
1430 }
1431 
PyUnicode_SetDefaultEncoding(const char * encoding)1432 int PyUnicode_SetDefaultEncoding(const char *encoding)
1433 {
1434     PyObject *v;
1435 
1436     /* Make sure the encoding is valid. As side effect, this also
1437        loads the encoding into the codec registry cache. */
1438     v = _PyCodec_Lookup(encoding);
1439     if (v == NULL)
1440         goto onError;
1441     Py_DECREF(v);
1442     strncpy(unicode_default_encoding,
1443             encoding,
1444             sizeof(unicode_default_encoding) - 1);
1445     return 0;
1446 
1447   onError:
1448     return -1;
1449 }
1450 
1451 /* error handling callback helper:
1452    build arguments, call the callback and check the arguments,
1453    if no exception occurred, copy the replacement to the output
1454    and adjust various state variables.
1455    return 0 on success, -1 on error
1456 */
1457 
1458 static
unicode_decode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char * input,Py_ssize_t insize,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,PyUnicodeObject ** output,Py_ssize_t * outpos,Py_UNICODE ** outptr)1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1460                                      const char *encoding, const char *reason,
1461                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1462                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1463                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1464 {
1465     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1466 
1467     PyObject *restuple = NULL;
1468     PyObject *repunicode = NULL;
1469     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1470     Py_ssize_t requiredsize;
1471     Py_ssize_t newpos;
1472     Py_UNICODE *repptr;
1473     Py_ssize_t repsize;
1474     int res = -1;
1475 
1476     if (*errorHandler == NULL) {
1477         *errorHandler = PyCodec_LookupError(errors);
1478         if (*errorHandler == NULL)
1479             goto onError;
1480     }
1481 
1482     if (*exceptionObject == NULL) {
1483         *exceptionObject = PyUnicodeDecodeError_Create(
1484             encoding, input, insize, *startinpos, *endinpos, reason);
1485         if (*exceptionObject == NULL)
1486             goto onError;
1487     }
1488     else {
1489         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1490             goto onError;
1491         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1492             goto onError;
1493         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1494             goto onError;
1495     }
1496 
1497     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1498     if (restuple == NULL)
1499         goto onError;
1500     if (!PyTuple_Check(restuple)) {
1501         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1502         goto onError;
1503     }
1504     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1505         goto onError;
1506     if (newpos<0)
1507         newpos = insize+newpos;
1508     if (newpos<0 || newpos>insize) {
1509         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1510         goto onError;
1511     }
1512 
1513     /* need more space? (at least enough for what we
1514        have+the replacement+the rest of the string (starting
1515        at the new input position), so we won't have to check space
1516        when there are no errors in the rest of the string) */
1517     repptr = PyUnicode_AS_UNICODE(repunicode);
1518     repsize = PyUnicode_GET_SIZE(repunicode);
1519     requiredsize = *outpos;
1520     if (requiredsize > PY_SSIZE_T_MAX - repsize)
1521         goto overflow;
1522     requiredsize += repsize;
1523     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1524         goto overflow;
1525     requiredsize += insize - newpos;
1526     if (requiredsize > outsize) {
1527         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
1528             requiredsize = 2*outsize;
1529         if (_PyUnicode_Resize(output, requiredsize) < 0)
1530             goto onError;
1531         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1532     }
1533     *endinpos = newpos;
1534     *inptr = input + newpos;
1535     Py_UNICODE_COPY(*outptr, repptr, repsize);
1536     *outptr += repsize;
1537     *outpos += repsize;
1538     /* we made it! */
1539     res = 0;
1540 
1541   onError:
1542     Py_XDECREF(restuple);
1543     return res;
1544 
1545   overflow:
1546     PyErr_SetString(PyExc_OverflowError,
1547                     "decoded result is too long for a Python string");
1548     goto onError;
1549 }
1550 
1551 /* --- UTF-7 Codec -------------------------------------------------------- */
1552 
1553 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1554 
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Non-zero iff c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                            \
    (('A' <= (c) && 'Z' >= (c)) ||              \
     ('a' <= (c) && 'z' >= (c)) ||              \
     ('0' <= (c) && '9' >= (c)) ||              \
     '+' == (c) || '/' == (c))

/* The 6-bit value of c, which must be a base-64 character:
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, and anything
   else is taken to be '/' -> 63. */

#define FROM_BASE64(c)                                          \
    (('A' <= (c) && 'Z' >= (c)) ? (c) - 'A' :                   \
     ('a' <= (c) && 'z' >= (c)) ? (c) - 'a' + 26 :              \
     ('0' <= (c) && '9' >= (c)) ? (c) - '0' + 52 :              \
     '+' == (c) ? 62 : 63)

/* The base-64 character for the bottom 6 bits of n; higher bits of n
   are masked off. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" \
     [(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                        \
    (127 >= (c) && '+' != (c))
1585 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; only 1..127 can ever be direct, and the
 *            range test runs before the table is indexed.
 * directO  : non-zero to emit Set O characters (category 1) directly.
 * directWS : non-zero to emit whitespace (category 2) directly.
 *
 * Both flags are parenthesized in the expansion so that a compound
 * argument such as `a || b` is tested as a unit instead of binding to
 * the `&&` inside the macro.  c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1631 
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1632 PyObject *PyUnicode_DecodeUTF7(const char *s,
1633                                Py_ssize_t size,
1634                                const char *errors)
1635 {
1636     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1637 }
1638 
1639 /* The decoder.  The only state we preserve is our read position,
1640  * i.e. how many characters we have consumed.  So if we end in the
1641  * middle of a shift sequence we have to back off the read position
1642  * and the output to the beginning of the sequence, otherwise we lose
1643  * all the shift state (seen bits, number of bits seen, high
1644  * surrogate). */
1645 
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1646 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1647                                        Py_ssize_t size,
1648                                        const char *errors,
1649                                        Py_ssize_t *consumed)
1650 {
1651     const char *starts = s;
1652     Py_ssize_t startinpos;
1653     Py_ssize_t endinpos;
1654     Py_ssize_t outpos;
1655     const char *e;
1656     PyUnicodeObject *unicode;
1657     Py_UNICODE *p;
1658     const char *errmsg = "";
1659     int inShift = 0;
1660     Py_UNICODE *shiftOutStart;
1661     unsigned int base64bits = 0;
1662     unsigned long base64buffer = 0;
1663     Py_UNICODE surrogate = 0;
1664     PyObject *errorHandler = NULL;
1665     PyObject *exc = NULL;
1666 
1667     unicode = _PyUnicode_New(size);
1668     if (!unicode)
1669         return NULL;
1670     if (size == 0) {
1671         if (consumed)
1672             *consumed = 0;
1673         return (PyObject *)unicode;
1674     }
1675 
1676     p = unicode->str;
1677     shiftOutStart = p;
1678     e = s + size;
1679 
1680     while (s < e) {
1681         Py_UNICODE ch = (unsigned char) *s;
1682 
1683         if (inShift) { /* in a base-64 section */
1684             if (IS_BASE64(ch)) { /* consume a base-64 character */
1685                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1686                 base64bits += 6;
1687                 s++;
1688                 if (base64bits >= 16) {
1689                     /* we have enough bits for a UTF-16 value */
1690                     Py_UNICODE outCh = (Py_UNICODE)
1691                                        (base64buffer >> (base64bits-16));
1692                     base64bits -= 16;
1693                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1694                     assert(outCh <= 0xffff);
1695                     if (surrogate) {
1696                         /* expecting a second surrogate */
1697                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1698 #ifdef Py_UNICODE_WIDE
1699                             *p++ = (((surrogate & 0x3FF)<<10)
1700                                     | (outCh & 0x3FF)) + 0x10000;
1701 #else
1702                             *p++ = surrogate;
1703                             *p++ = outCh;
1704 #endif
1705                             surrogate = 0;
1706                             continue;
1707                         }
1708                         else {
1709                             *p++ = surrogate;
1710                             surrogate = 0;
1711                         }
1712                     }
1713                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1714                         /* first surrogate */
1715                         surrogate = outCh;
1716                     }
1717                     else {
1718                         *p++ = outCh;
1719                     }
1720                 }
1721             }
1722             else { /* now leaving a base-64 section */
1723                 inShift = 0;
1724                 if (base64bits > 0) { /* left-over bits */
1725                     if (base64bits >= 6) {
1726                         /* We've seen at least one base-64 character */
1727                         s++;
1728                         errmsg = "partial character in shift sequence";
1729                         goto utf7Error;
1730                     }
1731                     else {
1732                         /* Some bits remain; they should be zero */
1733                         if (base64buffer != 0) {
1734                             s++;
1735                             errmsg = "non-zero padding bits in shift sequence";
1736                             goto utf7Error;
1737                         }
1738                     }
1739                 }
1740                 if (surrogate && DECODE_DIRECT(ch))
1741                     *p++ = surrogate;
1742                 surrogate = 0;
1743                 if (ch == '-') {
1744                     /* '-' is absorbed; other terminating
1745                        characters are preserved */
1746                     s++;
1747                 }
1748             }
1749         }
1750         else if ( ch == '+' ) {
1751             startinpos = s-starts;
1752             s++; /* consume '+' */
1753             if (s < e && *s == '-') { /* '+-' encodes '+' */
1754                 s++;
1755                 *p++ = '+';
1756             }
1757             else { /* begin base64-encoded section */
1758                 inShift = 1;
1759                 surrogate = 0;
1760                 shiftOutStart = p;
1761                 base64bits = 0;
1762                 base64buffer = 0;
1763             }
1764         }
1765         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1766             *p++ = ch;
1767             s++;
1768         }
1769         else {
1770             startinpos = s-starts;
1771             s++;
1772             errmsg = "unexpected special character";
1773             goto utf7Error;
1774         }
1775         continue;
1776 utf7Error:
1777         outpos = p-PyUnicode_AS_UNICODE(unicode);
1778         endinpos = s-starts;
1779         if (unicode_decode_call_errorhandler(
1780                 errors, &errorHandler,
1781                 "utf7", errmsg,
1782                 starts, size, &startinpos, &endinpos, &exc, &s,
1783                 &unicode, &outpos, &p))
1784             goto onError;
1785     }
1786 
1787     /* end of string */
1788 
1789     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1790         /* if we're in an inconsistent state, that's an error */
1791         inShift = 0;
1792         if (surrogate ||
1793                 (base64bits >= 6) ||
1794                 (base64bits > 0 && base64buffer != 0)) {
1795             outpos = p-PyUnicode_AS_UNICODE(unicode);
1796             endinpos = size;
1797             if (unicode_decode_call_errorhandler(
1798                     errors, &errorHandler,
1799                     "utf7", "unterminated shift sequence",
1800                     starts, size, &startinpos, &endinpos, &exc, &s,
1801                     &unicode, &outpos, &p))
1802                 goto onError;
1803         }
1804     }
1805 
1806     /* return state */
1807     if (consumed) {
1808         if (inShift) {
1809             p = shiftOutStart; /* back off output */
1810             *consumed = startinpos;
1811         }
1812         else {
1813             *consumed = s-starts;
1814         }
1815     }
1816 
1817     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1818         goto onError;
1819 
1820     Py_XDECREF(errorHandler);
1821     Py_XDECREF(exc);
1822     return (PyObject *)unicode;
1823 
1824   onError:
1825     Py_XDECREF(errorHandler);
1826     Py_XDECREF(exc);
1827     Py_DECREF(unicode);
1828     return NULL;
1829 }
1830 
1831 
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1833                                Py_ssize_t size,
1834                                int base64SetO,
1835                                int base64WhiteSpace,
1836                                const char *errors)
1837 {
1838     PyObject *v;
1839     /* It might be possible to tighten this worst case */
1840     Py_ssize_t allocated = 8 * size;
1841     int inShift = 0;
1842     Py_ssize_t i = 0;
1843     unsigned int base64bits = 0;
1844     unsigned long base64buffer = 0;
1845     char * out;
1846     char * start;
1847 
1848     if (allocated / 8 != size)
1849         return PyErr_NoMemory();
1850 
1851     if (size == 0)
1852         return PyString_FromStringAndSize(NULL, 0);
1853 
1854     v = PyString_FromStringAndSize(NULL, allocated);
1855     if (v == NULL)
1856         return NULL;
1857 
1858     start = out = PyString_AS_STRING(v);
1859     for (;i < size; ++i) {
1860         Py_UNICODE ch = s[i];
1861 
1862         if (inShift) {
1863             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864                 /* shifting out */
1865                 if (base64bits) { /* output remaining bits */
1866                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1867                     base64buffer = 0;
1868                     base64bits = 0;
1869                 }
1870                 inShift = 0;
1871                 /* Characters not in the BASE64 set implicitly unshift the sequence
1872                    so no '-' is required, except if the character is itself a '-' */
1873                 if (IS_BASE64(ch) || ch == '-') {
1874                     *out++ = '-';
1875                 }
1876                 *out++ = (char) ch;
1877             }
1878             else {
1879                 goto encode_char;
1880             }
1881         }
1882         else { /* not in a shift sequence */
1883             if (ch == '+') {
1884                 *out++ = '+';
1885                         *out++ = '-';
1886             }
1887             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1888                 *out++ = (char) ch;
1889             }
1890             else {
1891                 *out++ = '+';
1892                 inShift = 1;
1893                 goto encode_char;
1894             }
1895         }
1896         continue;
1897 encode_char:
1898 #ifdef Py_UNICODE_WIDE
1899         if (ch >= 0x10000) {
1900             /* code first surrogate */
1901             base64bits += 16;
1902             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1903             while (base64bits >= 6) {
1904                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905                 base64bits -= 6;
1906             }
1907             /* prepare second surrogate */
1908             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1909         }
1910 #endif
1911         base64bits += 16;
1912         base64buffer = (base64buffer << 16) | ch;
1913         while (base64bits >= 6) {
1914             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1915             base64bits -= 6;
1916         }
1917     }
1918     if (base64bits)
1919         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1920     if (inShift)
1921         *out++ = '-';
1922 
1923     if (_PyString_Resize(&v, out - start))
1924         return NULL;
1925     return v;
1926 }
1927 
1928 #undef IS_BASE64
1929 #undef FROM_BASE64
1930 #undef TO_BASE64
1931 #undef DECODE_DIRECT
1932 #undef ENCODE_DIRECT
1933 
1934 /* --- UTF-8 Codec -------------------------------------------------------- */
1935 
/* Lookup table indexed by the first byte of a UTF-8 sequence; each entry
   is the total number of bytes in that sequence.  An entry of zero marks
   a byte that is not a legal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */

    /* 80-BF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */

    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */

    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */

    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
1957 
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1958 PyObject *PyUnicode_DecodeUTF8(const char *s,
1959                                Py_ssize_t size,
1960                                const char *errors)
1961 {
1962     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1963 }
1964 
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1965 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1966                                        Py_ssize_t size,
1967                                        const char *errors,
1968                                        Py_ssize_t *consumed)
1969 {
1970     const char *starts = s;
1971     int n;
1972     int k;
1973     Py_ssize_t startinpos;
1974     Py_ssize_t endinpos;
1975     Py_ssize_t outpos;
1976     const char *e;
1977     PyUnicodeObject *unicode;
1978     Py_UNICODE *p;
1979     const char *errmsg = "";
1980     PyObject *errorHandler = NULL;
1981     PyObject *exc = NULL;
1982 
1983     /* Note: size will always be longer than the resulting Unicode
1984        character count */
1985     unicode = _PyUnicode_New(size);
1986     if (!unicode)
1987         return NULL;
1988     if (size == 0) {
1989         if (consumed)
1990             *consumed = 0;
1991         return (PyObject *)unicode;
1992     }
1993 
1994     /* Unpack UTF-8 encoded data */
1995     p = unicode->str;
1996     e = s + size;
1997 
1998     while (s < e) {
1999         Py_UCS4 ch = (unsigned char)*s;
2000 
2001         if (ch < 0x80) {
2002             *p++ = (Py_UNICODE)ch;
2003             s++;
2004             continue;
2005         }
2006 
2007         n = utf8_code_length[ch];
2008 
2009         if (s + n > e) {
2010             if (consumed)
2011                 break;
2012             else {
2013                 errmsg = "unexpected end of data";
2014                 startinpos = s-starts;
2015                 endinpos = startinpos+1;
2016                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2017                     endinpos++;
2018                 goto utf8Error;
2019             }
2020         }
2021 
2022         switch (n) {
2023 
2024         case 0:
2025             errmsg = "invalid start byte";
2026             startinpos = s-starts;
2027             endinpos = startinpos+1;
2028             goto utf8Error;
2029 
2030         case 1:
2031             errmsg = "internal error";
2032             startinpos = s-starts;
2033             endinpos = startinpos+1;
2034             goto utf8Error;
2035 
2036         case 2:
2037             if ((s[1] & 0xc0) != 0x80) {
2038                 errmsg = "invalid continuation byte";
2039                 startinpos = s-starts;
2040                 endinpos = startinpos + 1;
2041                 goto utf8Error;
2042             }
2043             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2044             assert ((ch > 0x007F) && (ch <= 0x07FF));
2045             *p++ = (Py_UNICODE)ch;
2046             break;
2047 
2048         case 3:
2049             /* XXX: surrogates shouldn't be valid UTF-8!
2050                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2051                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2052                Uncomment the 2 lines below to make them invalid,
2053                code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2054             if ((s[1] & 0xc0) != 0x80 ||
2055                 (s[2] & 0xc0) != 0x80 ||
2056                 ((unsigned char)s[0] == 0xE0 &&
2057                  (unsigned char)s[1] < 0xA0)/* ||
2058                 ((unsigned char)s[0] == 0xED &&
2059                  (unsigned char)s[1] > 0x9F)*/) {
2060                 errmsg = "invalid continuation byte";
2061                 startinpos = s-starts;
2062                 endinpos = startinpos + 1;
2063 
2064                 /* if s[1] first two bits are 1 and 0, then the invalid
2065                    continuation byte is s[2], so increment endinpos by 1,
2066                    if not, s[1] is invalid and endinpos doesn't need to
2067                    be incremented. */
2068                 if ((s[1] & 0xC0) == 0x80)
2069                     endinpos++;
2070                 goto utf8Error;
2071             }
2072             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2073             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2074             *p++ = (Py_UNICODE)ch;
2075             break;
2076 
2077         case 4:
2078             if ((s[1] & 0xc0) != 0x80 ||
2079                 (s[2] & 0xc0) != 0x80 ||
2080                 (s[3] & 0xc0) != 0x80 ||
2081                 ((unsigned char)s[0] == 0xF0 &&
2082                  (unsigned char)s[1] < 0x90) ||
2083                 ((unsigned char)s[0] == 0xF4 &&
2084                  (unsigned char)s[1] > 0x8F)) {
2085                 errmsg = "invalid continuation byte";
2086                 startinpos = s-starts;
2087                 endinpos = startinpos + 1;
2088                 if ((s[1] & 0xC0) == 0x80) {
2089                     endinpos++;
2090                     if ((s[2] & 0xC0) == 0x80)
2091                         endinpos++;
2092                 }
2093                 goto utf8Error;
2094             }
2095             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2096                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2097             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2098 
2099 #ifdef Py_UNICODE_WIDE
2100             *p++ = (Py_UNICODE)ch;
2101 #else
2102             /*  compute and append the two surrogates: */
2103 
2104             /*  translate from 10000..10FFFF to 0..FFFF */
2105             ch -= 0x10000;
2106 
2107             /*  high surrogate = top 10 bits added to D800 */
2108             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2109 
2110             /*  low surrogate = bottom 10 bits added to DC00 */
2111             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2112 #endif
2113             break;
2114         }
2115         s += n;
2116         continue;
2117 
2118       utf8Error:
2119         outpos = p-PyUnicode_AS_UNICODE(unicode);
2120         if (unicode_decode_call_errorhandler(
2121                 errors, &errorHandler,
2122                 "utf8", errmsg,
2123                 starts, size, &startinpos, &endinpos, &exc, &s,
2124                 &unicode, &outpos, &p))
2125             goto onError;
2126     }
2127     if (consumed)
2128         *consumed = s-starts;
2129 
2130     /* Adjust length */
2131     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2132         goto onError;
2133 
2134     Py_XDECREF(errorHandler);
2135     Py_XDECREF(exc);
2136     return (PyObject *)unicode;
2137 
2138   onError:
2139     Py_XDECREF(errorHandler);
2140     Py_XDECREF(exc);
2141     Py_DECREF(unicode);
2142     return NULL;
2143 }
2144 
2145 /* Allocation strategy:  if the string is short, convert into a stack buffer
2146    and allocate exactly as much space needed at the end.  Else allocate the
2147    maximum possible needed (4 result bytes per Unicode character), and return
2148    the excess memory at the end.
2149 */
2150 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2152                      Py_ssize_t size,
2153                      const char *errors)
2154 {
2155 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2156 
2157     Py_ssize_t i;           /* index into s of next input byte */
2158     PyObject *v;        /* result string object */
2159     char *p;            /* next free byte in output buffer */
2160     Py_ssize_t nallocated;  /* number of result bytes allocated */
2161     Py_ssize_t nneeded;        /* number of result bytes needed */
2162     char stackbuf[MAX_SHORT_UNICHARS * 4];
2163 
2164     assert(s != NULL);
2165     assert(size >= 0);
2166 
2167     if (size <= MAX_SHORT_UNICHARS) {
2168         /* Write into the stack buffer; nallocated can't overflow.
2169          * At the end, we'll allocate exactly as much heap space as it
2170          * turns out we need.
2171          */
2172         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2173         v = NULL;   /* will allocate after we're done */
2174         p = stackbuf;
2175     }
2176     else {
2177         /* Overallocate on the heap, and give the excess back at the end. */
2178         nallocated = size * 4;
2179         if (nallocated / 4 != size)  /* overflow! */
2180             return PyErr_NoMemory();
2181         v = PyString_FromStringAndSize(NULL, nallocated);
2182         if (v == NULL)
2183             return NULL;
2184         p = PyString_AS_STRING(v);
2185     }
2186 
2187     for (i = 0; i < size;) {
2188         Py_UCS4 ch = s[i++];
2189 
2190         if (ch < 0x80)
2191             /* Encode ASCII */
2192             *p++ = (char) ch;
2193 
2194         else if (ch < 0x0800) {
2195             /* Encode Latin-1 */
2196             *p++ = (char)(0xc0 | (ch >> 6));
2197             *p++ = (char)(0x80 | (ch & 0x3f));
2198         }
2199         else {
2200             /* Encode UCS2 Unicode ordinals */
2201             if (ch < 0x10000) {
2202                 /* Special case: check for high surrogate */
2203                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2204                     Py_UCS4 ch2 = s[i];
2205                     /* Check for low surrogate and combine the two to
2206                        form a UCS4 value */
2207                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2208                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2209                         i++;
2210                         goto encodeUCS4;
2211                     }
2212                     /* Fall through: handles isolated high surrogates */
2213                 }
2214                 *p++ = (char)(0xe0 | (ch >> 12));
2215                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2216                 *p++ = (char)(0x80 | (ch & 0x3f));
2217                 continue;
2218             }
2219           encodeUCS4:
2220             /* Encode UCS4 Unicode ordinals */
2221             *p++ = (char)(0xf0 | (ch >> 18));
2222             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2223             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2224             *p++ = (char)(0x80 | (ch & 0x3f));
2225         }
2226     }
2227 
2228     if (v == NULL) {
2229         /* This was stack allocated. */
2230         nneeded = p - stackbuf;
2231         assert(nneeded <= nallocated);
2232         v = PyString_FromStringAndSize(stackbuf, nneeded);
2233     }
2234     else {
2235         /* Cut back to size actually needed. */
2236         nneeded = p - PyString_AS_STRING(v);
2237         assert(nneeded <= nallocated);
2238         if (_PyString_Resize(&v, nneeded))
2239             return NULL;
2240     }
2241     return v;
2242 
2243 #undef MAX_SHORT_UNICHARS
2244 }
2245 
/* Return the UTF-8 encoding of a Unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are passed to PyUnicode_EncodeUTF8() with a NULL `errors` argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
2256 
2257 /* --- UTF-32 Codec ------------------------------------------------------- */
2258 
2259 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2260 PyUnicode_DecodeUTF32(const char *s,
2261                       Py_ssize_t size,
2262                       const char *errors,
2263                       int *byteorder)
2264 {
2265     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2266 }
2267 
2268 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2269 PyUnicode_DecodeUTF32Stateful(const char *s,
2270                               Py_ssize_t size,
2271                               const char *errors,
2272                               int *byteorder,
2273                               Py_ssize_t *consumed)
2274 {
2275     const char *starts = s;
2276     Py_ssize_t startinpos;
2277     Py_ssize_t endinpos;
2278     Py_ssize_t outpos;
2279     PyUnicodeObject *unicode;
2280     Py_UNICODE *p;
2281 #ifndef Py_UNICODE_WIDE
2282     int pairs = 0;
2283     const unsigned char *qq;
2284 #else
2285     const int pairs = 0;
2286 #endif
2287     const unsigned char *q, *e;
2288     int bo = 0;       /* assume native ordering by default */
2289     const char *errmsg = "";
2290     /* Offsets from q for retrieving bytes in the right order. */
2291 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292     int iorder[] = {0, 1, 2, 3};
2293 #else
2294     int iorder[] = {3, 2, 1, 0};
2295 #endif
2296     PyObject *errorHandler = NULL;
2297     PyObject *exc = NULL;
2298 
2299     q = (unsigned char *)s;
2300     e = q + size;
2301 
2302     if (byteorder)
2303         bo = *byteorder;
2304 
2305     /* Check for BOM marks (U+FEFF) in the input and adjust current
2306        byte order setting accordingly. In native mode, the leading BOM
2307        mark is skipped, in all other modes, it is copied to the output
2308        stream as-is (giving a ZWNBSP character). */
2309     if (bo == 0) {
2310         if (size >= 4) {
2311             const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2312                 (q[iorder[1]] << 8) | q[iorder[0]];
2313 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2314             if (bom == 0x0000FEFF) {
2315                 q += 4;
2316                 bo = -1;
2317             }
2318             else if (bom == 0xFFFE0000) {
2319                 q += 4;
2320                 bo = 1;
2321             }
2322 #else
2323             if (bom == 0x0000FEFF) {
2324                 q += 4;
2325                 bo = 1;
2326             }
2327             else if (bom == 0xFFFE0000) {
2328                 q += 4;
2329                 bo = -1;
2330             }
2331 #endif
2332         }
2333     }
2334 
2335     if (bo == -1) {
2336         /* force LE */
2337         iorder[0] = 0;
2338         iorder[1] = 1;
2339         iorder[2] = 2;
2340         iorder[3] = 3;
2341     }
2342     else if (bo == 1) {
2343         /* force BE */
2344         iorder[0] = 3;
2345         iorder[1] = 2;
2346         iorder[2] = 1;
2347         iorder[3] = 0;
2348     }
2349 
2350     /* On narrow builds we split characters outside the BMP into two
2351        code points => count how much extra space we need. */
2352 #ifndef Py_UNICODE_WIDE
2353     for (qq = q; e - qq >= 4; qq += 4)
2354         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2355             pairs++;
2356 #endif
2357 
2358     /* This might be one to much, because of a BOM */
2359     unicode = _PyUnicode_New((size+3)/4+pairs);
2360     if (!unicode)
2361         return NULL;
2362     if (size == 0)
2363         return (PyObject *)unicode;
2364 
2365     /* Unpack UTF-32 encoded data */
2366     p = unicode->str;
2367 
2368     while (q < e) {
2369         Py_UCS4 ch;
2370         /* remaining bytes at the end? (size should be divisible by 4) */
2371         if (e-q<4) {
2372             if (consumed)
2373                 break;
2374             errmsg = "truncated data";
2375             startinpos = ((const char *)q)-starts;
2376             endinpos = ((const char *)e)-starts;
2377             goto utf32Error;
2378             /* The remaining input chars are ignored if the callback
2379                chooses to skip the input */
2380         }
2381         ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2382             (q[iorder[1]] << 8) | q[iorder[0]];
2383 
2384         if (ch >= 0x110000)
2385         {
2386             errmsg = "code point not in range(0x110000)";
2387             startinpos = ((const char *)q)-starts;
2388             endinpos = startinpos+4;
2389             goto utf32Error;
2390         }
2391 #ifndef Py_UNICODE_WIDE
2392         if (ch >= 0x10000)
2393         {
2394             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2395             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2396         }
2397         else
2398 #endif
2399             *p++ = ch;
2400         q += 4;
2401         continue;
2402       utf32Error:
2403         outpos = p-PyUnicode_AS_UNICODE(unicode);
2404         if (unicode_decode_call_errorhandler(
2405                 errors, &errorHandler,
2406                 "utf32", errmsg,
2407                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2408                 &unicode, &outpos, &p))
2409             goto onError;
2410     }
2411 
2412     if (byteorder)
2413         *byteorder = bo;
2414 
2415     if (consumed)
2416         *consumed = (const char *)q-starts;
2417 
2418     /* Adjust length */
2419     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2420         goto onError;
2421 
2422     Py_XDECREF(errorHandler);
2423     Py_XDECREF(exc);
2424     return (PyObject *)unicode;
2425 
2426   onError:
2427     Py_DECREF(unicode);
2428     Py_XDECREF(errorHandler);
2429     Py_XDECREF(exc);
2430     return NULL;
2431 }
2432 
2433 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2435                       Py_ssize_t size,
2436                       const char *errors,
2437                       int byteorder)
2438 {
2439     PyObject *v;
2440     unsigned char *p;
2441     Py_ssize_t nsize, bytesize;
2442 #ifndef Py_UNICODE_WIDE
2443     Py_ssize_t i, pairs;
2444 #else
2445     const int pairs = 0;
2446 #endif
2447     /* Offsets from p for storing byte pairs in the right order. */
2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449     int iorder[] = {0, 1, 2, 3};
2450 #else
2451     int iorder[] = {3, 2, 1, 0};
2452 #endif
2453 
2454 #define STORECHAR(CH)                           \
2455     do {                                        \
2456         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2457         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2458         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2459         p[iorder[0]] = (CH) & 0xff;             \
2460         p += 4;                                 \
2461     } while(0)
2462 
2463     /* In narrow builds we can output surrogate pairs as one code point,
2464        so we need less space. */
2465 #ifndef Py_UNICODE_WIDE
2466     for (i = pairs = 0; i < size-1; i++)
2467         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2468             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2469             pairs++;
2470 #endif
2471     nsize = (size - pairs + (byteorder == 0));
2472     bytesize = nsize * 4;
2473     if (bytesize / 4 != nsize)
2474         return PyErr_NoMemory();
2475     v = PyString_FromStringAndSize(NULL, bytesize);
2476     if (v == NULL)
2477         return NULL;
2478 
2479     p = (unsigned char *)PyString_AS_STRING(v);
2480     if (byteorder == 0)
2481         STORECHAR(0xFEFF);
2482     if (size == 0)
2483         return v;
2484 
2485     if (byteorder == -1) {
2486         /* force LE */
2487         iorder[0] = 0;
2488         iorder[1] = 1;
2489         iorder[2] = 2;
2490         iorder[3] = 3;
2491     }
2492     else if (byteorder == 1) {
2493         /* force BE */
2494         iorder[0] = 3;
2495         iorder[1] = 2;
2496         iorder[2] = 1;
2497         iorder[3] = 0;
2498     }
2499 
2500     while (size-- > 0) {
2501         Py_UCS4 ch = *s++;
2502 #ifndef Py_UNICODE_WIDE
2503         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2504             Py_UCS4 ch2 = *s;
2505             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2506                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2507                 s++;
2508                 size--;
2509             }
2510         }
2511 #endif
2512         STORECHAR(ch);
2513     }
2514     return v;
2515 #undef STORECHAR
2516 }
2517 
/* Return the UTF-32 encoding of a Unicode object.

   If `unicode` does not pass PyUnicode_Check(), PyErr_BadArgument() is
   called and NULL is returned.  Otherwise the object's buffer and size
   are passed to PyUnicode_EncodeUTF32() with a NULL `errors` argument
   and a byteorder of 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    PyErr_BadArgument();
    return NULL;
}
2529 
2530 /* --- UTF-16 Codec ------------------------------------------------------- */
2531 
2532 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2533 PyUnicode_DecodeUTF16(const char *s,
2534                       Py_ssize_t size,
2535                       const char *errors,
2536                       int *byteorder)
2537 {
2538     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2539 }
2540 
2541 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2542 PyUnicode_DecodeUTF16Stateful(const char *s,
2543                               Py_ssize_t size,
2544                               const char *errors,
2545                               int *byteorder,
2546                               Py_ssize_t *consumed)
2547 {
2548     const char *starts = s;
2549     Py_ssize_t startinpos;
2550     Py_ssize_t endinpos;
2551     Py_ssize_t outpos;
2552     PyUnicodeObject *unicode;
2553     Py_UNICODE *p;
2554     const unsigned char *q, *e;
2555     int bo = 0;       /* assume native ordering by default */
2556     const char *errmsg = "";
2557     /* Offsets from q for retrieving byte pairs in the right order. */
2558 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2559     int ihi = 1, ilo = 0;
2560 #else
2561     int ihi = 0, ilo = 1;
2562 #endif
2563     PyObject *errorHandler = NULL;
2564     PyObject *exc = NULL;
2565 
2566     /* Note: size will always be longer than the resulting Unicode
2567        character count */
2568     unicode = _PyUnicode_New(size);
2569     if (!unicode)
2570         return NULL;
2571     if (size == 0)
2572         return (PyObject *)unicode;
2573 
2574     /* Unpack UTF-16 encoded data */
2575     p = unicode->str;
2576     q = (unsigned char *)s;
2577     e = q + size;
2578 
2579     if (byteorder)
2580         bo = *byteorder;
2581 
2582     /* Check for BOM marks (U+FEFF) in the input and adjust current
2583        byte order setting accordingly. In native mode, the leading BOM
2584        mark is skipped, in all other modes, it is copied to the output
2585        stream as-is (giving a ZWNBSP character). */
2586     if (bo == 0) {
2587         if (size >= 2) {
2588             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2589 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2590             if (bom == 0xFEFF) {
2591                 q += 2;
2592                 bo = -1;
2593             }
2594             else if (bom == 0xFFFE) {
2595                 q += 2;
2596                 bo = 1;
2597             }
2598 #else
2599             if (bom == 0xFEFF) {
2600                 q += 2;
2601                 bo = 1;
2602             }
2603             else if (bom == 0xFFFE) {
2604                 q += 2;
2605                 bo = -1;
2606             }
2607 #endif
2608         }
2609     }
2610 
2611     if (bo == -1) {
2612         /* force LE */
2613         ihi = 1;
2614         ilo = 0;
2615     }
2616     else if (bo == 1) {
2617         /* force BE */
2618         ihi = 0;
2619         ilo = 1;
2620     }
2621 
2622     while (q < e) {
2623         Py_UNICODE ch;
2624         /* remaining bytes at the end? (size should be even) */
2625         if (e-q<2) {
2626             if (consumed)
2627                 break;
2628             errmsg = "truncated data";
2629             startinpos = ((const char *)q)-starts;
2630             endinpos = ((const char *)e)-starts;
2631             goto utf16Error;
2632             /* The remaining input chars are ignored if the callback
2633                chooses to skip the input */
2634         }
2635         ch = (q[ihi] << 8) | q[ilo];
2636 
2637         q += 2;
2638 
2639         if (ch < 0xD800 || ch > 0xDFFF) {
2640             *p++ = ch;
2641             continue;
2642         }
2643 
2644         /* UTF-16 code pair: */
2645         if (e - q < 2) {
2646             q -= 2;
2647             if (consumed)
2648                 break;
2649             errmsg = "unexpected end of data";
2650             startinpos = ((const char *)q)-starts;
2651             endinpos = ((const char *)e)-starts;
2652             goto utf16Error;
2653         }
2654         if (0xD800 <= ch && ch <= 0xDBFF) {
2655             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2656             q += 2;
2657             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2658 #ifndef Py_UNICODE_WIDE
2659                 *p++ = ch;
2660                 *p++ = ch2;
2661 #else
2662                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2663 #endif
2664                 continue;
2665             }
2666             else {
2667                 errmsg = "illegal UTF-16 surrogate";
2668                 startinpos = (((const char *)q)-4)-starts;
2669                 endinpos = startinpos+2;
2670                 goto utf16Error;
2671             }
2672 
2673         }
2674         errmsg = "illegal encoding";
2675         startinpos = (((const char *)q)-2)-starts;
2676         endinpos = startinpos+2;
2677         /* Fall through to report the error */
2678 
2679       utf16Error:
2680         outpos = p-PyUnicode_AS_UNICODE(unicode);
2681         if (unicode_decode_call_errorhandler(
2682                 errors, &errorHandler,
2683                 "utf16", errmsg,
2684                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2685                 &unicode, &outpos, &p))
2686             goto onError;
2687     }
2688 
2689     if (byteorder)
2690         *byteorder = bo;
2691 
2692     if (consumed)
2693         *consumed = (const char *)q-starts;
2694 
2695     /* Adjust length */
2696     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2697         goto onError;
2698 
2699     Py_XDECREF(errorHandler);
2700     Py_XDECREF(exc);
2701     return (PyObject *)unicode;
2702 
2703   onError:
2704     Py_DECREF(unicode);
2705     Py_XDECREF(errorHandler);
2706     Py_XDECREF(exc);
2707     return NULL;
2708 }
2709 
2710 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2712                       Py_ssize_t size,
2713                       const char *errors,
2714                       int byteorder)
2715 {
2716     PyObject *v;
2717     unsigned char *p;
2718     Py_ssize_t nsize, bytesize;
2719 #ifdef Py_UNICODE_WIDE
2720     Py_ssize_t i, pairs;
2721 #else
2722     const int pairs = 0;
2723 #endif
2724     /* Offsets from p for storing byte pairs in the right order. */
2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2726     int ihi = 1, ilo = 0;
2727 #else
2728     int ihi = 0, ilo = 1;
2729 #endif
2730 
2731 #define STORECHAR(CH)                           \
2732     do {                                        \
2733         p[ihi] = ((CH) >> 8) & 0xff;            \
2734         p[ilo] = (CH) & 0xff;                   \
2735         p += 2;                                 \
2736     } while(0)
2737 
2738 #ifdef Py_UNICODE_WIDE
2739     for (i = pairs = 0; i < size; i++)
2740         if (s[i] >= 0x10000)
2741             pairs++;
2742 #endif
2743     /* 2 * (size + pairs + (byteorder == 0)) */
2744     if (size > PY_SSIZE_T_MAX ||
2745         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2746         return PyErr_NoMemory();
2747     nsize = size + pairs + (byteorder == 0);
2748     bytesize = nsize * 2;
2749     if (bytesize / 2 != nsize)
2750         return PyErr_NoMemory();
2751     v = PyString_FromStringAndSize(NULL, bytesize);
2752     if (v == NULL)
2753         return NULL;
2754 
2755     p = (unsigned char *)PyString_AS_STRING(v);
2756     if (byteorder == 0)
2757         STORECHAR(0xFEFF);
2758     if (size == 0)
2759         return v;
2760 
2761     if (byteorder == -1) {
2762         /* force LE */
2763         ihi = 1;
2764         ilo = 0;
2765     }
2766     else if (byteorder == 1) {
2767         /* force BE */
2768         ihi = 0;
2769         ilo = 1;
2770     }
2771 
2772     while (size-- > 0) {
2773         Py_UNICODE ch = *s++;
2774         Py_UNICODE ch2 = 0;
2775 #ifdef Py_UNICODE_WIDE
2776         if (ch >= 0x10000) {
2777             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2778             ch  = 0xD800 | ((ch-0x10000) >> 10);
2779         }
2780 #endif
2781         STORECHAR(ch);
2782         if (ch2)
2783             STORECHAR(ch2);
2784     }
2785     return v;
2786 #undef STORECHAR
2787 }
2788 
/* Encode a unicode object as UTF-16 by handing its buffer and length to
   PyUnicode_EncodeUTF16() with errors == NULL and byteorder == 0.
   Anything that fails PyUnicode_Check() is rejected via
   PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    PyErr_BadArgument();
    return NULL;
}
2800 
2801 /* --- Unicode Escape Codec ----------------------------------------------- */
2802 
2803 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2804 
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2806                                         Py_ssize_t size,
2807                                         const char *errors)
2808 {
2809     const char *starts = s;
2810     Py_ssize_t startinpos;
2811     Py_ssize_t endinpos;
2812     Py_ssize_t outpos;
2813     PyUnicodeObject *v;
2814     Py_UNICODE *p;
2815     const char *end;
2816     char* message;
2817     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2818     PyObject *errorHandler = NULL;
2819     PyObject *exc = NULL;
2820 
2821     /* Escaped strings will always be longer than the resulting
2822        Unicode string, so we start with size here and then reduce the
2823        length after conversion to the true value.
2824        (but if the error callback returns a long replacement string
2825        we'll have to allocate more space) */
2826     v = _PyUnicode_New(size);
2827     if (v == NULL)
2828         goto onError;
2829     if (size == 0)
2830         return (PyObject *)v;
2831 
2832     p = PyUnicode_AS_UNICODE(v);
2833     end = s + size;
2834 
2835     while (s < end) {
2836         unsigned char c;
2837         Py_UNICODE x;
2838         int digits;
2839 
2840         /* Non-escape characters are interpreted as Unicode ordinals */
2841         if (*s != '\\') {
2842             *p++ = (unsigned char) *s++;
2843             continue;
2844         }
2845 
2846         startinpos = s-starts;
2847         /* \ - Escapes */
2848         s++;
2849         c = *s++;
2850         if (s > end)
2851             c = '\0'; /* Invalid after \ */
2852         switch (c) {
2853 
2854             /* \x escapes */
2855         case '\n': break;
2856         case '\\': *p++ = '\\'; break;
2857         case '\'': *p++ = '\''; break;
2858         case '\"': *p++ = '\"'; break;
2859         case 'b': *p++ = '\b'; break;
2860         case 'f': *p++ = '\014'; break; /* FF */
2861         case 't': *p++ = '\t'; break;
2862         case 'n': *p++ = '\n'; break;
2863         case 'r': *p++ = '\r'; break;
2864         case 'v': *p++ = '\013'; break; /* VT */
2865         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2866 
2867             /* \OOO (octal) escapes */
2868         case '0': case '1': case '2': case '3':
2869         case '4': case '5': case '6': case '7':
2870             x = s[-1] - '0';
2871             if (s < end && '0' <= *s && *s <= '7') {
2872                 x = (x<<3) + *s++ - '0';
2873                 if (s < end && '0' <= *s && *s <= '7')
2874                     x = (x<<3) + *s++ - '0';
2875             }
2876             *p++ = x;
2877             break;
2878 
2879             /* hex escapes */
2880             /* \xXX */
2881         case 'x':
2882             digits = 2;
2883             message = "truncated \\xXX escape";
2884             goto hexescape;
2885 
2886             /* \uXXXX */
2887         case 'u':
2888             digits = 4;
2889             message = "truncated \\uXXXX escape";
2890             goto hexescape;
2891 
2892             /* \UXXXXXXXX */
2893         case 'U':
2894             digits = 8;
2895             message = "truncated \\UXXXXXXXX escape";
2896         hexescape:
2897             chr = 0;
2898             if (end - s < digits) {
2899                 /* count only hex digits */
2900                 for (; s < end; ++s) {
2901                     c = (unsigned char)*s;
2902                     if (!Py_ISXDIGIT(c))
2903                         goto error;
2904                 }
2905                 goto error;
2906             }
2907             for (; digits--; ++s) {
2908                 c = (unsigned char)*s;
2909                 if (!Py_ISXDIGIT(c))
2910                     goto error;
2911                 chr = (chr<<4) & ~0xF;
2912                 if (c >= '0' && c <= '9')
2913                     chr += c - '0';
2914                 else if (c >= 'a' && c <= 'f')
2915                     chr += 10 + c - 'a';
2916                 else
2917                     chr += 10 + c - 'A';
2918             }
2919             if (chr == 0xffffffff && PyErr_Occurred())
2920                 /* _decoding_error will have already written into the
2921                    target buffer. */
2922                 break;
2923         store:
2924             /* when we get here, chr is a 32-bit unicode character */
2925             if (chr <= 0xffff)
2926                 /* UCS-2 character */
2927                 *p++ = (Py_UNICODE) chr;
2928             else if (chr <= 0x10ffff) {
2929                 /* UCS-4 character. Either store directly, or as
2930                    surrogate pair. */
2931 #ifdef Py_UNICODE_WIDE
2932                 *p++ = chr;
2933 #else
2934                 chr -= 0x10000L;
2935                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2936                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2937 #endif
2938             } else {
2939                 message = "illegal Unicode character";
2940                 goto error;
2941             }
2942             break;
2943 
2944             /* \N{name} */
2945         case 'N':
2946             message = "malformed \\N character escape";
2947             if (ucnhash_CAPI == NULL) {
2948                 /* load the unicode data module */
2949                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2950                 if (ucnhash_CAPI == NULL)
2951                     goto ucnhashError;
2952             }
2953             if (*s == '{') {
2954                 const char *start = s+1;
2955                 /* look for the closing brace */
2956                 while (*s != '}' && s < end)
2957                     s++;
2958                 if (s > start && s < end && *s == '}') {
2959                     /* found a name.  look it up in the unicode database */
2960                     message = "unknown Unicode character name";
2961                     s++;
2962                     if (s - start - 1 <= INT_MAX &&
2963                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2964                         goto store;
2965                 }
2966             }
2967             goto error;
2968 
2969         default:
2970             if (s > end) {
2971                 message = "\\ at end of string";
2972                 s--;
2973                 goto error;
2974             }
2975             else {
2976                 *p++ = '\\';
2977                 *p++ = (unsigned char)s[-1];
2978             }
2979             break;
2980         }
2981         continue;
2982 
2983       error:
2984         endinpos = s-starts;
2985         outpos = p-PyUnicode_AS_UNICODE(v);
2986         if (unicode_decode_call_errorhandler(
2987                 errors, &errorHandler,
2988                 "unicodeescape", message,
2989                 starts, size, &startinpos, &endinpos, &exc, &s,
2990                 &v, &outpos, &p))
2991             goto onError;
2992         continue;
2993     }
2994     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2995         goto onError;
2996     Py_XDECREF(errorHandler);
2997     Py_XDECREF(exc);
2998     return (PyObject *)v;
2999 
3000   ucnhashError:
3001     PyErr_SetString(
3002         PyExc_UnicodeError,
3003         "\\N escapes not supported (can't load unicodedata module)"
3004         );
3005     Py_XDECREF(v);
3006     Py_XDECREF(errorHandler);
3007     Py_XDECREF(exc);
3008     return NULL;
3009 
3010   onError:
3011     Py_XDECREF(v);
3012     Py_XDECREF(errorHandler);
3013     Py_XDECREF(exc);
3014     return NULL;
3015 }
3016 
3017 /* Return a Unicode-Escape string version of the Unicode object.
3018 
3019    If quotes is true, the string is enclosed in u"" or u'' quotes as
3020    appropriate.
3021 
3022 */
3023 
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3025                                              Py_ssize_t size,
3026                                              Py_UNICODE ch)
3027 {
3028     /* like wcschr, but doesn't stop at NULL characters */
3029 
3030     while (size-- > 0) {
3031         if (*s == ch)
3032             return s;
3033         s++;
3034     }
3035 
3036     return NULL;
3037 }
3038 
3039 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)3040 PyObject *unicodeescape_string(const Py_UNICODE *s,
3041                                Py_ssize_t size,
3042                                int quotes)
3043 {
3044     PyObject *repr;
3045     char *p;
3046 
3047     static const char *hexdigit = "0123456789abcdef";
3048 #ifdef Py_UNICODE_WIDE
3049     const Py_ssize_t expandsize = 10;
3050 #else
3051     const Py_ssize_t expandsize = 6;
3052 #endif
3053 
3054     /* XXX(nnorwitz): rather than over-allocating, it would be
3055        better to choose a different scheme.  Perhaps scan the
3056        first N-chars of the string and allocate based on that size.
3057     */
3058     /* Initial allocation is based on the longest-possible unichr
3059        escape.
3060 
3061        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3062        unichr, so in this case it's the longest unichr escape. In
3063        narrow (UTF-16) builds this is five chars per source unichr
3064        since there are two unichrs in the surrogate pair, so in narrow
3065        (UTF-16) builds it's not the longest unichr escape.
3066 
3067        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3068        so in the narrow (UTF-16) build case it's the longest unichr
3069        escape.
3070     */
3071 
3072     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3073         return PyErr_NoMemory();
3074 
3075     repr = PyString_FromStringAndSize(NULL,
3076                                       2
3077                                       + expandsize*size
3078                                       + 1);
3079     if (repr == NULL)
3080         return NULL;
3081 
3082     p = PyString_AS_STRING(repr);
3083 
3084     if (quotes) {
3085         *p++ = 'u';
3086         *p++ = (findchar(s, size, '\'') &&
3087                 !findchar(s, size, '"')) ? '"' : '\'';
3088     }
3089     while (size-- > 0) {
3090         Py_UNICODE ch = *s++;
3091 
3092         /* Escape quotes and backslashes */
3093         if ((quotes &&
3094              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3095             *p++ = '\\';
3096             *p++ = (char) ch;
3097             continue;
3098         }
3099 
3100 #ifdef Py_UNICODE_WIDE
3101         /* Map 21-bit characters to '\U00xxxxxx' */
3102         else if (ch >= 0x10000) {
3103             *p++ = '\\';
3104             *p++ = 'U';
3105             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3106             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3107             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3108             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3109             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3110             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3111             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3112             *p++ = hexdigit[ch & 0x0000000F];
3113             continue;
3114         }
3115 #else
3116         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3117         else if (ch >= 0xD800 && ch < 0xDC00) {
3118             Py_UNICODE ch2;
3119             Py_UCS4 ucs;
3120 
3121             ch2 = *s++;
3122             size--;
3123             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3124                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3125                 *p++ = '\\';
3126                 *p++ = 'U';
3127                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3128                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3129                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3130                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3131                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3132                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3133                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3134                 *p++ = hexdigit[ucs & 0x0000000F];
3135                 continue;
3136             }
3137             /* Fall through: isolated surrogates are copied as-is */
3138             s--;
3139             size++;
3140         }
3141 #endif
3142 
3143         /* Map 16-bit characters to '\uxxxx' */
3144         if (ch >= 256) {
3145             *p++ = '\\';
3146             *p++ = 'u';
3147             *p++ = hexdigit[(ch >> 12) & 0x000F];
3148             *p++ = hexdigit[(ch >> 8) & 0x000F];
3149             *p++ = hexdigit[(ch >> 4) & 0x000F];
3150             *p++ = hexdigit[ch & 0x000F];
3151         }
3152 
3153         /* Map special whitespace to '\t', \n', '\r' */
3154         else if (ch == '\t') {
3155             *p++ = '\\';
3156             *p++ = 't';
3157         }
3158         else if (ch == '\n') {
3159             *p++ = '\\';
3160             *p++ = 'n';
3161         }
3162         else if (ch == '\r') {
3163             *p++ = '\\';
3164             *p++ = 'r';
3165         }
3166 
3167         /* Map non-printable US ASCII to '\xhh' */
3168         else if (ch < ' ' || ch >= 0x7F) {
3169             *p++ = '\\';
3170             *p++ = 'x';
3171             *p++ = hexdigit[(ch >> 4) & 0x000F];
3172             *p++ = hexdigit[ch & 0x000F];
3173         }
3174 
3175         /* Copy everything else as-is */
3176         else
3177             *p++ = (char) ch;
3178     }
3179     if (quotes)
3180         *p++ = PyString_AS_STRING(repr)[1];
3181 
3182     *p = '\0';
3183     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3184         return NULL;
3185     return repr;
3186 }
3187 
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3189                                         Py_ssize_t size)
3190 {
3191     return unicodeescape_string(s, size, 0);
3192 }
3193 
/* Unicode-Escape encode a unicode object by passing its buffer and length
   to PyUnicode_EncodeUnicodeEscape().  Anything that fails
   PyUnicode_Check() is rejected via PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                             PyUnicode_GET_SIZE(unicode));
    }

    PyErr_BadArgument();
    return NULL;
}
3203 
3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3205 
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3207                                            Py_ssize_t size,
3208                                            const char *errors)
3209 {
3210     const char *starts = s;
3211     Py_ssize_t startinpos;
3212     Py_ssize_t endinpos;
3213     Py_ssize_t outpos;
3214     PyUnicodeObject *v;
3215     Py_UNICODE *p;
3216     const char *end;
3217     const char *bs;
3218     PyObject *errorHandler = NULL;
3219     PyObject *exc = NULL;
3220 
3221     /* Escaped strings will always be longer than the resulting
3222        Unicode string, so we start with size here and then reduce the
3223        length after conversion to the true value. (But decoding error
3224        handler might have to resize the string) */
3225     v = _PyUnicode_New(size);
3226     if (v == NULL)
3227         goto onError;
3228     if (size == 0)
3229         return (PyObject *)v;
3230     p = PyUnicode_AS_UNICODE(v);
3231     end = s + size;
3232     while (s < end) {
3233         unsigned char c;
3234         Py_UCS4 x;
3235         int i;
3236         int count;
3237 
3238         /* Non-escape characters are interpreted as Unicode ordinals */
3239         if (*s != '\\') {
3240             *p++ = (unsigned char)*s++;
3241             continue;
3242         }
3243         startinpos = s-starts;
3244 
3245         /* \u-escapes are only interpreted iff the number of leading
3246            backslashes if odd */
3247         bs = s;
3248         for (;s < end;) {
3249             if (*s != '\\')
3250                 break;
3251             *p++ = (unsigned char)*s++;
3252         }
3253         if (((s - bs) & 1) == 0 ||
3254             s >= end ||
3255             (*s != 'u' && *s != 'U')) {
3256             continue;
3257         }
3258         p--;
3259         count = *s=='u' ? 4 : 8;
3260         s++;
3261 
3262         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3263         outpos = p-PyUnicode_AS_UNICODE(v);
3264         for (x = 0, i = 0; i < count; ++i, ++s) {
3265             c = (unsigned char)*s;
3266             if (!isxdigit(c)) {
3267                 endinpos = s-starts;
3268                 if (unicode_decode_call_errorhandler(
3269                         errors, &errorHandler,
3270                         "rawunicodeescape", "truncated \\uXXXX",
3271                         starts, size, &startinpos, &endinpos, &exc, &s,
3272                         &v, &outpos, &p))
3273                     goto onError;
3274                 goto nextByte;
3275             }
3276             x = (x<<4) & ~0xF;
3277             if (c >= '0' && c <= '9')
3278                 x += c - '0';
3279             else if (c >= 'a' && c <= 'f')
3280                 x += 10 + c - 'a';
3281             else
3282                 x += 10 + c - 'A';
3283         }
3284         if (x <= 0xffff)
3285             /* UCS-2 character */
3286             *p++ = (Py_UNICODE) x;
3287         else if (x <= 0x10ffff) {
3288             /* UCS-4 character. Either store directly, or as
3289                surrogate pair. */
3290 #ifdef Py_UNICODE_WIDE
3291             *p++ = (Py_UNICODE) x;
3292 #else
3293             x -= 0x10000L;
3294             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3295             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3296 #endif
3297         } else {
3298             endinpos = s-starts;
3299             outpos = p-PyUnicode_AS_UNICODE(v);
3300             if (unicode_decode_call_errorhandler(
3301                     errors, &errorHandler,
3302                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3303                     starts, size, &startinpos, &endinpos, &exc, &s,
3304                     &v, &outpos, &p))
3305                 goto onError;
3306         }
3307       nextByte:
3308         ;
3309     }
3310     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3311         goto onError;
3312     Py_XDECREF(errorHandler);
3313     Py_XDECREF(exc);
3314     return (PyObject *)v;
3315 
3316   onError:
3317     Py_XDECREF(v);
3318     Py_XDECREF(errorHandler);
3319     Py_XDECREF(exc);
3320     return NULL;
3321 }
3322 
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3324                                            Py_ssize_t size)
3325 {
3326     PyObject *repr;
3327     char *p;
3328     char *q;
3329 
3330     static const char *hexdigit = "0123456789abcdef";
3331 #ifdef Py_UNICODE_WIDE
3332     const Py_ssize_t expandsize = 10;
3333 #else
3334     const Py_ssize_t expandsize = 6;
3335 #endif
3336 
3337     if (size > PY_SSIZE_T_MAX / expandsize)
3338         return PyErr_NoMemory();
3339 
3340     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3341     if (repr == NULL)
3342         return NULL;
3343     if (size == 0)
3344         return repr;
3345 
3346     p = q = PyString_AS_STRING(repr);
3347     while (size-- > 0) {
3348         Py_UNICODE ch = *s++;
3349 #ifdef Py_UNICODE_WIDE
3350         /* Map 32-bit characters to '\Uxxxxxxxx' */
3351         if (ch >= 0x10000) {
3352             *p++ = '\\';
3353             *p++ = 'U';
3354             *p++ = hexdigit[(ch >> 28) & 0xf];
3355             *p++ = hexdigit[(ch >> 24) & 0xf];
3356             *p++ = hexdigit[(ch >> 20) & 0xf];
3357             *p++ = hexdigit[(ch >> 16) & 0xf];
3358             *p++ = hexdigit[(ch >> 12) & 0xf];
3359             *p++ = hexdigit[(ch >> 8) & 0xf];
3360             *p++ = hexdigit[(ch >> 4) & 0xf];
3361             *p++ = hexdigit[ch & 15];
3362         }
3363         else
3364 #else
3365             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3366             if (ch >= 0xD800 && ch < 0xDC00) {
3367                 Py_UNICODE ch2;
3368                 Py_UCS4 ucs;
3369 
3370                 ch2 = *s++;
3371                 size--;
3372                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3373                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3374                     *p++ = '\\';
3375                     *p++ = 'U';
3376                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3377                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3378                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3379                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3380                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3381                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3382                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3383                     *p++ = hexdigit[ucs & 0xf];
3384                     continue;
3385                 }
3386                 /* Fall through: isolated surrogates are copied as-is */
3387                 s--;
3388                 size++;
3389             }
3390 #endif
3391         /* Map 16-bit characters to '\uxxxx' */
3392         if (ch >= 256) {
3393             *p++ = '\\';
3394             *p++ = 'u';
3395             *p++ = hexdigit[(ch >> 12) & 0xf];
3396             *p++ = hexdigit[(ch >> 8) & 0xf];
3397             *p++ = hexdigit[(ch >> 4) & 0xf];
3398             *p++ = hexdigit[ch & 15];
3399         }
3400         /* Copy everything else as-is */
3401         else
3402             *p++ = (char) ch;
3403     }
3404     *p = '\0';
3405     if (_PyString_Resize(&repr, p - q))
3406         return NULL;
3407     return repr;
3408 }
3409 
/* Raw-Unicode-Escape encode a unicode object by passing its buffer and
   length to PyUnicode_EncodeRawUnicodeEscape().  Anything that fails
   PyUnicode_Check() is rejected via PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                                PyUnicode_GET_SIZE(unicode));
    }

    PyErr_BadArgument();
    return NULL;
}
3419 
3420 /* --- Unicode Internal Codec ------------------------------------------- */
3421 
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3423                                            Py_ssize_t size,
3424                                            const char *errors)
3425 {
3426     const char *starts = s;
3427     Py_ssize_t startinpos;
3428     Py_ssize_t endinpos;
3429     Py_ssize_t outpos;
3430     PyUnicodeObject *v;
3431     Py_UNICODE *p;
3432     const char *end;
3433     const char *reason;
3434     PyObject *errorHandler = NULL;
3435     PyObject *exc = NULL;
3436 
3437 #ifdef Py_UNICODE_WIDE
3438     Py_UNICODE unimax = PyUnicode_GetMax();
3439 #endif
3440 
3441     /* XXX overflow detection missing */
3442     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3443     if (v == NULL)
3444         goto onError;
3445     if (PyUnicode_GetSize((PyObject *)v) == 0)
3446         return (PyObject *)v;
3447     p = PyUnicode_AS_UNICODE(v);
3448     end = s + size;
3449 
3450     while (s < end) {
3451         if (end-s < Py_UNICODE_SIZE) {
3452             endinpos = end-starts;
3453             reason = "truncated input";
3454             goto error;
3455         }
3456         memcpy(p, s, sizeof(Py_UNICODE));
3457 #ifdef Py_UNICODE_WIDE
3458         /* We have to sanity check the raw data, otherwise doom looms for
3459            some malformed UCS-4 data. */
3460         if (*p > unimax || *p < 0) {
3461             endinpos = s - starts + Py_UNICODE_SIZE;
3462             reason = "illegal code point (> 0x10FFFF)";
3463             goto error;
3464         }
3465 #endif
3466         p++;
3467         s += Py_UNICODE_SIZE;
3468         continue;
3469 
3470   error:
3471         startinpos = s - starts;
3472         outpos = p - PyUnicode_AS_UNICODE(v);
3473         if (unicode_decode_call_errorhandler(
3474                 errors, &errorHandler,
3475                 "unicode_internal", reason,
3476                 starts, size, &startinpos, &endinpos, &exc, &s,
3477                 &v, &outpos, &p)) {
3478             goto onError;
3479         }
3480     }
3481 
3482     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3483         goto onError;
3484     Py_XDECREF(errorHandler);
3485     Py_XDECREF(exc);
3486     return (PyObject *)v;
3487 
3488   onError:
3489     Py_XDECREF(v);
3490     Py_XDECREF(errorHandler);
3491     Py_XDECREF(exc);
3492     return NULL;
3493 }
3494 
3495 /* --- Latin-1 Codec ------------------------------------------------------ */
3496 
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3497 PyObject *PyUnicode_DecodeLatin1(const char *s,
3498                                  Py_ssize_t size,
3499                                  const char *errors)
3500 {
3501     PyUnicodeObject *v;
3502     Py_UNICODE *p;
3503 
3504     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3505     if (size == 1) {
3506         Py_UNICODE r = *(unsigned char*)s;
3507         return PyUnicode_FromUnicode(&r, 1);
3508     }
3509 
3510     v = _PyUnicode_New(size);
3511     if (v == NULL)
3512         goto onError;
3513     if (size == 0)
3514         return (PyObject *)v;
3515     p = PyUnicode_AS_UNICODE(v);
3516     while (size-- > 0)
3517         *p++ = (unsigned char)*s++;
3518     return (PyObject *)v;
3519 
3520   onError:
3521     Py_XDECREF(v);
3522     return NULL;
3523 }
3524 
3525 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3526 static void make_encode_exception(PyObject **exceptionObject,
3527                                   const char *encoding,
3528                                   const Py_UNICODE *unicode, Py_ssize_t size,
3529                                   Py_ssize_t startpos, Py_ssize_t endpos,
3530                                   const char *reason)
3531 {
3532     if (*exceptionObject == NULL) {
3533         *exceptionObject = PyUnicodeEncodeError_Create(
3534             encoding, unicode, size, startpos, endpos, reason);
3535     }
3536     else {
3537         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3538             goto onError;
3539         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3540             goto onError;
3541         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3542             goto onError;
3543         return;
3544       onError:
3545         Py_CLEAR(*exceptionObject);
3546     }
3547 }
3548 
3549 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3550 static void raise_encode_exception(PyObject **exceptionObject,
3551                                    const char *encoding,
3552                                    const Py_UNICODE *unicode, Py_ssize_t size,
3553                                    Py_ssize_t startpos, Py_ssize_t endpos,
3554                                    const char *reason)
3555 {
3556     make_encode_exception(exceptionObject,
3557                           encoding, unicode, size, startpos, endpos, reason);
3558     if (*exceptionObject != NULL)
3559         PyCodec_StrictErrors(*exceptionObject);
3560 }
3561 
3562 /* error handling callback helper:
3563    build arguments, call the callback and check the arguments,
3564    put the result into newpos and return the replacement string, which
3565    has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3566 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3567                                                   PyObject **errorHandler,
3568                                                   const char *encoding, const char *reason,
3569                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3570                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3571                                                   Py_ssize_t *newpos)
3572 {
3573     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3574 
3575     PyObject *restuple;
3576     PyObject *resunicode;
3577 
3578     if (*errorHandler == NULL) {
3579         *errorHandler = PyCodec_LookupError(errors);
3580         if (*errorHandler == NULL)
3581             return NULL;
3582     }
3583 
3584     make_encode_exception(exceptionObject,
3585                           encoding, unicode, size, startpos, endpos, reason);
3586     if (*exceptionObject == NULL)
3587         return NULL;
3588 
3589     restuple = PyObject_CallFunctionObjArgs(
3590         *errorHandler, *exceptionObject, NULL);
3591     if (restuple == NULL)
3592         return NULL;
3593     if (!PyTuple_Check(restuple)) {
3594         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3595         Py_DECREF(restuple);
3596         return NULL;
3597     }
3598     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3599                           &resunicode, newpos)) {
3600         Py_DECREF(restuple);
3601         return NULL;
3602     }
3603     if (*newpos<0)
3604         *newpos = size+*newpos;
3605     if (*newpos<0 || *newpos>size) {
3606         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3607         Py_DECREF(restuple);
3608         return NULL;
3609     }
3610     Py_INCREF(resunicode);
3611     Py_DECREF(restuple);
3612     return resunicode;
3613 }
3614 
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3616                                      Py_ssize_t size,
3617                                      const char *errors,
3618                                      int limit)
3619 {
3620     /* output object */
3621     PyObject *res;
3622     /* pointers to the beginning and end+1 of input */
3623     const Py_UNICODE *startp = p;
3624     const Py_UNICODE *endp = p + size;
3625     /* pointer to the beginning of the unencodable characters */
3626     /* const Py_UNICODE *badp = NULL; */
3627     /* pointer into the output */
3628     char *str;
3629     /* current output position */
3630     Py_ssize_t respos = 0;
3631     Py_ssize_t ressize;
3632     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3633     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3634     PyObject *errorHandler = NULL;
3635     PyObject *exc = NULL;
3636     /* the following variable is used for caching string comparisons
3637      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3638     int known_errorHandler = -1;
3639 
3640     /* allocate enough for a simple encoding without
3641        replacements, if we need more, we'll resize */
3642     res = PyString_FromStringAndSize(NULL, size);
3643     if (res == NULL)
3644         goto onError;
3645     if (size == 0)
3646         return res;
3647     str = PyString_AS_STRING(res);
3648     ressize = size;
3649 
3650     while (p<endp) {
3651         Py_UNICODE c = *p;
3652 
3653         /* can we encode this? */
3654         if (c<limit) {
3655             /* no overflow check, because we know that the space is enough */
3656             *str++ = (char)c;
3657             ++p;
3658         }
3659         else {
3660             Py_ssize_t unicodepos = p-startp;
3661             Py_ssize_t requiredsize;
3662             PyObject *repunicode;
3663             Py_ssize_t repsize;
3664             Py_ssize_t newpos;
3665             Py_ssize_t respos;
3666             Py_UNICODE *uni2;
3667             /* startpos for collecting unencodable chars */
3668             const Py_UNICODE *collstart = p;
3669             const Py_UNICODE *collend = p;
3670             /* find all unecodable characters */
3671             while ((collend < endp) && ((*collend) >= limit))
3672                 ++collend;
3673             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3674             if (known_errorHandler==-1) {
3675                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3676                     known_errorHandler = 1;
3677                 else if (!strcmp(errors, "replace"))
3678                     known_errorHandler = 2;
3679                 else if (!strcmp(errors, "ignore"))
3680                     known_errorHandler = 3;
3681                 else if (!strcmp(errors, "xmlcharrefreplace"))
3682                     known_errorHandler = 4;
3683                 else
3684                     known_errorHandler = 0;
3685             }
3686             switch (known_errorHandler) {
3687             case 1: /* strict */
3688                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3689                 goto onError;
3690             case 2: /* replace */
3691                 while (collstart++ < collend)
3692                     *str++ = '?'; /* fall through */
3693             case 3: /* ignore */
3694                 p = collend;
3695                 break;
3696             case 4: /* xmlcharrefreplace */
3697                 respos = str - PyString_AS_STRING(res);
3698                 /* determine replacement size (temporarily (mis)uses p) */
3699                 requiredsize = respos;
3700                 for (p = collstart; p < collend;) {
3701                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3702                     Py_ssize_t incr;
3703                     if (ch < 10)
3704                         incr = 2+1+1;
3705                     else if (ch < 100)
3706                         incr = 2+2+1;
3707                     else if (ch < 1000)
3708                         incr = 2+3+1;
3709                     else if (ch < 10000)
3710                         incr = 2+4+1;
3711                     else if (ch < 100000)
3712                         incr = 2+5+1;
3713                     else if (ch < 1000000)
3714                         incr = 2+6+1;
3715                     else
3716                         incr = 2+7+1;
3717                     if (requiredsize > PY_SSIZE_T_MAX - incr)
3718                         goto overflow;
3719                     requiredsize += incr;
3720                 }
3721                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3722                     goto overflow;
3723                 requiredsize += endp - collend;
3724                 if (requiredsize > ressize) {
3725                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3726                         requiredsize = 2*ressize;
3727                     if (_PyString_Resize(&res, requiredsize))
3728                         goto onError;
3729                     str = PyString_AS_STRING(res) + respos;
3730                     ressize = requiredsize;
3731                 }
3732                 /* generate replacement (temporarily (mis)uses p) */
3733                 for (p = collstart; p < collend;) {
3734                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3735                     str += sprintf(str, "&#%d;", (int)ch);
3736                 }
3737                 p = collend;
3738                 break;
3739             default:
3740                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741                                                               encoding, reason, startp, size, &exc,
3742                                                               collstart-startp, collend-startp, &newpos);
3743                 if (repunicode == NULL)
3744                     goto onError;
3745                 /* need more space? (at least enough for what we have+the
3746                    replacement+the rest of the string, so we won't have to
3747                    check space for encodable characters) */
3748                 respos = str - PyString_AS_STRING(res);
3749                 repsize = PyUnicode_GET_SIZE(repunicode);
3750                 if (respos > PY_SSIZE_T_MAX - repsize)
3751                     goto overflow;
3752                 requiredsize = respos + repsize;
3753                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3754                     goto overflow;
3755                 requiredsize += endp - collend;
3756                 if (requiredsize > ressize) {
3757                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3758                         requiredsize = 2*ressize;
3759                     if (_PyString_Resize(&res, requiredsize)) {
3760                         Py_DECREF(repunicode);
3761                         goto onError;
3762                     }
3763                     str = PyString_AS_STRING(res) + respos;
3764                     ressize = requiredsize;
3765                 }
3766                 /* check if there is anything unencodable in the replacement
3767                    and copy it to the output */
3768                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3769                     c = *uni2;
3770                     if (c >= limit) {
3771                         raise_encode_exception(&exc, encoding, startp, size,
3772                                                unicodepos, unicodepos+1, reason);
3773                         Py_DECREF(repunicode);
3774                         goto onError;
3775                     }
3776                     *str = (char)c;
3777                 }
3778                 p = startp + newpos;
3779                 Py_DECREF(repunicode);
3780             }
3781         }
3782     }
3783     /* Resize if we allocated to much */
3784     respos = str - PyString_AS_STRING(res);
3785     if (respos < ressize)
3786         /* If this falls res will be NULL */
3787         _PyString_Resize(&res, respos);
3788     Py_XDECREF(errorHandler);
3789     Py_XDECREF(exc);
3790     return res;
3791 
3792   overflow:
3793     PyErr_SetString(PyExc_OverflowError,
3794                     "encoded result is too long for a Python string");
3795 
3796   onError:
3797     Py_XDECREF(res);
3798     Py_XDECREF(errorHandler);
3799     Py_XDECREF(exc);
3800     return NULL;
3801 }
3802 
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3804                                  Py_ssize_t size,
3805                                  const char *errors)
3806 {
3807     return unicode_encode_ucs1(p, size, errors, 256);
3808 }
3809 
/* Encode a unicode object as Latin-1 with errors == NULL, which the shared
   encoder treats like "strict".  A non-unicode argument is reported through
   PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
3820 
3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3822 
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3823 PyObject *PyUnicode_DecodeASCII(const char *s,
3824                                 Py_ssize_t size,
3825                                 const char *errors)
3826 {
3827     const char *starts = s;
3828     PyUnicodeObject *v;
3829     Py_UNICODE *p;
3830     Py_ssize_t startinpos;
3831     Py_ssize_t endinpos;
3832     Py_ssize_t outpos;
3833     const char *e;
3834     PyObject *errorHandler = NULL;
3835     PyObject *exc = NULL;
3836 
3837     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3838     if (size == 1 && *(unsigned char*)s < 128) {
3839         Py_UNICODE r = *(unsigned char*)s;
3840         return PyUnicode_FromUnicode(&r, 1);
3841     }
3842 
3843     v = _PyUnicode_New(size);
3844     if (v == NULL)
3845         goto onError;
3846     if (size == 0)
3847         return (PyObject *)v;
3848     p = PyUnicode_AS_UNICODE(v);
3849     e = s + size;
3850     while (s < e) {
3851         register unsigned char c = (unsigned char)*s;
3852         if (c < 128) {
3853             *p++ = c;
3854             ++s;
3855         }
3856         else {
3857             startinpos = s-starts;
3858             endinpos = startinpos + 1;
3859             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3860             if (unicode_decode_call_errorhandler(
3861                     errors, &errorHandler,
3862                     "ascii", "ordinal not in range(128)",
3863                     starts, size, &startinpos, &endinpos, &exc, &s,
3864                     &v, &outpos, &p))
3865                 goto onError;
3866         }
3867     }
3868     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3869         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3870             goto onError;
3871     Py_XDECREF(errorHandler);
3872     Py_XDECREF(exc);
3873     return (PyObject *)v;
3874 
3875   onError:
3876     Py_XDECREF(v);
3877     Py_XDECREF(errorHandler);
3878     Py_XDECREF(exc);
3879     return NULL;
3880 }
3881 
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3883                                 Py_ssize_t size,
3884                                 const char *errors)
3885 {
3886     return unicode_encode_ucs1(p, size, errors, 128);
3887 }
3888 
/* Encode a unicode object as ASCII with errors == NULL, which the shared
   encoder treats like "strict".  A non-unicode argument is reported through
   PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
3899 
3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3901 
3902 /* --- MBCS codecs for Windows -------------------------------------------- */
3903 
3904 #if SIZEOF_INT < SIZEOF_SIZE_T
3905 #define NEED_RETRY
3906 #endif
3907 
3908 /* XXX This code is limited to "true" double-byte encodings, as
3909    a) it assumes an incomplete character consists of a single byte, and
3910    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3911    encodings, see IsDBCSLeadByteEx documentation. */
3912 
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects it.  Otherwise the
   previous character position is looked up with CharPrev() and 1 is
   returned when there is no previous character, when the previous
   character does not start with a lead byte, or when it is two bytes wide;
   in the remaining case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3923 
3924 /*
3925  * Decode MBCS string into unicode object. If 'final' is set, converts
3926  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3927  */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3928 static int decode_mbcs(PyUnicodeObject **v,
3929                        const char *s, /* MBCS string */
3930                        int size, /* sizeof MBCS string */
3931                        int final)
3932 {
3933     Py_UNICODE *p;
3934     Py_ssize_t n = 0;
3935     int usize = 0;
3936 
3937     assert(size >= 0);
3938 
3939     /* Skip trailing lead-byte unless 'final' is set */
3940     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3941         --size;
3942 
3943     /* First get the size of the result */
3944     if (size > 0) {
3945         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3946         if (usize == 0) {
3947             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3948             return -1;
3949         }
3950     }
3951 
3952     if (*v == NULL) {
3953         /* Create unicode object */
3954         *v = _PyUnicode_New(usize);
3955         if (*v == NULL)
3956             return -1;
3957     }
3958     else {
3959         /* Extend unicode object */
3960         n = PyUnicode_GET_SIZE(*v);
3961         if (_PyUnicode_Resize(v, n + usize) < 0)
3962             return -1;
3963     }
3964 
3965     /* Do the conversion */
3966     if (size > 0) {
3967         p = PyUnicode_AS_UNICODE(*v) + n;
3968         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3969             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3970             return -1;
3971         }
3972     }
3973 
3974     return size;
3975 }
3976 
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)3977 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3978                                        Py_ssize_t size,
3979                                        const char *errors,
3980                                        Py_ssize_t *consumed)
3981 {
3982     PyUnicodeObject *v = NULL;
3983     int done;
3984 
3985     if (consumed)
3986         *consumed = 0;
3987 
3988 #ifdef NEED_RETRY
3989   retry:
3990     if (size > INT_MAX)
3991         done = decode_mbcs(&v, s, INT_MAX, 0);
3992     else
3993 #endif
3994         done = decode_mbcs(&v, s, (int)size, !consumed);
3995 
3996     if (done < 0) {
3997         Py_XDECREF(v);
3998         return NULL;
3999     }
4000 
4001     if (consumed)
4002         *consumed += done;
4003 
4004 #ifdef NEED_RETRY
4005     if (size > INT_MAX) {
4006         s += done;
4007         size -= done;
4008         goto retry;
4009     }
4010 #endif
4011 
4012     return (PyObject *)v;
4013 }
4014 
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)4015 PyObject *PyUnicode_DecodeMBCS(const char *s,
4016                                Py_ssize_t size,
4017                                const char *errors)
4018 {
4019     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4020 }
4021 
4022 /*
4023  * Convert unicode into string object (MBCS).
4024  * Returns 0 if succeed, -1 otherwise.
4025  */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)4026 static int encode_mbcs(PyObject **repr,
4027                        const Py_UNICODE *p, /* unicode */
4028                        int size) /* size of unicode */
4029 {
4030     int mbcssize = 0;
4031     Py_ssize_t n = 0;
4032 
4033     assert(size >= 0);
4034 
4035     /* First get the size of the result */
4036     if (size > 0) {
4037         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4038         if (mbcssize == 0) {
4039             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4040             return -1;
4041         }
4042     }
4043 
4044     if (*repr == NULL) {
4045         /* Create string object */
4046         *repr = PyString_FromStringAndSize(NULL, mbcssize);
4047         if (*repr == NULL)
4048             return -1;
4049     }
4050     else {
4051         /* Extend string object */
4052         n = PyString_Size(*repr);
4053         if (_PyString_Resize(repr, n + mbcssize) < 0)
4054             return -1;
4055     }
4056 
4057     /* Do the conversion */
4058     if (size > 0) {
4059         char *s = PyString_AS_STRING(*repr) + n;
4060         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4061             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4062             return -1;
4063         }
4064     }
4065 
4066     return 0;
4067 }
4068 
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)4069 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4070                                Py_ssize_t size,
4071                                const char *errors)
4072 {
4073     PyObject *repr = NULL;
4074     int ret;
4075 
4076 #ifdef NEED_RETRY
4077   retry:
4078     if (size > INT_MAX)
4079         ret = encode_mbcs(&repr, p, INT_MAX);
4080     else
4081 #endif
4082         ret = encode_mbcs(&repr, p, (int)size);
4083 
4084     if (ret < 0) {
4085         Py_XDECREF(repr);
4086         return NULL;
4087     }
4088 
4089 #ifdef NEED_RETRY
4090     if (size > INT_MAX) {
4091         p += INT_MAX;
4092         size -= INT_MAX;
4093         goto retry;
4094     }
4095 #endif
4096 
4097     return repr;
4098 }
4099 
/* Encode a unicode object as MBCS, passing errors == NULL to the encoder.
   A non-unicode argument is reported through PyErr_BadArgument() and NULL
   is returned. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4110 
4111 #undef NEED_RETRY
4112 
4113 #endif /* MS_WINDOWS */
4114 
4115 /* --- Character Mapping Codec -------------------------------------------- */
4116 
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)4117 PyObject *PyUnicode_DecodeCharmap(const char *s,
4118                                   Py_ssize_t size,
4119                                   PyObject *mapping,
4120                                   const char *errors)
4121 {
4122     const char *starts = s;
4123     Py_ssize_t startinpos;
4124     Py_ssize_t endinpos;
4125     Py_ssize_t outpos;
4126     const char *e;
4127     PyUnicodeObject *v;
4128     Py_UNICODE *p;
4129     Py_ssize_t extrachars = 0;
4130     PyObject *errorHandler = NULL;
4131     PyObject *exc = NULL;
4132     Py_UNICODE *mapstring = NULL;
4133     Py_ssize_t maplen = 0;
4134 
4135     /* Default to Latin-1 */
4136     if (mapping == NULL)
4137         return PyUnicode_DecodeLatin1(s, size, errors);
4138 
4139     v = _PyUnicode_New(size);
4140     if (v == NULL)
4141         goto onError;
4142     if (size == 0)
4143         return (PyObject *)v;
4144     p = PyUnicode_AS_UNICODE(v);
4145     e = s + size;
4146     if (PyUnicode_CheckExact(mapping)) {
4147         mapstring = PyUnicode_AS_UNICODE(mapping);
4148         maplen = PyUnicode_GET_SIZE(mapping);
4149         while (s < e) {
4150             unsigned char ch = *s;
4151             Py_UNICODE x = 0xfffe; /* illegal value */
4152 
4153             if (ch < maplen)
4154                 x = mapstring[ch];
4155 
4156             if (x == 0xfffe) {
4157                 /* undefined mapping */
4158                 outpos = p-PyUnicode_AS_UNICODE(v);
4159                 startinpos = s-starts;
4160                 endinpos = startinpos+1;
4161                 if (unicode_decode_call_errorhandler(
4162                         errors, &errorHandler,
4163                         "charmap", "character maps to <undefined>",
4164                         starts, size, &startinpos, &endinpos, &exc, &s,
4165                         &v, &outpos, &p)) {
4166                     goto onError;
4167                 }
4168                 continue;
4169             }
4170             *p++ = x;
4171             ++s;
4172         }
4173     }
4174     else {
4175         while (s < e) {
4176             unsigned char ch = *s;
4177             PyObject *w, *x;
4178 
4179             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4180             w = PyInt_FromLong((long)ch);
4181             if (w == NULL)
4182                 goto onError;
4183             x = PyObject_GetItem(mapping, w);
4184             Py_DECREF(w);
4185             if (x == NULL) {
4186                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4187                     /* No mapping found means: mapping is undefined. */
4188                     PyErr_Clear();
4189                     goto Undefined;
4190                 } else
4191                     goto onError;
4192             }
4193 
4194             /* Apply mapping */
4195             if (x == Py_None)
4196                 goto Undefined;
4197             if (PyInt_Check(x)) {
4198                 long value = PyInt_AS_LONG(x);
4199                 if (value == 0xFFFE)
4200                     goto Undefined;
4201                 if (value < 0 || value > 0x10FFFF) {
4202                     PyErr_SetString(PyExc_TypeError,
4203                                     "character mapping must be in range(0x110000)");
4204                     Py_DECREF(x);
4205                     goto onError;
4206                 }
4207 
4208 #ifndef Py_UNICODE_WIDE
4209                 if (value > 0xFFFF) {
4210                     /* see the code for 1-n mapping below */
4211                     if (extrachars < 2) {
4212                         /* resize first */
4213                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4214                         Py_ssize_t needed = 10 - extrachars;
4215                         extrachars += needed;
4216                         /* XXX overflow detection missing */
4217                         if (_PyUnicode_Resize(&v,
4218                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4219                             Py_DECREF(x);
4220                             goto onError;
4221                         }
4222                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4223                     }
4224                     value -= 0x10000;
4225                     *p++ = 0xD800 | (value >> 10);
4226                     *p++ = 0xDC00 | (value & 0x3FF);
4227                     extrachars -= 2;
4228                 }
4229                 else
4230 #endif
4231                 *p++ = (Py_UNICODE)value;
4232             }
4233             else if (PyUnicode_Check(x)) {
4234                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4235 
4236                 if (targetsize == 1) {
4237                     /* 1-1 mapping */
4238                     Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4239                     if (value == 0xFFFE)
4240                         goto Undefined;
4241                     *p++ = value;
4242                 }
4243                 else if (targetsize > 1) {
4244                     /* 1-n mapping */
4245                     if (targetsize > extrachars) {
4246                         /* resize first */
4247                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4248                         Py_ssize_t needed = (targetsize - extrachars) + \
4249                             (targetsize << 2);
4250                         extrachars += needed;
4251                         /* XXX overflow detection missing */
4252                         if (_PyUnicode_Resize(&v,
4253                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4254                             Py_DECREF(x);
4255                             goto onError;
4256                         }
4257                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4258                     }
4259                     Py_UNICODE_COPY(p,
4260                                     PyUnicode_AS_UNICODE(x),
4261                                     targetsize);
4262                     p += targetsize;
4263                     extrachars -= targetsize;
4264                 }
4265                 /* 1-0 mapping: skip the character */
4266             }
4267             else {
4268                 /* wrong return value */
4269                 PyErr_SetString(PyExc_TypeError,
4270                                 "character mapping must return integer, None or unicode");
4271                 Py_DECREF(x);
4272                 goto onError;
4273             }
4274             Py_DECREF(x);
4275             ++s;
4276             continue;
4277 Undefined:
4278             /* undefined mapping */
4279             Py_XDECREF(x);
4280             outpos = p-PyUnicode_AS_UNICODE(v);
4281             startinpos = s-starts;
4282             endinpos = startinpos+1;
4283             if (unicode_decode_call_errorhandler(
4284                     errors, &errorHandler,
4285                     "charmap", "character maps to <undefined>",
4286                     starts, size, &startinpos, &endinpos, &exc, &s,
4287                     &v, &outpos, &p)) {
4288                 goto onError;
4289             }
4290         }
4291     }
4292     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4293         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4294             goto onError;
4295     Py_XDECREF(errorHandler);
4296     Py_XDECREF(exc);
4297     return (PyObject *)v;
4298 
4299   onError:
4300     Py_XDECREF(errorHandler);
4301     Py_XDECREF(exc);
4302     Py_XDECREF(v);
4303     return NULL;
4304 }
4305 
4306 /* Charmap encoding: the lookup table */
4307 
4308 struct encoding_map{
4309     PyObject_HEAD
4310     unsigned char level1[32];
4311     int count2, count3;
4312     unsigned char level23[1];
4313 };
4314 
4315 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4316 encoding_map_size(PyObject *obj, PyObject* args)
4317 {
4318     struct encoding_map *map = (struct encoding_map*)obj;
4319     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4320                           128*map->count3);
4321 }
4322 
4323 static PyMethodDef encoding_map_methods[] = {
4324     {"size", encoding_map_size, METH_NOARGS,
4325      PyDoc_STR("Return the size (in bytes) of this object") },
4326     { 0 }
4327 };
4328 
4329 static void
encoding_map_dealloc(PyObject * o)4330 encoding_map_dealloc(PyObject* o)
4331 {
4332     PyObject_FREE(o);
4333 }
4334 
4335 static PyTypeObject EncodingMapType = {
4336     PyVarObject_HEAD_INIT(NULL, 0)
4337     "EncodingMap",          /*tp_name*/
4338     sizeof(struct encoding_map),   /*tp_basicsize*/
4339     0,                      /*tp_itemsize*/
4340     /* methods */
4341     encoding_map_dealloc,   /*tp_dealloc*/
4342     0,                      /*tp_print*/
4343     0,                      /*tp_getattr*/
4344     0,                      /*tp_setattr*/
4345     0,                      /*tp_compare*/
4346     0,                      /*tp_repr*/
4347     0,                      /*tp_as_number*/
4348     0,                      /*tp_as_sequence*/
4349     0,                      /*tp_as_mapping*/
4350     0,                      /*tp_hash*/
4351     0,                      /*tp_call*/
4352     0,                      /*tp_str*/
4353     0,                      /*tp_getattro*/
4354     0,                      /*tp_setattro*/
4355     0,                      /*tp_as_buffer*/
4356     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4357     0,                      /*tp_doc*/
4358     0,                      /*tp_traverse*/
4359     0,                      /*tp_clear*/
4360     0,                      /*tp_richcompare*/
4361     0,                      /*tp_weaklistoffset*/
4362     0,                      /*tp_iter*/
4363     0,                      /*tp_iternext*/
4364     encoding_map_methods,   /*tp_methods*/
4365     0,                      /*tp_members*/
4366     0,                      /*tp_getset*/
4367     0,                      /*tp_base*/
4368     0,                      /*tp_dict*/
4369     0,                      /*tp_descr_get*/
4370     0,                      /*tp_descr_set*/
4371     0,                      /*tp_dictoffset*/
4372     0,                      /*tp_init*/
4373     0,                      /*tp_alloc*/
4374     0,                      /*tp_new*/
4375     0,                      /*tp_free*/
4376     0,                      /*tp_is_gc*/
4377 };
4378 
4379 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4380 PyUnicode_BuildEncodingMap(PyObject* string)
4381 {
4382     Py_UNICODE *decode;
4383     PyObject *result;
4384     struct encoding_map *mresult;
4385     int i;
4386     int need_dict = 0;
4387     unsigned char level1[32];
4388     unsigned char level2[512];
4389     unsigned char *mlevel1, *mlevel2, *mlevel3;
4390     int count2 = 0, count3 = 0;
4391 
4392     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4393         PyErr_BadArgument();
4394         return NULL;
4395     }
4396     decode = PyUnicode_AS_UNICODE(string);
4397     memset(level1, 0xFF, sizeof level1);
4398     memset(level2, 0xFF, sizeof level2);
4399 
4400     /* If there isn't a one-to-one mapping of NULL to \0,
4401        or if there are non-BMP characters, we need to use
4402        a mapping dictionary. */
4403     if (decode[0] != 0)
4404         need_dict = 1;
4405     for (i = 1; i < 256; i++) {
4406         int l1, l2;
4407         if (decode[i] == 0
4408 #ifdef Py_UNICODE_WIDE
4409             || decode[i] > 0xFFFF
4410 #endif
4411             ) {
4412             need_dict = 1;
4413             break;
4414         }
4415         if (decode[i] == 0xFFFE)
4416             /* unmapped character */
4417             continue;
4418         l1 = decode[i] >> 11;
4419         l2 = decode[i] >> 7;
4420         if (level1[l1] == 0xFF)
4421             level1[l1] = count2++;
4422         if (level2[l2] == 0xFF)
4423             level2[l2] = count3++;
4424     }
4425 
4426     if (count2 >= 0xFF || count3 >= 0xFF)
4427         need_dict = 1;
4428 
4429     if (need_dict) {
4430         PyObject *result = PyDict_New();
4431         PyObject *key, *value;
4432         if (!result)
4433             return NULL;
4434         for (i = 0; i < 256; i++) {
4435             value = NULL;
4436             key = PyInt_FromLong(decode[i]);
4437             value = PyInt_FromLong(i);
4438             if (!key || !value)
4439                 goto failed1;
4440             if (PyDict_SetItem(result, key, value) == -1)
4441                 goto failed1;
4442             Py_DECREF(key);
4443             Py_DECREF(value);
4444         }
4445         return result;
4446       failed1:
4447         Py_XDECREF(key);
4448         Py_XDECREF(value);
4449         Py_DECREF(result);
4450         return NULL;
4451     }
4452 
4453     /* Create a three-level trie */
4454     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4455                              16*count2 + 128*count3 - 1);
4456     if (!result)
4457         return PyErr_NoMemory();
4458     PyObject_Init(result, &EncodingMapType);
4459     mresult = (struct encoding_map*)result;
4460     mresult->count2 = count2;
4461     mresult->count3 = count3;
4462     mlevel1 = mresult->level1;
4463     mlevel2 = mresult->level23;
4464     mlevel3 = mresult->level23 + 16*count2;
4465     memcpy(mlevel1, level1, 32);
4466     memset(mlevel2, 0xFF, 16*count2);
4467     memset(mlevel3, 0, 128*count3);
4468     count3 = 0;
4469     for (i = 1; i < 256; i++) {
4470         int o1, o2, o3, i2, i3;
4471         if (decode[i] == 0xFFFE)
4472             /* unmapped character */
4473             continue;
4474         o1 = decode[i]>>11;
4475         o2 = (decode[i]>>7) & 0xF;
4476         i2 = 16*mlevel1[o1] + o2;
4477         if (mlevel2[i2] == 0xFF)
4478             mlevel2[i2] = count3++;
4479         o3 = decode[i] & 0x7F;
4480         i3 = 128*mlevel2[i2] + o3;
4481         mlevel3[i3] = i;
4482     }
4483     return result;
4484 }
4485 
4486 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4488 {
4489     struct encoding_map *map = (struct encoding_map*)mapping;
4490     int l1 = c>>11;
4491     int l2 = (c>>7) & 0xF;
4492     int l3 = c & 0x7F;
4493     int i;
4494 
4495 #ifdef Py_UNICODE_WIDE
4496     if (c > 0xFFFF) {
4497         return -1;
4498     }
4499 #endif
4500     if (c == 0)
4501         return 0;
4502     /* level 1*/
4503     i = map->level1[l1];
4504     if (i == 0xFF) {
4505         return -1;
4506     }
4507     /* level 2*/
4508     i = map->level23[16*i+l2];
4509     if (i == 0xFF) {
4510         return -1;
4511     }
4512     /* level 3 */
4513     i = map->level23[16*map->count2 + 128*i + l3];
4514     if (i == 0) {
4515         return -1;
4516     }
4517     return i;
4518 }
4519 
4520 /* Lookup the character ch in the mapping. If the character
4521    can't be found, Py_None is returned (or NULL, if another
4522    error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4524 {
4525     PyObject *w = PyInt_FromLong((long)c);
4526     PyObject *x;
4527 
4528     if (w == NULL)
4529         return NULL;
4530     x = PyObject_GetItem(mapping, w);
4531     Py_DECREF(w);
4532     if (x == NULL) {
4533         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4534             /* No mapping found means: mapping is undefined. */
4535             PyErr_Clear();
4536             x = Py_None;
4537             Py_INCREF(x);
4538             return x;
4539         } else
4540             return NULL;
4541     }
4542     else if (x == Py_None)
4543         return x;
4544     else if (PyInt_Check(x)) {
4545         long value = PyInt_AS_LONG(x);
4546         if (value < 0 || value > 255) {
4547             PyErr_SetString(PyExc_TypeError,
4548                             "character mapping must be in range(256)");
4549             Py_DECREF(x);
4550             return NULL;
4551         }
4552         return x;
4553     }
4554     else if (PyString_Check(x))
4555         return x;
4556     else {
4557         /* wrong return value */
4558         PyErr_SetString(PyExc_TypeError,
4559                         "character mapping must return integer, None or str");
4560         Py_DECREF(x);
4561         return NULL;
4562     }
4563 }
4564 
4565 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4567 {
4568     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4569     /* exponentially overallocate to minimize reallocations */
4570     if (requiredsize < 2*outsize)
4571         requiredsize = 2*outsize;
4572     if (_PyString_Resize(outobj, requiredsize)) {
4573         return 0;
4574     }
4575     return 1;
4576 }
4577 
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
4581 /* lookup the character, put the result in the output string and adjust
4582    various state variables. Reallocate the output string if not enough
4583    space is available. Return a new reference to the object that
4584    was put in the output buffer, or Py_None, if the mapping was undefined
4585    (in which case no character was written) or NULL, if a
4586    reallocation error occurred. The caller must decref the result */
4587 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4589                                           PyObject **outobj, Py_ssize_t *outpos)
4590 {
4591     PyObject *rep;
4592     char *outstart;
4593     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4594 
4595     if (Py_TYPE(mapping) == &EncodingMapType) {
4596         int res = encoding_map_lookup(c, mapping);
4597         Py_ssize_t requiredsize = *outpos+1;
4598         if (res == -1)
4599             return enc_FAILED;
4600         if (outsize<requiredsize)
4601             if (!charmapencode_resize(outobj, outpos, requiredsize))
4602                 return enc_EXCEPTION;
4603         outstart = PyString_AS_STRING(*outobj);
4604         outstart[(*outpos)++] = (char)res;
4605         return enc_SUCCESS;
4606     }
4607 
4608     rep = charmapencode_lookup(c, mapping);
4609     if (rep==NULL)
4610         return enc_EXCEPTION;
4611     else if (rep==Py_None) {
4612         Py_DECREF(rep);
4613         return enc_FAILED;
4614     } else {
4615         if (PyInt_Check(rep)) {
4616             Py_ssize_t requiredsize = *outpos+1;
4617             if (outsize<requiredsize)
4618                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4619                     Py_DECREF(rep);
4620                     return enc_EXCEPTION;
4621                 }
4622             outstart = PyString_AS_STRING(*outobj);
4623             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4624         }
4625         else {
4626             const char *repchars = PyString_AS_STRING(rep);
4627             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4628             Py_ssize_t requiredsize = *outpos+repsize;
4629             if (outsize<requiredsize)
4630                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4631                     Py_DECREF(rep);
4632                     return enc_EXCEPTION;
4633                 }
4634             outstart = PyString_AS_STRING(*outobj);
4635             memcpy(outstart + *outpos, repchars, repsize);
4636             *outpos += repsize;
4637         }
4638     }
4639     Py_DECREF(rep);
4640     return enc_SUCCESS;
4641 }
4642 
4643 /* handle an error in PyUnicode_EncodeCharmap
4644    Return 0 on success, -1 on error */
4645 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4646 int charmap_encoding_error(
4647     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4648     PyObject **exceptionObject,
4649     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4650     PyObject **res, Py_ssize_t *respos)
4651 {
4652     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4653     Py_ssize_t repsize;
4654     Py_ssize_t newpos;
4655     Py_UNICODE *uni2;
4656     /* startpos for collecting unencodable chars */
4657     Py_ssize_t collstartpos = *inpos;
4658     Py_ssize_t collendpos = *inpos+1;
4659     Py_ssize_t collpos;
4660     char *encoding = "charmap";
4661     char *reason = "character maps to <undefined>";
4662     charmapencode_result x;
4663 
4664     /* find all unencodable characters */
4665     while (collendpos < size) {
4666         PyObject *rep;
4667         if (Py_TYPE(mapping) == &EncodingMapType) {
4668             int res = encoding_map_lookup(p[collendpos], mapping);
4669             if (res != -1)
4670                 break;
4671             ++collendpos;
4672             continue;
4673         }
4674 
4675         rep = charmapencode_lookup(p[collendpos], mapping);
4676         if (rep==NULL)
4677             return -1;
4678         else if (rep!=Py_None) {
4679             Py_DECREF(rep);
4680             break;
4681         }
4682         Py_DECREF(rep);
4683         ++collendpos;
4684     }
4685     /* cache callback name lookup
4686      * (if not done yet, i.e. it's the first error) */
4687     if (*known_errorHandler==-1) {
4688         if ((errors==NULL) || (!strcmp(errors, "strict")))
4689             *known_errorHandler = 1;
4690         else if (!strcmp(errors, "replace"))
4691             *known_errorHandler = 2;
4692         else if (!strcmp(errors, "ignore"))
4693             *known_errorHandler = 3;
4694         else if (!strcmp(errors, "xmlcharrefreplace"))
4695             *known_errorHandler = 4;
4696         else
4697             *known_errorHandler = 0;
4698     }
4699     switch (*known_errorHandler) {
4700     case 1: /* strict */
4701         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702         return -1;
4703     case 2: /* replace */
4704         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4705             x = charmapencode_output('?', mapping, res, respos);
4706             if (x==enc_EXCEPTION) {
4707                 return -1;
4708             }
4709             else if (x==enc_FAILED) {
4710                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4711                 return -1;
4712             }
4713         }
4714         /* fall through */
4715     case 3: /* ignore */
4716         *inpos = collendpos;
4717         break;
4718     case 4: /* xmlcharrefreplace */
4719         /* generate replacement */
4720         for (collpos = collstartpos; collpos < collendpos;) {
4721             char buffer[2+29+1+1];
4722             char *cp;
4723             Py_UCS4 ch = p[collpos++];
4724 #ifndef Py_UNICODE_WIDE
4725             if ((0xD800 <= ch && ch <= 0xDBFF) &&
4726                 (collpos < collendpos) &&
4727                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4728                 ch = ((((ch & 0x03FF) << 10) |
4729                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4730             }
4731 #endif
4732             sprintf(buffer, "&#%d;", (int)ch);
4733             for (cp = buffer; *cp; ++cp) {
4734                 x = charmapencode_output(*cp, mapping, res, respos);
4735                 if (x==enc_EXCEPTION)
4736                     return -1;
4737                 else if (x==enc_FAILED) {
4738                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739                     return -1;
4740                 }
4741             }
4742         }
4743         *inpos = collendpos;
4744         break;
4745     default:
4746         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4747                                                       encoding, reason, p, size, exceptionObject,
4748                                                       collstartpos, collendpos, &newpos);
4749         if (repunicode == NULL)
4750             return -1;
4751         /* generate replacement  */
4752         repsize = PyUnicode_GET_SIZE(repunicode);
4753         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4754             x = charmapencode_output(*uni2, mapping, res, respos);
4755             if (x==enc_EXCEPTION) {
4756                 return -1;
4757             }
4758             else if (x==enc_FAILED) {
4759                 Py_DECREF(repunicode);
4760                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4761                 return -1;
4762             }
4763         }
4764         *inpos = newpos;
4765         Py_DECREF(repunicode);
4766     }
4767     return 0;
4768 }
4769 
/* Encode the `size` characters at `p` to a byte string using `mapping`.

   A NULL mapping selects Latin-1 encoding.  Otherwise each character is
   encoded with charmapencode_output(); characters without a mapping are
   passed to charmap_encoding_error(), which applies the `errors` policy.

   Returns a new string object, or NULL on error. */
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    PyObject *res = NULL;           /* output object */
    Py_ssize_t inpos = 0;           /* current input position */
    Py_ssize_t respos = 0;          /* current output position */
    PyObject *errorHandler = NULL;  /* cached error callback */
    PyObject *exc = NULL;           /* cached exception object */
    /* cached classification of `errors`:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    /* One output byte per input character is enough for a plain
       encoding; replacements that need more trigger a resize. */
    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    if (size == 0)
        return res;

    while (inpos < size) {
        switch (charmapencode_output(p[inpos], mapping, &res, &respos)) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(p, size, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos))
                goto onError;
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Shrink the result if more was allocated than was written */
    if (respos < PyString_GET_SIZE(res) && _PyString_Resize(&res, respos))
        goto onError;

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4833 
/* Encode the unicode object `unicode` with `mapping`, passing NULL as
   the error policy to PyUnicode_EncodeCharmap().
   Raises via PyErr_BadArgument() and returns NULL if `unicode` is not a
   unicode object or `mapping` is NULL. */
PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
                                    PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4846 
4847 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4848 static void make_translate_exception(PyObject **exceptionObject,
4849                                      const Py_UNICODE *unicode, Py_ssize_t size,
4850                                      Py_ssize_t startpos, Py_ssize_t endpos,
4851                                      const char *reason)
4852 {
4853     if (*exceptionObject == NULL) {
4854         *exceptionObject = PyUnicodeTranslateError_Create(
4855             unicode, size, startpos, endpos, reason);
4856     }
4857     else {
4858         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4859             goto onError;
4860         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4861             goto onError;
4862         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4863             goto onError;
4864         return;
4865       onError:
4866         Py_CLEAR(*exceptionObject);
4867     }
4868 }
4869 
4870 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4871 static void raise_translate_exception(PyObject **exceptionObject,
4872                                       const Py_UNICODE *unicode, Py_ssize_t size,
4873                                       Py_ssize_t startpos, Py_ssize_t endpos,
4874                                       const char *reason)
4875 {
4876     make_translate_exception(exceptionObject,
4877                              unicode, size, startpos, endpos, reason);
4878     if (*exceptionObject != NULL)
4879         PyCodec_StrictErrors(*exceptionObject);
4880 }
4881 
4882 /* error handling callback helper:
4883    build arguments, call the callback and check the arguments,
4884    put the result into newpos and return the replacement string, which
4885    has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4886 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4887                                                      PyObject **errorHandler,
4888                                                      const char *reason,
4889                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4890                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4891                                                      Py_ssize_t *newpos)
4892 {
4893     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4894 
4895     Py_ssize_t i_newpos;
4896     PyObject *restuple;
4897     PyObject *resunicode;
4898 
4899     if (*errorHandler == NULL) {
4900         *errorHandler = PyCodec_LookupError(errors);
4901         if (*errorHandler == NULL)
4902             return NULL;
4903     }
4904 
4905     make_translate_exception(exceptionObject,
4906                              unicode, size, startpos, endpos, reason);
4907     if (*exceptionObject == NULL)
4908         return NULL;
4909 
4910     restuple = PyObject_CallFunctionObjArgs(
4911         *errorHandler, *exceptionObject, NULL);
4912     if (restuple == NULL)
4913         return NULL;
4914     if (!PyTuple_Check(restuple)) {
4915         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4916         Py_DECREF(restuple);
4917         return NULL;
4918     }
4919     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4920                           &resunicode, &i_newpos)) {
4921         Py_DECREF(restuple);
4922         return NULL;
4923     }
4924     if (i_newpos<0)
4925         *newpos = size+i_newpos;
4926     else
4927         *newpos = i_newpos;
4928     if (*newpos<0 || *newpos>size) {
4929         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4930         Py_DECREF(restuple);
4931         return NULL;
4932     }
4933     Py_INCREF(resunicode);
4934     Py_DECREF(restuple);
4935     return resunicode;
4936 }
4937 
4938 /* Lookup the character ch in the mapping and put the result in result,
4939    which must be decrefed by the caller.
4940    Return 0 on success, -1 on error */
4941 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4943 {
4944     PyObject *w = PyInt_FromLong((long)c);
4945     PyObject *x;
4946 
4947     if (w == NULL)
4948         return -1;
4949     x = PyObject_GetItem(mapping, w);
4950     Py_DECREF(w);
4951     if (x == NULL) {
4952         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4953             /* No mapping found means: use 1:1 mapping. */
4954             PyErr_Clear();
4955             *result = NULL;
4956             return 0;
4957         } else
4958             return -1;
4959     }
4960     else if (x == Py_None) {
4961         *result = x;
4962         return 0;
4963     }
4964     else if (PyInt_Check(x)) {
4965         long value = PyInt_AS_LONG(x);
4966         long max = PyUnicode_GetMax();
4967         if (value < 0 || value > max) {
4968             PyErr_Format(PyExc_TypeError,
4969                          "character mapping must be in range(0x%lx)", max+1);
4970             Py_DECREF(x);
4971             return -1;
4972         }
4973         *result = x;
4974         return 0;
4975     }
4976     else if (PyUnicode_Check(x)) {
4977         *result = x;
4978         return 0;
4979     }
4980     else {
4981         /* wrong return value */
4982         PyErr_SetString(PyExc_TypeError,
4983                         "character mapping must return integer, None or unicode");
4984         Py_DECREF(x);
4985         return -1;
4986     }
4987 }
4988 /* ensure that *outobj is at least requiredsize characters long,
4989    if not reallocate and adjust various state variables.
4990    Return 0 on success, -1 on error */
4991 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4993                                Py_ssize_t requiredsize)
4994 {
4995     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4996     if (requiredsize > oldsize) {
4997         /* remember old output position */
4998         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4999         /* exponentially overallocate to minimize reallocations */
5000         if (requiredsize < 2 * oldsize)
5001             requiredsize = 2 * oldsize;
5002         if (PyUnicode_Resize(outobj, requiredsize) < 0)
5003             return -1;
5004         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5005     }
5006     return 0;
5007 }
5008 /* lookup the character, put the result in the output string and adjust
5009    various state variables. Return a new reference to the object that
5010    was put in the output buffer in *result, or Py_None, if the mapping was
5011    undefined (in which case no character was written).
5012    The called must decref result.
5013    Return 0 on success, -1 on error. */
5014 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5016                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5017                             PyObject **res)
5018 {
5019     if (charmaptranslate_lookup(*curinp, mapping, res))
5020         return -1;
5021     if (*res==NULL) {
5022         /* not found => default to 1:1 mapping */
5023         *(*outp)++ = *curinp;
5024     }
5025     else if (*res==Py_None)
5026         ;
5027     else if (PyInt_Check(*res)) {
5028         /* no overflow check, because we know that the space is enough */
5029         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5030     }
5031     else if (PyUnicode_Check(*res)) {
5032         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5033         if (repsize==1) {
5034             /* no overflow check, because we know that the space is enough */
5035             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5036         }
5037         else if (repsize!=0) {
5038             /* more than one character */
5039             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5040                 (insize - (curinp-startinp)) +
5041                 repsize - 1;
5042             if (charmaptranslate_makespace(outobj, outp, requiredsize))
5043                 return -1;
5044             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5045             *outp += repsize;
5046         }
5047     }
5048     else
5049         return -1;
5050     return 0;
5051 }
5052 
/* Translate the size characters starting at p through mapping and
   return the result as a new Unicode object (NULL on failure).

   A character whose lookup yields None counts as untranslatable; each
   run of such characters is processed according to errors (NULL is
   treated like "strict"). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    PyObject *res = NULL;               /* result object */
    const Py_UNICODE *startp = p;       /* first input character */
    const Py_UNICODE *endp = p + size;  /* one past the last input character */
    Py_UNICODE *str;                    /* write position inside res */
    Py_ssize_t respos = 0;              /* number of characters written */
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* cached classification of the errors string:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* start with room for a plain 1:1 translation; the buffer is
       grown on demand when a replacement needs more */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p < endp) {
        PyObject *x = NULL;
        PyObject *repunicode = NULL;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *rep;
        const Py_UNICODE *collstart;   /* start of the untranslatable run */
        const Py_UNICODE *collend;     /* one past the end of that run */
        const Py_UNICODE *coll;
        int is_none;

        /* try to translate the current character */
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        is_none = (x == Py_None);
        Py_XDECREF(x);
        if (!is_none) {
            /* it worked => move on to the next input character */
            ++p;
            continue;
        }

        /* untranslatable character: collect the whole run */
        collstart = p;
        for (collend = p + 1; collend < endp; ++collend) {
            if (charmaptranslate_lookup(*collend, mapping, &x))
                goto onError;
            is_none = (x == Py_None);
            Py_XDECREF(x);
            if (!is_none)
                break;
        }

        /* classify the error handler name once, on the first error */
        if (known_errorHandler == -1) {
            if (errors == NULL || strcmp(errors, "strict") == 0)
                known_errorHandler = 1;
            else if (strcmp(errors, "replace") == 0)
                known_errorHandler = 2;
            else if (strcmp(errors, "ignore") == 0)
                known_errorHandler = 3;
            else if (strcmp(errors, "xmlcharrefreplace") == 0)
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }

        switch (known_errorHandler) {
        case 1: /* strict */
            raise_translate_exception(&exc, startp, size,
                                      collstart - startp, collend - startp,
                                      reason);
            goto onError;
        case 2: /* replace */
            /* one '?' per input character, so no extra space is needed */
            for (coll = collstart; coll < collend; ++coll)
                *str++ = '?';
            p = collend;
            break;
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                char buffer[2+29+1+1];
                char *cp;
                Py_ssize_t needed;
                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);

                sprintf(buffer, "&#%d;", (int)ch);
                /* already written + this reference + untouched input tail */
                needed = (str - PyUnicode_AS_UNICODE(res))
                         + strlen(buffer) + (endp - collend);
                if (charmaptranslate_makespace(&res, &str, needed))
                    goto onError;
                for (cp = buffer; *cp; ++cp)
                    *str++ = *cp;
            }
            p = collend;
            break;
        default:
            repunicode = unicode_translate_call_errorhandler(
                errors, &errorHandler, reason, startp, size, &exc,
                collstart - startp, collend - startp, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* copy the replacement supplied by the handler */
            repsize = PyUnicode_GET_SIZE(repunicode);
            if (charmaptranslate_makespace(
                    &res, &str,
                    (str - PyUnicode_AS_UNICODE(res)) + repsize
                    + (endp - collend))) {
                Py_DECREF(repunicode);
                goto onError;
            }
            for (rep = PyUnicode_AS_UNICODE(repunicode); repsize-- > 0; ++rep)
                *str++ = *rep;
            /* resume at the position chosen by the handler */
            p = startp + newpos;
            Py_DECREF(repunicode);
        }
    }

    /* shrink the result if more was allocated than written */
    respos = str - PyUnicode_AS_UNICODE(res);
    if (respos < PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5195 
/* Convert str with PyUnicode_FromObject() and translate the result
   through mapping via PyUnicode_TranslateCharmap().  Returns the
   translated object, or NULL if either step fails. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *translated;

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
                                            PyUnicode_GET_SIZE(str),
                                            mapping,
                                            errors);
    /* the converted object was only needed for the call above */
    Py_DECREF(str);
    return translated;
}
5216 
5217 /* --- Decimal Encoder ---------------------------------------------------- */
5218 
/* Encode the length characters at s into the char buffer output.

   Whitespace characters are written as ' ', characters with a decimal
   value as the matching ASCII digit, and any other character in the
   range 1..255 is copied as is.  Every other character (including 0)
   is unencodable and is processed according to errors (NULL is treated
   like "strict").

   On success the output is 0-terminated and 0 is returned; on failure
   -1 is returned.

   NOTE: this function receives no size for output and performs no
   bounds checking.  The "xmlcharrefreplace" handler and custom error
   handlers can write more than one char per input character, so the
   caller has to provide a buffer that is large enough. */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable; collect the
           whole run up to the next encodable character */
        collstart = p;
        for (collend = p+1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                Py_UCS4 code = _Py_UNICODE_NEXT(p, collend);
                /* %d expects an int; Py_UCS4 is an unsigned type, so
                   convert explicitly (as PyUnicode_TranslateCharmap does) */
                output += sprintf(output, "&#%d;", (int)code);
            }
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the handler's output is encoded with
               the same rules as regular input, but an unencodable
               character in it is always an error */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE repch = *uni2;
                if (Py_UNICODE_ISSPACE(repch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(repch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < repch && repch < 256)
                        *output++ = (char)repch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume at the position chosen by the handler */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5348 
5349 /* --- Helpers ------------------------------------------------------------ */
5350 
5351 #include "stringlib/unicodedefs.h"
5352 #include "stringlib/fastsearch.h"
5353 
5354 #include "stringlib/count.h"
5355 #include "stringlib/find.h"
5356 #include "stringlib/partition.h"
5357 #include "stringlib/split.h"
5358 
/* helper macro to fixup start/end slice values
 *
 * end is clamped to len; a negative start or end is taken relative to
 * len and floored at 0.  start and end must be modifiable lvalues and
 * every argument is evaluated more than once.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a
 * single statement and stays correct under an unbraced if/else; invoke
 * it with a trailing semicolon.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5373 
/* Count the occurrences of substr in str[start:end].  Both objects are
   converted with PyUnicode_FromObject() first; -1 is returned when a
   conversion fails. */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    Py_ssize_t occurrences = -1;
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;

    haystack = (PyUnicodeObject *)PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *)PyUnicode_FromObject(substr);
    if (needle != NULL) {
        /* normalize the slice bounds against the string length */
        ADJUST_INDICES(start, end, haystack->length);
        occurrences = stringlib_count(haystack->str + start, end - start,
                                      needle->str, needle->length,
                                      PY_SSIZE_T_MAX);
        Py_DECREF(needle);
    }

    Py_DECREF(haystack);
    return occurrences;
}
5403 
/* Search for sub in str[start:end]: forward when direction > 0,
   backward otherwise.  Returns the value of the stringlib search, or
   -2 when str or sub cannot be converted with PyUnicode_FromObject(). */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    Py_ssize_t position;
    const int forward = (direction > 0);

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return -2;

    sub = PyUnicode_FromObject(sub);
    if (sub == NULL) {
        Py_DECREF(str);
        return -2;
    }

    if (forward) {
        position = stringlib_find_slice(
            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
            start, end);
    }
    else {
        position = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
            start, end);
    }

    Py_DECREF(str);
    Py_DECREF(sub);
    return position;
}
5439 
5440 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5441 int tailmatch(PyUnicodeObject *self,
5442               PyUnicodeObject *substring,
5443               Py_ssize_t start,
5444               Py_ssize_t end,
5445               int direction)
5446 {
5447     if (substring->length == 0)
5448         return 1;
5449 
5450     ADJUST_INDICES(start, end, self->length);
5451     end -= substring->length;
5452     if (end < start)
5453         return 0;
5454 
5455     if (direction > 0) {
5456         if (Py_UNICODE_MATCH(self, end, substring))
5457             return 1;
5458     } else {
5459         if (Py_UNICODE_MATCH(self, start, substring))
5460             return 1;
5461     }
5462 
5463     return 0;
5464 }
5465 
/* Public wrapper around tailmatch(): converts both arguments with
   PyUnicode_FromObject() and returns the tailmatch() result, or -1 if
   a conversion fails. */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    Py_ssize_t matched = -1;
    PyObject *ustr;
    PyObject *usub;

    ustr = PyUnicode_FromObject(str);
    if (ustr == NULL)
        return -1;

    usub = PyUnicode_FromObject(substr);
    if (usub == NULL) {
        Py_DECREF(ustr);
        return matched;
    }

    matched = tailmatch((PyUnicodeObject *)ustr,
                        (PyUnicodeObject *)usub,
                        start, end, direction);
    Py_DECREF(ustr);
    Py_DECREF(usub);
    return matched;
}
5490 
5491 /* Apply fixfct filter to the Unicode object self and return a
5492    reference to the modified object */
5493 
5494 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5495 PyObject *fixup(PyUnicodeObject *self,
5496                 int (*fixfct)(PyUnicodeObject *s))
5497 {
5498 
5499     PyUnicodeObject *u;
5500 
5501     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5502     if (u == NULL)
5503         return NULL;
5504 
5505     Py_UNICODE_COPY(u->str, self->str, self->length);
5506 
5507     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5508         /* fixfct should return TRUE if it modified the buffer. If
5509            FALSE, return a reference to the original buffer instead
5510            (to save space, not time) */
5511         Py_INCREF(self);
5512         Py_DECREF(u);
5513         return (PyObject*) self;
5514     }
5515     return (PyObject*) u;
5516 }
5517 
5518 static
fixupper(PyUnicodeObject * self)5519 int fixupper(PyUnicodeObject *self)
5520 {
5521     Py_ssize_t len = self->length;
5522     Py_UNICODE *s = self->str;
5523     int status = 0;
5524 
5525     while (len-- > 0) {
5526         register Py_UNICODE ch;
5527 
5528         ch = Py_UNICODE_TOUPPER(*s);
5529         if (ch != *s) {
5530             status = 1;
5531             *s = ch;
5532         }
5533         s++;
5534     }
5535 
5536     return status;
5537 }
5538 
5539 static
fixlower(PyUnicodeObject * self)5540 int fixlower(PyUnicodeObject *self)
5541 {
5542     Py_ssize_t len = self->length;
5543     Py_UNICODE *s = self->str;
5544     int status = 0;
5545 
5546     while (len-- > 0) {
5547         register Py_UNICODE ch;
5548 
5549         ch = Py_UNICODE_TOLOWER(*s);
5550         if (ch != *s) {
5551             status = 1;
5552             *s = ch;
5553         }
5554         s++;
5555     }
5556 
5557     return status;
5558 }
5559 
5560 static
fixswapcase(PyUnicodeObject * self)5561 int fixswapcase(PyUnicodeObject *self)
5562 {
5563     Py_ssize_t len = self->length;
5564     Py_UNICODE *s = self->str;
5565     int status = 0;
5566 
5567     while (len-- > 0) {
5568         if (Py_UNICODE_ISUPPER(*s)) {
5569             *s = Py_UNICODE_TOLOWER(*s);
5570             status = 1;
5571         } else if (Py_UNICODE_ISLOWER(*s)) {
5572             *s = Py_UNICODE_TOUPPER(*s);
5573             status = 1;
5574         }
5575         s++;
5576     }
5577 
5578     return status;
5579 }
5580 
5581 static
fixcapitalize(PyUnicodeObject * self)5582 int fixcapitalize(PyUnicodeObject *self)
5583 {
5584     Py_ssize_t len = self->length;
5585     Py_UNICODE *s = self->str;
5586     int status = 0;
5587 
5588     if (len == 0)
5589         return 0;
5590     if (!Py_UNICODE_ISUPPER(*s)) {
5591         *s = Py_UNICODE_TOUPPER(*s);
5592         status = 1;
5593     }
5594     s++;
5595     while (--len > 0) {
5596         if (!Py_UNICODE_ISLOWER(*s)) {
5597             *s = Py_UNICODE_TOLOWER(*s);
5598             status = 1;
5599         }
5600         s++;
5601     }
5602     return status;
5603 }
5604 
5605 static
fixtitle(PyUnicodeObject * self)5606 int fixtitle(PyUnicodeObject *self)
5607 {
5608     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609     register Py_UNICODE *e;
5610     int previous_is_cased;
5611 
5612     /* Shortcut for single character strings */
5613     if (PyUnicode_GET_SIZE(self) == 1) {
5614         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5615         if (*p != ch) {
5616             *p = ch;
5617             return 1;
5618         }
5619         else
5620             return 0;
5621     }
5622 
5623     e = p + PyUnicode_GET_SIZE(self);
5624     previous_is_cased = 0;
5625     for (; p < e; p++) {
5626         register const Py_UNICODE ch = *p;
5627 
5628         if (previous_is_cased)
5629             *p = Py_UNICODE_TOLOWER(ch);
5630         else
5631             *p = Py_UNICODE_TOTITLE(ch);
5632 
5633         if (Py_UNICODE_ISLOWER(ch) ||
5634             Py_UNICODE_ISUPPER(ch) ||
5635             Py_UNICODE_ISTITLE(ch))
5636             previous_is_cased = 1;
5637         else
5638             previous_is_cased = 0;
5639     }
5640     return 1;
5641 }
5642 
5643 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)5644 PyUnicode_Join(PyObject *separator, PyObject *seq)
5645 {
5646     PyObject *internal_separator = NULL;
5647     const Py_UNICODE blank = ' ';
5648     const Py_UNICODE *sep = &blank;
5649     Py_ssize_t seplen = 1;
5650     PyUnicodeObject *res = NULL; /* the result */
5651     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5652     Py_ssize_t res_used;         /* # used bytes */
5653     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5654     PyObject *fseq;          /* PySequence_Fast(seq) */
5655     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5656     PyObject *item;
5657     Py_ssize_t i;
5658 
5659     fseq = PySequence_Fast(seq, "can only join an iterable");
5660     if (fseq == NULL) {
5661         return NULL;
5662     }
5663 
5664     /* Grrrr.  A codec may be invoked to convert str objects to
5665      * Unicode, and so it's possible to call back into Python code
5666      * during PyUnicode_FromObject(), and so it's possible for a sick
5667      * codec to change the size of fseq (if seq is a list).  Therefore
5668      * we have to keep refetching the size -- can't assume seqlen
5669      * is invariant.
5670      */
5671     seqlen = PySequence_Fast_GET_SIZE(fseq);
5672     /* If empty sequence, return u"". */
5673     if (seqlen == 0) {
5674         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5675         goto Done;
5676     }
5677     /* If singleton sequence with an exact Unicode, return that. */
5678     if (seqlen == 1) {
5679         item = PySequence_Fast_GET_ITEM(fseq, 0);
5680         if (PyUnicode_CheckExact(item)) {
5681             Py_INCREF(item);
5682             res = (PyUnicodeObject *)item;
5683             goto Done;
5684         }
5685     }
5686 
5687     /* At least two items to join, or one that isn't exact Unicode. */
5688     if (seqlen > 1) {
5689         /* Set up sep and seplen -- they're needed. */
5690         if (separator == NULL) {
5691             sep = &blank;
5692             seplen = 1;
5693         }
5694         else {
5695             internal_separator = PyUnicode_FromObject(separator);
5696             if (internal_separator == NULL)
5697                 goto onError;
5698             sep = PyUnicode_AS_UNICODE(internal_separator);
5699             seplen = PyUnicode_GET_SIZE(internal_separator);
5700             /* In case PyUnicode_FromObject() mutated seq. */
5701             seqlen = PySequence_Fast_GET_SIZE(fseq);
5702         }
5703     }
5704 
5705     /* Get space. */
5706     res = _PyUnicode_New(res_alloc);
5707     if (res == NULL)
5708         goto onError;
5709     res_p = PyUnicode_AS_UNICODE(res);
5710     res_used = 0;
5711 
5712     for (i = 0; i < seqlen; ++i) {
5713         Py_ssize_t itemlen;
5714         Py_ssize_t new_res_used;
5715 
5716         item = PySequence_Fast_GET_ITEM(fseq, i);
5717         /* Convert item to Unicode. */
5718         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5719             PyErr_Format(PyExc_TypeError,
5720                          "sequence item %zd: expected string or Unicode,"
5721                          " %.80s found",
5722                          i, Py_TYPE(item)->tp_name);
5723             goto onError;
5724         }
5725         item = PyUnicode_FromObject(item);
5726         if (item == NULL)
5727             goto onError;
5728         /* We own a reference to item from here on. */
5729 
5730         /* In case PyUnicode_FromObject() mutated seq. */
5731         seqlen = PySequence_Fast_GET_SIZE(fseq);
5732 
5733         /* Make sure we have enough space for the separator and the item. */
5734         itemlen = PyUnicode_GET_SIZE(item);
5735         new_res_used = res_used + itemlen;
5736         if (new_res_used < 0)
5737             goto Overflow;
5738         if (i < seqlen - 1) {
5739             new_res_used += seplen;
5740             if (new_res_used < 0)
5741                 goto Overflow;
5742         }
5743         if (new_res_used > res_alloc) {
5744             /* double allocated size until it's big enough */
5745             do {
5746                 res_alloc += res_alloc;
5747                 if (res_alloc <= 0)
5748                     goto Overflow;
5749             } while (new_res_used > res_alloc);
5750             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5751                 Py_DECREF(item);
5752                 goto onError;
5753             }
5754             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5755         }
5756 
5757         /* Copy item, and maybe the separator. */
5758         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5759         res_p += itemlen;
5760         if (i < seqlen - 1) {
5761             Py_UNICODE_COPY(res_p, sep, seplen);
5762             res_p += seplen;
5763         }
5764         Py_DECREF(item);
5765         res_used = new_res_used;
5766     }
5767 
5768     /* Shrink res to match the used area; this probably can't fail,
5769      * but it's cheap to check.
5770      */
5771     if (_PyUnicode_Resize(&res, res_used) < 0)
5772         goto onError;
5773 
5774   Done:
5775     Py_XDECREF(internal_separator);
5776     Py_DECREF(fseq);
5777     return (PyObject *)res;
5778 
5779   Overflow:
5780     PyErr_SetString(PyExc_OverflowError,
5781                     "join() result is too long for a Python string");
5782     Py_DECREF(item);
5783     /* fall through */
5784 
5785   onError:
5786     Py_XDECREF(internal_separator);
5787     Py_DECREF(fseq);
5788     Py_XDECREF(res);
5789     return NULL;
5790 }
5791 
5792 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5793 PyUnicodeObject *pad(PyUnicodeObject *self,
5794                      Py_ssize_t left,
5795                      Py_ssize_t right,
5796                      Py_UNICODE fill)
5797 {
5798     PyUnicodeObject *u;
5799 
5800     if (left < 0)
5801         left = 0;
5802     if (right < 0)
5803         right = 0;
5804 
5805     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5806         Py_INCREF(self);
5807         return self;
5808     }
5809 
5810     if (left > PY_SSIZE_T_MAX - self->length ||
5811         right > PY_SSIZE_T_MAX - (left + self->length)) {
5812         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5813         return NULL;
5814     }
5815     u = _PyUnicode_New(left + self->length + right);
5816     if (u) {
5817         if (left)
5818             Py_UNICODE_FILL(u->str, fill, left);
5819         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5820         if (right)
5821             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5822     }
5823 
5824     return u;
5825 }
5826 
/* Convert string with PyUnicode_FromObject() and return the result of
   stringlib_splitlines() on it; keepends is passed through unchanged.
   Returns NULL if the conversion fails. */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *ustring;
    PyObject *lines;

    ustring = PyUnicode_FromObject(string);
    if (ustring == NULL)
        return NULL;

    lines = stringlib_splitlines((PyObject *)ustring,
                                 PyUnicode_AS_UNICODE(ustring),
                                 PyUnicode_GET_SIZE(ustring),
                                 keepends);

    Py_DECREF(ustring);
    return lines;
}
5842 
5843 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5844 PyObject *split(PyUnicodeObject *self,
5845                 PyUnicodeObject *substring,
5846                 Py_ssize_t maxcount)
5847 {
5848     if (maxcount < 0)
5849         maxcount = PY_SSIZE_T_MAX;
5850 
5851     if (substring == NULL)
5852         return stringlib_split_whitespace(
5853             (PyObject*) self,  self->str, self->length, maxcount
5854             );
5855 
5856     return stringlib_split(
5857         (PyObject*) self,  self->str, self->length,
5858         substring->str, substring->length,
5859         maxcount
5860         );
5861 }
5862 
5863 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5864 PyObject *rsplit(PyUnicodeObject *self,
5865                  PyUnicodeObject *substring,
5866                  Py_ssize_t maxcount)
5867 {
5868     if (maxcount < 0)
5869         maxcount = PY_SSIZE_T_MAX;
5870 
5871     if (substring == NULL)
5872         return stringlib_rsplit_whitespace(
5873             (PyObject*) self,  self->str, self->length, maxcount
5874             );
5875 
5876     return stringlib_rsplit(
5877         (PyObject*) self,  self->str, self->length,
5878         substring->str, substring->length,
5879         maxcount
5880         );
5881 }
5882 
5883 static
replace(PyUnicodeObject * self,PyUnicodeObject * str1,PyUnicodeObject * str2,Py_ssize_t maxcount)5884 PyObject *replace(PyUnicodeObject *self,
5885                   PyUnicodeObject *str1,
5886                   PyUnicodeObject *str2,
5887                   Py_ssize_t maxcount)
5888 {
5889     PyUnicodeObject *u;
5890 
5891     if (maxcount < 0)
5892         maxcount = PY_SSIZE_T_MAX;
5893     else if (maxcount == 0 || self->length == 0)
5894         goto nothing;
5895 
5896     if (str1->length == str2->length) {
5897         Py_ssize_t i;
5898         /* same length */
5899         if (str1->length == 0)
5900             goto nothing;
5901         if (str1->length == 1) {
5902             /* replace characters */
5903             Py_UNICODE u1, u2;
5904             if (!findchar(self->str, self->length, str1->str[0]))
5905                 goto nothing;
5906             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5907             if (!u)
5908                 return NULL;
5909             Py_UNICODE_COPY(u->str, self->str, self->length);
5910             u1 = str1->str[0];
5911             u2 = str2->str[0];
5912             for (i = 0; i < u->length; i++)
5913                 if (u->str[i] == u1) {
5914                     if (--maxcount < 0)
5915                         break;
5916                     u->str[i] = u2;
5917                 }
5918         } else {
5919             i = stringlib_find(
5920                 self->str, self->length, str1->str, str1->length, 0
5921                 );
5922             if (i < 0)
5923                 goto nothing;
5924             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5925             if (!u)
5926                 return NULL;
5927             Py_UNICODE_COPY(u->str, self->str, self->length);
5928 
5929             /* change everything in-place, starting with this one */
5930             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5931             i += str1->length;
5932 
5933             while ( --maxcount > 0) {
5934                 i = stringlib_find(self->str+i, self->length-i,
5935                                    str1->str, str1->length,
5936                                    i);
5937                 if (i == -1)
5938                     break;
5939                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940                 i += str1->length;
5941             }
5942         }
5943     } else {
5944 
5945         Py_ssize_t n, i, j;
5946         Py_ssize_t product, new_size, delta;
5947         Py_UNICODE *p;
5948 
5949         /* replace strings */
5950         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5951                             maxcount);
5952         if (n == 0)
5953             goto nothing;
5954         /* new_size = self->length + n * (str2->length - str1->length)); */
5955         delta = (str2->length - str1->length);
5956         if (delta == 0) {
5957             new_size = self->length;
5958         } else {
5959             product = n * (str2->length - str1->length);
5960             if ((product / (str2->length - str1->length)) != n) {
5961                 PyErr_SetString(PyExc_OverflowError,
5962                                 "replace string is too long");
5963                 return NULL;
5964             }
5965             new_size = self->length + product;
5966             if (new_size < 0) {
5967                 PyErr_SetString(PyExc_OverflowError,
5968                                 "replace string is too long");
5969                 return NULL;
5970             }
5971         }
5972         u = _PyUnicode_New(new_size);
5973         if (!u)
5974             return NULL;
5975         i = 0;
5976         p = u->str;
5977         if (str1->length > 0) {
5978             while (n-- > 0) {
5979                 /* look for next match */
5980                 j = stringlib_find(self->str+i, self->length-i,
5981                                    str1->str, str1->length,
5982                                    i);
5983                 if (j == -1)
5984                     break;
5985                 else if (j > i) {
5986                     /* copy unchanged part [i:j] */
5987                     Py_UNICODE_COPY(p, self->str+i, j-i);
5988                     p += j - i;
5989                 }
5990                 /* copy substitution string */
5991                 if (str2->length > 0) {
5992                     Py_UNICODE_COPY(p, str2->str, str2->length);
5993                     p += str2->length;
5994                 }
5995                 i = j + str1->length;
5996             }
5997             if (i < self->length)
5998                 /* copy tail [i:] */
5999                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6000         } else {
6001             /* interleave */
6002             while (n > 0) {
6003                 Py_UNICODE_COPY(p, str2->str, str2->length);
6004                 p += str2->length;
6005                 if (--n <= 0)
6006                     break;
6007                 *p++ = self->str[i++];
6008             }
6009             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6010         }
6011     }
6012     return (PyObject *) u;
6013 
6014   nothing:
6015     /* nothing to replace; return original string (when possible) */
6016     if (PyUnicode_CheckExact(self)) {
6017         Py_INCREF(self);
6018         return (PyObject *) self;
6019     }
6020     return PyUnicode_FromUnicode(self->str, self->length);
6021 }
6022 
6023 /* --- Unicode Object Methods --------------------------------------------- */
6024 
6025 PyDoc_STRVAR(title__doc__,
6026              "S.title() -> unicode\n\
6027 \n\
6028 Return a titlecased version of S, i.e. words start with title case\n\
6029 characters, all remaining cased characters have lower case.");
6030 
6031 static PyObject*
unicode_title(PyUnicodeObject * self)6032 unicode_title(PyUnicodeObject *self)
6033 {
6034     return fixup(self, fixtitle);
6035 }
6036 
6037 PyDoc_STRVAR(capitalize__doc__,
6038              "S.capitalize() -> unicode\n\
6039 \n\
6040 Return a capitalized version of S, i.e. make the first character\n\
6041 have upper case and the rest lower case.");
6042 
6043 static PyObject*
unicode_capitalize(PyUnicodeObject * self)6044 unicode_capitalize(PyUnicodeObject *self)
6045 {
6046     return fixup(self, fixcapitalize);
6047 }
6048 
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled-out implementation of S.capwords().

   Splits self into a list of words, replaces every list entry with its
   fixcapitalize'd version and joins the list again.  Returns NULL when
   the split, any per-word fixup, or the final join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Break the string up into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart, in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* joined is still NULL here */
        /* Drop the list's reference to the old word before the slot
           is overwritten. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Glue the words back together into the result string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6086 
6087 /* Argument converter.  Coerces to a single unicode character */
6088 
6089 static int
convert_uc(PyObject * obj,void * addr)6090 convert_uc(PyObject *obj, void *addr)
6091 {
6092     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6093     PyObject *uniobj;
6094     Py_UNICODE *unistr;
6095 
6096     uniobj = PyUnicode_FromObject(obj);
6097     if (uniobj == NULL) {
6098         PyErr_SetString(PyExc_TypeError,
6099                         "The fill character cannot be converted to Unicode");
6100         return 0;
6101     }
6102     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6103         PyErr_SetString(PyExc_TypeError,
6104                         "The fill character must be exactly one character long");
6105         Py_DECREF(uniobj);
6106         return 0;
6107     }
6108     unistr = PyUnicode_AS_UNICODE(uniobj);
6109     *fillcharloc = unistr[0];
6110     Py_DECREF(uniobj);
6111     return 1;
6112 }
6113 
6114 PyDoc_STRVAR(center__doc__,
6115              "S.center(width[, fillchar]) -> unicode\n\
6116 \n\
6117 Return S centered in a Unicode string of length width. Padding is\n\
6118 done using the specified fill character (default is a space)");
6119 
6120 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6121 unicode_center(PyUnicodeObject *self, PyObject *args)
6122 {
6123     Py_ssize_t marg, left;
6124     Py_ssize_t width;
6125     Py_UNICODE fillchar = ' ';
6126 
6127     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6128         return NULL;
6129 
6130     if (self->length >= width && PyUnicode_CheckExact(self)) {
6131         Py_INCREF(self);
6132         return (PyObject*) self;
6133     }
6134 
6135     marg = width - self->length;
6136     left = marg / 2 + (marg & width & 1);
6137 
6138     return (PyObject*) pad(self, left, marg - left, fillchar);
6139 }
6140 
6141 #if 0
6142 
6143 /* This code should go into some future Unicode collation support
6144    module. The basic comparison should compare ordinals on a naive
6145    basis (this is what Java does and thus Jython too). */
6146 
6147 /* speedy UTF-16 code point order comparison */
6148 /* gleaned from: */
6149 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6150 
6151 static short utf16Fixup[32] =
6152 {
6153     0, 0, 0, 0, 0, 0, 0, 0,
6154     0, 0, 0, 0, 0, 0, 0, 0,
6155     0, 0, 0, 0, 0, 0, 0, 0,
6156     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6157 };
6158 
6159 static int
6160 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6161 {
6162     Py_ssize_t len1, len2;
6163 
6164     Py_UNICODE *s1 = str1->str;
6165     Py_UNICODE *s2 = str2->str;
6166 
6167     len1 = str1->length;
6168     len2 = str2->length;
6169 
6170     while (len1 > 0 && len2 > 0) {
6171         Py_UNICODE c1, c2;
6172 
6173         c1 = *s1++;
6174         c2 = *s2++;
6175 
6176         if (c1 > (1<<11) * 26)
6177             c1 += utf16Fixup[c1>>11];
6178         if (c2 > (1<<11) * 26)
6179             c2 += utf16Fixup[c2>>11];
6180         /* now c1 and c2 are in UTF-32-compatible order */
6181 
6182         if (c1 != c2)
6183             return (c1 < c2) ? -1 : 1;
6184 
6185         len1--; len2--;
6186     }
6187 
6188     return (len1 < len2) ? -1 : (len1 != len2);
6189 }
6190 
6191 #else
6192 
6193 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6194 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195 {
6196     register Py_ssize_t len1, len2;
6197 
6198     Py_UNICODE *s1 = str1->str;
6199     Py_UNICODE *s2 = str2->str;
6200 
6201     len1 = str1->length;
6202     len2 = str2->length;
6203 
6204     while (len1 > 0 && len2 > 0) {
6205         Py_UNICODE c1, c2;
6206 
6207         c1 = *s1++;
6208         c2 = *s2++;
6209 
6210         if (c1 != c2)
6211             return (c1 < c2) ? -1 : 1;
6212 
6213         len1--; len2--;
6214     }
6215 
6216     return (len1 < len2) ? -1 : (len1 != len2);
6217 }
6218 
6219 #endif
6220 
/* Three-way comparison of two arbitrary objects after coercing both to
   Unicode.  Returns the result of unicode_compare(), or 0 straight away
   when both coerce to the very same object.  Returns -1 when either
   coercion fails; callers tell that apart from "less than" via
   PyErr_Occurred(). */
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    PyUnicodeObject *lhs;
    PyUnicodeObject *rhs;
    int order;

    /* Coerce the two arguments */
    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;
    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* Shortcut for empty or interned objects: identity implies equality */
    order = (lhs == rhs) ? 0 : unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return order;
}
6253 
/* Rich comparison built on top of PyUnicode_Compare().

   On success the three-way result is mapped onto a bool for op.
   When the comparison itself fails:
     - a TypeError is cleared and Py_NotImplemented is returned, so the
       other operand gets a chance to handle the comparison;
     - for Py_EQ / Py_NE a UnicodeDecodeError is cleared and turned into
       a UnicodeWarning, with the operands treated as unequal;
     - every other error is propagated by returning NULL. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    int outcome;
    const char *warning;

    outcome = PyUnicode_Compare(left, right);
    if (!(outcome == -1 && PyErr_Occurred())) {
        /* Convert the three-way value to a Boolean; an op that matches
           no case leaves the value untouched. */
        switch (op) {
        case Py_EQ:
            outcome = (outcome == 0);
            break;
        case Py_NE:
            outcome = (outcome != 0);
            break;
        case Py_LE:
            outcome = (outcome <= 0);
            break;
        case Py_GE:
            outcome = (outcome >= 0);
            break;
        case Py_LT:
            outcome = (outcome == -1);
            break;
        case Py_GT:
            outcome = (outcome == 1);
            break;
        }
        return PyBool_FromLong(outcome);
    }

    /* Standard case

       A TypeError means that PyUnicode_FromObject() could not convert
       one of the arguments (usually the right hand side) to Unicode,
       i.e. we cannot handle the comparison request.  The other object
       may still know how to compare itself, so Py_NotImplemented is
       returned to give it that chance.
    */
    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
        PyErr_Clear();
        Py_INCREF(Py_NotImplemented);
        return Py_NotImplemented;
    }

    /* Equality comparison.

       Special case: only for == and != is a PyExc_UnicodeDecodeError
       silenced and replaced by a PyExc_UnicodeWarning.  Anything else
       stays set and is reported to the caller.
    */
    if ((op != Py_EQ && op != Py_NE) ||
        !PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
        return NULL;
    PyErr_Clear();

    if (op == Py_EQ)
        warning = "Unicode equal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    else
        warning = "Unicode unequal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    if (PyErr_Warn(PyExc_UnicodeWarning, warning) < 0)
        return NULL;

    /* Operands count as unequal: False for ==, True for != */
    outcome = (op == Py_NE);
    return PyBool_FromLong(outcome);
}
6329 
/* Containment test: coerces element and container to Unicode and
   returns the result of stringlib_contains_obj() on the coerced pair.
   Returns -1 when either coercion fails. */
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    PyObject *needle;
    PyObject *haystack;
    int found = -1;

    /* Coerce the two arguments; the element goes first */
    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack != NULL) {
        found = stringlib_contains_obj(haystack, needle);
        Py_DECREF(haystack);
    }

    Py_DECREF(needle);
    return found;
}
6355 
6356 /* Concat to string or Unicode object giving a new Unicode object. */
6357 
PyUnicode_Concat(PyObject * left,PyObject * right)6358 PyObject *PyUnicode_Concat(PyObject *left,
6359                            PyObject *right)
6360 {
6361     PyUnicodeObject *u = NULL, *v = NULL, *w;
6362 
6363     /* Coerce the two arguments */
6364     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6365     if (u == NULL)
6366         goto onError;
6367     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6368     if (v == NULL)
6369         goto onError;
6370 
6371     /* Shortcuts */
6372     if (v == unicode_empty) {
6373         Py_DECREF(v);
6374         return (PyObject *)u;
6375     }
6376     if (u == unicode_empty) {
6377         Py_DECREF(u);
6378         return (PyObject *)v;
6379     }
6380 
6381     if (u->length > PY_SSIZE_T_MAX - v->length) {
6382         PyErr_SetString(PyExc_OverflowError,
6383                         "strings are too large to concat");
6384         goto onError;
6385     }
6386 
6387     /* Concat the two Unicode strings */
6388     w = _PyUnicode_New(u->length + v->length);
6389     if (w == NULL)
6390         goto onError;
6391     Py_UNICODE_COPY(w->str, u->str, u->length);
6392     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6393 
6394     Py_DECREF(u);
6395     Py_DECREF(v);
6396     return (PyObject *)w;
6397 
6398   onError:
6399     Py_XDECREF(u);
6400     Py_XDECREF(v);
6401     return NULL;
6402 }
6403 
6404 PyDoc_STRVAR(count__doc__,
6405              "S.count(sub[, start[, end]]) -> int\n\
6406 \n\
6407 Return the number of non-overlapping occurrences of substring sub in\n\
6408 Unicode string S[start:end].  Optional arguments start and end are\n\
6409 interpreted as in slice notation.");
6410 
6411 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6412 unicode_count(PyUnicodeObject *self, PyObject *args)
6413 {
6414     PyUnicodeObject *substring;
6415     Py_ssize_t start = 0;
6416     Py_ssize_t end = PY_SSIZE_T_MAX;
6417     PyObject *result;
6418 
6419     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6420                                             &start, &end))
6421         return NULL;
6422 
6423     ADJUST_INDICES(start, end, self->length);
6424     result = PyInt_FromSsize_t(
6425         stringlib_count(self->str + start, end - start,
6426                         substring->str, substring->length,
6427                         PY_SSIZE_T_MAX)
6428         );
6429 
6430     Py_DECREF(substring);
6431 
6432     return result;
6433 }
6434 
6435 PyDoc_STRVAR(encode__doc__,
6436              "S.encode([encoding[,errors]]) -> string or unicode\n\
6437 \n\
6438 Encodes S using the codec registered for encoding. encoding defaults\n\
6439 to the default encoding. errors may be given to set a different error\n\
6440 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6441 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6442 'xmlcharrefreplace' as well as any other name registered with\n\
6443 codecs.register_error that can handle UnicodeEncodeErrors.");
6444 
6445 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6446 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6447 {
6448     static char *kwlist[] = {"encoding", "errors", 0};
6449     char *encoding = NULL;
6450     char *errors = NULL;
6451     PyObject *v;
6452 
6453     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6454                                      kwlist, &encoding, &errors))
6455         return NULL;
6456     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6457     if (v == NULL)
6458         goto onError;
6459     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6460         PyErr_Format(PyExc_TypeError,
6461                      "encoder did not return a string/unicode object "
6462                      "(type=%.400s)",
6463                      Py_TYPE(v)->tp_name);
6464         Py_DECREF(v);
6465         return NULL;
6466     }
6467     return v;
6468 
6469   onError:
6470     return NULL;
6471 }
6472 
6473 PyDoc_STRVAR(decode__doc__,
6474              "S.decode([encoding[,errors]]) -> string or unicode\n\
6475 \n\
6476 Decodes S using the codec registered for encoding. encoding defaults\n\
6477 to the default encoding. errors may be given to set a different error\n\
6478 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6479 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6480 as well as any other name registered with codecs.register_error that is\n\
6481 able to handle UnicodeDecodeErrors.");
6482 
6483 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6484 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6485 {
6486     static char *kwlist[] = {"encoding", "errors", 0};
6487     char *encoding = NULL;
6488     char *errors = NULL;
6489     PyObject *v;
6490 
6491     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6492                                      kwlist, &encoding, &errors))
6493         return NULL;
6494     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6495     if (v == NULL)
6496         goto onError;
6497     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6498         PyErr_Format(PyExc_TypeError,
6499                      "decoder did not return a string/unicode object "
6500                      "(type=%.400s)",
6501                      Py_TYPE(v)->tp_name);
6502         Py_DECREF(v);
6503         return NULL;
6504     }
6505     return v;
6506 
6507   onError:
6508     return NULL;
6509 }
6510 
6511 PyDoc_STRVAR(expandtabs__doc__,
6512              "S.expandtabs([tabsize]) -> unicode\n\
6513 \n\
6514 Return a copy of S where all tab characters are expanded using spaces.\n\
6515 If tabsize is not given, a tab size of 8 characters is assumed.");
6516 
6517 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6518 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6519 {
6520     Py_UNICODE *e;
6521     Py_UNICODE *p;
6522     Py_UNICODE *q;
6523     Py_UNICODE *qe;
6524     Py_ssize_t i, j, incr;
6525     PyUnicodeObject *u;
6526     int tabsize = 8;
6527 
6528     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6529         return NULL;
6530 
6531     /* First pass: determine size of output string */
6532     i = 0; /* chars up to and including most recent \n or \r */
6533     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6534     e = self->str + self->length; /* end of input */
6535     for (p = self->str; p < e; p++)
6536         if (*p == '\t') {
6537             if (tabsize > 0) {
6538                 incr = tabsize - (j % tabsize); /* cannot overflow */
6539                 if (j > PY_SSIZE_T_MAX - incr)
6540                     goto overflow1;
6541                 j += incr;
6542             }
6543         }
6544         else {
6545             if (j > PY_SSIZE_T_MAX - 1)
6546                 goto overflow1;
6547             j++;
6548             if (*p == '\n' || *p == '\r') {
6549                 if (i > PY_SSIZE_T_MAX - j)
6550                     goto overflow1;
6551                 i += j;
6552                 j = 0;
6553             }
6554         }
6555 
6556     if (i > PY_SSIZE_T_MAX - j)
6557         goto overflow1;
6558 
6559     /* Second pass: create output string and fill it */
6560     u = _PyUnicode_New(i + j);
6561     if (!u)
6562         return NULL;
6563 
6564     j = 0; /* same as in first pass */
6565     q = u->str; /* next output char */
6566     qe = u->str + u->length; /* end of output */
6567 
6568     for (p = self->str; p < e; p++)
6569         if (*p == '\t') {
6570             if (tabsize > 0) {
6571                 i = tabsize - (j % tabsize);
6572                 j += i;
6573                 while (i--) {
6574                     if (q >= qe)
6575                         goto overflow2;
6576                     *q++ = ' ';
6577                 }
6578             }
6579         }
6580         else {
6581             if (q >= qe)
6582                 goto overflow2;
6583             *q++ = *p;
6584             j++;
6585             if (*p == '\n' || *p == '\r')
6586                 j = 0;
6587         }
6588 
6589     return (PyObject*) u;
6590 
6591   overflow2:
6592     Py_DECREF(u);
6593   overflow1:
6594     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6595     return NULL;
6596 }
6597 
6598 PyDoc_STRVAR(find__doc__,
6599              "S.find(sub [,start [,end]]) -> int\n\
6600 \n\
6601 Return the lowest index in S where substring sub is found,\n\
6602 such that sub is contained within S[start:end].  Optional\n\
6603 arguments start and end are interpreted as in slice notation.\n\
6604 \n\
6605 Return -1 on failure.");
6606 
6607 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6608 unicode_find(PyUnicodeObject *self, PyObject *args)
6609 {
6610     PyUnicodeObject *substring;
6611     Py_ssize_t start;
6612     Py_ssize_t end;
6613     Py_ssize_t result;
6614 
6615     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6616                                             &start, &end))
6617         return NULL;
6618 
6619     result = stringlib_find_slice(
6620         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6621         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6622         start, end
6623         );
6624 
6625     Py_DECREF(substring);
6626 
6627     return PyInt_FromSsize_t(result);
6628 }
6629 
6630 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6631 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6632 {
6633     if (index < 0 || index >= self->length) {
6634         PyErr_SetString(PyExc_IndexError, "string index out of range");
6635         return NULL;
6636     }
6637 
6638     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6639 }
6640 
6641 static long
unicode_hash(PyUnicodeObject * self)6642 unicode_hash(PyUnicodeObject *self)
6643 {
6644     /* Since Unicode objects compare equal to their ASCII string
6645        counterparts, they should use the individual character values
6646        as basis for their hash value.  This is needed to assure that
6647        strings and Unicode objects behave in the same way as
6648        dictionary keys. */
6649 
6650     register Py_ssize_t len;
6651     register Py_UNICODE *p;
6652     register long x;
6653 
6654 #ifdef Py_DEBUG
6655     assert(_Py_HashSecret_Initialized);
6656 #endif
6657     if (self->hash != -1)
6658         return self->hash;
6659     len = PyUnicode_GET_SIZE(self);
6660     /*
6661       We make the hash of the empty string be 0, rather than using
6662       (prefix ^ suffix), since this slightly obfuscates the hash secret
6663     */
6664     if (len == 0) {
6665         self->hash = 0;
6666         return 0;
6667     }
6668     p = PyUnicode_AS_UNICODE(self);
6669     x = _Py_HashSecret.prefix;
6670     x ^= *p << 7;
6671     while (--len >= 0)
6672         x = (1000003*x) ^ *p++;
6673     x ^= PyUnicode_GET_SIZE(self);
6674     x ^= _Py_HashSecret.suffix;
6675     if (x == -1)
6676         x = -2;
6677     self->hash = x;
6678     return x;
6679 }
6680 
6681 PyDoc_STRVAR(index__doc__,
6682              "S.index(sub [,start [,end]]) -> int\n\
6683 \n\
6684 Like S.find() but raise ValueError when the substring is not found.");
6685 
6686 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6687 unicode_index(PyUnicodeObject *self, PyObject *args)
6688 {
6689     Py_ssize_t result;
6690     PyUnicodeObject *substring;
6691     Py_ssize_t start;
6692     Py_ssize_t end;
6693 
6694     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6695                                             &start, &end))
6696         return NULL;
6697 
6698     result = stringlib_find_slice(
6699         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6700         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6701         start, end
6702         );
6703 
6704     Py_DECREF(substring);
6705 
6706     if (result < 0) {
6707         PyErr_SetString(PyExc_ValueError, "substring not found");
6708         return NULL;
6709     }
6710 
6711     return PyInt_FromSsize_t(result);
6712 }
6713 
6714 PyDoc_STRVAR(islower__doc__,
6715              "S.islower() -> bool\n\
6716 \n\
6717 Return True if all cased characters in S are lowercase and there is\n\
6718 at least one cased character in S, False otherwise.");
6719 
6720 static PyObject*
unicode_islower(PyUnicodeObject * self)6721 unicode_islower(PyUnicodeObject *self)
6722 {
6723     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6724     register const Py_UNICODE *e;
6725     int cased;
6726 
6727     /* Shortcut for single character strings */
6728     if (PyUnicode_GET_SIZE(self) == 1)
6729         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6730 
6731     /* Special case for empty strings */
6732     if (PyUnicode_GET_SIZE(self) == 0)
6733         return PyBool_FromLong(0);
6734 
6735     e = p + PyUnicode_GET_SIZE(self);
6736     cased = 0;
6737     for (; p < e; p++) {
6738         register const Py_UNICODE ch = *p;
6739 
6740         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6741             return PyBool_FromLong(0);
6742         else if (!cased && Py_UNICODE_ISLOWER(ch))
6743             cased = 1;
6744     }
6745     return PyBool_FromLong(cased);
6746 }
6747 
6748 PyDoc_STRVAR(isupper__doc__,
6749              "S.isupper() -> bool\n\
6750 \n\
6751 Return True if all cased characters in S are uppercase and there is\n\
6752 at least one cased character in S, False otherwise.");
6753 
6754 static PyObject*
unicode_isupper(PyUnicodeObject * self)6755 unicode_isupper(PyUnicodeObject *self)
6756 {
6757     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758     register const Py_UNICODE *e;
6759     int cased;
6760 
6761     /* Shortcut for single character strings */
6762     if (PyUnicode_GET_SIZE(self) == 1)
6763         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6764 
6765     /* Special case for empty strings */
6766     if (PyUnicode_GET_SIZE(self) == 0)
6767         return PyBool_FromLong(0);
6768 
6769     e = p + PyUnicode_GET_SIZE(self);
6770     cased = 0;
6771     for (; p < e; p++) {
6772         register const Py_UNICODE ch = *p;
6773 
6774         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6775             return PyBool_FromLong(0);
6776         else if (!cased && Py_UNICODE_ISUPPER(ch))
6777             cased = 1;
6778     }
6779     return PyBool_FromLong(cased);
6780 }
6781 
6782 PyDoc_STRVAR(istitle__doc__,
6783              "S.istitle() -> bool\n\
6784 \n\
6785 Return True if S is a titlecased string and there is at least one\n\
6786 character in S, i.e. upper- and titlecase characters may only\n\
6787 follow uncased characters and lowercase characters only cased ones.\n\
6788 Return False otherwise.");
6789 
6790 static PyObject*
unicode_istitle(PyUnicodeObject * self)6791 unicode_istitle(PyUnicodeObject *self)
6792 {
6793     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6794     register const Py_UNICODE *e;
6795     int cased, previous_is_cased;
6796 
6797     /* Shortcut for single character strings */
6798     if (PyUnicode_GET_SIZE(self) == 1)
6799         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6800                                (Py_UNICODE_ISUPPER(*p) != 0));
6801 
6802     /* Special case for empty strings */
6803     if (PyUnicode_GET_SIZE(self) == 0)
6804         return PyBool_FromLong(0);
6805 
6806     e = p + PyUnicode_GET_SIZE(self);
6807     cased = 0;
6808     previous_is_cased = 0;
6809     for (; p < e; p++) {
6810         register const Py_UNICODE ch = *p;
6811 
6812         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6813             if (previous_is_cased)
6814                 return PyBool_FromLong(0);
6815             previous_is_cased = 1;
6816             cased = 1;
6817         }
6818         else if (Py_UNICODE_ISLOWER(ch)) {
6819             if (!previous_is_cased)
6820                 return PyBool_FromLong(0);
6821             previous_is_cased = 1;
6822             cased = 1;
6823         }
6824         else
6825             previous_is_cased = 0;
6826     }
6827     return PyBool_FromLong(cased);
6828 }
6829 
6830 PyDoc_STRVAR(isspace__doc__,
6831              "S.isspace() -> bool\n\
6832 \n\
6833 Return True if all characters in S are whitespace\n\
6834 and there is at least one character in S, False otherwise.");
6835 
6836 static PyObject*
unicode_isspace(PyUnicodeObject * self)6837 unicode_isspace(PyUnicodeObject *self)
6838 {
6839     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840     register const Py_UNICODE *e;
6841 
6842     /* Shortcut for single character strings */
6843     if (PyUnicode_GET_SIZE(self) == 1 &&
6844         Py_UNICODE_ISSPACE(*p))
6845         return PyBool_FromLong(1);
6846 
6847     /* Special case for empty strings */
6848     if (PyUnicode_GET_SIZE(self) == 0)
6849         return PyBool_FromLong(0);
6850 
6851     e = p + PyUnicode_GET_SIZE(self);
6852     for (; p < e; p++) {
6853         if (!Py_UNICODE_ISSPACE(*p))
6854             return PyBool_FromLong(0);
6855     }
6856     return PyBool_FromLong(1);
6857 }
6858 
6859 PyDoc_STRVAR(isalpha__doc__,
6860              "S.isalpha() -> bool\n\
6861 \n\
6862 Return True if all characters in S are alphabetic\n\
6863 and there is at least one character in S, False otherwise.");
6864 
6865 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6866 unicode_isalpha(PyUnicodeObject *self)
6867 {
6868     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869     register const Py_UNICODE *e;
6870 
6871     /* Shortcut for single character strings */
6872     if (PyUnicode_GET_SIZE(self) == 1 &&
6873         Py_UNICODE_ISALPHA(*p))
6874         return PyBool_FromLong(1);
6875 
6876     /* Special case for empty strings */
6877     if (PyUnicode_GET_SIZE(self) == 0)
6878         return PyBool_FromLong(0);
6879 
6880     e = p + PyUnicode_GET_SIZE(self);
6881     for (; p < e; p++) {
6882         if (!Py_UNICODE_ISALPHA(*p))
6883             return PyBool_FromLong(0);
6884     }
6885     return PyBool_FromLong(1);
6886 }
6887 
6888 PyDoc_STRVAR(isalnum__doc__,
6889              "S.isalnum() -> bool\n\
6890 \n\
6891 Return True if all characters in S are alphanumeric\n\
6892 and there is at least one character in S, False otherwise.");
6893 
6894 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6895 unicode_isalnum(PyUnicodeObject *self)
6896 {
6897     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6898     register const Py_UNICODE *e;
6899 
6900     /* Shortcut for single character strings */
6901     if (PyUnicode_GET_SIZE(self) == 1 &&
6902         Py_UNICODE_ISALNUM(*p))
6903         return PyBool_FromLong(1);
6904 
6905     /* Special case for empty strings */
6906     if (PyUnicode_GET_SIZE(self) == 0)
6907         return PyBool_FromLong(0);
6908 
6909     e = p + PyUnicode_GET_SIZE(self);
6910     for (; p < e; p++) {
6911         if (!Py_UNICODE_ISALNUM(*p))
6912             return PyBool_FromLong(0);
6913     }
6914     return PyBool_FromLong(1);
6915 }
6916 
6917 PyDoc_STRVAR(isdecimal__doc__,
6918              "S.isdecimal() -> bool\n\
6919 \n\
6920 Return True if there are only decimal characters in S,\n\
6921 False otherwise.");
6922 
6923 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6924 unicode_isdecimal(PyUnicodeObject *self)
6925 {
6926     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927     register const Py_UNICODE *e;
6928 
6929     /* Shortcut for single character strings */
6930     if (PyUnicode_GET_SIZE(self) == 1 &&
6931         Py_UNICODE_ISDECIMAL(*p))
6932         return PyBool_FromLong(1);
6933 
6934     /* Special case for empty strings */
6935     if (PyUnicode_GET_SIZE(self) == 0)
6936         return PyBool_FromLong(0);
6937 
6938     e = p + PyUnicode_GET_SIZE(self);
6939     for (; p < e; p++) {
6940         if (!Py_UNICODE_ISDECIMAL(*p))
6941             return PyBool_FromLong(0);
6942     }
6943     return PyBool_FromLong(1);
6944 }
6945 
6946 PyDoc_STRVAR(isdigit__doc__,
6947              "S.isdigit() -> bool\n\
6948 \n\
6949 Return True if all characters in S are digits\n\
6950 and there is at least one character in S, False otherwise.");
6951 
6952 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6953 unicode_isdigit(PyUnicodeObject *self)
6954 {
6955     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6956     register const Py_UNICODE *e;
6957 
6958     /* Shortcut for single character strings */
6959     if (PyUnicode_GET_SIZE(self) == 1 &&
6960         Py_UNICODE_ISDIGIT(*p))
6961         return PyBool_FromLong(1);
6962 
6963     /* Special case for empty strings */
6964     if (PyUnicode_GET_SIZE(self) == 0)
6965         return PyBool_FromLong(0);
6966 
6967     e = p + PyUnicode_GET_SIZE(self);
6968     for (; p < e; p++) {
6969         if (!Py_UNICODE_ISDIGIT(*p))
6970             return PyBool_FromLong(0);
6971     }
6972     return PyBool_FromLong(1);
6973 }
6974 
6975 PyDoc_STRVAR(isnumeric__doc__,
6976              "S.isnumeric() -> bool\n\
6977 \n\
6978 Return True if there are only numeric characters in S,\n\
6979 False otherwise.");
6980 
6981 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6982 unicode_isnumeric(PyUnicodeObject *self)
6983 {
6984     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6985     register const Py_UNICODE *e;
6986 
6987     /* Shortcut for single character strings */
6988     if (PyUnicode_GET_SIZE(self) == 1 &&
6989         Py_UNICODE_ISNUMERIC(*p))
6990         return PyBool_FromLong(1);
6991 
6992     /* Special case for empty strings */
6993     if (PyUnicode_GET_SIZE(self) == 0)
6994         return PyBool_FromLong(0);
6995 
6996     e = p + PyUnicode_GET_SIZE(self);
6997     for (; p < e; p++) {
6998         if (!Py_UNICODE_ISNUMERIC(*p))
6999             return PyBool_FromLong(0);
7000     }
7001     return PyBool_FromLong(1);
7002 }
7003 
7004 PyDoc_STRVAR(join__doc__,
7005              "S.join(iterable) -> unicode\n\
7006 \n\
7007 Return a string which is the concatenation of the strings in the\n\
7008 iterable.  The separator between elements is S.");
7009 
7010 static PyObject*
unicode_join(PyObject * self,PyObject * data)7011 unicode_join(PyObject *self, PyObject *data)
7012 {
7013     return PyUnicode_Join(self, data);
7014 }
7015 
7016 static Py_ssize_t
unicode_length(PyUnicodeObject * self)7017 unicode_length(PyUnicodeObject *self)
7018 {
7019     return self->length;
7020 }
7021 
7022 PyDoc_STRVAR(ljust__doc__,
7023              "S.ljust(width[, fillchar]) -> int\n\
7024 \n\
7025 Return S left-justified in a Unicode string of length width. Padding is\n\
7026 done using the specified fill character (default is a space).");
7027 
7028 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)7029 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7030 {
7031     Py_ssize_t width;
7032     Py_UNICODE fillchar = ' ';
7033 
7034     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7035         return NULL;
7036 
7037     if (self->length >= width && PyUnicode_CheckExact(self)) {
7038         Py_INCREF(self);
7039         return (PyObject*) self;
7040     }
7041 
7042     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7043 }
7044 
7045 PyDoc_STRVAR(lower__doc__,
7046              "S.lower() -> unicode\n\
7047 \n\
7048 Return a copy of the string S converted to lowercase.");
7049 
7050 static PyObject*
unicode_lower(PyUnicodeObject * self)7051 unicode_lower(PyUnicodeObject *self)
7052 {
7053     return fixup(self, fixlower);
7054 }
7055 
/* Strip-direction selectors, passed as the striptype argument of the strip
   helpers that follow. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7064 
7065 /* externally visible for str.strip(unicode) */
7066 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)7067 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7068 {
7069     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7070     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7071     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7072     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7073     Py_ssize_t i, j;
7074 
7075     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7076 
7077     i = 0;
7078     if (striptype != RIGHTSTRIP) {
7079         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7080             i++;
7081         }
7082     }
7083 
7084     j = len;
7085     if (striptype != LEFTSTRIP) {
7086         do {
7087             j--;
7088         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7089         j++;
7090     }
7091 
7092     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7093         Py_INCREF(self);
7094         return (PyObject*)self;
7095     }
7096     else
7097         return PyUnicode_FromUnicode(s+i, j-i);
7098 }
7099 
7100 
7101 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)7102 do_strip(PyUnicodeObject *self, int striptype)
7103 {
7104     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7105     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7106 
7107     i = 0;
7108     if (striptype != RIGHTSTRIP) {
7109         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7110             i++;
7111         }
7112     }
7113 
7114     j = len;
7115     if (striptype != LEFTSTRIP) {
7116         do {
7117             j--;
7118         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7119         j++;
7120     }
7121 
7122     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7123         Py_INCREF(self);
7124         return (PyObject*)self;
7125     }
7126     else
7127         return PyUnicode_FromUnicode(s+i, j-i);
7128 }
7129 
7130 
7131 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7132 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7133 {
7134     PyObject *sep = NULL;
7135 
7136     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7137         return NULL;
7138 
7139     if (sep != NULL && sep != Py_None) {
7140         if (PyUnicode_Check(sep))
7141             return _PyUnicode_XStrip(self, striptype, sep);
7142         else if (PyString_Check(sep)) {
7143             PyObject *res;
7144             sep = PyUnicode_FromObject(sep);
7145             if (sep==NULL)
7146                 return NULL;
7147             res = _PyUnicode_XStrip(self, striptype, sep);
7148             Py_DECREF(sep);
7149             return res;
7150         }
7151         else {
7152             PyErr_Format(PyExc_TypeError,
7153                          "%s arg must be None, unicode or str",
7154                          STRIPNAME(striptype));
7155             return NULL;
7156         }
7157     }
7158 
7159     return do_strip(self, striptype);
7160 }
7161 
7162 
7163 PyDoc_STRVAR(strip__doc__,
7164              "S.strip([chars]) -> unicode\n\
7165 \n\
7166 Return a copy of the string S with leading and trailing\n\
7167 whitespace removed.\n\
7168 If chars is given and not None, remove characters in chars instead.\n\
7169 If chars is a str, it will be converted to unicode before stripping");
7170 
7171 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7172 unicode_strip(PyUnicodeObject *self, PyObject *args)
7173 {
7174     if (PyTuple_GET_SIZE(args) == 0)
7175         return do_strip(self, BOTHSTRIP); /* Common case */
7176     else
7177         return do_argstrip(self, BOTHSTRIP, args);
7178 }
7179 
7180 
7181 PyDoc_STRVAR(lstrip__doc__,
7182              "S.lstrip([chars]) -> unicode\n\
7183 \n\
7184 Return a copy of the string S with leading whitespace removed.\n\
7185 If chars is given and not None, remove characters in chars instead.\n\
7186 If chars is a str, it will be converted to unicode before stripping");
7187 
7188 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7189 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7190 {
7191     if (PyTuple_GET_SIZE(args) == 0)
7192         return do_strip(self, LEFTSTRIP); /* Common case */
7193     else
7194         return do_argstrip(self, LEFTSTRIP, args);
7195 }
7196 
7197 
7198 PyDoc_STRVAR(rstrip__doc__,
7199              "S.rstrip([chars]) -> unicode\n\
7200 \n\
7201 Return a copy of the string S with trailing whitespace removed.\n\
7202 If chars is given and not None, remove characters in chars instead.\n\
7203 If chars is a str, it will be converted to unicode before stripping");
7204 
7205 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7206 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7207 {
7208     if (PyTuple_GET_SIZE(args) == 0)
7209         return do_strip(self, RIGHTSTRIP); /* Common case */
7210     else
7211         return do_argstrip(self, RIGHTSTRIP, args);
7212 }
7213 
7214 
7215 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7216 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7217 {
7218     PyUnicodeObject *u;
7219     Py_UNICODE *p;
7220     Py_ssize_t nchars;
7221     size_t nbytes;
7222 
7223     if (len < 0)
7224         len = 0;
7225 
7226     if (len == 1 && PyUnicode_CheckExact(str)) {
7227         /* no repeat, return original string */
7228         Py_INCREF(str);
7229         return (PyObject*) str;
7230     }
7231 
7232     /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
7233      * needed doesn't overflow size_t
7234      */
7235     if (len && str->length > PY_SSIZE_T_MAX / len) {
7236         PyErr_SetString(PyExc_OverflowError,
7237                         "repeated string is too long");
7238         return NULL;
7239     }
7240     nchars = len * str->length;
7241     nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
7242     if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
7243         PyErr_SetString(PyExc_OverflowError,
7244                         "repeated string is too long");
7245         return NULL;
7246     }
7247     u = _PyUnicode_New(nchars);
7248     if (!u)
7249         return NULL;
7250 
7251     p = u->str;
7252 
7253     if (str->length == 1 && len > 0) {
7254         Py_UNICODE_FILL(p, str->str[0], len);
7255     } else {
7256         Py_ssize_t done = 0; /* number of characters copied this far */
7257         if (done < nchars) {
7258             Py_UNICODE_COPY(p, str->str, str->length);
7259             done = str->length;
7260         }
7261         while (done < nchars) {
7262             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7263             Py_UNICODE_COPY(p+done, p, n);
7264             done += n;
7265         }
7266     }
7267 
7268     return (PyObject*) u;
7269 }
7270 
/* Public replace entry point.

   Each of obj, subobj and replobj is coerced with PyUnicode_FromObject;
   the three coerced objects are then handed to replace() together with
   maxcount.  Every coerced temporary is released before returning.
   Returns NULL when any coercion fails, otherwise whatever replace()
   returned. */
PyObject *
PyUnicode_Replace(PyObject *obj,
                  PyObject *subobj,
                  PyObject *replobj,
                  Py_ssize_t maxcount)
{
    PyObject *text;
    PyObject *old = NULL;
    PyObject *repl = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        return NULL;

    /* Coerce the remaining operands, stopping at the first failure. */
    old = PyUnicode_FromObject(subobj);
    if (old != NULL)
        repl = PyUnicode_FromObject(replobj);

    if (repl != NULL) {
        result = replace((PyUnicodeObject *)text,
                         (PyUnicodeObject *)old,
                         (PyUnicodeObject *)repl,
                         maxcount);
    }

    /* Drop whichever temporaries were actually created. */
    Py_DECREF(text);
    if (old != NULL) {
        Py_DECREF(old);
    }
    if (repl != NULL) {
        Py_DECREF(repl);
    }
    return result;
}
7304 
7305 PyDoc_STRVAR(replace__doc__,
7306              "S.replace(old, new[, count]) -> unicode\n\
7307 \n\
7308 Return a copy of S with all occurrences of substring\n\
7309 old replaced by new.  If the optional argument count is\n\
7310 given, only the first count occurrences are replaced.");
7311 
7312 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7313 unicode_replace(PyUnicodeObject *self, PyObject *args)
7314 {
7315     PyUnicodeObject *str1;
7316     PyUnicodeObject *str2;
7317     Py_ssize_t maxcount = -1;
7318     PyObject *result;
7319 
7320     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7321         return NULL;
7322     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7323     if (str1 == NULL)
7324         return NULL;
7325     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7326     if (str2 == NULL) {
7327         Py_DECREF(str1);
7328         return NULL;
7329     }
7330 
7331     result = replace(self, str1, str2, maxcount);
7332 
7333     Py_DECREF(str1);
7334     Py_DECREF(str2);
7335     return result;
7336 }
7337 
7338 static
unicode_repr(PyObject * unicode)7339 PyObject *unicode_repr(PyObject *unicode)
7340 {
7341     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7342                                 PyUnicode_GET_SIZE(unicode),
7343                                 1);
7344 }
7345 
7346 PyDoc_STRVAR(rfind__doc__,
7347              "S.rfind(sub [,start [,end]]) -> int\n\
7348 \n\
7349 Return the highest index in S where substring sub is found,\n\
7350 such that sub is contained within S[start:end].  Optional\n\
7351 arguments start and end are interpreted as in slice notation.\n\
7352 \n\
7353 Return -1 on failure.");
7354 
7355 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7356 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7357 {
7358     PyUnicodeObject *substring;
7359     Py_ssize_t start;
7360     Py_ssize_t end;
7361     Py_ssize_t result;
7362 
7363     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7364                                             &start, &end))
7365         return NULL;
7366 
7367     result = stringlib_rfind_slice(
7368         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7369         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7370         start, end
7371         );
7372 
7373     Py_DECREF(substring);
7374 
7375     return PyInt_FromSsize_t(result);
7376 }
7377 
7378 PyDoc_STRVAR(rindex__doc__,
7379              "S.rindex(sub [,start [,end]]) -> int\n\
7380 \n\
7381 Like S.rfind() but raise ValueError when the substring is not found.");
7382 
7383 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7384 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7385 {
7386     PyUnicodeObject *substring;
7387     Py_ssize_t start;
7388     Py_ssize_t end;
7389     Py_ssize_t result;
7390 
7391     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7392                                             &start, &end))
7393         return NULL;
7394 
7395     result = stringlib_rfind_slice(
7396         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7397         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7398         start, end
7399         );
7400 
7401     Py_DECREF(substring);
7402 
7403     if (result < 0) {
7404         PyErr_SetString(PyExc_ValueError, "substring not found");
7405         return NULL;
7406     }
7407     return PyInt_FromSsize_t(result);
7408 }
7409 
7410 PyDoc_STRVAR(rjust__doc__,
7411              "S.rjust(width[, fillchar]) -> unicode\n\
7412 \n\
7413 Return S right-justified in a Unicode string of length width. Padding is\n\
7414 done using the specified fill character (default is a space).");
7415 
7416 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7417 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7418 {
7419     Py_ssize_t width;
7420     Py_UNICODE fillchar = ' ';
7421 
7422     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7423         return NULL;
7424 
7425     if (self->length >= width && PyUnicode_CheckExact(self)) {
7426         Py_INCREF(self);
7427         return (PyObject*) self;
7428     }
7429 
7430     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7431 }
7432 
7433 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7434 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7435 {
7436     /* standard clamping */
7437     if (start < 0)
7438         start = 0;
7439     if (end < 0)
7440         end = 0;
7441     if (end > self->length)
7442         end = self->length;
7443     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7444         /* full slice, return original string */
7445         Py_INCREF(self);
7446         return (PyObject*) self;
7447     }
7448     if (start > end)
7449         start = end;
7450     /* copy slice */
7451     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7452                                              end - start);
7453 }
7454 
/* Public split entry point.

   s is coerced with PyUnicode_FromObject; sep is coerced the same way
   unless it is NULL, in which case NULL is forwarded to split().  The
   coerced temporaries are released before returning.  Returns NULL when a
   coercion fails, otherwise whatever split() returned. */
PyObject *
PyUnicode_Split(PyObject *s,
                PyObject *sep,
                Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *pieces;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    pieces = split((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return pieces;
}
7478 
7479 PyDoc_STRVAR(split__doc__,
7480              "S.split([sep [,maxsplit]]) -> list of strings\n\
7481 \n\
7482 Return a list of the words in S, using sep as the\n\
7483 delimiter string.  If maxsplit is given, at most maxsplit\n\
7484 splits are done. If sep is not specified or is None, any\n\
7485 whitespace string is a separator and empty strings are\n\
7486 removed from the result.");
7487 
7488 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7489 unicode_split(PyUnicodeObject *self, PyObject *args)
7490 {
7491     PyObject *substring = Py_None;
7492     Py_ssize_t maxcount = -1;
7493 
7494     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7495         return NULL;
7496 
7497     if (substring == Py_None)
7498         return split(self, NULL, maxcount);
7499     else if (PyUnicode_Check(substring))
7500         return split(self, (PyUnicodeObject *)substring, maxcount);
7501     else
7502         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7503 }
7504 
7505 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7506 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7507 {
7508     PyObject* str_obj;
7509     PyObject* sep_obj;
7510     PyObject* out;
7511 
7512     str_obj = PyUnicode_FromObject(str_in);
7513     if (!str_obj)
7514         return NULL;
7515     sep_obj = PyUnicode_FromObject(sep_in);
7516     if (!sep_obj) {
7517         Py_DECREF(str_obj);
7518         return NULL;
7519     }
7520 
7521     out = stringlib_partition(
7522         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7523         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7524         );
7525 
7526     Py_DECREF(sep_obj);
7527     Py_DECREF(str_obj);
7528 
7529     return out;
7530 }
7531 
7532 
7533 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7534 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7535 {
7536     PyObject* str_obj;
7537     PyObject* sep_obj;
7538     PyObject* out;
7539 
7540     str_obj = PyUnicode_FromObject(str_in);
7541     if (!str_obj)
7542         return NULL;
7543     sep_obj = PyUnicode_FromObject(sep_in);
7544     if (!sep_obj) {
7545         Py_DECREF(str_obj);
7546         return NULL;
7547     }
7548 
7549     out = stringlib_rpartition(
7550         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7551         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7552         );
7553 
7554     Py_DECREF(sep_obj);
7555     Py_DECREF(str_obj);
7556 
7557     return out;
7558 }
7559 
7560 PyDoc_STRVAR(partition__doc__,
7561              "S.partition(sep) -> (head, sep, tail)\n\
7562 \n\
7563 Search for the separator sep in S, and return the part before it,\n\
7564 the separator itself, and the part after it.  If the separator is not\n\
7565 found, return S and two empty strings.");
7566 
7567 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7568 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7569 {
7570     return PyUnicode_Partition((PyObject *)self, separator);
7571 }
7572 
7573 PyDoc_STRVAR(rpartition__doc__,
7574              "S.rpartition(sep) -> (head, sep, tail)\n\
7575 \n\
7576 Search for the separator sep in S, starting at the end of S, and return\n\
7577 the part before it, the separator itself, and the part after it.  If the\n\
7578 separator is not found, return two empty strings and S.");
7579 
7580 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7581 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7582 {
7583     return PyUnicode_RPartition((PyObject *)self, separator);
7584 }
7585 
/* Public rsplit entry point.

   s is coerced with PyUnicode_FromObject; sep is coerced the same way
   unless it is NULL, in which case NULL is forwarded to rsplit().  The
   coerced temporaries are released before returning.  Returns NULL when a
   coercion fails, otherwise whatever rsplit() returned. */
PyObject *
PyUnicode_RSplit(PyObject *s,
                 PyObject *sep,
                 Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *pieces;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    pieces = rsplit((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return pieces;
}
7609 
7610 PyDoc_STRVAR(rsplit__doc__,
7611              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7612 \n\
7613 Return a list of the words in S, using sep as the\n\
7614 delimiter string, starting at the end of the string and\n\
7615 working to the front.  If maxsplit is given, at most maxsplit\n\
7616 splits are done. If sep is not specified, any whitespace string\n\
7617 is a separator.");
7618 
7619 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7620 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7621 {
7622     PyObject *substring = Py_None;
7623     Py_ssize_t maxcount = -1;
7624 
7625     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7626         return NULL;
7627 
7628     if (substring == Py_None)
7629         return rsplit(self, NULL, maxcount);
7630     else if (PyUnicode_Check(substring))
7631         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7632     else
7633         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7634 }
7635 
7636 PyDoc_STRVAR(splitlines__doc__,
7637              "S.splitlines(keepends=False) -> list of strings\n\
7638 \n\
7639 Return a list of the lines in S, breaking at line boundaries.\n\
7640 Line breaks are not included in the resulting list unless keepends\n\
7641 is given and true.");
7642 
7643 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7644 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7645 {
7646     int keepends = 0;
7647 
7648     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7649         return NULL;
7650 
7651     return PyUnicode_Splitlines((PyObject *)self, keepends);
7652 }
7653 
7654 static
unicode_str(PyUnicodeObject * self)7655 PyObject *unicode_str(PyUnicodeObject *self)
7656 {
7657     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7658 }
7659 
7660 PyDoc_STRVAR(swapcase__doc__,
7661              "S.swapcase() -> unicode\n\
7662 \n\
7663 Return a copy of S with uppercase characters converted to lowercase\n\
7664 and vice versa.");
7665 
7666 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7667 unicode_swapcase(PyUnicodeObject *self)
7668 {
7669     return fixup(self, fixswapcase);
7670 }
7671 
7672 PyDoc_STRVAR(translate__doc__,
7673              "S.translate(table) -> unicode\n\
7674 \n\
7675 Return a copy of the string S, where all characters have been mapped\n\
7676 through the given translation table, which must be a mapping of\n\
7677 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7678 Unmapped characters are left untouched. Characters mapped to None\n\
7679 are deleted.");
7680 
7681 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7682 unicode_translate(PyUnicodeObject *self, PyObject *table)
7683 {
7684     return PyUnicode_TranslateCharmap(self->str,
7685                                       self->length,
7686                                       table,
7687                                       "ignore");
7688 }
7689 
7690 PyDoc_STRVAR(upper__doc__,
7691              "S.upper() -> unicode\n\
7692 \n\
7693 Return a copy of S converted to uppercase.");
7694 
7695 static PyObject*
unicode_upper(PyUnicodeObject * self)7696 unicode_upper(PyUnicodeObject *self)
7697 {
7698     return fixup(self, fixupper);
7699 }
7700 
7701 PyDoc_STRVAR(zfill__doc__,
7702              "S.zfill(width) -> unicode\n\
7703 \n\
7704 Pad a numeric string S with zeros on the left, to fill a field\n\
7705 of the specified width. The string S is never truncated.");
7706 
7707 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7708 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7709 {
7710     Py_ssize_t fill;
7711     PyUnicodeObject *u;
7712 
7713     Py_ssize_t width;
7714     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7715         return NULL;
7716 
7717     if (self->length >= width) {
7718         if (PyUnicode_CheckExact(self)) {
7719             Py_INCREF(self);
7720             return (PyObject*) self;
7721         }
7722         else
7723             return PyUnicode_FromUnicode(
7724                 PyUnicode_AS_UNICODE(self),
7725                 PyUnicode_GET_SIZE(self)
7726                 );
7727     }
7728 
7729     fill = width - self->length;
7730 
7731     u = pad(self, fill, 0, '0');
7732 
7733     if (u == NULL)
7734         return NULL;
7735 
7736     if (u->str[fill] == '+' || u->str[fill] == '-') {
7737         /* move sign to beginning of string */
7738         u->str[0] = u->str[fill];
7739         u->str[fill] = '0';
7740     }
7741 
7742     return (PyObject*) u;
7743 }
7744 
/* Compiled out by the preprocessor (#if 0): helper that would return the
   file-level numfree counter as a Python int. */
#if 0
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7752 
7753 PyDoc_STRVAR(startswith__doc__,
7754              "S.startswith(prefix[, start[, end]]) -> bool\n\
7755 \n\
7756 Return True if S starts with the specified prefix, False otherwise.\n\
7757 With optional start, test S beginning at that position.\n\
7758 With optional end, stop comparing S at that position.\n\
7759 prefix can also be a tuple of strings to try.");
7760 
7761 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7762 unicode_startswith(PyUnicodeObject *self,
7763                    PyObject *args)
7764 {
7765     PyObject *subobj;
7766     PyUnicodeObject *substring;
7767     Py_ssize_t start = 0;
7768     Py_ssize_t end = PY_SSIZE_T_MAX;
7769     int result;
7770 
7771     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7772         return NULL;
7773     if (PyTuple_Check(subobj)) {
7774         Py_ssize_t i;
7775         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7776             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7777                 PyTuple_GET_ITEM(subobj, i));
7778             if (substring == NULL)
7779                 return NULL;
7780             result = tailmatch(self, substring, start, end, -1);
7781             Py_DECREF(substring);
7782             if (result) {
7783                 Py_RETURN_TRUE;
7784             }
7785         }
7786         /* nothing matched */
7787         Py_RETURN_FALSE;
7788     }
7789     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7790     if (substring == NULL) {
7791         if (PyErr_ExceptionMatches(PyExc_TypeError))
7792             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7793                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7794         return NULL;
7795     }
7796     result = tailmatch(self, substring, start, end, -1);
7797     Py_DECREF(substring);
7798     return PyBool_FromLong(result);
7799 }
7800 
7801 
7802 PyDoc_STRVAR(endswith__doc__,
7803              "S.endswith(suffix[, start[, end]]) -> bool\n\
7804 \n\
7805 Return True if S ends with the specified suffix, False otherwise.\n\
7806 With optional start, test S beginning at that position.\n\
7807 With optional end, stop comparing S at that position.\n\
7808 suffix can also be a tuple of strings to try.");
7809 
7810 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7811 unicode_endswith(PyUnicodeObject *self,
7812                  PyObject *args)
7813 {
7814     PyObject *subobj;
7815     PyUnicodeObject *substring;
7816     Py_ssize_t start = 0;
7817     Py_ssize_t end = PY_SSIZE_T_MAX;
7818     int result;
7819 
7820     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7821         return NULL;
7822     if (PyTuple_Check(subobj)) {
7823         Py_ssize_t i;
7824         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7826                 PyTuple_GET_ITEM(subobj, i));
7827             if (substring == NULL)
7828                 return NULL;
7829             result = tailmatch(self, substring, start, end, +1);
7830             Py_DECREF(substring);
7831             if (result) {
7832                 Py_RETURN_TRUE;
7833             }
7834         }
7835         Py_RETURN_FALSE;
7836     }
7837     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7838     if (substring == NULL) {
7839         if (PyErr_ExceptionMatches(PyExc_TypeError))
7840             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7841                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7842         return NULL;
7843     }
7844     result = tailmatch(self, substring, start, end, +1);
7845     Py_DECREF(substring);
7846     return PyBool_FromLong(result);
7847 }
7848 
7849 
7850 /* Implements do_string_format, which is unicode because of stringlib */
7851 #include "stringlib/string_format.h"
7852 
7853 PyDoc_STRVAR(format__doc__,
7854              "S.format(*args, **kwargs) -> unicode\n\
7855 \n\
7856 Return a formatted version of S, using substitutions from args and kwargs.\n\
7857 The substitutions are identified by braces ('{' and '}').");
7858 
7859 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7860 unicode__format__(PyObject *self, PyObject *args)
7861 {
7862     PyObject *format_spec;
7863     PyObject *result = NULL;
7864     PyObject *tmp = NULL;
7865 
7866     /* If 2.x, convert format_spec to the same type as value */
7867     /* This is to allow things like u''.format('') */
7868     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7869         goto done;
7870     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7871         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7872                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7873         goto done;
7874     }
7875     tmp = PyObject_Unicode(format_spec);
7876     if (tmp == NULL)
7877         goto done;
7878     format_spec = tmp;
7879 
7880     result = _PyUnicode_FormatAdvanced(self,
7881                                        PyUnicode_AS_UNICODE(format_spec),
7882                                        PyUnicode_GET_SIZE(format_spec));
7883   done:
7884     Py_XDECREF(tmp);
7885     return result;
7886 }
7887 
/* Docstring attached to the "__format__" entry of unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> unicode\n\
\n\
Return a formatted version of S as described by format_spec.");
7892 
7893 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7894 unicode__sizeof__(PyUnicodeObject *v)
7895 {
7896     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7897                              sizeof(Py_UNICODE) * (v->length + 1));
7898 }
7899 
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes\n\
\n\
");
7904 
7905 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7906 unicode_getnewargs(PyUnicodeObject *v)
7907 {
7908     return Py_BuildValue("(u#)", v->str, v->length);
7909 }
7910 
7911 
/* Method table for unicode objects.

   Each entry gives the Python-visible method name, the C function that
   implements it (cast to PyCFunction), the METH_* calling-convention flags
   and, where present, the docstring.  The table ends with a {NULL, NULL}
   sentinel.  Entries inside "#if 0" and the commented-out "maketrans" line
   are not compiled.
   NOTE(review): presumably installed through the unicode type object's
   tp_methods slot; the type object is not visible in this part of the
   file. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7970 
7971 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7972 unicode_mod(PyObject *v, PyObject *w)
7973 {
7974     if (!PyUnicode_Check(v)) {
7975         Py_INCREF(Py_NotImplemented);
7976         return Py_NotImplemented;
7977     }
7978     return PyUnicode_Format(v, w);
7979 }
7980 
/* Number protocol slots for unicode objects.  Only nb_remainder is filled
   in (with unicode_mod); the slots listed before it are explicitly 0 and
   the remaining ones are left to default initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
7988 
/* Sequence protocol slots for unicode objects.  The item and slice
   assignment slots are 0, i.e. no in-place modification is provided. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
7999 
8000 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)8001 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8002 {
8003     if (PyIndex_Check(item)) {
8004         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8005         if (i == -1 && PyErr_Occurred())
8006             return NULL;
8007         if (i < 0)
8008             i += PyUnicode_GET_SIZE(self);
8009         return unicode_getitem(self, i);
8010     } else if (PySlice_Check(item)) {
8011         Py_ssize_t start, stop, step, slicelength, cur, i;
8012         Py_UNICODE* source_buf;
8013         Py_UNICODE* result_buf;
8014         PyObject* result;
8015 
8016         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8017                                  &start, &stop, &step, &slicelength) < 0) {
8018             return NULL;
8019         }
8020 
8021         if (slicelength <= 0) {
8022             return PyUnicode_FromUnicode(NULL, 0);
8023         } else if (start == 0 && step == 1 && slicelength == self->length &&
8024                    PyUnicode_CheckExact(self)) {
8025             Py_INCREF(self);
8026             return (PyObject *)self;
8027         } else if (step == 1) {
8028             return PyUnicode_FromUnicode(self->str + start, slicelength);
8029         } else {
8030             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8031             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8032                                                        sizeof(Py_UNICODE));
8033 
8034             if (result_buf == NULL)
8035                 return PyErr_NoMemory();
8036 
8037             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8038                 result_buf[i] = source_buf[cur];
8039             }
8040 
8041             result = PyUnicode_FromUnicode(result_buf, slicelength);
8042             PyObject_FREE(result_buf);
8043             return result;
8044         }
8045     } else {
8046         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8047         return NULL;
8048     }
8049 }
8050 
/* Mapping protocol slots for unicode objects: length and subscripting
   only; the subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8056 
8057 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8058 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8059                           Py_ssize_t index,
8060                           const void **ptr)
8061 {
8062     if (index != 0) {
8063         PyErr_SetString(PyExc_SystemError,
8064                         "accessing non-existent unicode segment");
8065         return -1;
8066     }
8067     *ptr = (void *) self->str;
8068     return PyUnicode_GET_DATA_SIZE(self);
8069 }
8070 
8071 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8072 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8073                            const void **ptr)
8074 {
8075     PyErr_SetString(PyExc_TypeError,
8076                     "cannot use unicode as modifiable buffer");
8077     return -1;
8078 }
8079 
8080 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)8081 unicode_buffer_getsegcount(PyUnicodeObject *self,
8082                            Py_ssize_t *lenp)
8083 {
8084     if (lenp)
8085         *lenp = PyUnicode_GET_DATA_SIZE(self);
8086     return 1;
8087 }
8088 
8089 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8090 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8091                           Py_ssize_t index,
8092                           const void **ptr)
8093 {
8094     PyObject *str;
8095 
8096     if (index != 0) {
8097         PyErr_SetString(PyExc_SystemError,
8098                         "accessing non-existent unicode segment");
8099         return -1;
8100     }
8101     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8102     if (str == NULL)
8103         return -1;
8104     *ptr = (void *) PyString_AS_STRING(str);
8105     return PyString_GET_SIZE(str);
8106 }
8107 
8108 /* Helpers for PyUnicode_Format() */
8109 
8110 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)8111 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8112 {
8113     Py_ssize_t argidx = *p_argidx;
8114     if (argidx < arglen) {
8115         (*p_argidx)++;
8116         if (arglen < 0)
8117             return args;
8118         else
8119             return PyTuple_GetItem(args, argidx);
8120     }
8121     PyErr_SetString(PyExc_TypeError,
8122                     "not enough arguments for format string");
8123     return NULL;
8124 }
8125 
/* Bit flags collected while parsing a %-format conversion specifier. */
#define F_LJUST (1<<0)  /* '-' flag, or a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8131 
8132 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8133 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8134 {
8135     register Py_ssize_t i;
8136     Py_ssize_t len = strlen(charbuffer);
8137     for (i = len - 1; i >= 0; i--)
8138         buffer[i] = (Py_UNICODE) charbuffer[i];
8139 
8140     return len;
8141 }
8142 
8143 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8144 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8145 {
8146     Py_ssize_t result;
8147 
8148     PyOS_snprintf((char *)buffer, len, format, x);
8149     result = strtounicode(buffer, (char *)buffer);
8150     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8151 }
8152 
8153 /* XXX To save some code duplication, formatfloat/long/int could have been
8154    shared with stringobject.c, converting from 8-bit to Unicode after the
8155    formatting is done. */
8156 
8157 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8158 
8159 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8160 formatfloat(PyObject *v, int flags, int prec, int type)
8161 {
8162     char *p;
8163     PyObject *result;
8164     double x;
8165 
8166     x = PyFloat_AsDouble(v);
8167     if (x == -1.0 && PyErr_Occurred())
8168         return NULL;
8169 
8170     if (prec < 0)
8171         prec = 6;
8172 
8173     p = PyOS_double_to_string(x, type, prec,
8174                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8175     if (p == NULL)
8176         return NULL;
8177     result = PyUnicode_FromStringAndSize(p, strlen(p));
8178     PyMem_Free(p);
8179     return result;
8180 }
8181 
8182 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8183 formatlong(PyObject *val, int flags, int prec, int type)
8184 {
8185     char *buf;
8186     int i, len;
8187     PyObject *str; /* temporary string object. */
8188     PyUnicodeObject *result;
8189 
8190     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8191     if (!str)
8192         return NULL;
8193     result = _PyUnicode_New(len);
8194     if (!result) {
8195         Py_DECREF(str);
8196         return NULL;
8197     }
8198     for (i = 0; i < len; i++)
8199         result->str[i] = buf[i];
8200     result->str[len] = 0;
8201     Py_DECREF(str);
8202     return (PyObject*)result;
8203 }
8204 
8205 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8206 formatint(Py_UNICODE *buf,
8207           size_t buflen,
8208           int flags,
8209           int prec,
8210           int type,
8211           PyObject *v)
8212 {
8213     /* fmt = '%#.' + `prec` + 'l' + `type`
8214      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8215      *                     + 1 + 1
8216      *                   = 24
8217      */
8218     char fmt[64]; /* plenty big enough! */
8219     char *sign;
8220     long x;
8221 
8222     x = PyInt_AsLong(v);
8223     if (x == -1 && PyErr_Occurred())
8224         return -1;
8225     if (x < 0 && type == 'u') {
8226         type = 'd';
8227     }
8228     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8229         sign = "-";
8230     else
8231         sign = "";
8232     if (prec < 0)
8233         prec = 1;
8234 
8235     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8236      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8237      */
8238     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8239         PyErr_SetString(PyExc_OverflowError,
8240                         "formatted integer is too long (precision too large?)");
8241         return -1;
8242     }
8243 
8244     if ((flags & F_ALT) &&
8245         (type == 'x' || type == 'X')) {
8246         /* When converting under %#x or %#X, there are a number
8247          * of issues that cause pain:
8248          * - when 0 is being converted, the C standard leaves off
8249          *   the '0x' or '0X', which is inconsistent with other
8250          *   %#x/%#X conversions and inconsistent with Python's
8251          *   hex() function
8252          * - there are platforms that violate the standard and
8253          *   convert 0 with the '0x' or '0X'
8254          *   (Metrowerks, Compaq Tru64)
8255          * - there are platforms that give '0x' when converting
8256          *   under %#X, but convert 0 in accordance with the
8257          *   standard (OS/2 EMX)
8258          *
8259          * We can achieve the desired consistency by inserting our
8260          * own '0x' or '0X' prefix, and substituting %x/%X in place
8261          * of %#x/%#X.
8262          *
8263          * Note that this is the same approach as used in
8264          * formatint() in stringobject.c
8265          */
8266         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8267                       sign, type, prec, type);
8268     }
8269     else {
8270         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8271                       sign, (flags&F_ALT) ? "#" : "",
8272                       prec, type);
8273     }
8274     if (sign[0])
8275         return longtounicode(buf, buflen, fmt, -x);
8276     else
8277         return longtounicode(buf, buflen, fmt, x);
8278 }
8279 
8280 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8281 formatchar(Py_UNICODE *buf,
8282            size_t buflen,
8283            PyObject *v)
8284 {
8285     PyObject *unistr;
8286     char *str;
8287     /* presume that the buffer is at least 2 characters long */
8288     if (PyUnicode_Check(v)) {
8289         if (PyUnicode_GET_SIZE(v) != 1)
8290             goto onError;
8291         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8292     }
8293 
8294     else if (PyString_Check(v)) {
8295         if (PyString_GET_SIZE(v) != 1)
8296             goto onError;
8297         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8298            with a UnicodeDecodeError if 'char' is not decodable with the
8299            default encoding (usually ASCII, but it might be something else) */
8300         str = PyString_AS_STRING(v);
8301         if ((unsigned char)str[0] > 0x7F) {
8302             /* the char is not ASCII; try to decode the string using the
8303                default encoding and return -1 to let the UnicodeDecodeError
8304                be raised if the string can't be decoded */
8305             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8306             if (unistr == NULL)
8307                 return -1;
8308             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8309             Py_DECREF(unistr);
8310         }
8311         else
8312             buf[0] = (Py_UNICODE)str[0];
8313     }
8314 
8315     else {
8316         /* Integer input truncated to a character */
8317         long x;
8318         x = PyInt_AsLong(v);
8319         if (x == -1 && PyErr_Occurred())
8320             goto onError;
8321 #ifdef Py_UNICODE_WIDE
8322         if (x < 0 || x > 0x10ffff) {
8323             PyErr_SetString(PyExc_OverflowError,
8324                             "%c arg not in range(0x110000) "
8325                             "(wide Python build)");
8326             return -1;
8327         }
8328 #else
8329         if (x < 0 || x > 0xffff) {
8330             PyErr_SetString(PyExc_OverflowError,
8331                             "%c arg not in range(0x10000) "
8332                             "(narrow Python build)");
8333             return -1;
8334         }
8335 #endif
8336         buf[0] = (Py_UNICODE) x;
8337     }
8338     buf[1] = '\0';
8339     return 1;
8340 
8341   onError:
8342     PyErr_SetString(PyExc_TypeError,
8343                     "%c requires int or char");
8344     return -1;
8345 }
8346 
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8356 
PyUnicode_Format(PyObject * format,PyObject * args)8357 PyObject *PyUnicode_Format(PyObject *format,
8358                            PyObject *args)
8359 {
8360     Py_UNICODE *fmt, *res;
8361     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8362     int args_owned = 0;
8363     PyUnicodeObject *result = NULL;
8364     PyObject *dict = NULL;
8365     PyObject *uformat;
8366 
8367     if (format == NULL || args == NULL) {
8368         PyErr_BadInternalCall();
8369         return NULL;
8370     }
8371     uformat = PyUnicode_FromObject(format);
8372     if (uformat == NULL)
8373         return NULL;
8374     fmt = PyUnicode_AS_UNICODE(uformat);
8375     fmtcnt = PyUnicode_GET_SIZE(uformat);
8376 
8377     reslen = rescnt = fmtcnt + 100;
8378     result = _PyUnicode_New(reslen);
8379     if (result == NULL)
8380         goto onError;
8381     res = PyUnicode_AS_UNICODE(result);
8382 
8383     if (PyTuple_Check(args)) {
8384         arglen = PyTuple_Size(args);
8385         argidx = 0;
8386     }
8387     else {
8388         arglen = -1;
8389         argidx = -2;
8390     }
8391     if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8392         !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8393         dict = args;
8394 
8395     while (--fmtcnt >= 0) {
8396         if (*fmt != '%') {
8397             if (--rescnt < 0) {
8398                 rescnt = fmtcnt + 100;
8399                 reslen += rescnt;
8400                 if (_PyUnicode_Resize(&result, reslen) < 0)
8401                     goto onError;
8402                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8403                 --rescnt;
8404             }
8405             *res++ = *fmt++;
8406         }
8407         else {
8408             /* Got a format specifier */
8409             int flags = 0;
8410             Py_ssize_t width = -1;
8411             int prec = -1;
8412             Py_UNICODE c = '\0';
8413             Py_UNICODE fill;
8414             int isnumok;
8415             PyObject *v = NULL;
8416             PyObject *temp = NULL;
8417             Py_UNICODE *pbuf;
8418             Py_UNICODE sign;
8419             Py_ssize_t len;
8420             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8421 
8422             fmt++;
8423             if (*fmt == '(') {
8424                 Py_UNICODE *keystart;
8425                 Py_ssize_t keylen;
8426                 PyObject *key;
8427                 int pcount = 1;
8428 
8429                 if (dict == NULL) {
8430                     PyErr_SetString(PyExc_TypeError,
8431                                     "format requires a mapping");
8432                     goto onError;
8433                 }
8434                 ++fmt;
8435                 --fmtcnt;
8436                 keystart = fmt;
8437                 /* Skip over balanced parentheses */
8438                 while (pcount > 0 && --fmtcnt >= 0) {
8439                     if (*fmt == ')')
8440                         --pcount;
8441                     else if (*fmt == '(')
8442                         ++pcount;
8443                     fmt++;
8444                 }
8445                 keylen = fmt - keystart - 1;
8446                 if (fmtcnt < 0 || pcount > 0) {
8447                     PyErr_SetString(PyExc_ValueError,
8448                                     "incomplete format key");
8449                     goto onError;
8450                 }
8451 #if 0
8452                 /* keys are converted to strings using UTF-8 and
8453                    then looked up since Python uses strings to hold
8454                    variables names etc. in its namespaces and we
8455                    wouldn't want to break common idioms. */
8456                 key = PyUnicode_EncodeUTF8(keystart,
8457                                            keylen,
8458                                            NULL);
8459 #else
8460                 key = PyUnicode_FromUnicode(keystart, keylen);
8461 #endif
8462                 if (key == NULL)
8463                     goto onError;
8464                 if (args_owned) {
8465                     Py_DECREF(args);
8466                     args_owned = 0;
8467                 }
8468                 args = PyObject_GetItem(dict, key);
8469                 Py_DECREF(key);
8470                 if (args == NULL) {
8471                     goto onError;
8472                 }
8473                 args_owned = 1;
8474                 arglen = -1;
8475                 argidx = -2;
8476             }
8477             while (--fmtcnt >= 0) {
8478                 switch (c = *fmt++) {
8479                 case '-': flags |= F_LJUST; continue;
8480                 case '+': flags |= F_SIGN; continue;
8481                 case ' ': flags |= F_BLANK; continue;
8482                 case '#': flags |= F_ALT; continue;
8483                 case '0': flags |= F_ZERO; continue;
8484                 }
8485                 break;
8486             }
8487             if (c == '*') {
8488                 v = getnextarg(args, arglen, &argidx);
8489                 if (v == NULL)
8490                     goto onError;
8491                 if (!PyInt_Check(v)) {
8492                     PyErr_SetString(PyExc_TypeError,
8493                                     "* wants int");
8494                     goto onError;
8495                 }
8496                 width = PyInt_AsSsize_t(v);
8497                 if (width == -1 && PyErr_Occurred())
8498                     goto onError;
8499                 if (width < 0) {
8500                     flags |= F_LJUST;
8501                     width = -width;
8502                 }
8503                 if (--fmtcnt >= 0)
8504                     c = *fmt++;
8505             }
8506             else if (c >= '0' && c <= '9') {
8507                 width = c - '0';
8508                 while (--fmtcnt >= 0) {
8509                     c = *fmt++;
8510                     if (c < '0' || c > '9')
8511                         break;
8512                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8513                         PyErr_SetString(PyExc_ValueError,
8514                                         "width too big");
8515                         goto onError;
8516                     }
8517                     width = width*10 + (c - '0');
8518                 }
8519             }
8520             if (c == '.') {
8521                 prec = 0;
8522                 if (--fmtcnt >= 0)
8523                     c = *fmt++;
8524                 if (c == '*') {
8525                     v = getnextarg(args, arglen, &argidx);
8526                     if (v == NULL)
8527                         goto onError;
8528                     if (!PyInt_Check(v)) {
8529                         PyErr_SetString(PyExc_TypeError,
8530                                         "* wants int");
8531                         goto onError;
8532                     }
8533                     prec = _PyInt_AsInt(v);
8534                     if (prec == -1 && PyErr_Occurred())
8535                         goto onError;
8536                     if (prec < 0)
8537                         prec = 0;
8538                     if (--fmtcnt >= 0)
8539                         c = *fmt++;
8540                 }
8541                 else if (c >= '0' && c <= '9') {
8542                     prec = c - '0';
8543                     while (--fmtcnt >= 0) {
8544                         c = *fmt++;
8545                         if (c < '0' || c > '9')
8546                             break;
8547                         if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8548                             PyErr_SetString(PyExc_ValueError,
8549                                             "prec too big");
8550                             goto onError;
8551                         }
8552                         prec = prec*10 + (c - '0');
8553                     }
8554                 }
8555             } /* prec */
8556             if (fmtcnt >= 0) {
8557                 if (c == 'h' || c == 'l' || c == 'L') {
8558                     if (--fmtcnt >= 0)
8559                         c = *fmt++;
8560                 }
8561             }
8562             if (fmtcnt < 0) {
8563                 PyErr_SetString(PyExc_ValueError,
8564                                 "incomplete format");
8565                 goto onError;
8566             }
8567             if (c != '%') {
8568                 v = getnextarg(args, arglen, &argidx);
8569                 if (v == NULL)
8570                     goto onError;
8571             }
8572             sign = 0;
8573             fill = ' ';
8574             switch (c) {
8575 
8576             case '%':
8577                 pbuf = formatbuf;
8578                 /* presume that buffer length is at least 1 */
8579                 pbuf[0] = '%';
8580                 len = 1;
8581                 break;
8582 
8583             case 's':
8584             case 'r':
8585                 if (PyUnicode_CheckExact(v) && c == 's') {
8586                     temp = v;
8587                     Py_INCREF(temp);
8588                 }
8589                 else {
8590                     PyObject *unicode;
8591                     if (c == 's')
8592                         temp = PyObject_Unicode(v);
8593                     else
8594                         temp = PyObject_Repr(v);
8595                     if (temp == NULL)
8596                         goto onError;
8597                     if (PyUnicode_Check(temp))
8598                         /* nothing to do */;
8599                     else if (PyString_Check(temp)) {
8600                         /* convert to string to Unicode */
8601                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8602                                                    PyString_GET_SIZE(temp),
8603                                                    NULL,
8604                                                    "strict");
8605                         Py_DECREF(temp);
8606                         temp = unicode;
8607                         if (temp == NULL)
8608                             goto onError;
8609                     }
8610                     else {
8611                         Py_DECREF(temp);
8612                         PyErr_SetString(PyExc_TypeError,
8613                                         "%s argument has non-string str()");
8614                         goto onError;
8615                     }
8616                 }
8617                 pbuf = PyUnicode_AS_UNICODE(temp);
8618                 len = PyUnicode_GET_SIZE(temp);
8619                 if (prec >= 0 && len > prec)
8620                     len = prec;
8621                 break;
8622 
8623             case 'i':
8624             case 'd':
8625             case 'u':
8626             case 'o':
8627             case 'x':
8628             case 'X':
8629                 if (c == 'i')
8630                     c = 'd';
8631                 isnumok = 0;
8632                 if (PyNumber_Check(v)) {
8633                     PyObject *iobj=NULL;
8634 
8635                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8636                         iobj = v;
8637                         Py_INCREF(iobj);
8638                     }
8639                     else {
8640                         iobj = PyNumber_Int(v);
8641                         if (iobj==NULL) {
8642                             PyErr_Clear();
8643                             iobj = PyNumber_Long(v);
8644                         }
8645                     }
8646                     if (iobj!=NULL) {
8647                         if (PyInt_Check(iobj)) {
8648                             isnumok = 1;
8649                             pbuf = formatbuf;
8650                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8651                                             flags, prec, c, iobj);
8652                             Py_DECREF(iobj);
8653                             if (len < 0)
8654                                 goto onError;
8655                             sign = 1;
8656                         }
8657                         else if (PyLong_Check(iobj)) {
8658                             isnumok = 1;
8659                             temp = formatlong(iobj, flags, prec, c);
8660                             Py_DECREF(iobj);
8661                             if (!temp)
8662                                 goto onError;
8663                             pbuf = PyUnicode_AS_UNICODE(temp);
8664                             len = PyUnicode_GET_SIZE(temp);
8665                             sign = 1;
8666                         }
8667                         else {
8668                             Py_DECREF(iobj);
8669                         }
8670                     }
8671                 }
8672                 if (!isnumok) {
8673                     PyErr_Format(PyExc_TypeError,
8674                                  "%%%c format: a number is required, "
8675                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8676                     goto onError;
8677                 }
8678                 if (flags & F_ZERO)
8679                     fill = '0';
8680                 break;
8681 
8682             case 'e':
8683             case 'E':
8684             case 'f':
8685             case 'F':
8686             case 'g':
8687             case 'G':
8688                 temp = formatfloat(v, flags, prec, c);
8689                 if (temp == NULL)
8690                     goto onError;
8691                 pbuf = PyUnicode_AS_UNICODE(temp);
8692                 len = PyUnicode_GET_SIZE(temp);
8693                 sign = 1;
8694                 if (flags & F_ZERO)
8695                     fill = '0';
8696                 break;
8697 
8698             case 'c':
8699                 pbuf = formatbuf;
8700                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8701                 if (len < 0)
8702                     goto onError;
8703                 break;
8704 
8705             default:
8706                 PyErr_Format(PyExc_ValueError,
8707                              "unsupported format character '%c' (0x%x) "
8708                              "at index %zd",
8709                              (31<=c && c<=126) ? (char)c : '?',
8710                              (int)c,
8711                              (Py_ssize_t)(fmt - 1 -
8712                                           PyUnicode_AS_UNICODE(uformat)));
8713                 goto onError;
8714             }
8715             if (sign) {
8716                 if (*pbuf == '-' || *pbuf == '+') {
8717                     sign = *pbuf++;
8718                     len--;
8719                 }
8720                 else if (flags & F_SIGN)
8721                     sign = '+';
8722                 else if (flags & F_BLANK)
8723                     sign = ' ';
8724                 else
8725                     sign = 0;
8726             }
8727             if (width < len)
8728                 width = len;
8729             if (rescnt - (sign != 0) < width) {
8730                 reslen -= rescnt;
8731                 rescnt = width + fmtcnt + 100;
8732                 reslen += rescnt;
8733                 if (reslen < 0) {
8734                     Py_XDECREF(temp);
8735                     PyErr_NoMemory();
8736                     goto onError;
8737                 }
8738                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8739                     Py_XDECREF(temp);
8740                     goto onError;
8741                 }
8742                 res = PyUnicode_AS_UNICODE(result)
8743                     + reslen - rescnt;
8744             }
8745             if (sign) {
8746                 if (fill != ' ')
8747                     *res++ = sign;
8748                 rescnt--;
8749                 if (width > len)
8750                     width--;
8751             }
8752             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8753                 assert(pbuf[0] == '0');
8754                 assert(pbuf[1] == c);
8755                 if (fill != ' ') {
8756                     *res++ = *pbuf++;
8757                     *res++ = *pbuf++;
8758                 }
8759                 rescnt -= 2;
8760                 width -= 2;
8761                 if (width < 0)
8762                     width = 0;
8763                 len -= 2;
8764             }
8765             if (width > len && !(flags & F_LJUST)) {
8766                 do {
8767                     --rescnt;
8768                     *res++ = fill;
8769                 } while (--width > len);
8770             }
8771             if (fill == ' ') {
8772                 if (sign)
8773                     *res++ = sign;
8774                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8775                     assert(pbuf[0] == '0');
8776                     assert(pbuf[1] == c);
8777                     *res++ = *pbuf++;
8778                     *res++ = *pbuf++;
8779                 }
8780             }
8781             Py_UNICODE_COPY(res, pbuf, len);
8782             res += len;
8783             rescnt -= len;
8784             while (--width >= len) {
8785                 --rescnt;
8786                 *res++ = ' ';
8787             }
8788             if (dict && (argidx < arglen) && c != '%') {
8789                 PyErr_SetString(PyExc_TypeError,
8790                                 "not all arguments converted during string formatting");
8791                 Py_XDECREF(temp);
8792                 goto onError;
8793             }
8794             Py_XDECREF(temp);
8795         } /* '%' */
8796     } /* until end */
8797     if (argidx < arglen && !dict) {
8798         PyErr_SetString(PyExc_TypeError,
8799                         "not all arguments converted during string formatting");
8800         goto onError;
8801     }
8802 
8803     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8804         goto onError;
8805     if (args_owned) {
8806         Py_DECREF(args);
8807     }
8808     Py_DECREF(uformat);
8809     return (PyObject *)result;
8810 
8811   onError:
8812     Py_XDECREF(result);
8813     Py_DECREF(uformat);
8814     if (args_owned) {
8815         Py_DECREF(args);
8816     }
8817     return NULL;
8818 }
8819 
/* Buffer-interface slot table for the unicode type; PyUnicode_Type points
   at it through its tp_as_buffer slot. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* bf_getreadbuffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* bf_getwritebuffer */
    (segcountproc) unicode_buffer_getsegcount,      /* bf_getsegcount */
    (charbufferproc) unicode_buffer_getcharbuf,     /* bf_getcharbuffer */
};
8826 
8827 static PyObject *
8828 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8829 
8830 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8831 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8832 {
8833     PyObject *x = NULL;
8834     static char *kwlist[] = {"string", "encoding", "errors", 0};
8835     char *encoding = NULL;
8836     char *errors = NULL;
8837 
8838     if (type != &PyUnicode_Type)
8839         return unicode_subtype_new(type, args, kwds);
8840     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8841                                      kwlist, &x, &encoding, &errors))
8842         return NULL;
8843     if (x == NULL)
8844         return (PyObject *)_PyUnicode_New(0);
8845     if (encoding == NULL && errors == NULL)
8846         return PyObject_Unicode(x);
8847     else
8848         return PyUnicode_FromEncodedObject(x, encoding, errors);
8849 }
8850 
8851 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8852 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8853 {
8854     PyUnicodeObject *tmp, *pnew;
8855     Py_ssize_t n;
8856 
8857     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8858     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8859     if (tmp == NULL)
8860         return NULL;
8861     assert(PyUnicode_Check(tmp));
8862     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8863     if (pnew == NULL) {
8864         Py_DECREF(tmp);
8865         return NULL;
8866     }
8867     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8868     if (pnew->str == NULL) {
8869         _Py_ForgetReference((PyObject *)pnew);
8870         PyObject_Del(pnew);
8871         Py_DECREF(tmp);
8872         return PyErr_NoMemory();
8873     }
8874     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8875     pnew->length = n;
8876     pnew->hash = tmp->hash;
8877     Py_DECREF(tmp);
8878     return (PyObject *)pnew;
8879 }
8880 
/* Docstring of the unicode type; PyUnicode_Type stores it in its tp_doc
   slot. */
PyDoc_STRVAR(unicode_doc,
             "unicode(object='') -> unicode object\n\
unicode(string[, encoding[, errors]]) -> unicode object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8888 
/* Type object for "unicode".

   The type derives from PyBaseString_Type, may itself be subclassed
   (Py_TPFLAGS_BASETYPE) and is flagged Py_TPFLAGS_UNICODE_SUBCLASS.
   Slots given as 0 are not set here; presumably PyType_Ready() fills
   inheritable ones in from the base type -- TODO confirm per slot. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8932 
8933 /* Initialize the Unicode implementation */
8934 
_PyUnicode_Init(void)8935 void _PyUnicode_Init(void)
8936 {
8937     /* XXX - move this array to unicodectype.c ? */
8938     Py_UNICODE linebreak[] = {
8939         0x000A, /* LINE FEED */
8940         0x000D, /* CARRIAGE RETURN */
8941         0x001C, /* FILE SEPARATOR */
8942         0x001D, /* GROUP SEPARATOR */
8943         0x001E, /* RECORD SEPARATOR */
8944         0x0085, /* NEXT LINE */
8945         0x2028, /* LINE SEPARATOR */
8946         0x2029, /* PARAGRAPH SEPARATOR */
8947     };
8948 
8949     /* Init the implementation */
8950     if (!unicode_empty) {
8951         unicode_empty = _PyUnicode_New(0);
8952         if (!unicode_empty)
8953             return;
8954     }
8955 
8956     if (PyType_Ready(&PyUnicode_Type) < 0)
8957         Py_FatalError("Can't initialize 'unicode'");
8958 
8959     /* initialize the linebreak bloom filter */
8960     bloom_linebreak = make_bloom_mask(
8961         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8962         );
8963 
8964     PyType_Ready(&EncodingMapType);
8965 
8966     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8967         Py_FatalError("Can't initialize field name iterator type");
8968 
8969     if (PyType_Ready(&PyFormatterIter_Type) < 0)
8970         Py_FatalError("Can't initialize formatter iter type");
8971 }
8972 
8973 /* Finalize the Unicode implementation */
8974 
8975 int
PyUnicode_ClearFreeList(void)8976 PyUnicode_ClearFreeList(void)
8977 {
8978     int freelist_size = numfree;
8979     PyUnicodeObject *u;
8980 
8981     for (u = free_list; u != NULL;) {
8982         PyUnicodeObject *v = u;
8983         u = *(PyUnicodeObject **)u;
8984         if (v->str)
8985             PyObject_DEL(v->str);
8986         Py_XDECREF(v->defenc);
8987         PyObject_Del(v);
8988         numfree--;
8989     }
8990     free_list = NULL;
8991     assert(numfree == 0);
8992     return freelist_size;
8993 }
8994 
8995 void
_PyUnicode_Fini(void)8996 _PyUnicode_Fini(void)
8997 {
8998     int i;
8999 
9000     Py_CLEAR(unicode_empty);
9001 
9002     for (i = 0; i < 256; i++)
9003         Py_CLEAR(unicode_latin1[i]);
9004 
9005     (void)PyUnicode_ClearFreeList();
9006 }
9007 
9008 #ifdef __cplusplus
9009 }
9010 #endif
9011