1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10 Copyright (c) Corporation for National Research Initiatives.
11
12 --------------------------------------------------------------------
13 The original string type implementation is:
14
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
17
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39
40 */
41
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
/* Maximum number of deallocated Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   When an object is put on the free list and its length is below this
   limit, its character buffer is kept allocated so that it can be
   reused by the next allocation.  Buffers of objects at or above the
   limit are released.  This saves malloc() calls for small Unicode
   objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively disables the optimization.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
83 /* --- Globals ------------------------------------------------------------
84
85 NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
88
89 */
90
91
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95
96 /* Free list for Unicode objects */
97 static PyUnicodeObject *free_list = NULL;
98 static int numfree = 0;
99
100 /* The empty Unicode object is shared to improve performance. */
101 static PyUnicodeObject *unicode_empty = NULL;
102
/* Return a new reference to the shared empty string from the enclosing
   function, creating the shared object first if it does not exist yet.
   Returns NULL from the enclosing function if that creation fails. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
114
115 /* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
118
119 /* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125 */
126 static char unicode_default_encoding[100 + 1] = "ascii";
127
/* Lookup table for the most frequent whitespace characters: entry i is 1
   when the 7-bit character i counts as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Lookup table for line break characters: entry i is 1 when the 7-bit
   character i is a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188 Py_UNICODE
PyUnicode_GetMax(void)189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192 return 0x10FFFF;
193 #else
194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
197 #endif
198 }
199
200 /* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
205
206 /* the linebreak mask is set up by Unicode_Init below */
207
208 #if LONG_BIT >= 128
209 #define BLOOM_WIDTH 128
210 #elif LONG_BIT >= 64
211 #define BLOOM_WIDTH 64
212 #elif LONG_BIT >= 32
213 #define BLOOM_WIDTH 32
214 #else
215 #error "LONG_BIT is smaller than 32"
216 #endif
217
218 #define BLOOM_MASK unsigned long
219
220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
221
222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224
225 #define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
233 BLOOM_MASK mask;
234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
238 BLOOM_ADD(mask, ptr[i]);
239
240 return mask;
241 }
242
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252 }
253
/* Non-zero when chr is a member of set (setlen characters).  The bloom
   mask is checked first so that unicode_member() only runs for
   characters the mask cannot rule out.  The whole expansion is
   parenthesized so the macro can be combined with other operators,
   e.g. !BLOOM_MEMBER(...). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257 /* --- Unicode Object ----------------------------------------------------- */
258
259 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)260 int unicode_resize(register PyUnicodeObject *unicode,
261 Py_ssize_t length)
262 {
263 void *oldstr;
264
265 /* Shortcut if there's nothing much to do. */
266 if (unicode->length == length)
267 goto reset;
268
269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
272
273 if (unicode == unicode_empty ||
274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
277 PyErr_SetString(PyExc_SystemError,
278 "can't resize shared unicode objects");
279 return -1;
280 }
281
282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
287 oldstr = unicode->str;
288 unicode->str = PyObject_REALLOC(unicode->str,
289 sizeof(Py_UNICODE) * (length + 1));
290 if (!unicode->str) {
291 unicode->str = (Py_UNICODE *)oldstr;
292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
296 unicode->length = length;
297
298 reset:
299 /* Reset the object caches */
300 if (unicode->defenc) {
301 Py_CLEAR(unicode->defenc);
302 }
303 unicode->hash = -1;
304
305 return 0;
306 }
307
/* We allocate room for one more Py_UNICODE element to make sure the
   string is Ux0000 terminated; some code relies on that.
310
311 XXX This allocator could further be enhanced by assuring that the
312 free list never reduces its size below 1.
313
314 */
315
316 static
_PyUnicode_New(Py_ssize_t length)317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318 {
319 register PyUnicodeObject *unicode;
320
321 /* Optimization for empty strings */
322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
332 /* Unicode freelist & memory allocation */
333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
341 unicode_resize(unicode, length) < 0) {
342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
345 }
346 else {
347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349 }
350 (void)PyObject_INIT(unicode, &PyUnicode_Type);
351 }
352 else {
353 size_t new_size;
354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355 if (unicode == NULL)
356 return NULL;
357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359 }
360
361 if (!unicode->str) {
362 PyErr_NoMemory();
363 goto onError;
364 }
365 /* Initialize the first element to guard against cases where
366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
372 unicode->str[0] = 0;
373 unicode->str[length] = 0;
374 unicode->length = length;
375 unicode->hash = -1;
376 unicode->defenc = NULL;
377 return unicode;
378
379 onError:
380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
382 _Py_ForgetReference((PyObject *)unicode);
383 PyObject_Del(unicode);
384 return NULL;
385 }
386
387 static
unicode_dealloc(register PyUnicodeObject * unicode)388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390 if (PyUnicode_CheckExact(unicode) &&
391 numfree < PyUnicode_MAXFREELIST) {
392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
399 Py_CLEAR(unicode->defenc);
400 }
401 /* Add to free list */
402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
405 }
406 else {
407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410 }
411 }
412
413 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = *unicode;
424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
432 if (v->length != length &&
433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_SETREF(*unicode, w);
440 return 0;
441 }
442
443 /* Note that we don't have to modify *unicode for unshared Unicode
444 objects, since we can modify them in-place. */
445 return unicode_resize(v, length);
446 }
447
/* Public entry point for resizing; forwards to _PyUnicode_Resize()
   after casting the object pointer. */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **target = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(target, length);
}
452
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
454 Py_ssize_t size)
455 {
456 PyUnicodeObject *unicode;
457
458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
462 /* Optimization for empty strings */
463 if (size == 0)
464 _Py_RETURN_UNICODE_EMPTY();
465
466 /* Single character Unicode objects in the Latin-1 range are
467 shared when using this constructor */
468 if (size == 1 && *u < 256) {
469 unicode = unicode_latin1[*u];
470 if (!unicode) {
471 unicode = _PyUnicode_New(1);
472 if (!unicode)
473 return NULL;
474 unicode->str[0] = *u;
475 unicode_latin1[*u] = unicode;
476 }
477 Py_INCREF(unicode);
478 return (PyObject *)unicode;
479 }
480 }
481
482 unicode = _PyUnicode_New(size);
483 if (!unicode)
484 return NULL;
485
486 /* Copy the Unicode data into the new object */
487 if (u != NULL)
488 Py_UNICODE_COPY(unicode->str, u, size);
489
490 return (PyObject *)unicode;
491 }
492
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
494 {
495 PyUnicodeObject *unicode;
496
497 if (size < 0) {
498 PyErr_SetString(PyExc_SystemError,
499 "Negative size passed to PyUnicode_FromStringAndSize");
500 return NULL;
501 }
502
503 /* If the Unicode data is known at construction time, we can apply
504 some optimizations which share commonly used objects.
505 Also, this means the input must be UTF-8, so fall back to the
506 UTF-8 decoder at the end. */
507 if (u != NULL) {
508
509 /* Optimization for empty strings */
510 if (size == 0)
511 _Py_RETURN_UNICODE_EMPTY();
512
513 /* Single characters are shared when using this constructor.
514 Restrict to ASCII, since the input must be UTF-8. */
515 if (size == 1 && Py_CHARMASK(*u) < 128) {
516 unicode = unicode_latin1[Py_CHARMASK(*u)];
517 if (!unicode) {
518 unicode = _PyUnicode_New(1);
519 if (!unicode)
520 return NULL;
521 unicode->str[0] = Py_CHARMASK(*u);
522 unicode_latin1[Py_CHARMASK(*u)] = unicode;
523 }
524 Py_INCREF(unicode);
525 return (PyObject *)unicode;
526 }
527
528 return PyUnicode_DecodeUTF8(u, size, NULL);
529 }
530
531 unicode = _PyUnicode_New(size);
532 if (!unicode)
533 return NULL;
534
535 return (PyObject *)unicode;
536 }
537
PyUnicode_FromString(const char * u)538 PyObject *PyUnicode_FromString(const char *u)
539 {
540 size_t size = strlen(u);
541 if (size > PY_SSIZE_T_MAX) {
542 PyErr_SetString(PyExc_OverflowError, "input too long");
543 return NULL;
544 }
545
546 return PyUnicode_FromStringAndSize(u, size);
547 }
548
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; the argument is evaluated twice,
   and is parenthesized so that expression arguments keep their meaning */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide builds store whole code points: just fetch and advance */
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
/* Narrow builds: a high surrogate before 'end' that is followed by a low
   surrogate is consumed as a pair; anything else is returned as is */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
579
580 #ifdef HAVE_WCHAR_H
581
582 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583 # define CONVERT_WCHAR_TO_SURROGATES
584 #endif
585
586 #ifdef CONVERT_WCHAR_TO_SURROGATES
587
588 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589 to convert from UTF32 to UTF16. */
590
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)591 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592 Py_ssize_t size)
593 {
594 PyUnicodeObject *unicode;
595 register Py_ssize_t i;
596 Py_ssize_t alloc;
597 const wchar_t *orig_w;
598
599 if (w == NULL) {
600 PyErr_BadInternalCall();
601 return NULL;
602 }
603
604 alloc = size;
605 orig_w = w;
606 for (i = size; i > 0; i--) {
607 if (*w > 0xFFFF)
608 alloc++;
609 w++;
610 }
611 w = orig_w;
612 unicode = _PyUnicode_New(alloc);
613 if (!unicode)
614 return NULL;
615
616 /* Copy the wchar_t data into the new object */
617 {
618 register Py_UNICODE *u;
619 u = PyUnicode_AS_UNICODE(unicode);
620 for (i = size; i > 0; i--) {
621 if (*w > 0xFFFF) {
622 wchar_t ordinal = *w++;
623 ordinal -= 0x10000;
624 *u++ = 0xD800 | (ordinal >> 10);
625 *u++ = 0xDC00 | (ordinal & 0x3FF);
626 }
627 else
628 *u++ = *w++;
629 }
630 }
631 return (PyObject *)unicode;
632 }
633
634 #else
635
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)636 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
637 Py_ssize_t size)
638 {
639 PyUnicodeObject *unicode;
640
641 if (w == NULL) {
642 PyErr_BadInternalCall();
643 return NULL;
644 }
645
646 unicode = _PyUnicode_New(size);
647 if (!unicode)
648 return NULL;
649
650 /* Copy the wchar_t data into the new object */
651 #ifdef HAVE_USABLE_WCHAR_T
652 memcpy(unicode->str, w, size * sizeof(wchar_t));
653 #else
654 {
655 register Py_UNICODE *u;
656 register Py_ssize_t i;
657 u = PyUnicode_AS_UNICODE(unicode);
658 for (i = size; i > 0; i--)
659 *u++ = *w++;
660 }
661 #endif
662
663 return (PyObject *)unicode;
664 }
665
666 #endif /* CONVERT_WCHAR_TO_SURROGATES */
667
668 #undef CONVERT_WCHAR_TO_SURROGATES
669
670 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)671 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
672 {
673 *fmt++ = '%';
674 if (width) {
675 if (zeropad)
676 *fmt++ = '0';
677 fmt += sprintf(fmt, "%d", width);
678 }
679 if (precision)
680 fmt += sprintf(fmt, ".%d", precision);
681 if (longflag)
682 *fmt++ = 'l';
683 else if (size_tflag) {
684 char *f = PY_FORMAT_SIZE_T;
685 while (*f)
686 *fmt++ = *f++;
687 }
688 *fmt++ = c;
689 *fmt = '\0';
690 }
691
/* Append the NUL-terminated char string to the output, widening each
   byte as an unsigned char.  Relies on two variables of the enclosing
   scope: 'copy' (a const char * scratch cursor) and 's' (the output
   cursor, which is advanced). */
#define appendstring(string)                    \
    do {                                        \
        copy = string;                          \
        while (*copy) {                         \
            *s++ = (unsigned char)*copy;        \
            copy++;                             \
        }                                       \
    } while (0)
698
699 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)700 PyUnicode_FromFormatV(const char *format, va_list vargs)
701 {
702 va_list count;
703 Py_ssize_t callcount = 0;
704 PyObject **callresults = NULL;
705 PyObject **callresult = NULL;
706 Py_ssize_t n = 0;
707 int width = 0;
708 int precision = 0;
709 int zeropad;
710 const char* f;
711 Py_UNICODE *s;
712 PyObject *string;
713 /* used by sprintf */
714 char buffer[21];
715 /* use abuffer instead of buffer, if we need more space
716 * (which can happen if there's a format specifier with width). */
717 char *abuffer = NULL;
718 char *realbuffer;
719 Py_ssize_t abuffersize = 0;
720 char fmt[60]; /* should be enough for %0width.precisionld */
721 const char *copy;
722
723 #ifdef VA_LIST_IS_ARRAY
724 Py_MEMCPY(count, vargs, sizeof(va_list));
725 #else
726 #ifdef __va_copy
727 __va_copy(count, vargs);
728 #else
729 count = vargs;
730 #endif
731 #endif
732 /* step 1: count the number of %S/%R/%s format specifications
733 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
734 * objects once during step 3 and put the result in an array) */
735 for (f = format; *f; f++) {
736 if (*f == '%') {
737 f++;
738 while (*f && *f != '%' && !isalpha((unsigned)*f))
739 f++;
740 if (!*f)
741 break;
742 if (*f == 's' || *f=='S' || *f=='R')
743 ++callcount;
744 }
745 }
746 /* step 2: allocate memory for the results of
747 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
748 if (callcount) {
749 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
750 if (!callresults) {
751 PyErr_NoMemory();
752 return NULL;
753 }
754 callresult = callresults;
755 }
756 /* step 3: figure out how large a buffer we need */
757 for (f = format; *f; f++) {
758 if (*f == '%') {
759 const char* p = f++;
760 width = 0;
761 while (isdigit((unsigned)*f))
762 width = (width*10) + *f++ - '0';
763 precision = 0;
764 if (*f == '.') {
765 f++;
766 while (isdigit((unsigned)*f))
767 precision = (precision*10) + *f++ - '0';
768 }
769
770 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771 * they don't affect the amount of space we reserve.
772 */
773 if ((*f == 'l' || *f == 'z') &&
774 (f[1] == 'd' || f[1] == 'u'))
775 ++f;
776
777 switch (*f) {
778 case 'c':
779 {
780 int ordinal = va_arg(count, int);
781 #ifdef Py_UNICODE_WIDE
782 if (ordinal < 0 || ordinal > 0x10ffff) {
783 PyErr_SetString(PyExc_OverflowError,
784 "%c arg not in range(0x110000) "
785 "(wide Python build)");
786 goto fail;
787 }
788 #else
789 if (ordinal < 0 || ordinal > 0xffff) {
790 PyErr_SetString(PyExc_OverflowError,
791 "%c arg not in range(0x10000) "
792 "(narrow Python build)");
793 goto fail;
794 }
795 #endif
796 /* fall through... */
797 }
798 case '%':
799 n++;
800 break;
801 case 'd': case 'u': case 'i': case 'x':
802 (void) va_arg(count, int);
803 if (width < precision)
804 width = precision;
805 /* 20 bytes is enough to hold a 64-bit
806 integer. Decimal takes the most space.
807 This isn't enough for octal.
808 If a width is specified we need more
809 (which we allocate later). */
810 if (width < 20)
811 width = 20;
812 n += width;
813 if (abuffersize < width)
814 abuffersize = width;
815 break;
816 case 's':
817 {
818 /* UTF-8 */
819 const char *s = va_arg(count, const char*);
820 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
821 if (!str)
822 goto fail;
823 n += PyUnicode_GET_SIZE(str);
824 /* Remember the str and switch to the next slot */
825 *callresult++ = str;
826 break;
827 }
828 case 'U':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 assert(obj && PyUnicode_Check(obj));
832 n += PyUnicode_GET_SIZE(obj);
833 break;
834 }
835 case 'V':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 const char *str = va_arg(count, const char *);
839 assert(obj || str);
840 assert(!obj || PyUnicode_Check(obj));
841 if (obj)
842 n += PyUnicode_GET_SIZE(obj);
843 else
844 n += strlen(str);
845 break;
846 }
847 case 'S':
848 {
849 PyObject *obj = va_arg(count, PyObject *);
850 PyObject *str;
851 assert(obj);
852 str = PyObject_Str(obj);
853 if (!str)
854 goto fail;
855 n += PyString_GET_SIZE(str);
856 /* Remember the str and switch to the next slot */
857 *callresult++ = str;
858 break;
859 }
860 case 'R':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 PyObject *repr;
864 assert(obj);
865 repr = PyObject_Repr(obj);
866 if (!repr)
867 goto fail;
868 n += PyUnicode_GET_SIZE(repr);
869 /* Remember the repr and switch to the next slot */
870 *callresult++ = repr;
871 break;
872 }
873 case 'p':
874 (void) va_arg(count, int);
875 /* maximum 64-bit pointer representation:
876 * 0xffffffffffffffff
877 * so 19 characters is enough.
878 * XXX I count 18 -- what's the extra for?
879 */
880 n += 19;
881 break;
882 default:
883 /* if we stumble upon an unknown
884 formatting code, copy the rest of
885 the format string to the output
886 string. (we cannot just skip the
887 code, since there's no way to know
888 what's in the argument list) */
889 n += strlen(p);
890 goto expand;
891 }
892 } else
893 n++;
894 }
895 expand:
896 if (abuffersize > 20) {
897 /* add 1 for sprintf's trailing null byte */
898 abuffer = PyObject_Malloc(abuffersize + 1);
899 if (!abuffer) {
900 PyErr_NoMemory();
901 goto fail;
902 }
903 realbuffer = abuffer;
904 }
905 else
906 realbuffer = buffer;
907 /* step 4: fill the buffer */
908 /* Since we've analyzed how much space we need for the worst case,
909 we don't have to resize the string.
910 There can be no errors beyond this point. */
911 string = PyUnicode_FromUnicode(NULL, n);
912 if (!string)
913 goto fail;
914
915 s = PyUnicode_AS_UNICODE(string);
916 callresult = callresults;
917
918 for (f = format; *f; f++) {
919 if (*f == '%') {
920 const char* p = f++;
921 int longflag = 0;
922 int size_tflag = 0;
923 zeropad = (*f == '0');
924 /* parse the width.precision part */
925 width = 0;
926 while (isdigit((unsigned)*f))
927 width = (width*10) + *f++ - '0';
928 precision = 0;
929 if (*f == '.') {
930 f++;
931 while (isdigit((unsigned)*f))
932 precision = (precision*10) + *f++ - '0';
933 }
934 /* handle the long flag, but only for %ld and %lu.
935 others can be added when necessary. */
936 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
937 longflag = 1;
938 ++f;
939 }
940 /* handle the size_t flag. */
941 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
942 size_tflag = 1;
943 ++f;
944 }
945
946 switch (*f) {
947 case 'c':
948 *s++ = va_arg(vargs, int);
949 break;
950 case 'd':
951 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
952 if (longflag)
953 sprintf(realbuffer, fmt, va_arg(vargs, long));
954 else if (size_tflag)
955 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
956 else
957 sprintf(realbuffer, fmt, va_arg(vargs, int));
958 appendstring(realbuffer);
959 break;
960 case 'u':
961 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
962 if (longflag)
963 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
964 else if (size_tflag)
965 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
966 else
967 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
968 appendstring(realbuffer);
969 break;
970 case 'i':
971 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
972 sprintf(realbuffer, fmt, va_arg(vargs, int));
973 appendstring(realbuffer);
974 break;
975 case 'x':
976 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
977 sprintf(realbuffer, fmt, va_arg(vargs, int));
978 appendstring(realbuffer);
979 break;
980 case 's':
981 {
982 /* unused, since we already have the result */
983 (void) va_arg(vargs, char *);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
985 PyUnicode_GET_SIZE(*callresult));
986 s += PyUnicode_GET_SIZE(*callresult);
987 /* We're done with the unicode()/repr() => forget it */
988 Py_DECREF(*callresult);
989 /* switch to next unicode()/repr() result */
990 ++callresult;
991 break;
992 }
993 case 'U':
994 {
995 PyObject *obj = va_arg(vargs, PyObject *);
996 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
997 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
998 s += size;
999 break;
1000 }
1001 case 'V':
1002 {
1003 PyObject *obj = va_arg(vargs, PyObject *);
1004 const char *str = va_arg(vargs, const char *);
1005 if (obj) {
1006 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1007 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1008 s += size;
1009 } else {
1010 appendstring(str);
1011 }
1012 break;
1013 }
1014 case 'S':
1015 case 'R':
1016 {
1017 const char *str = PyString_AS_STRING(*callresult);
1018 /* unused, since we already have the result */
1019 (void) va_arg(vargs, PyObject *);
1020 appendstring(str);
1021 /* We're done with the unicode()/repr() => forget it */
1022 Py_DECREF(*callresult);
1023 /* switch to next unicode()/repr() result */
1024 ++callresult;
1025 break;
1026 }
1027 case 'p':
1028 sprintf(buffer, "%p", va_arg(vargs, void*));
1029 /* %p is ill-defined: ensure leading 0x. */
1030 if (buffer[1] == 'X')
1031 buffer[1] = 'x';
1032 else if (buffer[1] != 'x') {
1033 memmove(buffer+2, buffer, strlen(buffer)+1);
1034 buffer[0] = '0';
1035 buffer[1] = 'x';
1036 }
1037 appendstring(buffer);
1038 break;
1039 case '%':
1040 *s++ = '%';
1041 break;
1042 default:
1043 appendstring(p);
1044 goto end;
1045 }
1046 } else
1047 *s++ = *f;
1048 }
1049
1050 end:
1051 if (callresults)
1052 PyObject_Free(callresults);
1053 if (abuffer)
1054 PyObject_Free(abuffer);
1055 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1056 return string;
1057 fail:
1058 if (callresults) {
1059 PyObject **callresult2 = callresults;
1060 while (callresult2 < callresult) {
1061 Py_DECREF(*callresult2);
1062 ++callresult2;
1063 }
1064 PyObject_Free(callresults);
1065 }
1066 if (abuffer)
1067 PyObject_Free(abuffer);
1068 return NULL;
1069 }
1070
1071 #undef appendstring
1072
1073 PyObject *
PyUnicode_FromFormat(const char * format,...)1074 PyUnicode_FromFormat(const char *format, ...)
1075 {
1076 PyObject* ret;
1077 va_list vargs;
1078
1079 #ifdef HAVE_STDARG_PROTOTYPES
1080 va_start(vargs, format);
1081 #else
1082 va_start(vargs);
1083 #endif
1084 ret = PyUnicode_FromFormatV(format, vargs);
1085 va_end(vargs);
1086 return ret;
1087 }
1088
/* Copy up to size characters of unicode into the wchar_t buffer w.
   When the buffer is larger than the string, the terminating zero is
   copied too but not counted.  Returns the number of characters copied
   (excluding that terminator), or -1 with a bad-internal-call error
   when unicode is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t ulen;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    ulen = PyUnicode_GET_SIZE(unicode);

    /* If possible, try to copy the 0-termination as well */
    if (size > ulen)
        size = ulen + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        const Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t idx;

        for (idx = 0; idx < size; idx++)
            w[idx] = src[idx];
    }
#endif

    /* Do not count the terminator in the result */
    return (size > ulen) ? ulen : size;
}
1119
1120 #endif
1121
PyUnicode_FromOrdinal(int ordinal)1122 PyObject *PyUnicode_FromOrdinal(int ordinal)
1123 {
1124 Py_UNICODE s[1];
1125
1126 #ifdef Py_UNICODE_WIDE
1127 if (ordinal < 0 || ordinal > 0x10ffff) {
1128 PyErr_SetString(PyExc_ValueError,
1129 "unichr() arg not in range(0x110000) "
1130 "(wide Python build)");
1131 return NULL;
1132 }
1133 #else
1134 if (ordinal < 0 || ordinal > 0xffff) {
1135 PyErr_SetString(PyExc_ValueError,
1136 "unichr() arg not in range(0x10000) "
1137 "(narrow Python build)");
1138 return NULL;
1139 }
1140 #endif
1141
1142 s[0] = (Py_UNICODE)ordinal;
1143 return PyUnicode_FromUnicode(s, 1);
1144 }
1145
PyUnicode_FromObject(register PyObject * obj)1146 PyObject *PyUnicode_FromObject(register PyObject *obj)
1147 {
1148 /* XXX Perhaps we should make this API an alias of
1149 PyObject_Unicode() instead ?! */
1150 if (PyUnicode_CheckExact(obj)) {
1151 Py_INCREF(obj);
1152 return obj;
1153 }
1154 if (PyUnicode_Check(obj)) {
1155 /* For a Unicode subtype that's not a Unicode object,
1156 return a true Unicode object with the same data. */
1157 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1158 PyUnicode_GET_SIZE(obj));
1159 }
1160 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1161 }
1162
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1164 const char *encoding,
1165 const char *errors)
1166 {
1167 const char *s = NULL;
1168 Py_ssize_t len;
1169 PyObject *v;
1170
1171 if (obj == NULL) {
1172 PyErr_BadInternalCall();
1173 return NULL;
1174 }
1175
1176 #if 0
1177 /* For b/w compatibility we also accept Unicode objects provided
1178 that no encodings is given and then redirect to
1179 PyObject_Unicode() which then applies the additional logic for
1180 Unicode subclasses.
1181
1182 NOTE: This API should really only be used for object which
1183 represent *encoded* Unicode !
1184
1185 */
1186 if (PyUnicode_Check(obj)) {
1187 if (encoding) {
1188 PyErr_SetString(PyExc_TypeError,
1189 "decoding Unicode is not supported");
1190 return NULL;
1191 }
1192 return PyObject_Unicode(obj);
1193 }
1194 #else
1195 if (PyUnicode_Check(obj)) {
1196 PyErr_SetString(PyExc_TypeError,
1197 "decoding Unicode is not supported");
1198 return NULL;
1199 }
1200 #endif
1201
1202 /* Coerce object */
1203 if (PyString_Check(obj)) {
1204 s = PyString_AS_STRING(obj);
1205 len = PyString_GET_SIZE(obj);
1206 }
1207 else if (PyByteArray_Check(obj)) {
1208 /* Python 2.x specific */
1209 PyErr_Format(PyExc_TypeError,
1210 "decoding bytearray is not supported");
1211 return NULL;
1212 }
1213 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1214 /* Overwrite the error message with something more useful in
1215 case of a TypeError. */
1216 if (PyErr_ExceptionMatches(PyExc_TypeError))
1217 PyErr_Format(PyExc_TypeError,
1218 "coercing to Unicode: need string or buffer, "
1219 "%.80s found",
1220 Py_TYPE(obj)->tp_name);
1221 goto onError;
1222 }
1223
1224 /* Convert to Unicode */
1225 if (len == 0)
1226 _Py_RETURN_UNICODE_EMPTY();
1227
1228 v = PyUnicode_Decode(s, len, encoding, errors);
1229 return v;
1230
1231 onError:
1232 return NULL;
1233 }
1234
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1235 PyObject *PyUnicode_Decode(const char *s,
1236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
1239 {
1240 PyObject *buffer = NULL, *unicode;
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Shortcuts for common default encodings */
1246 if (strcmp(encoding, "utf-8") == 0)
1247 return PyUnicode_DecodeUTF8(s, size, errors);
1248 else if (strcmp(encoding, "latin-1") == 0)
1249 return PyUnicode_DecodeLatin1(s, size, errors);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding, "mbcs") == 0)
1252 return PyUnicode_DecodeMBCS(s, size, errors);
1253 #endif
1254 else if (strcmp(encoding, "ascii") == 0)
1255 return PyUnicode_DecodeASCII(s, size, errors);
1256
1257 /* Decode via the codec registry */
1258 buffer = PyBuffer_FromMemory((void *)s, size);
1259 if (buffer == NULL)
1260 goto onError;
1261 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
1262 if (unicode == NULL)
1263 goto onError;
1264 if (!PyUnicode_Check(unicode)) {
1265 PyErr_Format(PyExc_TypeError,
1266 "decoder did not return an unicode object (type=%.400s)",
1267 Py_TYPE(unicode)->tp_name);
1268 Py_DECREF(unicode);
1269 goto onError;
1270 }
1271 Py_DECREF(buffer);
1272 return unicode;
1273
1274 onError:
1275 Py_XDECREF(buffer);
1276 return NULL;
1277 }
1278
/* Run a unicode object through a decoder from the codec registry.

   unicode  - must be a unicode object, otherwise a bad-argument error
              is raised
   encoding - codec name; NULL selects the default encoding
   errors   - error handling name, passed through

   A Py3k warning is issued first; if issuing it fails (returns a
   negative value), NULL is returned.  Returns the decoder's result, or
   NULL with an exception set. */
PyObject *
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry; NULL is propagated unchanged. */
    return _PyCodec_DecodeText(unicode, encoding, errors);
}
1305
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1307 Py_ssize_t size,
1308 const char *encoding,
1309 const char *errors)
1310 {
1311 PyObject *v, *unicode;
1312
1313 unicode = PyUnicode_FromUnicode(s, size);
1314 if (unicode == NULL)
1315 return NULL;
1316 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1317 Py_DECREF(unicode);
1318 return v;
1319 }
1320
/* Encode a unicode object via the codec registry and return whatever
   object the encoder produces (no type check is applied to the result
   here).

   unicode  - must be a unicode object, otherwise a bad-argument error
              is raised
   encoding - codec name; NULL selects the default encoding
   errors   - error handling name, passed through

   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL is propagated unchanged. */
    return _PyCodec_EncodeText(unicode, encoding, errors);
}
1344
/* Encode a unicode object into a str object.

   unicode  - must be a unicode object, otherwise a bad-argument error
              is raised
   encoding - codec name; NULL selects the default encoding
   errors   - error handling name; the built-in fast paths are only
              taken when this is NULL

   Returns a str object, or NULL with an exception set.  A registry
   encoder that returns a non-str object causes a TypeError. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Fast paths for the common encodings (default error handling only) */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Generic path through the codec registry */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    if (PyString_Check(encoded))
        return encoded;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a string object (type=%.400s)",
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
1389
/* Return the default-encoded form of `unicode`, using the per-object
   cache slot `defenc`.

   If the slot is already filled, its content is returned directly (the
   reference count is not touched here).  Otherwise the object is encoded
   with PyUnicode_AsEncodedString(unicode, NULL, errors); the result is
   stored in the slot only when `errors` is NULL.

   NOTE(review): with a non-NULL `errors` and an empty cache the fresh
   result is returned without being stored, so the ownership of the
   returned reference appears to differ from the cached case -- confirm
   against the callers.

   `unicode` is cast to PyUnicodeObject without a type check. */
PyObject *
_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                  const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded;

    if (self->defenc)
        return self->defenc;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;

    return encoded;
}
1402
PyUnicode_AsUnicode(PyObject * unicode)1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1404 {
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409 return PyUnicode_AS_UNICODE(unicode);
1410
1411 onError:
1412 return NULL;
1413 }
1414
PyUnicode_GetSize(PyObject * unicode)1415 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1416 {
1417 if (!PyUnicode_Check(unicode)) {
1418 PyErr_BadArgument();
1419 goto onError;
1420 }
1421 return PyUnicode_GET_SIZE(unicode);
1422
1423 onError:
1424 return -1;
1425 }
1426
PyUnicode_GetDefaultEncoding(void)1427 const char *PyUnicode_GetDefaultEncoding(void)
1428 {
1429 return unicode_default_encoding;
1430 }
1431
PyUnicode_SetDefaultEncoding(const char * encoding)1432 int PyUnicode_SetDefaultEncoding(const char *encoding)
1433 {
1434 PyObject *v;
1435
1436 /* Make sure the encoding is valid. As side effect, this also
1437 loads the encoding into the codec registry cache. */
1438 v = _PyCodec_Lookup(encoding);
1439 if (v == NULL)
1440 goto onError;
1441 Py_DECREF(v);
1442 strncpy(unicode_default_encoding,
1443 encoding,
1444 sizeof(unicode_default_encoding) - 1);
1445 return 0;
1446
1447 onError:
1448 return -1;
1449 }
1450
1451 /* error handling callback helper:
1452 build arguments, call the callback and check the arguments,
1453 if no exception occurred, copy the replacement to the output
1454 and adjust various state variables.
1455 return 0 on success, -1 on error
1456 */
1457
1458 static
unicode_decode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char * input,Py_ssize_t insize,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,PyUnicodeObject ** output,Py_ssize_t * outpos,Py_UNICODE ** outptr)1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1460 const char *encoding, const char *reason,
1461 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1462 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1463 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1464 {
1465 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1466
1467 PyObject *restuple = NULL;
1468 PyObject *repunicode = NULL;
1469 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1470 Py_ssize_t requiredsize;
1471 Py_ssize_t newpos;
1472 Py_UNICODE *repptr;
1473 Py_ssize_t repsize;
1474 int res = -1;
1475
1476 if (*errorHandler == NULL) {
1477 *errorHandler = PyCodec_LookupError(errors);
1478 if (*errorHandler == NULL)
1479 goto onError;
1480 }
1481
1482 if (*exceptionObject == NULL) {
1483 *exceptionObject = PyUnicodeDecodeError_Create(
1484 encoding, input, insize, *startinpos, *endinpos, reason);
1485 if (*exceptionObject == NULL)
1486 goto onError;
1487 }
1488 else {
1489 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1492 goto onError;
1493 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1494 goto onError;
1495 }
1496
1497 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1498 if (restuple == NULL)
1499 goto onError;
1500 if (!PyTuple_Check(restuple)) {
1501 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1502 goto onError;
1503 }
1504 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1505 goto onError;
1506 if (newpos<0)
1507 newpos = insize+newpos;
1508 if (newpos<0 || newpos>insize) {
1509 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1510 goto onError;
1511 }
1512
1513 /* need more space? (at least enough for what we
1514 have+the replacement+the rest of the string (starting
1515 at the new input position), so we won't have to check space
1516 when there are no errors in the rest of the string) */
1517 repptr = PyUnicode_AS_UNICODE(repunicode);
1518 repsize = PyUnicode_GET_SIZE(repunicode);
1519 requiredsize = *outpos;
1520 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1521 goto overflow;
1522 requiredsize += repsize;
1523 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1524 goto overflow;
1525 requiredsize += insize - newpos;
1526 if (requiredsize > outsize) {
1527 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
1528 requiredsize = 2*outsize;
1529 if (_PyUnicode_Resize(output, requiredsize) < 0)
1530 goto onError;
1531 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1532 }
1533 *endinpos = newpos;
1534 *inptr = input + newpos;
1535 Py_UNICODE_COPY(*outptr, repptr, repsize);
1536 *outptr += repsize;
1537 *outpos += repsize;
1538 /* we made it! */
1539 res = 0;
1540
1541 onError:
1542 Py_XDECREF(restuple);
1543 return res;
1544
1545 overflow:
1546 PyErr_SetString(PyExc_OverflowError,
1547 "decoded result is too long for a Python string");
1548 goto onError;
1549 }
1550
1551 /* --- UTF-7 Codec -------------------------------------------------------- */
1552
1553 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1554
/* Base-64 helper macros used by the UTF-7 codec. */

/* IS_BASE64: non-zero when c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: value (0..63) of the base-64 character c.  The caller is
   expected to have checked IS_BASE64(c); any other input maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* TO_BASE64: base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which begins a base64 string. */

#define DECODE_DIRECT(c)  \
    ((c) != '+' && (c) <= 127)

/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152):
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  Set D
 * characters always are; Set O characters only when directO is true and
 * whitespace only when directWS is true.  RFC2152 makes it clear that
 * these choices vary between applications, hence the two flags.
 * NUL and everything above 127 is never direct. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) > 0 && (c) < 128 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directO) && (utf7_category[(c)] == 1)) ||           \
      ((directWS) && (utf7_category[(c)] == 2))))
1631
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1632 PyObject *PyUnicode_DecodeUTF7(const char *s,
1633 Py_ssize_t size,
1634 const char *errors)
1635 {
1636 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1637 }
1638
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   - the UTF-7 input bytes and their count
 *   errors    - error handling name, handed to
 *               unicode_decode_call_errorhandler() when malformed input
 *               is found
 *   consumed  - if non-NULL, receives the number of input bytes that
 *               were consumed; an input that ends inside a shift
 *               sequence is then not an error: output and read position
 *               are rolled back to the '+' that opened the sequence.
 *               If NULL, an inconsistent unterminated shift sequence at
 *               the end of the input is reported to the error handler.
 *
 * Returns a new unicode object, or NULL with an exception set. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;             /* start of input, for offsets */
    Py_ssize_t startinpos;              /* offset of the last '+' or bad byte */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                      /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                      /* next output position */
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero inside a base-64 section */
    Py_UNICODE *shiftOutStart;          /* output position when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending base-64 bits */
    Py_UNICODE surrogate = 0;           /* pending first surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one element per input byte and cut
       back to the real length at the end. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* wide build: combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* narrow build: store the pair as is */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a second surrogate: emit the pending one
                               alone and process outCh normally below */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending first surrogate is emitted alone only when the
                   terminating byte is one that decodes directly; in every
                   case it is cleared. */
                if (surrogate && DECODE_DIRECT(ch))
                    *p++ = surrogate;
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = p;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
      utf7Error:
        /* Reached only via goto.  The handler may resize `unicode` and
           updates s and p through the pointers passed in. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
            (base64bits >= 6) ||
            (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* input ended inside a shift sequence: report everything up
               to its opening '+' as consumed and drop its output */
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* cut the result back to the number of elements actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1830
1831
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1833 Py_ssize_t size,
1834 int base64SetO,
1835 int base64WhiteSpace,
1836 const char *errors)
1837 {
1838 PyObject *v;
1839 /* It might be possible to tighten this worst case */
1840 Py_ssize_t allocated = 8 * size;
1841 int inShift = 0;
1842 Py_ssize_t i = 0;
1843 unsigned int base64bits = 0;
1844 unsigned long base64buffer = 0;
1845 char * out;
1846 char * start;
1847
1848 if (allocated / 8 != size)
1849 return PyErr_NoMemory();
1850
1851 if (size == 0)
1852 return PyString_FromStringAndSize(NULL, 0);
1853
1854 v = PyString_FromStringAndSize(NULL, allocated);
1855 if (v == NULL)
1856 return NULL;
1857
1858 start = out = PyString_AS_STRING(v);
1859 for (;i < size; ++i) {
1860 Py_UNICODE ch = s[i];
1861
1862 if (inShift) {
1863 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864 /* shifting out */
1865 if (base64bits) { /* output remaining bits */
1866 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1867 base64buffer = 0;
1868 base64bits = 0;
1869 }
1870 inShift = 0;
1871 /* Characters not in the BASE64 set implicitly unshift the sequence
1872 so no '-' is required, except if the character is itself a '-' */
1873 if (IS_BASE64(ch) || ch == '-') {
1874 *out++ = '-';
1875 }
1876 *out++ = (char) ch;
1877 }
1878 else {
1879 goto encode_char;
1880 }
1881 }
1882 else { /* not in a shift sequence */
1883 if (ch == '+') {
1884 *out++ = '+';
1885 *out++ = '-';
1886 }
1887 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1888 *out++ = (char) ch;
1889 }
1890 else {
1891 *out++ = '+';
1892 inShift = 1;
1893 goto encode_char;
1894 }
1895 }
1896 continue;
1897 encode_char:
1898 #ifdef Py_UNICODE_WIDE
1899 if (ch >= 0x10000) {
1900 /* code first surrogate */
1901 base64bits += 16;
1902 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1903 while (base64bits >= 6) {
1904 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905 base64bits -= 6;
1906 }
1907 /* prepare second surrogate */
1908 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1909 }
1910 #endif
1911 base64bits += 16;
1912 base64buffer = (base64buffer << 16) | ch;
1913 while (base64bits >= 6) {
1914 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1915 base64bits -= 6;
1916 }
1917 }
1918 if (base64bits)
1919 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1920 if (inShift)
1921 *out++ = '-';
1922
1923 if (_PyString_Resize(&v, out - start))
1924 return NULL;
1925 return v;
1926 }
1927
1928 #undef IS_BASE64
1929 #undef FROM_BASE64
1930 #undef TO_BASE64
1931 #undef DECODE_DIRECT
1932 #undef ENCODE_DIRECT
1933
1934 /* --- UTF-8 Codec -------------------------------------------------------- */
1935
/* Map a UTF-8 lead (prefix) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start 2-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start 4-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
1957
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1958 PyObject *PyUnicode_DecodeUTF8(const char *s,
1959 Py_ssize_t size,
1960 const char *errors)
1961 {
1962 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1963 }
1964
/* Decode `size` bytes of UTF-8 starting at `s` into a unicode object.

   errors   - error handling name, handed to
              unicode_decode_call_errorhandler() when malformed input is
              found
   consumed - if non-NULL, receives the number of input bytes consumed;
              a sequence that is cut off by the end of the input then
              stops decoding without an error.  If NULL, such a sequence
              is reported as "unexpected end of data".

   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;             /* start of input, for offsets */
    int n;                              /* sequence length from the lead byte */
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                      /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                      /* next output position */
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII bytes are copied straight through */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* the sequence announced by the lead byte runs past the input */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the reported range over the continuation bytes
                   that are present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* bytes below 0x80 were handled above, so a length of 1 is
               not expected here */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* besides the continuation bytes, reject F0 followed by a
               byte below 0x90 and F4 followed by a byte above 0x8F */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* extend the reported range over the leading
                   continuation bytes that are well-formed */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      utf8Error:
        /* Reached only via goto.  The handler may resize `unicode` and
           updates s and p through the pointers passed in. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2144
2145 /* Allocation strategy: if the string is short, convert into a stack buffer
2146 and allocate exactly as much space needed at the end. Else allocate the
2147 maximum possible needed (4 result bytes per Unicode character), and return
2148 the excess memory at the end.
2149 */
2150 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2152 Py_ssize_t size,
2153 const char *errors)
2154 {
2155 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2156
2157 Py_ssize_t i; /* index into s of next input byte */
2158 PyObject *v; /* result string object */
2159 char *p; /* next free byte in output buffer */
2160 Py_ssize_t nallocated; /* number of result bytes allocated */
2161 Py_ssize_t nneeded; /* number of result bytes needed */
2162 char stackbuf[MAX_SHORT_UNICHARS * 4];
2163
2164 assert(s != NULL);
2165 assert(size >= 0);
2166
2167 if (size <= MAX_SHORT_UNICHARS) {
2168 /* Write into the stack buffer; nallocated can't overflow.
2169 * At the end, we'll allocate exactly as much heap space as it
2170 * turns out we need.
2171 */
2172 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2173 v = NULL; /* will allocate after we're done */
2174 p = stackbuf;
2175 }
2176 else {
2177 /* Overallocate on the heap, and give the excess back at the end. */
2178 nallocated = size * 4;
2179 if (nallocated / 4 != size) /* overflow! */
2180 return PyErr_NoMemory();
2181 v = PyString_FromStringAndSize(NULL, nallocated);
2182 if (v == NULL)
2183 return NULL;
2184 p = PyString_AS_STRING(v);
2185 }
2186
2187 for (i = 0; i < size;) {
2188 Py_UCS4 ch = s[i++];
2189
2190 if (ch < 0x80)
2191 /* Encode ASCII */
2192 *p++ = (char) ch;
2193
2194 else if (ch < 0x0800) {
2195 /* Encode Latin-1 */
2196 *p++ = (char)(0xc0 | (ch >> 6));
2197 *p++ = (char)(0x80 | (ch & 0x3f));
2198 }
2199 else {
2200 /* Encode UCS2 Unicode ordinals */
2201 if (ch < 0x10000) {
2202 /* Special case: check for high surrogate */
2203 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2204 Py_UCS4 ch2 = s[i];
2205 /* Check for low surrogate and combine the two to
2206 form a UCS4 value */
2207 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2208 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2209 i++;
2210 goto encodeUCS4;
2211 }
2212 /* Fall through: handles isolated high surrogates */
2213 }
2214 *p++ = (char)(0xe0 | (ch >> 12));
2215 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2216 *p++ = (char)(0x80 | (ch & 0x3f));
2217 continue;
2218 }
2219 encodeUCS4:
2220 /* Encode UCS4 Unicode ordinals */
2221 *p++ = (char)(0xf0 | (ch >> 18));
2222 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2223 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2224 *p++ = (char)(0x80 | (ch & 0x3f));
2225 }
2226 }
2227
2228 if (v == NULL) {
2229 /* This was stack allocated. */
2230 nneeded = p - stackbuf;
2231 assert(nneeded <= nallocated);
2232 v = PyString_FromStringAndSize(stackbuf, nneeded);
2233 }
2234 else {
2235 /* Cut back to size actually needed. */
2236 nneeded = p - PyString_AS_STRING(v);
2237 assert(nneeded <= nallocated);
2238 if (_PyString_Resize(&v, nneeded))
2239 return NULL;
2240 }
2241 return v;
2242
2243 #undef MAX_SHORT_UNICHARS
2244 }
2245
/* Encode a Unicode object as UTF-8.  Raises via PyErr_BadArgument() and
   returns NULL when the argument is not a Unicode object; otherwise
   returns whatever PyUnicode_EncodeUTF8() produces for its buffer. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        /* No error handler name is supplied. */
        return PyUnicode_EncodeUTF8(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
2256
2257 /* --- UTF-32 Codec ------------------------------------------------------- */
2258
2259 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2260 PyUnicode_DecodeUTF32(const char *s,
2261 Py_ssize_t size,
2262 const char *errors,
2263 int *byteorder)
2264 {
2265 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2266 }
2267
2268 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2269 PyUnicode_DecodeUTF32Stateful(const char *s,
2270 Py_ssize_t size,
2271 const char *errors,
2272 int *byteorder,
2273 Py_ssize_t *consumed)
2274 {
2275 const char *starts = s;
2276 Py_ssize_t startinpos;
2277 Py_ssize_t endinpos;
2278 Py_ssize_t outpos;
2279 PyUnicodeObject *unicode;
2280 Py_UNICODE *p;
2281 #ifndef Py_UNICODE_WIDE
2282 int pairs = 0;
2283 const unsigned char *qq;
2284 #else
2285 const int pairs = 0;
2286 #endif
2287 const unsigned char *q, *e;
2288 int bo = 0; /* assume native ordering by default */
2289 const char *errmsg = "";
2290 /* Offsets from q for retrieving bytes in the right order. */
2291 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292 int iorder[] = {0, 1, 2, 3};
2293 #else
2294 int iorder[] = {3, 2, 1, 0};
2295 #endif
2296 PyObject *errorHandler = NULL;
2297 PyObject *exc = NULL;
2298
2299 q = (unsigned char *)s;
2300 e = q + size;
2301
2302 if (byteorder)
2303 bo = *byteorder;
2304
2305 /* Check for BOM marks (U+FEFF) in the input and adjust current
2306 byte order setting accordingly. In native mode, the leading BOM
2307 mark is skipped, in all other modes, it is copied to the output
2308 stream as-is (giving a ZWNBSP character). */
2309 if (bo == 0) {
2310 if (size >= 4) {
2311 const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2312 (q[iorder[1]] << 8) | q[iorder[0]];
2313 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2314 if (bom == 0x0000FEFF) {
2315 q += 4;
2316 bo = -1;
2317 }
2318 else if (bom == 0xFFFE0000) {
2319 q += 4;
2320 bo = 1;
2321 }
2322 #else
2323 if (bom == 0x0000FEFF) {
2324 q += 4;
2325 bo = 1;
2326 }
2327 else if (bom == 0xFFFE0000) {
2328 q += 4;
2329 bo = -1;
2330 }
2331 #endif
2332 }
2333 }
2334
2335 if (bo == -1) {
2336 /* force LE */
2337 iorder[0] = 0;
2338 iorder[1] = 1;
2339 iorder[2] = 2;
2340 iorder[3] = 3;
2341 }
2342 else if (bo == 1) {
2343 /* force BE */
2344 iorder[0] = 3;
2345 iorder[1] = 2;
2346 iorder[2] = 1;
2347 iorder[3] = 0;
2348 }
2349
2350 /* On narrow builds we split characters outside the BMP into two
2351 code points => count how much extra space we need. */
2352 #ifndef Py_UNICODE_WIDE
2353 for (qq = q; e - qq >= 4; qq += 4)
2354 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2355 pairs++;
2356 #endif
2357
2358 /* This might be one to much, because of a BOM */
2359 unicode = _PyUnicode_New((size+3)/4+pairs);
2360 if (!unicode)
2361 return NULL;
2362 if (size == 0)
2363 return (PyObject *)unicode;
2364
2365 /* Unpack UTF-32 encoded data */
2366 p = unicode->str;
2367
2368 while (q < e) {
2369 Py_UCS4 ch;
2370 /* remaining bytes at the end? (size should be divisible by 4) */
2371 if (e-q<4) {
2372 if (consumed)
2373 break;
2374 errmsg = "truncated data";
2375 startinpos = ((const char *)q)-starts;
2376 endinpos = ((const char *)e)-starts;
2377 goto utf32Error;
2378 /* The remaining input chars are ignored if the callback
2379 chooses to skip the input */
2380 }
2381 ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2382 (q[iorder[1]] << 8) | q[iorder[0]];
2383
2384 if (ch >= 0x110000)
2385 {
2386 errmsg = "code point not in range(0x110000)";
2387 startinpos = ((const char *)q)-starts;
2388 endinpos = startinpos+4;
2389 goto utf32Error;
2390 }
2391 #ifndef Py_UNICODE_WIDE
2392 if (ch >= 0x10000)
2393 {
2394 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2395 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2396 }
2397 else
2398 #endif
2399 *p++ = ch;
2400 q += 4;
2401 continue;
2402 utf32Error:
2403 outpos = p-PyUnicode_AS_UNICODE(unicode);
2404 if (unicode_decode_call_errorhandler(
2405 errors, &errorHandler,
2406 "utf32", errmsg,
2407 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2408 &unicode, &outpos, &p))
2409 goto onError;
2410 }
2411
2412 if (byteorder)
2413 *byteorder = bo;
2414
2415 if (consumed)
2416 *consumed = (const char *)q-starts;
2417
2418 /* Adjust length */
2419 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2420 goto onError;
2421
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return (PyObject *)unicode;
2425
2426 onError:
2427 Py_DECREF(unicode);
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return NULL;
2431 }
2432
2433 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2435 Py_ssize_t size,
2436 const char *errors,
2437 int byteorder)
2438 {
2439 PyObject *v;
2440 unsigned char *p;
2441 Py_ssize_t nsize, bytesize;
2442 #ifndef Py_UNICODE_WIDE
2443 Py_ssize_t i, pairs;
2444 #else
2445 const int pairs = 0;
2446 #endif
2447 /* Offsets from p for storing byte pairs in the right order. */
2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449 int iorder[] = {0, 1, 2, 3};
2450 #else
2451 int iorder[] = {3, 2, 1, 0};
2452 #endif
2453
2454 #define STORECHAR(CH) \
2455 do { \
2456 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2457 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2458 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2459 p[iorder[0]] = (CH) & 0xff; \
2460 p += 4; \
2461 } while(0)
2462
2463 /* In narrow builds we can output surrogate pairs as one code point,
2464 so we need less space. */
2465 #ifndef Py_UNICODE_WIDE
2466 for (i = pairs = 0; i < size-1; i++)
2467 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2468 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2469 pairs++;
2470 #endif
2471 nsize = (size - pairs + (byteorder == 0));
2472 bytesize = nsize * 4;
2473 if (bytesize / 4 != nsize)
2474 return PyErr_NoMemory();
2475 v = PyString_FromStringAndSize(NULL, bytesize);
2476 if (v == NULL)
2477 return NULL;
2478
2479 p = (unsigned char *)PyString_AS_STRING(v);
2480 if (byteorder == 0)
2481 STORECHAR(0xFEFF);
2482 if (size == 0)
2483 return v;
2484
2485 if (byteorder == -1) {
2486 /* force LE */
2487 iorder[0] = 0;
2488 iorder[1] = 1;
2489 iorder[2] = 2;
2490 iorder[3] = 3;
2491 }
2492 else if (byteorder == 1) {
2493 /* force BE */
2494 iorder[0] = 3;
2495 iorder[1] = 2;
2496 iorder[2] = 1;
2497 iorder[3] = 0;
2498 }
2499
2500 while (size-- > 0) {
2501 Py_UCS4 ch = *s++;
2502 #ifndef Py_UNICODE_WIDE
2503 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2504 Py_UCS4 ch2 = *s;
2505 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2506 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2507 s++;
2508 size--;
2509 }
2510 }
2511 #endif
2512 STORECHAR(ch);
2513 }
2514 return v;
2515 #undef STORECHAR
2516 }
2517
/* Encode a Unicode object as UTF-32 using byteorder 0 and no error
   handler name.  Raises via PyErr_BadArgument() and returns NULL when the
   argument is not a Unicode object. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeUTF32(data, length, NULL, 0);
    }

    PyErr_BadArgument();
    return NULL;
}
2529
2530 /* --- UTF-16 Codec ------------------------------------------------------- */
2531
2532 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2533 PyUnicode_DecodeUTF16(const char *s,
2534 Py_ssize_t size,
2535 const char *errors,
2536 int *byteorder)
2537 {
2538 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2539 }
2540
2541 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2542 PyUnicode_DecodeUTF16Stateful(const char *s,
2543 Py_ssize_t size,
2544 const char *errors,
2545 int *byteorder,
2546 Py_ssize_t *consumed)
2547 {
2548 const char *starts = s;
2549 Py_ssize_t startinpos;
2550 Py_ssize_t endinpos;
2551 Py_ssize_t outpos;
2552 PyUnicodeObject *unicode;
2553 Py_UNICODE *p;
2554 const unsigned char *q, *e;
2555 int bo = 0; /* assume native ordering by default */
2556 const char *errmsg = "";
2557 /* Offsets from q for retrieving byte pairs in the right order. */
2558 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2559 int ihi = 1, ilo = 0;
2560 #else
2561 int ihi = 0, ilo = 1;
2562 #endif
2563 PyObject *errorHandler = NULL;
2564 PyObject *exc = NULL;
2565
2566 /* Note: size will always be longer than the resulting Unicode
2567 character count */
2568 unicode = _PyUnicode_New(size);
2569 if (!unicode)
2570 return NULL;
2571 if (size == 0)
2572 return (PyObject *)unicode;
2573
2574 /* Unpack UTF-16 encoded data */
2575 p = unicode->str;
2576 q = (unsigned char *)s;
2577 e = q + size;
2578
2579 if (byteorder)
2580 bo = *byteorder;
2581
2582 /* Check for BOM marks (U+FEFF) in the input and adjust current
2583 byte order setting accordingly. In native mode, the leading BOM
2584 mark is skipped, in all other modes, it is copied to the output
2585 stream as-is (giving a ZWNBSP character). */
2586 if (bo == 0) {
2587 if (size >= 2) {
2588 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2589 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2590 if (bom == 0xFEFF) {
2591 q += 2;
2592 bo = -1;
2593 }
2594 else if (bom == 0xFFFE) {
2595 q += 2;
2596 bo = 1;
2597 }
2598 #else
2599 if (bom == 0xFEFF) {
2600 q += 2;
2601 bo = 1;
2602 }
2603 else if (bom == 0xFFFE) {
2604 q += 2;
2605 bo = -1;
2606 }
2607 #endif
2608 }
2609 }
2610
2611 if (bo == -1) {
2612 /* force LE */
2613 ihi = 1;
2614 ilo = 0;
2615 }
2616 else if (bo == 1) {
2617 /* force BE */
2618 ihi = 0;
2619 ilo = 1;
2620 }
2621
2622 while (q < e) {
2623 Py_UNICODE ch;
2624 /* remaining bytes at the end? (size should be even) */
2625 if (e-q<2) {
2626 if (consumed)
2627 break;
2628 errmsg = "truncated data";
2629 startinpos = ((const char *)q)-starts;
2630 endinpos = ((const char *)e)-starts;
2631 goto utf16Error;
2632 /* The remaining input chars are ignored if the callback
2633 chooses to skip the input */
2634 }
2635 ch = (q[ihi] << 8) | q[ilo];
2636
2637 q += 2;
2638
2639 if (ch < 0xD800 || ch > 0xDFFF) {
2640 *p++ = ch;
2641 continue;
2642 }
2643
2644 /* UTF-16 code pair: */
2645 if (e - q < 2) {
2646 q -= 2;
2647 if (consumed)
2648 break;
2649 errmsg = "unexpected end of data";
2650 startinpos = ((const char *)q)-starts;
2651 endinpos = ((const char *)e)-starts;
2652 goto utf16Error;
2653 }
2654 if (0xD800 <= ch && ch <= 0xDBFF) {
2655 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2656 q += 2;
2657 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2658 #ifndef Py_UNICODE_WIDE
2659 *p++ = ch;
2660 *p++ = ch2;
2661 #else
2662 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2663 #endif
2664 continue;
2665 }
2666 else {
2667 errmsg = "illegal UTF-16 surrogate";
2668 startinpos = (((const char *)q)-4)-starts;
2669 endinpos = startinpos+2;
2670 goto utf16Error;
2671 }
2672
2673 }
2674 errmsg = "illegal encoding";
2675 startinpos = (((const char *)q)-2)-starts;
2676 endinpos = startinpos+2;
2677 /* Fall through to report the error */
2678
2679 utf16Error:
2680 outpos = p-PyUnicode_AS_UNICODE(unicode);
2681 if (unicode_decode_call_errorhandler(
2682 errors, &errorHandler,
2683 "utf16", errmsg,
2684 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2685 &unicode, &outpos, &p))
2686 goto onError;
2687 }
2688
2689 if (byteorder)
2690 *byteorder = bo;
2691
2692 if (consumed)
2693 *consumed = (const char *)q-starts;
2694
2695 /* Adjust length */
2696 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2697 goto onError;
2698
2699 Py_XDECREF(errorHandler);
2700 Py_XDECREF(exc);
2701 return (PyObject *)unicode;
2702
2703 onError:
2704 Py_DECREF(unicode);
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 return NULL;
2708 }
2709
2710 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2712 Py_ssize_t size,
2713 const char *errors,
2714 int byteorder)
2715 {
2716 PyObject *v;
2717 unsigned char *p;
2718 Py_ssize_t nsize, bytesize;
2719 #ifdef Py_UNICODE_WIDE
2720 Py_ssize_t i, pairs;
2721 #else
2722 const int pairs = 0;
2723 #endif
2724 /* Offsets from p for storing byte pairs in the right order. */
2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2726 int ihi = 1, ilo = 0;
2727 #else
2728 int ihi = 0, ilo = 1;
2729 #endif
2730
2731 #define STORECHAR(CH) \
2732 do { \
2733 p[ihi] = ((CH) >> 8) & 0xff; \
2734 p[ilo] = (CH) & 0xff; \
2735 p += 2; \
2736 } while(0)
2737
2738 #ifdef Py_UNICODE_WIDE
2739 for (i = pairs = 0; i < size; i++)
2740 if (s[i] >= 0x10000)
2741 pairs++;
2742 #endif
2743 /* 2 * (size + pairs + (byteorder == 0)) */
2744 if (size > PY_SSIZE_T_MAX ||
2745 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2746 return PyErr_NoMemory();
2747 nsize = size + pairs + (byteorder == 0);
2748 bytesize = nsize * 2;
2749 if (bytesize / 2 != nsize)
2750 return PyErr_NoMemory();
2751 v = PyString_FromStringAndSize(NULL, bytesize);
2752 if (v == NULL)
2753 return NULL;
2754
2755 p = (unsigned char *)PyString_AS_STRING(v);
2756 if (byteorder == 0)
2757 STORECHAR(0xFEFF);
2758 if (size == 0)
2759 return v;
2760
2761 if (byteorder == -1) {
2762 /* force LE */
2763 ihi = 1;
2764 ilo = 0;
2765 }
2766 else if (byteorder == 1) {
2767 /* force BE */
2768 ihi = 0;
2769 ilo = 1;
2770 }
2771
2772 while (size-- > 0) {
2773 Py_UNICODE ch = *s++;
2774 Py_UNICODE ch2 = 0;
2775 #ifdef Py_UNICODE_WIDE
2776 if (ch >= 0x10000) {
2777 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2778 ch = 0xD800 | ((ch-0x10000) >> 10);
2779 }
2780 #endif
2781 STORECHAR(ch);
2782 if (ch2)
2783 STORECHAR(ch2);
2784 }
2785 return v;
2786 #undef STORECHAR
2787 }
2788
/* Encode a Unicode object as UTF-16 using byteorder 0 and no error
   handler name.  Raises via PyErr_BadArgument() and returns NULL when the
   argument is not a Unicode object. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeUTF16(data, length, NULL, 0);
    }

    PyErr_BadArgument();
    return NULL;
}
2800
2801 /* --- Unicode Escape Codec ----------------------------------------------- */
2802
2803 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2804
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2806 Py_ssize_t size,
2807 const char *errors)
2808 {
2809 const char *starts = s;
2810 Py_ssize_t startinpos;
2811 Py_ssize_t endinpos;
2812 Py_ssize_t outpos;
2813 PyUnicodeObject *v;
2814 Py_UNICODE *p;
2815 const char *end;
2816 char* message;
2817 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2818 PyObject *errorHandler = NULL;
2819 PyObject *exc = NULL;
2820
2821 /* Escaped strings will always be longer than the resulting
2822 Unicode string, so we start with size here and then reduce the
2823 length after conversion to the true value.
2824 (but if the error callback returns a long replacement string
2825 we'll have to allocate more space) */
2826 v = _PyUnicode_New(size);
2827 if (v == NULL)
2828 goto onError;
2829 if (size == 0)
2830 return (PyObject *)v;
2831
2832 p = PyUnicode_AS_UNICODE(v);
2833 end = s + size;
2834
2835 while (s < end) {
2836 unsigned char c;
2837 Py_UNICODE x;
2838 int digits;
2839
2840 /* Non-escape characters are interpreted as Unicode ordinals */
2841 if (*s != '\\') {
2842 *p++ = (unsigned char) *s++;
2843 continue;
2844 }
2845
2846 startinpos = s-starts;
2847 /* \ - Escapes */
2848 s++;
2849 c = *s++;
2850 if (s > end)
2851 c = '\0'; /* Invalid after \ */
2852 switch (c) {
2853
2854 /* \x escapes */
2855 case '\n': break;
2856 case '\\': *p++ = '\\'; break;
2857 case '\'': *p++ = '\''; break;
2858 case '\"': *p++ = '\"'; break;
2859 case 'b': *p++ = '\b'; break;
2860 case 'f': *p++ = '\014'; break; /* FF */
2861 case 't': *p++ = '\t'; break;
2862 case 'n': *p++ = '\n'; break;
2863 case 'r': *p++ = '\r'; break;
2864 case 'v': *p++ = '\013'; break; /* VT */
2865 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2866
2867 /* \OOO (octal) escapes */
2868 case '0': case '1': case '2': case '3':
2869 case '4': case '5': case '6': case '7':
2870 x = s[-1] - '0';
2871 if (s < end && '0' <= *s && *s <= '7') {
2872 x = (x<<3) + *s++ - '0';
2873 if (s < end && '0' <= *s && *s <= '7')
2874 x = (x<<3) + *s++ - '0';
2875 }
2876 *p++ = x;
2877 break;
2878
2879 /* hex escapes */
2880 /* \xXX */
2881 case 'x':
2882 digits = 2;
2883 message = "truncated \\xXX escape";
2884 goto hexescape;
2885
2886 /* \uXXXX */
2887 case 'u':
2888 digits = 4;
2889 message = "truncated \\uXXXX escape";
2890 goto hexescape;
2891
2892 /* \UXXXXXXXX */
2893 case 'U':
2894 digits = 8;
2895 message = "truncated \\UXXXXXXXX escape";
2896 hexescape:
2897 chr = 0;
2898 if (end - s < digits) {
2899 /* count only hex digits */
2900 for (; s < end; ++s) {
2901 c = (unsigned char)*s;
2902 if (!Py_ISXDIGIT(c))
2903 goto error;
2904 }
2905 goto error;
2906 }
2907 for (; digits--; ++s) {
2908 c = (unsigned char)*s;
2909 if (!Py_ISXDIGIT(c))
2910 goto error;
2911 chr = (chr<<4) & ~0xF;
2912 if (c >= '0' && c <= '9')
2913 chr += c - '0';
2914 else if (c >= 'a' && c <= 'f')
2915 chr += 10 + c - 'a';
2916 else
2917 chr += 10 + c - 'A';
2918 }
2919 if (chr == 0xffffffff && PyErr_Occurred())
2920 /* _decoding_error will have already written into the
2921 target buffer. */
2922 break;
2923 store:
2924 /* when we get here, chr is a 32-bit unicode character */
2925 if (chr <= 0xffff)
2926 /* UCS-2 character */
2927 *p++ = (Py_UNICODE) chr;
2928 else if (chr <= 0x10ffff) {
2929 /* UCS-4 character. Either store directly, or as
2930 surrogate pair. */
2931 #ifdef Py_UNICODE_WIDE
2932 *p++ = chr;
2933 #else
2934 chr -= 0x10000L;
2935 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2936 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2937 #endif
2938 } else {
2939 message = "illegal Unicode character";
2940 goto error;
2941 }
2942 break;
2943
2944 /* \N{name} */
2945 case 'N':
2946 message = "malformed \\N character escape";
2947 if (ucnhash_CAPI == NULL) {
2948 /* load the unicode data module */
2949 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2950 if (ucnhash_CAPI == NULL)
2951 goto ucnhashError;
2952 }
2953 if (*s == '{') {
2954 const char *start = s+1;
2955 /* look for the closing brace */
2956 while (*s != '}' && s < end)
2957 s++;
2958 if (s > start && s < end && *s == '}') {
2959 /* found a name. look it up in the unicode database */
2960 message = "unknown Unicode character name";
2961 s++;
2962 if (s - start - 1 <= INT_MAX &&
2963 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2964 goto store;
2965 }
2966 }
2967 goto error;
2968
2969 default:
2970 if (s > end) {
2971 message = "\\ at end of string";
2972 s--;
2973 goto error;
2974 }
2975 else {
2976 *p++ = '\\';
2977 *p++ = (unsigned char)s[-1];
2978 }
2979 break;
2980 }
2981 continue;
2982
2983 error:
2984 endinpos = s-starts;
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 if (unicode_decode_call_errorhandler(
2987 errors, &errorHandler,
2988 "unicodeescape", message,
2989 starts, size, &startinpos, &endinpos, &exc, &s,
2990 &v, &outpos, &p))
2991 goto onError;
2992 continue;
2993 }
2994 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2995 goto onError;
2996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
2998 return (PyObject *)v;
2999
3000 ucnhashError:
3001 PyErr_SetString(
3002 PyExc_UnicodeError,
3003 "\\N escapes not supported (can't load unicodedata module)"
3004 );
3005 Py_XDECREF(v);
3006 Py_XDECREF(errorHandler);
3007 Py_XDECREF(exc);
3008 return NULL;
3009
3010 onError:
3011 Py_XDECREF(v);
3012 Py_XDECREF(errorHandler);
3013 Py_XDECREF(exc);
3014 return NULL;
3015 }
3016
3017 /* Return a Unicode-Escape string version of the Unicode object.
3018
3019 If quotes is true, the string is enclosed in u"" or u'' quotes as
3020 appropriate.
3021
3022 */
3023
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3025 Py_ssize_t size,
3026 Py_UNICODE ch)
3027 {
3028 /* like wcschr, but doesn't stop at NULL characters */
3029
3030 while (size-- > 0) {
3031 if (*s == ch)
3032 return s;
3033 s++;
3034 }
3035
3036 return NULL;
3037 }
3038
3039 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)3040 PyObject *unicodeescape_string(const Py_UNICODE *s,
3041 Py_ssize_t size,
3042 int quotes)
3043 {
3044 PyObject *repr;
3045 char *p;
3046
3047 static const char *hexdigit = "0123456789abcdef";
3048 #ifdef Py_UNICODE_WIDE
3049 const Py_ssize_t expandsize = 10;
3050 #else
3051 const Py_ssize_t expandsize = 6;
3052 #endif
3053
3054 /* XXX(nnorwitz): rather than over-allocating, it would be
3055 better to choose a different scheme. Perhaps scan the
3056 first N-chars of the string and allocate based on that size.
3057 */
3058 /* Initial allocation is based on the longest-possible unichr
3059 escape.
3060
3061 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3062 unichr, so in this case it's the longest unichr escape. In
3063 narrow (UTF-16) builds this is five chars per source unichr
3064 since there are two unichrs in the surrogate pair, so in narrow
3065 (UTF-16) builds it's not the longest unichr escape.
3066
3067 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3068 so in the narrow (UTF-16) build case it's the longest unichr
3069 escape.
3070 */
3071
3072 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3073 return PyErr_NoMemory();
3074
3075 repr = PyString_FromStringAndSize(NULL,
3076 2
3077 + expandsize*size
3078 + 1);
3079 if (repr == NULL)
3080 return NULL;
3081
3082 p = PyString_AS_STRING(repr);
3083
3084 if (quotes) {
3085 *p++ = 'u';
3086 *p++ = (findchar(s, size, '\'') &&
3087 !findchar(s, size, '"')) ? '"' : '\'';
3088 }
3089 while (size-- > 0) {
3090 Py_UNICODE ch = *s++;
3091
3092 /* Escape quotes and backslashes */
3093 if ((quotes &&
3094 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3095 *p++ = '\\';
3096 *p++ = (char) ch;
3097 continue;
3098 }
3099
3100 #ifdef Py_UNICODE_WIDE
3101 /* Map 21-bit characters to '\U00xxxxxx' */
3102 else if (ch >= 0x10000) {
3103 *p++ = '\\';
3104 *p++ = 'U';
3105 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3106 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3107 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3108 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3109 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3110 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3111 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3112 *p++ = hexdigit[ch & 0x0000000F];
3113 continue;
3114 }
3115 #else
3116 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3117 else if (ch >= 0xD800 && ch < 0xDC00) {
3118 Py_UNICODE ch2;
3119 Py_UCS4 ucs;
3120
3121 ch2 = *s++;
3122 size--;
3123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3124 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3125 *p++ = '\\';
3126 *p++ = 'U';
3127 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3128 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3129 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3130 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3131 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3132 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3133 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3134 *p++ = hexdigit[ucs & 0x0000000F];
3135 continue;
3136 }
3137 /* Fall through: isolated surrogates are copied as-is */
3138 s--;
3139 size++;
3140 }
3141 #endif
3142
3143 /* Map 16-bit characters to '\uxxxx' */
3144 if (ch >= 256) {
3145 *p++ = '\\';
3146 *p++ = 'u';
3147 *p++ = hexdigit[(ch >> 12) & 0x000F];
3148 *p++ = hexdigit[(ch >> 8) & 0x000F];
3149 *p++ = hexdigit[(ch >> 4) & 0x000F];
3150 *p++ = hexdigit[ch & 0x000F];
3151 }
3152
3153 /* Map special whitespace to '\t', \n', '\r' */
3154 else if (ch == '\t') {
3155 *p++ = '\\';
3156 *p++ = 't';
3157 }
3158 else if (ch == '\n') {
3159 *p++ = '\\';
3160 *p++ = 'n';
3161 }
3162 else if (ch == '\r') {
3163 *p++ = '\\';
3164 *p++ = 'r';
3165 }
3166
3167 /* Map non-printable US ASCII to '\xhh' */
3168 else if (ch < ' ' || ch >= 0x7F) {
3169 *p++ = '\\';
3170 *p++ = 'x';
3171 *p++ = hexdigit[(ch >> 4) & 0x000F];
3172 *p++ = hexdigit[ch & 0x000F];
3173 }
3174
3175 /* Copy everything else as-is */
3176 else
3177 *p++ = (char) ch;
3178 }
3179 if (quotes)
3180 *p++ = PyString_AS_STRING(repr)[1];
3181
3182 *p = '\0';
3183 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3184 return NULL;
3185 return repr;
3186 }
3187
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3189 Py_ssize_t size)
3190 {
3191 return unicodeescape_string(s, size, 0);
3192 }
3193
/* Encode a Unicode object with the "unicode-escape" codec.  Raises via
   PyErr_BadArgument() and returns NULL when the argument is not a Unicode
   object. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeUnicodeEscape(data, length);
    }

    PyErr_BadArgument();
    return NULL;
}
3203
3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3205
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3207 Py_ssize_t size,
3208 const char *errors)
3209 {
3210 const char *starts = s;
3211 Py_ssize_t startinpos;
3212 Py_ssize_t endinpos;
3213 Py_ssize_t outpos;
3214 PyUnicodeObject *v;
3215 Py_UNICODE *p;
3216 const char *end;
3217 const char *bs;
3218 PyObject *errorHandler = NULL;
3219 PyObject *exc = NULL;
3220
3221 /* Escaped strings will always be longer than the resulting
3222 Unicode string, so we start with size here and then reduce the
3223 length after conversion to the true value. (But decoding error
3224 handler might have to resize the string) */
3225 v = _PyUnicode_New(size);
3226 if (v == NULL)
3227 goto onError;
3228 if (size == 0)
3229 return (PyObject *)v;
3230 p = PyUnicode_AS_UNICODE(v);
3231 end = s + size;
3232 while (s < end) {
3233 unsigned char c;
3234 Py_UCS4 x;
3235 int i;
3236 int count;
3237
3238 /* Non-escape characters are interpreted as Unicode ordinals */
3239 if (*s != '\\') {
3240 *p++ = (unsigned char)*s++;
3241 continue;
3242 }
3243 startinpos = s-starts;
3244
3245 /* \u-escapes are only interpreted iff the number of leading
3246 backslashes if odd */
3247 bs = s;
3248 for (;s < end;) {
3249 if (*s != '\\')
3250 break;
3251 *p++ = (unsigned char)*s++;
3252 }
3253 if (((s - bs) & 1) == 0 ||
3254 s >= end ||
3255 (*s != 'u' && *s != 'U')) {
3256 continue;
3257 }
3258 p--;
3259 count = *s=='u' ? 4 : 8;
3260 s++;
3261
3262 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3263 outpos = p-PyUnicode_AS_UNICODE(v);
3264 for (x = 0, i = 0; i < count; ++i, ++s) {
3265 c = (unsigned char)*s;
3266 if (!isxdigit(c)) {
3267 endinpos = s-starts;
3268 if (unicode_decode_call_errorhandler(
3269 errors, &errorHandler,
3270 "rawunicodeescape", "truncated \\uXXXX",
3271 starts, size, &startinpos, &endinpos, &exc, &s,
3272 &v, &outpos, &p))
3273 goto onError;
3274 goto nextByte;
3275 }
3276 x = (x<<4) & ~0xF;
3277 if (c >= '0' && c <= '9')
3278 x += c - '0';
3279 else if (c >= 'a' && c <= 'f')
3280 x += 10 + c - 'a';
3281 else
3282 x += 10 + c - 'A';
3283 }
3284 if (x <= 0xffff)
3285 /* UCS-2 character */
3286 *p++ = (Py_UNICODE) x;
3287 else if (x <= 0x10ffff) {
3288 /* UCS-4 character. Either store directly, or as
3289 surrogate pair. */
3290 #ifdef Py_UNICODE_WIDE
3291 *p++ = (Py_UNICODE) x;
3292 #else
3293 x -= 0x10000L;
3294 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3295 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3296 #endif
3297 } else {
3298 endinpos = s-starts;
3299 outpos = p-PyUnicode_AS_UNICODE(v);
3300 if (unicode_decode_call_errorhandler(
3301 errors, &errorHandler,
3302 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3303 starts, size, &startinpos, &endinpos, &exc, &s,
3304 &v, &outpos, &p))
3305 goto onError;
3306 }
3307 nextByte:
3308 ;
3309 }
3310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3311 goto onError;
3312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
3314 return (PyObject *)v;
3315
3316 onError:
3317 Py_XDECREF(v);
3318 Py_XDECREF(errorHandler);
3319 Py_XDECREF(exc);
3320 return NULL;
3321 }
3322
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3324 Py_ssize_t size)
3325 {
3326 PyObject *repr;
3327 char *p;
3328 char *q;
3329
3330 static const char *hexdigit = "0123456789abcdef";
3331 #ifdef Py_UNICODE_WIDE
3332 const Py_ssize_t expandsize = 10;
3333 #else
3334 const Py_ssize_t expandsize = 6;
3335 #endif
3336
3337 if (size > PY_SSIZE_T_MAX / expandsize)
3338 return PyErr_NoMemory();
3339
3340 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3341 if (repr == NULL)
3342 return NULL;
3343 if (size == 0)
3344 return repr;
3345
3346 p = q = PyString_AS_STRING(repr);
3347 while (size-- > 0) {
3348 Py_UNICODE ch = *s++;
3349 #ifdef Py_UNICODE_WIDE
3350 /* Map 32-bit characters to '\Uxxxxxxxx' */
3351 if (ch >= 0x10000) {
3352 *p++ = '\\';
3353 *p++ = 'U';
3354 *p++ = hexdigit[(ch >> 28) & 0xf];
3355 *p++ = hexdigit[(ch >> 24) & 0xf];
3356 *p++ = hexdigit[(ch >> 20) & 0xf];
3357 *p++ = hexdigit[(ch >> 16) & 0xf];
3358 *p++ = hexdigit[(ch >> 12) & 0xf];
3359 *p++ = hexdigit[(ch >> 8) & 0xf];
3360 *p++ = hexdigit[(ch >> 4) & 0xf];
3361 *p++ = hexdigit[ch & 15];
3362 }
3363 else
3364 #else
3365 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3366 if (ch >= 0xD800 && ch < 0xDC00) {
3367 Py_UNICODE ch2;
3368 Py_UCS4 ucs;
3369
3370 ch2 = *s++;
3371 size--;
3372 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3373 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3374 *p++ = '\\';
3375 *p++ = 'U';
3376 *p++ = hexdigit[(ucs >> 28) & 0xf];
3377 *p++ = hexdigit[(ucs >> 24) & 0xf];
3378 *p++ = hexdigit[(ucs >> 20) & 0xf];
3379 *p++ = hexdigit[(ucs >> 16) & 0xf];
3380 *p++ = hexdigit[(ucs >> 12) & 0xf];
3381 *p++ = hexdigit[(ucs >> 8) & 0xf];
3382 *p++ = hexdigit[(ucs >> 4) & 0xf];
3383 *p++ = hexdigit[ucs & 0xf];
3384 continue;
3385 }
3386 /* Fall through: isolated surrogates are copied as-is */
3387 s--;
3388 size++;
3389 }
3390 #endif
3391 /* Map 16-bit characters to '\uxxxx' */
3392 if (ch >= 256) {
3393 *p++ = '\\';
3394 *p++ = 'u';
3395 *p++ = hexdigit[(ch >> 12) & 0xf];
3396 *p++ = hexdigit[(ch >> 8) & 0xf];
3397 *p++ = hexdigit[(ch >> 4) & 0xf];
3398 *p++ = hexdigit[ch & 15];
3399 }
3400 /* Copy everything else as-is */
3401 else
3402 *p++ = (char) ch;
3403 }
3404 *p = '\0';
3405 if (_PyString_Resize(&repr, p - q))
3406 return NULL;
3407 return repr;
3408 }
3409
/* Object-level front end for PyUnicode_EncodeRawUnicodeEscape(): checks
   that `unicode` is a unicode object and encodes its buffer.  Sets an
   exception via PyErr_BadArgument() and returns NULL for any other type. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeRawUnicodeEscape(data, length);
    }

    PyErr_BadArgument();
    return NULL;
}
3419
3420 /* --- Unicode Internal Codec ------------------------------------------- */
3421
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3423 Py_ssize_t size,
3424 const char *errors)
3425 {
3426 const char *starts = s;
3427 Py_ssize_t startinpos;
3428 Py_ssize_t endinpos;
3429 Py_ssize_t outpos;
3430 PyUnicodeObject *v;
3431 Py_UNICODE *p;
3432 const char *end;
3433 const char *reason;
3434 PyObject *errorHandler = NULL;
3435 PyObject *exc = NULL;
3436
3437 #ifdef Py_UNICODE_WIDE
3438 Py_UNICODE unimax = PyUnicode_GetMax();
3439 #endif
3440
3441 /* XXX overflow detection missing */
3442 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3443 if (v == NULL)
3444 goto onError;
3445 if (PyUnicode_GetSize((PyObject *)v) == 0)
3446 return (PyObject *)v;
3447 p = PyUnicode_AS_UNICODE(v);
3448 end = s + size;
3449
3450 while (s < end) {
3451 if (end-s < Py_UNICODE_SIZE) {
3452 endinpos = end-starts;
3453 reason = "truncated input";
3454 goto error;
3455 }
3456 memcpy(p, s, sizeof(Py_UNICODE));
3457 #ifdef Py_UNICODE_WIDE
3458 /* We have to sanity check the raw data, otherwise doom looms for
3459 some malformed UCS-4 data. */
3460 if (*p > unimax || *p < 0) {
3461 endinpos = s - starts + Py_UNICODE_SIZE;
3462 reason = "illegal code point (> 0x10FFFF)";
3463 goto error;
3464 }
3465 #endif
3466 p++;
3467 s += Py_UNICODE_SIZE;
3468 continue;
3469
3470 error:
3471 startinpos = s - starts;
3472 outpos = p - PyUnicode_AS_UNICODE(v);
3473 if (unicode_decode_call_errorhandler(
3474 errors, &errorHandler,
3475 "unicode_internal", reason,
3476 starts, size, &startinpos, &endinpos, &exc, &s,
3477 &v, &outpos, &p)) {
3478 goto onError;
3479 }
3480 }
3481
3482 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3483 goto onError;
3484 Py_XDECREF(errorHandler);
3485 Py_XDECREF(exc);
3486 return (PyObject *)v;
3487
3488 onError:
3489 Py_XDECREF(v);
3490 Py_XDECREF(errorHandler);
3491 Py_XDECREF(exc);
3492 return NULL;
3493 }
3494
3495 /* --- Latin-1 Codec ------------------------------------------------------ */
3496
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3497 PyObject *PyUnicode_DecodeLatin1(const char *s,
3498 Py_ssize_t size,
3499 const char *errors)
3500 {
3501 PyUnicodeObject *v;
3502 Py_UNICODE *p;
3503
3504 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3505 if (size == 1) {
3506 Py_UNICODE r = *(unsigned char*)s;
3507 return PyUnicode_FromUnicode(&r, 1);
3508 }
3509
3510 v = _PyUnicode_New(size);
3511 if (v == NULL)
3512 goto onError;
3513 if (size == 0)
3514 return (PyObject *)v;
3515 p = PyUnicode_AS_UNICODE(v);
3516 while (size-- > 0)
3517 *p++ = (unsigned char)*s++;
3518 return (PyObject *)v;
3519
3520 onError:
3521 Py_XDECREF(v);
3522 return NULL;
3523 }
3524
3525 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3526 static void make_encode_exception(PyObject **exceptionObject,
3527 const char *encoding,
3528 const Py_UNICODE *unicode, Py_ssize_t size,
3529 Py_ssize_t startpos, Py_ssize_t endpos,
3530 const char *reason)
3531 {
3532 if (*exceptionObject == NULL) {
3533 *exceptionObject = PyUnicodeEncodeError_Create(
3534 encoding, unicode, size, startpos, endpos, reason);
3535 }
3536 else {
3537 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3538 goto onError;
3539 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3540 goto onError;
3541 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3542 goto onError;
3543 return;
3544 onError:
3545 Py_CLEAR(*exceptionObject);
3546 }
3547 }
3548
3549 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3550 static void raise_encode_exception(PyObject **exceptionObject,
3551 const char *encoding,
3552 const Py_UNICODE *unicode, Py_ssize_t size,
3553 Py_ssize_t startpos, Py_ssize_t endpos,
3554 const char *reason)
3555 {
3556 make_encode_exception(exceptionObject,
3557 encoding, unicode, size, startpos, endpos, reason);
3558 if (*exceptionObject != NULL)
3559 PyCodec_StrictErrors(*exceptionObject);
3560 }
3561
3562 /* error handling callback helper:
3563 build arguments, call the callback and check the arguments,
3564 put the result into newpos and return the replacement string, which
3565 has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3566 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3567 PyObject **errorHandler,
3568 const char *encoding, const char *reason,
3569 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3570 Py_ssize_t startpos, Py_ssize_t endpos,
3571 Py_ssize_t *newpos)
3572 {
3573 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3574
3575 PyObject *restuple;
3576 PyObject *resunicode;
3577
3578 if (*errorHandler == NULL) {
3579 *errorHandler = PyCodec_LookupError(errors);
3580 if (*errorHandler == NULL)
3581 return NULL;
3582 }
3583
3584 make_encode_exception(exceptionObject,
3585 encoding, unicode, size, startpos, endpos, reason);
3586 if (*exceptionObject == NULL)
3587 return NULL;
3588
3589 restuple = PyObject_CallFunctionObjArgs(
3590 *errorHandler, *exceptionObject, NULL);
3591 if (restuple == NULL)
3592 return NULL;
3593 if (!PyTuple_Check(restuple)) {
3594 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3595 Py_DECREF(restuple);
3596 return NULL;
3597 }
3598 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3599 &resunicode, newpos)) {
3600 Py_DECREF(restuple);
3601 return NULL;
3602 }
3603 if (*newpos<0)
3604 *newpos = size+*newpos;
3605 if (*newpos<0 || *newpos>size) {
3606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3607 Py_DECREF(restuple);
3608 return NULL;
3609 }
3610 Py_INCREF(resunicode);
3611 Py_DECREF(restuple);
3612 return resunicode;
3613 }
3614
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3616 Py_ssize_t size,
3617 const char *errors,
3618 int limit)
3619 {
3620 /* output object */
3621 PyObject *res;
3622 /* pointers to the beginning and end+1 of input */
3623 const Py_UNICODE *startp = p;
3624 const Py_UNICODE *endp = p + size;
3625 /* pointer to the beginning of the unencodable characters */
3626 /* const Py_UNICODE *badp = NULL; */
3627 /* pointer into the output */
3628 char *str;
3629 /* current output position */
3630 Py_ssize_t respos = 0;
3631 Py_ssize_t ressize;
3632 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3633 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3634 PyObject *errorHandler = NULL;
3635 PyObject *exc = NULL;
3636 /* the following variable is used for caching string comparisons
3637 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3638 int known_errorHandler = -1;
3639
3640 /* allocate enough for a simple encoding without
3641 replacements, if we need more, we'll resize */
3642 res = PyString_FromStringAndSize(NULL, size);
3643 if (res == NULL)
3644 goto onError;
3645 if (size == 0)
3646 return res;
3647 str = PyString_AS_STRING(res);
3648 ressize = size;
3649
3650 while (p<endp) {
3651 Py_UNICODE c = *p;
3652
3653 /* can we encode this? */
3654 if (c<limit) {
3655 /* no overflow check, because we know that the space is enough */
3656 *str++ = (char)c;
3657 ++p;
3658 }
3659 else {
3660 Py_ssize_t unicodepos = p-startp;
3661 Py_ssize_t requiredsize;
3662 PyObject *repunicode;
3663 Py_ssize_t repsize;
3664 Py_ssize_t newpos;
3665 Py_ssize_t respos;
3666 Py_UNICODE *uni2;
3667 /* startpos for collecting unencodable chars */
3668 const Py_UNICODE *collstart = p;
3669 const Py_UNICODE *collend = p;
3670 /* find all unecodable characters */
3671 while ((collend < endp) && ((*collend) >= limit))
3672 ++collend;
3673 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3674 if (known_errorHandler==-1) {
3675 if ((errors==NULL) || (!strcmp(errors, "strict")))
3676 known_errorHandler = 1;
3677 else if (!strcmp(errors, "replace"))
3678 known_errorHandler = 2;
3679 else if (!strcmp(errors, "ignore"))
3680 known_errorHandler = 3;
3681 else if (!strcmp(errors, "xmlcharrefreplace"))
3682 known_errorHandler = 4;
3683 else
3684 known_errorHandler = 0;
3685 }
3686 switch (known_errorHandler) {
3687 case 1: /* strict */
3688 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3689 goto onError;
3690 case 2: /* replace */
3691 while (collstart++ < collend)
3692 *str++ = '?'; /* fall through */
3693 case 3: /* ignore */
3694 p = collend;
3695 break;
3696 case 4: /* xmlcharrefreplace */
3697 respos = str - PyString_AS_STRING(res);
3698 /* determine replacement size (temporarily (mis)uses p) */
3699 requiredsize = respos;
3700 for (p = collstart; p < collend;) {
3701 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3702 Py_ssize_t incr;
3703 if (ch < 10)
3704 incr = 2+1+1;
3705 else if (ch < 100)
3706 incr = 2+2+1;
3707 else if (ch < 1000)
3708 incr = 2+3+1;
3709 else if (ch < 10000)
3710 incr = 2+4+1;
3711 else if (ch < 100000)
3712 incr = 2+5+1;
3713 else if (ch < 1000000)
3714 incr = 2+6+1;
3715 else
3716 incr = 2+7+1;
3717 if (requiredsize > PY_SSIZE_T_MAX - incr)
3718 goto overflow;
3719 requiredsize += incr;
3720 }
3721 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3722 goto overflow;
3723 requiredsize += endp - collend;
3724 if (requiredsize > ressize) {
3725 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3726 requiredsize = 2*ressize;
3727 if (_PyString_Resize(&res, requiredsize))
3728 goto onError;
3729 str = PyString_AS_STRING(res) + respos;
3730 ressize = requiredsize;
3731 }
3732 /* generate replacement (temporarily (mis)uses p) */
3733 for (p = collstart; p < collend;) {
3734 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3735 str += sprintf(str, "&#%d;", (int)ch);
3736 }
3737 p = collend;
3738 break;
3739 default:
3740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741 encoding, reason, startp, size, &exc,
3742 collstart-startp, collend-startp, &newpos);
3743 if (repunicode == NULL)
3744 goto onError;
3745 /* need more space? (at least enough for what we have+the
3746 replacement+the rest of the string, so we won't have to
3747 check space for encodable characters) */
3748 respos = str - PyString_AS_STRING(res);
3749 repsize = PyUnicode_GET_SIZE(repunicode);
3750 if (respos > PY_SSIZE_T_MAX - repsize)
3751 goto overflow;
3752 requiredsize = respos + repsize;
3753 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3754 goto overflow;
3755 requiredsize += endp - collend;
3756 if (requiredsize > ressize) {
3757 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3758 requiredsize = 2*ressize;
3759 if (_PyString_Resize(&res, requiredsize)) {
3760 Py_DECREF(repunicode);
3761 goto onError;
3762 }
3763 str = PyString_AS_STRING(res) + respos;
3764 ressize = requiredsize;
3765 }
3766 /* check if there is anything unencodable in the replacement
3767 and copy it to the output */
3768 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3769 c = *uni2;
3770 if (c >= limit) {
3771 raise_encode_exception(&exc, encoding, startp, size,
3772 unicodepos, unicodepos+1, reason);
3773 Py_DECREF(repunicode);
3774 goto onError;
3775 }
3776 *str = (char)c;
3777 }
3778 p = startp + newpos;
3779 Py_DECREF(repunicode);
3780 }
3781 }
3782 }
3783 /* Resize if we allocated to much */
3784 respos = str - PyString_AS_STRING(res);
3785 if (respos < ressize)
3786 /* If this falls res will be NULL */
3787 _PyString_Resize(&res, respos);
3788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
3790 return res;
3791
3792 overflow:
3793 PyErr_SetString(PyExc_OverflowError,
3794 "encoded result is too long for a Python string");
3795
3796 onError:
3797 Py_XDECREF(res);
3798 Py_XDECREF(errorHandler);
3799 Py_XDECREF(exc);
3800 return NULL;
3801 }
3802
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3804 Py_ssize_t size,
3805 const char *errors)
3806 {
3807 return unicode_encode_ucs1(p, size, errors, 256);
3808 }
3809
/* Encode a unicode object as Latin-1 with the default (NULL) error
   handling.  Any other object type sets an exception via
   PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      NULL);

    PyErr_BadArgument();
    return NULL;
}
3820
3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3822
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3823 PyObject *PyUnicode_DecodeASCII(const char *s,
3824 Py_ssize_t size,
3825 const char *errors)
3826 {
3827 const char *starts = s;
3828 PyUnicodeObject *v;
3829 Py_UNICODE *p;
3830 Py_ssize_t startinpos;
3831 Py_ssize_t endinpos;
3832 Py_ssize_t outpos;
3833 const char *e;
3834 PyObject *errorHandler = NULL;
3835 PyObject *exc = NULL;
3836
3837 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3838 if (size == 1 && *(unsigned char*)s < 128) {
3839 Py_UNICODE r = *(unsigned char*)s;
3840 return PyUnicode_FromUnicode(&r, 1);
3841 }
3842
3843 v = _PyUnicode_New(size);
3844 if (v == NULL)
3845 goto onError;
3846 if (size == 0)
3847 return (PyObject *)v;
3848 p = PyUnicode_AS_UNICODE(v);
3849 e = s + size;
3850 while (s < e) {
3851 register unsigned char c = (unsigned char)*s;
3852 if (c < 128) {
3853 *p++ = c;
3854 ++s;
3855 }
3856 else {
3857 startinpos = s-starts;
3858 endinpos = startinpos + 1;
3859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3860 if (unicode_decode_call_errorhandler(
3861 errors, &errorHandler,
3862 "ascii", "ordinal not in range(128)",
3863 starts, size, &startinpos, &endinpos, &exc, &s,
3864 &v, &outpos, &p))
3865 goto onError;
3866 }
3867 }
3868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3870 goto onError;
3871 Py_XDECREF(errorHandler);
3872 Py_XDECREF(exc);
3873 return (PyObject *)v;
3874
3875 onError:
3876 Py_XDECREF(v);
3877 Py_XDECREF(errorHandler);
3878 Py_XDECREF(exc);
3879 return NULL;
3880 }
3881
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3883 Py_ssize_t size,
3884 const char *errors)
3885 {
3886 return unicode_encode_ucs1(p, size, errors, 128);
3887 }
3888
/* Encode a unicode object as ASCII with the default (NULL) error
   handling.  Any other object type sets an exception via
   PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL);

    PyErr_BadArgument();
    return NULL;
}
3899
3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3901
3902 /* --- MBCS codecs for Windows -------------------------------------------- */
3903
3904 #if SIZEOF_INT < SIZEOF_SIZE_T
3905 #define NEED_RETRY
3906 #endif
3907
3908 /* XXX This code is limited to "true" double-byte encodings, as
3909 a) it assumes an incomplete character consists of a single byte, and
3910 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3911 encodings, see IsDBCSLeadByteEx documentation. */
3912
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.
   A byte that IsDBCSLeadByte() accepts is confirmed by looking at the
   character that CharPrev() reports before it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move back */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is a complete two-byte character */
    return curr - prev == 2;
}
3923
3924 /*
3925 * Decode MBCS string into unicode object. If 'final' is set, converts
3926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3927 */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3928 static int decode_mbcs(PyUnicodeObject **v,
3929 const char *s, /* MBCS string */
3930 int size, /* sizeof MBCS string */
3931 int final)
3932 {
3933 Py_UNICODE *p;
3934 Py_ssize_t n = 0;
3935 int usize = 0;
3936
3937 assert(size >= 0);
3938
3939 /* Skip trailing lead-byte unless 'final' is set */
3940 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3941 --size;
3942
3943 /* First get the size of the result */
3944 if (size > 0) {
3945 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3946 if (usize == 0) {
3947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3948 return -1;
3949 }
3950 }
3951
3952 if (*v == NULL) {
3953 /* Create unicode object */
3954 *v = _PyUnicode_New(usize);
3955 if (*v == NULL)
3956 return -1;
3957 }
3958 else {
3959 /* Extend unicode object */
3960 n = PyUnicode_GET_SIZE(*v);
3961 if (_PyUnicode_Resize(v, n + usize) < 0)
3962 return -1;
3963 }
3964
3965 /* Do the conversion */
3966 if (size > 0) {
3967 p = PyUnicode_AS_UNICODE(*v) + n;
3968 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3969 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3970 return -1;
3971 }
3972 }
3973
3974 return size;
3975 }
3976
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)3977 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3978 Py_ssize_t size,
3979 const char *errors,
3980 Py_ssize_t *consumed)
3981 {
3982 PyUnicodeObject *v = NULL;
3983 int done;
3984
3985 if (consumed)
3986 *consumed = 0;
3987
3988 #ifdef NEED_RETRY
3989 retry:
3990 if (size > INT_MAX)
3991 done = decode_mbcs(&v, s, INT_MAX, 0);
3992 else
3993 #endif
3994 done = decode_mbcs(&v, s, (int)size, !consumed);
3995
3996 if (done < 0) {
3997 Py_XDECREF(v);
3998 return NULL;
3999 }
4000
4001 if (consumed)
4002 *consumed += done;
4003
4004 #ifdef NEED_RETRY
4005 if (size > INT_MAX) {
4006 s += done;
4007 size -= done;
4008 goto retry;
4009 }
4010 #endif
4011
4012 return (PyObject *)v;
4013 }
4014
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)4015 PyObject *PyUnicode_DecodeMBCS(const char *s,
4016 Py_ssize_t size,
4017 const char *errors)
4018 {
4019 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4020 }
4021
4022 /*
4023 * Convert unicode into string object (MBCS).
4024 * Returns 0 if succeed, -1 otherwise.
4025 */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)4026 static int encode_mbcs(PyObject **repr,
4027 const Py_UNICODE *p, /* unicode */
4028 int size) /* size of unicode */
4029 {
4030 int mbcssize = 0;
4031 Py_ssize_t n = 0;
4032
4033 assert(size >= 0);
4034
4035 /* First get the size of the result */
4036 if (size > 0) {
4037 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4038 if (mbcssize == 0) {
4039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4040 return -1;
4041 }
4042 }
4043
4044 if (*repr == NULL) {
4045 /* Create string object */
4046 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4047 if (*repr == NULL)
4048 return -1;
4049 }
4050 else {
4051 /* Extend string object */
4052 n = PyString_Size(*repr);
4053 if (_PyString_Resize(repr, n + mbcssize) < 0)
4054 return -1;
4055 }
4056
4057 /* Do the conversion */
4058 if (size > 0) {
4059 char *s = PyString_AS_STRING(*repr) + n;
4060 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4061 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4062 return -1;
4063 }
4064 }
4065
4066 return 0;
4067 }
4068
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)4069 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4070 Py_ssize_t size,
4071 const char *errors)
4072 {
4073 PyObject *repr = NULL;
4074 int ret;
4075
4076 #ifdef NEED_RETRY
4077 retry:
4078 if (size > INT_MAX)
4079 ret = encode_mbcs(&repr, p, INT_MAX);
4080 else
4081 #endif
4082 ret = encode_mbcs(&repr, p, (int)size);
4083
4084 if (ret < 0) {
4085 Py_XDECREF(repr);
4086 return NULL;
4087 }
4088
4089 #ifdef NEED_RETRY
4090 if (size > INT_MAX) {
4091 p += INT_MAX;
4092 size -= INT_MAX;
4093 goto retry;
4094 }
4095 #endif
4096
4097 return repr;
4098 }
4099
/* Encode a unicode object as MBCS with the default (NULL) error handling.
   Any other object type sets an exception via PyErr_BadArgument() and
   yields NULL. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);

    PyErr_BadArgument();
    return NULL;
}
4110
4111 #undef NEED_RETRY
4112
4113 #endif /* MS_WINDOWS */
4114
4115 /* --- Character Mapping Codec -------------------------------------------- */
4116
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL        -> plain Latin-1 decoding.
   exact unicode object   -> byte b maps to mapping[b]; an index past the
                             end or the value 0xFFFE means "undefined".
   any other object       -> mapping[b] is looked up with
                             PyObject_GetItem(); the result may be an
                             integer code point, a unicode string (zero,
                             one or many characters) or None.  A
                             LookupError, None, or the value 0xFFFE means
                             "undefined".

   Undefined bytes are reported through the error handler selected by
   `errors` ("character maps to <undefined>").

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots beyond one-per-remaining-input-byte */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a decoding table string */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    goto Undefined;
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (x == Py_None)
                goto Undefined;
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value == 0xFFFE)
                    goto Undefined;
                if (value < 0 || value > 0x10FFFF) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(0x110000)");
                    Py_DECREF(x);
                    goto onError;
                }

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* a surrogate pair takes two output slots;
                       see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;

                        /* the new total length must not overflow */
                        if (PyUnicode_GET_SIZE(v) >
                            PY_SSIZE_T_MAX - needed) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    value -= 0x10000;
                    *p++ = 0xD800 | (Py_UNICODE)(value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
                *p++ = (Py_UNICODE)value;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1) {
                    /* 1-1 mapping */
                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
                    if (value == 0xFFFE)
                        goto Undefined;
                    *p++ = value;
                }
                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed;

                        /* needed is at most 5 * targetsize; make sure
                           neither that product nor the new total length
                           can exceed PY_SSIZE_T_MAX */
                        if (targetsize >
                            (PY_SSIZE_T_MAX - PyUnicode_GET_SIZE(v)) / 5) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        needed = (targetsize - extrachars) +
                            (targetsize << 2);
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
            continue;
          Undefined:
            /* undefined mapping; x is NULL when we got here through the
               LookupError path */
            Py_XDECREF(x);
            outpos = p-PyUnicode_AS_UNICODE(v);
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p)) {
                goto onError;
            }
        }
    }
    /* trim the result to the number of code units actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4305
4306 /* Charmap encoding: the lookup table */
4307
4308 struct encoding_map{
4309 PyObject_HEAD
4310 unsigned char level1[32];
4311 int count2, count3;
4312 unsigned char level23[1];
4313 };
4314
4315 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4316 encoding_map_size(PyObject *obj, PyObject* args)
4317 {
4318 struct encoding_map *map = (struct encoding_map*)obj;
4319 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4320 128*map->count3);
4321 }
4322
4323 static PyMethodDef encoding_map_methods[] = {
4324 {"size", encoding_map_size, METH_NOARGS,
4325 PyDoc_STR("Return the size (in bytes) of this object") },
4326 { 0 }
4327 };
4328
4329 static void
encoding_map_dealloc(PyObject * o)4330 encoding_map_dealloc(PyObject* o)
4331 {
4332 PyObject_FREE(o);
4333 }
4334
4335 static PyTypeObject EncodingMapType = {
4336 PyVarObject_HEAD_INIT(NULL, 0)
4337 "EncodingMap", /*tp_name*/
4338 sizeof(struct encoding_map), /*tp_basicsize*/
4339 0, /*tp_itemsize*/
4340 /* methods */
4341 encoding_map_dealloc, /*tp_dealloc*/
4342 0, /*tp_print*/
4343 0, /*tp_getattr*/
4344 0, /*tp_setattr*/
4345 0, /*tp_compare*/
4346 0, /*tp_repr*/
4347 0, /*tp_as_number*/
4348 0, /*tp_as_sequence*/
4349 0, /*tp_as_mapping*/
4350 0, /*tp_hash*/
4351 0, /*tp_call*/
4352 0, /*tp_str*/
4353 0, /*tp_getattro*/
4354 0, /*tp_setattro*/
4355 0, /*tp_as_buffer*/
4356 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4357 0, /*tp_doc*/
4358 0, /*tp_traverse*/
4359 0, /*tp_clear*/
4360 0, /*tp_richcompare*/
4361 0, /*tp_weaklistoffset*/
4362 0, /*tp_iter*/
4363 0, /*tp_iternext*/
4364 encoding_map_methods, /*tp_methods*/
4365 0, /*tp_members*/
4366 0, /*tp_getset*/
4367 0, /*tp_base*/
4368 0, /*tp_dict*/
4369 0, /*tp_descr_get*/
4370 0, /*tp_descr_set*/
4371 0, /*tp_dictoffset*/
4372 0, /*tp_init*/
4373 0, /*tp_alloc*/
4374 0, /*tp_new*/
4375 0, /*tp_free*/
4376 0, /*tp_is_gc*/
4377 };
4378
4379 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4380 PyUnicode_BuildEncodingMap(PyObject* string)
4381 {
4382 Py_UNICODE *decode;
4383 PyObject *result;
4384 struct encoding_map *mresult;
4385 int i;
4386 int need_dict = 0;
4387 unsigned char level1[32];
4388 unsigned char level2[512];
4389 unsigned char *mlevel1, *mlevel2, *mlevel3;
4390 int count2 = 0, count3 = 0;
4391
4392 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4393 PyErr_BadArgument();
4394 return NULL;
4395 }
4396 decode = PyUnicode_AS_UNICODE(string);
4397 memset(level1, 0xFF, sizeof level1);
4398 memset(level2, 0xFF, sizeof level2);
4399
4400 /* If there isn't a one-to-one mapping of NULL to \0,
4401 or if there are non-BMP characters, we need to use
4402 a mapping dictionary. */
4403 if (decode[0] != 0)
4404 need_dict = 1;
4405 for (i = 1; i < 256; i++) {
4406 int l1, l2;
4407 if (decode[i] == 0
4408 #ifdef Py_UNICODE_WIDE
4409 || decode[i] > 0xFFFF
4410 #endif
4411 ) {
4412 need_dict = 1;
4413 break;
4414 }
4415 if (decode[i] == 0xFFFE)
4416 /* unmapped character */
4417 continue;
4418 l1 = decode[i] >> 11;
4419 l2 = decode[i] >> 7;
4420 if (level1[l1] == 0xFF)
4421 level1[l1] = count2++;
4422 if (level2[l2] == 0xFF)
4423 level2[l2] = count3++;
4424 }
4425
4426 if (count2 >= 0xFF || count3 >= 0xFF)
4427 need_dict = 1;
4428
4429 if (need_dict) {
4430 PyObject *result = PyDict_New();
4431 PyObject *key, *value;
4432 if (!result)
4433 return NULL;
4434 for (i = 0; i < 256; i++) {
4435 value = NULL;
4436 key = PyInt_FromLong(decode[i]);
4437 value = PyInt_FromLong(i);
4438 if (!key || !value)
4439 goto failed1;
4440 if (PyDict_SetItem(result, key, value) == -1)
4441 goto failed1;
4442 Py_DECREF(key);
4443 Py_DECREF(value);
4444 }
4445 return result;
4446 failed1:
4447 Py_XDECREF(key);
4448 Py_XDECREF(value);
4449 Py_DECREF(result);
4450 return NULL;
4451 }
4452
4453 /* Create a three-level trie */
4454 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4455 16*count2 + 128*count3 - 1);
4456 if (!result)
4457 return PyErr_NoMemory();
4458 PyObject_Init(result, &EncodingMapType);
4459 mresult = (struct encoding_map*)result;
4460 mresult->count2 = count2;
4461 mresult->count3 = count3;
4462 mlevel1 = mresult->level1;
4463 mlevel2 = mresult->level23;
4464 mlevel3 = mresult->level23 + 16*count2;
4465 memcpy(mlevel1, level1, 32);
4466 memset(mlevel2, 0xFF, 16*count2);
4467 memset(mlevel3, 0, 128*count3);
4468 count3 = 0;
4469 for (i = 1; i < 256; i++) {
4470 int o1, o2, o3, i2, i3;
4471 if (decode[i] == 0xFFFE)
4472 /* unmapped character */
4473 continue;
4474 o1 = decode[i]>>11;
4475 o2 = (decode[i]>>7) & 0xF;
4476 i2 = 16*mlevel1[o1] + o2;
4477 if (mlevel2[i2] == 0xFF)
4478 mlevel2[i2] = count3++;
4479 o3 = decode[i] & 0x7F;
4480 i3 = 128*mlevel2[i2] + o3;
4481 mlevel3[i3] = i;
4482 }
4483 return result;
4484 }
4485
4486 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4488 {
4489 struct encoding_map *map = (struct encoding_map*)mapping;
4490 int l1 = c>>11;
4491 int l2 = (c>>7) & 0xF;
4492 int l3 = c & 0x7F;
4493 int i;
4494
4495 #ifdef Py_UNICODE_WIDE
4496 if (c > 0xFFFF) {
4497 return -1;
4498 }
4499 #endif
4500 if (c == 0)
4501 return 0;
4502 /* level 1*/
4503 i = map->level1[l1];
4504 if (i == 0xFF) {
4505 return -1;
4506 }
4507 /* level 2*/
4508 i = map->level23[16*i+l2];
4509 if (i == 0xFF) {
4510 return -1;
4511 }
4512 /* level 3 */
4513 i = map->level23[16*map->count2 + 128*i + l3];
4514 if (i == 0) {
4515 return -1;
4516 }
4517 return i;
4518 }
4519
4520 /* Lookup the character ch in the mapping. If the character
4521 can't be found, Py_None is returned (or NULL, if another
4522 error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4524 {
4525 PyObject *w = PyInt_FromLong((long)c);
4526 PyObject *x;
4527
4528 if (w == NULL)
4529 return NULL;
4530 x = PyObject_GetItem(mapping, w);
4531 Py_DECREF(w);
4532 if (x == NULL) {
4533 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4534 /* No mapping found means: mapping is undefined. */
4535 PyErr_Clear();
4536 x = Py_None;
4537 Py_INCREF(x);
4538 return x;
4539 } else
4540 return NULL;
4541 }
4542 else if (x == Py_None)
4543 return x;
4544 else if (PyInt_Check(x)) {
4545 long value = PyInt_AS_LONG(x);
4546 if (value < 0 || value > 255) {
4547 PyErr_SetString(PyExc_TypeError,
4548 "character mapping must be in range(256)");
4549 Py_DECREF(x);
4550 return NULL;
4551 }
4552 return x;
4553 }
4554 else if (PyString_Check(x))
4555 return x;
4556 else {
4557 /* wrong return value */
4558 PyErr_SetString(PyExc_TypeError,
4559 "character mapping must return integer, None or str");
4560 Py_DECREF(x);
4561 return NULL;
4562 }
4563 }
4564
4565 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4567 {
4568 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4569 /* exponentially overallocate to minimize reallocations */
4570 if (requiredsize < 2*outsize)
4571 requiredsize = 2*outsize;
4572 if (_PyString_Resize(outobj, requiredsize)) {
4573 return 0;
4574 }
4575 return 1;
4576 }
4577
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
4581 /* lookup the character, put the result in the output string and adjust
4582 various state variables. Reallocate the output string if not enough
4583 space is available. Return a new reference to the object that
4584 was put in the output buffer, or Py_None, if the mapping was undefined
4585 (in which case no character was written) or NULL, if a
4586 reallocation error occurred. The caller must decref the result */
4587 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4589 PyObject **outobj, Py_ssize_t *outpos)
4590 {
4591 PyObject *rep;
4592 char *outstart;
4593 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4594
4595 if (Py_TYPE(mapping) == &EncodingMapType) {
4596 int res = encoding_map_lookup(c, mapping);
4597 Py_ssize_t requiredsize = *outpos+1;
4598 if (res == -1)
4599 return enc_FAILED;
4600 if (outsize<requiredsize)
4601 if (!charmapencode_resize(outobj, outpos, requiredsize))
4602 return enc_EXCEPTION;
4603 outstart = PyString_AS_STRING(*outobj);
4604 outstart[(*outpos)++] = (char)res;
4605 return enc_SUCCESS;
4606 }
4607
4608 rep = charmapencode_lookup(c, mapping);
4609 if (rep==NULL)
4610 return enc_EXCEPTION;
4611 else if (rep==Py_None) {
4612 Py_DECREF(rep);
4613 return enc_FAILED;
4614 } else {
4615 if (PyInt_Check(rep)) {
4616 Py_ssize_t requiredsize = *outpos+1;
4617 if (outsize<requiredsize)
4618 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4619 Py_DECREF(rep);
4620 return enc_EXCEPTION;
4621 }
4622 outstart = PyString_AS_STRING(*outobj);
4623 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4624 }
4625 else {
4626 const char *repchars = PyString_AS_STRING(rep);
4627 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4628 Py_ssize_t requiredsize = *outpos+repsize;
4629 if (outsize<requiredsize)
4630 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4631 Py_DECREF(rep);
4632 return enc_EXCEPTION;
4633 }
4634 outstart = PyString_AS_STRING(*outobj);
4635 memcpy(outstart + *outpos, repchars, repsize);
4636 *outpos += repsize;
4637 }
4638 }
4639 Py_DECREF(rep);
4640 return enc_SUCCESS;
4641 }
4642
4643 /* handle an error in PyUnicode_EncodeCharmap
4644 Return 0 on success, -1 on error */
4645 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4646 int charmap_encoding_error(
4647 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4648 PyObject **exceptionObject,
4649 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4650 PyObject **res, Py_ssize_t *respos)
4651 {
4652 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4653 Py_ssize_t repsize;
4654 Py_ssize_t newpos;
4655 Py_UNICODE *uni2;
4656 /* startpos for collecting unencodable chars */
4657 Py_ssize_t collstartpos = *inpos;
4658 Py_ssize_t collendpos = *inpos+1;
4659 Py_ssize_t collpos;
4660 char *encoding = "charmap";
4661 char *reason = "character maps to <undefined>";
4662 charmapencode_result x;
4663
4664 /* find all unencodable characters */
4665 while (collendpos < size) {
4666 PyObject *rep;
4667 if (Py_TYPE(mapping) == &EncodingMapType) {
4668 int res = encoding_map_lookup(p[collendpos], mapping);
4669 if (res != -1)
4670 break;
4671 ++collendpos;
4672 continue;
4673 }
4674
4675 rep = charmapencode_lookup(p[collendpos], mapping);
4676 if (rep==NULL)
4677 return -1;
4678 else if (rep!=Py_None) {
4679 Py_DECREF(rep);
4680 break;
4681 }
4682 Py_DECREF(rep);
4683 ++collendpos;
4684 }
4685 /* cache callback name lookup
4686 * (if not done yet, i.e. it's the first error) */
4687 if (*known_errorHandler==-1) {
4688 if ((errors==NULL) || (!strcmp(errors, "strict")))
4689 *known_errorHandler = 1;
4690 else if (!strcmp(errors, "replace"))
4691 *known_errorHandler = 2;
4692 else if (!strcmp(errors, "ignore"))
4693 *known_errorHandler = 3;
4694 else if (!strcmp(errors, "xmlcharrefreplace"))
4695 *known_errorHandler = 4;
4696 else
4697 *known_errorHandler = 0;
4698 }
4699 switch (*known_errorHandler) {
4700 case 1: /* strict */
4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702 return -1;
4703 case 2: /* replace */
4704 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4705 x = charmapencode_output('?', mapping, res, respos);
4706 if (x==enc_EXCEPTION) {
4707 return -1;
4708 }
4709 else if (x==enc_FAILED) {
4710 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4711 return -1;
4712 }
4713 }
4714 /* fall through */
4715 case 3: /* ignore */
4716 *inpos = collendpos;
4717 break;
4718 case 4: /* xmlcharrefreplace */
4719 /* generate replacement */
4720 for (collpos = collstartpos; collpos < collendpos;) {
4721 char buffer[2+29+1+1];
4722 char *cp;
4723 Py_UCS4 ch = p[collpos++];
4724 #ifndef Py_UNICODE_WIDE
4725 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4726 (collpos < collendpos) &&
4727 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4728 ch = ((((ch & 0x03FF) << 10) |
4729 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4730 }
4731 #endif
4732 sprintf(buffer, "&#%d;", (int)ch);
4733 for (cp = buffer; *cp; ++cp) {
4734 x = charmapencode_output(*cp, mapping, res, respos);
4735 if (x==enc_EXCEPTION)
4736 return -1;
4737 else if (x==enc_FAILED) {
4738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739 return -1;
4740 }
4741 }
4742 }
4743 *inpos = collendpos;
4744 break;
4745 default:
4746 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4747 encoding, reason, p, size, exceptionObject,
4748 collstartpos, collendpos, &newpos);
4749 if (repunicode == NULL)
4750 return -1;
4751 /* generate replacement */
4752 repsize = PyUnicode_GET_SIZE(repunicode);
4753 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4754 x = charmapencode_output(*uni2, mapping, res, respos);
4755 if (x==enc_EXCEPTION) {
4756 return -1;
4757 }
4758 else if (x==enc_FAILED) {
4759 Py_DECREF(repunicode);
4760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4761 return -1;
4762 }
4763 }
4764 *inpos = newpos;
4765 Py_DECREF(repunicode);
4766 }
4767 return 0;
4768 }
4769
/* Encode the size characters at p into a str object using mapping.
   A NULL mapping selects Latin-1 encoding.  errors names the error
   handling scheme applied to characters the mapping cannot encode.
   Returns a new str object, or NULL with an exception set. */
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    PyObject *out = NULL;           /* output object */
    Py_ssize_t in_idx = 0;          /* current input position */
    Py_ssize_t out_idx = 0;         /* current output position */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* caches the classification of errors:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;
    charmapencode_result status;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    /* start with one output byte per input character; the helpers
       resize the string when a replacement needs more */
    out = PyString_FromStringAndSize(NULL, size);
    if (out == NULL)
        goto onError;
    if (size == 0)
        return out;

    while (in_idx < size) {
        status = charmapencode_output(p[in_idx], mapping, &out, &out_idx);
        switch (status) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances in_idx itself */
            if (charmap_encoding_error(p, size, &in_idx, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &out, &out_idx)) {
                goto onError;
            }
            break;
        default:
            /* done with this character => adjust input position */
            ++in_idx;
            break;
        }
    }

    /* Trim the result if we allocated too much */
    if (out_idx < PyString_GET_SIZE(out)) {
        if (_PyString_Resize(&out, out_idx))
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return out;

  onError:
    Py_XDECREF(out);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4833
/* Encode the Unicode object unicode with mapping, using the default
   error handling (errors is passed as NULL).
   Sets a bad-argument error and returns NULL if unicode is not a
   Unicode object or mapping is NULL. */
PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
                                    PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4846
4847 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4848 static void make_translate_exception(PyObject **exceptionObject,
4849 const Py_UNICODE *unicode, Py_ssize_t size,
4850 Py_ssize_t startpos, Py_ssize_t endpos,
4851 const char *reason)
4852 {
4853 if (*exceptionObject == NULL) {
4854 *exceptionObject = PyUnicodeTranslateError_Create(
4855 unicode, size, startpos, endpos, reason);
4856 }
4857 else {
4858 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4859 goto onError;
4860 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4861 goto onError;
4862 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4863 goto onError;
4864 return;
4865 onError:
4866 Py_CLEAR(*exceptionObject);
4867 }
4868 }
4869
4870 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4871 static void raise_translate_exception(PyObject **exceptionObject,
4872 const Py_UNICODE *unicode, Py_ssize_t size,
4873 Py_ssize_t startpos, Py_ssize_t endpos,
4874 const char *reason)
4875 {
4876 make_translate_exception(exceptionObject,
4877 unicode, size, startpos, endpos, reason);
4878 if (*exceptionObject != NULL)
4879 PyCodec_StrictErrors(*exceptionObject);
4880 }
4881
4882 /* error handling callback helper:
4883 build arguments, call the callback and check the arguments,
4884 put the result into newpos and return the replacement string, which
4885 has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4886 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4887 PyObject **errorHandler,
4888 const char *reason,
4889 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4890 Py_ssize_t startpos, Py_ssize_t endpos,
4891 Py_ssize_t *newpos)
4892 {
4893 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4894
4895 Py_ssize_t i_newpos;
4896 PyObject *restuple;
4897 PyObject *resunicode;
4898
4899 if (*errorHandler == NULL) {
4900 *errorHandler = PyCodec_LookupError(errors);
4901 if (*errorHandler == NULL)
4902 return NULL;
4903 }
4904
4905 make_translate_exception(exceptionObject,
4906 unicode, size, startpos, endpos, reason);
4907 if (*exceptionObject == NULL)
4908 return NULL;
4909
4910 restuple = PyObject_CallFunctionObjArgs(
4911 *errorHandler, *exceptionObject, NULL);
4912 if (restuple == NULL)
4913 return NULL;
4914 if (!PyTuple_Check(restuple)) {
4915 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4916 Py_DECREF(restuple);
4917 return NULL;
4918 }
4919 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4920 &resunicode, &i_newpos)) {
4921 Py_DECREF(restuple);
4922 return NULL;
4923 }
4924 if (i_newpos<0)
4925 *newpos = size+i_newpos;
4926 else
4927 *newpos = i_newpos;
4928 if (*newpos<0 || *newpos>size) {
4929 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4930 Py_DECREF(restuple);
4931 return NULL;
4932 }
4933 Py_INCREF(resunicode);
4934 Py_DECREF(restuple);
4935 return resunicode;
4936 }
4937
4938 /* Lookup the character ch in the mapping and put the result in result,
4939 which must be decrefed by the caller.
4940 Return 0 on success, -1 on error */
4941 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4943 {
4944 PyObject *w = PyInt_FromLong((long)c);
4945 PyObject *x;
4946
4947 if (w == NULL)
4948 return -1;
4949 x = PyObject_GetItem(mapping, w);
4950 Py_DECREF(w);
4951 if (x == NULL) {
4952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4953 /* No mapping found means: use 1:1 mapping. */
4954 PyErr_Clear();
4955 *result = NULL;
4956 return 0;
4957 } else
4958 return -1;
4959 }
4960 else if (x == Py_None) {
4961 *result = x;
4962 return 0;
4963 }
4964 else if (PyInt_Check(x)) {
4965 long value = PyInt_AS_LONG(x);
4966 long max = PyUnicode_GetMax();
4967 if (value < 0 || value > max) {
4968 PyErr_Format(PyExc_TypeError,
4969 "character mapping must be in range(0x%lx)", max+1);
4970 Py_DECREF(x);
4971 return -1;
4972 }
4973 *result = x;
4974 return 0;
4975 }
4976 else if (PyUnicode_Check(x)) {
4977 *result = x;
4978 return 0;
4979 }
4980 else {
4981 /* wrong return value */
4982 PyErr_SetString(PyExc_TypeError,
4983 "character mapping must return integer, None or unicode");
4984 Py_DECREF(x);
4985 return -1;
4986 }
4987 }
4988 /* ensure that *outobj is at least requiredsize characters long,
4989 if not reallocate and adjust various state variables.
4990 Return 0 on success, -1 on error */
4991 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4993 Py_ssize_t requiredsize)
4994 {
4995 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4996 if (requiredsize > oldsize) {
4997 /* remember old output position */
4998 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4999 /* exponentially overallocate to minimize reallocations */
5000 if (requiredsize < 2 * oldsize)
5001 requiredsize = 2 * oldsize;
5002 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5003 return -1;
5004 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5005 }
5006 return 0;
5007 }
5008 /* lookup the character, put the result in the output string and adjust
5009 various state variables. Return a new reference to the object that
5010 was put in the output buffer in *result, or Py_None, if the mapping was
5011 undefined (in which case no character was written).
5012 The called must decref result.
5013 Return 0 on success, -1 on error. */
5014 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5016 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5017 PyObject **res)
5018 {
5019 if (charmaptranslate_lookup(*curinp, mapping, res))
5020 return -1;
5021 if (*res==NULL) {
5022 /* not found => default to 1:1 mapping */
5023 *(*outp)++ = *curinp;
5024 }
5025 else if (*res==Py_None)
5026 ;
5027 else if (PyInt_Check(*res)) {
5028 /* no overflow check, because we know that the space is enough */
5029 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5030 }
5031 else if (PyUnicode_Check(*res)) {
5032 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5033 if (repsize==1) {
5034 /* no overflow check, because we know that the space is enough */
5035 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5036 }
5037 else if (repsize!=0) {
5038 /* more than one character */
5039 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5040 (insize - (curinp-startinp)) +
5041 repsize - 1;
5042 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5043 return -1;
5044 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5045 *outp += repsize;
5046 }
5047 }
5048 else
5049 return -1;
5050 return 0;
5051 }
5052
/* Translate the size characters at p through mapping and return the
   result as a new Unicode object, or NULL with an exception set.
   A character whose mapping is Py_None is untranslatable; runs of such
   characters are handled according to errors ("strict" or NULL,
   "replace", "ignore", "xmlcharrefreplace", or a registered callback
   name).  mapping must not be NULL. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    PyObject *out = NULL;                   /* output object */
    const Py_UNICODE *in_start = p;         /* beginning of input */
    const Py_UNICODE *in_end = p + size;    /* end+1 of input */
    Py_UNICODE *wp;                         /* write pointer into out */
    Py_ssize_t used = 0;                    /* final output length */
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* caches the classification of errors:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int handler_kind = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    out = PyUnicode_FromUnicode(NULL, size);
    if (out == NULL)
        goto onError;
    if (size == 0)
        return out;
    wp = PyUnicode_AS_UNICODE(out);

    while (p < in_end) {
        PyObject *hit = NULL;
        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        /* bounds of the current run of untranslatable chars */
        const Py_UNICODE *collstart;
        const Py_UNICODE *collend;
        const Py_UNICODE *coll;

        /* try to translate it */
        if (charmaptranslate_output(in_start, p, size, mapping, &out, &wp, &hit)) {
            Py_XDECREF(hit);
            goto onError;
        }
        /* after the decref, hit is only compared by identity, never
           dereferenced */
        Py_XDECREF(hit);
        if (hit != Py_None) {
            /* it worked => adjust input pointer */
            ++p;
            continue;
        }

        /* untranslatable character: find all that follow directly */
        collstart = p;
        collend = p + 1;
        while (collend < in_end) {
            if (charmaptranslate_lookup(*collend, mapping, &hit))
                goto onError;
            Py_XDECREF(hit);
            if (hit != Py_None)
                break;
            ++collend;
        }

        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (handler_kind == -1) {
            if ((errors == NULL) || (!strcmp(errors, "strict")))
                handler_kind = 1;
            else if (!strcmp(errors, "replace"))
                handler_kind = 2;
            else if (!strcmp(errors, "ignore"))
                handler_kind = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                handler_kind = 4;
            else
                handler_kind = 0;
        }

        switch (handler_kind) {
        case 1: /* strict */
            raise_translate_exception(&exc, in_start, size, collstart-in_start, collend-in_start, reason);
            goto onError;
        case 2: /* replace */
            /* No need to check for space, this is a 1:1 replacement */
            for (coll = collstart; coll < collend; ++coll)
                *wp++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                char buffer[2+29+1+1];
                char *cp;
                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
                sprintf(buffer, "&#%d;", (int)ch);
                /* room for this reference plus the input after the run */
                if (charmaptranslate_makespace(&out, &wp,
                                               (wp-PyUnicode_AS_UNICODE(out))+strlen(buffer)+(in_end-collend)))
                    goto onError;
                for (cp = buffer; *cp; ++cp)
                    *wp++ = *cp;
            }
            p = collend;
            break;
        default:
            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                             reason, in_start, size, &exc,
                                                             collstart-in_start, collend-in_start, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement */
            repsize = PyUnicode_GET_SIZE(repunicode);
            if (charmaptranslate_makespace(&out, &wp,
                                           (wp-PyUnicode_AS_UNICODE(out))+repsize+(in_end-collend))) {
                Py_DECREF(repunicode);
                goto onError;
            }
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                *wp++ = *uni2;
            /* resume where the callback told us to */
            p = in_start + newpos;
            Py_DECREF(repunicode);
        }
    }

    /* Trim the result if we allocated too much */
    used = wp - PyUnicode_AS_UNICODE(out);
    if (used < PyUnicode_GET_SIZE(out)) {
        if (PyUnicode_Resize(&out, used) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return out;

  onError:
    Py_XDECREF(out);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5195
/* Translate str through mapping.  str is first converted with
   PyUnicode_FromObject(); the work is done by
   PyUnicode_TranslateCharmap().
   Returns the translated object, or NULL if the conversion or the
   translation failed. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *translated;
    PyObject *ustr = PyUnicode_FromObject(str);

    if (ustr == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(ustr),
                                            PyUnicode_GET_SIZE(ustr),
                                            mapping,
                                            errors);
    /* drop the reference obtained from PyUnicode_FromObject() */
    Py_DECREF(ustr);
    return translated;
}
5216
5217 /* --- Decimal Encoder ---------------------------------------------------- */
5218
/* Encode the length characters at s into the caller-supplied buffer
   output and 0-terminate it:
     - characters for which Py_UNICODE_ISSPACE() is true become ' ';
     - characters with a decimal digit value become '0'..'9';
     - other characters in the range 1..255 are copied as a single byte;
     - everything else (including the NUL character) is unencodable and
       is handled according to errors ("strict" or NULL, "replace",
       "ignore", "xmlcharrefreplace", or a registered callback name).
   No bounds checking is done on output here; it must be large enough
   for whatever the chosen error handling produces.
   Returns 0 on success, -1 on error (with an exception set). */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable: collect the
           run up to the next character one of the tests above accepts */
        collstart = p;
        for (collend = p+1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler == -1) {
            if ((errors == NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                Py_UCS4 codepoint = _Py_UNICODE_NEXT(p, collend);
                /* %d expects an int: cast explicitly, as the other
                   xmlcharrefreplace implementations in this file do */
                output += sprintf(output, "&#%d;", (int)codepoint);
            }
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the callback's string is encoded
               with the same rules as regular input */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE repch = *uni2;
                if (Py_UNICODE_ISSPACE(repch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(repch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < repch && repch < 256)
                        *output++ = (char)repch;
                    else {
                        /* the replacement itself is unencodable */
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume where the callback told us to */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5348
5349 /* --- Helpers ------------------------------------------------------------ */
5350
5351 #include "stringlib/unicodedefs.h"
5352 #include "stringlib/fastsearch.h"
5353
5354 #include "stringlib/count.h"
5355 #include "stringlib/find.h"
5356 #include "stringlib/partition.h"
5357 #include "stringlib/split.h"
5358
/* Helper macro to fix up start/end slice values against a length:
   end is clamped to len from above; a negative end or start is taken
   relative to len and clamped to 0 from below.  start is not clamped
   from above.  The arguments are evaluated more than once and
   start/end are assigned to, so they must be plain lvalues. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
5373
/* Count the occurrences of substr in str[start:end], as computed by
   stringlib_count().  Both arguments are first converted with
   PyUnicode_FromObject().
   Returns the count, or -1 if either conversion failed. */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    Py_ssize_t count = -1;
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;

    haystack = (PyUnicodeObject *)PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *)PyUnicode_FromObject(substr);
    if (needle != NULL) {
        /* normalize the slice bounds against the string length */
        ADJUST_INDICES(start, end, haystack->length);
        count = stringlib_count(
            haystack->str + start, end - start, needle->str, needle->length,
            PY_SSIZE_T_MAX
            );
        Py_DECREF(needle);
    }

    Py_DECREF(haystack);
    return count;
}
5403
/* Search for sub in str[start:end].  A positive direction uses
   stringlib_find_slice(), any other value stringlib_rfind_slice().
   Both arguments are first converted with PyUnicode_FromObject().
   Returns the search result, or -2 if either conversion failed. */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    Py_ssize_t pos;

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return -2;

    sub = PyUnicode_FromObject(sub);
    if (sub == NULL) {
        Py_DECREF(str);
        return -2;
    }

    if (direction > 0) {
        pos = stringlib_find_slice(
            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
            start, end
            );
    }
    else {
        pos = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
            start, end
            );
    }

    /* release the references obtained from PyUnicode_FromObject() */
    Py_DECREF(str);
    Py_DECREF(sub);

    return pos;
}
5439
5440 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5441 int tailmatch(PyUnicodeObject *self,
5442 PyUnicodeObject *substring,
5443 Py_ssize_t start,
5444 Py_ssize_t end,
5445 int direction)
5446 {
5447 if (substring->length == 0)
5448 return 1;
5449
5450 ADJUST_INDICES(start, end, self->length);
5451 end -= substring->length;
5452 if (end < start)
5453 return 0;
5454
5455 if (direction > 0) {
5456 if (Py_UNICODE_MATCH(self, end, substring))
5457 return 1;
5458 } else {
5459 if (Py_UNICODE_MATCH(self, start, substring))
5460 return 1;
5461 }
5462
5463 return 0;
5464 }
5465
/* Public wrapper around tailmatch(): converts both arguments with
   PyUnicode_FromObject() and reports whether substr matches at the
   head or tail of str[start:end], depending on direction.
   Returns the tailmatch() result, or -1 if either conversion failed. */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    Py_ssize_t matched;

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return -1;

    substr = PyUnicode_FromObject(substr);
    if (substr == NULL) {
        Py_DECREF(str);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)str,
                        (PyUnicodeObject *)substr,
                        start, end, direction);

    /* release the references obtained from PyUnicode_FromObject() */
    Py_DECREF(str);
    Py_DECREF(substr);
    return matched;
}
5490
5491 /* Apply fixfct filter to the Unicode object self and return a
5492 reference to the modified object */
5493
5494 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5495 PyObject *fixup(PyUnicodeObject *self,
5496 int (*fixfct)(PyUnicodeObject *s))
5497 {
5498
5499 PyUnicodeObject *u;
5500
5501 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5502 if (u == NULL)
5503 return NULL;
5504
5505 Py_UNICODE_COPY(u->str, self->str, self->length);
5506
5507 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5508 /* fixfct should return TRUE if it modified the buffer. If
5509 FALSE, return a reference to the original buffer instead
5510 (to save space, not time) */
5511 Py_INCREF(self);
5512 Py_DECREF(u);
5513 return (PyObject*) self;
5514 }
5515 return (PyObject*) u;
5516 }
5517
5518 static
fixupper(PyUnicodeObject * self)5519 int fixupper(PyUnicodeObject *self)
5520 {
5521 Py_ssize_t len = self->length;
5522 Py_UNICODE *s = self->str;
5523 int status = 0;
5524
5525 while (len-- > 0) {
5526 register Py_UNICODE ch;
5527
5528 ch = Py_UNICODE_TOUPPER(*s);
5529 if (ch != *s) {
5530 status = 1;
5531 *s = ch;
5532 }
5533 s++;
5534 }
5535
5536 return status;
5537 }
5538
5539 static
fixlower(PyUnicodeObject * self)5540 int fixlower(PyUnicodeObject *self)
5541 {
5542 Py_ssize_t len = self->length;
5543 Py_UNICODE *s = self->str;
5544 int status = 0;
5545
5546 while (len-- > 0) {
5547 register Py_UNICODE ch;
5548
5549 ch = Py_UNICODE_TOLOWER(*s);
5550 if (ch != *s) {
5551 status = 1;
5552 *s = ch;
5553 }
5554 s++;
5555 }
5556
5557 return status;
5558 }
5559
5560 static
fixswapcase(PyUnicodeObject * self)5561 int fixswapcase(PyUnicodeObject *self)
5562 {
5563 Py_ssize_t len = self->length;
5564 Py_UNICODE *s = self->str;
5565 int status = 0;
5566
5567 while (len-- > 0) {
5568 if (Py_UNICODE_ISUPPER(*s)) {
5569 *s = Py_UNICODE_TOLOWER(*s);
5570 status = 1;
5571 } else if (Py_UNICODE_ISLOWER(*s)) {
5572 *s = Py_UNICODE_TOUPPER(*s);
5573 status = 1;
5574 }
5575 s++;
5576 }
5577
5578 return status;
5579 }
5580
5581 static
fixcapitalize(PyUnicodeObject * self)5582 int fixcapitalize(PyUnicodeObject *self)
5583 {
5584 Py_ssize_t len = self->length;
5585 Py_UNICODE *s = self->str;
5586 int status = 0;
5587
5588 if (len == 0)
5589 return 0;
5590 if (!Py_UNICODE_ISUPPER(*s)) {
5591 *s = Py_UNICODE_TOUPPER(*s);
5592 status = 1;
5593 }
5594 s++;
5595 while (--len > 0) {
5596 if (!Py_UNICODE_ISLOWER(*s)) {
5597 *s = Py_UNICODE_TOLOWER(*s);
5598 status = 1;
5599 }
5600 s++;
5601 }
5602 return status;
5603 }
5604
5605 static
fixtitle(PyUnicodeObject * self)5606 int fixtitle(PyUnicodeObject *self)
5607 {
5608 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609 register Py_UNICODE *e;
5610 int previous_is_cased;
5611
5612 /* Shortcut for single character strings */
5613 if (PyUnicode_GET_SIZE(self) == 1) {
5614 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5615 if (*p != ch) {
5616 *p = ch;
5617 return 1;
5618 }
5619 else
5620 return 0;
5621 }
5622
5623 e = p + PyUnicode_GET_SIZE(self);
5624 previous_is_cased = 0;
5625 for (; p < e; p++) {
5626 register const Py_UNICODE ch = *p;
5627
5628 if (previous_is_cased)
5629 *p = Py_UNICODE_TOLOWER(ch);
5630 else
5631 *p = Py_UNICODE_TOTITLE(ch);
5632
5633 if (Py_UNICODE_ISLOWER(ch) ||
5634 Py_UNICODE_ISUPPER(ch) ||
5635 Py_UNICODE_ISTITLE(ch))
5636 previous_is_cased = 1;
5637 else
5638 previous_is_cased = 0;
5639 }
5640 return 1;
5641 }
5642
5643 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)5644 PyUnicode_Join(PyObject *separator, PyObject *seq)
5645 {
5646 PyObject *internal_separator = NULL;
5647 const Py_UNICODE blank = ' ';
5648 const Py_UNICODE *sep = ␣
5649 Py_ssize_t seplen = 1;
5650 PyUnicodeObject *res = NULL; /* the result */
5651 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5652 Py_ssize_t res_used; /* # used bytes */
5653 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5654 PyObject *fseq; /* PySequence_Fast(seq) */
5655 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5656 PyObject *item;
5657 Py_ssize_t i;
5658
5659 fseq = PySequence_Fast(seq, "can only join an iterable");
5660 if (fseq == NULL) {
5661 return NULL;
5662 }
5663
5664 /* Grrrr. A codec may be invoked to convert str objects to
5665 * Unicode, and so it's possible to call back into Python code
5666 * during PyUnicode_FromObject(), and so it's possible for a sick
5667 * codec to change the size of fseq (if seq is a list). Therefore
5668 * we have to keep refetching the size -- can't assume seqlen
5669 * is invariant.
5670 */
5671 seqlen = PySequence_Fast_GET_SIZE(fseq);
5672 /* If empty sequence, return u"". */
5673 if (seqlen == 0) {
5674 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5675 goto Done;
5676 }
5677 /* If singleton sequence with an exact Unicode, return that. */
5678 if (seqlen == 1) {
5679 item = PySequence_Fast_GET_ITEM(fseq, 0);
5680 if (PyUnicode_CheckExact(item)) {
5681 Py_INCREF(item);
5682 res = (PyUnicodeObject *)item;
5683 goto Done;
5684 }
5685 }
5686
5687 /* At least two items to join, or one that isn't exact Unicode. */
5688 if (seqlen > 1) {
5689 /* Set up sep and seplen -- they're needed. */
5690 if (separator == NULL) {
5691 sep = ␣
5692 seplen = 1;
5693 }
5694 else {
5695 internal_separator = PyUnicode_FromObject(separator);
5696 if (internal_separator == NULL)
5697 goto onError;
5698 sep = PyUnicode_AS_UNICODE(internal_separator);
5699 seplen = PyUnicode_GET_SIZE(internal_separator);
5700 /* In case PyUnicode_FromObject() mutated seq. */
5701 seqlen = PySequence_Fast_GET_SIZE(fseq);
5702 }
5703 }
5704
5705 /* Get space. */
5706 res = _PyUnicode_New(res_alloc);
5707 if (res == NULL)
5708 goto onError;
5709 res_p = PyUnicode_AS_UNICODE(res);
5710 res_used = 0;
5711
5712 for (i = 0; i < seqlen; ++i) {
5713 Py_ssize_t itemlen;
5714 Py_ssize_t new_res_used;
5715
5716 item = PySequence_Fast_GET_ITEM(fseq, i);
5717 /* Convert item to Unicode. */
5718 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5719 PyErr_Format(PyExc_TypeError,
5720 "sequence item %zd: expected string or Unicode,"
5721 " %.80s found",
5722 i, Py_TYPE(item)->tp_name);
5723 goto onError;
5724 }
5725 item = PyUnicode_FromObject(item);
5726 if (item == NULL)
5727 goto onError;
5728 /* We own a reference to item from here on. */
5729
5730 /* In case PyUnicode_FromObject() mutated seq. */
5731 seqlen = PySequence_Fast_GET_SIZE(fseq);
5732
5733 /* Make sure we have enough space for the separator and the item. */
5734 itemlen = PyUnicode_GET_SIZE(item);
5735 if (res_used > PY_SSIZE_T_MAX - itemlen)
5736 goto Overflow;
5737 new_res_used = res_used + itemlen;
5738 if (i < seqlen - 1) {
5739 if (new_res_used > PY_SSIZE_T_MAX - seplen)
5740 goto Overflow;
5741 new_res_used += seplen;
5742 }
5743 if (new_res_used > res_alloc) {
5744 /* double allocated size until it's big enough */
5745 do {
5746 if (res_alloc > PY_SSIZE_T_MAX / 2)
5747 goto Overflow;
5748 res_alloc += res_alloc;
5749 } while (new_res_used > res_alloc);
5750 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5751 Py_DECREF(item);
5752 goto onError;
5753 }
5754 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5755 }
5756
5757 /* Copy item, and maybe the separator. */
5758 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5759 res_p += itemlen;
5760 if (i < seqlen - 1) {
5761 Py_UNICODE_COPY(res_p, sep, seplen);
5762 res_p += seplen;
5763 }
5764 Py_DECREF(item);
5765 res_used = new_res_used;
5766 }
5767
5768 /* Shrink res to match the used area; this probably can't fail,
5769 * but it's cheap to check.
5770 */
5771 if (_PyUnicode_Resize(&res, res_used) < 0)
5772 goto onError;
5773
5774 Done:
5775 Py_XDECREF(internal_separator);
5776 Py_DECREF(fseq);
5777 return (PyObject *)res;
5778
5779 Overflow:
5780 PyErr_SetString(PyExc_OverflowError,
5781 "join() result is too long for a Python string");
5782 Py_DECREF(item);
5783 /* fall through */
5784
5785 onError:
5786 Py_XDECREF(internal_separator);
5787 Py_DECREF(fseq);
5788 Py_XDECREF(res);
5789 return NULL;
5790 }
5791
5792 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5793 PyUnicodeObject *pad(PyUnicodeObject *self,
5794 Py_ssize_t left,
5795 Py_ssize_t right,
5796 Py_UNICODE fill)
5797 {
5798 PyUnicodeObject *u;
5799
5800 if (left < 0)
5801 left = 0;
5802 if (right < 0)
5803 right = 0;
5804
5805 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5806 Py_INCREF(self);
5807 return self;
5808 }
5809
5810 if (left > PY_SSIZE_T_MAX - self->length ||
5811 right > PY_SSIZE_T_MAX - (left + self->length)) {
5812 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5813 return NULL;
5814 }
5815 u = _PyUnicode_New(left + self->length + right);
5816 if (u) {
5817 if (left)
5818 Py_UNICODE_FILL(u->str, fill, left);
5819 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5820 if (right)
5821 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5822 }
5823
5824 return u;
5825 }
5826
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    /* Coerce string to Unicode and split it at line boundaries via
       stringlib_splitlines(); keepends is forwarded unchanged.
       Returns NULL if the coercion fails. */
    PyObject *ustr;
    PyObject *lines;

    ustr = PyUnicode_FromObject(string);
    if (ustr == NULL)
        return NULL;

    lines = stringlib_splitlines(ustr,
                                 PyUnicode_AS_UNICODE(ustr),
                                 PyUnicode_GET_SIZE(ustr),
                                 keepends);

    Py_DECREF(ustr);
    return lines;
}
5842
5843 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5844 PyObject *split(PyUnicodeObject *self,
5845 PyUnicodeObject *substring,
5846 Py_ssize_t maxcount)
5847 {
5848 if (maxcount < 0)
5849 maxcount = PY_SSIZE_T_MAX;
5850
5851 if (substring == NULL)
5852 return stringlib_split_whitespace(
5853 (PyObject*) self, self->str, self->length, maxcount
5854 );
5855
5856 return stringlib_split(
5857 (PyObject*) self, self->str, self->length,
5858 substring->str, substring->length,
5859 maxcount
5860 );
5861 }
5862
5863 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5864 PyObject *rsplit(PyUnicodeObject *self,
5865 PyUnicodeObject *substring,
5866 Py_ssize_t maxcount)
5867 {
5868 if (maxcount < 0)
5869 maxcount = PY_SSIZE_T_MAX;
5870
5871 if (substring == NULL)
5872 return stringlib_rsplit_whitespace(
5873 (PyObject*) self, self->str, self->length, maxcount
5874 );
5875
5876 return stringlib_rsplit(
5877 (PyObject*) self, self->str, self->length,
5878 substring->str, substring->length,
5879 maxcount
5880 );
5881 }
5882
5883 static
replace(PyUnicodeObject * self,PyUnicodeObject * str1,PyUnicodeObject * str2,Py_ssize_t maxcount)5884 PyObject *replace(PyUnicodeObject *self,
5885 PyUnicodeObject *str1,
5886 PyUnicodeObject *str2,
5887 Py_ssize_t maxcount)
5888 {
5889 PyUnicodeObject *u;
5890
5891 if (maxcount < 0)
5892 maxcount = PY_SSIZE_T_MAX;
5893 else if (maxcount == 0 || self->length == 0)
5894 goto nothing;
5895
5896 if (str1->length == str2->length) {
5897 Py_ssize_t i;
5898 /* same length */
5899 if (str1->length == 0)
5900 goto nothing;
5901 if (str1->length == 1) {
5902 /* replace characters */
5903 Py_UNICODE u1, u2;
5904 if (!findchar(self->str, self->length, str1->str[0]))
5905 goto nothing;
5906 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5907 if (!u)
5908 return NULL;
5909 Py_UNICODE_COPY(u->str, self->str, self->length);
5910 u1 = str1->str[0];
5911 u2 = str2->str[0];
5912 for (i = 0; i < u->length; i++)
5913 if (u->str[i] == u1) {
5914 if (--maxcount < 0)
5915 break;
5916 u->str[i] = u2;
5917 }
5918 } else {
5919 i = stringlib_find(
5920 self->str, self->length, str1->str, str1->length, 0
5921 );
5922 if (i < 0)
5923 goto nothing;
5924 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5925 if (!u)
5926 return NULL;
5927 Py_UNICODE_COPY(u->str, self->str, self->length);
5928
5929 /* change everything in-place, starting with this one */
5930 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5931 i += str1->length;
5932
5933 while ( --maxcount > 0) {
5934 i = stringlib_find(self->str+i, self->length-i,
5935 str1->str, str1->length,
5936 i);
5937 if (i == -1)
5938 break;
5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940 i += str1->length;
5941 }
5942 }
5943 } else {
5944
5945 Py_ssize_t n, i, j;
5946 Py_ssize_t new_size, delta;
5947 Py_UNICODE *p;
5948
5949 /* replace strings */
5950 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5951 maxcount);
5952 if (n == 0)
5953 goto nothing;
5954 /* new_size = self->length + n * (str2->length - str1->length)); */
5955 delta = (str2->length - str1->length);
5956 if (delta == 0) {
5957 new_size = self->length;
5958 } else {
5959 assert(n > 0);
5960 if (delta > (PY_SSIZE_T_MAX - self->length) / n) {
5961 PyErr_SetString(PyExc_OverflowError,
5962 "replace string is too long");
5963 return NULL;
5964 }
5965 new_size = self->length + delta * n;
5966 }
5967 u = _PyUnicode_New(new_size);
5968 if (!u)
5969 return NULL;
5970 i = 0;
5971 p = u->str;
5972 if (str1->length > 0) {
5973 while (n-- > 0) {
5974 /* look for next match */
5975 j = stringlib_find(self->str+i, self->length-i,
5976 str1->str, str1->length,
5977 i);
5978 if (j == -1)
5979 break;
5980 else if (j > i) {
5981 /* copy unchanged part [i:j] */
5982 Py_UNICODE_COPY(p, self->str+i, j-i);
5983 p += j - i;
5984 }
5985 /* copy substitution string */
5986 if (str2->length > 0) {
5987 Py_UNICODE_COPY(p, str2->str, str2->length);
5988 p += str2->length;
5989 }
5990 i = j + str1->length;
5991 }
5992 if (i < self->length)
5993 /* copy tail [i:] */
5994 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5995 } else {
5996 /* interleave */
5997 while (n > 0) {
5998 Py_UNICODE_COPY(p, str2->str, str2->length);
5999 p += str2->length;
6000 if (--n <= 0)
6001 break;
6002 *p++ = self->str[i++];
6003 }
6004 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6005 }
6006 }
6007 return (PyObject *) u;
6008
6009 nothing:
6010 /* nothing to replace; return original string (when possible) */
6011 if (PyUnicode_CheckExact(self)) {
6012 Py_INCREF(self);
6013 return (PyObject *) self;
6014 }
6015 return PyUnicode_FromUnicode(self->str, self->length);
6016 }
6017
6018 /* --- Unicode Object Methods --------------------------------------------- */
6019
6020 PyDoc_STRVAR(title__doc__,
6021 "S.title() -> unicode\n\
6022 \n\
6023 Return a titlecased version of S, i.e. words start with title case\n\
6024 characters, all remaining cased characters have lower case.");
6025
6026 static PyObject*
unicode_title(PyUnicodeObject * self)6027 unicode_title(PyUnicodeObject *self)
6028 {
6029 return fixup(self, fixtitle);
6030 }
6031
6032 PyDoc_STRVAR(capitalize__doc__,
6033 "S.capitalize() -> unicode\n\
6034 \n\
6035 Return a capitalized version of S, i.e. make the first character\n\
6036 have upper case and the rest lower case.");
6037
6038 static PyObject*
unicode_capitalize(PyUnicodeObject * self)6039 unicode_capitalize(PyUnicodeObject *self)
6040 {
6041 return fixup(self, fixcapitalize);
6042 }
6043
6044 #if 0
6045 PyDoc_STRVAR(capwords__doc__,
6046 "S.capwords() -> unicode\n\
6047 \n\
6048 Apply .capitalize() to all words in S and return the result with\n\
6049 normalized whitespace (all whitespace strings are replaced by ' ').");
6050
6051 static PyObject*
6052 unicode_capwords(PyUnicodeObject *self)
6053 {
6054 PyObject *list;
6055 PyObject *item;
6056 Py_ssize_t i;
6057
6058 /* Split into words */
6059 list = split(self, NULL, -1);
6060 if (!list)
6061 return NULL;
6062
6063 /* Capitalize each word */
6064 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6065 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6066 fixcapitalize);
6067 if (item == NULL)
6068 goto onError;
6069 Py_DECREF(PyList_GET_ITEM(list, i));
6070 PyList_SET_ITEM(list, i, item);
6071 }
6072
6073 /* Join the words to form a new string */
6074 item = PyUnicode_Join(NULL, list);
6075
6076 onError:
6077 Py_DECREF(list);
6078 return (PyObject *)item;
6079 }
6080 #endif
6081
6082 /* Argument converter. Coerces to a single unicode character */
6083
6084 static int
convert_uc(PyObject * obj,void * addr)6085 convert_uc(PyObject *obj, void *addr)
6086 {
6087 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6088 PyObject *uniobj;
6089 Py_UNICODE *unistr;
6090
6091 uniobj = PyUnicode_FromObject(obj);
6092 if (uniobj == NULL) {
6093 PyErr_SetString(PyExc_TypeError,
6094 "The fill character cannot be converted to Unicode");
6095 return 0;
6096 }
6097 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6098 PyErr_SetString(PyExc_TypeError,
6099 "The fill character must be exactly one character long");
6100 Py_DECREF(uniobj);
6101 return 0;
6102 }
6103 unistr = PyUnicode_AS_UNICODE(uniobj);
6104 *fillcharloc = unistr[0];
6105 Py_DECREF(uniobj);
6106 return 1;
6107 }
6108
6109 PyDoc_STRVAR(center__doc__,
6110 "S.center(width[, fillchar]) -> unicode\n\
6111 \n\
6112 Return S centered in a Unicode string of length width. Padding is\n\
6113 done using the specified fill character (default is a space)");
6114
6115 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6116 unicode_center(PyUnicodeObject *self, PyObject *args)
6117 {
6118 Py_ssize_t marg, left;
6119 Py_ssize_t width;
6120 Py_UNICODE fillchar = ' ';
6121
6122 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6123 return NULL;
6124
6125 if (self->length >= width && PyUnicode_CheckExact(self)) {
6126 Py_INCREF(self);
6127 return (PyObject*) self;
6128 }
6129
6130 marg = width - self->length;
6131 left = marg / 2 + (marg & width & 1);
6132
6133 return (PyObject*) pad(self, left, marg - left, fillchar);
6134 }
6135
6136 #if 0
6137
6138 /* This code should go into some future Unicode collation support
6139 module. The basic comparison should compare ordinals on a naive
6140 basis (this is what Java does and thus Jython too). */
6141
6142 /* speedy UTF-16 code point order comparison */
6143 /* gleaned from: */
6144 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6145
6146 static short utf16Fixup[32] =
6147 {
6148 0, 0, 0, 0, 0, 0, 0, 0,
6149 0, 0, 0, 0, 0, 0, 0, 0,
6150 0, 0, 0, 0, 0, 0, 0, 0,
6151 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6152 };
6153
6154 static int
6155 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6156 {
6157 Py_ssize_t len1, len2;
6158
6159 Py_UNICODE *s1 = str1->str;
6160 Py_UNICODE *s2 = str2->str;
6161
6162 len1 = str1->length;
6163 len2 = str2->length;
6164
6165 while (len1 > 0 && len2 > 0) {
6166 Py_UNICODE c1, c2;
6167
6168 c1 = *s1++;
6169 c2 = *s2++;
6170
6171 if (c1 > (1<<11) * 26)
6172 c1 += utf16Fixup[c1>>11];
6173 if (c2 > (1<<11) * 26)
6174 c2 += utf16Fixup[c2>>11];
6175 /* now c1 and c2 are in UTF-32-compatible order */
6176
6177 if (c1 != c2)
6178 return (c1 < c2) ? -1 : 1;
6179
6180 len1--; len2--;
6181 }
6182
6183 return (len1 < len2) ? -1 : (len1 != len2);
6184 }
6185
6186 #else
6187
6188 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6189 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6190 {
6191 register Py_ssize_t len1, len2;
6192
6193 Py_UNICODE *s1 = str1->str;
6194 Py_UNICODE *s2 = str2->str;
6195
6196 len1 = str1->length;
6197 len2 = str2->length;
6198
6199 while (len1 > 0 && len2 > 0) {
6200 Py_UNICODE c1, c2;
6201
6202 c1 = *s1++;
6203 c2 = *s2++;
6204
6205 if (c1 != c2)
6206 return (c1 < c2) ? -1 : 1;
6207
6208 len1--; len2--;
6209 }
6210
6211 return (len1 < len2) ? -1 : (len1 != len2);
6212 }
6213
6214 #endif
6215
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    /* Coerce both operands to Unicode and compare them.  Returns -1, 0
       or 1; a failed coercion also returns -1, so callers have to look
       at the error indicator to tell the two apart. */
    PyUnicodeObject *lhs;
    PyUnicodeObject *rhs;
    int order;

    /* Coerce the two arguments */
    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;

    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* Shortcut for empty or interned objects: identical objects are
       equal without looking at their contents. */
    order = (lhs == rhs) ? 0 : unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return order;
}
6248
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    /* Rich comparison built on PyUnicode_Compare().  Returns a bool,
       Py_NotImplemented when an operand cannot be coerced (TypeError),
       or NULL with an exception set. */
    int result;
    int truth;

    result = PyUnicode_Compare(left, right);

    if (result == -1 && PyErr_Occurred()) {
        /* Standard case

           Type errors mean that PyUnicode_FromObject() could not convert
           one of the arguments (usually the right hand side) to Unicode,
           ie. we can't handle the comparison request. However, it is
           possible that the other object knows a comparison method, which
           is why we return Py_NotImplemented to give the other object a
           chance.
        */
        if (PyErr_ExceptionMatches(PyExc_TypeError)) {
            PyErr_Clear();
            Py_INCREF(Py_NotImplemented);
            return Py_NotImplemented;
        }
        if (op != Py_EQ && op != Py_NE)
            return NULL;

        /* Equality comparison.

           This is a special case: we silence any PyExc_UnicodeDecodeError
           and instead turn it into a PyErr_UnicodeWarning.
        */
        if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
            return NULL;
        PyErr_Clear();
        if (PyErr_Warn(PyExc_UnicodeWarning,
                       (op == Py_EQ) ?
                       "Unicode equal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal" :
                       "Unicode unequal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal"
                ) < 0)
            return NULL;

        /* The operands count as unequal: == is False, != is True. */
        return PyBool_FromLong(op == Py_NE);
    }

    /* Convert the return value to a Boolean; an unrecognised op leaves
       the raw comparison result in place. */
    truth = result;
    switch (op) {
    case Py_EQ:
        truth = (result == 0);
        break;
    case Py_NE:
        truth = (result != 0);
        break;
    case Py_LE:
        truth = (result <= 0);
        break;
    case Py_GE:
        truth = (result >= 0);
        break;
    case Py_LT:
        truth = (result == -1);
        break;
    case Py_GT:
        truth = (result == 1);
        break;
    }
    return PyBool_FromLong(truth);
}
6324
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    /* Coerce element and then container to Unicode and return the result
       of stringlib_contains_obj() for the pair; -1 if a coercion fails. */
    PyObject *needle;
    PyObject *haystack;
    int found;

    /* The element is coerced first, then the container. */
    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack == NULL) {
        Py_DECREF(needle);
        return -1;
    }

    found = stringlib_contains_obj(haystack, needle);

    Py_DECREF(haystack);
    Py_DECREF(needle);

    return found;
}
6350
6351 /* Concat to string or Unicode object giving a new Unicode object. */
6352
PyUnicode_Concat(PyObject * left,PyObject * right)6353 PyObject *PyUnicode_Concat(PyObject *left,
6354 PyObject *right)
6355 {
6356 PyUnicodeObject *u = NULL, *v = NULL, *w;
6357
6358 /* Coerce the two arguments */
6359 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6360 if (u == NULL)
6361 goto onError;
6362 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6363 if (v == NULL)
6364 goto onError;
6365
6366 /* Shortcuts */
6367 if (v == unicode_empty) {
6368 Py_DECREF(v);
6369 return (PyObject *)u;
6370 }
6371 if (u == unicode_empty) {
6372 Py_DECREF(u);
6373 return (PyObject *)v;
6374 }
6375
6376 if (u->length > PY_SSIZE_T_MAX - v->length) {
6377 PyErr_SetString(PyExc_OverflowError,
6378 "strings are too large to concat");
6379 goto onError;
6380 }
6381
6382 /* Concat the two Unicode strings */
6383 w = _PyUnicode_New(u->length + v->length);
6384 if (w == NULL)
6385 goto onError;
6386 Py_UNICODE_COPY(w->str, u->str, u->length);
6387 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6388
6389 Py_DECREF(u);
6390 Py_DECREF(v);
6391 return (PyObject *)w;
6392
6393 onError:
6394 Py_XDECREF(u);
6395 Py_XDECREF(v);
6396 return NULL;
6397 }
6398
6399 PyDoc_STRVAR(count__doc__,
6400 "S.count(sub[, start[, end]]) -> int\n\
6401 \n\
6402 Return the number of non-overlapping occurrences of substring sub in\n\
6403 Unicode string S[start:end]. Optional arguments start and end are\n\
6404 interpreted as in slice notation.");
6405
6406 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6407 unicode_count(PyUnicodeObject *self, PyObject *args)
6408 {
6409 PyUnicodeObject *substring;
6410 Py_ssize_t start = 0;
6411 Py_ssize_t end = PY_SSIZE_T_MAX;
6412 PyObject *result;
6413
6414 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6415 &start, &end))
6416 return NULL;
6417
6418 ADJUST_INDICES(start, end, self->length);
6419 result = PyInt_FromSsize_t(
6420 stringlib_count(self->str + start, end - start,
6421 substring->str, substring->length,
6422 PY_SSIZE_T_MAX)
6423 );
6424
6425 Py_DECREF(substring);
6426
6427 return result;
6428 }
6429
6430 PyDoc_STRVAR(encode__doc__,
6431 "S.encode([encoding[,errors]]) -> string or unicode\n\
6432 \n\
6433 Encodes S using the codec registered for encoding. encoding defaults\n\
6434 to the default encoding. errors may be given to set a different error\n\
6435 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6436 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6437 'xmlcharrefreplace' as well as any other name registered with\n\
6438 codecs.register_error that can handle UnicodeEncodeErrors.");
6439
6440 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6441 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6442 {
6443 static char *kwlist[] = {"encoding", "errors", 0};
6444 char *encoding = NULL;
6445 char *errors = NULL;
6446 PyObject *v;
6447
6448 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6449 kwlist, &encoding, &errors))
6450 return NULL;
6451 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6452 if (v == NULL)
6453 goto onError;
6454 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6455 PyErr_Format(PyExc_TypeError,
6456 "encoder did not return a string/unicode object "
6457 "(type=%.400s)",
6458 Py_TYPE(v)->tp_name);
6459 Py_DECREF(v);
6460 return NULL;
6461 }
6462 return v;
6463
6464 onError:
6465 return NULL;
6466 }
6467
6468 PyDoc_STRVAR(decode__doc__,
6469 "S.decode([encoding[,errors]]) -> string or unicode\n\
6470 \n\
6471 Decodes S using the codec registered for encoding. encoding defaults\n\
6472 to the default encoding. errors may be given to set a different error\n\
6473 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6474 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6475 as well as any other name registered with codecs.register_error that is\n\
6476 able to handle UnicodeDecodeErrors.");
6477
6478 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6479 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6480 {
6481 static char *kwlist[] = {"encoding", "errors", 0};
6482 char *encoding = NULL;
6483 char *errors = NULL;
6484 PyObject *v;
6485
6486 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6487 kwlist, &encoding, &errors))
6488 return NULL;
6489 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6490 if (v == NULL)
6491 goto onError;
6492 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6493 PyErr_Format(PyExc_TypeError,
6494 "decoder did not return a string/unicode object "
6495 "(type=%.400s)",
6496 Py_TYPE(v)->tp_name);
6497 Py_DECREF(v);
6498 return NULL;
6499 }
6500 return v;
6501
6502 onError:
6503 return NULL;
6504 }
6505
6506 PyDoc_STRVAR(expandtabs__doc__,
6507 "S.expandtabs([tabsize]) -> unicode\n\
6508 \n\
6509 Return a copy of S where all tab characters are expanded using spaces.\n\
6510 If tabsize is not given, a tab size of 8 characters is assumed.");
6511
6512 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6513 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6514 {
6515 Py_UNICODE *e;
6516 Py_UNICODE *p;
6517 Py_UNICODE *q;
6518 Py_UNICODE *qe;
6519 Py_ssize_t i, j, incr;
6520 PyUnicodeObject *u;
6521 int tabsize = 8;
6522
6523 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6524 return NULL;
6525
6526 /* First pass: determine size of output string */
6527 i = 0; /* chars up to and including most recent \n or \r */
6528 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6529 e = self->str + self->length; /* end of input */
6530 for (p = self->str; p < e; p++)
6531 if (*p == '\t') {
6532 if (tabsize > 0) {
6533 incr = tabsize - (j % tabsize); /* cannot overflow */
6534 if (j > PY_SSIZE_T_MAX - incr)
6535 goto overflow1;
6536 j += incr;
6537 }
6538 }
6539 else {
6540 if (j > PY_SSIZE_T_MAX - 1)
6541 goto overflow1;
6542 j++;
6543 if (*p == '\n' || *p == '\r') {
6544 if (i > PY_SSIZE_T_MAX - j)
6545 goto overflow1;
6546 i += j;
6547 j = 0;
6548 }
6549 }
6550
6551 if (i > PY_SSIZE_T_MAX - j)
6552 goto overflow1;
6553
6554 /* Second pass: create output string and fill it */
6555 u = _PyUnicode_New(i + j);
6556 if (!u)
6557 return NULL;
6558
6559 j = 0; /* same as in first pass */
6560 q = u->str; /* next output char */
6561 qe = u->str + u->length; /* end of output */
6562
6563 for (p = self->str; p < e; p++)
6564 if (*p == '\t') {
6565 if (tabsize > 0) {
6566 i = tabsize - (j % tabsize);
6567 j += i;
6568 while (i--) {
6569 if (q >= qe)
6570 goto overflow2;
6571 *q++ = ' ';
6572 }
6573 }
6574 }
6575 else {
6576 if (q >= qe)
6577 goto overflow2;
6578 *q++ = *p;
6579 j++;
6580 if (*p == '\n' || *p == '\r')
6581 j = 0;
6582 }
6583
6584 return (PyObject*) u;
6585
6586 overflow2:
6587 Py_DECREF(u);
6588 overflow1:
6589 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6590 return NULL;
6591 }
6592
6593 PyDoc_STRVAR(find__doc__,
6594 "S.find(sub [,start [,end]]) -> int\n\
6595 \n\
6596 Return the lowest index in S where substring sub is found,\n\
6597 such that sub is contained within S[start:end]. Optional\n\
6598 arguments start and end are interpreted as in slice notation.\n\
6599 \n\
6600 Return -1 on failure.");
6601
6602 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6603 unicode_find(PyUnicodeObject *self, PyObject *args)
6604 {
6605 PyUnicodeObject *substring;
6606 Py_ssize_t start;
6607 Py_ssize_t end;
6608 Py_ssize_t result;
6609
6610 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6611 &start, &end))
6612 return NULL;
6613
6614 result = stringlib_find_slice(
6615 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6616 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6617 start, end
6618 );
6619
6620 Py_DECREF(substring);
6621
6622 return PyInt_FromSsize_t(result);
6623 }
6624
6625 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6626 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6627 {
6628 if (index < 0 || index >= self->length) {
6629 PyErr_SetString(PyExc_IndexError, "string index out of range");
6630 return NULL;
6631 }
6632
6633 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6634 }
6635
6636 static long
unicode_hash(PyUnicodeObject * self)6637 unicode_hash(PyUnicodeObject *self)
6638 {
6639 /* Since Unicode objects compare equal to their ASCII string
6640 counterparts, they should use the individual character values
6641 as basis for their hash value. This is needed to assure that
6642 strings and Unicode objects behave in the same way as
6643 dictionary keys. */
6644
6645 register Py_ssize_t len;
6646 register Py_UNICODE *p;
6647 register long x;
6648
6649 #ifdef Py_DEBUG
6650 assert(_Py_HashSecret_Initialized);
6651 #endif
6652 if (self->hash != -1)
6653 return self->hash;
6654 len = PyUnicode_GET_SIZE(self);
6655 /*
6656 We make the hash of the empty string be 0, rather than using
6657 (prefix ^ suffix), since this slightly obfuscates the hash secret
6658 */
6659 if (len == 0) {
6660 self->hash = 0;
6661 return 0;
6662 }
6663 p = PyUnicode_AS_UNICODE(self);
6664 x = _Py_HashSecret.prefix;
6665 x ^= *p << 7;
6666 while (--len >= 0)
6667 x = (1000003*x) ^ *p++;
6668 x ^= PyUnicode_GET_SIZE(self);
6669 x ^= _Py_HashSecret.suffix;
6670 if (x == -1)
6671 x = -2;
6672 self->hash = x;
6673 return x;
6674 }
6675
6676 PyDoc_STRVAR(index__doc__,
6677 "S.index(sub [,start [,end]]) -> int\n\
6678 \n\
6679 Like S.find() but raise ValueError when the substring is not found.");
6680
6681 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6682 unicode_index(PyUnicodeObject *self, PyObject *args)
6683 {
6684 Py_ssize_t result;
6685 PyUnicodeObject *substring;
6686 Py_ssize_t start;
6687 Py_ssize_t end;
6688
6689 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6690 &start, &end))
6691 return NULL;
6692
6693 result = stringlib_find_slice(
6694 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6695 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6696 start, end
6697 );
6698
6699 Py_DECREF(substring);
6700
6701 if (result < 0) {
6702 PyErr_SetString(PyExc_ValueError, "substring not found");
6703 return NULL;
6704 }
6705
6706 return PyInt_FromSsize_t(result);
6707 }
6708
6709 PyDoc_STRVAR(islower__doc__,
6710 "S.islower() -> bool\n\
6711 \n\
6712 Return True if all cased characters in S are lowercase and there is\n\
6713 at least one cased character in S, False otherwise.");
6714
6715 static PyObject*
unicode_islower(PyUnicodeObject * self)6716 unicode_islower(PyUnicodeObject *self)
6717 {
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719 register const Py_UNICODE *e;
6720 int cased;
6721
6722 /* Shortcut for single character strings */
6723 if (PyUnicode_GET_SIZE(self) == 1)
6724 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6725
6726 /* Special case for empty strings */
6727 if (PyUnicode_GET_SIZE(self) == 0)
6728 return PyBool_FromLong(0);
6729
6730 e = p + PyUnicode_GET_SIZE(self);
6731 cased = 0;
6732 for (; p < e; p++) {
6733 register const Py_UNICODE ch = *p;
6734
6735 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6736 return PyBool_FromLong(0);
6737 else if (!cased && Py_UNICODE_ISLOWER(ch))
6738 cased = 1;
6739 }
6740 return PyBool_FromLong(cased);
6741 }
6742
6743 PyDoc_STRVAR(isupper__doc__,
6744 "S.isupper() -> bool\n\
6745 \n\
6746 Return True if all cased characters in S are uppercase and there is\n\
6747 at least one cased character in S, False otherwise.");
6748
6749 static PyObject*
unicode_isupper(PyUnicodeObject * self)6750 unicode_isupper(PyUnicodeObject *self)
6751 {
6752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753 register const Py_UNICODE *e;
6754 int cased;
6755
6756 /* Shortcut for single character strings */
6757 if (PyUnicode_GET_SIZE(self) == 1)
6758 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6759
6760 /* Special case for empty strings */
6761 if (PyUnicode_GET_SIZE(self) == 0)
6762 return PyBool_FromLong(0);
6763
6764 e = p + PyUnicode_GET_SIZE(self);
6765 cased = 0;
6766 for (; p < e; p++) {
6767 register const Py_UNICODE ch = *p;
6768
6769 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6770 return PyBool_FromLong(0);
6771 else if (!cased && Py_UNICODE_ISUPPER(ch))
6772 cased = 1;
6773 }
6774 return PyBool_FromLong(cased);
6775 }
6776
6777 PyDoc_STRVAR(istitle__doc__,
6778 "S.istitle() -> bool\n\
6779 \n\
6780 Return True if S is a titlecased string and there is at least one\n\
6781 character in S, i.e. upper- and titlecase characters may only\n\
6782 follow uncased characters and lowercase characters only cased ones.\n\
6783 Return False otherwise.");
6784
6785 static PyObject*
unicode_istitle(PyUnicodeObject * self)6786 unicode_istitle(PyUnicodeObject *self)
6787 {
6788 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6789 register const Py_UNICODE *e;
6790 int cased, previous_is_cased;
6791
6792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1)
6794 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6795 (Py_UNICODE_ISUPPER(*p) != 0));
6796
6797 /* Special case for empty strings */
6798 if (PyUnicode_GET_SIZE(self) == 0)
6799 return PyBool_FromLong(0);
6800
6801 e = p + PyUnicode_GET_SIZE(self);
6802 cased = 0;
6803 previous_is_cased = 0;
6804 for (; p < e; p++) {
6805 register const Py_UNICODE ch = *p;
6806
6807 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6808 if (previous_is_cased)
6809 return PyBool_FromLong(0);
6810 previous_is_cased = 1;
6811 cased = 1;
6812 }
6813 else if (Py_UNICODE_ISLOWER(ch)) {
6814 if (!previous_is_cased)
6815 return PyBool_FromLong(0);
6816 previous_is_cased = 1;
6817 cased = 1;
6818 }
6819 else
6820 previous_is_cased = 0;
6821 }
6822 return PyBool_FromLong(cased);
6823 }
6824
6825 PyDoc_STRVAR(isspace__doc__,
6826 "S.isspace() -> bool\n\
6827 \n\
6828 Return True if all characters in S are whitespace\n\
6829 and there is at least one character in S, False otherwise.");
6830
6831 static PyObject*
unicode_isspace(PyUnicodeObject * self)6832 unicode_isspace(PyUnicodeObject *self)
6833 {
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835 register const Py_UNICODE *e;
6836
6837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self) == 1 &&
6839 Py_UNICODE_ISSPACE(*p))
6840 return PyBool_FromLong(1);
6841
6842 /* Special case for empty strings */
6843 if (PyUnicode_GET_SIZE(self) == 0)
6844 return PyBool_FromLong(0);
6845
6846 e = p + PyUnicode_GET_SIZE(self);
6847 for (; p < e; p++) {
6848 if (!Py_UNICODE_ISSPACE(*p))
6849 return PyBool_FromLong(0);
6850 }
6851 return PyBool_FromLong(1);
6852 }
6853
6854 PyDoc_STRVAR(isalpha__doc__,
6855 "S.isalpha() -> bool\n\
6856 \n\
6857 Return True if all characters in S are alphabetic\n\
6858 and there is at least one character in S, False otherwise.");
6859
6860 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6861 unicode_isalpha(PyUnicodeObject *self)
6862 {
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864 register const Py_UNICODE *e;
6865
6866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self) == 1 &&
6868 Py_UNICODE_ISALPHA(*p))
6869 return PyBool_FromLong(1);
6870
6871 /* Special case for empty strings */
6872 if (PyUnicode_GET_SIZE(self) == 0)
6873 return PyBool_FromLong(0);
6874
6875 e = p + PyUnicode_GET_SIZE(self);
6876 for (; p < e; p++) {
6877 if (!Py_UNICODE_ISALPHA(*p))
6878 return PyBool_FromLong(0);
6879 }
6880 return PyBool_FromLong(1);
6881 }
6882
6883 PyDoc_STRVAR(isalnum__doc__,
6884 "S.isalnum() -> bool\n\
6885 \n\
6886 Return True if all characters in S are alphanumeric\n\
6887 and there is at least one character in S, False otherwise.");
6888
6889 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6890 unicode_isalnum(PyUnicodeObject *self)
6891 {
6892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893 register const Py_UNICODE *e;
6894
6895 /* Shortcut for single character strings */
6896 if (PyUnicode_GET_SIZE(self) == 1 &&
6897 Py_UNICODE_ISALNUM(*p))
6898 return PyBool_FromLong(1);
6899
6900 /* Special case for empty strings */
6901 if (PyUnicode_GET_SIZE(self) == 0)
6902 return PyBool_FromLong(0);
6903
6904 e = p + PyUnicode_GET_SIZE(self);
6905 for (; p < e; p++) {
6906 if (!Py_UNICODE_ISALNUM(*p))
6907 return PyBool_FromLong(0);
6908 }
6909 return PyBool_FromLong(1);
6910 }
6911
6912 PyDoc_STRVAR(isdecimal__doc__,
6913 "S.isdecimal() -> bool\n\
6914 \n\
6915 Return True if there are only decimal characters in S,\n\
6916 False otherwise.");
6917
6918 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6919 unicode_isdecimal(PyUnicodeObject *self)
6920 {
6921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922 register const Py_UNICODE *e;
6923
6924 /* Shortcut for single character strings */
6925 if (PyUnicode_GET_SIZE(self) == 1 &&
6926 Py_UNICODE_ISDECIMAL(*p))
6927 return PyBool_FromLong(1);
6928
6929 /* Special case for empty strings */
6930 if (PyUnicode_GET_SIZE(self) == 0)
6931 return PyBool_FromLong(0);
6932
6933 e = p + PyUnicode_GET_SIZE(self);
6934 for (; p < e; p++) {
6935 if (!Py_UNICODE_ISDECIMAL(*p))
6936 return PyBool_FromLong(0);
6937 }
6938 return PyBool_FromLong(1);
6939 }
6940
6941 PyDoc_STRVAR(isdigit__doc__,
6942 "S.isdigit() -> bool\n\
6943 \n\
6944 Return True if all characters in S are digits\n\
6945 and there is at least one character in S, False otherwise.");
6946
6947 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6948 unicode_isdigit(PyUnicodeObject *self)
6949 {
6950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6951 register const Py_UNICODE *e;
6952
6953 /* Shortcut for single character strings */
6954 if (PyUnicode_GET_SIZE(self) == 1 &&
6955 Py_UNICODE_ISDIGIT(*p))
6956 return PyBool_FromLong(1);
6957
6958 /* Special case for empty strings */
6959 if (PyUnicode_GET_SIZE(self) == 0)
6960 return PyBool_FromLong(0);
6961
6962 e = p + PyUnicode_GET_SIZE(self);
6963 for (; p < e; p++) {
6964 if (!Py_UNICODE_ISDIGIT(*p))
6965 return PyBool_FromLong(0);
6966 }
6967 return PyBool_FromLong(1);
6968 }
6969
6970 PyDoc_STRVAR(isnumeric__doc__,
6971 "S.isnumeric() -> bool\n\
6972 \n\
6973 Return True if there are only numeric characters in S,\n\
6974 False otherwise.");
6975
6976 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6977 unicode_isnumeric(PyUnicodeObject *self)
6978 {
6979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6980 register const Py_UNICODE *e;
6981
6982 /* Shortcut for single character strings */
6983 if (PyUnicode_GET_SIZE(self) == 1 &&
6984 Py_UNICODE_ISNUMERIC(*p))
6985 return PyBool_FromLong(1);
6986
6987 /* Special case for empty strings */
6988 if (PyUnicode_GET_SIZE(self) == 0)
6989 return PyBool_FromLong(0);
6990
6991 e = p + PyUnicode_GET_SIZE(self);
6992 for (; p < e; p++) {
6993 if (!Py_UNICODE_ISNUMERIC(*p))
6994 return PyBool_FromLong(0);
6995 }
6996 return PyBool_FromLong(1);
6997 }
6998
6999 PyDoc_STRVAR(join__doc__,
7000 "S.join(iterable) -> unicode\n\
7001 \n\
7002 Return a string which is the concatenation of the strings in the\n\
7003 iterable. The separator between elements is S.");
7004
7005 static PyObject*
unicode_join(PyObject * self,PyObject * data)7006 unicode_join(PyObject *self, PyObject *data)
7007 {
7008 return PyUnicode_Join(self, data);
7009 }
7010
7011 static Py_ssize_t
unicode_length(PyUnicodeObject * self)7012 unicode_length(PyUnicodeObject *self)
7013 {
7014 return self->length;
7015 }
7016
7017 PyDoc_STRVAR(ljust__doc__,
7018 "S.ljust(width[, fillchar]) -> int\n\
7019 \n\
7020 Return S left-justified in a Unicode string of length width. Padding is\n\
7021 done using the specified fill character (default is a space).");
7022
7023 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)7024 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7025 {
7026 Py_ssize_t width;
7027 Py_UNICODE fillchar = ' ';
7028
7029 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7030 return NULL;
7031
7032 if (self->length >= width && PyUnicode_CheckExact(self)) {
7033 Py_INCREF(self);
7034 return (PyObject*) self;
7035 }
7036
7037 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7038 }
7039
7040 PyDoc_STRVAR(lower__doc__,
7041 "S.lower() -> unicode\n\
7042 \n\
7043 Return a copy of the string S converted to lowercase.");
7044
7045 static PyObject*
unicode_lower(PyUnicodeObject * self)7046 unicode_lower(PyUnicodeObject *self)
7047 {
7048 return fixup(self, fixlower);
7049 }
7050
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string for that mode with its
   first three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7059
7060 /* externally visible for str.strip(unicode) */
7061 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)7062 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7063 {
7064 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7065 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7066 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7067 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7068 Py_ssize_t i, j;
7069
7070 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7071
7072 i = 0;
7073 if (striptype != RIGHTSTRIP) {
7074 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7075 i++;
7076 }
7077 }
7078
7079 j = len;
7080 if (striptype != LEFTSTRIP) {
7081 do {
7082 j--;
7083 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7084 j++;
7085 }
7086
7087 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7088 Py_INCREF(self);
7089 return (PyObject*)self;
7090 }
7091 else
7092 return PyUnicode_FromUnicode(s+i, j-i);
7093 }
7094
7095
7096 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)7097 do_strip(PyUnicodeObject *self, int striptype)
7098 {
7099 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7100 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7101
7102 i = 0;
7103 if (striptype != RIGHTSTRIP) {
7104 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7105 i++;
7106 }
7107 }
7108
7109 j = len;
7110 if (striptype != LEFTSTRIP) {
7111 do {
7112 j--;
7113 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7114 j++;
7115 }
7116
7117 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7118 Py_INCREF(self);
7119 return (PyObject*)self;
7120 }
7121 else
7122 return PyUnicode_FromUnicode(s+i, j-i);
7123 }
7124
7125
7126 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7127 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7128 {
7129 PyObject *sep = NULL;
7130
7131 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7132 return NULL;
7133
7134 if (sep != NULL && sep != Py_None) {
7135 if (PyUnicode_Check(sep))
7136 return _PyUnicode_XStrip(self, striptype, sep);
7137 else if (PyString_Check(sep)) {
7138 PyObject *res;
7139 sep = PyUnicode_FromObject(sep);
7140 if (sep==NULL)
7141 return NULL;
7142 res = _PyUnicode_XStrip(self, striptype, sep);
7143 Py_DECREF(sep);
7144 return res;
7145 }
7146 else {
7147 PyErr_Format(PyExc_TypeError,
7148 "%s arg must be None, unicode or str",
7149 STRIPNAME(striptype));
7150 return NULL;
7151 }
7152 }
7153
7154 return do_strip(self, striptype);
7155 }
7156
7157
7158 PyDoc_STRVAR(strip__doc__,
7159 "S.strip([chars]) -> unicode\n\
7160 \n\
7161 Return a copy of the string S with leading and trailing\n\
7162 whitespace removed.\n\
7163 If chars is given and not None, remove characters in chars instead.\n\
7164 If chars is a str, it will be converted to unicode before stripping");
7165
7166 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7167 unicode_strip(PyUnicodeObject *self, PyObject *args)
7168 {
7169 if (PyTuple_GET_SIZE(args) == 0)
7170 return do_strip(self, BOTHSTRIP); /* Common case */
7171 else
7172 return do_argstrip(self, BOTHSTRIP, args);
7173 }
7174
7175
7176 PyDoc_STRVAR(lstrip__doc__,
7177 "S.lstrip([chars]) -> unicode\n\
7178 \n\
7179 Return a copy of the string S with leading whitespace removed.\n\
7180 If chars is given and not None, remove characters in chars instead.\n\
7181 If chars is a str, it will be converted to unicode before stripping");
7182
7183 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7184 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7185 {
7186 if (PyTuple_GET_SIZE(args) == 0)
7187 return do_strip(self, LEFTSTRIP); /* Common case */
7188 else
7189 return do_argstrip(self, LEFTSTRIP, args);
7190 }
7191
7192
7193 PyDoc_STRVAR(rstrip__doc__,
7194 "S.rstrip([chars]) -> unicode\n\
7195 \n\
7196 Return a copy of the string S with trailing whitespace removed.\n\
7197 If chars is given and not None, remove characters in chars instead.\n\
7198 If chars is a str, it will be converted to unicode before stripping");
7199
7200 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7201 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7202 {
7203 if (PyTuple_GET_SIZE(args) == 0)
7204 return do_strip(self, RIGHTSTRIP); /* Common case */
7205 else
7206 return do_argstrip(self, RIGHTSTRIP, args);
7207 }
7208
7209
7210 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7211 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7212 {
7213 PyUnicodeObject *u;
7214 Py_UNICODE *p;
7215 Py_ssize_t nchars;
7216 size_t nbytes;
7217
7218 if (len < 0)
7219 len = 0;
7220
7221 if (len == 1 && PyUnicode_CheckExact(str)) {
7222 /* no repeat, return original string */
7223 Py_INCREF(str);
7224 return (PyObject*) str;
7225 }
7226
7227 /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
7228 * needed doesn't overflow size_t
7229 */
7230 if (len && str->length > PY_SSIZE_T_MAX / len) {
7231 PyErr_SetString(PyExc_OverflowError,
7232 "repeated string is too long");
7233 return NULL;
7234 }
7235 nchars = len * str->length;
7236 nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
7237 if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
7238 PyErr_SetString(PyExc_OverflowError,
7239 "repeated string is too long");
7240 return NULL;
7241 }
7242 u = _PyUnicode_New(nchars);
7243 if (!u)
7244 return NULL;
7245
7246 p = u->str;
7247
7248 if (str->length == 1 && len > 0) {
7249 Py_UNICODE_FILL(p, str->str[0], len);
7250 } else {
7251 Py_ssize_t done = 0; /* number of characters copied this far */
7252 if (done < nchars) {
7253 Py_UNICODE_COPY(p, str->str, str->length);
7254 done = str->length;
7255 }
7256 while (done < nchars) {
7257 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7258 Py_UNICODE_COPY(p+done, p, n);
7259 done += n;
7260 }
7261 }
7262
7263 return (PyObject*) u;
7264 }
7265
/* Coerce `obj`, `subobj` and `replobj` with PyUnicode_FromObject() and
   run replace() on the results with `maxcount`.  Returns the result of
   replace(), or NULL if any of the three conversions fails.  Every
   temporary that was obtained is released before returning. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *self = NULL;
    PyObject *str1 = NULL;
    PyObject *str2 = NULL;
    PyObject *result = NULL;

    self = PyUnicode_FromObject(obj);
    if (self == NULL)
        goto done;
    str1 = PyUnicode_FromObject(subobj);
    if (str1 == NULL)
        goto done;
    str2 = PyUnicode_FromObject(replobj);
    if (str2 == NULL)
        goto done;

    result = replace((PyUnicodeObject *)self,
                     (PyUnicodeObject *)str1,
                     (PyUnicodeObject *)str2,
                     maxcount);

  done:
    /* Pointers still NULL were never acquired; XDECREF skips them. */
    Py_XDECREF(self);
    Py_XDECREF(str1);
    Py_XDECREF(str2);
    return result;
}
7299
7300 PyDoc_STRVAR(replace__doc__,
7301 "S.replace(old, new[, count]) -> unicode\n\
7302 \n\
7303 Return a copy of S with all occurrences of substring\n\
7304 old replaced by new. If the optional argument count is\n\
7305 given, only the first count occurrences are replaced.");
7306
7307 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7308 unicode_replace(PyUnicodeObject *self, PyObject *args)
7309 {
7310 PyUnicodeObject *str1;
7311 PyUnicodeObject *str2;
7312 Py_ssize_t maxcount = -1;
7313 PyObject *result;
7314
7315 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7316 return NULL;
7317 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7318 if (str1 == NULL)
7319 return NULL;
7320 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7321 if (str2 == NULL) {
7322 Py_DECREF(str1);
7323 return NULL;
7324 }
7325
7326 result = replace(self, str1, str2, maxcount);
7327
7328 Py_DECREF(str1);
7329 Py_DECREF(str2);
7330 return result;
7331 }
7332
7333 static
unicode_repr(PyObject * unicode)7334 PyObject *unicode_repr(PyObject *unicode)
7335 {
7336 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7337 PyUnicode_GET_SIZE(unicode),
7338 1);
7339 }
7340
7341 PyDoc_STRVAR(rfind__doc__,
7342 "S.rfind(sub [,start [,end]]) -> int\n\
7343 \n\
7344 Return the highest index in S where substring sub is found,\n\
7345 such that sub is contained within S[start:end]. Optional\n\
7346 arguments start and end are interpreted as in slice notation.\n\
7347 \n\
7348 Return -1 on failure.");
7349
7350 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7351 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7352 {
7353 PyUnicodeObject *substring;
7354 Py_ssize_t start;
7355 Py_ssize_t end;
7356 Py_ssize_t result;
7357
7358 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7359 &start, &end))
7360 return NULL;
7361
7362 result = stringlib_rfind_slice(
7363 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7364 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7365 start, end
7366 );
7367
7368 Py_DECREF(substring);
7369
7370 return PyInt_FromSsize_t(result);
7371 }
7372
7373 PyDoc_STRVAR(rindex__doc__,
7374 "S.rindex(sub [,start [,end]]) -> int\n\
7375 \n\
7376 Like S.rfind() but raise ValueError when the substring is not found.");
7377
7378 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7379 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7380 {
7381 PyUnicodeObject *substring;
7382 Py_ssize_t start;
7383 Py_ssize_t end;
7384 Py_ssize_t result;
7385
7386 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7387 &start, &end))
7388 return NULL;
7389
7390 result = stringlib_rfind_slice(
7391 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7392 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7393 start, end
7394 );
7395
7396 Py_DECREF(substring);
7397
7398 if (result < 0) {
7399 PyErr_SetString(PyExc_ValueError, "substring not found");
7400 return NULL;
7401 }
7402 return PyInt_FromSsize_t(result);
7403 }
7404
7405 PyDoc_STRVAR(rjust__doc__,
7406 "S.rjust(width[, fillchar]) -> unicode\n\
7407 \n\
7408 Return S right-justified in a Unicode string of length width. Padding is\n\
7409 done using the specified fill character (default is a space).");
7410
7411 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7412 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7413 {
7414 Py_ssize_t width;
7415 Py_UNICODE fillchar = ' ';
7416
7417 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7418 return NULL;
7419
7420 if (self->length >= width && PyUnicode_CheckExact(self)) {
7421 Py_INCREF(self);
7422 return (PyObject*) self;
7423 }
7424
7425 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7426 }
7427
7428 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7429 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7430 {
7431 /* standard clamping */
7432 if (start < 0)
7433 start = 0;
7434 if (end < 0)
7435 end = 0;
7436 if (end > self->length)
7437 end = self->length;
7438 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7439 /* full slice, return original string */
7440 Py_INCREF(self);
7441 return (PyObject*) self;
7442 }
7443 if (start > end)
7444 start = end;
7445 /* copy slice */
7446 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7447 end - start);
7448 }
7449
/* Coerce `s` (and `sep`, when it is not NULL) with
   PyUnicode_FromObject() and return the result of split() on them.
   A NULL `sep` is passed through to split() as NULL.  Returns NULL if a
   conversion fails. */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *result;

    str_obj = PyUnicode_FromObject(s);
    if (str_obj == NULL)
        return NULL;

    if (sep != NULL) {
        sep_obj = PyUnicode_FromObject(sep);
        if (sep_obj == NULL) {
            Py_DECREF(str_obj);
            return NULL;
        }
    }

    result = split((PyUnicodeObject *)str_obj,
                   (PyUnicodeObject *)sep_obj,
                   maxsplit);

    Py_DECREF(str_obj);
    Py_XDECREF(sep_obj);
    return result;
}
7473
7474 PyDoc_STRVAR(split__doc__,
7475 "S.split([sep [,maxsplit]]) -> list of strings\n\
7476 \n\
7477 Return a list of the words in S, using sep as the\n\
7478 delimiter string. If maxsplit is given, at most maxsplit\n\
7479 splits are done. If sep is not specified or is None, any\n\
7480 whitespace string is a separator and empty strings are\n\
7481 removed from the result.");
7482
7483 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7484 unicode_split(PyUnicodeObject *self, PyObject *args)
7485 {
7486 PyObject *substring = Py_None;
7487 Py_ssize_t maxcount = -1;
7488
7489 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7490 return NULL;
7491
7492 if (substring == Py_None)
7493 return split(self, NULL, maxcount);
7494 else if (PyUnicode_Check(substring))
7495 return split(self, (PyUnicodeObject *)substring, maxcount);
7496 else
7497 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7498 }
7499
7500 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7501 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7502 {
7503 PyObject* str_obj;
7504 PyObject* sep_obj;
7505 PyObject* out;
7506
7507 str_obj = PyUnicode_FromObject(str_in);
7508 if (!str_obj)
7509 return NULL;
7510 sep_obj = PyUnicode_FromObject(sep_in);
7511 if (!sep_obj) {
7512 Py_DECREF(str_obj);
7513 return NULL;
7514 }
7515
7516 out = stringlib_partition(
7517 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7518 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7519 );
7520
7521 Py_DECREF(sep_obj);
7522 Py_DECREF(str_obj);
7523
7524 return out;
7525 }
7526
7527
7528 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7529 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7530 {
7531 PyObject* str_obj;
7532 PyObject* sep_obj;
7533 PyObject* out;
7534
7535 str_obj = PyUnicode_FromObject(str_in);
7536 if (!str_obj)
7537 return NULL;
7538 sep_obj = PyUnicode_FromObject(sep_in);
7539 if (!sep_obj) {
7540 Py_DECREF(str_obj);
7541 return NULL;
7542 }
7543
7544 out = stringlib_rpartition(
7545 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7546 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7547 );
7548
7549 Py_DECREF(sep_obj);
7550 Py_DECREF(str_obj);
7551
7552 return out;
7553 }
7554
7555 PyDoc_STRVAR(partition__doc__,
7556 "S.partition(sep) -> (head, sep, tail)\n\
7557 \n\
7558 Search for the separator sep in S, and return the part before it,\n\
7559 the separator itself, and the part after it. If the separator is not\n\
7560 found, return S and two empty strings.");
7561
7562 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7563 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7564 {
7565 return PyUnicode_Partition((PyObject *)self, separator);
7566 }
7567
7568 PyDoc_STRVAR(rpartition__doc__,
7569 "S.rpartition(sep) -> (head, sep, tail)\n\
7570 \n\
7571 Search for the separator sep in S, starting at the end of S, and return\n\
7572 the part before it, the separator itself, and the part after it. If the\n\
7573 separator is not found, return two empty strings and S.");
7574
7575 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7576 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7577 {
7578 return PyUnicode_RPartition((PyObject *)self, separator);
7579 }
7580
/* Coerce `s` (and `sep`, when it is not NULL) with
   PyUnicode_FromObject() and return the result of rsplit() on them.
   A NULL `sep` is passed through to rsplit() as NULL.  Returns NULL if
   a conversion fails. */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *result;

    str_obj = PyUnicode_FromObject(s);
    if (str_obj == NULL)
        return NULL;

    if (sep != NULL) {
        sep_obj = PyUnicode_FromObject(sep);
        if (sep_obj == NULL) {
            Py_DECREF(str_obj);
            return NULL;
        }
    }

    result = rsplit((PyUnicodeObject *)str_obj,
                    (PyUnicodeObject *)sep_obj,
                    maxsplit);

    Py_DECREF(str_obj);
    Py_XDECREF(sep_obj);
    return result;
}
7604
7605 PyDoc_STRVAR(rsplit__doc__,
7606 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7607 \n\
7608 Return a list of the words in S, using sep as the\n\
7609 delimiter string, starting at the end of the string and\n\
7610 working to the front. If maxsplit is given, at most maxsplit\n\
7611 splits are done. If sep is not specified, any whitespace string\n\
7612 is a separator.");
7613
7614 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7615 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7616 {
7617 PyObject *substring = Py_None;
7618 Py_ssize_t maxcount = -1;
7619
7620 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7621 return NULL;
7622
7623 if (substring == Py_None)
7624 return rsplit(self, NULL, maxcount);
7625 else if (PyUnicode_Check(substring))
7626 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7627 else
7628 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7629 }
7630
7631 PyDoc_STRVAR(splitlines__doc__,
7632 "S.splitlines(keepends=False) -> list of strings\n\
7633 \n\
7634 Return a list of the lines in S, breaking at line boundaries.\n\
7635 Line breaks are not included in the resulting list unless keepends\n\
7636 is given and true.");
7637
7638 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7639 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7640 {
7641 int keepends = 0;
7642
7643 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7644 return NULL;
7645
7646 return PyUnicode_Splitlines((PyObject *)self, keepends);
7647 }
7648
7649 static
unicode_str(PyUnicodeObject * self)7650 PyObject *unicode_str(PyUnicodeObject *self)
7651 {
7652 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7653 }
7654
7655 PyDoc_STRVAR(swapcase__doc__,
7656 "S.swapcase() -> unicode\n\
7657 \n\
7658 Return a copy of S with uppercase characters converted to lowercase\n\
7659 and vice versa.");
7660
7661 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7662 unicode_swapcase(PyUnicodeObject *self)
7663 {
7664 return fixup(self, fixswapcase);
7665 }
7666
7667 PyDoc_STRVAR(translate__doc__,
7668 "S.translate(table) -> unicode\n\
7669 \n\
7670 Return a copy of the string S, where all characters have been mapped\n\
7671 through the given translation table, which must be a mapping of\n\
7672 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7673 Unmapped characters are left untouched. Characters mapped to None\n\
7674 are deleted.");
7675
7676 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7677 unicode_translate(PyUnicodeObject *self, PyObject *table)
7678 {
7679 return PyUnicode_TranslateCharmap(self->str,
7680 self->length,
7681 table,
7682 "ignore");
7683 }
7684
7685 PyDoc_STRVAR(upper__doc__,
7686 "S.upper() -> unicode\n\
7687 \n\
7688 Return a copy of S converted to uppercase.");
7689
7690 static PyObject*
unicode_upper(PyUnicodeObject * self)7691 unicode_upper(PyUnicodeObject *self)
7692 {
7693 return fixup(self, fixupper);
7694 }
7695
7696 PyDoc_STRVAR(zfill__doc__,
7697 "S.zfill(width) -> unicode\n\
7698 \n\
7699 Pad a numeric string S with zeros on the left, to fill a field\n\
7700 of the specified width. The string S is never truncated.");
7701
7702 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7703 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7704 {
7705 Py_ssize_t fill;
7706 PyUnicodeObject *u;
7707
7708 Py_ssize_t width;
7709 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7710 return NULL;
7711
7712 if (self->length >= width) {
7713 if (PyUnicode_CheckExact(self)) {
7714 Py_INCREF(self);
7715 return (PyObject*) self;
7716 }
7717 else
7718 return PyUnicode_FromUnicode(
7719 PyUnicode_AS_UNICODE(self),
7720 PyUnicode_GET_SIZE(self)
7721 );
7722 }
7723
7724 fill = width - self->length;
7725
7726 u = pad(self, fill, 0, '0');
7727
7728 if (u == NULL)
7729 return NULL;
7730
7731 if (u->str[fill] == '+' || u->str[fill] == '-') {
7732 /* move sign to beginning of string */
7733 u->str[0] = u->str[fill];
7734 u->str[fill] = '0';
7735 }
7736
7737 return (PyObject*) u;
7738 }
7739
/* Compiled out by the "#if 0" guard: returns the value of `numfree` as a
   Python int. */
#if 0
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7747
7748 PyDoc_STRVAR(startswith__doc__,
7749 "S.startswith(prefix[, start[, end]]) -> bool\n\
7750 \n\
7751 Return True if S starts with the specified prefix, False otherwise.\n\
7752 With optional start, test S beginning at that position.\n\
7753 With optional end, stop comparing S at that position.\n\
7754 prefix can also be a tuple of strings to try.");
7755
7756 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7757 unicode_startswith(PyUnicodeObject *self,
7758 PyObject *args)
7759 {
7760 PyObject *subobj;
7761 PyUnicodeObject *substring;
7762 Py_ssize_t start = 0;
7763 Py_ssize_t end = PY_SSIZE_T_MAX;
7764 int result;
7765
7766 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7767 return NULL;
7768 if (PyTuple_Check(subobj)) {
7769 Py_ssize_t i;
7770 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7771 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7772 PyTuple_GET_ITEM(subobj, i));
7773 if (substring == NULL)
7774 return NULL;
7775 result = tailmatch(self, substring, start, end, -1);
7776 Py_DECREF(substring);
7777 if (result) {
7778 Py_RETURN_TRUE;
7779 }
7780 }
7781 /* nothing matched */
7782 Py_RETURN_FALSE;
7783 }
7784 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7785 if (substring == NULL) {
7786 if (PyErr_ExceptionMatches(PyExc_TypeError))
7787 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7788 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7789 return NULL;
7790 }
7791 result = tailmatch(self, substring, start, end, -1);
7792 Py_DECREF(substring);
7793 return PyBool_FromLong(result);
7794 }
7795
7796
7797 PyDoc_STRVAR(endswith__doc__,
7798 "S.endswith(suffix[, start[, end]]) -> bool\n\
7799 \n\
7800 Return True if S ends with the specified suffix, False otherwise.\n\
7801 With optional start, test S beginning at that position.\n\
7802 With optional end, stop comparing S at that position.\n\
7803 suffix can also be a tuple of strings to try.");
7804
7805 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7806 unicode_endswith(PyUnicodeObject *self,
7807 PyObject *args)
7808 {
7809 PyObject *subobj;
7810 PyUnicodeObject *substring;
7811 Py_ssize_t start = 0;
7812 Py_ssize_t end = PY_SSIZE_T_MAX;
7813 int result;
7814
7815 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7816 return NULL;
7817 if (PyTuple_Check(subobj)) {
7818 Py_ssize_t i;
7819 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7820 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7821 PyTuple_GET_ITEM(subobj, i));
7822 if (substring == NULL)
7823 return NULL;
7824 result = tailmatch(self, substring, start, end, +1);
7825 Py_DECREF(substring);
7826 if (result) {
7827 Py_RETURN_TRUE;
7828 }
7829 }
7830 Py_RETURN_FALSE;
7831 }
7832 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7833 if (substring == NULL) {
7834 if (PyErr_ExceptionMatches(PyExc_TypeError))
7835 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7836 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7837 return NULL;
7838 }
7839 result = tailmatch(self, substring, start, end, +1);
7840 Py_DECREF(substring);
7841 return PyBool_FromLong(result);
7842 }
7843
7844
7845 /* Implements do_string_format, which is unicode because of stringlib */
7846 #include "stringlib/string_format.h"
7847
7848 PyDoc_STRVAR(format__doc__,
7849 "S.format(*args, **kwargs) -> unicode\n\
7850 \n\
7851 Return a formatted version of S, using substitutions from args and kwargs.\n\
7852 The substitutions are identified by braces ('{' and '}').");
7853
7854 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7855 unicode__format__(PyObject *self, PyObject *args)
7856 {
7857 PyObject *format_spec;
7858 PyObject *result = NULL;
7859 PyObject *tmp = NULL;
7860
7861 /* If 2.x, convert format_spec to the same type as value */
7862 /* This is to allow things like u''.format('') */
7863 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7864 goto done;
7865 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7866 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7867 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7868 goto done;
7869 }
7870 tmp = PyObject_Unicode(format_spec);
7871 if (tmp == NULL)
7872 goto done;
7873 format_spec = tmp;
7874
7875 result = _PyUnicode_FormatAdvanced(self,
7876 PyUnicode_AS_UNICODE(format_spec),
7877 PyUnicode_GET_SIZE(format_spec));
7878 done:
7879 Py_XDECREF(tmp);
7880 return result;
7881 }
7882
7883 PyDoc_STRVAR(p_format__doc__,
7884 "S.__format__(format_spec) -> unicode\n\
7885 \n\
7886 Return a formatted version of S as described by format_spec.");
7887
7888 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7889 unicode__sizeof__(PyUnicodeObject *v)
7890 {
7891 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7892 sizeof(Py_UNICODE) * (v->length + 1));
7893 }
7894
7895 PyDoc_STRVAR(sizeof__doc__,
7896 "S.__sizeof__() -> size of S in memory, in bytes\n\
7897 \n\
7898 ");
7899
7900 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7901 unicode_getnewargs(PyUnicodeObject *v)
7902 {
7903 return Py_BuildValue("(u#)", v->str, v->length);
7904 }
7905
7906
7907 static PyMethodDef unicode_methods[] = {
7908 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7909 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7910 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7911 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7912 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7913 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7914 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7915 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7917 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7919 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7921 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7922 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7923 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7924 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7925 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7926 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7927 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7928 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7929 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7930 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7931 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7932 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7933 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7934 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7935 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7936 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7937 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7938 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7939 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7940 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7941 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7942 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7943 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7944 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7945 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7946 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7947 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7948 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7949 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7950 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7951 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7952 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7953 #if 0
7954 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7955 #endif
7956
7957 #if 0
7958 /* This one is just used for debugging the implementation. */
7959 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7960 #endif
7961
7962 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7963 {NULL, NULL}
7964 };
7965
7966 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7967 unicode_mod(PyObject *v, PyObject *w)
7968 {
7969 if (!PyUnicode_Check(v)) {
7970 Py_INCREF(Py_NotImplemented);
7971 return Py_NotImplemented;
7972 }
7973 return PyUnicode_Format(v, w);
7974 }
7975
7976 static PyNumberMethods unicode_as_number = {
7977 0, /*nb_add*/
7978 0, /*nb_subtract*/
7979 0, /*nb_multiply*/
7980 0, /*nb_divide*/
7981 unicode_mod, /*nb_remainder*/
7982 };
7983
7984 static PySequenceMethods unicode_as_sequence = {
7985 (lenfunc) unicode_length, /* sq_length */
7986 PyUnicode_Concat, /* sq_concat */
7987 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7988 (ssizeargfunc) unicode_getitem, /* sq_item */
7989 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7990 0, /* sq_ass_item */
7991 0, /* sq_ass_slice */
7992 PyUnicode_Contains, /* sq_contains */
7993 };
7994
7995 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)7996 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7997 {
7998 if (PyIndex_Check(item)) {
7999 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8000 if (i == -1 && PyErr_Occurred())
8001 return NULL;
8002 if (i < 0)
8003 i += PyUnicode_GET_SIZE(self);
8004 return unicode_getitem(self, i);
8005 } else if (PySlice_Check(item)) {
8006 Py_ssize_t start, stop, step, slicelength, cur, i;
8007 Py_UNICODE* source_buf;
8008 Py_UNICODE* result_buf;
8009 PyObject* result;
8010
8011 if (_PySlice_Unpack(item, &start, &stop, &step) < 0) {
8012 return NULL;
8013 }
8014 slicelength = _PySlice_AdjustIndices(PyUnicode_GET_SIZE(self), &start,
8015 &stop, step);
8016
8017 if (slicelength <= 0) {
8018 return PyUnicode_FromUnicode(NULL, 0);
8019 } else if (start == 0 && step == 1 && slicelength == self->length &&
8020 PyUnicode_CheckExact(self)) {
8021 Py_INCREF(self);
8022 return (PyObject *)self;
8023 } else if (step == 1) {
8024 return PyUnicode_FromUnicode(self->str + start, slicelength);
8025 } else {
8026 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8027 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8028 sizeof(Py_UNICODE));
8029
8030 if (result_buf == NULL)
8031 return PyErr_NoMemory();
8032
8033 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8034 result_buf[i] = source_buf[cur];
8035 }
8036
8037 result = PyUnicode_FromUnicode(result_buf, slicelength);
8038 PyObject_FREE(result_buf);
8039 return result;
8040 }
8041 } else {
8042 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8043 return NULL;
8044 }
8045 }
8046
8047 static PyMappingMethods unicode_as_mapping = {
8048 (lenfunc)unicode_length, /* mp_length */
8049 (binaryfunc)unicode_subscript, /* mp_subscript */
8050 (objobjargproc)0, /* mp_ass_subscript */
8051 };
8052
8053 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8054 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8055 Py_ssize_t index,
8056 const void **ptr)
8057 {
8058 if (index != 0) {
8059 PyErr_SetString(PyExc_SystemError,
8060 "accessing non-existent unicode segment");
8061 return -1;
8062 }
8063 *ptr = (void *) self->str;
8064 return PyUnicode_GET_DATA_SIZE(self);
8065 }
8066
8067 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8068 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8069 const void **ptr)
8070 {
8071 PyErr_SetString(PyExc_TypeError,
8072 "cannot use unicode as modifiable buffer");
8073 return -1;
8074 }
8075
8076 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)8077 unicode_buffer_getsegcount(PyUnicodeObject *self,
8078 Py_ssize_t *lenp)
8079 {
8080 if (lenp)
8081 *lenp = PyUnicode_GET_DATA_SIZE(self);
8082 return 1;
8083 }
8084
8085 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8086 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8087 Py_ssize_t index,
8088 const void **ptr)
8089 {
8090 PyObject *str;
8091
8092 if (index != 0) {
8093 PyErr_SetString(PyExc_SystemError,
8094 "accessing non-existent unicode segment");
8095 return -1;
8096 }
8097 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8098 if (str == NULL)
8099 return -1;
8100 *ptr = (void *) PyString_AS_STRING(str);
8101 return PyString_GET_SIZE(str);
8102 }
8103
8104 /* Helpers for PyUnicode_Format() */
8105
8106 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)8107 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8108 {
8109 Py_ssize_t argidx = *p_argidx;
8110 if (argidx < arglen) {
8111 (*p_argidx)++;
8112 if (arglen < 0)
8113 return args;
8114 else
8115 return PyTuple_GetItem(args, argidx);
8116 }
8117 PyErr_SetString(PyExc_TypeError,
8118 "not enough arguments for format string");
8119 return NULL;
8120 }
8121
/* Bit flags for the conversion flags of a '%' format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
8127
8128 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8129 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8130 {
8131 register Py_ssize_t i;
8132 Py_ssize_t len = strlen(charbuffer);
8133 for (i = len - 1; i >= 0; i--)
8134 buffer[i] = (Py_UNICODE) charbuffer[i];
8135
8136 return len;
8137 }
8138
8139 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8140 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8141 {
8142 Py_ssize_t result;
8143
8144 PyOS_snprintf((char *)buffer, len, format, x);
8145 result = strtounicode(buffer, (char *)buffer);
8146 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8147 }
8148
8149 /* XXX To save some code duplication, formatfloat/long/int could have been
8150 shared with stringobject.c, converting from 8-bit to Unicode after the
8151 formatting is done. */
8152
8153 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8154
8155 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8156 formatfloat(PyObject *v, int flags, int prec, int type)
8157 {
8158 char *p;
8159 PyObject *result;
8160 double x;
8161
8162 x = PyFloat_AsDouble(v);
8163 if (x == -1.0 && PyErr_Occurred())
8164 return NULL;
8165
8166 if (prec < 0)
8167 prec = 6;
8168
8169 p = PyOS_double_to_string(x, type, prec,
8170 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8171 if (p == NULL)
8172 return NULL;
8173 result = PyUnicode_FromStringAndSize(p, strlen(p));
8174 PyMem_Free(p);
8175 return result;
8176 }
8177
8178 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8179 formatlong(PyObject *val, int flags, int prec, int type)
8180 {
8181 char *buf;
8182 int i, len;
8183 PyObject *str; /* temporary string object. */
8184 PyUnicodeObject *result;
8185
8186 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8187 if (!str)
8188 return NULL;
8189 result = _PyUnicode_New(len);
8190 if (!result) {
8191 Py_DECREF(str);
8192 return NULL;
8193 }
8194 for (i = 0; i < len; i++)
8195 result->str[i] = buf[i];
8196 result->str[len] = 0;
8197 Py_DECREF(str);
8198 return (PyObject*)result;
8199 }
8200
8201 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8202 formatint(Py_UNICODE *buf,
8203 size_t buflen,
8204 int flags,
8205 int prec,
8206 int type,
8207 PyObject *v)
8208 {
8209 /* fmt = '%#.' + `prec` + 'l' + `type`
8210 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8211 * + 1 + 1
8212 * = 24
8213 */
8214 char fmt[64]; /* plenty big enough! */
8215 char *sign;
8216 long x;
8217
8218 x = PyInt_AsLong(v);
8219 if (x == -1 && PyErr_Occurred())
8220 return -1;
8221 if (x < 0 && type == 'u') {
8222 type = 'd';
8223 }
8224 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8225 sign = "-";
8226 else
8227 sign = "";
8228 if (prec < 0)
8229 prec = 1;
8230
8231 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8232 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8233 */
8234 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8235 PyErr_SetString(PyExc_OverflowError,
8236 "formatted integer is too long (precision too large?)");
8237 return -1;
8238 }
8239
8240 if ((flags & F_ALT) &&
8241 (type == 'x' || type == 'X')) {
8242 /* When converting under %#x or %#X, there are a number
8243 * of issues that cause pain:
8244 * - when 0 is being converted, the C standard leaves off
8245 * the '0x' or '0X', which is inconsistent with other
8246 * %#x/%#X conversions and inconsistent with Python's
8247 * hex() function
8248 * - there are platforms that violate the standard and
8249 * convert 0 with the '0x' or '0X'
8250 * (Metrowerks, Compaq Tru64)
8251 * - there are platforms that give '0x' when converting
8252 * under %#X, but convert 0 in accordance with the
8253 * standard (OS/2 EMX)
8254 *
8255 * We can achieve the desired consistency by inserting our
8256 * own '0x' or '0X' prefix, and substituting %x/%X in place
8257 * of %#x/%#X.
8258 *
8259 * Note that this is the same approach as used in
8260 * formatint() in stringobject.c
8261 */
8262 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8263 sign, type, prec, type);
8264 }
8265 else {
8266 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8267 sign, (flags&F_ALT) ? "#" : "",
8268 prec, type);
8269 }
8270 if (sign[0])
8271 return longtounicode(buf, buflen, fmt, -x);
8272 else
8273 return longtounicode(buf, buflen, fmt, x);
8274 }
8275
8276 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8277 formatchar(Py_UNICODE *buf,
8278 size_t buflen,
8279 PyObject *v)
8280 {
8281 PyObject *unistr;
8282 char *str;
8283 /* presume that the buffer is at least 2 characters long */
8284 if (PyUnicode_Check(v)) {
8285 if (PyUnicode_GET_SIZE(v) != 1)
8286 goto onError;
8287 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8288 }
8289
8290 else if (PyString_Check(v)) {
8291 if (PyString_GET_SIZE(v) != 1)
8292 goto onError;
8293 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8294 with a UnicodeDecodeError if 'char' is not decodable with the
8295 default encoding (usually ASCII, but it might be something else) */
8296 str = PyString_AS_STRING(v);
8297 if ((unsigned char)str[0] > 0x7F) {
8298 /* the char is not ASCII; try to decode the string using the
8299 default encoding and return -1 to let the UnicodeDecodeError
8300 be raised if the string can't be decoded */
8301 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8302 if (unistr == NULL)
8303 return -1;
8304 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8305 Py_DECREF(unistr);
8306 }
8307 else
8308 buf[0] = (Py_UNICODE)str[0];
8309 }
8310
8311 else {
8312 /* Integer input truncated to a character */
8313 long x;
8314 x = PyInt_AsLong(v);
8315 if (x == -1 && PyErr_Occurred())
8316 goto onError;
8317 #ifdef Py_UNICODE_WIDE
8318 if (x < 0 || x > 0x10ffff) {
8319 PyErr_SetString(PyExc_OverflowError,
8320 "%c arg not in range(0x110000) "
8321 "(wide Python build)");
8322 return -1;
8323 }
8324 #else
8325 if (x < 0 || x > 0xffff) {
8326 PyErr_SetString(PyExc_OverflowError,
8327 "%c arg not in range(0x10000) "
8328 "(narrow Python build)");
8329 return -1;
8330 }
8331 #endif
8332 buf[0] = (Py_UNICODE) x;
8333 }
8334 buf[1] = '\0';
8335 return 1;
8336
8337 onError:
8338 PyErr_SetString(PyExc_TypeError,
8339 "%c requires int or char");
8340 return -1;
8341 }
8342
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The replacement list is parenthesized so the macro expands to a single
   primary expression wherever it is used (e.g. after a bare `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
8352
/* Build the unicode result of `format % args` (this is what unicode_mod,
   the nb_remainder slot, calls).

   format: any object accepted by PyUnicode_FromObject().
   args:   a tuple of values, a single non-tuple value, or -- for
           "%(key)..." specifiers -- an object with a mapping subscript
           that is neither a tuple nor a basestring.

   Returns a new reference to a unicode object, or NULL with an exception
   set (bad internal call, malformed specifier, wrong number or type of
   arguments, or out of memory).

   Bookkeeping: `res` is the write cursor into `result`, `reslen` the
   currently allocated length and `rescnt` the number of unused slots, so
   reslen - rescnt characters have been written at any time. */
PyObject *PyUnicode_Format(PyObject *format,
                           PyObject *args)
{
    Py_UNICODE *fmt, *res;
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
        return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);

    /* Start with room for the template plus some slack. */
    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
        goto onError;
    res = PyUnicode_AS_UNICODE(result);

    if (PyTuple_Check(args)) {
        arglen = PyTuple_Size(args);
        argidx = 0;
    }
    else {
        /* Single value: getnextarg() hands it out exactly once. */
        arglen = -1;
        argidx = -2;
    }
    if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
        !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
        dict = args;

    while (--fmtcnt >= 0) {
        if (*fmt != '%') {
            /* Literal character: copy it, growing the result if needed. */
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
        }
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v = NULL;
            PyObject *temp = NULL;
            Py_UNICODE *pbuf;
            Py_UNICODE sign;
            Py_ssize_t len;
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */

            fmt++;
            if (*fmt == '(') {
                /* "%(key)...": look the value up in the mapping. */
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
#if 0
                /* keys are converted to strings using UTF-8 and
                   then looked up since Python uses strings to hold
                   variables names etc. in its namespaces and we
                   wouldn't want to break common idioms. */
                key = PyUnicode_EncodeUTF8(keystart,
                                           keylen,
                                           NULL);
#else
                key = PyUnicode_FromUnicode(keystart, keylen);
#endif
                if (key == NULL)
                    goto onError;
                /* Drop the value fetched for a previous "%(key)". */
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                args_owned = 1;
                arglen = -1;
                argidx = -2;
            }
            /* Conversion flags. */
            while (--fmtcnt >= 0) {
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
                break;
            }
            /* Field width: '*' takes it from the arguments. */
            if (c == '*') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyInt_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                width = PyInt_AsSsize_t(v);
                if (width == -1 && PyErr_Occurred())
                    goto onError;
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            /* Precision. */
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyInt_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    prec = _PyInt_AsInt(v);
                    if (prec == -1 && PyErr_Occurred())
                        goto onError;
                    if (prec < 0)
                        prec = 0;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
                        c = *fmt++;
                        if (c < '0' || c > '9')
                            break;
                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            /* A length modifier is accepted and skipped. */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            if (fmtcnt < 0) {
                PyErr_SetString(PyExc_ValueError,
                                "incomplete format");
                goto onError;
            }
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
            }
            sign = 0;
            fill = ' ';
            switch (c) {

            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;

            case 's':
            case 'r':
                if (PyUnicode_CheckExact(v) && c == 's') {
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    PyObject *unicode;
                    if (c == 's')
                        temp = PyObject_Unicode(v);
                    else
                        temp = PyObject_Repr(v);
                    if (temp == NULL)
                        goto onError;
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
                    else if (PyString_Check(temp)) {
                        /* convert to string to Unicode */
                        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
                                                   PyString_GET_SIZE(temp),
                                                   NULL,
                                                   "strict");
                        Py_DECREF(temp);
                        temp = unicode;
                        if (temp == NULL)
                            goto onError;
                    }
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
                        goto onError;
                    }
                }
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                /* The precision truncates the text. */
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                if (c == 'i')
                    c = 'd';
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (_PyAnyInt_Check(v)) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        iobj = PyNumber_Int(v);
                        if (iobj==NULL) {
                            PyErr_Clear();
                            iobj = PyNumber_Long(v);
                        }
                    }
                    if (iobj!=NULL) {
                        if (PyInt_Check(iobj)) {
                            isnumok = 1;
                            pbuf = formatbuf;
                            len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
                                            flags, prec, c, iobj);
                            Py_DECREF(iobj);
                            if (len < 0)
                                goto onError;
                            sign = 1;
                        }
                        else if (PyLong_Check(iobj)) {
                            isnumok = 1;
                            temp = formatlong(iobj, flags, prec, c);
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
                }
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
                temp = formatfloat(v, flags, prec, c);
                if (temp == NULL)
                    goto onError;
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;

            default:
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
                             (int)c,
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            /* For numeric conversions `sign` is 1 on entry; it becomes the
               sign character to emit, or 0 for none. */
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            if (rescnt - (sign != 0) < width) {
                /* Not enough room left: grow the result.  The overflow
                   test is done BEFORE the additions, because signed
                   overflow is undefined behaviour and a check made after
                   the fact may be optimised away.  `used`, `fmtcnt` and
                   `width` are all non-negative here. */
                Py_ssize_t used = reslen - rescnt;  /* chars written so far */

                if (width > PY_SSIZE_T_MAX - 100 - fmtcnt - used) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                rescnt = width + fmtcnt + 100;
                reslen = used + rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            /* With a non-space fill the sign (and a "0x" prefix) goes in
               front of the padding; with space fill it follows it. */
            if (sign) {
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            /* Left padding for right-justified fields. */
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            /* Right padding for left-justified fields. */
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
                Py_XDECREF(temp);
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
    } /* until end */
    if (argidx < arglen && !dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Trim the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

  onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}
8815
8816 static PyBufferProcs unicode_as_buffer = {
8817 (readbufferproc) unicode_buffer_getreadbuf,
8818 (writebufferproc) unicode_buffer_getwritebuf,
8819 (segcountproc) unicode_buffer_getsegcount,
8820 (charbufferproc) unicode_buffer_getcharbuf,
8821 };
8822
8823 static PyObject *
8824 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8825
8826 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8827 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8828 {
8829 PyObject *x = NULL;
8830 static char *kwlist[] = {"string", "encoding", "errors", 0};
8831 char *encoding = NULL;
8832 char *errors = NULL;
8833
8834 if (type != &PyUnicode_Type)
8835 return unicode_subtype_new(type, args, kwds);
8836 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8837 kwlist, &x, &encoding, &errors))
8838 return NULL;
8839 if (x == NULL)
8840 return (PyObject *)_PyUnicode_New(0);
8841 if (encoding == NULL && errors == NULL)
8842 return PyObject_Unicode(x);
8843 else
8844 return PyUnicode_FromEncodedObject(x, encoding, errors);
8845 }
8846
8847 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8848 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8849 {
8850 PyUnicodeObject *tmp, *pnew;
8851 Py_ssize_t n;
8852
8853 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8854 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8855 if (tmp == NULL)
8856 return NULL;
8857 assert(PyUnicode_Check(tmp));
8858 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8859 if (pnew == NULL) {
8860 Py_DECREF(tmp);
8861 return NULL;
8862 }
8863 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8864 if (pnew->str == NULL) {
8865 _Py_ForgetReference((PyObject *)pnew);
8866 PyObject_Del(pnew);
8867 Py_DECREF(tmp);
8868 return PyErr_NoMemory();
8869 }
8870 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8871 pnew->length = n;
8872 pnew->hash = tmp->hash;
8873 Py_DECREF(tmp);
8874 return (PyObject *)pnew;
8875 }
8876
/* Docstring of the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(object='') -> unicode object\n\
unicode(string[, encoding[, errors]]) -> unicode object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8884
/* Type object of the unicode type.

   The type derives from PyBaseString_Type and can itself be subclassed
   (Py_TPFLAGS_BASETYPE).  Instances are created by unicode_new(),
   destroyed by unicode_dealloc() and their memory is returned with
   PyObject_Del.  Slots left at 0 are not filled in by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                  /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_compare */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    0,                          /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
8928
8929 /* Initialize the Unicode implementation */
8930
_PyUnicode_Init(void)8931 void _PyUnicode_Init(void)
8932 {
8933 /* XXX - move this array to unicodectype.c ? */
8934 Py_UNICODE linebreak[] = {
8935 0x000A, /* LINE FEED */
8936 0x000D, /* CARRIAGE RETURN */
8937 0x001C, /* FILE SEPARATOR */
8938 0x001D, /* GROUP SEPARATOR */
8939 0x001E, /* RECORD SEPARATOR */
8940 0x0085, /* NEXT LINE */
8941 0x2028, /* LINE SEPARATOR */
8942 0x2029, /* PARAGRAPH SEPARATOR */
8943 };
8944
8945 /* Init the implementation */
8946 if (!unicode_empty) {
8947 unicode_empty = _PyUnicode_New(0);
8948 if (!unicode_empty)
8949 return;
8950 }
8951
8952 if (PyType_Ready(&PyUnicode_Type) < 0)
8953 Py_FatalError("Can't initialize 'unicode'");
8954
8955 /* initialize the linebreak bloom filter */
8956 bloom_linebreak = make_bloom_mask(
8957 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8958 );
8959
8960 PyType_Ready(&EncodingMapType);
8961
8962 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8963 Py_FatalError("Can't initialize field name iterator type");
8964
8965 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8966 Py_FatalError("Can't initialize formatter iter type");
8967 }
8968
8969 /* Finalize the Unicode implementation */
8970
8971 int
PyUnicode_ClearFreeList(void)8972 PyUnicode_ClearFreeList(void)
8973 {
8974 int freelist_size = numfree;
8975 PyUnicodeObject *u;
8976
8977 for (u = free_list; u != NULL;) {
8978 PyUnicodeObject *v = u;
8979 u = *(PyUnicodeObject **)u;
8980 if (v->str)
8981 PyObject_DEL(v->str);
8982 Py_XDECREF(v->defenc);
8983 PyObject_Del(v);
8984 numfree--;
8985 }
8986 free_list = NULL;
8987 assert(numfree == 0);
8988 return freelist_size;
8989 }
8990
8991 void
_PyUnicode_Fini(void)8992 _PyUnicode_Fini(void)
8993 {
8994 int i;
8995
8996 Py_CLEAR(unicode_empty);
8997
8998 for (i = 0; i < 256; i++)
8999 Py_CLEAR(unicode_latin1[i]);
9000
9001 (void)PyUnicode_ClearFreeList();
9002 }
9003
9004 #ifdef __cplusplus
9005 }
9006 #endif
9007