1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10 Copyright (c) Corporation for National Research Initiatives.
11
12 --------------------------------------------------------------------
13 The original string type implementation is:
14
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
17
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39
40 */
41
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
52 /* Limit for the Unicode object free list */
53
54 #define PyUnicode_MAXFREELIST 1024
55
56 /* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
61
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
70
71 */
72
73 #define KEEPALIVE_SIZE_LIMIT 9
74
75 /* Endianness switches; defaults to little endian */
76
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
82
83 /* --- Globals ------------------------------------------------------------
84
85 NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
88
89 */
90
91
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95
/* Free list for Unicode objects.

   Deallocated exact-type Unicode objects are chained here for reuse by
   _PyUnicode_New(); the link to the next entry is stored in the first
   pointer-sized word of each freed object.  numfree counts the entries
   currently on the list and is capped at PyUnicode_MAXFREELIST by
   unicode_dealloc(). */
static PyUnicodeObject *free_list = NULL;
static int numfree = 0;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty = NULL;

/* Return a new reference to the shared empty Unicode object from the
   enclosing function, creating the object on first use.  If creation
   fails, the enclosing function returns NULL.  The global keeps its own
   reference, so the caller's reference comes from the Py_INCREF. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = _PyUnicode_New(0);          \
            if (unicode_empty != NULL)                  \
                Py_INCREF(unicode_empty);               \
        }                                               \
        return (PyObject *)unicode_empty;               \
    } while (0)

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are indexed by code point and are filled
   lazily by the constructors below the first time a character is
   requested. */
static PyUnicodeObject *unicode_latin1[256] = {NULL};

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100 + 1] = "ascii";
127
/* Fast detection of the most frequent whitespace characters.

   Lookup table with 128 entries, indexed by code point: an entry is 1
   when the character is treated as whitespace and 0 otherwise.  Each
   row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same for linebreaks.

   Lookup table with 128 entries, indexed by code point: an entry is 1
   when the character is a line break and 0 otherwise.  BLOOM_LINEBREAK()
   consults it only for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188 Py_UNICODE
PyUnicode_GetMax(void)189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192 return 0x10FFFF;
193 #else
194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
197 #endif
198 }
199
200 /* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
205
206 /* the linebreak mask is set up by Unicode_Init below */
207
/* BLOOM_WIDTH is the number of mask bits actually used: the largest of
   128, 64 and 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so until the mask is rebuilt every
   character passes the filter and is decided by the precise check. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue).
   BLOOM is non-zero when that bit is set: a zero result proves ch was
   never added, a non-zero result still needs an exact check. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: characters below 128 use the ascii_linebreak table;
   all others must pass the bloom filter before the precise
   Py_UNICODE_ISLINEBREAK() check runs.  ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
233 BLOOM_MASK mask;
234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
238 BLOOM_ADD(mask, ptr[i]);
239
240 return mask;
241 }
242
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252 }
253
/* Non-zero when chr is a member of the set [set, set+setlen): the bloom
   mask is consulted first as a cheap filter, and only on a hit is the
   exact unicode_member() scan run.  The whole expansion is parenthesized
   so the macro composes safely with surrounding operators such as `!`
   or `==`.  chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257 /* --- Unicode Object ----------------------------------------------------- */
258
259 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)260 int unicode_resize(register PyUnicodeObject *unicode,
261 Py_ssize_t length)
262 {
263 void *oldstr;
264
265 /* Shortcut if there's nothing much to do. */
266 if (unicode->length == length)
267 goto reset;
268
269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
272
273 if (unicode == unicode_empty ||
274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
277 PyErr_SetString(PyExc_SystemError,
278 "can't resize shared unicode objects");
279 return -1;
280 }
281
282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
287 oldstr = unicode->str;
288 unicode->str = PyObject_REALLOC(unicode->str,
289 sizeof(Py_UNICODE) * (length + 1));
290 if (!unicode->str) {
291 unicode->str = (Py_UNICODE *)oldstr;
292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
296 unicode->length = length;
297
298 reset:
299 /* Reset the object caches */
300 if (unicode->defenc) {
301 Py_CLEAR(unicode->defenc);
302 }
303 unicode->hash = -1;
304
305 return 0;
306 }
307
308 /* We allocate one more byte to make sure the string is
309 Ux0000 terminated; some code relies on that.
310
311 XXX This allocator could further be enhanced by assuring that the
312 free list never reduces its size below 1.
313
314 */
315
316 static
_PyUnicode_New(Py_ssize_t length)317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318 {
319 register PyUnicodeObject *unicode;
320
321 /* Optimization for empty strings */
322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
332 /* Unicode freelist & memory allocation */
333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
341 unicode_resize(unicode, length) < 0) {
342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
345 }
346 else {
347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349 }
350 (void)PyObject_INIT(unicode, &PyUnicode_Type);
351 }
352 else {
353 size_t new_size;
354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355 if (unicode == NULL)
356 return NULL;
357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359 }
360
361 if (!unicode->str) {
362 PyErr_NoMemory();
363 goto onError;
364 }
365 /* Initialize the first element to guard against cases where
366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
372 unicode->str[0] = 0;
373 unicode->str[length] = 0;
374 unicode->length = length;
375 unicode->hash = -1;
376 unicode->defenc = NULL;
377 return unicode;
378
379 onError:
380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
382 _Py_ForgetReference((PyObject *)unicode);
383 PyObject_Del(unicode);
384 return NULL;
385 }
386
387 static
unicode_dealloc(register PyUnicodeObject * unicode)388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390 if (PyUnicode_CheckExact(unicode) &&
391 numfree < PyUnicode_MAXFREELIST) {
392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
399 Py_CLEAR(unicode->defenc);
400 }
401 /* Add to free list */
402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
405 }
406 else {
407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410 }
411 }
412
413 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = *unicode;
424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
432 if (v->length != length &&
433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_SETREF(*unicode, w);
440 return 0;
441 }
442
443 /* Note that we don't have to modify *unicode for unshared Unicode
444 objects, since we can modify them in-place. */
445 return unicode_resize(v, length);
446 }
447
/* Public entry point for resizing a Unicode object; forwards to
   _PyUnicode_Resize() and returns its result (0 on success, -1 on
   error). */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **slot = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(slot, length);
}
452
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
454 Py_ssize_t size)
455 {
456 PyUnicodeObject *unicode;
457
458 /* If the Unicode data is known at construction time, we can apply
459 some optimizations which share commonly used objects. */
460 if (u != NULL) {
461
462 /* Optimization for empty strings */
463 if (size == 0)
464 _Py_RETURN_UNICODE_EMPTY();
465
466 /* Single character Unicode objects in the Latin-1 range are
467 shared when using this constructor */
468 if (size == 1 && *u < 256) {
469 unicode = unicode_latin1[*u];
470 if (!unicode) {
471 unicode = _PyUnicode_New(1);
472 if (!unicode)
473 return NULL;
474 unicode->str[0] = *u;
475 unicode_latin1[*u] = unicode;
476 }
477 Py_INCREF(unicode);
478 return (PyObject *)unicode;
479 }
480 }
481
482 unicode = _PyUnicode_New(size);
483 if (!unicode)
484 return NULL;
485
486 /* Copy the Unicode data into the new object */
487 if (u != NULL)
488 Py_UNICODE_COPY(unicode->str, u, size);
489
490 return (PyObject *)unicode;
491 }
492
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
494 {
495 PyUnicodeObject *unicode;
496
497 if (size < 0) {
498 PyErr_SetString(PyExc_SystemError,
499 "Negative size passed to PyUnicode_FromStringAndSize");
500 return NULL;
501 }
502
503 /* If the Unicode data is known at construction time, we can apply
504 some optimizations which share commonly used objects.
505 Also, this means the input must be UTF-8, so fall back to the
506 UTF-8 decoder at the end. */
507 if (u != NULL) {
508
509 /* Optimization for empty strings */
510 if (size == 0)
511 _Py_RETURN_UNICODE_EMPTY();
512
513 /* Single characters are shared when using this constructor.
514 Restrict to ASCII, since the input must be UTF-8. */
515 if (size == 1 && Py_CHARMASK(*u) < 128) {
516 unicode = unicode_latin1[Py_CHARMASK(*u)];
517 if (!unicode) {
518 unicode = _PyUnicode_New(1);
519 if (!unicode)
520 return NULL;
521 unicode->str[0] = Py_CHARMASK(*u);
522 unicode_latin1[Py_CHARMASK(*u)] = unicode;
523 }
524 Py_INCREF(unicode);
525 return (PyObject *)unicode;
526 }
527
528 return PyUnicode_DecodeUTF8(u, size, NULL);
529 }
530
531 unicode = _PyUnicode_New(size);
532 if (!unicode)
533 return NULL;
534
535 return (PyObject *)unicode;
536 }
537
PyUnicode_FromString(const char * u)538 PyObject *PyUnicode_FromString(const char *u)
539 {
540 size_t size = strlen(u);
541 if (size > PY_SSIZE_T_MAX) {
542 PyErr_SetString(PyExc_OverflowError, "input too long");
543 return NULL;
544 }
545
546 return PyUnicode_FromStringAndSize(u, size);
547 }
548
549 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
550 * by 'ptr', possibly combining surrogate pairs on narrow builds.
551 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
552 * that should be returned and 'end' pointing to the end of the buffer.
553 * ('end' is used on narrow builds to detect a lone surrogate at the
554 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
556 * The type of the returned char is always Py_UCS4.
557 *
558 * Note: the macro advances ptr to next char, so it might have side-effects
559 * (especially if used with other macros).
560 */
561
/* helper macros used by _Py_UNICODE_NEXT */
/* True when ch lies in the high (lead) surrogate range 0xD800..0xDBFF,
   respectively the low (trail) surrogate range 0xDC00..0xDFFF.  The
   argument is parenthesized so compound expressions are classified
   correctly; it is evaluated twice. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
569
#ifdef Py_UNICODE_WIDE
/* Wide build: every element is a full code point, so just read the
   element and advance by one; `end` is unused. */
#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
#else
/* Narrow build: when *ptr is a high surrogate, ptr < end, and ptr[1] is
   a low surrogate, advance by two and return the joined code point;
   otherwise return *ptr unchanged and advance by one.
   NOTE(review): *ptr is read before the ptr < end test, and ptr[1] is
   read when ptr == end - 1, so this appears to rely on the element at
   `end` being readable (e.g. the terminator) -- confirm with callers. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
579
580 #ifdef HAVE_WCHAR_H
581
582 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583 # define CONVERT_WCHAR_TO_SURROGATES
584 #endif
585
586 #ifdef CONVERT_WCHAR_TO_SURROGATES
587
588 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589 to convert from UTF32 to UTF16. */
590
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)591 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592 Py_ssize_t size)
593 {
594 PyUnicodeObject *unicode;
595 register Py_ssize_t i;
596 Py_ssize_t alloc;
597 const wchar_t *orig_w;
598
599 if (w == NULL) {
600 PyErr_BadInternalCall();
601 return NULL;
602 }
603
604 alloc = size;
605 orig_w = w;
606 for (i = size; i > 0; i--) {
607 if (*w > 0xFFFF)
608 alloc++;
609 w++;
610 }
611 w = orig_w;
612 unicode = _PyUnicode_New(alloc);
613 if (!unicode)
614 return NULL;
615
616 /* Copy the wchar_t data into the new object */
617 {
618 register Py_UNICODE *u;
619 u = PyUnicode_AS_UNICODE(unicode);
620 for (i = size; i > 0; i--) {
621 if (*w > 0xFFFF) {
622 wchar_t ordinal = *w++;
623 ordinal -= 0x10000;
624 *u++ = 0xD800 | (ordinal >> 10);
625 *u++ = 0xDC00 | (ordinal & 0x3FF);
626 }
627 else
628 *u++ = *w++;
629 }
630 }
631 return (PyObject *)unicode;
632 }
633
634 #else
635
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)636 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
637 Py_ssize_t size)
638 {
639 PyUnicodeObject *unicode;
640
641 if (w == NULL) {
642 PyErr_BadInternalCall();
643 return NULL;
644 }
645
646 unicode = _PyUnicode_New(size);
647 if (!unicode)
648 return NULL;
649
650 /* Copy the wchar_t data into the new object */
651 #ifdef HAVE_USABLE_WCHAR_T
652 memcpy(unicode->str, w, size * sizeof(wchar_t));
653 #else
654 {
655 register Py_UNICODE *u;
656 register Py_ssize_t i;
657 u = PyUnicode_AS_UNICODE(unicode);
658 for (i = size; i > 0; i--)
659 *u++ = *w++;
660 }
661 #endif
662
663 return (PyObject *)unicode;
664 }
665
666 #endif /* CONVERT_WCHAR_TO_SURROGATES */
667
668 #undef CONVERT_WCHAR_TO_SURROGATES
669
670 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)671 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
672 {
673 *fmt++ = '%';
674 if (width) {
675 if (zeropad)
676 *fmt++ = '0';
677 fmt += sprintf(fmt, "%d", width);
678 }
679 if (precision)
680 fmt += sprintf(fmt, ".%d", precision);
681 if (longflag)
682 *fmt++ = 'l';
683 else if (size_tflag) {
684 char *f = PY_FORMAT_SIZE_T;
685 while (*f)
686 *fmt++ = *f++;
687 }
688 *fmt++ = c;
689 *fmt = '\0';
690 }
691
/* Append the NUL-terminated char string `string` to the output being
   built by PyUnicode_FromFormatV(): each byte is widened through
   unsigned char and stored at the output cursor.  Relies on the
   enclosing function's locals `copy` (scratch pointer) and `s` (output
   cursor, advanced past the appended text). */
#define appendstring(string)                    \
    do {                                        \
        for (copy = string;*copy; copy++) {     \
            *s++ = (unsigned char)*copy;        \
        }                                       \
    } while (0)
698
699 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)700 PyUnicode_FromFormatV(const char *format, va_list vargs)
701 {
702 va_list count;
703 Py_ssize_t callcount = 0;
704 PyObject **callresults = NULL;
705 PyObject **callresult = NULL;
706 Py_ssize_t n = 0;
707 int width = 0;
708 int precision = 0;
709 int zeropad;
710 const char* f;
711 Py_UNICODE *s;
712 PyObject *string;
713 /* used by sprintf */
714 char buffer[21];
715 /* use abuffer instead of buffer, if we need more space
716 * (which can happen if there's a format specifier with width). */
717 char *abuffer = NULL;
718 char *realbuffer;
719 Py_ssize_t abuffersize = 0;
720 char fmt[60]; /* should be enough for %0width.precisionld */
721 const char *copy;
722
723 #ifdef VA_LIST_IS_ARRAY
724 Py_MEMCPY(count, vargs, sizeof(va_list));
725 #else
726 #ifdef __va_copy
727 __va_copy(count, vargs);
728 #else
729 count = vargs;
730 #endif
731 #endif
732 /* step 1: count the number of %S/%R/%s format specifications
733 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
734 * objects once during step 3 and put the result in an array) */
735 for (f = format; *f; f++) {
736 if (*f == '%') {
737 f++;
738 while (*f && *f != '%' && !isalpha((unsigned)*f))
739 f++;
740 if (!*f)
741 break;
742 if (*f == 's' || *f=='S' || *f=='R')
743 ++callcount;
744 }
745 }
746 /* step 2: allocate memory for the results of
747 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
748 if (callcount) {
749 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
750 if (!callresults) {
751 PyErr_NoMemory();
752 return NULL;
753 }
754 callresult = callresults;
755 }
756 /* step 3: figure out how large a buffer we need */
757 for (f = format; *f; f++) {
758 if (*f == '%') {
759 const char* p = f++;
760 width = 0;
761 while (isdigit((unsigned)*f))
762 width = (width*10) + *f++ - '0';
763 precision = 0;
764 if (*f == '.') {
765 f++;
766 while (isdigit((unsigned)*f))
767 precision = (precision*10) + *f++ - '0';
768 }
769
770 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771 * they don't affect the amount of space we reserve.
772 */
773 if ((*f == 'l' || *f == 'z') &&
774 (f[1] == 'd' || f[1] == 'u'))
775 ++f;
776
777 switch (*f) {
778 case 'c':
779 {
780 int ordinal = va_arg(count, int);
781 #ifdef Py_UNICODE_WIDE
782 if (ordinal < 0 || ordinal > 0x10ffff) {
783 PyErr_SetString(PyExc_OverflowError,
784 "%c arg not in range(0x110000) "
785 "(wide Python build)");
786 goto fail;
787 }
788 #else
789 if (ordinal < 0 || ordinal > 0xffff) {
790 PyErr_SetString(PyExc_OverflowError,
791 "%c arg not in range(0x10000) "
792 "(narrow Python build)");
793 goto fail;
794 }
795 #endif
796 /* fall through... */
797 }
798 case '%':
799 n++;
800 break;
801 case 'd': case 'u': case 'i': case 'x':
802 (void) va_arg(count, int);
803 if (width < precision)
804 width = precision;
805 /* 20 bytes is enough to hold a 64-bit
806 integer. Decimal takes the most space.
807 This isn't enough for octal.
808 If a width is specified we need more
809 (which we allocate later). */
810 if (width < 20)
811 width = 20;
812 n += width;
813 if (abuffersize < width)
814 abuffersize = width;
815 break;
816 case 's':
817 {
818 /* UTF-8 */
819 const char *s = va_arg(count, const char*);
820 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
821 if (!str)
822 goto fail;
823 n += PyUnicode_GET_SIZE(str);
824 /* Remember the str and switch to the next slot */
825 *callresult++ = str;
826 break;
827 }
828 case 'U':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 assert(obj && PyUnicode_Check(obj));
832 n += PyUnicode_GET_SIZE(obj);
833 break;
834 }
835 case 'V':
836 {
837 PyObject *obj = va_arg(count, PyObject *);
838 const char *str = va_arg(count, const char *);
839 assert(obj || str);
840 assert(!obj || PyUnicode_Check(obj));
841 if (obj)
842 n += PyUnicode_GET_SIZE(obj);
843 else
844 n += strlen(str);
845 break;
846 }
847 case 'S':
848 {
849 PyObject *obj = va_arg(count, PyObject *);
850 PyObject *str;
851 assert(obj);
852 str = PyObject_Str(obj);
853 if (!str)
854 goto fail;
855 n += PyString_GET_SIZE(str);
856 /* Remember the str and switch to the next slot */
857 *callresult++ = str;
858 break;
859 }
860 case 'R':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 PyObject *repr;
864 assert(obj);
865 repr = PyObject_Repr(obj);
866 if (!repr)
867 goto fail;
868 n += PyUnicode_GET_SIZE(repr);
869 /* Remember the repr and switch to the next slot */
870 *callresult++ = repr;
871 break;
872 }
873 case 'p':
874 (void) va_arg(count, int);
875 /* maximum 64-bit pointer representation:
876 * 0xffffffffffffffff
877 * so 19 characters is enough.
878 * XXX I count 18 -- what's the extra for?
879 */
880 n += 19;
881 break;
882 default:
883 /* if we stumble upon an unknown
884 formatting code, copy the rest of
885 the format string to the output
886 string. (we cannot just skip the
887 code, since there's no way to know
888 what's in the argument list) */
889 n += strlen(p);
890 goto expand;
891 }
892 } else
893 n++;
894 }
895 expand:
896 if (abuffersize > 20) {
897 /* add 1 for sprintf's trailing null byte */
898 abuffer = PyObject_Malloc(abuffersize + 1);
899 if (!abuffer) {
900 PyErr_NoMemory();
901 goto fail;
902 }
903 realbuffer = abuffer;
904 }
905 else
906 realbuffer = buffer;
907 /* step 4: fill the buffer */
908 /* Since we've analyzed how much space we need for the worst case,
909 we don't have to resize the string.
910 There can be no errors beyond this point. */
911 string = PyUnicode_FromUnicode(NULL, n);
912 if (!string)
913 goto fail;
914
915 s = PyUnicode_AS_UNICODE(string);
916 callresult = callresults;
917
918 for (f = format; *f; f++) {
919 if (*f == '%') {
920 const char* p = f++;
921 int longflag = 0;
922 int size_tflag = 0;
923 zeropad = (*f == '0');
924 /* parse the width.precision part */
925 width = 0;
926 while (isdigit((unsigned)*f))
927 width = (width*10) + *f++ - '0';
928 precision = 0;
929 if (*f == '.') {
930 f++;
931 while (isdigit((unsigned)*f))
932 precision = (precision*10) + *f++ - '0';
933 }
934 /* handle the long flag, but only for %ld and %lu.
935 others can be added when necessary. */
936 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
937 longflag = 1;
938 ++f;
939 }
940 /* handle the size_t flag. */
941 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
942 size_tflag = 1;
943 ++f;
944 }
945
946 switch (*f) {
947 case 'c':
948 *s++ = va_arg(vargs, int);
949 break;
950 case 'd':
951 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
952 if (longflag)
953 sprintf(realbuffer, fmt, va_arg(vargs, long));
954 else if (size_tflag)
955 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
956 else
957 sprintf(realbuffer, fmt, va_arg(vargs, int));
958 appendstring(realbuffer);
959 break;
960 case 'u':
961 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
962 if (longflag)
963 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
964 else if (size_tflag)
965 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
966 else
967 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
968 appendstring(realbuffer);
969 break;
970 case 'i':
971 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
972 sprintf(realbuffer, fmt, va_arg(vargs, int));
973 appendstring(realbuffer);
974 break;
975 case 'x':
976 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
977 sprintf(realbuffer, fmt, va_arg(vargs, int));
978 appendstring(realbuffer);
979 break;
980 case 's':
981 {
982 /* unused, since we already have the result */
983 (void) va_arg(vargs, char *);
984 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
985 PyUnicode_GET_SIZE(*callresult));
986 s += PyUnicode_GET_SIZE(*callresult);
987 /* We're done with the unicode()/repr() => forget it */
988 Py_DECREF(*callresult);
989 /* switch to next unicode()/repr() result */
990 ++callresult;
991 break;
992 }
993 case 'U':
994 {
995 PyObject *obj = va_arg(vargs, PyObject *);
996 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
997 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
998 s += size;
999 break;
1000 }
1001 case 'V':
1002 {
1003 PyObject *obj = va_arg(vargs, PyObject *);
1004 const char *str = va_arg(vargs, const char *);
1005 if (obj) {
1006 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1007 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1008 s += size;
1009 } else {
1010 appendstring(str);
1011 }
1012 break;
1013 }
1014 case 'S':
1015 case 'R':
1016 {
1017 const char *str = PyString_AS_STRING(*callresult);
1018 /* unused, since we already have the result */
1019 (void) va_arg(vargs, PyObject *);
1020 appendstring(str);
1021 /* We're done with the unicode()/repr() => forget it */
1022 Py_DECREF(*callresult);
1023 /* switch to next unicode()/repr() result */
1024 ++callresult;
1025 break;
1026 }
1027 case 'p':
1028 sprintf(buffer, "%p", va_arg(vargs, void*));
1029 /* %p is ill-defined: ensure leading 0x. */
1030 if (buffer[1] == 'X')
1031 buffer[1] = 'x';
1032 else if (buffer[1] != 'x') {
1033 memmove(buffer+2, buffer, strlen(buffer)+1);
1034 buffer[0] = '0';
1035 buffer[1] = 'x';
1036 }
1037 appendstring(buffer);
1038 break;
1039 case '%':
1040 *s++ = '%';
1041 break;
1042 default:
1043 appendstring(p);
1044 goto end;
1045 }
1046 } else
1047 *s++ = *f;
1048 }
1049
1050 end:
1051 if (callresults)
1052 PyObject_Free(callresults);
1053 if (abuffer)
1054 PyObject_Free(abuffer);
1055 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1056 return string;
1057 fail:
1058 if (callresults) {
1059 PyObject **callresult2 = callresults;
1060 while (callresult2 < callresult) {
1061 Py_DECREF(*callresult2);
1062 ++callresult2;
1063 }
1064 PyObject_Free(callresults);
1065 }
1066 if (abuffer)
1067 PyObject_Free(abuffer);
1068 return NULL;
1069 }
1070
1071 #undef appendstring
1072
1073 PyObject *
PyUnicode_FromFormat(const char * format,...)1074 PyUnicode_FromFormat(const char *format, ...)
1075 {
1076 PyObject* ret;
1077 va_list vargs;
1078
1079 #ifdef HAVE_STDARG_PROTOTYPES
1080 va_start(vargs, format);
1081 #else
1082 va_start(vargs);
1083 #endif
1084 ret = PyUnicode_FromFormatV(format, vargs);
1085 va_end(vargs);
1086 return ret;
1087 }
1088
/* Copy at most `size` characters of `unicode` into the wchar_t buffer
   `w`.  When the buffer is larger than the string, the terminating
   zero is copied as well.  Returns the number of characters copied,
   not counting the terminator, or -1 with a bad-internal-call error
   when unicode is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t ulen;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    ulen = PyUnicode_GET_SIZE(unicode);

    /* If possible, try to copy the 0-termination as well */
    if (size > ulen)
        size = ulen + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        register Py_ssize_t i;
        for (i = 0; i < size; i++)
            w[i] = src[i];
    }
#endif

    /* The terminator, when copied, is not part of the reported count. */
    return (size > ulen) ? ulen : size;
}
1119
1120 #endif
1121
PyUnicode_FromOrdinal(int ordinal)1122 PyObject *PyUnicode_FromOrdinal(int ordinal)
1123 {
1124 Py_UNICODE s[1];
1125
1126 #ifdef Py_UNICODE_WIDE
1127 if (ordinal < 0 || ordinal > 0x10ffff) {
1128 PyErr_SetString(PyExc_ValueError,
1129 "unichr() arg not in range(0x110000) "
1130 "(wide Python build)");
1131 return NULL;
1132 }
1133 #else
1134 if (ordinal < 0 || ordinal > 0xffff) {
1135 PyErr_SetString(PyExc_ValueError,
1136 "unichr() arg not in range(0x10000) "
1137 "(narrow Python build)");
1138 return NULL;
1139 }
1140 #endif
1141
1142 s[0] = (Py_UNICODE)ordinal;
1143 return PyUnicode_FromUnicode(s, 1);
1144 }
1145
PyUnicode_FromObject(register PyObject * obj)1146 PyObject *PyUnicode_FromObject(register PyObject *obj)
1147 {
1148 /* XXX Perhaps we should make this API an alias of
1149 PyObject_Unicode() instead ?! */
1150 if (PyUnicode_CheckExact(obj)) {
1151 Py_INCREF(obj);
1152 return obj;
1153 }
1154 if (PyUnicode_Check(obj)) {
1155 /* For a Unicode subtype that's not a Unicode object,
1156 return a true Unicode object with the same data. */
1157 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1158 PyUnicode_GET_SIZE(obj));
1159 }
1160 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1161 }
1162
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1164 const char *encoding,
1165 const char *errors)
1166 {
1167 const char *s = NULL;
1168 Py_ssize_t len;
1169 PyObject *v;
1170
1171 if (obj == NULL) {
1172 PyErr_BadInternalCall();
1173 return NULL;
1174 }
1175
1176 #if 0
1177 /* For b/w compatibility we also accept Unicode objects provided
1178 that no encodings is given and then redirect to
1179 PyObject_Unicode() which then applies the additional logic for
1180 Unicode subclasses.
1181
1182 NOTE: This API should really only be used for object which
1183 represent *encoded* Unicode !
1184
1185 */
1186 if (PyUnicode_Check(obj)) {
1187 if (encoding) {
1188 PyErr_SetString(PyExc_TypeError,
1189 "decoding Unicode is not supported");
1190 return NULL;
1191 }
1192 return PyObject_Unicode(obj);
1193 }
1194 #else
1195 if (PyUnicode_Check(obj)) {
1196 PyErr_SetString(PyExc_TypeError,
1197 "decoding Unicode is not supported");
1198 return NULL;
1199 }
1200 #endif
1201
1202 /* Coerce object */
1203 if (PyString_Check(obj)) {
1204 s = PyString_AS_STRING(obj);
1205 len = PyString_GET_SIZE(obj);
1206 }
1207 else if (PyByteArray_Check(obj)) {
1208 /* Python 2.x specific */
1209 PyErr_Format(PyExc_TypeError,
1210 "decoding bytearray is not supported");
1211 return NULL;
1212 }
1213 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1214 /* Overwrite the error message with something more useful in
1215 case of a TypeError. */
1216 if (PyErr_ExceptionMatches(PyExc_TypeError))
1217 PyErr_Format(PyExc_TypeError,
1218 "coercing to Unicode: need string or buffer, "
1219 "%.80s found",
1220 Py_TYPE(obj)->tp_name);
1221 goto onError;
1222 }
1223
1224 /* Convert to Unicode */
1225 if (len == 0)
1226 _Py_RETURN_UNICODE_EMPTY();
1227
1228 v = PyUnicode_Decode(s, len, encoding, errors);
1229 return v;
1230
1231 onError:
1232 return NULL;
1233 }
1234
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1235 PyObject *PyUnicode_Decode(const char *s,
1236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
1239 {
1240 PyObject *buffer = NULL, *unicode;
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Shortcuts for common default encodings */
1246 if (strcmp(encoding, "utf-8") == 0)
1247 return PyUnicode_DecodeUTF8(s, size, errors);
1248 else if (strcmp(encoding, "latin-1") == 0)
1249 return PyUnicode_DecodeLatin1(s, size, errors);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding, "mbcs") == 0)
1252 return PyUnicode_DecodeMBCS(s, size, errors);
1253 #endif
1254 else if (strcmp(encoding, "ascii") == 0)
1255 return PyUnicode_DecodeASCII(s, size, errors);
1256
1257 /* Decode via the codec registry */
1258 buffer = PyBuffer_FromMemory((void *)s, size);
1259 if (buffer == NULL)
1260 goto onError;
1261 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
1262 if (unicode == NULL)
1263 goto onError;
1264 if (!PyUnicode_Check(unicode)) {
1265 PyErr_Format(PyExc_TypeError,
1266 "decoder did not return an unicode object (type=%.400s)",
1267 Py_TYPE(unicode)->tp_name);
1268 Py_DECREF(unicode);
1269 goto onError;
1270 }
1271 Py_DECREF(buffer);
1272 return unicode;
1273
1274 onError:
1275 Py_XDECREF(buffer);
1276 return NULL;
1277 }
1278
/* Run the unicode object UNICODE through the decoder registered for
   ENCODING (the default encoding when NULL) and return whatever object
   the decoder produces.

   Fails with NULL when UNICODE is not a unicode object, when the py3k
   warning issued here is turned into an error, or when the codec
   fails. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* The codec registry result (or NULL on failure) is returned as-is. */
    return _PyCodec_DecodeText(unicode, encoding, errors);
}
1305
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1307 Py_ssize_t size,
1308 const char *encoding,
1309 const char *errors)
1310 {
1311 PyObject *v, *unicode;
1312
1313 unicode = PyUnicode_FromUnicode(s, size);
1314 if (unicode == NULL)
1315 return NULL;
1316 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1317 Py_DECREF(unicode);
1318 return v;
1319 }
1320
/* Encode the unicode object UNICODE through the codec registry using
   ENCODING (the default encoding when NULL) and ERRORS.

   The encoder's result is returned without any check on its type.
   Returns NULL with an exception set when UNICODE is not a unicode
   object or the codec fails. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    return _PyCodec_EncodeText(unicode, encoding, errors);
}
1344
/* Encode the unicode object UNICODE into a str object.

   ENCODING defaults to the default encoding when NULL.  When ERRORS is
   NULL, "utf-8", "latin-1", "ascii" (and "mbcs" where available) are
   handled by the dedicated built-in encoders; every other combination
   goes through the codec registry, whose result must be a str object
   (TypeError otherwise).

   Returns the encoded string or NULL with an exception set. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Fast paths are only taken with the default error handling. */
    if (errors == NULL) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Everything else is delegated to the codec registry. */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;
    if (PyString_Check(encoded))
        return encoded;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a string object (type=%.400s)",
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
1389
/* Return UNICODE encoded with the default encoding.

   The object's `defenc` slot is used as a cache: if it is already set
   it is returned directly (no reference count change is made here).
   Otherwise the string is produced by PyUnicode_AsEncodedString() and
   stored in `defenc` only when ERRORS is NULL.

   NOTE(review): UNICODE is cast to PyUnicodeObject without a type check;
   callers are presumably expected to pass a unicode object. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded = self->defenc;

    if (encoded != NULL)
        return encoded;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;

    return encoded;
}
1402
PyUnicode_AsUnicode(PyObject * unicode)1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1404 {
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409 return PyUnicode_AS_UNICODE(unicode);
1410
1411 onError:
1412 return NULL;
1413 }
1414
PyUnicode_GetSize(PyObject * unicode)1415 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1416 {
1417 if (!PyUnicode_Check(unicode)) {
1418 PyErr_BadArgument();
1419 goto onError;
1420 }
1421 return PyUnicode_GET_SIZE(unicode);
1422
1423 onError:
1424 return -1;
1425 }
1426
PyUnicode_GetDefaultEncoding(void)1427 const char *PyUnicode_GetDefaultEncoding(void)
1428 {
1429 return unicode_default_encoding;
1430 }
1431
PyUnicode_SetDefaultEncoding(const char * encoding)1432 int PyUnicode_SetDefaultEncoding(const char *encoding)
1433 {
1434 PyObject *v;
1435
1436 /* Make sure the encoding is valid. As side effect, this also
1437 loads the encoding into the codec registry cache. */
1438 v = _PyCodec_Lookup(encoding);
1439 if (v == NULL)
1440 goto onError;
1441 Py_DECREF(v);
1442 strncpy(unicode_default_encoding,
1443 encoding,
1444 sizeof(unicode_default_encoding) - 1);
1445 return 0;
1446
1447 onError:
1448 return -1;
1449 }
1450
1451 /* error handling callback helper:
1452 build arguments, call the callback and check the arguments,
1453 if no exception occurred, copy the replacement to the output
1454 and adjust various state variables.
1455 return 0 on success, -1 on error
1456 */
1457
1458 static
unicode_decode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char * input,Py_ssize_t insize,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,PyUnicodeObject ** output,Py_ssize_t * outpos,Py_UNICODE ** outptr)1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1460 const char *encoding, const char *reason,
1461 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1462 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1463 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1464 {
1465 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1466
1467 PyObject *restuple = NULL;
1468 PyObject *repunicode = NULL;
1469 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1470 Py_ssize_t requiredsize;
1471 Py_ssize_t newpos;
1472 Py_UNICODE *repptr;
1473 Py_ssize_t repsize;
1474 int res = -1;
1475
1476 if (*errorHandler == NULL) {
1477 *errorHandler = PyCodec_LookupError(errors);
1478 if (*errorHandler == NULL)
1479 goto onError;
1480 }
1481
1482 if (*exceptionObject == NULL) {
1483 *exceptionObject = PyUnicodeDecodeError_Create(
1484 encoding, input, insize, *startinpos, *endinpos, reason);
1485 if (*exceptionObject == NULL)
1486 goto onError;
1487 }
1488 else {
1489 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1492 goto onError;
1493 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1494 goto onError;
1495 }
1496
1497 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1498 if (restuple == NULL)
1499 goto onError;
1500 if (!PyTuple_Check(restuple)) {
1501 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1502 goto onError;
1503 }
1504 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1505 goto onError;
1506 if (newpos<0)
1507 newpos = insize+newpos;
1508 if (newpos<0 || newpos>insize) {
1509 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1510 goto onError;
1511 }
1512
1513 /* need more space? (at least enough for what we
1514 have+the replacement+the rest of the string (starting
1515 at the new input position), so we won't have to check space
1516 when there are no errors in the rest of the string) */
1517 repptr = PyUnicode_AS_UNICODE(repunicode);
1518 repsize = PyUnicode_GET_SIZE(repunicode);
1519 requiredsize = *outpos;
1520 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1521 goto overflow;
1522 requiredsize += repsize;
1523 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1524 goto overflow;
1525 requiredsize += insize - newpos;
1526 if (requiredsize > outsize) {
1527 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
1528 requiredsize = 2*outsize;
1529 if (_PyUnicode_Resize(output, requiredsize) < 0)
1530 goto onError;
1531 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1532 }
1533 *endinpos = newpos;
1534 *inptr = input + newpos;
1535 Py_UNICODE_COPY(*outptr, repptr, repsize);
1536 *outptr += repsize;
1537 *outpos += repsize;
1538 /* we made it! */
1539 res = 0;
1540
1541 onError:
1542 Py_XDECREF(restuple);
1543 return res;
1544
1545 overflow:
1546 PyErr_SetString(PyExc_OverflowError,
1547 "decoded result is too long for a Python string");
1548 goto onError;
1549 }
1550
1551 /* --- UTF-7 Codec -------------------------------------------------------- */
1552
1553 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1554
1555 /* Three simple macros defining base-64. */
1556
/* True (1) when c belongs to the base-64 alphabet: A-Z, a-z, 0-9, '+'
   or '/'.  The argument is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))
1564
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62 and anything else -> 63
   (so the caller must already know that c is a base-64 character). */

#define FROM_BASE64(c)                                  \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
1572
/* Base-64 character for the low 6 bits of n; higher bits of n are
   ignored. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                                           \
     "abcdefghijklmnopqrstuvwxyz"                                           \
     "0123456789+/"[(n) & 0x3f])
1577
/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
1585
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1619
/* ENCODE_DIRECT: true when character c may be written to the UTF-7
 * output unchanged.  Only values 1..127 are candidates.  Set D
 * characters (category 0) always qualify; Set O (category 1) and
 * whitespace (category 2) qualify only when directO / directWS are
 * true.  RFC2152 leaves those two choices to the application, which is
 * why they are parameters here. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directO) && utf7_category[(c)] == 1) ||         \
      ((directWS) && utf7_category[(c)] == 2)))
1631
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1632 PyObject *PyUnicode_DecodeUTF7(const char *s,
1633 Py_ssize_t size,
1634 const char *errors)
1635 {
1636 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1637 }
1638
1639 /* The decoder. The only state we preserve is our read position,
1640 * i.e. how many characters we have consumed. So if we end in the
1641 * middle of a shift sequence we have to back off the read position
1642 * and the output to the beginning of the sequence, otherwise we lose
1643 * all the shift state (seen bits, number of bits seen, high
1644 * surrogate). */
1645
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1646 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1647 Py_ssize_t size,
1648 const char *errors,
1649 Py_ssize_t *consumed)
1650 {
1651 const char *starts = s;
1652 Py_ssize_t startinpos;
1653 Py_ssize_t endinpos;
1654 Py_ssize_t outpos;
1655 const char *e;
1656 PyUnicodeObject *unicode;
1657 Py_UNICODE *p;
1658 const char *errmsg = "";
1659 int inShift = 0;
1660 Py_UNICODE *shiftOutStart;
1661 unsigned int base64bits = 0;
1662 unsigned long base64buffer = 0;
1663 Py_UNICODE surrogate = 0;
1664 PyObject *errorHandler = NULL;
1665 PyObject *exc = NULL;
1666
1667 unicode = _PyUnicode_New(size);
1668 if (!unicode)
1669 return NULL;
1670 if (size == 0) {
1671 if (consumed)
1672 *consumed = 0;
1673 return (PyObject *)unicode;
1674 }
1675
1676 p = unicode->str;
1677 shiftOutStart = p;
1678 e = s + size;
1679
1680 while (s < e) {
1681 Py_UNICODE ch = (unsigned char) *s;
1682
1683 if (inShift) { /* in a base-64 section */
1684 if (IS_BASE64(ch)) { /* consume a base-64 character */
1685 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1686 base64bits += 6;
1687 s++;
1688 if (base64bits >= 16) {
1689 /* we have enough bits for a UTF-16 value */
1690 Py_UNICODE outCh = (Py_UNICODE)
1691 (base64buffer >> (base64bits-16));
1692 base64bits -= 16;
1693 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1694 assert(outCh <= 0xffff);
1695 if (surrogate) {
1696 /* expecting a second surrogate */
1697 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1698 #ifdef Py_UNICODE_WIDE
1699 *p++ = (((surrogate & 0x3FF)<<10)
1700 | (outCh & 0x3FF)) + 0x10000;
1701 #else
1702 *p++ = surrogate;
1703 *p++ = outCh;
1704 #endif
1705 surrogate = 0;
1706 continue;
1707 }
1708 else {
1709 *p++ = surrogate;
1710 surrogate = 0;
1711 }
1712 }
1713 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1714 /* first surrogate */
1715 surrogate = outCh;
1716 }
1717 else {
1718 *p++ = outCh;
1719 }
1720 }
1721 }
1722 else { /* now leaving a base-64 section */
1723 inShift = 0;
1724 if (base64bits > 0) { /* left-over bits */
1725 if (base64bits >= 6) {
1726 /* We've seen at least one base-64 character */
1727 s++;
1728 errmsg = "partial character in shift sequence";
1729 goto utf7Error;
1730 }
1731 else {
1732 /* Some bits remain; they should be zero */
1733 if (base64buffer != 0) {
1734 s++;
1735 errmsg = "non-zero padding bits in shift sequence";
1736 goto utf7Error;
1737 }
1738 }
1739 }
1740 if (surrogate && DECODE_DIRECT(ch))
1741 *p++ = surrogate;
1742 surrogate = 0;
1743 if (ch == '-') {
1744 /* '-' is absorbed; other terminating
1745 characters are preserved */
1746 s++;
1747 }
1748 }
1749 }
1750 else if ( ch == '+' ) {
1751 startinpos = s-starts;
1752 s++; /* consume '+' */
1753 if (s < e && *s == '-') { /* '+-' encodes '+' */
1754 s++;
1755 *p++ = '+';
1756 }
1757 else { /* begin base64-encoded section */
1758 inShift = 1;
1759 surrogate = 0;
1760 shiftOutStart = p;
1761 base64bits = 0;
1762 base64buffer = 0;
1763 }
1764 }
1765 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1766 *p++ = ch;
1767 s++;
1768 }
1769 else {
1770 startinpos = s-starts;
1771 s++;
1772 errmsg = "unexpected special character";
1773 goto utf7Error;
1774 }
1775 continue;
1776 utf7Error:
1777 outpos = p-PyUnicode_AS_UNICODE(unicode);
1778 endinpos = s-starts;
1779 if (unicode_decode_call_errorhandler(
1780 errors, &errorHandler,
1781 "utf7", errmsg,
1782 starts, size, &startinpos, &endinpos, &exc, &s,
1783 &unicode, &outpos, &p))
1784 goto onError;
1785 }
1786
1787 /* end of string */
1788
1789 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1790 /* if we're in an inconsistent state, that's an error */
1791 inShift = 0;
1792 if (surrogate ||
1793 (base64bits >= 6) ||
1794 (base64bits > 0 && base64buffer != 0)) {
1795 outpos = p-PyUnicode_AS_UNICODE(unicode);
1796 endinpos = size;
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "utf7", "unterminated shift sequence",
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 &unicode, &outpos, &p))
1802 goto onError;
1803 }
1804 }
1805
1806 /* return state */
1807 if (consumed) {
1808 if (inShift) {
1809 p = shiftOutStart; /* back off output */
1810 *consumed = startinpos;
1811 }
1812 else {
1813 *consumed = s-starts;
1814 }
1815 }
1816
1817 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1818 goto onError;
1819
1820 Py_XDECREF(errorHandler);
1821 Py_XDECREF(exc);
1822 return (PyObject *)unicode;
1823
1824 onError:
1825 Py_XDECREF(errorHandler);
1826 Py_XDECREF(exc);
1827 Py_DECREF(unicode);
1828 return NULL;
1829 }
1830
1831
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1833 Py_ssize_t size,
1834 int base64SetO,
1835 int base64WhiteSpace,
1836 const char *errors)
1837 {
1838 PyObject *v;
1839 /* It might be possible to tighten this worst case */
1840 Py_ssize_t allocated = 8 * size;
1841 int inShift = 0;
1842 Py_ssize_t i = 0;
1843 unsigned int base64bits = 0;
1844 unsigned long base64buffer = 0;
1845 char * out;
1846 char * start;
1847
1848 if (allocated / 8 != size)
1849 return PyErr_NoMemory();
1850
1851 if (size == 0)
1852 return PyString_FromStringAndSize(NULL, 0);
1853
1854 v = PyString_FromStringAndSize(NULL, allocated);
1855 if (v == NULL)
1856 return NULL;
1857
1858 start = out = PyString_AS_STRING(v);
1859 for (;i < size; ++i) {
1860 Py_UNICODE ch = s[i];
1861
1862 if (inShift) {
1863 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864 /* shifting out */
1865 if (base64bits) { /* output remaining bits */
1866 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1867 base64buffer = 0;
1868 base64bits = 0;
1869 }
1870 inShift = 0;
1871 /* Characters not in the BASE64 set implicitly unshift the sequence
1872 so no '-' is required, except if the character is itself a '-' */
1873 if (IS_BASE64(ch) || ch == '-') {
1874 *out++ = '-';
1875 }
1876 *out++ = (char) ch;
1877 }
1878 else {
1879 goto encode_char;
1880 }
1881 }
1882 else { /* not in a shift sequence */
1883 if (ch == '+') {
1884 *out++ = '+';
1885 *out++ = '-';
1886 }
1887 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1888 *out++ = (char) ch;
1889 }
1890 else {
1891 *out++ = '+';
1892 inShift = 1;
1893 goto encode_char;
1894 }
1895 }
1896 continue;
1897 encode_char:
1898 #ifdef Py_UNICODE_WIDE
1899 if (ch >= 0x10000) {
1900 /* code first surrogate */
1901 base64bits += 16;
1902 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1903 while (base64bits >= 6) {
1904 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905 base64bits -= 6;
1906 }
1907 /* prepare second surrogate */
1908 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1909 }
1910 #endif
1911 base64bits += 16;
1912 base64buffer = (base64buffer << 16) | ch;
1913 while (base64bits >= 6) {
1914 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1915 base64bits -= 6;
1916 }
1917 }
1918 if (base64bits)
1919 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1920 if (inShift)
1921 *out++ = '-';
1922
1923 if (_PyString_Resize(&v, out - start))
1924 return NULL;
1925 return v;
1926 }
1927
1928 #undef IS_BASE64
1929 #undef FROM_BASE64
1930 #undef TO_BASE64
1931 #undef DECODE_DIRECT
1932 #undef ENCODE_DIRECT
1933
1934 /* --- UTF-8 Codec -------------------------------------------------------- */
1935
/* Map a UTF-8 lead byte to the length of the sequence it starts.  Zero
   means the byte is not a legal lead byte.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1957
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1958 PyObject *PyUnicode_DecodeUTF8(const char *s,
1959 Py_ssize_t size,
1960 const char *errors)
1961 {
1962 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1963 }
1964
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1965 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1966 Py_ssize_t size,
1967 const char *errors,
1968 Py_ssize_t *consumed)
1969 {
1970 const char *starts = s;
1971 int n;
1972 int k;
1973 Py_ssize_t startinpos;
1974 Py_ssize_t endinpos;
1975 Py_ssize_t outpos;
1976 const char *e;
1977 PyUnicodeObject *unicode;
1978 Py_UNICODE *p;
1979 const char *errmsg = "";
1980 PyObject *errorHandler = NULL;
1981 PyObject *exc = NULL;
1982
1983 /* Note: size will always be longer than the resulting Unicode
1984 character count */
1985 unicode = _PyUnicode_New(size);
1986 if (!unicode)
1987 return NULL;
1988 if (size == 0) {
1989 if (consumed)
1990 *consumed = 0;
1991 return (PyObject *)unicode;
1992 }
1993
1994 /* Unpack UTF-8 encoded data */
1995 p = unicode->str;
1996 e = s + size;
1997
1998 while (s < e) {
1999 Py_UCS4 ch = (unsigned char)*s;
2000
2001 if (ch < 0x80) {
2002 *p++ = (Py_UNICODE)ch;
2003 s++;
2004 continue;
2005 }
2006
2007 n = utf8_code_length[ch];
2008
2009 if (s + n > e) {
2010 if (consumed)
2011 break;
2012 else {
2013 errmsg = "unexpected end of data";
2014 startinpos = s-starts;
2015 endinpos = startinpos+1;
2016 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2017 endinpos++;
2018 goto utf8Error;
2019 }
2020 }
2021
2022 switch (n) {
2023
2024 case 0:
2025 errmsg = "invalid start byte";
2026 startinpos = s-starts;
2027 endinpos = startinpos+1;
2028 goto utf8Error;
2029
2030 case 1:
2031 errmsg = "internal error";
2032 startinpos = s-starts;
2033 endinpos = startinpos+1;
2034 goto utf8Error;
2035
2036 case 2:
2037 if ((s[1] & 0xc0) != 0x80) {
2038 errmsg = "invalid continuation byte";
2039 startinpos = s-starts;
2040 endinpos = startinpos + 1;
2041 goto utf8Error;
2042 }
2043 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2044 assert ((ch > 0x007F) && (ch <= 0x07FF));
2045 *p++ = (Py_UNICODE)ch;
2046 break;
2047
2048 case 3:
2049 /* XXX: surrogates shouldn't be valid UTF-8!
2050 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2051 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2052 Uncomment the 2 lines below to make them invalid,
2053 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2054 if ((s[1] & 0xc0) != 0x80 ||
2055 (s[2] & 0xc0) != 0x80 ||
2056 ((unsigned char)s[0] == 0xE0 &&
2057 (unsigned char)s[1] < 0xA0)/* ||
2058 ((unsigned char)s[0] == 0xED &&
2059 (unsigned char)s[1] > 0x9F)*/) {
2060 errmsg = "invalid continuation byte";
2061 startinpos = s-starts;
2062 endinpos = startinpos + 1;
2063
2064 /* if s[1] first two bits are 1 and 0, then the invalid
2065 continuation byte is s[2], so increment endinpos by 1,
2066 if not, s[1] is invalid and endinpos doesn't need to
2067 be incremented. */
2068 if ((s[1] & 0xC0) == 0x80)
2069 endinpos++;
2070 goto utf8Error;
2071 }
2072 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2073 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2074 *p++ = (Py_UNICODE)ch;
2075 break;
2076
2077 case 4:
2078 if ((s[1] & 0xc0) != 0x80 ||
2079 (s[2] & 0xc0) != 0x80 ||
2080 (s[3] & 0xc0) != 0x80 ||
2081 ((unsigned char)s[0] == 0xF0 &&
2082 (unsigned char)s[1] < 0x90) ||
2083 ((unsigned char)s[0] == 0xF4 &&
2084 (unsigned char)s[1] > 0x8F)) {
2085 errmsg = "invalid continuation byte";
2086 startinpos = s-starts;
2087 endinpos = startinpos + 1;
2088 if ((s[1] & 0xC0) == 0x80) {
2089 endinpos++;
2090 if ((s[2] & 0xC0) == 0x80)
2091 endinpos++;
2092 }
2093 goto utf8Error;
2094 }
2095 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2096 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2097 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2098
2099 #ifdef Py_UNICODE_WIDE
2100 *p++ = (Py_UNICODE)ch;
2101 #else
2102 /* compute and append the two surrogates: */
2103
2104 /* translate from 10000..10FFFF to 0..FFFF */
2105 ch -= 0x10000;
2106
2107 /* high surrogate = top 10 bits added to D800 */
2108 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2109
2110 /* low surrogate = bottom 10 bits added to DC00 */
2111 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2112 #endif
2113 break;
2114 }
2115 s += n;
2116 continue;
2117
2118 utf8Error:
2119 outpos = p-PyUnicode_AS_UNICODE(unicode);
2120 if (unicode_decode_call_errorhandler(
2121 errors, &errorHandler,
2122 "utf8", errmsg,
2123 starts, size, &startinpos, &endinpos, &exc, &s,
2124 &unicode, &outpos, &p))
2125 goto onError;
2126 }
2127 if (consumed)
2128 *consumed = s-starts;
2129
2130 /* Adjust length */
2131 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2132 goto onError;
2133
2134 Py_XDECREF(errorHandler);
2135 Py_XDECREF(exc);
2136 return (PyObject *)unicode;
2137
2138 onError:
2139 Py_XDECREF(errorHandler);
2140 Py_XDECREF(exc);
2141 Py_DECREF(unicode);
2142 return NULL;
2143 }
2144
2145 /* Allocation strategy: if the string is short, convert into a stack buffer
2146 and allocate exactly as much space needed at the end. Else allocate the
2147 maximum possible needed (4 result bytes per Unicode character), and return
2148 the excess memory at the end.
2149 */
2150 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2152 Py_ssize_t size,
2153 const char *errors)
2154 {
2155 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2156
2157 Py_ssize_t i; /* index into s of next input byte */
2158 PyObject *v; /* result string object */
2159 char *p; /* next free byte in output buffer */
2160 Py_ssize_t nallocated; /* number of result bytes allocated */
2161 Py_ssize_t nneeded; /* number of result bytes needed */
2162 char stackbuf[MAX_SHORT_UNICHARS * 4];
2163
2164 assert(s != NULL);
2165 assert(size >= 0);
2166
2167 if (size <= MAX_SHORT_UNICHARS) {
2168 /* Write into the stack buffer; nallocated can't overflow.
2169 * At the end, we'll allocate exactly as much heap space as it
2170 * turns out we need.
2171 */
2172 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2173 v = NULL; /* will allocate after we're done */
2174 p = stackbuf;
2175 }
2176 else {
2177 /* Overallocate on the heap, and give the excess back at the end. */
2178 nallocated = size * 4;
2179 if (nallocated / 4 != size) /* overflow! */
2180 return PyErr_NoMemory();
2181 v = PyString_FromStringAndSize(NULL, nallocated);
2182 if (v == NULL)
2183 return NULL;
2184 p = PyString_AS_STRING(v);
2185 }
2186
2187 for (i = 0; i < size;) {
2188 Py_UCS4 ch = s[i++];
2189
2190 if (ch < 0x80)
2191 /* Encode ASCII */
2192 *p++ = (char) ch;
2193
2194 else if (ch < 0x0800) {
2195 /* Encode Latin-1 */
2196 *p++ = (char)(0xc0 | (ch >> 6));
2197 *p++ = (char)(0x80 | (ch & 0x3f));
2198 }
2199 else {
2200 /* Encode UCS2 Unicode ordinals */
2201 if (ch < 0x10000) {
2202 /* Special case: check for high surrogate */
2203 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2204 Py_UCS4 ch2 = s[i];
2205 /* Check for low surrogate and combine the two to
2206 form a UCS4 value */
2207 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2208 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2209 i++;
2210 goto encodeUCS4;
2211 }
2212 /* Fall through: handles isolated high surrogates */
2213 }
2214 *p++ = (char)(0xe0 | (ch >> 12));
2215 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2216 *p++ = (char)(0x80 | (ch & 0x3f));
2217 continue;
2218 }
2219 encodeUCS4:
2220 /* Encode UCS4 Unicode ordinals */
2221 *p++ = (char)(0xf0 | (ch >> 18));
2222 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2223 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2224 *p++ = (char)(0x80 | (ch & 0x3f));
2225 }
2226 }
2227
2228 if (v == NULL) {
2229 /* This was stack allocated. */
2230 nneeded = p - stackbuf;
2231 assert(nneeded <= nallocated);
2232 v = PyString_FromStringAndSize(stackbuf, nneeded);
2233 }
2234 else {
2235 /* Cut back to size actually needed. */
2236 nneeded = p - PyString_AS_STRING(v);
2237 assert(nneeded <= nallocated);
2238 if (_PyString_Resize(&v, nneeded))
2239 return NULL;
2240 }
2241 return v;
2242
2243 #undef MAX_SHORT_UNICHARS
2244 }
2245
/* Return the UTF-8 encoding of the Unicode object `unicode` as a new string
   object.  A non-Unicode argument raises via PyErr_BadArgument() and
   yields NULL. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);

    PyErr_BadArgument();
    return NULL;
}
2256
2257 /* --- UTF-32 Codec ------------------------------------------------------- */
2258
2259 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2260 PyUnicode_DecodeUTF32(const char *s,
2261 Py_ssize_t size,
2262 const char *errors,
2263 int *byteorder)
2264 {
2265 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2266 }
2267
/* Decode `size` bytes of UTF-32 data starting at `s` into a new Unicode
   object.

   errors    - error handling scheme name, passed through to
               unicode_decode_call_errorhandler().
   byteorder - optional in/out.  On entry *byteorder selects the byte
               order: -1 little endian, 1 big endian, 0 native order with
               BOM detection.  On successful return of a non-empty input
               the order actually used is written back.
   consumed  - optional out.  When non-NULL, fewer than 4 trailing bytes
               are not treated as an error: decoding stops in front of them
               and *consumed receives the number of bytes processed.

   Returns the new object, or NULL after an error.  Note that for
   size == 0 the function returns before *byteorder and *consumed are
   written. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
    int pairs = 0;
    const unsigned char *qq;
#else
    const int pairs = 0;
#endif
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
#else
    int iorder[] = {3, 2, 1, 0};
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 4) {
            /* Assemble the first four bytes using the native offsets. */
            const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
                (q[iorder[1]] << 8) | q[iorder[0]];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0x0000FEFF) {
                q += 4;
                bo = -1;
            }
            else if (bom == 0xFFFE0000) {
                q += 4;
                bo = 1;
            }
#else
            if (bom == 0x0000FEFF) {
                q += 4;
                bo = 1;
            }
            else if (bom == 0xFFFE0000) {
                q += 4;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        iorder[0] = 0;
        iorder[1] = 1;
        iorder[2] = 2;
        iorder[3] = 3;
    }
    else if (bo == 1) {
        /* force BE */
        iorder[0] = 3;
        iorder[1] = 2;
        iorder[2] = 1;
        iorder[3] = 0;
    }

    /* On narrow builds we split characters outside the BMP into two
       code points => count how much extra space we need.  Any unit whose
       two most significant bytes are not both zero is counted. */
#ifndef Py_UNICODE_WIDE
    for (qq = q; e - qq >= 4; qq += 4)
        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
            pairs++;
#endif

    /* This might be one too many, because of a BOM */
    unicode = _PyUnicode_New((size+3)/4+pairs);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-32 encoded data */
    p = unicode->str;

    while (q < e) {
        Py_UCS4 ch;
        /* remaining bytes at the end? (size should be divisible by 4) */
        if (e-q<4) {
            /* A stateful caller gets the leftover reported via *consumed. */
            if (consumed)
                break;
            errmsg = "truncated data";
            startinpos = ((const char *)q)-starts;
            endinpos = ((const char *)e)-starts;
            goto utf32Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
            (q[iorder[1]] << 8) | q[iorder[0]];

        if (ch >= 0x110000)
        {
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q)-starts;
            endinpos = startinpos+4;
            goto utf32Error;
        }
#ifndef Py_UNICODE_WIDE
        if (ch >= 0x10000)
        {
            /* Narrow build: store as a surrogate pair. */
            *p++ = 0xD800 | ((ch-0x10000) >> 10);
            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
        }
        else
#endif
            *p++ = ch;
        q += 4;
        continue;
      utf32Error:
        /* The handler receives the addresses of q, unicode, outpos and p
           and may update them; a non-zero result aborts decoding. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf32", errmsg,
                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
                &unicode, &outpos, &p))
            goto onError;
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
2432
2433 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2435 Py_ssize_t size,
2436 const char *errors,
2437 int byteorder)
2438 {
2439 PyObject *v;
2440 unsigned char *p;
2441 Py_ssize_t nsize, bytesize;
2442 #ifndef Py_UNICODE_WIDE
2443 Py_ssize_t i, pairs;
2444 #else
2445 const int pairs = 0;
2446 #endif
2447 /* Offsets from p for storing byte pairs in the right order. */
2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449 int iorder[] = {0, 1, 2, 3};
2450 #else
2451 int iorder[] = {3, 2, 1, 0};
2452 #endif
2453
2454 #define STORECHAR(CH) \
2455 do { \
2456 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2457 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2458 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2459 p[iorder[0]] = (CH) & 0xff; \
2460 p += 4; \
2461 } while(0)
2462
2463 /* In narrow builds we can output surrogate pairs as one code point,
2464 so we need less space. */
2465 #ifndef Py_UNICODE_WIDE
2466 for (i = pairs = 0; i < size-1; i++)
2467 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2468 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2469 pairs++;
2470 #endif
2471 nsize = (size - pairs + (byteorder == 0));
2472 bytesize = nsize * 4;
2473 if (bytesize / 4 != nsize)
2474 return PyErr_NoMemory();
2475 v = PyString_FromStringAndSize(NULL, bytesize);
2476 if (v == NULL)
2477 return NULL;
2478
2479 p = (unsigned char *)PyString_AS_STRING(v);
2480 if (byteorder == 0)
2481 STORECHAR(0xFEFF);
2482 if (size == 0)
2483 return v;
2484
2485 if (byteorder == -1) {
2486 /* force LE */
2487 iorder[0] = 0;
2488 iorder[1] = 1;
2489 iorder[2] = 2;
2490 iorder[3] = 3;
2491 }
2492 else if (byteorder == 1) {
2493 /* force BE */
2494 iorder[0] = 3;
2495 iorder[1] = 2;
2496 iorder[2] = 1;
2497 iorder[3] = 0;
2498 }
2499
2500 while (size-- > 0) {
2501 Py_UCS4 ch = *s++;
2502 #ifndef Py_UNICODE_WIDE
2503 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2504 Py_UCS4 ch2 = *s;
2505 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2506 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2507 s++;
2508 size--;
2509 }
2510 }
2511 #endif
2512 STORECHAR(ch);
2513 }
2514 return v;
2515 #undef STORECHAR
2516 }
2517
/* Return the UTF-32 encoding (native byte order, BOM prepended) of the
   Unicode object `unicode` as a new string object.  A non-Unicode argument
   raises via PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);

    PyErr_BadArgument();
    return NULL;
}
2529
2530 /* --- UTF-16 Codec ------------------------------------------------------- */
2531
2532 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2533 PyUnicode_DecodeUTF16(const char *s,
2534 Py_ssize_t size,
2535 const char *errors,
2536 int *byteorder)
2537 {
2538 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2539 }
2540
/* Decode `size` bytes of UTF-16 data starting at `s` into a new Unicode
   object.

   errors    - error handling scheme name, passed through to
               unicode_decode_call_errorhandler().
   byteorder - optional in/out.  On entry *byteorder selects the byte
               order: -1 little endian, 1 big endian, 0 native order with
               BOM detection.  On successful return of a non-empty input
               the order actually used is written back.
   consumed  - optional out.  When non-NULL, a trailing odd byte or a
               trailing surrogate lacking its partner is not treated as an
               error: decoding stops in front of it and *consumed receives
               the number of bytes processed.

   Returns the new object, or NULL after an error.  Note that for
   size == 0 the function returns before *byteorder and *consumed are
   written. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 2) {
            /* Assemble the first two bytes using the native offsets. */
            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0xFEFF) {
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
#else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }

    while (q < e) {
        Py_UNICODE ch;
        /* remaining bytes at the end? (size should be even) */
        if (e-q<2) {
            /* A stateful caller gets the leftover reported via *consumed. */
            if (consumed)
                break;
            errmsg = "truncated data";
            startinpos = ((const char *)q)-starts;
            endinpos = ((const char *)e)-starts;
            goto utf16Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        ch = (q[ihi] << 8) | q[ilo];

        q += 2;

        /* Anything outside the surrogate range is copied directly. */
        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (e - q < 2) {
            /* Step back so that q points at the unpaired surrogate. */
            q -= 2;
            if (consumed)
                break;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q)-starts;
            endinpos = ((const char *)e)-starts;
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                /* Narrow build: keep the pair as two code units. */
                *p++ = ch;
                *p++ = ch2;
#else
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                /* High surrogate not followed by a low surrogate; the
                   reported range covers the high surrogate only. */
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }

        }
        /* Reached for a low surrogate that is not preceded by a high one. */
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */

      utf16Error:
        /* The handler receives the addresses of q, unicode, outpos and p
           and may update them; a non-zero result aborts decoding. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf16", errmsg,
                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
                &unicode, &outpos, &p))
            goto onError;
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
2709
2710 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2712 Py_ssize_t size,
2713 const char *errors,
2714 int byteorder)
2715 {
2716 PyObject *v;
2717 unsigned char *p;
2718 Py_ssize_t nsize, bytesize;
2719 #ifdef Py_UNICODE_WIDE
2720 Py_ssize_t i, pairs;
2721 #else
2722 const int pairs = 0;
2723 #endif
2724 /* Offsets from p for storing byte pairs in the right order. */
2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2726 int ihi = 1, ilo = 0;
2727 #else
2728 int ihi = 0, ilo = 1;
2729 #endif
2730
2731 #define STORECHAR(CH) \
2732 do { \
2733 p[ihi] = ((CH) >> 8) & 0xff; \
2734 p[ilo] = (CH) & 0xff; \
2735 p += 2; \
2736 } while(0)
2737
2738 #ifdef Py_UNICODE_WIDE
2739 for (i = pairs = 0; i < size; i++)
2740 if (s[i] >= 0x10000)
2741 pairs++;
2742 #endif
2743 /* 2 * (size + pairs + (byteorder == 0)) */
2744 if (size > PY_SSIZE_T_MAX ||
2745 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2746 return PyErr_NoMemory();
2747 nsize = size + pairs + (byteorder == 0);
2748 bytesize = nsize * 2;
2749 if (bytesize / 2 != nsize)
2750 return PyErr_NoMemory();
2751 v = PyString_FromStringAndSize(NULL, bytesize);
2752 if (v == NULL)
2753 return NULL;
2754
2755 p = (unsigned char *)PyString_AS_STRING(v);
2756 if (byteorder == 0)
2757 STORECHAR(0xFEFF);
2758 if (size == 0)
2759 return v;
2760
2761 if (byteorder == -1) {
2762 /* force LE */
2763 ihi = 1;
2764 ilo = 0;
2765 }
2766 else if (byteorder == 1) {
2767 /* force BE */
2768 ihi = 0;
2769 ilo = 1;
2770 }
2771
2772 while (size-- > 0) {
2773 Py_UNICODE ch = *s++;
2774 Py_UNICODE ch2 = 0;
2775 #ifdef Py_UNICODE_WIDE
2776 if (ch >= 0x10000) {
2777 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2778 ch = 0xD800 | ((ch-0x10000) >> 10);
2779 }
2780 #endif
2781 STORECHAR(ch);
2782 if (ch2)
2783 STORECHAR(ch2);
2784 }
2785 return v;
2786 #undef STORECHAR
2787 }
2788
/* Return the UTF-16 encoding (native byte order, BOM prepended) of the
   Unicode object `unicode` as a new string object.  A non-Unicode argument
   raises via PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);

    PyErr_BadArgument();
    return NULL;
}
2800
2801 /* --- Unicode Escape Codec ----------------------------------------------- */
2802
/* Name-lookup C API used for \N{...} escapes.  Starts out NULL and is
   filled in on first use through PyCapsule_Import() of
   PyUnicodeData_CAPSULE_NAME. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2804
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2806 Py_ssize_t size,
2807 const char *errors)
2808 {
2809 const char *starts = s;
2810 Py_ssize_t startinpos;
2811 Py_ssize_t endinpos;
2812 Py_ssize_t outpos;
2813 PyUnicodeObject *v;
2814 Py_UNICODE *p;
2815 const char *end;
2816 char* message;
2817 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2818 PyObject *errorHandler = NULL;
2819 PyObject *exc = NULL;
2820
2821 /* Escaped strings will always be longer than the resulting
2822 Unicode string, so we start with size here and then reduce the
2823 length after conversion to the true value.
2824 (but if the error callback returns a long replacement string
2825 we'll have to allocate more space) */
2826 v = _PyUnicode_New(size);
2827 if (v == NULL)
2828 goto onError;
2829 if (size == 0)
2830 return (PyObject *)v;
2831
2832 p = PyUnicode_AS_UNICODE(v);
2833 end = s + size;
2834
2835 while (s < end) {
2836 unsigned char c;
2837 Py_UNICODE x;
2838 int digits;
2839
2840 /* Non-escape characters are interpreted as Unicode ordinals */
2841 if (*s != '\\') {
2842 *p++ = (unsigned char) *s++;
2843 continue;
2844 }
2845
2846 startinpos = s-starts;
2847 /* \ - Escapes */
2848 s++;
2849 c = *s++;
2850 if (s > end)
2851 c = '\0'; /* Invalid after \ */
2852 switch (c) {
2853
2854 /* \x escapes */
2855 case '\n': break;
2856 case '\\': *p++ = '\\'; break;
2857 case '\'': *p++ = '\''; break;
2858 case '\"': *p++ = '\"'; break;
2859 case 'b': *p++ = '\b'; break;
2860 case 'f': *p++ = '\014'; break; /* FF */
2861 case 't': *p++ = '\t'; break;
2862 case 'n': *p++ = '\n'; break;
2863 case 'r': *p++ = '\r'; break;
2864 case 'v': *p++ = '\013'; break; /* VT */
2865 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2866
2867 /* \OOO (octal) escapes */
2868 case '0': case '1': case '2': case '3':
2869 case '4': case '5': case '6': case '7':
2870 x = s[-1] - '0';
2871 if (s < end && '0' <= *s && *s <= '7') {
2872 x = (x<<3) + *s++ - '0';
2873 if (s < end && '0' <= *s && *s <= '7')
2874 x = (x<<3) + *s++ - '0';
2875 }
2876 *p++ = x;
2877 break;
2878
2879 /* hex escapes */
2880 /* \xXX */
2881 case 'x':
2882 digits = 2;
2883 message = "truncated \\xXX escape";
2884 goto hexescape;
2885
2886 /* \uXXXX */
2887 case 'u':
2888 digits = 4;
2889 message = "truncated \\uXXXX escape";
2890 goto hexescape;
2891
2892 /* \UXXXXXXXX */
2893 case 'U':
2894 digits = 8;
2895 message = "truncated \\UXXXXXXXX escape";
2896 hexescape:
2897 chr = 0;
2898 if (end - s < digits) {
2899 /* count only hex digits */
2900 for (; s < end; ++s) {
2901 c = (unsigned char)*s;
2902 if (!Py_ISXDIGIT(c))
2903 goto error;
2904 }
2905 goto error;
2906 }
2907 for (; digits--; ++s) {
2908 c = (unsigned char)*s;
2909 if (!Py_ISXDIGIT(c))
2910 goto error;
2911 chr = (chr<<4) & ~0xF;
2912 if (c >= '0' && c <= '9')
2913 chr += c - '0';
2914 else if (c >= 'a' && c <= 'f')
2915 chr += 10 + c - 'a';
2916 else
2917 chr += 10 + c - 'A';
2918 }
2919 if (chr == 0xffffffff && PyErr_Occurred())
2920 /* _decoding_error will have already written into the
2921 target buffer. */
2922 break;
2923 store:
2924 /* when we get here, chr is a 32-bit unicode character */
2925 if (chr <= 0xffff)
2926 /* UCS-2 character */
2927 *p++ = (Py_UNICODE) chr;
2928 else if (chr <= 0x10ffff) {
2929 /* UCS-4 character. Either store directly, or as
2930 surrogate pair. */
2931 #ifdef Py_UNICODE_WIDE
2932 *p++ = chr;
2933 #else
2934 chr -= 0x10000L;
2935 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2936 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2937 #endif
2938 } else {
2939 message = "illegal Unicode character";
2940 goto error;
2941 }
2942 break;
2943
2944 /* \N{name} */
2945 case 'N':
2946 message = "malformed \\N character escape";
2947 if (ucnhash_CAPI == NULL) {
2948 /* load the unicode data module */
2949 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2950 if (ucnhash_CAPI == NULL)
2951 goto ucnhashError;
2952 }
2953 if (*s == '{') {
2954 const char *start = s+1;
2955 /* look for the closing brace */
2956 while (*s != '}' && s < end)
2957 s++;
2958 if (s > start && s < end && *s == '}') {
2959 /* found a name. look it up in the unicode database */
2960 message = "unknown Unicode character name";
2961 s++;
2962 if (s - start - 1 <= INT_MAX &&
2963 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2964 goto store;
2965 }
2966 }
2967 goto error;
2968
2969 default:
2970 if (s > end) {
2971 message = "\\ at end of string";
2972 s--;
2973 goto error;
2974 }
2975 else {
2976 *p++ = '\\';
2977 *p++ = (unsigned char)s[-1];
2978 }
2979 break;
2980 }
2981 continue;
2982
2983 error:
2984 endinpos = s-starts;
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 if (unicode_decode_call_errorhandler(
2987 errors, &errorHandler,
2988 "unicodeescape", message,
2989 starts, size, &startinpos, &endinpos, &exc, &s,
2990 &v, &outpos, &p))
2991 goto onError;
2992 continue;
2993 }
2994 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2995 goto onError;
2996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
2998 return (PyObject *)v;
2999
3000 ucnhashError:
3001 PyErr_SetString(
3002 PyExc_UnicodeError,
3003 "\\N escapes not supported (can't load unicodedata module)"
3004 );
3005 Py_XDECREF(v);
3006 Py_XDECREF(errorHandler);
3007 Py_XDECREF(exc);
3008 return NULL;
3009
3010 onError:
3011 Py_XDECREF(v);
3012 Py_XDECREF(errorHandler);
3013 Py_XDECREF(exc);
3014 return NULL;
3015 }
3016
3017 /* Return a Unicode-Escape string version of the Unicode object.
3018
3019 If quotes is true, the string is enclosed in u"" or u'' quotes as
3020 appropriate.
3021
3022 */
3023
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3025 Py_ssize_t size,
3026 Py_UNICODE ch)
3027 {
3028 /* like wcschr, but doesn't stop at NULL characters */
3029
3030 while (size-- > 0) {
3031 if (*s == ch)
3032 return s;
3033 s++;
3034 }
3035
3036 return NULL;
3037 }
3038
3039 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)3040 PyObject *unicodeescape_string(const Py_UNICODE *s,
3041 Py_ssize_t size,
3042 int quotes)
3043 {
3044 PyObject *repr;
3045 char *p;
3046
3047 static const char *hexdigit = "0123456789abcdef";
3048 #ifdef Py_UNICODE_WIDE
3049 const Py_ssize_t expandsize = 10;
3050 #else
3051 const Py_ssize_t expandsize = 6;
3052 #endif
3053
3054 /* XXX(nnorwitz): rather than over-allocating, it would be
3055 better to choose a different scheme. Perhaps scan the
3056 first N-chars of the string and allocate based on that size.
3057 */
3058 /* Initial allocation is based on the longest-possible unichr
3059 escape.
3060
3061 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3062 unichr, so in this case it's the longest unichr escape. In
3063 narrow (UTF-16) builds this is five chars per source unichr
3064 since there are two unichrs in the surrogate pair, so in narrow
3065 (UTF-16) builds it's not the longest unichr escape.
3066
3067 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3068 so in the narrow (UTF-16) build case it's the longest unichr
3069 escape.
3070 */
3071
3072 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3073 return PyErr_NoMemory();
3074
3075 repr = PyString_FromStringAndSize(NULL,
3076 2
3077 + expandsize*size
3078 + 1);
3079 if (repr == NULL)
3080 return NULL;
3081
3082 p = PyString_AS_STRING(repr);
3083
3084 if (quotes) {
3085 *p++ = 'u';
3086 *p++ = (findchar(s, size, '\'') &&
3087 !findchar(s, size, '"')) ? '"' : '\'';
3088 }
3089 while (size-- > 0) {
3090 Py_UNICODE ch = *s++;
3091
3092 /* Escape quotes and backslashes */
3093 if ((quotes &&
3094 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3095 *p++ = '\\';
3096 *p++ = (char) ch;
3097 continue;
3098 }
3099
3100 #ifdef Py_UNICODE_WIDE
3101 /* Map 21-bit characters to '\U00xxxxxx' */
3102 else if (ch >= 0x10000) {
3103 *p++ = '\\';
3104 *p++ = 'U';
3105 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3106 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3107 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3108 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3109 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3110 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3111 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3112 *p++ = hexdigit[ch & 0x0000000F];
3113 continue;
3114 }
3115 #else
3116 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3117 else if (ch >= 0xD800 && ch < 0xDC00) {
3118 Py_UNICODE ch2;
3119 Py_UCS4 ucs;
3120
3121 ch2 = *s++;
3122 size--;
3123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3124 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3125 *p++ = '\\';
3126 *p++ = 'U';
3127 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3128 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3129 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3130 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3131 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3132 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3133 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3134 *p++ = hexdigit[ucs & 0x0000000F];
3135 continue;
3136 }
3137 /* Fall through: isolated surrogates are copied as-is */
3138 s--;
3139 size++;
3140 }
3141 #endif
3142
3143 /* Map 16-bit characters to '\uxxxx' */
3144 if (ch >= 256) {
3145 *p++ = '\\';
3146 *p++ = 'u';
3147 *p++ = hexdigit[(ch >> 12) & 0x000F];
3148 *p++ = hexdigit[(ch >> 8) & 0x000F];
3149 *p++ = hexdigit[(ch >> 4) & 0x000F];
3150 *p++ = hexdigit[ch & 0x000F];
3151 }
3152
3153 /* Map special whitespace to '\t', \n', '\r' */
3154 else if (ch == '\t') {
3155 *p++ = '\\';
3156 *p++ = 't';
3157 }
3158 else if (ch == '\n') {
3159 *p++ = '\\';
3160 *p++ = 'n';
3161 }
3162 else if (ch == '\r') {
3163 *p++ = '\\';
3164 *p++ = 'r';
3165 }
3166
3167 /* Map non-printable US ASCII to '\xhh' */
3168 else if (ch < ' ' || ch >= 0x7F) {
3169 *p++ = '\\';
3170 *p++ = 'x';
3171 *p++ = hexdigit[(ch >> 4) & 0x000F];
3172 *p++ = hexdigit[ch & 0x000F];
3173 }
3174
3175 /* Copy everything else as-is */
3176 else
3177 *p++ = (char) ch;
3178 }
3179 if (quotes)
3180 *p++ = PyString_AS_STRING(repr)[1];
3181
3182 *p = '\0';
3183 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3184 return NULL;
3185 return repr;
3186 }
3187
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3189 Py_ssize_t size)
3190 {
3191 return unicodeescape_string(s, size, 0);
3192 }
3193
/* Return the Unicode-Escape encoding of the Unicode object `unicode` as a
   new string object.  A non-Unicode argument raises via
   PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                             PyUnicode_GET_SIZE(unicode));

    PyErr_BadArgument();
    return NULL;
}
3203
3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3205
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3207 Py_ssize_t size,
3208 const char *errors)
3209 {
3210 const char *starts = s;
3211 Py_ssize_t startinpos;
3212 Py_ssize_t endinpos;
3213 Py_ssize_t outpos;
3214 PyUnicodeObject *v;
3215 Py_UNICODE *p;
3216 const char *end;
3217 const char *bs;
3218 PyObject *errorHandler = NULL;
3219 PyObject *exc = NULL;
3220
3221 /* Escaped strings will always be longer than the resulting
3222 Unicode string, so we start with size here and then reduce the
3223 length after conversion to the true value. (But decoding error
3224 handler might have to resize the string) */
3225 v = _PyUnicode_New(size);
3226 if (v == NULL)
3227 goto onError;
3228 if (size == 0)
3229 return (PyObject *)v;
3230 p = PyUnicode_AS_UNICODE(v);
3231 end = s + size;
3232 while (s < end) {
3233 unsigned char c;
3234 Py_UCS4 x;
3235 int i;
3236 int count;
3237
3238 /* Non-escape characters are interpreted as Unicode ordinals */
3239 if (*s != '\\') {
3240 *p++ = (unsigned char)*s++;
3241 continue;
3242 }
3243 startinpos = s-starts;
3244
3245 /* \u-escapes are only interpreted iff the number of leading
3246 backslashes if odd */
3247 bs = s;
3248 for (;s < end;) {
3249 if (*s != '\\')
3250 break;
3251 *p++ = (unsigned char)*s++;
3252 }
3253 if (((s - bs) & 1) == 0 ||
3254 s >= end ||
3255 (*s != 'u' && *s != 'U')) {
3256 continue;
3257 }
3258 p--;
3259 count = *s=='u' ? 4 : 8;
3260 s++;
3261
3262 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3263 outpos = p-PyUnicode_AS_UNICODE(v);
3264 for (x = 0, i = 0; i < count; ++i, ++s) {
3265 c = (unsigned char)*s;
3266 if (!isxdigit(c)) {
3267 endinpos = s-starts;
3268 if (unicode_decode_call_errorhandler(
3269 errors, &errorHandler,
3270 "rawunicodeescape", "truncated \\uXXXX",
3271 starts, size, &startinpos, &endinpos, &exc, &s,
3272 &v, &outpos, &p))
3273 goto onError;
3274 goto nextByte;
3275 }
3276 x = (x<<4) & ~0xF;
3277 if (c >= '0' && c <= '9')
3278 x += c - '0';
3279 else if (c >= 'a' && c <= 'f')
3280 x += 10 + c - 'a';
3281 else
3282 x += 10 + c - 'A';
3283 }
3284 if (x <= 0xffff)
3285 /* UCS-2 character */
3286 *p++ = (Py_UNICODE) x;
3287 else if (x <= 0x10ffff) {
3288 /* UCS-4 character. Either store directly, or as
3289 surrogate pair. */
3290 #ifdef Py_UNICODE_WIDE
3291 *p++ = (Py_UNICODE) x;
3292 #else
3293 x -= 0x10000L;
3294 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3295 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3296 #endif
3297 } else {
3298 endinpos = s-starts;
3299 outpos = p-PyUnicode_AS_UNICODE(v);
3300 if (unicode_decode_call_errorhandler(
3301 errors, &errorHandler,
3302 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3303 starts, size, &startinpos, &endinpos, &exc, &s,
3304 &v, &outpos, &p))
3305 goto onError;
3306 }
3307 nextByte:
3308 ;
3309 }
3310 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3311 goto onError;
3312 Py_XDECREF(errorHandler);
3313 Py_XDECREF(exc);
3314 return (PyObject *)v;
3315
3316 onError:
3317 Py_XDECREF(v);
3318 Py_XDECREF(errorHandler);
3319 Py_XDECREF(exc);
3320 return NULL;
3321 }
3322
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3324 Py_ssize_t size)
3325 {
3326 PyObject *repr;
3327 char *p;
3328 char *q;
3329
3330 static const char *hexdigit = "0123456789abcdef";
3331 #ifdef Py_UNICODE_WIDE
3332 const Py_ssize_t expandsize = 10;
3333 #else
3334 const Py_ssize_t expandsize = 6;
3335 #endif
3336
3337 if (size > PY_SSIZE_T_MAX / expandsize)
3338 return PyErr_NoMemory();
3339
3340 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3341 if (repr == NULL)
3342 return NULL;
3343 if (size == 0)
3344 return repr;
3345
3346 p = q = PyString_AS_STRING(repr);
3347 while (size-- > 0) {
3348 Py_UNICODE ch = *s++;
3349 #ifdef Py_UNICODE_WIDE
3350 /* Map 32-bit characters to '\Uxxxxxxxx' */
3351 if (ch >= 0x10000) {
3352 *p++ = '\\';
3353 *p++ = 'U';
3354 *p++ = hexdigit[(ch >> 28) & 0xf];
3355 *p++ = hexdigit[(ch >> 24) & 0xf];
3356 *p++ = hexdigit[(ch >> 20) & 0xf];
3357 *p++ = hexdigit[(ch >> 16) & 0xf];
3358 *p++ = hexdigit[(ch >> 12) & 0xf];
3359 *p++ = hexdigit[(ch >> 8) & 0xf];
3360 *p++ = hexdigit[(ch >> 4) & 0xf];
3361 *p++ = hexdigit[ch & 15];
3362 }
3363 else
3364 #else
3365 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3366 if (ch >= 0xD800 && ch < 0xDC00) {
3367 Py_UNICODE ch2;
3368 Py_UCS4 ucs;
3369
3370 ch2 = *s++;
3371 size--;
3372 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3373 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3374 *p++ = '\\';
3375 *p++ = 'U';
3376 *p++ = hexdigit[(ucs >> 28) & 0xf];
3377 *p++ = hexdigit[(ucs >> 24) & 0xf];
3378 *p++ = hexdigit[(ucs >> 20) & 0xf];
3379 *p++ = hexdigit[(ucs >> 16) & 0xf];
3380 *p++ = hexdigit[(ucs >> 12) & 0xf];
3381 *p++ = hexdigit[(ucs >> 8) & 0xf];
3382 *p++ = hexdigit[(ucs >> 4) & 0xf];
3383 *p++ = hexdigit[ucs & 0xf];
3384 continue;
3385 }
3386 /* Fall through: isolated surrogates are copied as-is */
3387 s--;
3388 size++;
3389 }
3390 #endif
3391 /* Map 16-bit characters to '\uxxxx' */
3392 if (ch >= 256) {
3393 *p++ = '\\';
3394 *p++ = 'u';
3395 *p++ = hexdigit[(ch >> 12) & 0xf];
3396 *p++ = hexdigit[(ch >> 8) & 0xf];
3397 *p++ = hexdigit[(ch >> 4) & 0xf];
3398 *p++ = hexdigit[ch & 15];
3399 }
3400 /* Copy everything else as-is */
3401 else
3402 *p++ = (char) ch;
3403 }
3404 *p = '\0';
3405 if (_PyString_Resize(&repr, p - q))
3406 return NULL;
3407 return repr;
3408 }
3409
/* Object-level wrapper around PyUnicode_EncodeRawUnicodeEscape(): checks
   that `unicode` passes PyUnicode_Check() and encodes its buffer.
   Calls PyErr_BadArgument() and returns NULL otherwise. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeRawUnicodeEscape(data, length);
    }

    PyErr_BadArgument();
    return NULL;
}
3419
3420 /* --- Unicode Internal Codec ------------------------------------------- */
3421
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3423 Py_ssize_t size,
3424 const char *errors)
3425 {
3426 const char *starts = s;
3427 Py_ssize_t startinpos;
3428 Py_ssize_t endinpos;
3429 Py_ssize_t outpos;
3430 PyUnicodeObject *v;
3431 Py_UNICODE *p;
3432 const char *end;
3433 const char *reason;
3434 PyObject *errorHandler = NULL;
3435 PyObject *exc = NULL;
3436
3437 #ifdef Py_UNICODE_WIDE
3438 Py_UNICODE unimax = PyUnicode_GetMax();
3439 #endif
3440
3441 /* XXX overflow detection missing */
3442 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3443 if (v == NULL)
3444 goto onError;
3445 if (PyUnicode_GetSize((PyObject *)v) == 0)
3446 return (PyObject *)v;
3447 p = PyUnicode_AS_UNICODE(v);
3448 end = s + size;
3449
3450 while (s < end) {
3451 if (end-s < Py_UNICODE_SIZE) {
3452 endinpos = end-starts;
3453 reason = "truncated input";
3454 goto error;
3455 }
3456 memcpy(p, s, sizeof(Py_UNICODE));
3457 #ifdef Py_UNICODE_WIDE
3458 /* We have to sanity check the raw data, otherwise doom looms for
3459 some malformed UCS-4 data. */
3460 if (*p > unimax || *p < 0) {
3461 endinpos = s - starts + Py_UNICODE_SIZE;
3462 reason = "illegal code point (> 0x10FFFF)";
3463 goto error;
3464 }
3465 #endif
3466 p++;
3467 s += Py_UNICODE_SIZE;
3468 continue;
3469
3470 error:
3471 startinpos = s - starts;
3472 outpos = p - PyUnicode_AS_UNICODE(v);
3473 if (unicode_decode_call_errorhandler(
3474 errors, &errorHandler,
3475 "unicode_internal", reason,
3476 starts, size, &startinpos, &endinpos, &exc, &s,
3477 &v, &outpos, &p)) {
3478 goto onError;
3479 }
3480 }
3481
3482 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3483 goto onError;
3484 Py_XDECREF(errorHandler);
3485 Py_XDECREF(exc);
3486 return (PyObject *)v;
3487
3488 onError:
3489 Py_XDECREF(v);
3490 Py_XDECREF(errorHandler);
3491 Py_XDECREF(exc);
3492 return NULL;
3493 }
3494
3495 /* --- Latin-1 Codec ------------------------------------------------------ */
3496
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3497 PyObject *PyUnicode_DecodeLatin1(const char *s,
3498 Py_ssize_t size,
3499 const char *errors)
3500 {
3501 PyUnicodeObject *v;
3502 Py_UNICODE *p;
3503
3504 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3505 if (size == 1) {
3506 Py_UNICODE r = *(unsigned char*)s;
3507 return PyUnicode_FromUnicode(&r, 1);
3508 }
3509
3510 v = _PyUnicode_New(size);
3511 if (v == NULL)
3512 goto onError;
3513 if (size == 0)
3514 return (PyObject *)v;
3515 p = PyUnicode_AS_UNICODE(v);
3516 while (size-- > 0)
3517 *p++ = (unsigned char)*s++;
3518 return (PyObject *)v;
3519
3520 onError:
3521 Py_XDECREF(v);
3522 return NULL;
3523 }
3524
3525 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3526 static void make_encode_exception(PyObject **exceptionObject,
3527 const char *encoding,
3528 const Py_UNICODE *unicode, Py_ssize_t size,
3529 Py_ssize_t startpos, Py_ssize_t endpos,
3530 const char *reason)
3531 {
3532 if (*exceptionObject == NULL) {
3533 *exceptionObject = PyUnicodeEncodeError_Create(
3534 encoding, unicode, size, startpos, endpos, reason);
3535 }
3536 else {
3537 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3538 goto onError;
3539 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3540 goto onError;
3541 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3542 goto onError;
3543 return;
3544 onError:
3545 Py_CLEAR(*exceptionObject);
3546 }
3547 }
3548
3549 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3550 static void raise_encode_exception(PyObject **exceptionObject,
3551 const char *encoding,
3552 const Py_UNICODE *unicode, Py_ssize_t size,
3553 Py_ssize_t startpos, Py_ssize_t endpos,
3554 const char *reason)
3555 {
3556 make_encode_exception(exceptionObject,
3557 encoding, unicode, size, startpos, endpos, reason);
3558 if (*exceptionObject != NULL)
3559 PyCodec_StrictErrors(*exceptionObject);
3560 }
3561
3562 /* error handling callback helper:
3563 build arguments, call the callback and check the arguments,
3564 put the result into newpos and return the replacement string, which
3565 has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3566 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3567 PyObject **errorHandler,
3568 const char *encoding, const char *reason,
3569 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3570 Py_ssize_t startpos, Py_ssize_t endpos,
3571 Py_ssize_t *newpos)
3572 {
3573 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3574
3575 PyObject *restuple;
3576 PyObject *resunicode;
3577
3578 if (*errorHandler == NULL) {
3579 *errorHandler = PyCodec_LookupError(errors);
3580 if (*errorHandler == NULL)
3581 return NULL;
3582 }
3583
3584 make_encode_exception(exceptionObject,
3585 encoding, unicode, size, startpos, endpos, reason);
3586 if (*exceptionObject == NULL)
3587 return NULL;
3588
3589 restuple = PyObject_CallFunctionObjArgs(
3590 *errorHandler, *exceptionObject, NULL);
3591 if (restuple == NULL)
3592 return NULL;
3593 if (!PyTuple_Check(restuple)) {
3594 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3595 Py_DECREF(restuple);
3596 return NULL;
3597 }
3598 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3599 &resunicode, newpos)) {
3600 Py_DECREF(restuple);
3601 return NULL;
3602 }
3603 if (*newpos<0)
3604 *newpos = size+*newpos;
3605 if (*newpos<0 || *newpos>size) {
3606 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3607 Py_DECREF(restuple);
3608 return NULL;
3609 }
3610 Py_INCREF(resunicode);
3611 Py_DECREF(restuple);
3612 return resunicode;
3613 }
3614
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3616 Py_ssize_t size,
3617 const char *errors,
3618 int limit)
3619 {
3620 /* output object */
3621 PyObject *res;
3622 /* pointers to the beginning and end+1 of input */
3623 const Py_UNICODE *startp = p;
3624 const Py_UNICODE *endp = p + size;
3625 /* pointer to the beginning of the unencodable characters */
3626 /* const Py_UNICODE *badp = NULL; */
3627 /* pointer into the output */
3628 char *str;
3629 /* current output position */
3630 Py_ssize_t respos = 0;
3631 Py_ssize_t ressize;
3632 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3633 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3634 PyObject *errorHandler = NULL;
3635 PyObject *exc = NULL;
3636 /* the following variable is used for caching string comparisons
3637 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3638 int known_errorHandler = -1;
3639
3640 /* allocate enough for a simple encoding without
3641 replacements, if we need more, we'll resize */
3642 res = PyString_FromStringAndSize(NULL, size);
3643 if (res == NULL)
3644 goto onError;
3645 if (size == 0)
3646 return res;
3647 str = PyString_AS_STRING(res);
3648 ressize = size;
3649
3650 while (p<endp) {
3651 Py_UNICODE c = *p;
3652
3653 /* can we encode this? */
3654 if (c<limit) {
3655 /* no overflow check, because we know that the space is enough */
3656 *str++ = (char)c;
3657 ++p;
3658 }
3659 else {
3660 Py_ssize_t unicodepos = p-startp;
3661 Py_ssize_t requiredsize;
3662 PyObject *repunicode;
3663 Py_ssize_t repsize;
3664 Py_ssize_t newpos;
3665 Py_ssize_t respos;
3666 Py_UNICODE *uni2;
3667 /* startpos for collecting unencodable chars */
3668 const Py_UNICODE *collstart = p;
3669 const Py_UNICODE *collend = p;
3670 /* find all unecodable characters */
3671 while ((collend < endp) && ((*collend) >= limit))
3672 ++collend;
3673 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3674 if (known_errorHandler==-1) {
3675 if ((errors==NULL) || (!strcmp(errors, "strict")))
3676 known_errorHandler = 1;
3677 else if (!strcmp(errors, "replace"))
3678 known_errorHandler = 2;
3679 else if (!strcmp(errors, "ignore"))
3680 known_errorHandler = 3;
3681 else if (!strcmp(errors, "xmlcharrefreplace"))
3682 known_errorHandler = 4;
3683 else
3684 known_errorHandler = 0;
3685 }
3686 switch (known_errorHandler) {
3687 case 1: /* strict */
3688 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3689 goto onError;
3690 case 2: /* replace */
3691 while (collstart++ < collend)
3692 *str++ = '?'; /* fall through */
3693 case 3: /* ignore */
3694 p = collend;
3695 break;
3696 case 4: /* xmlcharrefreplace */
3697 respos = str - PyString_AS_STRING(res);
3698 /* determine replacement size (temporarily (mis)uses p) */
3699 requiredsize = respos;
3700 for (p = collstart; p < collend;) {
3701 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3702 Py_ssize_t incr;
3703 if (ch < 10)
3704 incr = 2+1+1;
3705 else if (ch < 100)
3706 incr = 2+2+1;
3707 else if (ch < 1000)
3708 incr = 2+3+1;
3709 else if (ch < 10000)
3710 incr = 2+4+1;
3711 else if (ch < 100000)
3712 incr = 2+5+1;
3713 else if (ch < 1000000)
3714 incr = 2+6+1;
3715 else
3716 incr = 2+7+1;
3717 if (requiredsize > PY_SSIZE_T_MAX - incr)
3718 goto overflow;
3719 requiredsize += incr;
3720 }
3721 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3722 goto overflow;
3723 requiredsize += endp - collend;
3724 if (requiredsize > ressize) {
3725 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3726 requiredsize = 2*ressize;
3727 if (_PyString_Resize(&res, requiredsize))
3728 goto onError;
3729 str = PyString_AS_STRING(res) + respos;
3730 ressize = requiredsize;
3731 }
3732 /* generate replacement (temporarily (mis)uses p) */
3733 for (p = collstart; p < collend;) {
3734 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3735 str += sprintf(str, "&#%d;", (int)ch);
3736 }
3737 p = collend;
3738 break;
3739 default:
3740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741 encoding, reason, startp, size, &exc,
3742 collstart-startp, collend-startp, &newpos);
3743 if (repunicode == NULL)
3744 goto onError;
3745 /* need more space? (at least enough for what we have+the
3746 replacement+the rest of the string, so we won't have to
3747 check space for encodable characters) */
3748 respos = str - PyString_AS_STRING(res);
3749 repsize = PyUnicode_GET_SIZE(repunicode);
3750 if (respos > PY_SSIZE_T_MAX - repsize)
3751 goto overflow;
3752 requiredsize = respos + repsize;
3753 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3754 goto overflow;
3755 requiredsize += endp - collend;
3756 if (requiredsize > ressize) {
3757 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3758 requiredsize = 2*ressize;
3759 if (_PyString_Resize(&res, requiredsize)) {
3760 Py_DECREF(repunicode);
3761 goto onError;
3762 }
3763 str = PyString_AS_STRING(res) + respos;
3764 ressize = requiredsize;
3765 }
3766 /* check if there is anything unencodable in the replacement
3767 and copy it to the output */
3768 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3769 c = *uni2;
3770 if (c >= limit) {
3771 raise_encode_exception(&exc, encoding, startp, size,
3772 unicodepos, unicodepos+1, reason);
3773 Py_DECREF(repunicode);
3774 goto onError;
3775 }
3776 *str = (char)c;
3777 }
3778 p = startp + newpos;
3779 Py_DECREF(repunicode);
3780 }
3781 }
3782 }
3783 /* Resize if we allocated to much */
3784 respos = str - PyString_AS_STRING(res);
3785 if (respos < ressize)
3786 /* If this falls res will be NULL */
3787 _PyString_Resize(&res, respos);
3788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
3790 return res;
3791
3792 overflow:
3793 PyErr_SetString(PyExc_OverflowError,
3794 "encoded result is too long for a Python string");
3795
3796 onError:
3797 Py_XDECREF(res);
3798 Py_XDECREF(errorHandler);
3799 Py_XDECREF(exc);
3800 return NULL;
3801 }
3802
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3804 Py_ssize_t size,
3805 const char *errors)
3806 {
3807 return unicode_encode_ucs1(p, size, errors, 256);
3808 }
3809
/* Object-level wrapper around PyUnicode_EncodeLatin1(): encodes the buffer
   of `unicode` with a NULL `errors` argument.  Calls PyErr_BadArgument()
   and returns NULL if `unicode` does not pass PyUnicode_Check(). */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeLatin1(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3820
3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3822
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3823 PyObject *PyUnicode_DecodeASCII(const char *s,
3824 Py_ssize_t size,
3825 const char *errors)
3826 {
3827 const char *starts = s;
3828 PyUnicodeObject *v;
3829 Py_UNICODE *p;
3830 Py_ssize_t startinpos;
3831 Py_ssize_t endinpos;
3832 Py_ssize_t outpos;
3833 const char *e;
3834 PyObject *errorHandler = NULL;
3835 PyObject *exc = NULL;
3836
3837 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3838 if (size == 1 && *(unsigned char*)s < 128) {
3839 Py_UNICODE r = *(unsigned char*)s;
3840 return PyUnicode_FromUnicode(&r, 1);
3841 }
3842
3843 v = _PyUnicode_New(size);
3844 if (v == NULL)
3845 goto onError;
3846 if (size == 0)
3847 return (PyObject *)v;
3848 p = PyUnicode_AS_UNICODE(v);
3849 e = s + size;
3850 while (s < e) {
3851 register unsigned char c = (unsigned char)*s;
3852 if (c < 128) {
3853 *p++ = c;
3854 ++s;
3855 }
3856 else {
3857 startinpos = s-starts;
3858 endinpos = startinpos + 1;
3859 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3860 if (unicode_decode_call_errorhandler(
3861 errors, &errorHandler,
3862 "ascii", "ordinal not in range(128)",
3863 starts, size, &startinpos, &endinpos, &exc, &s,
3864 &v, &outpos, &p))
3865 goto onError;
3866 }
3867 }
3868 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3869 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3870 goto onError;
3871 Py_XDECREF(errorHandler);
3872 Py_XDECREF(exc);
3873 return (PyObject *)v;
3874
3875 onError:
3876 Py_XDECREF(v);
3877 Py_XDECREF(errorHandler);
3878 Py_XDECREF(exc);
3879 return NULL;
3880 }
3881
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3883 Py_ssize_t size,
3884 const char *errors)
3885 {
3886 return unicode_encode_ucs1(p, size, errors, 128);
3887 }
3888
/* Object-level wrapper around PyUnicode_EncodeASCII(): encodes the buffer
   of `unicode` with a NULL `errors` argument.  Calls PyErr_BadArgument()
   and returns NULL if `unicode` does not pass PyUnicode_Check(). */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeASCII(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3899
3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3901
3902 /* --- MBCS codecs for Windows -------------------------------------------- */
3903
3904 #if SIZEOF_INT < SIZEOF_SIZE_T
3905 #define NEED_RETRY
3906 #endif
3907
3908 /* XXX This code is limited to "true" double-byte encodings, as
3909 a) it assumes an incomplete character consists of a single byte, and
3910 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3911 encodings, see IsDBCSLeadByteEx documentation. */
3912
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.

   The byte itself must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3923
3924 /*
3925 * Decode MBCS string into unicode object. If 'final' is set, converts
3926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3927 */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3928 static int decode_mbcs(PyUnicodeObject **v,
3929 const char *s, /* MBCS string */
3930 int size, /* sizeof MBCS string */
3931 int final)
3932 {
3933 Py_UNICODE *p;
3934 Py_ssize_t n = 0;
3935 int usize = 0;
3936
3937 assert(size >= 0);
3938
3939 /* Skip trailing lead-byte unless 'final' is set */
3940 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3941 --size;
3942
3943 /* First get the size of the result */
3944 if (size > 0) {
3945 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3946 if (usize == 0) {
3947 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3948 return -1;
3949 }
3950 }
3951
3952 if (*v == NULL) {
3953 /* Create unicode object */
3954 *v = _PyUnicode_New(usize);
3955 if (*v == NULL)
3956 return -1;
3957 }
3958 else {
3959 /* Extend unicode object */
3960 n = PyUnicode_GET_SIZE(*v);
3961 if (_PyUnicode_Resize(v, n + usize) < 0)
3962 return -1;
3963 }
3964
3965 /* Do the conversion */
3966 if (size > 0) {
3967 p = PyUnicode_AS_UNICODE(*v) + n;
3968 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3969 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3970 return -1;
3971 }
3972 }
3973
3974 return size;
3975 }
3976
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)3977 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3978 Py_ssize_t size,
3979 const char *errors,
3980 Py_ssize_t *consumed)
3981 {
3982 PyUnicodeObject *v = NULL;
3983 int done;
3984
3985 if (consumed)
3986 *consumed = 0;
3987
3988 #ifdef NEED_RETRY
3989 retry:
3990 if (size > INT_MAX)
3991 done = decode_mbcs(&v, s, INT_MAX, 0);
3992 else
3993 #endif
3994 done = decode_mbcs(&v, s, (int)size, !consumed);
3995
3996 if (done < 0) {
3997 Py_XDECREF(v);
3998 return NULL;
3999 }
4000
4001 if (consumed)
4002 *consumed += done;
4003
4004 #ifdef NEED_RETRY
4005 if (size > INT_MAX) {
4006 s += done;
4007 size -= done;
4008 goto retry;
4009 }
4010 #endif
4011
4012 return (PyObject *)v;
4013 }
4014
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)4015 PyObject *PyUnicode_DecodeMBCS(const char *s,
4016 Py_ssize_t size,
4017 const char *errors)
4018 {
4019 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4020 }
4021
4022 /*
4023 * Convert unicode into string object (MBCS).
4024 * Returns 0 if succeed, -1 otherwise.
4025 */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)4026 static int encode_mbcs(PyObject **repr,
4027 const Py_UNICODE *p, /* unicode */
4028 int size) /* size of unicode */
4029 {
4030 int mbcssize = 0;
4031 Py_ssize_t n = 0;
4032
4033 assert(size >= 0);
4034
4035 /* First get the size of the result */
4036 if (size > 0) {
4037 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4038 if (mbcssize == 0) {
4039 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4040 return -1;
4041 }
4042 }
4043
4044 if (*repr == NULL) {
4045 /* Create string object */
4046 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4047 if (*repr == NULL)
4048 return -1;
4049 }
4050 else {
4051 /* Extend string object */
4052 n = PyString_Size(*repr);
4053 if (_PyString_Resize(repr, n + mbcssize) < 0)
4054 return -1;
4055 }
4056
4057 /* Do the conversion */
4058 if (size > 0) {
4059 char *s = PyString_AS_STRING(*repr) + n;
4060 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4061 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4062 return -1;
4063 }
4064 }
4065
4066 return 0;
4067 }
4068
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)4069 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4070 Py_ssize_t size,
4071 const char *errors)
4072 {
4073 PyObject *repr = NULL;
4074 int ret;
4075
4076 #ifdef NEED_RETRY
4077 retry:
4078 if (size > INT_MAX)
4079 ret = encode_mbcs(&repr, p, INT_MAX);
4080 else
4081 #endif
4082 ret = encode_mbcs(&repr, p, (int)size);
4083
4084 if (ret < 0) {
4085 Py_XDECREF(repr);
4086 return NULL;
4087 }
4088
4089 #ifdef NEED_RETRY
4090 if (size > INT_MAX) {
4091 p += INT_MAX;
4092 size -= INT_MAX;
4093 goto retry;
4094 }
4095 #endif
4096
4097 return repr;
4098 }
4099
/* Object-level wrapper around PyUnicode_EncodeMBCS(): encodes the buffer
   of `unicode` with a NULL `errors` argument.  Calls PyErr_BadArgument()
   and returns NULL if `unicode` does not pass PyUnicode_Check(). */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeMBCS(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
4110
4111 #undef NEED_RETRY
4112
4113 #endif /* MS_WINDOWS */
4114
4115 /* --- Character Mapping Codec -------------------------------------------- */
4116
/* Decode `size` bytes at `s` using a character mapping.

   `mapping` may be:
     - NULL: the input is decoded as Latin-1;
     - an exact unicode object: byte b maps to mapping[b]; bytes beyond the
       end of the string, or mapped to U+FFFE, are undefined;
     - any other object supporting item lookup by integer: the looked-up
       value may be an integer code point, a unicode string (of any
       length, including empty), or None.  A LookupError, None, or the
       value 0xFFFE means the byte is undefined.

   Undefined bytes are reported ("character maps to <undefined>") through
   the error handler selected by `errors`.

   Returns the decoded unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots beyond one per remaining input byte */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a decoding table string */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    goto Undefined;
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (x == Py_None)
                goto Undefined;
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value == 0xFFFE)
                    goto Undefined;
                if (value < 0 || value > 0x10FFFF) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(0x110000)");
                    Py_DECREF(x);
                    goto onError;
                }

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;
                        /* refuse to grow past PY_SSIZE_T_MAX */
                        if (PyUnicode_GET_SIZE(v) > PY_SSIZE_T_MAX - needed) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    /* emit a surrogate pair */
                    value -= 0x10000;
                    *p++ = 0xD800 | (value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
                    *p++ = (Py_UNICODE)value;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1) {
                    /* 1-1 mapping */
                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
                    if (value == 0xFFFE)
                        goto Undefined;
                    *p++ = value;
                }
                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed;
                        /* needed is at most 5*targetsize (extrachars is
                           never negative), so this single test guards both
                           the computation of needed and the new size. */
                        if (targetsize >
                            (PY_SSIZE_T_MAX - PyUnicode_GET_SIZE(v)) / 5) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        needed = (targetsize - extrachars) + 4 * targetsize;
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
            continue;
          Undefined:
            /* undefined mapping; x is NULL when the lookup itself failed */
            Py_XDECREF(x);
            outpos = p-PyUnicode_AS_UNICODE(v);
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p)) {
                goto onError;
            }
        }
    }
    /* Shrink if fewer characters were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4305
4306 /* Charmap encoding: the lookup table */
4307
4308 struct encoding_map{
4309 PyObject_HEAD
4310 unsigned char level1[32];
4311 int count2, count3;
4312 unsigned char level23[1];
4313 };
4314
4315 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4316 encoding_map_size(PyObject *obj, PyObject* args)
4317 {
4318 struct encoding_map *map = (struct encoding_map*)obj;
4319 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4320 128*map->count3);
4321 }
4322
4323 static PyMethodDef encoding_map_methods[] = {
4324 {"size", encoding_map_size, METH_NOARGS,
4325 PyDoc_STR("Return the size (in bytes) of this object") },
4326 { 0 }
4327 };
4328
4329 static void
encoding_map_dealloc(PyObject * o)4330 encoding_map_dealloc(PyObject* o)
4331 {
4332 PyObject_FREE(o);
4333 }
4334
4335 static PyTypeObject EncodingMapType = {
4336 PyVarObject_HEAD_INIT(NULL, 0)
4337 "EncodingMap", /*tp_name*/
4338 sizeof(struct encoding_map), /*tp_basicsize*/
4339 0, /*tp_itemsize*/
4340 /* methods */
4341 encoding_map_dealloc, /*tp_dealloc*/
4342 0, /*tp_print*/
4343 0, /*tp_getattr*/
4344 0, /*tp_setattr*/
4345 0, /*tp_compare*/
4346 0, /*tp_repr*/
4347 0, /*tp_as_number*/
4348 0, /*tp_as_sequence*/
4349 0, /*tp_as_mapping*/
4350 0, /*tp_hash*/
4351 0, /*tp_call*/
4352 0, /*tp_str*/
4353 0, /*tp_getattro*/
4354 0, /*tp_setattro*/
4355 0, /*tp_as_buffer*/
4356 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4357 0, /*tp_doc*/
4358 0, /*tp_traverse*/
4359 0, /*tp_clear*/
4360 0, /*tp_richcompare*/
4361 0, /*tp_weaklistoffset*/
4362 0, /*tp_iter*/
4363 0, /*tp_iternext*/
4364 encoding_map_methods, /*tp_methods*/
4365 0, /*tp_members*/
4366 0, /*tp_getset*/
4367 0, /*tp_base*/
4368 0, /*tp_dict*/
4369 0, /*tp_descr_get*/
4370 0, /*tp_descr_set*/
4371 0, /*tp_dictoffset*/
4372 0, /*tp_init*/
4373 0, /*tp_alloc*/
4374 0, /*tp_new*/
4375 0, /*tp_free*/
4376 0, /*tp_is_gc*/
4377 };
4378
4379 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4380 PyUnicode_BuildEncodingMap(PyObject* string)
4381 {
4382 Py_UNICODE *decode;
4383 PyObject *result;
4384 struct encoding_map *mresult;
4385 int i;
4386 int need_dict = 0;
4387 unsigned char level1[32];
4388 unsigned char level2[512];
4389 unsigned char *mlevel1, *mlevel2, *mlevel3;
4390 int count2 = 0, count3 = 0;
4391
4392 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4393 PyErr_BadArgument();
4394 return NULL;
4395 }
4396 decode = PyUnicode_AS_UNICODE(string);
4397 memset(level1, 0xFF, sizeof level1);
4398 memset(level2, 0xFF, sizeof level2);
4399
4400 /* If there isn't a one-to-one mapping of NULL to \0,
4401 or if there are non-BMP characters, we need to use
4402 a mapping dictionary. */
4403 if (decode[0] != 0)
4404 need_dict = 1;
4405 for (i = 1; i < 256; i++) {
4406 int l1, l2;
4407 if (decode[i] == 0
4408 #ifdef Py_UNICODE_WIDE
4409 || decode[i] > 0xFFFF
4410 #endif
4411 ) {
4412 need_dict = 1;
4413 break;
4414 }
4415 if (decode[i] == 0xFFFE)
4416 /* unmapped character */
4417 continue;
4418 l1 = decode[i] >> 11;
4419 l2 = decode[i] >> 7;
4420 if (level1[l1] == 0xFF)
4421 level1[l1] = count2++;
4422 if (level2[l2] == 0xFF)
4423 level2[l2] = count3++;
4424 }
4425
4426 if (count2 >= 0xFF || count3 >= 0xFF)
4427 need_dict = 1;
4428
4429 if (need_dict) {
4430 PyObject *result = PyDict_New();
4431 PyObject *key, *value;
4432 if (!result)
4433 return NULL;
4434 for (i = 0; i < 256; i++) {
4435 value = NULL;
4436 key = PyInt_FromLong(decode[i]);
4437 value = PyInt_FromLong(i);
4438 if (!key || !value)
4439 goto failed1;
4440 if (PyDict_SetItem(result, key, value) == -1)
4441 goto failed1;
4442 Py_DECREF(key);
4443 Py_DECREF(value);
4444 }
4445 return result;
4446 failed1:
4447 Py_XDECREF(key);
4448 Py_XDECREF(value);
4449 Py_DECREF(result);
4450 return NULL;
4451 }
4452
4453 /* Create a three-level trie */
4454 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4455 16*count2 + 128*count3 - 1);
4456 if (!result)
4457 return PyErr_NoMemory();
4458 PyObject_Init(result, &EncodingMapType);
4459 mresult = (struct encoding_map*)result;
4460 mresult->count2 = count2;
4461 mresult->count3 = count3;
4462 mlevel1 = mresult->level1;
4463 mlevel2 = mresult->level23;
4464 mlevel3 = mresult->level23 + 16*count2;
4465 memcpy(mlevel1, level1, 32);
4466 memset(mlevel2, 0xFF, 16*count2);
4467 memset(mlevel3, 0, 128*count3);
4468 count3 = 0;
4469 for (i = 1; i < 256; i++) {
4470 int o1, o2, o3, i2, i3;
4471 if (decode[i] == 0xFFFE)
4472 /* unmapped character */
4473 continue;
4474 o1 = decode[i]>>11;
4475 o2 = (decode[i]>>7) & 0xF;
4476 i2 = 16*mlevel1[o1] + o2;
4477 if (mlevel2[i2] == 0xFF)
4478 mlevel2[i2] = count3++;
4479 o3 = decode[i] & 0x7F;
4480 i3 = 128*mlevel2[i2] + o3;
4481 mlevel3[i3] = i;
4482 }
4483 return result;
4484 }
4485
4486 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4488 {
4489 struct encoding_map *map = (struct encoding_map*)mapping;
4490 int l1 = c>>11;
4491 int l2 = (c>>7) & 0xF;
4492 int l3 = c & 0x7F;
4493 int i;
4494
4495 #ifdef Py_UNICODE_WIDE
4496 if (c > 0xFFFF) {
4497 return -1;
4498 }
4499 #endif
4500 if (c == 0)
4501 return 0;
4502 /* level 1*/
4503 i = map->level1[l1];
4504 if (i == 0xFF) {
4505 return -1;
4506 }
4507 /* level 2*/
4508 i = map->level23[16*i+l2];
4509 if (i == 0xFF) {
4510 return -1;
4511 }
4512 /* level 3 */
4513 i = map->level23[16*map->count2 + 128*i + l3];
4514 if (i == 0) {
4515 return -1;
4516 }
4517 return i;
4518 }
4519
4520 /* Lookup the character ch in the mapping. If the character
4521 can't be found, Py_None is returned (or NULL, if another
4522 error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4524 {
4525 PyObject *w = PyInt_FromLong((long)c);
4526 PyObject *x;
4527
4528 if (w == NULL)
4529 return NULL;
4530 x = PyObject_GetItem(mapping, w);
4531 Py_DECREF(w);
4532 if (x == NULL) {
4533 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4534 /* No mapping found means: mapping is undefined. */
4535 PyErr_Clear();
4536 x = Py_None;
4537 Py_INCREF(x);
4538 return x;
4539 } else
4540 return NULL;
4541 }
4542 else if (x == Py_None)
4543 return x;
4544 else if (PyInt_Check(x)) {
4545 long value = PyInt_AS_LONG(x);
4546 if (value < 0 || value > 255) {
4547 PyErr_SetString(PyExc_TypeError,
4548 "character mapping must be in range(256)");
4549 Py_DECREF(x);
4550 return NULL;
4551 }
4552 return x;
4553 }
4554 else if (PyString_Check(x))
4555 return x;
4556 else {
4557 /* wrong return value */
4558 PyErr_SetString(PyExc_TypeError,
4559 "character mapping must return integer, None or str");
4560 Py_DECREF(x);
4561 return NULL;
4562 }
4563 }
4564
4565 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4567 {
4568 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4569 /* exponentially overallocate to minimize reallocations */
4570 if (requiredsize < 2*outsize)
4571 requiredsize = 2*outsize;
4572 if (_PyString_Resize(outobj, requiredsize)) {
4573 return 0;
4574 }
4575 return 1;
4576 }
4577
/* Result codes for the charmap encoder's per-character output step. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
4581 /* lookup the character, put the result in the output string and adjust
4582 various state variables. Reallocate the output string if not enough
4583 space is available. Return a new reference to the object that
4584 was put in the output buffer, or Py_None, if the mapping was undefined
4585 (in which case no character was written) or NULL, if a
4586 reallocation error occurred. The caller must decref the result */
4587 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4589 PyObject **outobj, Py_ssize_t *outpos)
4590 {
4591 PyObject *rep;
4592 char *outstart;
4593 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4594
4595 if (Py_TYPE(mapping) == &EncodingMapType) {
4596 int res = encoding_map_lookup(c, mapping);
4597 Py_ssize_t requiredsize = *outpos+1;
4598 if (res == -1)
4599 return enc_FAILED;
4600 if (outsize<requiredsize)
4601 if (!charmapencode_resize(outobj, outpos, requiredsize))
4602 return enc_EXCEPTION;
4603 outstart = PyString_AS_STRING(*outobj);
4604 outstart[(*outpos)++] = (char)res;
4605 return enc_SUCCESS;
4606 }
4607
4608 rep = charmapencode_lookup(c, mapping);
4609 if (rep==NULL)
4610 return enc_EXCEPTION;
4611 else if (rep==Py_None) {
4612 Py_DECREF(rep);
4613 return enc_FAILED;
4614 } else {
4615 if (PyInt_Check(rep)) {
4616 Py_ssize_t requiredsize = *outpos+1;
4617 if (outsize<requiredsize)
4618 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4619 Py_DECREF(rep);
4620 return enc_EXCEPTION;
4621 }
4622 outstart = PyString_AS_STRING(*outobj);
4623 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4624 }
4625 else {
4626 const char *repchars = PyString_AS_STRING(rep);
4627 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4628 Py_ssize_t requiredsize = *outpos+repsize;
4629 if (outsize<requiredsize)
4630 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4631 Py_DECREF(rep);
4632 return enc_EXCEPTION;
4633 }
4634 outstart = PyString_AS_STRING(*outobj);
4635 memcpy(outstart + *outpos, repchars, repsize);
4636 *outpos += repsize;
4637 }
4638 }
4639 Py_DECREF(rep);
4640 return enc_SUCCESS;
4641 }
4642
4643 /* handle an error in PyUnicode_EncodeCharmap
4644 Return 0 on success, -1 on error */
4645 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4646 int charmap_encoding_error(
4647 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4648 PyObject **exceptionObject,
4649 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4650 PyObject **res, Py_ssize_t *respos)
4651 {
4652 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4653 Py_ssize_t repsize;
4654 Py_ssize_t newpos;
4655 Py_UNICODE *uni2;
4656 /* startpos for collecting unencodable chars */
4657 Py_ssize_t collstartpos = *inpos;
4658 Py_ssize_t collendpos = *inpos+1;
4659 Py_ssize_t collpos;
4660 char *encoding = "charmap";
4661 char *reason = "character maps to <undefined>";
4662 charmapencode_result x;
4663
4664 /* find all unencodable characters */
4665 while (collendpos < size) {
4666 PyObject *rep;
4667 if (Py_TYPE(mapping) == &EncodingMapType) {
4668 int res = encoding_map_lookup(p[collendpos], mapping);
4669 if (res != -1)
4670 break;
4671 ++collendpos;
4672 continue;
4673 }
4674
4675 rep = charmapencode_lookup(p[collendpos], mapping);
4676 if (rep==NULL)
4677 return -1;
4678 else if (rep!=Py_None) {
4679 Py_DECREF(rep);
4680 break;
4681 }
4682 Py_DECREF(rep);
4683 ++collendpos;
4684 }
4685 /* cache callback name lookup
4686 * (if not done yet, i.e. it's the first error) */
4687 if (*known_errorHandler==-1) {
4688 if ((errors==NULL) || (!strcmp(errors, "strict")))
4689 *known_errorHandler = 1;
4690 else if (!strcmp(errors, "replace"))
4691 *known_errorHandler = 2;
4692 else if (!strcmp(errors, "ignore"))
4693 *known_errorHandler = 3;
4694 else if (!strcmp(errors, "xmlcharrefreplace"))
4695 *known_errorHandler = 4;
4696 else
4697 *known_errorHandler = 0;
4698 }
4699 switch (*known_errorHandler) {
4700 case 1: /* strict */
4701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702 return -1;
4703 case 2: /* replace */
4704 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4705 x = charmapencode_output('?', mapping, res, respos);
4706 if (x==enc_EXCEPTION) {
4707 return -1;
4708 }
4709 else if (x==enc_FAILED) {
4710 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4711 return -1;
4712 }
4713 }
4714 /* fall through */
4715 case 3: /* ignore */
4716 *inpos = collendpos;
4717 break;
4718 case 4: /* xmlcharrefreplace */
4719 /* generate replacement */
4720 for (collpos = collstartpos; collpos < collendpos;) {
4721 char buffer[2+29+1+1];
4722 char *cp;
4723 Py_UCS4 ch = p[collpos++];
4724 #ifndef Py_UNICODE_WIDE
4725 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4726 (collpos < collendpos) &&
4727 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4728 ch = ((((ch & 0x03FF) << 10) |
4729 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4730 }
4731 #endif
4732 sprintf(buffer, "&#%d;", (int)ch);
4733 for (cp = buffer; *cp; ++cp) {
4734 x = charmapencode_output(*cp, mapping, res, respos);
4735 if (x==enc_EXCEPTION)
4736 return -1;
4737 else if (x==enc_FAILED) {
4738 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739 return -1;
4740 }
4741 }
4742 }
4743 *inpos = collendpos;
4744 break;
4745 default:
4746 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4747 encoding, reason, p, size, exceptionObject,
4748 collstartpos, collendpos, &newpos);
4749 if (repunicode == NULL)
4750 return -1;
4751 /* generate replacement */
4752 repsize = PyUnicode_GET_SIZE(repunicode);
4753 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4754 x = charmapencode_output(*uni2, mapping, res, respos);
4755 if (x==enc_EXCEPTION) {
4756 return -1;
4757 }
4758 else if (x==enc_FAILED) {
4759 Py_DECREF(repunicode);
4760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4761 return -1;
4762 }
4763 }
4764 *inpos = newpos;
4765 Py_DECREF(repunicode);
4766 }
4767 return 0;
4768 }
4769
/* Encode the size characters starting at p through mapping and return
   the result as a new str object, or NULL on error.

   A NULL mapping delegates to PyUnicode_EncodeLatin1().  Characters
   the mapping cannot encode are passed to charmap_encoding_error(),
   which applies the error handling selected by errors. */
PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                        Py_ssize_t size,
                        PyObject *mapping,
                        const char *errors)
{
    PyObject *res = NULL;           /* output object */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_ssize_t inpos = 0;           /* current input position */
    Py_ssize_t respos = 0;          /* current output position */
    /* caches the string comparison on errors:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    /* start with one output byte per input character; the helpers
       grow the string when a replacement needs more */
    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;

    while (inpos < size) {
        switch (charmapencode_output(p[inpos], mapping, &res, &respos)) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(p, size, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos))
                goto onError;
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Resize if we allocated too much */
    if (respos < PyString_GET_SIZE(res) && _PyString_Resize(&res, respos))
        goto onError;

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4833
/* Encode a unicode object through mapping with the default (NULL)
   error handling.  Fails via PyErr_BadArgument() and returns NULL if
   unicode is not a unicode object or mapping is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    const Py_UNICODE *chars;
    Py_ssize_t nchars;

    if (!PyUnicode_Check(unicode) || mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }
    chars = PyUnicode_AS_UNICODE(unicode);
    nchars = PyUnicode_GET_SIZE(unicode);
    return PyUnicode_EncodeCharmap(chars, nchars, mapping, NULL);
}
4846
4847 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4848 static void make_translate_exception(PyObject **exceptionObject,
4849 const Py_UNICODE *unicode, Py_ssize_t size,
4850 Py_ssize_t startpos, Py_ssize_t endpos,
4851 const char *reason)
4852 {
4853 if (*exceptionObject == NULL) {
4854 *exceptionObject = PyUnicodeTranslateError_Create(
4855 unicode, size, startpos, endpos, reason);
4856 }
4857 else {
4858 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4859 goto onError;
4860 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4861 goto onError;
4862 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4863 goto onError;
4864 return;
4865 onError:
4866 Py_CLEAR(*exceptionObject);
4867 }
4868 }
4869
4870 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4871 static void raise_translate_exception(PyObject **exceptionObject,
4872 const Py_UNICODE *unicode, Py_ssize_t size,
4873 Py_ssize_t startpos, Py_ssize_t endpos,
4874 const char *reason)
4875 {
4876 make_translate_exception(exceptionObject,
4877 unicode, size, startpos, endpos, reason);
4878 if (*exceptionObject != NULL)
4879 PyCodec_StrictErrors(*exceptionObject);
4880 }
4881
4882 /* error handling callback helper:
4883 build arguments, call the callback and check the arguments,
4884 put the result into newpos and return the replacement string, which
4885 has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4886 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4887 PyObject **errorHandler,
4888 const char *reason,
4889 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4890 Py_ssize_t startpos, Py_ssize_t endpos,
4891 Py_ssize_t *newpos)
4892 {
4893 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4894
4895 Py_ssize_t i_newpos;
4896 PyObject *restuple;
4897 PyObject *resunicode;
4898
4899 if (*errorHandler == NULL) {
4900 *errorHandler = PyCodec_LookupError(errors);
4901 if (*errorHandler == NULL)
4902 return NULL;
4903 }
4904
4905 make_translate_exception(exceptionObject,
4906 unicode, size, startpos, endpos, reason);
4907 if (*exceptionObject == NULL)
4908 return NULL;
4909
4910 restuple = PyObject_CallFunctionObjArgs(
4911 *errorHandler, *exceptionObject, NULL);
4912 if (restuple == NULL)
4913 return NULL;
4914 if (!PyTuple_Check(restuple)) {
4915 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4916 Py_DECREF(restuple);
4917 return NULL;
4918 }
4919 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4920 &resunicode, &i_newpos)) {
4921 Py_DECREF(restuple);
4922 return NULL;
4923 }
4924 if (i_newpos<0)
4925 *newpos = size+i_newpos;
4926 else
4927 *newpos = i_newpos;
4928 if (*newpos<0 || *newpos>size) {
4929 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4930 Py_DECREF(restuple);
4931 return NULL;
4932 }
4933 Py_INCREF(resunicode);
4934 Py_DECREF(restuple);
4935 return resunicode;
4936 }
4937
4938 /* Lookup the character ch in the mapping and put the result in result,
4939 which must be decrefed by the caller.
4940 Return 0 on success, -1 on error */
4941 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4943 {
4944 PyObject *w = PyInt_FromLong((long)c);
4945 PyObject *x;
4946
4947 if (w == NULL)
4948 return -1;
4949 x = PyObject_GetItem(mapping, w);
4950 Py_DECREF(w);
4951 if (x == NULL) {
4952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4953 /* No mapping found means: use 1:1 mapping. */
4954 PyErr_Clear();
4955 *result = NULL;
4956 return 0;
4957 } else
4958 return -1;
4959 }
4960 else if (x == Py_None) {
4961 *result = x;
4962 return 0;
4963 }
4964 else if (PyInt_Check(x)) {
4965 long value = PyInt_AS_LONG(x);
4966 long max = PyUnicode_GetMax();
4967 if (value < 0 || value > max) {
4968 PyErr_Format(PyExc_TypeError,
4969 "character mapping must be in range(0x%lx)", max+1);
4970 Py_DECREF(x);
4971 return -1;
4972 }
4973 *result = x;
4974 return 0;
4975 }
4976 else if (PyUnicode_Check(x)) {
4977 *result = x;
4978 return 0;
4979 }
4980 else {
4981 /* wrong return value */
4982 PyErr_SetString(PyExc_TypeError,
4983 "character mapping must return integer, None or unicode");
4984 Py_DECREF(x);
4985 return -1;
4986 }
4987 }
4988 /* ensure that *outobj is at least requiredsize characters long,
4989 if not reallocate and adjust various state variables.
4990 Return 0 on success, -1 on error */
4991 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4993 Py_ssize_t requiredsize)
4994 {
4995 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4996 if (requiredsize > oldsize) {
4997 /* remember old output position */
4998 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4999 /* exponentially overallocate to minimize reallocations */
5000 if (requiredsize < 2 * oldsize)
5001 requiredsize = 2 * oldsize;
5002 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5003 return -1;
5004 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5005 }
5006 return 0;
5007 }
5008 /* lookup the character, put the result in the output string and adjust
5009 various state variables. Return a new reference to the object that
5010 was put in the output buffer in *result, or Py_None, if the mapping was
5011 undefined (in which case no character was written).
5012 The called must decref result.
5013 Return 0 on success, -1 on error. */
5014 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5016 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5017 PyObject **res)
5018 {
5019 if (charmaptranslate_lookup(*curinp, mapping, res))
5020 return -1;
5021 if (*res==NULL) {
5022 /* not found => default to 1:1 mapping */
5023 *(*outp)++ = *curinp;
5024 }
5025 else if (*res==Py_None)
5026 ;
5027 else if (PyInt_Check(*res)) {
5028 /* no overflow check, because we know that the space is enough */
5029 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5030 }
5031 else if (PyUnicode_Check(*res)) {
5032 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5033 if (repsize==1) {
5034 /* no overflow check, because we know that the space is enough */
5035 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5036 }
5037 else if (repsize!=0) {
5038 /* more than one character */
5039 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5040 (insize - (curinp-startinp)) +
5041 repsize - 1;
5042 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5043 return -1;
5044 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5045 *outp += repsize;
5046 }
5047 }
5048 else
5049 return -1;
5050 return 0;
5051 }
5052
/* Translate the size characters starting at p through mapping and
   return the result as a new unicode object, or NULL on error.

   For each character the mapping may yield an int (replacement
   character), a unicode string (replacement text, possibly empty),
   or raise LookupError (character is copied unchanged).  A mapping
   result of None marks the character as untranslatable; runs of such
   characters are handled according to errors:

     strict (or errors == NULL): raise UnicodeTranslateError
     replace:            one '?' per character
     ignore:             drop the characters
     xmlcharrefreplace:  "&#NNN;" per code point
     anything else:      call the registered error callback

   Fails via PyErr_BadArgument() if mapping is NULL.

   Invariant: the output buffer always has at least one free slot for
   every input character still to be processed, because the 1:1 writes
   in charmaptranslate_output() are unchecked. */
PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                           Py_ssize_t size,
                           PyObject *mapping,
                           const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        Py_XDECREF(x);
        /* x is only compared by address from here on */
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend;) {
                    char buffer[2+29+1+1];
                    char *cp;
                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
                    sprintf(buffer, "&#%d;", (int)ch);
                    /* room for this reference plus the input after the run */
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement */
                repsize = PyUnicode_GET_SIZE(repunicode);
                /* Translation resumes at startp + newpos, which the
                   callback is free to place before collend.  Reserve
                   room for the replacement plus everything from the
                   resume position on, so the unchecked 1:1 writes that
                   follow cannot run past the end of the buffer.
                   newpos was validated to lie in [0, size]. */
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(size-newpos))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5195
/* Object-level wrapper around PyUnicode_TranslateCharmap(): converts
   str with PyUnicode_FromObject(), translates its contents through
   mapping and returns the new object, or NULL on error. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    PyObject *input;
    PyObject *translated;

    input = PyUnicode_FromObject(str);
    if (input == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(input),
                                            PyUnicode_GET_SIZE(input),
                                            mapping,
                                            errors);
    Py_DECREF(input);
    return translated;
}
5216
5217 /* --- Decimal Encoder ---------------------------------------------------- */
5218
/* Convert the length characters starting at s into a 0-terminated
   byte string written to output.

   Characters for which Py_UNICODE_ISSPACE() is true become ' ',
   characters with a decimal value (Py_UNICODE_TODECIMAL() >= 0) become
   the corresponding ASCII digit, and other characters in the range
   1..255 are copied as a single byte.  Everything else is unencodable
   and is handled according to errors (strict, replace, ignore,
   xmlcharrefreplace or a registered error callback).

   This function is given no output size and performs no bounds checks
   on output; the caller has to supply a buffer that is large enough.

   Returns 0 on success, -1 on error.  A NULL output fails via
   PyErr_BadArgument(). */
int
PyUnicode_EncodeDecimal(Py_UNICODE *s,
                        Py_ssize_t length,
                        char *output,
                        const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable */
        collstart = p;
        for (collend = p+1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                Py_UCS4 codepoint = _Py_UNICODE_NEXT(p, collend);
                /* %d expects an int; Py_UCS4 is an unsigned type */
                output += sprintf(output, "&#%d;", (int)codepoint);
            }
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the replacement text has to be
               encodable by the same rules as ordinary input */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE repch = *uni2;
                if (Py_UNICODE_ISSPACE(repch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(repch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < repch && repch < 256)
                        *output++ = (char)repch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5348
5349 /* --- Helpers ------------------------------------------------------------ */
5350
5351 #include "stringlib/unicodedefs.h"
5352 #include "stringlib/fastsearch.h"
5353
5354 #include "stringlib/count.h"
5355 #include "stringlib/find.h"
5356 #include "stringlib/partition.h"
5357 #include "stringlib/split.h"
5358
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len when it is larger; a negative end or start has
   len added to it and is then clipped to 0 if still negative.  start
   and end must be modifiable lvalues.  The arguments are evaluated
   more than once, so they must not have side effects.

   The body is wrapped in do { } while (0) so that the macro expands to
   a single statement and can be used in an unbraced if/else; it must
   be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5373
/* Count occurrences of substr in str[start:end] using stringlib_count().
   Both arguments are first converted with PyUnicode_FromObject(), and
   start/end are normalised with ADJUST_INDICES.
   Returns the count, or -1 if either conversion failed. */
Py_ssize_t
PyUnicode_Count(PyObject *str,
                PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end)
{
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;
    Py_ssize_t count;

    haystack = (PyUnicodeObject*) PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;
    needle = (PyUnicodeObject*) PyUnicode_FromObject(substr);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -1;
    }

    ADJUST_INDICES(start, end, haystack->length);
    count = stringlib_count(haystack->str + start, end - start,
                            needle->str, needle->length,
                            PY_SSIZE_T_MAX);

    Py_DECREF(needle);
    Py_DECREF(haystack);
    return count;
}
5403
/* Search for sub in str between start and end.  A positive direction
   uses stringlib_find_slice(), anything else stringlib_rfind_slice();
   the value of that call is returned unchanged.  Both arguments are
   first converted with PyUnicode_FromObject().
   Returns -2 if either conversion failed. */
Py_ssize_t
PyUnicode_Find(PyObject *str,
               PyObject *sub,
               Py_ssize_t start,
               Py_ssize_t end,
               int direction)
{
    PyObject *haystack;
    PyObject *needle;
    Py_ssize_t where;

    haystack = PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -2;
    needle = PyUnicode_FromObject(sub);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -2;
    }

    if (direction > 0) {
        where = stringlib_find_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end
            );
    }
    else {
        where = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end
            );
    }

    Py_DECREF(haystack);
    Py_DECREF(needle);
    return where;
}
5439
5440 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5441 int tailmatch(PyUnicodeObject *self,
5442 PyUnicodeObject *substring,
5443 Py_ssize_t start,
5444 Py_ssize_t end,
5445 int direction)
5446 {
5447 if (substring->length == 0)
5448 return 1;
5449
5450 ADJUST_INDICES(start, end, self->length);
5451 end -= substring->length;
5452 if (end < start)
5453 return 0;
5454
5455 if (direction > 0) {
5456 if (Py_UNICODE_MATCH(self, end, substring))
5457 return 1;
5458 } else {
5459 if (Py_UNICODE_MATCH(self, start, substring))
5460 return 1;
5461 }
5462
5463 return 0;
5464 }
5465
/* Object-level wrapper around tailmatch(): converts both arguments
   with PyUnicode_FromObject() and returns tailmatch()'s result, or -1
   if either conversion failed. */
Py_ssize_t
PyUnicode_Tailmatch(PyObject *str,
                    PyObject *substr,
                    Py_ssize_t start,
                    Py_ssize_t end,
                    int direction)
{
    PyObject *text;
    PyObject *affix;
    Py_ssize_t matched;

    text = PyUnicode_FromObject(str);
    if (text == NULL)
        return -1;
    affix = PyUnicode_FromObject(substr);
    if (affix == NULL) {
        Py_DECREF(text);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)text,
                        (PyUnicodeObject *)affix,
                        start, end, direction);
    Py_DECREF(text);
    Py_DECREF(affix);
    return matched;
}
5490
5491 /* Apply fixfct filter to the Unicode object self and return a
5492 reference to the modified object */
5493
5494 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5495 PyObject *fixup(PyUnicodeObject *self,
5496 int (*fixfct)(PyUnicodeObject *s))
5497 {
5498
5499 PyUnicodeObject *u;
5500
5501 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5502 if (u == NULL)
5503 return NULL;
5504
5505 Py_UNICODE_COPY(u->str, self->str, self->length);
5506
5507 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5508 /* fixfct should return TRUE if it modified the buffer. If
5509 FALSE, return a reference to the original buffer instead
5510 (to save space, not time) */
5511 Py_INCREF(self);
5512 Py_DECREF(u);
5513 return (PyObject*) self;
5514 }
5515 return (PyObject*) u;
5516 }
5517
5518 static
fixupper(PyUnicodeObject * self)5519 int fixupper(PyUnicodeObject *self)
5520 {
5521 Py_ssize_t len = self->length;
5522 Py_UNICODE *s = self->str;
5523 int status = 0;
5524
5525 while (len-- > 0) {
5526 register Py_UNICODE ch;
5527
5528 ch = Py_UNICODE_TOUPPER(*s);
5529 if (ch != *s) {
5530 status = 1;
5531 *s = ch;
5532 }
5533 s++;
5534 }
5535
5536 return status;
5537 }
5538
5539 static
fixlower(PyUnicodeObject * self)5540 int fixlower(PyUnicodeObject *self)
5541 {
5542 Py_ssize_t len = self->length;
5543 Py_UNICODE *s = self->str;
5544 int status = 0;
5545
5546 while (len-- > 0) {
5547 register Py_UNICODE ch;
5548
5549 ch = Py_UNICODE_TOLOWER(*s);
5550 if (ch != *s) {
5551 status = 1;
5552 *s = ch;
5553 }
5554 s++;
5555 }
5556
5557 return status;
5558 }
5559
5560 static
fixswapcase(PyUnicodeObject * self)5561 int fixswapcase(PyUnicodeObject *self)
5562 {
5563 Py_ssize_t len = self->length;
5564 Py_UNICODE *s = self->str;
5565 int status = 0;
5566
5567 while (len-- > 0) {
5568 if (Py_UNICODE_ISUPPER(*s)) {
5569 *s = Py_UNICODE_TOLOWER(*s);
5570 status = 1;
5571 } else if (Py_UNICODE_ISLOWER(*s)) {
5572 *s = Py_UNICODE_TOUPPER(*s);
5573 status = 1;
5574 }
5575 s++;
5576 }
5577
5578 return status;
5579 }
5580
5581 static
fixcapitalize(PyUnicodeObject * self)5582 int fixcapitalize(PyUnicodeObject *self)
5583 {
5584 Py_ssize_t len = self->length;
5585 Py_UNICODE *s = self->str;
5586 int status = 0;
5587
5588 if (len == 0)
5589 return 0;
5590 if (!Py_UNICODE_ISUPPER(*s)) {
5591 *s = Py_UNICODE_TOUPPER(*s);
5592 status = 1;
5593 }
5594 s++;
5595 while (--len > 0) {
5596 if (!Py_UNICODE_ISLOWER(*s)) {
5597 *s = Py_UNICODE_TOLOWER(*s);
5598 status = 1;
5599 }
5600 s++;
5601 }
5602 return status;
5603 }
5604
5605 static
fixtitle(PyUnicodeObject * self)5606 int fixtitle(PyUnicodeObject *self)
5607 {
5608 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609 register Py_UNICODE *e;
5610 int previous_is_cased;
5611
5612 /* Shortcut for single character strings */
5613 if (PyUnicode_GET_SIZE(self) == 1) {
5614 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5615 if (*p != ch) {
5616 *p = ch;
5617 return 1;
5618 }
5619 else
5620 return 0;
5621 }
5622
5623 e = p + PyUnicode_GET_SIZE(self);
5624 previous_is_cased = 0;
5625 for (; p < e; p++) {
5626 register const Py_UNICODE ch = *p;
5627
5628 if (previous_is_cased)
5629 *p = Py_UNICODE_TOLOWER(ch);
5630 else
5631 *p = Py_UNICODE_TOTITLE(ch);
5632
5633 if (Py_UNICODE_ISLOWER(ch) ||
5634 Py_UNICODE_ISUPPER(ch) ||
5635 Py_UNICODE_ISTITLE(ch))
5636 previous_is_cased = 1;
5637 else
5638 previous_is_cased = 0;
5639 }
5640 return 1;
5641 }
5642
5643 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)5644 PyUnicode_Join(PyObject *separator, PyObject *seq)
5645 {
5646 PyObject *internal_separator = NULL;
5647 const Py_UNICODE blank = ' ';
5648 const Py_UNICODE *sep = ␣
5649 Py_ssize_t seplen = 1;
5650 PyUnicodeObject *res = NULL; /* the result */
5651 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5652 Py_ssize_t res_used; /* # used bytes */
5653 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5654 PyObject *fseq; /* PySequence_Fast(seq) */
5655 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5656 PyObject *item;
5657 Py_ssize_t i;
5658
5659 fseq = PySequence_Fast(seq, "can only join an iterable");
5660 if (fseq == NULL) {
5661 return NULL;
5662 }
5663
5664 /* Grrrr. A codec may be invoked to convert str objects to
5665 * Unicode, and so it's possible to call back into Python code
5666 * during PyUnicode_FromObject(), and so it's possible for a sick
5667 * codec to change the size of fseq (if seq is a list). Therefore
5668 * we have to keep refetching the size -- can't assume seqlen
5669 * is invariant.
5670 */
5671 seqlen = PySequence_Fast_GET_SIZE(fseq);
5672 /* If empty sequence, return u"". */
5673 if (seqlen == 0) {
5674 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5675 goto Done;
5676 }
5677 /* If singleton sequence with an exact Unicode, return that. */
5678 if (seqlen == 1) {
5679 item = PySequence_Fast_GET_ITEM(fseq, 0);
5680 if (PyUnicode_CheckExact(item)) {
5681 Py_INCREF(item);
5682 res = (PyUnicodeObject *)item;
5683 goto Done;
5684 }
5685 }
5686
5687 /* At least two items to join, or one that isn't exact Unicode. */
5688 if (seqlen > 1) {
5689 /* Set up sep and seplen -- they're needed. */
5690 if (separator == NULL) {
5691 sep = ␣
5692 seplen = 1;
5693 }
5694 else {
5695 internal_separator = PyUnicode_FromObject(separator);
5696 if (internal_separator == NULL)
5697 goto onError;
5698 sep = PyUnicode_AS_UNICODE(internal_separator);
5699 seplen = PyUnicode_GET_SIZE(internal_separator);
5700 /* In case PyUnicode_FromObject() mutated seq. */
5701 seqlen = PySequence_Fast_GET_SIZE(fseq);
5702 }
5703 }
5704
5705 /* Get space. */
5706 res = _PyUnicode_New(res_alloc);
5707 if (res == NULL)
5708 goto onError;
5709 res_p = PyUnicode_AS_UNICODE(res);
5710 res_used = 0;
5711
5712 for (i = 0; i < seqlen; ++i) {
5713 Py_ssize_t itemlen;
5714 Py_ssize_t new_res_used;
5715
5716 item = PySequence_Fast_GET_ITEM(fseq, i);
5717 /* Convert item to Unicode. */
5718 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5719 PyErr_Format(PyExc_TypeError,
5720 "sequence item %zd: expected string or Unicode,"
5721 " %.80s found",
5722 i, Py_TYPE(item)->tp_name);
5723 goto onError;
5724 }
5725 item = PyUnicode_FromObject(item);
5726 if (item == NULL)
5727 goto onError;
5728 /* We own a reference to item from here on. */
5729
5730 /* In case PyUnicode_FromObject() mutated seq. */
5731 seqlen = PySequence_Fast_GET_SIZE(fseq);
5732
5733 /* Make sure we have enough space for the separator and the item. */
5734 itemlen = PyUnicode_GET_SIZE(item);
5735 new_res_used = res_used + itemlen;
5736 if (new_res_used < 0)
5737 goto Overflow;
5738 if (i < seqlen - 1) {
5739 new_res_used += seplen;
5740 if (new_res_used < 0)
5741 goto Overflow;
5742 }
5743 if (new_res_used > res_alloc) {
5744 /* double allocated size until it's big enough */
5745 do {
5746 res_alloc += res_alloc;
5747 if (res_alloc <= 0)
5748 goto Overflow;
5749 } while (new_res_used > res_alloc);
5750 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5751 Py_DECREF(item);
5752 goto onError;
5753 }
5754 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5755 }
5756
5757 /* Copy item, and maybe the separator. */
5758 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5759 res_p += itemlen;
5760 if (i < seqlen - 1) {
5761 Py_UNICODE_COPY(res_p, sep, seplen);
5762 res_p += seplen;
5763 }
5764 Py_DECREF(item);
5765 res_used = new_res_used;
5766 }
5767
5768 /* Shrink res to match the used area; this probably can't fail,
5769 * but it's cheap to check.
5770 */
5771 if (_PyUnicode_Resize(&res, res_used) < 0)
5772 goto onError;
5773
5774 Done:
5775 Py_XDECREF(internal_separator);
5776 Py_DECREF(fseq);
5777 return (PyObject *)res;
5778
5779 Overflow:
5780 PyErr_SetString(PyExc_OverflowError,
5781 "join() result is too long for a Python string");
5782 Py_DECREF(item);
5783 /* fall through */
5784
5785 onError:
5786 Py_XDECREF(internal_separator);
5787 Py_DECREF(fseq);
5788 Py_XDECREF(res);
5789 return NULL;
5790 }
5791
5792 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5793 PyUnicodeObject *pad(PyUnicodeObject *self,
5794 Py_ssize_t left,
5795 Py_ssize_t right,
5796 Py_UNICODE fill)
5797 {
5798 PyUnicodeObject *u;
5799
5800 if (left < 0)
5801 left = 0;
5802 if (right < 0)
5803 right = 0;
5804
5805 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5806 Py_INCREF(self);
5807 return self;
5808 }
5809
5810 if (left > PY_SSIZE_T_MAX - self->length ||
5811 right > PY_SSIZE_T_MAX - (left + self->length)) {
5812 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5813 return NULL;
5814 }
5815 u = _PyUnicode_New(left + self->length + right);
5816 if (u) {
5817 if (left)
5818 Py_UNICODE_FILL(u->str, fill, left);
5819 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5820 if (right)
5821 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5822 }
5823
5824 return u;
5825 }
5826
/* Coerce string to Unicode and hand it to stringlib_splitlines().
   keepends is passed through unchanged.  Returns whatever
   stringlib_splitlines() returns, or NULL if the coercion fails. */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *ustring;
    PyObject *lines;

    ustring = PyUnicode_FromObject(string);
    if (ustring == NULL)
        return NULL;

    lines = stringlib_splitlines(
        (PyObject*) ustring, PyUnicode_AS_UNICODE(ustring),
        PyUnicode_GET_SIZE(ustring), keepends);

    /* Release the reference obtained from the coercion. */
    Py_DECREF(ustring);
    return lines;
}
5842
5843 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5844 PyObject *split(PyUnicodeObject *self,
5845 PyUnicodeObject *substring,
5846 Py_ssize_t maxcount)
5847 {
5848 if (maxcount < 0)
5849 maxcount = PY_SSIZE_T_MAX;
5850
5851 if (substring == NULL)
5852 return stringlib_split_whitespace(
5853 (PyObject*) self, self->str, self->length, maxcount
5854 );
5855
5856 return stringlib_split(
5857 (PyObject*) self, self->str, self->length,
5858 substring->str, substring->length,
5859 maxcount
5860 );
5861 }
5862
5863 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5864 PyObject *rsplit(PyUnicodeObject *self,
5865 PyUnicodeObject *substring,
5866 Py_ssize_t maxcount)
5867 {
5868 if (maxcount < 0)
5869 maxcount = PY_SSIZE_T_MAX;
5870
5871 if (substring == NULL)
5872 return stringlib_rsplit_whitespace(
5873 (PyObject*) self, self->str, self->length, maxcount
5874 );
5875
5876 return stringlib_rsplit(
5877 (PyObject*) self, self->str, self->length,
5878 substring->str, substring->length,
5879 maxcount
5880 );
5881 }
5882
5883 static
replace(PyUnicodeObject * self,PyUnicodeObject * str1,PyUnicodeObject * str2,Py_ssize_t maxcount)5884 PyObject *replace(PyUnicodeObject *self,
5885 PyUnicodeObject *str1,
5886 PyUnicodeObject *str2,
5887 Py_ssize_t maxcount)
5888 {
5889 PyUnicodeObject *u;
5890
5891 if (maxcount < 0)
5892 maxcount = PY_SSIZE_T_MAX;
5893 else if (maxcount == 0 || self->length == 0)
5894 goto nothing;
5895
5896 if (str1->length == str2->length) {
5897 Py_ssize_t i;
5898 /* same length */
5899 if (str1->length == 0)
5900 goto nothing;
5901 if (str1->length == 1) {
5902 /* replace characters */
5903 Py_UNICODE u1, u2;
5904 if (!findchar(self->str, self->length, str1->str[0]))
5905 goto nothing;
5906 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5907 if (!u)
5908 return NULL;
5909 Py_UNICODE_COPY(u->str, self->str, self->length);
5910 u1 = str1->str[0];
5911 u2 = str2->str[0];
5912 for (i = 0; i < u->length; i++)
5913 if (u->str[i] == u1) {
5914 if (--maxcount < 0)
5915 break;
5916 u->str[i] = u2;
5917 }
5918 } else {
5919 i = stringlib_find(
5920 self->str, self->length, str1->str, str1->length, 0
5921 );
5922 if (i < 0)
5923 goto nothing;
5924 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5925 if (!u)
5926 return NULL;
5927 Py_UNICODE_COPY(u->str, self->str, self->length);
5928
5929 /* change everything in-place, starting with this one */
5930 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5931 i += str1->length;
5932
5933 while ( --maxcount > 0) {
5934 i = stringlib_find(self->str+i, self->length-i,
5935 str1->str, str1->length,
5936 i);
5937 if (i == -1)
5938 break;
5939 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5940 i += str1->length;
5941 }
5942 }
5943 } else {
5944
5945 Py_ssize_t n, i, j;
5946 Py_ssize_t product, new_size, delta;
5947 Py_UNICODE *p;
5948
5949 /* replace strings */
5950 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5951 maxcount);
5952 if (n == 0)
5953 goto nothing;
5954 /* new_size = self->length + n * (str2->length - str1->length)); */
5955 delta = (str2->length - str1->length);
5956 if (delta == 0) {
5957 new_size = self->length;
5958 } else {
5959 product = n * (str2->length - str1->length);
5960 if ((product / (str2->length - str1->length)) != n) {
5961 PyErr_SetString(PyExc_OverflowError,
5962 "replace string is too long");
5963 return NULL;
5964 }
5965 new_size = self->length + product;
5966 if (new_size < 0) {
5967 PyErr_SetString(PyExc_OverflowError,
5968 "replace string is too long");
5969 return NULL;
5970 }
5971 }
5972 u = _PyUnicode_New(new_size);
5973 if (!u)
5974 return NULL;
5975 i = 0;
5976 p = u->str;
5977 if (str1->length > 0) {
5978 while (n-- > 0) {
5979 /* look for next match */
5980 j = stringlib_find(self->str+i, self->length-i,
5981 str1->str, str1->length,
5982 i);
5983 if (j == -1)
5984 break;
5985 else if (j > i) {
5986 /* copy unchanged part [i:j] */
5987 Py_UNICODE_COPY(p, self->str+i, j-i);
5988 p += j - i;
5989 }
5990 /* copy substitution string */
5991 if (str2->length > 0) {
5992 Py_UNICODE_COPY(p, str2->str, str2->length);
5993 p += str2->length;
5994 }
5995 i = j + str1->length;
5996 }
5997 if (i < self->length)
5998 /* copy tail [i:] */
5999 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6000 } else {
6001 /* interleave */
6002 while (n > 0) {
6003 Py_UNICODE_COPY(p, str2->str, str2->length);
6004 p += str2->length;
6005 if (--n <= 0)
6006 break;
6007 *p++ = self->str[i++];
6008 }
6009 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6010 }
6011 }
6012 return (PyObject *) u;
6013
6014 nothing:
6015 /* nothing to replace; return original string (when possible) */
6016 if (PyUnicode_CheckExact(self)) {
6017 Py_INCREF(self);
6018 return (PyObject *) self;
6019 }
6020 return PyUnicode_FromUnicode(self->str, self->length);
6021 }
6022
6023 /* --- Unicode Object Methods --------------------------------------------- */
6024
6025 PyDoc_STRVAR(title__doc__,
6026 "S.title() -> unicode\n\
6027 \n\
6028 Return a titlecased version of S, i.e. words start with title case\n\
6029 characters, all remaining cased characters have lower case.");
6030
6031 static PyObject*
unicode_title(PyUnicodeObject * self)6032 unicode_title(PyUnicodeObject *self)
6033 {
6034 return fixup(self, fixtitle);
6035 }
6036
6037 PyDoc_STRVAR(capitalize__doc__,
6038 "S.capitalize() -> unicode\n\
6039 \n\
6040 Return a capitalized version of S, i.e. make the first character\n\
6041 have upper case and the rest lower case.");
6042
6043 static PyObject*
unicode_capitalize(PyUnicodeObject * self)6044 unicode_capitalize(PyUnicodeObject *self)
6045 {
6046 return fixup(self, fixcapitalize);
6047 }
6048
#if 0
/* Disabled code: everything up to the matching #endif is excluded
   from compilation. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6086
6087 /* Argument converter. Coerces to a single unicode character */
6088
6089 static int
convert_uc(PyObject * obj,void * addr)6090 convert_uc(PyObject *obj, void *addr)
6091 {
6092 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6093 PyObject *uniobj;
6094 Py_UNICODE *unistr;
6095
6096 uniobj = PyUnicode_FromObject(obj);
6097 if (uniobj == NULL) {
6098 PyErr_SetString(PyExc_TypeError,
6099 "The fill character cannot be converted to Unicode");
6100 return 0;
6101 }
6102 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6103 PyErr_SetString(PyExc_TypeError,
6104 "The fill character must be exactly one character long");
6105 Py_DECREF(uniobj);
6106 return 0;
6107 }
6108 unistr = PyUnicode_AS_UNICODE(uniobj);
6109 *fillcharloc = unistr[0];
6110 Py_DECREF(uniobj);
6111 return 1;
6112 }
6113
6114 PyDoc_STRVAR(center__doc__,
6115 "S.center(width[, fillchar]) -> unicode\n\
6116 \n\
6117 Return S centered in a Unicode string of length width. Padding is\n\
6118 done using the specified fill character (default is a space)");
6119
6120 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6121 unicode_center(PyUnicodeObject *self, PyObject *args)
6122 {
6123 Py_ssize_t marg, left;
6124 Py_ssize_t width;
6125 Py_UNICODE fillchar = ' ';
6126
6127 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6128 return NULL;
6129
6130 if (self->length >= width && PyUnicode_CheckExact(self)) {
6131 Py_INCREF(self);
6132 return (PyObject*) self;
6133 }
6134
6135 marg = width - self->length;
6136 left = marg / 2 + (marg & width & 1);
6137
6138 return (PyObject*) pad(self, left, marg - left, fillchar);
6139 }
6140
6141 #if 0
6142
6143 /* This code should go into some future Unicode collation support
6144 module. The basic comparison should compare ordinals on a naive
6145 basis (this is what Java does and thus Jython too). */
6146
6147 /* speedy UTF-16 code point order comparison */
6148 /* gleaned from: */
6149 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6150
6151 static short utf16Fixup[32] =
6152 {
6153 0, 0, 0, 0, 0, 0, 0, 0,
6154 0, 0, 0, 0, 0, 0, 0, 0,
6155 0, 0, 0, 0, 0, 0, 0, 0,
6156 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6157 };
6158
6159 static int
6160 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6161 {
6162 Py_ssize_t len1, len2;
6163
6164 Py_UNICODE *s1 = str1->str;
6165 Py_UNICODE *s2 = str2->str;
6166
6167 len1 = str1->length;
6168 len2 = str2->length;
6169
6170 while (len1 > 0 && len2 > 0) {
6171 Py_UNICODE c1, c2;
6172
6173 c1 = *s1++;
6174 c2 = *s2++;
6175
6176 if (c1 > (1<<11) * 26)
6177 c1 += utf16Fixup[c1>>11];
6178 if (c2 > (1<<11) * 26)
6179 c2 += utf16Fixup[c2>>11];
6180 /* now c1 and c2 are in UTF-32-compatible order */
6181
6182 if (c1 != c2)
6183 return (c1 < c2) ? -1 : 1;
6184
6185 len1--; len2--;
6186 }
6187
6188 return (len1 < len2) ? -1 : (len1 != len2);
6189 }
6190
6191 #else
6192
6193 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6194 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6195 {
6196 register Py_ssize_t len1, len2;
6197
6198 Py_UNICODE *s1 = str1->str;
6199 Py_UNICODE *s2 = str2->str;
6200
6201 len1 = str1->length;
6202 len2 = str2->length;
6203
6204 while (len1 > 0 && len2 > 0) {
6205 Py_UNICODE c1, c2;
6206
6207 c1 = *s1++;
6208 c2 = *s2++;
6209
6210 if (c1 != c2)
6211 return (c1 < c2) ? -1 : 1;
6212
6213 len1--; len2--;
6214 }
6215
6216 return (len1 < len2) ? -1 : (len1 != len2);
6217 }
6218
6219 #endif
6220
/* Coerce both arguments to Unicode and compare them with
   unicode_compare().

   Returns the comparison result (-1, 0 or 1).  If either coercion
   fails, -1 is returned as well; callers have to consult
   PyErr_Occurred() to tell the two cases apart. */
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    PyUnicodeObject *lhs;
    PyUnicodeObject *rhs;
    int outcome;

    /* Coerce the two arguments */
    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;

    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* Identical objects (e.g. empty or interned ones) need no
       character-wise comparison. */
    if (lhs == rhs)
        outcome = 0;
    else
        outcome = unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return outcome;
}
6253
/* Rich comparison for Unicode objects.

   Runs PyUnicode_Compare() and maps its three-way result to a bool
   according to op.  Failures of the comparison are handled as follows:

   - TypeError: cleared, Py_NotImplemented is returned so that the
     other operand gets a chance to handle the comparison.
   - UnicodeDecodeError during == or !=: cleared and turned into a
     UnicodeWarning; the operands are then treated as unequal.
   - anything else: NULL is returned with the error left in place. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    int result;

    result = PyUnicode_Compare(left, right);

    if (result == -1 && PyErr_Occurred()) {
        /* Type errors mean that PyUnicode_FromObject() could not
           convert one of the arguments (usually the right hand side)
           to Unicode, ie. we can't handle the comparison request. */
        if (PyErr_ExceptionMatches(PyExc_TypeError)) {
            PyErr_Clear();
            Py_INCREF(Py_NotImplemented);
            return Py_NotImplemented;
        }

        /* Only equality tests get the lenient treatment below. */
        if (op != Py_EQ && op != Py_NE)
            return NULL;
        if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
            return NULL;

        /* Silence the decode error and issue a warning instead. */
        PyErr_Clear();
        if (PyErr_Warn(PyExc_UnicodeWarning,
                       (op == Py_EQ) ?
                       "Unicode equal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal" :
                       "Unicode unequal comparison "
                       "failed to convert both arguments to Unicode - "
                       "interpreting them as being unequal"
                ) < 0)
            return NULL;
        return PyBool_FromLong(op == Py_NE);
    }

    /* Convert the return value to a Boolean */
    switch (op) {
    case Py_LT:
        result = (result == -1);
        break;
    case Py_LE:
        result = (result <= 0);
        break;
    case Py_EQ:
        result = (result == 0);
        break;
    case Py_NE:
        result = (result != 0);
        break;
    case Py_GT:
        result = (result == 1);
        break;
    case Py_GE:
        result = (result >= 0);
        break;
    default:
        /* Unknown op: the raw comparison result is converted as is. */
        break;
    }
    return PyBool_FromLong(result);
}
6329
/* Coerce element and container to Unicode (in that order) and pass
   them to stringlib_contains_obj().

   Returns that function's result, or -1 if either coercion fails. */
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    PyObject *needle;
    PyObject *haystack;
    int found;

    /* Coerce the two arguments */
    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack == NULL) {
        Py_DECREF(needle);
        return -1;
    }

    found = stringlib_contains_obj(haystack, needle);

    Py_DECREF(haystack);
    Py_DECREF(needle);
    return found;
}
6355
6356 /* Concat to string or Unicode object giving a new Unicode object. */
6357
PyUnicode_Concat(PyObject * left,PyObject * right)6358 PyObject *PyUnicode_Concat(PyObject *left,
6359 PyObject *right)
6360 {
6361 PyUnicodeObject *u = NULL, *v = NULL, *w;
6362
6363 /* Coerce the two arguments */
6364 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6365 if (u == NULL)
6366 goto onError;
6367 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6368 if (v == NULL)
6369 goto onError;
6370
6371 /* Shortcuts */
6372 if (v == unicode_empty) {
6373 Py_DECREF(v);
6374 return (PyObject *)u;
6375 }
6376 if (u == unicode_empty) {
6377 Py_DECREF(u);
6378 return (PyObject *)v;
6379 }
6380
6381 if (u->length > PY_SSIZE_T_MAX - v->length) {
6382 PyErr_SetString(PyExc_OverflowError,
6383 "strings are too large to concat");
6384 goto onError;
6385 }
6386
6387 /* Concat the two Unicode strings */
6388 w = _PyUnicode_New(u->length + v->length);
6389 if (w == NULL)
6390 goto onError;
6391 Py_UNICODE_COPY(w->str, u->str, u->length);
6392 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6393
6394 Py_DECREF(u);
6395 Py_DECREF(v);
6396 return (PyObject *)w;
6397
6398 onError:
6399 Py_XDECREF(u);
6400 Py_XDECREF(v);
6401 return NULL;
6402 }
6403
6404 PyDoc_STRVAR(count__doc__,
6405 "S.count(sub[, start[, end]]) -> int\n\
6406 \n\
6407 Return the number of non-overlapping occurrences of substring sub in\n\
6408 Unicode string S[start:end]. Optional arguments start and end are\n\
6409 interpreted as in slice notation.");
6410
6411 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6412 unicode_count(PyUnicodeObject *self, PyObject *args)
6413 {
6414 PyUnicodeObject *substring;
6415 Py_ssize_t start = 0;
6416 Py_ssize_t end = PY_SSIZE_T_MAX;
6417 PyObject *result;
6418
6419 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6420 &start, &end))
6421 return NULL;
6422
6423 ADJUST_INDICES(start, end, self->length);
6424 result = PyInt_FromSsize_t(
6425 stringlib_count(self->str + start, end - start,
6426 substring->str, substring->length,
6427 PY_SSIZE_T_MAX)
6428 );
6429
6430 Py_DECREF(substring);
6431
6432 return result;
6433 }
6434
6435 PyDoc_STRVAR(encode__doc__,
6436 "S.encode([encoding[,errors]]) -> string or unicode\n\
6437 \n\
6438 Encodes S using the codec registered for encoding. encoding defaults\n\
6439 to the default encoding. errors may be given to set a different error\n\
6440 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6441 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6442 'xmlcharrefreplace' as well as any other name registered with\n\
6443 codecs.register_error that can handle UnicodeEncodeErrors.");
6444
6445 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6446 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6447 {
6448 static char *kwlist[] = {"encoding", "errors", 0};
6449 char *encoding = NULL;
6450 char *errors = NULL;
6451 PyObject *v;
6452
6453 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6454 kwlist, &encoding, &errors))
6455 return NULL;
6456 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6457 if (v == NULL)
6458 goto onError;
6459 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6460 PyErr_Format(PyExc_TypeError,
6461 "encoder did not return a string/unicode object "
6462 "(type=%.400s)",
6463 Py_TYPE(v)->tp_name);
6464 Py_DECREF(v);
6465 return NULL;
6466 }
6467 return v;
6468
6469 onError:
6470 return NULL;
6471 }
6472
6473 PyDoc_STRVAR(decode__doc__,
6474 "S.decode([encoding[,errors]]) -> string or unicode\n\
6475 \n\
6476 Decodes S using the codec registered for encoding. encoding defaults\n\
6477 to the default encoding. errors may be given to set a different error\n\
6478 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6479 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6480 as well as any other name registered with codecs.register_error that is\n\
6481 able to handle UnicodeDecodeErrors.");
6482
6483 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6484 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6485 {
6486 static char *kwlist[] = {"encoding", "errors", 0};
6487 char *encoding = NULL;
6488 char *errors = NULL;
6489 PyObject *v;
6490
6491 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6492 kwlist, &encoding, &errors))
6493 return NULL;
6494 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6495 if (v == NULL)
6496 goto onError;
6497 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6498 PyErr_Format(PyExc_TypeError,
6499 "decoder did not return a string/unicode object "
6500 "(type=%.400s)",
6501 Py_TYPE(v)->tp_name);
6502 Py_DECREF(v);
6503 return NULL;
6504 }
6505 return v;
6506
6507 onError:
6508 return NULL;
6509 }
6510
6511 PyDoc_STRVAR(expandtabs__doc__,
6512 "S.expandtabs([tabsize]) -> unicode\n\
6513 \n\
6514 Return a copy of S where all tab characters are expanded using spaces.\n\
6515 If tabsize is not given, a tab size of 8 characters is assumed.");
6516
6517 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6518 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6519 {
6520 Py_UNICODE *e;
6521 Py_UNICODE *p;
6522 Py_UNICODE *q;
6523 Py_UNICODE *qe;
6524 Py_ssize_t i, j, incr;
6525 PyUnicodeObject *u;
6526 int tabsize = 8;
6527
6528 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6529 return NULL;
6530
6531 /* First pass: determine size of output string */
6532 i = 0; /* chars up to and including most recent \n or \r */
6533 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6534 e = self->str + self->length; /* end of input */
6535 for (p = self->str; p < e; p++)
6536 if (*p == '\t') {
6537 if (tabsize > 0) {
6538 incr = tabsize - (j % tabsize); /* cannot overflow */
6539 if (j > PY_SSIZE_T_MAX - incr)
6540 goto overflow1;
6541 j += incr;
6542 }
6543 }
6544 else {
6545 if (j > PY_SSIZE_T_MAX - 1)
6546 goto overflow1;
6547 j++;
6548 if (*p == '\n' || *p == '\r') {
6549 if (i > PY_SSIZE_T_MAX - j)
6550 goto overflow1;
6551 i += j;
6552 j = 0;
6553 }
6554 }
6555
6556 if (i > PY_SSIZE_T_MAX - j)
6557 goto overflow1;
6558
6559 /* Second pass: create output string and fill it */
6560 u = _PyUnicode_New(i + j);
6561 if (!u)
6562 return NULL;
6563
6564 j = 0; /* same as in first pass */
6565 q = u->str; /* next output char */
6566 qe = u->str + u->length; /* end of output */
6567
6568 for (p = self->str; p < e; p++)
6569 if (*p == '\t') {
6570 if (tabsize > 0) {
6571 i = tabsize - (j % tabsize);
6572 j += i;
6573 while (i--) {
6574 if (q >= qe)
6575 goto overflow2;
6576 *q++ = ' ';
6577 }
6578 }
6579 }
6580 else {
6581 if (q >= qe)
6582 goto overflow2;
6583 *q++ = *p;
6584 j++;
6585 if (*p == '\n' || *p == '\r')
6586 j = 0;
6587 }
6588
6589 return (PyObject*) u;
6590
6591 overflow2:
6592 Py_DECREF(u);
6593 overflow1:
6594 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6595 return NULL;
6596 }
6597
6598 PyDoc_STRVAR(find__doc__,
6599 "S.find(sub [,start [,end]]) -> int\n\
6600 \n\
6601 Return the lowest index in S where substring sub is found,\n\
6602 such that sub is contained within S[start:end]. Optional\n\
6603 arguments start and end are interpreted as in slice notation.\n\
6604 \n\
6605 Return -1 on failure.");
6606
6607 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6608 unicode_find(PyUnicodeObject *self, PyObject *args)
6609 {
6610 PyUnicodeObject *substring;
6611 Py_ssize_t start;
6612 Py_ssize_t end;
6613 Py_ssize_t result;
6614
6615 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6616 &start, &end))
6617 return NULL;
6618
6619 result = stringlib_find_slice(
6620 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6621 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6622 start, end
6623 );
6624
6625 Py_DECREF(substring);
6626
6627 return PyInt_FromSsize_t(result);
6628 }
6629
6630 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6631 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6632 {
6633 if (index < 0 || index >= self->length) {
6634 PyErr_SetString(PyExc_IndexError, "string index out of range");
6635 return NULL;
6636 }
6637
6638 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6639 }
6640
6641 static long
unicode_hash(PyUnicodeObject * self)6642 unicode_hash(PyUnicodeObject *self)
6643 {
6644 /* Since Unicode objects compare equal to their ASCII string
6645 counterparts, they should use the individual character values
6646 as basis for their hash value. This is needed to assure that
6647 strings and Unicode objects behave in the same way as
6648 dictionary keys. */
6649
6650 register Py_ssize_t len;
6651 register Py_UNICODE *p;
6652 register long x;
6653
6654 #ifdef Py_DEBUG
6655 assert(_Py_HashSecret_Initialized);
6656 #endif
6657 if (self->hash != -1)
6658 return self->hash;
6659 len = PyUnicode_GET_SIZE(self);
6660 /*
6661 We make the hash of the empty string be 0, rather than using
6662 (prefix ^ suffix), since this slightly obfuscates the hash secret
6663 */
6664 if (len == 0) {
6665 self->hash = 0;
6666 return 0;
6667 }
6668 p = PyUnicode_AS_UNICODE(self);
6669 x = _Py_HashSecret.prefix;
6670 x ^= *p << 7;
6671 while (--len >= 0)
6672 x = (1000003*x) ^ *p++;
6673 x ^= PyUnicode_GET_SIZE(self);
6674 x ^= _Py_HashSecret.suffix;
6675 if (x == -1)
6676 x = -2;
6677 self->hash = x;
6678 return x;
6679 }
6680
6681 PyDoc_STRVAR(index__doc__,
6682 "S.index(sub [,start [,end]]) -> int\n\
6683 \n\
6684 Like S.find() but raise ValueError when the substring is not found.");
6685
6686 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6687 unicode_index(PyUnicodeObject *self, PyObject *args)
6688 {
6689 Py_ssize_t result;
6690 PyUnicodeObject *substring;
6691 Py_ssize_t start;
6692 Py_ssize_t end;
6693
6694 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6695 &start, &end))
6696 return NULL;
6697
6698 result = stringlib_find_slice(
6699 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6700 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6701 start, end
6702 );
6703
6704 Py_DECREF(substring);
6705
6706 if (result < 0) {
6707 PyErr_SetString(PyExc_ValueError, "substring not found");
6708 return NULL;
6709 }
6710
6711 return PyInt_FromSsize_t(result);
6712 }
6713
6714 PyDoc_STRVAR(islower__doc__,
6715 "S.islower() -> bool\n\
6716 \n\
6717 Return True if all cased characters in S are lowercase and there is\n\
6718 at least one cased character in S, False otherwise.");
6719
6720 static PyObject*
unicode_islower(PyUnicodeObject * self)6721 unicode_islower(PyUnicodeObject *self)
6722 {
6723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6724 register const Py_UNICODE *e;
6725 int cased;
6726
6727 /* Shortcut for single character strings */
6728 if (PyUnicode_GET_SIZE(self) == 1)
6729 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6730
6731 /* Special case for empty strings */
6732 if (PyUnicode_GET_SIZE(self) == 0)
6733 return PyBool_FromLong(0);
6734
6735 e = p + PyUnicode_GET_SIZE(self);
6736 cased = 0;
6737 for (; p < e; p++) {
6738 register const Py_UNICODE ch = *p;
6739
6740 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6741 return PyBool_FromLong(0);
6742 else if (!cased && Py_UNICODE_ISLOWER(ch))
6743 cased = 1;
6744 }
6745 return PyBool_FromLong(cased);
6746 }
6747
6748 PyDoc_STRVAR(isupper__doc__,
6749 "S.isupper() -> bool\n\
6750 \n\
6751 Return True if all cased characters in S are uppercase and there is\n\
6752 at least one cased character in S, False otherwise.");
6753
6754 static PyObject*
unicode_isupper(PyUnicodeObject * self)6755 unicode_isupper(PyUnicodeObject *self)
6756 {
6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758 register const Py_UNICODE *e;
6759 int cased;
6760
6761 /* Shortcut for single character strings */
6762 if (PyUnicode_GET_SIZE(self) == 1)
6763 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6764
6765 /* Special case for empty strings */
6766 if (PyUnicode_GET_SIZE(self) == 0)
6767 return PyBool_FromLong(0);
6768
6769 e = p + PyUnicode_GET_SIZE(self);
6770 cased = 0;
6771 for (; p < e; p++) {
6772 register const Py_UNICODE ch = *p;
6773
6774 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6775 return PyBool_FromLong(0);
6776 else if (!cased && Py_UNICODE_ISUPPER(ch))
6777 cased = 1;
6778 }
6779 return PyBool_FromLong(cased);
6780 }
6781
6782 PyDoc_STRVAR(istitle__doc__,
6783 "S.istitle() -> bool\n\
6784 \n\
6785 Return True if S is a titlecased string and there is at least one\n\
6786 character in S, i.e. upper- and titlecase characters may only\n\
6787 follow uncased characters and lowercase characters only cased ones.\n\
6788 Return False otherwise.");
6789
6790 static PyObject*
unicode_istitle(PyUnicodeObject * self)6791 unicode_istitle(PyUnicodeObject *self)
6792 {
6793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6794 register const Py_UNICODE *e;
6795 int cased, previous_is_cased;
6796
6797 /* Shortcut for single character strings */
6798 if (PyUnicode_GET_SIZE(self) == 1)
6799 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6800 (Py_UNICODE_ISUPPER(*p) != 0));
6801
6802 /* Special case for empty strings */
6803 if (PyUnicode_GET_SIZE(self) == 0)
6804 return PyBool_FromLong(0);
6805
6806 e = p + PyUnicode_GET_SIZE(self);
6807 cased = 0;
6808 previous_is_cased = 0;
6809 for (; p < e; p++) {
6810 register const Py_UNICODE ch = *p;
6811
6812 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6813 if (previous_is_cased)
6814 return PyBool_FromLong(0);
6815 previous_is_cased = 1;
6816 cased = 1;
6817 }
6818 else if (Py_UNICODE_ISLOWER(ch)) {
6819 if (!previous_is_cased)
6820 return PyBool_FromLong(0);
6821 previous_is_cased = 1;
6822 cased = 1;
6823 }
6824 else
6825 previous_is_cased = 0;
6826 }
6827 return PyBool_FromLong(cased);
6828 }
6829
6830 PyDoc_STRVAR(isspace__doc__,
6831 "S.isspace() -> bool\n\
6832 \n\
6833 Return True if all characters in S are whitespace\n\
6834 and there is at least one character in S, False otherwise.");
6835
6836 static PyObject*
unicode_isspace(PyUnicodeObject * self)6837 unicode_isspace(PyUnicodeObject *self)
6838 {
6839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6840 register const Py_UNICODE *e;
6841
6842 /* Shortcut for single character strings */
6843 if (PyUnicode_GET_SIZE(self) == 1 &&
6844 Py_UNICODE_ISSPACE(*p))
6845 return PyBool_FromLong(1);
6846
6847 /* Special case for empty strings */
6848 if (PyUnicode_GET_SIZE(self) == 0)
6849 return PyBool_FromLong(0);
6850
6851 e = p + PyUnicode_GET_SIZE(self);
6852 for (; p < e; p++) {
6853 if (!Py_UNICODE_ISSPACE(*p))
6854 return PyBool_FromLong(0);
6855 }
6856 return PyBool_FromLong(1);
6857 }
6858
6859 PyDoc_STRVAR(isalpha__doc__,
6860 "S.isalpha() -> bool\n\
6861 \n\
6862 Return True if all characters in S are alphabetic\n\
6863 and there is at least one character in S, False otherwise.");
6864
6865 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6866 unicode_isalpha(PyUnicodeObject *self)
6867 {
6868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6869 register const Py_UNICODE *e;
6870
6871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self) == 1 &&
6873 Py_UNICODE_ISALPHA(*p))
6874 return PyBool_FromLong(1);
6875
6876 /* Special case for empty strings */
6877 if (PyUnicode_GET_SIZE(self) == 0)
6878 return PyBool_FromLong(0);
6879
6880 e = p + PyUnicode_GET_SIZE(self);
6881 for (; p < e; p++) {
6882 if (!Py_UNICODE_ISALPHA(*p))
6883 return PyBool_FromLong(0);
6884 }
6885 return PyBool_FromLong(1);
6886 }
6887
6888 PyDoc_STRVAR(isalnum__doc__,
6889 "S.isalnum() -> bool\n\
6890 \n\
6891 Return True if all characters in S are alphanumeric\n\
6892 and there is at least one character in S, False otherwise.");
6893
6894 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6895 unicode_isalnum(PyUnicodeObject *self)
6896 {
6897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6898 register const Py_UNICODE *e;
6899
6900 /* Shortcut for single character strings */
6901 if (PyUnicode_GET_SIZE(self) == 1 &&
6902 Py_UNICODE_ISALNUM(*p))
6903 return PyBool_FromLong(1);
6904
6905 /* Special case for empty strings */
6906 if (PyUnicode_GET_SIZE(self) == 0)
6907 return PyBool_FromLong(0);
6908
6909 e = p + PyUnicode_GET_SIZE(self);
6910 for (; p < e; p++) {
6911 if (!Py_UNICODE_ISALNUM(*p))
6912 return PyBool_FromLong(0);
6913 }
6914 return PyBool_FromLong(1);
6915 }
6916
6917 PyDoc_STRVAR(isdecimal__doc__,
6918 "S.isdecimal() -> bool\n\
6919 \n\
6920 Return True if there are only decimal characters in S,\n\
6921 False otherwise.");
6922
6923 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6924 unicode_isdecimal(PyUnicodeObject *self)
6925 {
6926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927 register const Py_UNICODE *e;
6928
6929 /* Shortcut for single character strings */
6930 if (PyUnicode_GET_SIZE(self) == 1 &&
6931 Py_UNICODE_ISDECIMAL(*p))
6932 return PyBool_FromLong(1);
6933
6934 /* Special case for empty strings */
6935 if (PyUnicode_GET_SIZE(self) == 0)
6936 return PyBool_FromLong(0);
6937
6938 e = p + PyUnicode_GET_SIZE(self);
6939 for (; p < e; p++) {
6940 if (!Py_UNICODE_ISDECIMAL(*p))
6941 return PyBool_FromLong(0);
6942 }
6943 return PyBool_FromLong(1);
6944 }
6945
6946 PyDoc_STRVAR(isdigit__doc__,
6947 "S.isdigit() -> bool\n\
6948 \n\
6949 Return True if all characters in S are digits\n\
6950 and there is at least one character in S, False otherwise.");
6951
6952 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6953 unicode_isdigit(PyUnicodeObject *self)
6954 {
6955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6956 register const Py_UNICODE *e;
6957
6958 /* Shortcut for single character strings */
6959 if (PyUnicode_GET_SIZE(self) == 1 &&
6960 Py_UNICODE_ISDIGIT(*p))
6961 return PyBool_FromLong(1);
6962
6963 /* Special case for empty strings */
6964 if (PyUnicode_GET_SIZE(self) == 0)
6965 return PyBool_FromLong(0);
6966
6967 e = p + PyUnicode_GET_SIZE(self);
6968 for (; p < e; p++) {
6969 if (!Py_UNICODE_ISDIGIT(*p))
6970 return PyBool_FromLong(0);
6971 }
6972 return PyBool_FromLong(1);
6973 }
6974
6975 PyDoc_STRVAR(isnumeric__doc__,
6976 "S.isnumeric() -> bool\n\
6977 \n\
6978 Return True if there are only numeric characters in S,\n\
6979 False otherwise.");
6980
6981 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6982 unicode_isnumeric(PyUnicodeObject *self)
6983 {
6984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6985 register const Py_UNICODE *e;
6986
6987 /* Shortcut for single character strings */
6988 if (PyUnicode_GET_SIZE(self) == 1 &&
6989 Py_UNICODE_ISNUMERIC(*p))
6990 return PyBool_FromLong(1);
6991
6992 /* Special case for empty strings */
6993 if (PyUnicode_GET_SIZE(self) == 0)
6994 return PyBool_FromLong(0);
6995
6996 e = p + PyUnicode_GET_SIZE(self);
6997 for (; p < e; p++) {
6998 if (!Py_UNICODE_ISNUMERIC(*p))
6999 return PyBool_FromLong(0);
7000 }
7001 return PyBool_FromLong(1);
7002 }
7003
7004 PyDoc_STRVAR(join__doc__,
7005 "S.join(iterable) -> unicode\n\
7006 \n\
7007 Return a string which is the concatenation of the strings in the\n\
7008 iterable. The separator between elements is S.");
7009
7010 static PyObject*
unicode_join(PyObject * self,PyObject * data)7011 unicode_join(PyObject *self, PyObject *data)
7012 {
7013 return PyUnicode_Join(self, data);
7014 }
7015
7016 static Py_ssize_t
unicode_length(PyUnicodeObject * self)7017 unicode_length(PyUnicodeObject *self)
7018 {
7019 return self->length;
7020 }
7021
7022 PyDoc_STRVAR(ljust__doc__,
7023 "S.ljust(width[, fillchar]) -> int\n\
7024 \n\
7025 Return S left-justified in a Unicode string of length width. Padding is\n\
7026 done using the specified fill character (default is a space).");
7027
7028 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)7029 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7030 {
7031 Py_ssize_t width;
7032 Py_UNICODE fillchar = ' ';
7033
7034 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7035 return NULL;
7036
7037 if (self->length >= width && PyUnicode_CheckExact(self)) {
7038 Py_INCREF(self);
7039 return (PyObject*) self;
7040 }
7041
7042 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7043 }
7044
7045 PyDoc_STRVAR(lower__doc__,
7046 "S.lower() -> unicode\n\
7047 \n\
7048 Return a copy of the string S converted to lowercase.");
7049
7050 static PyObject*
unicode_lower(PyUnicodeObject * self)7051 unicode_lower(PyUnicodeObject *self)
7052 {
7053 return fixup(self, fixlower);
7054 }
7055
/* Selectors telling the strip helpers which end(s) of the string to
   trim.  They also serve as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the 3-character "|O:" prefix of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7064
7065 /* externally visible for str.strip(unicode) */
7066 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)7067 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7068 {
7069 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7070 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7071 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7072 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7073 Py_ssize_t i, j;
7074
7075 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7076
7077 i = 0;
7078 if (striptype != RIGHTSTRIP) {
7079 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7080 i++;
7081 }
7082 }
7083
7084 j = len;
7085 if (striptype != LEFTSTRIP) {
7086 do {
7087 j--;
7088 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7089 j++;
7090 }
7091
7092 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7093 Py_INCREF(self);
7094 return (PyObject*)self;
7095 }
7096 else
7097 return PyUnicode_FromUnicode(s+i, j-i);
7098 }
7099
7100
7101 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)7102 do_strip(PyUnicodeObject *self, int striptype)
7103 {
7104 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7105 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7106
7107 i = 0;
7108 if (striptype != RIGHTSTRIP) {
7109 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7110 i++;
7111 }
7112 }
7113
7114 j = len;
7115 if (striptype != LEFTSTRIP) {
7116 do {
7117 j--;
7118 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7119 j++;
7120 }
7121
7122 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7123 Py_INCREF(self);
7124 return (PyObject*)self;
7125 }
7126 else
7127 return PyUnicode_FromUnicode(s+i, j-i);
7128 }
7129
7130
7131 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7132 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7133 {
7134 PyObject *sep = NULL;
7135
7136 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7137 return NULL;
7138
7139 if (sep != NULL && sep != Py_None) {
7140 if (PyUnicode_Check(sep))
7141 return _PyUnicode_XStrip(self, striptype, sep);
7142 else if (PyString_Check(sep)) {
7143 PyObject *res;
7144 sep = PyUnicode_FromObject(sep);
7145 if (sep==NULL)
7146 return NULL;
7147 res = _PyUnicode_XStrip(self, striptype, sep);
7148 Py_DECREF(sep);
7149 return res;
7150 }
7151 else {
7152 PyErr_Format(PyExc_TypeError,
7153 "%s arg must be None, unicode or str",
7154 STRIPNAME(striptype));
7155 return NULL;
7156 }
7157 }
7158
7159 return do_strip(self, striptype);
7160 }
7161
7162
7163 PyDoc_STRVAR(strip__doc__,
7164 "S.strip([chars]) -> unicode\n\
7165 \n\
7166 Return a copy of the string S with leading and trailing\n\
7167 whitespace removed.\n\
7168 If chars is given and not None, remove characters in chars instead.\n\
7169 If chars is a str, it will be converted to unicode before stripping");
7170
7171 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7172 unicode_strip(PyUnicodeObject *self, PyObject *args)
7173 {
7174 if (PyTuple_GET_SIZE(args) == 0)
7175 return do_strip(self, BOTHSTRIP); /* Common case */
7176 else
7177 return do_argstrip(self, BOTHSTRIP, args);
7178 }
7179
7180
7181 PyDoc_STRVAR(lstrip__doc__,
7182 "S.lstrip([chars]) -> unicode\n\
7183 \n\
7184 Return a copy of the string S with leading whitespace removed.\n\
7185 If chars is given and not None, remove characters in chars instead.\n\
7186 If chars is a str, it will be converted to unicode before stripping");
7187
7188 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7189 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7190 {
7191 if (PyTuple_GET_SIZE(args) == 0)
7192 return do_strip(self, LEFTSTRIP); /* Common case */
7193 else
7194 return do_argstrip(self, LEFTSTRIP, args);
7195 }
7196
7197
7198 PyDoc_STRVAR(rstrip__doc__,
7199 "S.rstrip([chars]) -> unicode\n\
7200 \n\
7201 Return a copy of the string S with trailing whitespace removed.\n\
7202 If chars is given and not None, remove characters in chars instead.\n\
7203 If chars is a str, it will be converted to unicode before stripping");
7204
7205 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7206 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7207 {
7208 if (PyTuple_GET_SIZE(args) == 0)
7209 return do_strip(self, RIGHTSTRIP); /* Common case */
7210 else
7211 return do_argstrip(self, RIGHTSTRIP, args);
7212 }
7213
7214
7215 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7216 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7217 {
7218 PyUnicodeObject *u;
7219 Py_UNICODE *p;
7220 Py_ssize_t nchars;
7221 size_t nbytes;
7222
7223 if (len < 0)
7224 len = 0;
7225
7226 if (len == 1 && PyUnicode_CheckExact(str)) {
7227 /* no repeat, return original string */
7228 Py_INCREF(str);
7229 return (PyObject*) str;
7230 }
7231
7232 /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
7233 * needed doesn't overflow size_t
7234 */
7235 if (len && str->length > PY_SSIZE_T_MAX / len) {
7236 PyErr_SetString(PyExc_OverflowError,
7237 "repeated string is too long");
7238 return NULL;
7239 }
7240 nchars = len * str->length;
7241 nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
7242 if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
7243 PyErr_SetString(PyExc_OverflowError,
7244 "repeated string is too long");
7245 return NULL;
7246 }
7247 u = _PyUnicode_New(nchars);
7248 if (!u)
7249 return NULL;
7250
7251 p = u->str;
7252
7253 if (str->length == 1 && len > 0) {
7254 Py_UNICODE_FILL(p, str->str[0], len);
7255 } else {
7256 Py_ssize_t done = 0; /* number of characters copied this far */
7257 if (done < nchars) {
7258 Py_UNICODE_COPY(p, str->str, str->length);
7259 done = str->length;
7260 }
7261 while (done < nchars) {
7262 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7263 Py_UNICODE_COPY(p+done, p, n);
7264 done += n;
7265 }
7266 }
7267
7268 return (PyObject*) u;
7269 }
7270
/* Public entry point for replacement.  Each of the three object
   arguments is converted with PyUnicode_FromObject(), replace() does
   the work, and the temporaries are released before returning.
   Returns NULL if any conversion fails. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *text = NULL;
    PyObject *old = NULL;
    PyObject *new_ = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        goto done;
    old = PyUnicode_FromObject(subobj);
    if (old == NULL)
        goto done;
    new_ = PyUnicode_FromObject(replobj);
    if (new_ == NULL)
        goto done;

    result = replace((PyUnicodeObject *)text,
                     (PyUnicodeObject *)old,
                     (PyUnicodeObject *)new_,
                     maxcount);

  done:
    /* Release whichever temporaries were created. */
    Py_XDECREF(text);
    Py_XDECREF(old);
    Py_XDECREF(new_);
    return result;
}
7304
7305 PyDoc_STRVAR(replace__doc__,
7306 "S.replace(old, new[, count]) -> unicode\n\
7307 \n\
7308 Return a copy of S with all occurrences of substring\n\
7309 old replaced by new. If the optional argument count is\n\
7310 given, only the first count occurrences are replaced.");
7311
7312 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7313 unicode_replace(PyUnicodeObject *self, PyObject *args)
7314 {
7315 PyUnicodeObject *str1;
7316 PyUnicodeObject *str2;
7317 Py_ssize_t maxcount = -1;
7318 PyObject *result;
7319
7320 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7321 return NULL;
7322 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7323 if (str1 == NULL)
7324 return NULL;
7325 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7326 if (str2 == NULL) {
7327 Py_DECREF(str1);
7328 return NULL;
7329 }
7330
7331 result = replace(self, str1, str2, maxcount);
7332
7333 Py_DECREF(str1);
7334 Py_DECREF(str2);
7335 return result;
7336 }
7337
7338 static
unicode_repr(PyObject * unicode)7339 PyObject *unicode_repr(PyObject *unicode)
7340 {
7341 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7342 PyUnicode_GET_SIZE(unicode),
7343 1);
7344 }
7345
7346 PyDoc_STRVAR(rfind__doc__,
7347 "S.rfind(sub [,start [,end]]) -> int\n\
7348 \n\
7349 Return the highest index in S where substring sub is found,\n\
7350 such that sub is contained within S[start:end]. Optional\n\
7351 arguments start and end are interpreted as in slice notation.\n\
7352 \n\
7353 Return -1 on failure.");
7354
7355 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7356 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7357 {
7358 PyUnicodeObject *substring;
7359 Py_ssize_t start;
7360 Py_ssize_t end;
7361 Py_ssize_t result;
7362
7363 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7364 &start, &end))
7365 return NULL;
7366
7367 result = stringlib_rfind_slice(
7368 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7369 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7370 start, end
7371 );
7372
7373 Py_DECREF(substring);
7374
7375 return PyInt_FromSsize_t(result);
7376 }
7377
7378 PyDoc_STRVAR(rindex__doc__,
7379 "S.rindex(sub [,start [,end]]) -> int\n\
7380 \n\
7381 Like S.rfind() but raise ValueError when the substring is not found.");
7382
7383 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7384 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7385 {
7386 PyUnicodeObject *substring;
7387 Py_ssize_t start;
7388 Py_ssize_t end;
7389 Py_ssize_t result;
7390
7391 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7392 &start, &end))
7393 return NULL;
7394
7395 result = stringlib_rfind_slice(
7396 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7397 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7398 start, end
7399 );
7400
7401 Py_DECREF(substring);
7402
7403 if (result < 0) {
7404 PyErr_SetString(PyExc_ValueError, "substring not found");
7405 return NULL;
7406 }
7407 return PyInt_FromSsize_t(result);
7408 }
7409
7410 PyDoc_STRVAR(rjust__doc__,
7411 "S.rjust(width[, fillchar]) -> unicode\n\
7412 \n\
7413 Return S right-justified in a Unicode string of length width. Padding is\n\
7414 done using the specified fill character (default is a space).");
7415
7416 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7417 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7418 {
7419 Py_ssize_t width;
7420 Py_UNICODE fillchar = ' ';
7421
7422 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7423 return NULL;
7424
7425 if (self->length >= width && PyUnicode_CheckExact(self)) {
7426 Py_INCREF(self);
7427 return (PyObject*) self;
7428 }
7429
7430 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7431 }
7432
7433 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7434 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7435 {
7436 /* standard clamping */
7437 if (start < 0)
7438 start = 0;
7439 if (end < 0)
7440 end = 0;
7441 if (end > self->length)
7442 end = self->length;
7443 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7444 /* full slice, return original string */
7445 Py_INCREF(self);
7446 return (PyObject*) self;
7447 }
7448 if (start > end)
7449 start = end;
7450 /* copy slice */
7451 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7452 end - start);
7453 }
7454
/* Public entry point for splitting.  `s` is converted with
   PyUnicode_FromObject(); `sep` is converted too unless it is NULL,
   in which case NULL is passed through to split(). */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *parts;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    parts = split((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);   /* stays NULL when no separator was given */
    return parts;
}
7478
7479 PyDoc_STRVAR(split__doc__,
7480 "S.split([sep [,maxsplit]]) -> list of strings\n\
7481 \n\
7482 Return a list of the words in S, using sep as the\n\
7483 delimiter string. If maxsplit is given, at most maxsplit\n\
7484 splits are done. If sep is not specified or is None, any\n\
7485 whitespace string is a separator and empty strings are\n\
7486 removed from the result.");
7487
7488 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7489 unicode_split(PyUnicodeObject *self, PyObject *args)
7490 {
7491 PyObject *substring = Py_None;
7492 Py_ssize_t maxcount = -1;
7493
7494 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7495 return NULL;
7496
7497 if (substring == Py_None)
7498 return split(self, NULL, maxcount);
7499 else if (PyUnicode_Check(substring))
7500 return split(self, (PyUnicodeObject *)substring, maxcount);
7501 else
7502 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7503 }
7504
7505 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7506 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7507 {
7508 PyObject* str_obj;
7509 PyObject* sep_obj;
7510 PyObject* out;
7511
7512 str_obj = PyUnicode_FromObject(str_in);
7513 if (!str_obj)
7514 return NULL;
7515 sep_obj = PyUnicode_FromObject(sep_in);
7516 if (!sep_obj) {
7517 Py_DECREF(str_obj);
7518 return NULL;
7519 }
7520
7521 out = stringlib_partition(
7522 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7523 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7524 );
7525
7526 Py_DECREF(sep_obj);
7527 Py_DECREF(str_obj);
7528
7529 return out;
7530 }
7531
7532
7533 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7534 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7535 {
7536 PyObject* str_obj;
7537 PyObject* sep_obj;
7538 PyObject* out;
7539
7540 str_obj = PyUnicode_FromObject(str_in);
7541 if (!str_obj)
7542 return NULL;
7543 sep_obj = PyUnicode_FromObject(sep_in);
7544 if (!sep_obj) {
7545 Py_DECREF(str_obj);
7546 return NULL;
7547 }
7548
7549 out = stringlib_rpartition(
7550 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7551 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7552 );
7553
7554 Py_DECREF(sep_obj);
7555 Py_DECREF(str_obj);
7556
7557 return out;
7558 }
7559
7560 PyDoc_STRVAR(partition__doc__,
7561 "S.partition(sep) -> (head, sep, tail)\n\
7562 \n\
7563 Search for the separator sep in S, and return the part before it,\n\
7564 the separator itself, and the part after it. If the separator is not\n\
7565 found, return S and two empty strings.");
7566
7567 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7568 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7569 {
7570 return PyUnicode_Partition((PyObject *)self, separator);
7571 }
7572
7573 PyDoc_STRVAR(rpartition__doc__,
7574 "S.rpartition(sep) -> (head, sep, tail)\n\
7575 \n\
7576 Search for the separator sep in S, starting at the end of S, and return\n\
7577 the part before it, the separator itself, and the part after it. If the\n\
7578 separator is not found, return two empty strings and S.");
7579
7580 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7581 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7582 {
7583 return PyUnicode_RPartition((PyObject *)self, separator);
7584 }
7585
/* Public entry point for splitting from the right.  `s` is converted
   with PyUnicode_FromObject(); `sep` is converted too unless it is
   NULL, in which case NULL is passed through to rsplit(). */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *parts;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    parts = rsplit((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);   /* stays NULL when no separator was given */
    return parts;
}
7609
7610 PyDoc_STRVAR(rsplit__doc__,
7611 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7612 \n\
7613 Return a list of the words in S, using sep as the\n\
7614 delimiter string, starting at the end of the string and\n\
7615 working to the front. If maxsplit is given, at most maxsplit\n\
7616 splits are done. If sep is not specified, any whitespace string\n\
7617 is a separator.");
7618
7619 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7620 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7621 {
7622 PyObject *substring = Py_None;
7623 Py_ssize_t maxcount = -1;
7624
7625 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7626 return NULL;
7627
7628 if (substring == Py_None)
7629 return rsplit(self, NULL, maxcount);
7630 else if (PyUnicode_Check(substring))
7631 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7632 else
7633 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7634 }
7635
7636 PyDoc_STRVAR(splitlines__doc__,
7637 "S.splitlines(keepends=False) -> list of strings\n\
7638 \n\
7639 Return a list of the lines in S, breaking at line boundaries.\n\
7640 Line breaks are not included in the resulting list unless keepends\n\
7641 is given and true.");
7642
7643 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7644 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7645 {
7646 int keepends = 0;
7647
7648 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7649 return NULL;
7650
7651 return PyUnicode_Splitlines((PyObject *)self, keepends);
7652 }
7653
7654 static
unicode_str(PyUnicodeObject * self)7655 PyObject *unicode_str(PyUnicodeObject *self)
7656 {
7657 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7658 }
7659
7660 PyDoc_STRVAR(swapcase__doc__,
7661 "S.swapcase() -> unicode\n\
7662 \n\
7663 Return a copy of S with uppercase characters converted to lowercase\n\
7664 and vice versa.");
7665
7666 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7667 unicode_swapcase(PyUnicodeObject *self)
7668 {
7669 return fixup(self, fixswapcase);
7670 }
7671
7672 PyDoc_STRVAR(translate__doc__,
7673 "S.translate(table) -> unicode\n\
7674 \n\
7675 Return a copy of the string S, where all characters have been mapped\n\
7676 through the given translation table, which must be a mapping of\n\
7677 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7678 Unmapped characters are left untouched. Characters mapped to None\n\
7679 are deleted.");
7680
7681 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7682 unicode_translate(PyUnicodeObject *self, PyObject *table)
7683 {
7684 return PyUnicode_TranslateCharmap(self->str,
7685 self->length,
7686 table,
7687 "ignore");
7688 }
7689
7690 PyDoc_STRVAR(upper__doc__,
7691 "S.upper() -> unicode\n\
7692 \n\
7693 Return a copy of S converted to uppercase.");
7694
7695 static PyObject*
unicode_upper(PyUnicodeObject * self)7696 unicode_upper(PyUnicodeObject *self)
7697 {
7698 return fixup(self, fixupper);
7699 }
7700
7701 PyDoc_STRVAR(zfill__doc__,
7702 "S.zfill(width) -> unicode\n\
7703 \n\
7704 Pad a numeric string S with zeros on the left, to fill a field\n\
7705 of the specified width. The string S is never truncated.");
7706
7707 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7708 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7709 {
7710 Py_ssize_t fill;
7711 PyUnicodeObject *u;
7712
7713 Py_ssize_t width;
7714 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7715 return NULL;
7716
7717 if (self->length >= width) {
7718 if (PyUnicode_CheckExact(self)) {
7719 Py_INCREF(self);
7720 return (PyObject*) self;
7721 }
7722 else
7723 return PyUnicode_FromUnicode(
7724 PyUnicode_AS_UNICODE(self),
7725 PyUnicode_GET_SIZE(self)
7726 );
7727 }
7728
7729 fill = width - self->length;
7730
7731 u = pad(self, fill, 0, '0');
7732
7733 if (u == NULL)
7734 return NULL;
7735
7736 if (u->str[fill] == '+' || u->str[fill] == '-') {
7737 /* move sign to beginning of string */
7738 u->str[0] = u->str[fill];
7739 u->str[fill] = '0';
7740 }
7741
7742 return (PyObject*) u;
7743 }
7744
/* Compiled out by the "#if 0" guard.  If enabled, it would return the
   current value of `numfree` as a Python int. */
#if 0
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7752
7753 PyDoc_STRVAR(startswith__doc__,
7754 "S.startswith(prefix[, start[, end]]) -> bool\n\
7755 \n\
7756 Return True if S starts with the specified prefix, False otherwise.\n\
7757 With optional start, test S beginning at that position.\n\
7758 With optional end, stop comparing S at that position.\n\
7759 prefix can also be a tuple of strings to try.");
7760
7761 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7762 unicode_startswith(PyUnicodeObject *self,
7763 PyObject *args)
7764 {
7765 PyObject *subobj;
7766 PyUnicodeObject *substring;
7767 Py_ssize_t start = 0;
7768 Py_ssize_t end = PY_SSIZE_T_MAX;
7769 int result;
7770
7771 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7772 return NULL;
7773 if (PyTuple_Check(subobj)) {
7774 Py_ssize_t i;
7775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7776 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7777 PyTuple_GET_ITEM(subobj, i));
7778 if (substring == NULL)
7779 return NULL;
7780 result = tailmatch(self, substring, start, end, -1);
7781 Py_DECREF(substring);
7782 if (result) {
7783 Py_RETURN_TRUE;
7784 }
7785 }
7786 /* nothing matched */
7787 Py_RETURN_FALSE;
7788 }
7789 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7790 if (substring == NULL) {
7791 if (PyErr_ExceptionMatches(PyExc_TypeError))
7792 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7793 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7794 return NULL;
7795 }
7796 result = tailmatch(self, substring, start, end, -1);
7797 Py_DECREF(substring);
7798 return PyBool_FromLong(result);
7799 }
7800
7801
7802 PyDoc_STRVAR(endswith__doc__,
7803 "S.endswith(suffix[, start[, end]]) -> bool\n\
7804 \n\
7805 Return True if S ends with the specified suffix, False otherwise.\n\
7806 With optional start, test S beginning at that position.\n\
7807 With optional end, stop comparing S at that position.\n\
7808 suffix can also be a tuple of strings to try.");
7809
7810 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7811 unicode_endswith(PyUnicodeObject *self,
7812 PyObject *args)
7813 {
7814 PyObject *subobj;
7815 PyUnicodeObject *substring;
7816 Py_ssize_t start = 0;
7817 Py_ssize_t end = PY_SSIZE_T_MAX;
7818 int result;
7819
7820 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7821 return NULL;
7822 if (PyTuple_Check(subobj)) {
7823 Py_ssize_t i;
7824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7826 PyTuple_GET_ITEM(subobj, i));
7827 if (substring == NULL)
7828 return NULL;
7829 result = tailmatch(self, substring, start, end, +1);
7830 Py_DECREF(substring);
7831 if (result) {
7832 Py_RETURN_TRUE;
7833 }
7834 }
7835 Py_RETURN_FALSE;
7836 }
7837 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7838 if (substring == NULL) {
7839 if (PyErr_ExceptionMatches(PyExc_TypeError))
7840 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7841 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7842 return NULL;
7843 }
7844 result = tailmatch(self, substring, start, end, +1);
7845 Py_DECREF(substring);
7846 return PyBool_FromLong(result);
7847 }
7848
7849
7850 /* Implements do_string_format, which is unicode because of stringlib */
7851 #include "stringlib/string_format.h"
7852
7853 PyDoc_STRVAR(format__doc__,
7854 "S.format(*args, **kwargs) -> unicode\n\
7855 \n\
7856 Return a formatted version of S, using substitutions from args and kwargs.\n\
7857 The substitutions are identified by braces ('{' and '}').");
7858
7859 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7860 unicode__format__(PyObject *self, PyObject *args)
7861 {
7862 PyObject *format_spec;
7863 PyObject *result = NULL;
7864 PyObject *tmp = NULL;
7865
7866 /* If 2.x, convert format_spec to the same type as value */
7867 /* This is to allow things like u''.format('') */
7868 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7869 goto done;
7870 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7871 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7872 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7873 goto done;
7874 }
7875 tmp = PyObject_Unicode(format_spec);
7876 if (tmp == NULL)
7877 goto done;
7878 format_spec = tmp;
7879
7880 result = _PyUnicode_FormatAdvanced(self,
7881 PyUnicode_AS_UNICODE(format_spec),
7882 PyUnicode_GET_SIZE(format_spec));
7883 done:
7884 Py_XDECREF(tmp);
7885 return result;
7886 }
7887
7888 PyDoc_STRVAR(p_format__doc__,
7889 "S.__format__(format_spec) -> unicode\n\
7890 \n\
7891 Return a formatted version of S as described by format_spec.");
7892
7893 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7894 unicode__sizeof__(PyUnicodeObject *v)
7895 {
7896 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7897 sizeof(Py_UNICODE) * (v->length + 1));
7898 }
7899
7900 PyDoc_STRVAR(sizeof__doc__,
7901 "S.__sizeof__() -> size of S in memory, in bytes\n\
7902 \n\
7903 ");
7904
7905 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7906 unicode_getnewargs(PyUnicodeObject *v)
7907 {
7908 return Py_BuildValue("(u#)", v->str, v->length);
7909 }
7910
7911
/* Method table for unicode objects (presumably installed as the type's
   tp_methods -- the type object is defined further down the file).

   Each entry is {python_name, C implementation, calling-convention flags,
   docstring}:
     METH_NOARGS                  - called with self only
     METH_O                       - called with exactly one argument object
     METH_VARARGS                 - called with an argument tuple
     METH_VARARGS | METH_KEYWORDS - called with argument tuple and kwargs dict
   Entries that omit the fourth field have no docstring.  The table ends
   with a {NULL, NULL} sentinel. */
7912 static PyMethodDef unicode_methods[] = {
7913 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7914 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7915 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7916 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7917 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7918 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7919 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7920 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7921 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7922 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7923 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7924 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7925 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7926 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7927 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7928 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7929 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7930 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7931 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7932 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7933 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7934 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7935 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7936 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7937 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7938 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7939 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7940 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7941 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7942 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7943 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7944 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7945 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7946 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7947 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7948 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7949 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7950 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7951 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7952 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
/* "format" is implemented by do_string_format from stringlib/string_format.h;
   "__format__" is the per-object hook taking a single format_spec. */
7953 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7954 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
/* Underscore-prefixed formatter helpers: registered without docstrings. */
7955 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7956 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7957 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
/* Disabled entry: "capwords" is compiled out. */
7958 #if 0
7959 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7960 #endif
7961
7962 #if 0
7963 /* This one is just used for debugging the implementation. */
7964 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7965 #endif
7966
7967 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
/* Sentinel marking the end of the table. */
7968 {NULL, NULL}
7969 };
7970
7971 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7972 unicode_mod(PyObject *v, PyObject *w)
7973 {
7974 if (!PyUnicode_Check(v)) {
7975 Py_INCREF(Py_NotImplemented);
7976 return Py_NotImplemented;
7977 }
7978 return PyUnicode_Format(v, w);
7979 }
7980
/* Number-protocol slots for unicode.  Only nb_remainder is filled in (with
   unicode_mod, which forwards to PyUnicode_Format); the slots listed before
   it are explicitly 0 and all slots after it are left zero-initialised. */
7981 static PyNumberMethods unicode_as_number = {
7982 0, /*nb_add*/
7983 0, /*nb_subtract*/
7984 0, /*nb_multiply*/
7985 0, /*nb_divide*/
7986 unicode_mod, /*nb_remainder*/
7987 };
7988
/* Sequence-protocol slots for unicode.  Length, concatenation, repetition,
   item access, slicing and containment are provided; the two assignment
   slots (sq_ass_item, sq_ass_slice) are 0, so no item or slice assignment
   is offered through this table. */
7989 static PySequenceMethods unicode_as_sequence = {
7990 (lenfunc) unicode_length, /* sq_length */
7991 PyUnicode_Concat, /* sq_concat */
7992 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7993 (ssizeargfunc) unicode_getitem, /* sq_item */
7994 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7995 0, /* sq_ass_item */
7996 0, /* sq_ass_slice */
7997 PyUnicode_Contains, /* sq_contains */
7998 };
7999
8000 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)8001 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8002 {
8003 if (PyIndex_Check(item)) {
8004 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8005 if (i == -1 && PyErr_Occurred())
8006 return NULL;
8007 if (i < 0)
8008 i += PyUnicode_GET_SIZE(self);
8009 return unicode_getitem(self, i);
8010 } else if (PySlice_Check(item)) {
8011 Py_ssize_t start, stop, step, slicelength, cur, i;
8012 Py_UNICODE* source_buf;
8013 Py_UNICODE* result_buf;
8014 PyObject* result;
8015
8016 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8017 &start, &stop, &step, &slicelength) < 0) {
8018 return NULL;
8019 }
8020
8021 if (slicelength <= 0) {
8022 return PyUnicode_FromUnicode(NULL, 0);
8023 } else if (start == 0 && step == 1 && slicelength == self->length &&
8024 PyUnicode_CheckExact(self)) {
8025 Py_INCREF(self);
8026 return (PyObject *)self;
8027 } else if (step == 1) {
8028 return PyUnicode_FromUnicode(self->str + start, slicelength);
8029 } else {
8030 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8031 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8032 sizeof(Py_UNICODE));
8033
8034 if (result_buf == NULL)
8035 return PyErr_NoMemory();
8036
8037 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8038 result_buf[i] = source_buf[cur];
8039 }
8040
8041 result = PyUnicode_FromUnicode(result_buf, slicelength);
8042 PyObject_FREE(result_buf);
8043 return result;
8044 }
8045 } else {
8046 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8047 return NULL;
8048 }
8049 }
8050
/* Mapping-protocol slots for unicode: length and subscription (integer
   index or slice, see unicode_subscript).  mp_ass_subscript is 0, so no
   subscript assignment is offered through this table. */
8051 static PyMappingMethods unicode_as_mapping = {
8052 (lenfunc)unicode_length, /* mp_length */
8053 (binaryfunc)unicode_subscript, /* mp_subscript */
8054 (objobjargproc)0, /* mp_ass_subscript */
8055 };
8056
8057 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8058 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8059 Py_ssize_t index,
8060 const void **ptr)
8061 {
8062 if (index != 0) {
8063 PyErr_SetString(PyExc_SystemError,
8064 "accessing non-existent unicode segment");
8065 return -1;
8066 }
8067 *ptr = (void *) self->str;
8068 return PyUnicode_GET_DATA_SIZE(self);
8069 }
8070
8071 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8072 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8073 const void **ptr)
8074 {
8075 PyErr_SetString(PyExc_TypeError,
8076 "cannot use unicode as modifiable buffer");
8077 return -1;
8078 }
8079
8080 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)8081 unicode_buffer_getsegcount(PyUnicodeObject *self,
8082 Py_ssize_t *lenp)
8083 {
8084 if (lenp)
8085 *lenp = PyUnicode_GET_DATA_SIZE(self);
8086 return 1;
8087 }
8088
8089 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8090 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8091 Py_ssize_t index,
8092 const void **ptr)
8093 {
8094 PyObject *str;
8095
8096 if (index != 0) {
8097 PyErr_SetString(PyExc_SystemError,
8098 "accessing non-existent unicode segment");
8099 return -1;
8100 }
8101 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8102 if (str == NULL)
8103 return -1;
8104 *ptr = (void *) PyString_AS_STRING(str);
8105 return PyString_GET_SIZE(str);
8106 }
8107
8108 /* Helpers for PyUnicode_Format() */
8109
8110 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)8111 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8112 {
8113 Py_ssize_t argidx = *p_argidx;
8114 if (argidx < arglen) {
8115 (*p_argidx)++;
8116 if (arglen < 0)
8117 return args;
8118 else
8119 return PyTuple_GetItem(args, argidx);
8120 }
8121 PyErr_SetString(PyExc_TypeError,
8122 "not enough arguments for format string");
8123 return NULL;
8124 }
8125
/* Conversion-flag bits gathered by PyUnicode_Format() while it parses a
   '%' specifier; each one is set by the flag character shown. */
/* '-' flag (also set when a '*' width argument is negative) */
8126 #define F_LJUST (1<<0)
/* '+' flag */
8127 #define F_SIGN (1<<1)
/* ' ' flag */
8128 #define F_BLANK (1<<2)
/* '#' flag */
8129 #define F_ALT (1<<3)
/* '0' flag */
8130 #define F_ZERO (1<<4)
8131
8132 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8133 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8134 {
8135 register Py_ssize_t i;
8136 Py_ssize_t len = strlen(charbuffer);
8137 for (i = len - 1; i >= 0; i--)
8138 buffer[i] = (Py_UNICODE) charbuffer[i];
8139
8140 return len;
8141 }
8142
8143 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8144 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8145 {
8146 Py_ssize_t result;
8147
8148 PyOS_snprintf((char *)buffer, len, format, x);
8149 result = strtounicode(buffer, (char *)buffer);
8150 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8151 }
8152
8153 /* XXX To save some code duplication, formatfloat/long/int could have been
8154 shared with stringobject.c, converting from 8-bit to Unicode after the
8155 formatting is done. */
8156
8157 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8158
8159 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8160 formatfloat(PyObject *v, int flags, int prec, int type)
8161 {
8162 char *p;
8163 PyObject *result;
8164 double x;
8165
8166 x = PyFloat_AsDouble(v);
8167 if (x == -1.0 && PyErr_Occurred())
8168 return NULL;
8169
8170 if (prec < 0)
8171 prec = 6;
8172
8173 p = PyOS_double_to_string(x, type, prec,
8174 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8175 if (p == NULL)
8176 return NULL;
8177 result = PyUnicode_FromStringAndSize(p, strlen(p));
8178 PyMem_Free(p);
8179 return result;
8180 }
8181
8182 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8183 formatlong(PyObject *val, int flags, int prec, int type)
8184 {
8185 char *buf;
8186 int i, len;
8187 PyObject *str; /* temporary string object. */
8188 PyUnicodeObject *result;
8189
8190 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8191 if (!str)
8192 return NULL;
8193 result = _PyUnicode_New(len);
8194 if (!result) {
8195 Py_DECREF(str);
8196 return NULL;
8197 }
8198 for (i = 0; i < len; i++)
8199 result->str[i] = buf[i];
8200 result->str[len] = 0;
8201 Py_DECREF(str);
8202 return (PyObject*)result;
8203 }
8204
8205 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8206 formatint(Py_UNICODE *buf,
8207 size_t buflen,
8208 int flags,
8209 int prec,
8210 int type,
8211 PyObject *v)
8212 {
8213 /* fmt = '%#.' + `prec` + 'l' + `type`
8214 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8215 * + 1 + 1
8216 * = 24
8217 */
8218 char fmt[64]; /* plenty big enough! */
8219 char *sign;
8220 long x;
8221
8222 x = PyInt_AsLong(v);
8223 if (x == -1 && PyErr_Occurred())
8224 return -1;
8225 if (x < 0 && type == 'u') {
8226 type = 'd';
8227 }
8228 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8229 sign = "-";
8230 else
8231 sign = "";
8232 if (prec < 0)
8233 prec = 1;
8234
8235 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8236 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8237 */
8238 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8239 PyErr_SetString(PyExc_OverflowError,
8240 "formatted integer is too long (precision too large?)");
8241 return -1;
8242 }
8243
8244 if ((flags & F_ALT) &&
8245 (type == 'x' || type == 'X')) {
8246 /* When converting under %#x or %#X, there are a number
8247 * of issues that cause pain:
8248 * - when 0 is being converted, the C standard leaves off
8249 * the '0x' or '0X', which is inconsistent with other
8250 * %#x/%#X conversions and inconsistent with Python's
8251 * hex() function
8252 * - there are platforms that violate the standard and
8253 * convert 0 with the '0x' or '0X'
8254 * (Metrowerks, Compaq Tru64)
8255 * - there are platforms that give '0x' when converting
8256 * under %#X, but convert 0 in accordance with the
8257 * standard (OS/2 EMX)
8258 *
8259 * We can achieve the desired consistency by inserting our
8260 * own '0x' or '0X' prefix, and substituting %x/%X in place
8261 * of %#x/%#X.
8262 *
8263 * Note that this is the same approach as used in
8264 * formatint() in stringobject.c
8265 */
8266 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8267 sign, type, prec, type);
8268 }
8269 else {
8270 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8271 sign, (flags&F_ALT) ? "#" : "",
8272 prec, type);
8273 }
8274 if (sign[0])
8275 return longtounicode(buf, buflen, fmt, -x);
8276 else
8277 return longtounicode(buf, buflen, fmt, x);
8278 }
8279
8280 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8281 formatchar(Py_UNICODE *buf,
8282 size_t buflen,
8283 PyObject *v)
8284 {
8285 PyObject *unistr;
8286 char *str;
8287 /* presume that the buffer is at least 2 characters long */
8288 if (PyUnicode_Check(v)) {
8289 if (PyUnicode_GET_SIZE(v) != 1)
8290 goto onError;
8291 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8292 }
8293
8294 else if (PyString_Check(v)) {
8295 if (PyString_GET_SIZE(v) != 1)
8296 goto onError;
8297 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8298 with a UnicodeDecodeError if 'char' is not decodable with the
8299 default encoding (usually ASCII, but it might be something else) */
8300 str = PyString_AS_STRING(v);
8301 if ((unsigned char)str[0] > 0x7F) {
8302 /* the char is not ASCII; try to decode the string using the
8303 default encoding and return -1 to let the UnicodeDecodeError
8304 be raised if the string can't be decoded */
8305 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8306 if (unistr == NULL)
8307 return -1;
8308 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8309 Py_DECREF(unistr);
8310 }
8311 else
8312 buf[0] = (Py_UNICODE)str[0];
8313 }
8314
8315 else {
8316 /* Integer input truncated to a character */
8317 long x;
8318 x = PyInt_AsLong(v);
8319 if (x == -1 && PyErr_Occurred())
8320 goto onError;
8321 #ifdef Py_UNICODE_WIDE
8322 if (x < 0 || x > 0x10ffff) {
8323 PyErr_SetString(PyExc_OverflowError,
8324 "%c arg not in range(0x110000) "
8325 "(wide Python build)");
8326 return -1;
8327 }
8328 #else
8329 if (x < 0 || x > 0xffff) {
8330 PyErr_SetString(PyExc_OverflowError,
8331 "%c arg not in range(0x10000) "
8332 "(narrow Python build)");
8333 return -1;
8334 }
8335 #endif
8336 buf[0] = (Py_UNICODE) x;
8337 }
8338 buf[1] = '\0';
8339 return 1;
8340
8341 onError:
8342 PyErr_SetString(PyExc_TypeError,
8343 "%c requires int or char");
8344 return -1;
8345 }
8346
8347 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8348
8349 FORMATBUFLEN is the length of the buffer in which the ints &
8350 chars are formatted. XXX This is a magic number. Each formatting
8351 routine does bounds checking to ensure no overflow, but a better
8352 solution may be to malloc a buffer of appropriate size for each
8353 format. For now, the current solution is sufficient.
8354 */
/* The length is counted in Py_UNICODE elements: it sizes the `formatbuf`
   array declared inside PyUnicode_Format(). */
8355 #define FORMATBUFLEN (size_t)120
8356
/* Implements `format % args` for unicode.

   format  the format object; converted with PyUnicode_FromObject()
   args    a tuple of arguments, a single non-tuple argument, or a mapping
           used for %(key)... lookups

   Returns a new reference to the formatted unicode object, or NULL with an
   exception set.

   The output is built in `result`, which is over-allocated and grown on
   demand:
     reslen  current allocated length of `result`
     rescnt  number of still-unused slots at the end of `result`
     res     write cursor into `result`
   The final length is therefore reslen - rescnt (see the last resize).
   `fmtcnt` counts the format characters that remain to be read. */
PyUnicode_Format(PyObject * format,PyObject * args)8357 PyObject *PyUnicode_Format(PyObject *format,
8358 PyObject *args)
8359 {
8360 Py_UNICODE *fmt, *res;
8361 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
/* args_owned is set once `args` has been replaced by an object looked up
   in the mapping; that reference is ours and must be released. */
8362 int args_owned = 0;
8363 PyUnicodeObject *result = NULL;
8364 PyObject *dict = NULL;
8365 PyObject *uformat;
8366
8367 if (format == NULL || args == NULL) {
8368 PyErr_BadInternalCall();
8369 return NULL;
8370 }
8371 uformat = PyUnicode_FromObject(format);
8372 if (uformat == NULL)
8373 return NULL;
8374 fmt = PyUnicode_AS_UNICODE(uformat);
8375 fmtcnt = PyUnicode_GET_SIZE(uformat);
8376
/* Initial guess for the output size: the format length plus 100 slack. */
8377 reslen = rescnt = fmtcnt + 100;
8378 result = _PyUnicode_New(reslen);
8379 if (result == NULL)
8380 goto onError;
8381 res = PyUnicode_AS_UNICODE(result);
8382
/* A tuple supplies its items one by one.  Anything else is a single
   argument, encoded as arglen == -1 / argidx == -2 so that getnextarg()
   returns `args` itself exactly once. */
8383 if (PyTuple_Check(args)) {
8384 arglen = PyTuple_Size(args);
8385 argidx = 0;
8386 }
8387 else {
8388 arglen = -1;
8389 argidx = -2;
8390 }
/* An object that supports subscription through the mapping protocol, and is
   neither a tuple nor a basestring, is usable for %(key)... lookups. */
8391 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8392 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8393 dict = args;
8394
/* Main loop: one iteration per literal character or per '%' specifier. */
8395 while (--fmtcnt >= 0) {
8396 if (*fmt != '%') {
/* Literal character: grow the result when no free slot is left, then copy. */
8397 if (--rescnt < 0) {
8398 rescnt = fmtcnt + 100;
8399 reslen += rescnt;
8400 if (_PyUnicode_Resize(&result, reslen) < 0)
8401 goto onError;
8402 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8403 --rescnt;
8404 }
8405 *res++ = *fmt++;
8406 }
8407 else {
8408 /* Got a format specifier */
8409 int flags = 0;
8410 Py_ssize_t width = -1;
8411 int prec = -1;
8412 Py_UNICODE c = '\0';
8413 Py_UNICODE fill;
8414 int isnumok;
8415 PyObject *v = NULL;
/* temp, when set, owns the object whose buffer pbuf points into. */
8416 PyObject *temp = NULL;
8417 Py_UNICODE *pbuf;
8418 Py_UNICODE sign;
8419 Py_ssize_t len;
8420 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8421
8422 fmt++;
/* Optional "(key)": look the key up in the mapping and use the value found
   as the single argument of this specifier. */
8423 if (*fmt == '(') {
8424 Py_UNICODE *keystart;
8425 Py_ssize_t keylen;
8426 PyObject *key;
8427 int pcount = 1;
8428
8429 if (dict == NULL) {
8430 PyErr_SetString(PyExc_TypeError,
8431 "format requires a mapping");
8432 goto onError;
8433 }
8434 ++fmt;
8435 --fmtcnt;
8436 keystart = fmt;
8437 /* Skip over balanced parentheses */
8438 while (pcount > 0 && --fmtcnt >= 0) {
8439 if (*fmt == ')')
8440 --pcount;
8441 else if (*fmt == '(')
8442 ++pcount;
8443 fmt++;
8444 }
/* fmt is now one past the closing ')', hence the extra -1. */
8445 keylen = fmt - keystart - 1;
8446 if (fmtcnt < 0 || pcount > 0) {
8447 PyErr_SetString(PyExc_ValueError,
8448 "incomplete format key");
8449 goto onError;
8450 }
8451 #if 0
8452 /* keys are converted to strings using UTF-8 and
8453 then looked up since Python uses strings to hold
8454 variables names etc. in its namespaces and we
8455 wouldn't want to break common idioms. */
8456 key = PyUnicode_EncodeUTF8(keystart,
8457 keylen,
8458 NULL);
8459 #else
8460 key = PyUnicode_FromUnicode(keystart, keylen);
8461 #endif
8462 if (key == NULL)
8463 goto onError;
/* Release the value fetched for a previous %(key) before replacing it. */
8464 if (args_owned) {
8465 Py_DECREF(args);
8466 args_owned = 0;
8467 }
8468 args = PyObject_GetItem(dict, key);
8469 Py_DECREF(key);
8470 if (args == NULL) {
8471 goto onError;
8472 }
8473 args_owned = 1;
/* Reset to the "single argument, not yet consumed" encoding. */
8474 arglen = -1;
8475 argidx = -2;
8476 }
/* Flag characters; the loop stops at the first non-flag, which is left in c. */
8477 while (--fmtcnt >= 0) {
8478 switch (c = *fmt++) {
8479 case '-': flags |= F_LJUST; continue;
8480 case '+': flags |= F_SIGN; continue;
8481 case ' ': flags |= F_BLANK; continue;
8482 case '#': flags |= F_ALT; continue;
8483 case '0': flags |= F_ZERO; continue;
8484 }
8485 break;
8486 }
/* Field width: '*' takes it from the next argument (a negative value means
   left-justify), otherwise it is parsed from decimal digits with an
   overflow check. */
8487 if (c == '*') {
8488 v = getnextarg(args, arglen, &argidx);
8489 if (v == NULL)
8490 goto onError;
8491 if (!PyInt_Check(v)) {
8492 PyErr_SetString(PyExc_TypeError,
8493 "* wants int");
8494 goto onError;
8495 }
8496 width = PyInt_AsSsize_t(v);
8497 if (width == -1 && PyErr_Occurred())
8498 goto onError;
8499 if (width < 0) {
8500 flags |= F_LJUST;
8501 width = -width;
8502 }
8503 if (--fmtcnt >= 0)
8504 c = *fmt++;
8505 }
8506 else if (c >= '0' && c <= '9') {
8507 width = c - '0';
8508 while (--fmtcnt >= 0) {
8509 c = *fmt++;
8510 if (c < '0' || c > '9')
8511 break;
8512 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8513 PyErr_SetString(PyExc_ValueError,
8514 "width too big");
8515 goto onError;
8516 }
8517 width = width*10 + (c - '0');
8518 }
8519 }
/* Precision: introduced by '.', then either '*' (taken from the next
   argument, negative values clamped to 0) or decimal digits. */
8520 if (c == '.') {
8521 prec = 0;
8522 if (--fmtcnt >= 0)
8523 c = *fmt++;
8524 if (c == '*') {
8525 v = getnextarg(args, arglen, &argidx);
8526 if (v == NULL)
8527 goto onError;
8528 if (!PyInt_Check(v)) {
8529 PyErr_SetString(PyExc_TypeError,
8530 "* wants int");
8531 goto onError;
8532 }
8533 prec = _PyInt_AsInt(v);
8534 if (prec == -1 && PyErr_Occurred())
8535 goto onError;
8536 if (prec < 0)
8537 prec = 0;
8538 if (--fmtcnt >= 0)
8539 c = *fmt++;
8540 }
8541 else if (c >= '0' && c <= '9') {
8542 prec = c - '0';
8543 while (--fmtcnt >= 0) {
8544 c = *fmt++;
8545 if (c < '0' || c > '9')
8546 break;
8547 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8548 PyErr_SetString(PyExc_ValueError,
8549 "prec too big");
8550 goto onError;
8551 }
8552 prec = prec*10 + (c - '0');
8553 }
8554 }
8555 } /* prec */
/* A C length modifier (h, l, L) is accepted and skipped. */
8556 if (fmtcnt >= 0) {
8557 if (c == 'h' || c == 'l' || c == 'L') {
8558 if (--fmtcnt >= 0)
8559 c = *fmt++;
8560 }
8561 }
/* The format ended before a conversion character was read. */
8562 if (fmtcnt < 0) {
8563 PyErr_SetString(PyExc_ValueError,
8564 "incomplete format");
8565 goto onError;
8566 }
/* Every conversion except a literal "%%" consumes one argument. */
8567 if (c != '%') {
8568 v = getnextarg(args, arglen, &argidx);
8569 if (v == NULL)
8570 goto onError;
8571 }
/* Each case below leaves the converted text in pbuf/len.  `sign` is set to
   1 by the numeric conversions to request sign handling further down;
   `fill` is the padding character. */
8572 sign = 0;
8573 fill = ' ';
8574 switch (c) {
8575
8576 case '%':
8577 pbuf = formatbuf;
8578 /* presume that buffer length is at least 1 */
8579 pbuf[0] = '%';
8580 len = 1;
8581 break;
8582
8583 case 's':
8584 case 'r':
/* %s of an exact unicode object uses it directly; otherwise take
   PyObject_Unicode() (%s) or PyObject_Repr() (%r) and decode a str result
   with the default encoding. */
8585 if (PyUnicode_CheckExact(v) && c == 's') {
8586 temp = v;
8587 Py_INCREF(temp);
8588 }
8589 else {
8590 PyObject *unicode;
8591 if (c == 's')
8592 temp = PyObject_Unicode(v);
8593 else
8594 temp = PyObject_Repr(v);
8595 if (temp == NULL)
8596 goto onError;
8597 if (PyUnicode_Check(temp))
8598 /* nothing to do */;
8599 else if (PyString_Check(temp)) {
8600 /* convert the string to Unicode */
8601 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8602 PyString_GET_SIZE(temp),
8603 NULL,
8604 "strict");
8605 Py_DECREF(temp);
8606 temp = unicode;
8607 if (temp == NULL)
8608 goto onError;
8609 }
8610 else {
8611 Py_DECREF(temp);
8612 PyErr_SetString(PyExc_TypeError,
8613 "%s argument has non-string str()");
8614 goto onError;
8615 }
8616 }
8617 pbuf = PyUnicode_AS_UNICODE(temp);
8618 len = PyUnicode_GET_SIZE(temp);
/* For %s/%r the precision truncates the text. */
8619 if (prec >= 0 && len > prec)
8620 len = prec;
8621 break;
8622
8623 case 'i':
8624 case 'd':
8625 case 'u':
8626 case 'o':
8627 case 'x':
8628 case 'X':
/* Integer conversions: %i is treated as %d.  The argument is turned into
   an int or long object (iobj); an int is formatted into formatbuf by
   formatint(), a long into a new unicode object by formatlong(). */
8629 if (c == 'i')
8630 c = 'd';
8631 isnumok = 0;
8632 if (PyNumber_Check(v)) {
8633 PyObject *iobj=NULL;
8634
8635 if (PyInt_Check(v) || (PyLong_Check(v))) {
8636 iobj = v;
8637 Py_INCREF(iobj);
8638 }
8639 else {
8640 iobj = PyNumber_Int(v);
8641 if (iobj==NULL) {
8642 PyErr_Clear();
8643 iobj = PyNumber_Long(v);
8644 }
8645 }
8646 if (iobj!=NULL) {
8647 if (PyInt_Check(iobj)) {
8648 isnumok = 1;
8649 pbuf = formatbuf;
8650 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8651 flags, prec, c, iobj);
8652 Py_DECREF(iobj);
8653 if (len < 0)
8654 goto onError;
8655 sign = 1;
8656 }
8657 else if (PyLong_Check(iobj)) {
8658 isnumok = 1;
8659 temp = formatlong(iobj, flags, prec, c);
8660 Py_DECREF(iobj);
8661 if (!temp)
8662 goto onError;
8663 pbuf = PyUnicode_AS_UNICODE(temp);
8664 len = PyUnicode_GET_SIZE(temp);
8665 sign = 1;
8666 }
8667 else {
8668 Py_DECREF(iobj);
8669 }
8670 }
8671 }
/* No usable integer was obtained: this TypeError is the one reported. */
8672 if (!isnumok) {
8673 PyErr_Format(PyExc_TypeError,
8674 "%%%c format: a number is required, "
8675 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8676 goto onError;
8677 }
8678 if (flags & F_ZERO)
8679 fill = '0';
8680 break;
8681
8682 case 'e':
8683 case 'E':
8684 case 'f':
8685 case 'F':
8686 case 'g':
8687 case 'G':
8688 temp = formatfloat(v, flags, prec, c);
8689 if (temp == NULL)
8690 goto onError;
8691 pbuf = PyUnicode_AS_UNICODE(temp);
8692 len = PyUnicode_GET_SIZE(temp);
8693 sign = 1;
8694 if (flags & F_ZERO)
8695 fill = '0';
8696 break;
8697
8698 case 'c':
8699 pbuf = formatbuf;
8700 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8701 if (len < 0)
8702 goto onError;
8703 break;
8704
8705 default:
/* Unknown conversion: report the character (or '?' when it is outside the
   31..126 range) and its index in the format string. */
8706 PyErr_Format(PyExc_ValueError,
8707 "unsupported format character '%c' (0x%x) "
8708 "at index %zd",
8709 (31<=c && c<=126) ? (char)c : '?',
8710 (int)c,
8711 (Py_ssize_t)(fmt - 1 -
8712 PyUnicode_AS_UNICODE(uformat)));
8713 goto onError;
8714 }
/* Numeric conversions: strip a leading '-'/'+' from the text and remember
   it in `sign`; otherwise derive the sign character from the '+' / ' '
   flags, or 0 for none. */
8715 if (sign) {
8716 if (*pbuf == '-' || *pbuf == '+') {
8717 sign = *pbuf++;
8718 len--;
8719 }
8720 else if (flags & F_SIGN)
8721 sign = '+';
8722 else if (flags & F_BLANK)
8723 sign = ' ';
8724 else
8725 sign = 0;
8726 }
8727 if (width < len)
8728 width = len;
/* Make sure the field (plus an optional sign) fits; otherwise grow the
   result, guarding against the new length overflowing to a negative value. */
8729 if (rescnt - (sign != 0) < width) {
8730 reslen -= rescnt;
8731 rescnt = width + fmtcnt + 100;
8732 reslen += rescnt;
8733 if (reslen < 0) {
8734 Py_XDECREF(temp);
8735 PyErr_NoMemory();
8736 goto onError;
8737 }
8738 if (_PyUnicode_Resize(&result, reslen) < 0) {
8739 Py_XDECREF(temp);
8740 goto onError;
8741 }
8742 res = PyUnicode_AS_UNICODE(result)
8743 + reslen - rescnt;
8744 }
/* With a non-space fill the sign (and the 0x/0X prefix below) is written
   before the padding; with space fill it is written after the padding. */
8745 if (sign) {
8746 if (fill != ' ')
8747 *res++ = sign;
8748 rescnt--;
8749 if (width > len)
8750 width--;
8751 }
8752 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8753 assert(pbuf[0] == '0');
8754 assert(pbuf[1] == c);
8755 if (fill != ' ') {
8756 *res++ = *pbuf++;
8757 *res++ = *pbuf++;
8758 }
8759 rescnt -= 2;
8760 width -= 2;
8761 if (width < 0)
8762 width = 0;
8763 len -= 2;
8764 }
/* Left padding, unless left-justified. */
8765 if (width > len && !(flags & F_LJUST)) {
8766 do {
8767 --rescnt;
8768 *res++ = fill;
8769 } while (--width > len);
8770 }
8771 if (fill == ' ') {
8772 if (sign)
8773 *res++ = sign;
8774 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8775 assert(pbuf[0] == '0');
8776 assert(pbuf[1] == c);
8777 *res++ = *pbuf++;
8778 *res++ = *pbuf++;
8779 }
8780 }
/* The converted text itself, followed by right padding with spaces for
   whatever width is still left (the left-justified case). */
8781 Py_UNICODE_COPY(res, pbuf, len);
8782 res += len;
8783 rescnt -= len;
8784 while (--width >= len) {
8785 --rescnt;
8786 *res++ = ' ';
8787 }
/* With a mapping, an argument still unconsumed after this conversion is an
   error. */
8788 if (dict && (argidx < arglen) && c != '%') {
8789 PyErr_SetString(PyExc_TypeError,
8790 "not all arguments converted during string formatting");
8791 Py_XDECREF(temp);
8792 goto onError;
8793 }
8794 Py_XDECREF(temp);
8795 } /* '%' */
8796 } /* until end */
/* Without a mapping, leftover arguments are an error. */
8797 if (argidx < arglen && !dict) {
8798 PyErr_SetString(PyExc_TypeError,
8799 "not all arguments converted during string formatting");
8800 goto onError;
8801 }
8802
/* Trim the over-allocated result down to the characters actually written. */
8803 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8804 goto onError;
8805 if (args_owned) {
8806 Py_DECREF(args);
8807 }
8808 Py_DECREF(uformat);
8809 return (PyObject *)result;
8810
/* Common failure exit: drop the partial result, the converted format and,
   if we own it, the looked-up argument. */
8811 onError:
8812 Py_XDECREF(result);
8813 Py_DECREF(uformat);
8814 if (args_owned) {
8815 Py_DECREF(args);
8816 }
8817 return NULL;
8818 }
8819
/* Old-style buffer interface slots for unicode, in order: read buffer,
   write buffer (always fails with TypeError), segment count (always 1)
   and char buffer (the default-encoded form of the string). */
8820 static PyBufferProcs unicode_as_buffer = {
8821 (readbufferproc) unicode_buffer_getreadbuf,
8822 (writebufferproc) unicode_buffer_getwritebuf,
8823 (segcountproc) unicode_buffer_getsegcount,
8824 (charbufferproc) unicode_buffer_getcharbuf,
8825 };
8826
8827 static PyObject *
8828 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8829
8830 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8831 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8832 {
8833 PyObject *x = NULL;
8834 static char *kwlist[] = {"string", "encoding", "errors", 0};
8835 char *encoding = NULL;
8836 char *errors = NULL;
8837
8838 if (type != &PyUnicode_Type)
8839 return unicode_subtype_new(type, args, kwds);
8840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8841 kwlist, &x, &encoding, &errors))
8842 return NULL;
8843 if (x == NULL)
8844 return (PyObject *)_PyUnicode_New(0);
8845 if (encoding == NULL && errors == NULL)
8846 return PyObject_Unicode(x);
8847 else
8848 return PyUnicode_FromEncodedObject(x, encoding, errors);
8849 }
8850
8851 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8852 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8853 {
8854 PyUnicodeObject *tmp, *pnew;
8855 Py_ssize_t n;
8856
8857 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8858 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8859 if (tmp == NULL)
8860 return NULL;
8861 assert(PyUnicode_Check(tmp));
8862 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8863 if (pnew == NULL) {
8864 Py_DECREF(tmp);
8865 return NULL;
8866 }
8867 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8868 if (pnew->str == NULL) {
8869 _Py_ForgetReference((PyObject *)pnew);
8870 PyObject_Del(pnew);
8871 Py_DECREF(tmp);
8872 return PyErr_NoMemory();
8873 }
8874 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8875 pnew->length = n;
8876 pnew->hash = tmp->hash;
8877 Py_DECREF(tmp);
8878 return (PyObject *)pnew;
8879 }
8880
/* Docstring for the unicode type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(object='') -> unicode object\n\
unicode(string[, encoding[, errors]]) -> unicode object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8888
/* Type object for unicode.

   The table is initialized positionally, so the trailing comment on each
   line names the PyTypeObject slot being filled.  Points worth noting:
   - tp_base is PyBaseString_Type and Py_TPFLAGS_BASETYPE is set, so the
     type both derives from basestring and may itself be subclassed;
   - instances are created through tp_new (unicode_new); tp_init is 0;
   - tp_alloc is 0 in this static table although unicode_subtype_new()
     calls type->tp_alloc -- presumably the slot is filled in when
     PyType_Ready() runs on the type (confirm). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                                  /* tp_name */
    sizeof(PyUnicodeObject),                    /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,                /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_compare */
    unicode_repr,                               /* tp_repr */
    &unicode_as_number,                         /* tp_as_number */
    &unicode_as_sequence,                       /* tp_as_sequence */
    &unicode_as_mapping,                        /* tp_as_mapping */
    (hashfunc) unicode_hash,                    /* tp_hash*/
    0,                                          /* tp_call*/
    (reprfunc) unicode_str,                     /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    &unicode_as_buffer,                         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                                /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    PyUnicode_RichCompare,                      /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    unicode_methods,                            /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    &PyBaseString_Type,                         /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    unicode_new,                                /* tp_new */
    PyObject_Del,                               /* tp_free */
};
8932
8933 /* Initialize the Unicode implementation */
8934
_PyUnicode_Init(void)8935 void _PyUnicode_Init(void)
8936 {
8937 /* XXX - move this array to unicodectype.c ? */
8938 Py_UNICODE linebreak[] = {
8939 0x000A, /* LINE FEED */
8940 0x000D, /* CARRIAGE RETURN */
8941 0x001C, /* FILE SEPARATOR */
8942 0x001D, /* GROUP SEPARATOR */
8943 0x001E, /* RECORD SEPARATOR */
8944 0x0085, /* NEXT LINE */
8945 0x2028, /* LINE SEPARATOR */
8946 0x2029, /* PARAGRAPH SEPARATOR */
8947 };
8948
8949 /* Init the implementation */
8950 if (!unicode_empty) {
8951 unicode_empty = _PyUnicode_New(0);
8952 if (!unicode_empty)
8953 return;
8954 }
8955
8956 if (PyType_Ready(&PyUnicode_Type) < 0)
8957 Py_FatalError("Can't initialize 'unicode'");
8958
8959 /* initialize the linebreak bloom filter */
8960 bloom_linebreak = make_bloom_mask(
8961 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8962 );
8963
8964 PyType_Ready(&EncodingMapType);
8965
8966 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8967 Py_FatalError("Can't initialize field name iterator type");
8968
8969 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8970 Py_FatalError("Can't initialize formatter iter type");
8971 }
8972
8973 /* Finalize the Unicode implementation */
8974
8975 int
PyUnicode_ClearFreeList(void)8976 PyUnicode_ClearFreeList(void)
8977 {
8978 int freelist_size = numfree;
8979 PyUnicodeObject *u;
8980
8981 for (u = free_list; u != NULL;) {
8982 PyUnicodeObject *v = u;
8983 u = *(PyUnicodeObject **)u;
8984 if (v->str)
8985 PyObject_DEL(v->str);
8986 Py_XDECREF(v->defenc);
8987 PyObject_Del(v);
8988 numfree--;
8989 }
8990 free_list = NULL;
8991 assert(numfree == 0);
8992 return freelist_size;
8993 }
8994
8995 void
_PyUnicode_Fini(void)8996 _PyUnicode_Fini(void)
8997 {
8998 int i;
8999
9000 Py_CLEAR(unicode_empty);
9001
9002 for (i = 0; i < 256; i++)
9003 Py_CLEAR(unicode_latin1[i]);
9004
9005 (void)PyUnicode_ClearFreeList();
9006 }
9007
9008 #ifdef __cplusplus
9009 }
9010 #endif
9011