1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "internal/pystate.h"
44 #include "ucnhash.h"
45 #include "bytes_methods.h"
46 #include "stringlib/eq.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
52 /*[clinic input]
53 class str "PyObject *" "&PyUnicode_Type"
54 [clinic start generated code]*/
55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57 /*[python input]
58 class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68 [python start generated code]*/
69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
70
71 /* --- Globals ------------------------------------------------------------
72
73 NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
76
77 */
78
79
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83
84 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
85 #define MAX_UNICODE 0x10ffff
86
/* Object check used by the assert()s of the accessor macros in this file.
   Debug builds call _PyUnicode_CheckConsistency() with check_content == 0;
   other builds only perform the PyUnicode_Check() type test. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
92
/* Field accessors for the string object layouts (PyASCIIObject,
   PyCompactUnicodeObject, PyUnicodeObject).  The macros that consist of a
   plain cast plus member access do no checking and may also be used as
   assignment targets; the others assert on the object first. */

/* Pointer to the UTF-8 representation.  For compact ASCII strings
   PyUnicode_UTF8() yields the character data that directly follows the
   PyASCIIObject header instead of reading the utf8 member. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* Length of the UTF-8 representation.  For compact ASCII strings
   PyUnicode_UTF8_LENGTH() yields the string length instead of reading the
   utf8_length member. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length, state bit field and hash stored in the PyASCIIObject header */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* asserting variants for the kind and the length */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data pointer member of a PyUnicodeObject */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
127
/* File-local replacement for PyUnicode_READY(): after asserting on the
   object, evaluates to 0 when the string is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
134
/* true if the utf8 pointer is the string's own data buffer; must not be
   used on compact ASCII strings (asserted) */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the string's own data buffer.
   The comparison uses the macro argument `op`; it must not depend on the
   caller happening to have a variable called `unicode` in scope. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* true if the Unicode object owns a separately allocated UTF-8 memory
   block, i.e. it is not compact ASCII, its utf8 pointer is set, and that
   pointer is not the string's own data buffer */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) && \
      _PyUnicode_UTF8(op) && \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
149
/* true if the Unicode object owns a separately allocated wstr memory
   block: wstr is set and either the string is not ready yet or wstr is
   not the string's own data buffer */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Generic character-width conversion helper.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (both of type "from_type *"); to points to the
   destination buffer of type "to_type *".  Every source character is
   cast to to_type and stored; the bulk of the input is handled four
   characters per iteration and the remainder one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                    \
        to_type *_dst = (to_type *)(to);                    \
        const from_type *_src = (from_type *)(begin);       \
        const from_type *_stop = (from_type *)(end);        \
        Py_ssize_t _count = (_stop) - (_src);               \
        const from_type *_stop4 =                           \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);          \
        for (; _src < (_stop4); _src += 4, _dst += 4) {     \
            _dst[0] = (to_type) _src[0];                    \
            _dst[1] = (to_type) _src[1];                    \
            _dst[2] = (to_type) _src[2];                    \
            _dst[3] = (to_type) _src[3];                    \
        }                                                   \
        for (; _src < (_stop); ++_src, ++_dst)              \
            *_dst = (to_type) *_src;                        \
    } while (0)
180
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#endif
188
189 /* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
196 */
197 static PyObject *interned = NULL;
198
199 /* The empty Unicode object is shared to improve performance. */
200 static PyObject *unicode_empty = NULL;
201
/* Take a new reference to the shared empty string, creating the
   singleton with PyUnicode_New(0, 0) on first use.  If that creation
   fails, unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                              \
    do {                                                        \
        if (unicode_empty == NULL) {                            \
            unicode_empty = PyUnicode_New(0, 0);                \
            if (unicode_empty != NULL) {                        \
                Py_INCREF(unicode_empty);                       \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                   \
        }                                                       \
        else                                                    \
            Py_INCREF(unicode_empty);                           \
    } while (0)
214
/* Return the shared empty string from the calling function after taking
   a reference to it via _Py_INCREF_UNICODE_EMPTY().  The returned value
   is whatever unicode_empty holds afterwards. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        _Py_INCREF_UNICODE_EMPTY();     \
        return unicode_empty;           \
    } while (0)
220
/* Store `length` copies of the code point `value` into the character
   buffer `data`, starting at character index `start`.

   `kind` selects the character width of the buffer and must be one of
   PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND;
   `value` must fit into that width (asserted).

   Every macro argument is parenthesized so that arbitrary expressions can
   be passed, and the temporaries carry a _fill_ prefix so that they cannot
   capture a caller argument that is itself named ch, to or end. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= (start)); \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            assert((value) <= 0xff); \
            Py_UCS1 _fill_ch = (unsigned char)(value); \
            Py_UCS1 *_fill_to = (Py_UCS1 *)(data) + (start); \
            memset(_fill_to, _fill_ch, (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert((value) <= 0xffff); \
            Py_UCS2 _fill_ch = (Py_UCS2)(value); \
            Py_UCS2 *_fill_to = (Py_UCS2 *)(data) + (start); \
            const Py_UCS2 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert((value) <= MAX_UNICODE); \
            Py_UCS4 _fill_ch = (value); \
            Py_UCS4 *_fill_to = (Py_UCS4 *)(data) + (start); \
            const Py_UCS4 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
252
253
254 /* Forward declaration */
255 static inline int
256 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257
258 /* List of static strings. */
259 static _Py_Identifier *static_strings = NULL;
260
261 /* Single character Unicode strings in the Latin-1 range are being
262 shared as well. */
263 static PyObject *unicode_latin1[256] = {NULL};
264
/* Fast detection of the most frequent whitespace characters: a table
   with one entry per code point in the ASCII range, 1 for whitespace
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
295
296 /* forward */
297 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
298 static PyObject* get_latin1_char(unsigned char ch);
299 static int unicode_modifiable(PyObject *unicode);
300
301
302 static PyObject *
303 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
304 static PyObject *
305 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306 static PyObject *
307 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308
309 static PyObject *
310 unicode_encode_call_errorhandler(const char *errors,
311 PyObject **errorHandler,const char *encoding, const char *reason,
312 PyObject *unicode, PyObject **exceptionObject,
313 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314
315 static void
316 raise_encode_exception(PyObject **exceptionObject,
317 const char *encoding,
318 PyObject *unicode,
319 Py_ssize_t startpos, Py_ssize_t endpos,
320 const char *reason);
321
/* Same for linebreaks: a table with one entry per code point in the
   ASCII range, 1 for line break characters and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
349
350 static int convert_uc(PyObject *obj, void *addr);
351
352 #include "clinic/unicodeobject.c.h"
353
/* Numeric codes for codec error handler names, see get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN=0,            /* never returned by get_error_handler() */
    _Py_ERROR_STRICT,               /* "strict", also used for NULL */
    _Py_ERROR_SURROGATEESCAPE,      /* "surrogateescape" */
    _Py_ERROR_REPLACE,              /* "replace" */
    _Py_ERROR_IGNORE,               /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE,     /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS,        /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER                 /* any other name */
} _Py_error_handler;

/* Translate the name of an error handler into its _Py_error_handler code.

   errors may be NULL, which is treated like "strict".  The comparison is
   an exact, case-sensitive strcmp(); a name that is not in the table
   yields _Py_ERROR_OTHER. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler code;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++)
    {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].code;
        }
    }
    return _Py_ERROR_OTHER;
}
392
393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394 This function is kept for backward compatibility with the old API. */
395 Py_UNICODE
PyUnicode_GetMax(void)396 PyUnicode_GetMax(void)
397 {
398 #ifdef Py_UNICODE_WIDE
399 return 0x10FFFF;
400 #else
401 /* This is actually an illegal character, so it should
402 not be passed to unichr. */
403 return 0xFFFF;
404 #endif
405 }
406
#ifdef Py_DEBUG
/* Debug-build invariant checker for str objects.

   Asserts that the state flags, the data/utf8/wstr pointers and the
   cached lengths of op agree with each other for the layout op uses.
   If check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to verify that the kind matches the
   largest character and that the buffer ends with a null character.

   Violations trip an assert(); the function itself always returns 1 so
   that a call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that only has a wstr representation */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact, ready string */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522
523 static PyObject*
unicode_result_wchar(PyObject * unicode)524 unicode_result_wchar(PyObject *unicode)
525 {
526 #ifndef Py_DEBUG
527 Py_ssize_t len;
528
529 len = _PyUnicode_WSTR_LENGTH(unicode);
530 if (len == 0) {
531 Py_DECREF(unicode);
532 _Py_RETURN_UNICODE_EMPTY();
533 }
534
535 if (len == 1) {
536 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
537 if ((Py_UCS4)ch < 256) {
538 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539 Py_DECREF(unicode);
540 return latin1_char;
541 }
542 }
543
544 if (_PyUnicode_Ready(unicode) < 0) {
545 Py_DECREF(unicode);
546 return NULL;
547 }
548 #else
549 assert(Py_REFCNT(unicode) == 1);
550
551 /* don't make the result ready in debug mode to ensure that the caller
552 makes the string ready before using it */
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 #endif
555 return unicode;
556 }
557
558 static PyObject*
unicode_result_ready(PyObject * unicode)559 unicode_result_ready(PyObject *unicode)
560 {
561 Py_ssize_t length;
562
563 length = PyUnicode_GET_LENGTH(unicode);
564 if (length == 0) {
565 if (unicode != unicode_empty) {
566 Py_DECREF(unicode);
567 _Py_RETURN_UNICODE_EMPTY();
568 }
569 return unicode_empty;
570 }
571
572 if (length == 1) {
573 void *data = PyUnicode_DATA(unicode);
574 int kind = PyUnicode_KIND(unicode);
575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
576 if (ch < 256) {
577 PyObject *latin1_char = unicode_latin1[ch];
578 if (latin1_char != NULL) {
579 if (unicode != latin1_char) {
580 Py_INCREF(latin1_char);
581 Py_DECREF(unicode);
582 }
583 return latin1_char;
584 }
585 else {
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 Py_INCREF(unicode);
588 unicode_latin1[ch] = unicode;
589 return unicode;
590 }
591 }
592 }
593
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 return unicode;
596 }
597
598 static PyObject*
unicode_result(PyObject * unicode)599 unicode_result(PyObject *unicode)
600 {
601 assert(_PyUnicode_CHECK(unicode));
602 if (PyUnicode_IS_READY(unicode))
603 return unicode_result_ready(unicode);
604 else
605 return unicode_result_wchar(unicode);
606 }
607
608 static PyObject*
unicode_result_unchanged(PyObject * unicode)609 unicode_result_unchanged(PyObject *unicode)
610 {
611 if (PyUnicode_CheckExact(unicode)) {
612 if (PyUnicode_READY(unicode) == -1)
613 return NULL;
614 Py_INCREF(unicode);
615 return unicode;
616 }
617 else
618 /* Subtype -- return genuine unicode string with the same value. */
619 return _PyUnicode_Copy(unicode);
620 }
621
622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623 ASCII, Latin1, UTF-8, etc. */
624 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)625 backslashreplace(_PyBytesWriter *writer, char *str,
626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627 {
628 Py_ssize_t size, i;
629 Py_UCS4 ch;
630 enum PyUnicode_Kind kind;
631 void *data;
632
633 assert(PyUnicode_IS_READY(unicode));
634 kind = PyUnicode_KIND(unicode);
635 data = PyUnicode_DATA(unicode);
636
637 size = 0;
638 /* determine replacement size */
639 for (i = collstart; i < collend; ++i) {
640 Py_ssize_t incr;
641
642 ch = PyUnicode_READ(kind, data, i);
643 if (ch < 0x100)
644 incr = 2+2;
645 else if (ch < 0x10000)
646 incr = 2+4;
647 else {
648 assert(ch <= MAX_UNICODE);
649 incr = 2+8;
650 }
651 if (size > PY_SSIZE_T_MAX - incr) {
652 PyErr_SetString(PyExc_OverflowError,
653 "encoded result is too long for a Python string");
654 return NULL;
655 }
656 size += incr;
657 }
658
659 str = _PyBytesWriter_Prepare(writer, str, size);
660 if (str == NULL)
661 return NULL;
662
663 /* generate replacement */
664 for (i = collstart; i < collend; ++i) {
665 ch = PyUnicode_READ(kind, data, i);
666 *str++ = '\\';
667 if (ch >= 0x00010000) {
668 *str++ = 'U';
669 *str++ = Py_hexdigits[(ch>>28)&0xf];
670 *str++ = Py_hexdigits[(ch>>24)&0xf];
671 *str++ = Py_hexdigits[(ch>>20)&0xf];
672 *str++ = Py_hexdigits[(ch>>16)&0xf];
673 *str++ = Py_hexdigits[(ch>>12)&0xf];
674 *str++ = Py_hexdigits[(ch>>8)&0xf];
675 }
676 else if (ch >= 0x100) {
677 *str++ = 'u';
678 *str++ = Py_hexdigits[(ch>>12)&0xf];
679 *str++ = Py_hexdigits[(ch>>8)&0xf];
680 }
681 else
682 *str++ = 'x';
683 *str++ = Py_hexdigits[(ch>>4)&0xf];
684 *str++ = Py_hexdigits[ch&0xf];
685 }
686 return str;
687 }
688
689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690 ASCII, Latin1, UTF-8, etc. */
691 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)692 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
693 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694 {
695 Py_ssize_t size, i;
696 Py_UCS4 ch;
697 enum PyUnicode_Kind kind;
698 void *data;
699
700 assert(PyUnicode_IS_READY(unicode));
701 kind = PyUnicode_KIND(unicode);
702 data = PyUnicode_DATA(unicode);
703
704 size = 0;
705 /* determine replacement size */
706 for (i = collstart; i < collend; ++i) {
707 Py_ssize_t incr;
708
709 ch = PyUnicode_READ(kind, data, i);
710 if (ch < 10)
711 incr = 2+1+1;
712 else if (ch < 100)
713 incr = 2+2+1;
714 else if (ch < 1000)
715 incr = 2+3+1;
716 else if (ch < 10000)
717 incr = 2+4+1;
718 else if (ch < 100000)
719 incr = 2+5+1;
720 else if (ch < 1000000)
721 incr = 2+6+1;
722 else {
723 assert(ch <= MAX_UNICODE);
724 incr = 2+7+1;
725 }
726 if (size > PY_SSIZE_T_MAX - incr) {
727 PyErr_SetString(PyExc_OverflowError,
728 "encoded result is too long for a Python string");
729 return NULL;
730 }
731 size += incr;
732 }
733
734 str = _PyBytesWriter_Prepare(writer, str, size);
735 if (str == NULL)
736 return NULL;
737
738 /* generate replacement */
739 for (i = collstart; i < collend; ++i) {
740 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741 }
742 return str;
743 }
744
745 /* --- Bloom Filters ----------------------------------------------------- */
746
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each unicode character as the
   bit index. */
750
751 /* the linebreak mask is set up by Unicode_Init below */
752
753 #if LONG_BIT >= 128
754 #define BLOOM_WIDTH 128
755 #elif LONG_BIT >= 64
756 #define BLOOM_WIDTH 64
757 #elif LONG_BIT >= 32
758 #define BLOOM_WIDTH 32
759 #else
760 #error "LONG_BIT is smaller than 32"
761 #endif
762
763 #define BLOOM_MASK unsigned long
764
765 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
766
/* Non-zero if the bit selected by the low bits of ch is set in mask, i.e.
   if ch may be one of the characters the mask was built from.  Both
   arguments are parenthesized so that expressions such as `a | b` can be
   passed as the mask. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
768
/* Line break test for the character ch: values below 128 are looked up in
   ascii_linebreak[]; any other value has to pass the bloom_linebreak
   filter before Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U                                                        \
         ? ascii_linebreak[(ch)]                                        \
         : (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
772
773 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
775 {
776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
777 do { \
778 TYPE *data = (TYPE *)PTR; \
779 TYPE *end = data + LEN; \
780 Py_UCS4 ch; \
781 for (; data != end; data++) { \
782 ch = *data; \
783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784 } \
785 break; \
786 } while (0)
787
788 /* calculate simple bloom-style bitmask for a given unicode string */
789
790 BLOOM_MASK mask;
791
792 mask = 0;
793 switch (kind) {
794 case PyUnicode_1BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796 break;
797 case PyUnicode_2BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799 break;
800 case PyUnicode_4BYTE_KIND:
801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802 break;
803 default:
804 Py_UNREACHABLE();
805 }
806 return mask;
807
808 #undef BLOOM_UPDATE
809 }
810
811 static int
ensure_unicode(PyObject * obj)812 ensure_unicode(PyObject *obj)
813 {
814 if (!PyUnicode_Check(obj)) {
815 PyErr_Format(PyExc_TypeError,
816 "must be str, not %.100s",
817 Py_TYPE(obj)->tp_name);
818 return -1;
819 }
820 return PyUnicode_READY(obj);
821 }
822
823 /* Compilation of templated routines */
824
825 #include "stringlib/asciilib.h"
826 #include "stringlib/fastsearch.h"
827 #include "stringlib/partition.h"
828 #include "stringlib/split.h"
829 #include "stringlib/count.h"
830 #include "stringlib/find.h"
831 #include "stringlib/find_max_char.h"
832 #include "stringlib/undef.h"
833
834 #include "stringlib/ucs1lib.h"
835 #include "stringlib/fastsearch.h"
836 #include "stringlib/partition.h"
837 #include "stringlib/split.h"
838 #include "stringlib/count.h"
839 #include "stringlib/find.h"
840 #include "stringlib/replace.h"
841 #include "stringlib/find_max_char.h"
842 #include "stringlib/undef.h"
843
844 #include "stringlib/ucs2lib.h"
845 #include "stringlib/fastsearch.h"
846 #include "stringlib/partition.h"
847 #include "stringlib/split.h"
848 #include "stringlib/count.h"
849 #include "stringlib/find.h"
850 #include "stringlib/replace.h"
851 #include "stringlib/find_max_char.h"
852 #include "stringlib/undef.h"
853
854 #include "stringlib/ucs4lib.h"
855 #include "stringlib/fastsearch.h"
856 #include "stringlib/partition.h"
857 #include "stringlib/split.h"
858 #include "stringlib/count.h"
859 #include "stringlib/find.h"
860 #include "stringlib/replace.h"
861 #include "stringlib/find_max_char.h"
862 #include "stringlib/undef.h"
863
864 #include "stringlib/unicodedefs.h"
865 #include "stringlib/fastsearch.h"
866 #include "stringlib/count.h"
867 #include "stringlib/find.h"
868 #include "stringlib/undef.h"
869
870 /* --- Unicode Object ----------------------------------------------------- */
871
872 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)873 findchar(const void *s, int kind,
874 Py_ssize_t size, Py_UCS4 ch,
875 int direction)
876 {
877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
879 if ((Py_UCS1) ch != ch)
880 return -1;
881 if (direction > 0)
882 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883 else
884 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
885 case PyUnicode_2BYTE_KIND:
886 if ((Py_UCS2) ch != ch)
887 return -1;
888 if (direction > 0)
889 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890 else
891 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
892 case PyUnicode_4BYTE_KIND:
893 if (direction > 0)
894 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895 else
896 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
897 default:
898 Py_UNREACHABLE();
899 }
900 }
901
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing happens if the string
   did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
920
921 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)922 resize_compact(PyObject *unicode, Py_ssize_t length)
923 {
924 Py_ssize_t char_size;
925 Py_ssize_t struct_size;
926 Py_ssize_t new_size;
927 int share_wstr;
928 PyObject *new_unicode;
929 #ifdef Py_DEBUG
930 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931 #endif
932
933 assert(unicode_modifiable(unicode));
934 assert(PyUnicode_IS_READY(unicode));
935 assert(PyUnicode_IS_COMPACT(unicode));
936
937 char_size = PyUnicode_KIND(unicode);
938 if (PyUnicode_IS_ASCII(unicode))
939 struct_size = sizeof(PyASCIIObject);
940 else
941 struct_size = sizeof(PyCompactUnicodeObject);
942 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
943
944 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945 PyErr_NoMemory();
946 return NULL;
947 }
948 new_size = (struct_size + (length + 1) * char_size);
949
950 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951 PyObject_DEL(_PyUnicode_UTF8(unicode));
952 _PyUnicode_UTF8(unicode) = NULL;
953 _PyUnicode_UTF8_LENGTH(unicode) = 0;
954 }
955 _Py_DEC_REFTOTAL;
956 _Py_ForgetReference(unicode);
957
958 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
959 if (new_unicode == NULL) {
960 _Py_NewReference(unicode);
961 PyErr_NoMemory();
962 return NULL;
963 }
964 unicode = new_unicode;
965 _Py_NewReference(unicode);
966
967 _PyUnicode_LENGTH(unicode) = length;
968 if (share_wstr) {
969 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
970 if (!PyUnicode_IS_ASCII(unicode))
971 _PyUnicode_WSTR_LENGTH(unicode) = length;
972 }
973 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974 PyObject_DEL(_PyUnicode_WSTR(unicode));
975 _PyUnicode_WSTR(unicode) = NULL;
976 if (!PyUnicode_IS_ASCII(unicode))
977 _PyUnicode_WSTR_LENGTH(unicode) = 0;
978 }
979 #ifdef Py_DEBUG
980 unicode_fill_invalid(unicode, old_length);
981 #endif
982 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983 length, 0);
984 assert(_PyUnicode_CheckConsistency(unicode, 0));
985 return unicode;
986 }
987
988 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)989 resize_inplace(PyObject *unicode, Py_ssize_t length)
990 {
991 wchar_t *wstr;
992 Py_ssize_t new_size;
993 assert(!PyUnicode_IS_COMPACT(unicode));
994 assert(Py_REFCNT(unicode) == 1);
995
996 if (PyUnicode_IS_READY(unicode)) {
997 Py_ssize_t char_size;
998 int share_wstr, share_utf8;
999 void *data;
1000 #ifdef Py_DEBUG
1001 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002 #endif
1003
1004 data = _PyUnicode_DATA_ANY(unicode);
1005 char_size = PyUnicode_KIND(unicode);
1006 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1008
1009 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010 PyErr_NoMemory();
1011 return -1;
1012 }
1013 new_size = (length + 1) * char_size;
1014
1015 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016 {
1017 PyObject_DEL(_PyUnicode_UTF8(unicode));
1018 _PyUnicode_UTF8(unicode) = NULL;
1019 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020 }
1021
1022 data = (PyObject *)PyObject_REALLOC(data, new_size);
1023 if (data == NULL) {
1024 PyErr_NoMemory();
1025 return -1;
1026 }
1027 _PyUnicode_DATA_ANY(unicode) = data;
1028 if (share_wstr) {
1029 _PyUnicode_WSTR(unicode) = data;
1030 _PyUnicode_WSTR_LENGTH(unicode) = length;
1031 }
1032 if (share_utf8) {
1033 _PyUnicode_UTF8(unicode) = data;
1034 _PyUnicode_UTF8_LENGTH(unicode) = length;
1035 }
1036 _PyUnicode_LENGTH(unicode) = length;
1037 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1038 #ifdef Py_DEBUG
1039 unicode_fill_invalid(unicode, old_length);
1040 #endif
1041 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1042 assert(_PyUnicode_CheckConsistency(unicode, 0));
1043 return 0;
1044 }
1045 }
1046 assert(_PyUnicode_WSTR(unicode) != NULL);
1047
1048 /* check for integer overflow */
1049 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1050 PyErr_NoMemory();
1051 return -1;
1052 }
1053 new_size = sizeof(wchar_t) * (length + 1);
1054 wstr = _PyUnicode_WSTR(unicode);
1055 wstr = PyObject_REALLOC(wstr, new_size);
1056 if (!wstr) {
1057 PyErr_NoMemory();
1058 return -1;
1059 }
1060 _PyUnicode_WSTR(unicode) = wstr;
1061 _PyUnicode_WSTR(unicode)[length] = 0;
1062 _PyUnicode_WSTR_LENGTH(unicode) = length;
1063 assert(_PyUnicode_CheckConsistency(unicode, 0));
1064 return 0;
1065 }
1066
1067 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1068 resize_copy(PyObject *unicode, Py_ssize_t length)
1069 {
1070 Py_ssize_t copy_length;
1071 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1072 PyObject *copy;
1073
1074 assert(PyUnicode_IS_READY(unicode));
1075
1076 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077 if (copy == NULL)
1078 return NULL;
1079
1080 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1082 return copy;
1083 }
1084 else {
1085 PyObject *w;
1086
1087 w = (PyObject*)_PyUnicode_New(length);
1088 if (w == NULL)
1089 return NULL;
1090 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091 copy_length = Py_MIN(copy_length, length);
1092 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1093 copy_length * sizeof(wchar_t));
1094 return w;
1095 }
1096 }
1097
1098 /* We allocate one more byte to make sure the string is
1099 Ux0000 terminated; some code (e.g. new_identifier)
1100 relies on that.
1101
1102 XXX This allocator could further be enhanced by assuring that the
1103 free list never reduces its size below 1.
1104
1105 */
1106
1107 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1108 _PyUnicode_New(Py_ssize_t length)
1109 {
1110 PyUnicodeObject *unicode;
1111 size_t new_size;
1112
1113 /* Optimization for empty strings */
1114 if (length == 0 && unicode_empty != NULL) {
1115 Py_INCREF(unicode_empty);
1116 return (PyUnicodeObject*)unicode_empty;
1117 }
1118
1119 /* Ensure we won't overflow the size. */
1120 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1121 return (PyUnicodeObject *)PyErr_NoMemory();
1122 }
1123 if (length < 0) {
1124 PyErr_SetString(PyExc_SystemError,
1125 "Negative size passed to _PyUnicode_New");
1126 return NULL;
1127 }
1128
1129 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130 if (unicode == NULL)
1131 return NULL;
1132 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1133
1134 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_HASH(unicode) = -1;
1136 _PyUnicode_STATE(unicode).interned = 0;
1137 _PyUnicode_STATE(unicode).kind = 0;
1138 _PyUnicode_STATE(unicode).compact = 0;
1139 _PyUnicode_STATE(unicode).ready = 0;
1140 _PyUnicode_STATE(unicode).ascii = 0;
1141 _PyUnicode_DATA_ANY(unicode) = NULL;
1142 _PyUnicode_LENGTH(unicode) = 0;
1143 _PyUnicode_UTF8(unicode) = NULL;
1144 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145
1146 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147 if (!_PyUnicode_WSTR(unicode)) {
1148 Py_DECREF(unicode);
1149 PyErr_NoMemory();
1150 return NULL;
1151 }
1152
1153 /* Initialize the first element to guard against cases where
1154 * the caller fails before initializing str -- unicode_resize()
1155 * reads str[0], and the Keep-Alive optimization can keep memory
1156 * allocated for str alive across a call to unicode_dealloc(unicode).
1157 * We don't want unicode_resize to read uninitialized memory in
1158 * that case.
1159 */
1160 _PyUnicode_WSTR(unicode)[0] = 0;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
1162
1163 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1164 return unicode;
1165 }
1166
1167 static const char*
unicode_kind_name(PyObject * unicode)1168 unicode_kind_name(PyObject *unicode)
1169 {
1170 /* don't check consistency: unicode_kind_name() is called from
1171 _PyUnicode_Dump() */
1172 if (!PyUnicode_IS_COMPACT(unicode))
1173 {
1174 if (!PyUnicode_IS_READY(unicode))
1175 return "wstr";
1176 switch (PyUnicode_KIND(unicode))
1177 {
1178 case PyUnicode_1BYTE_KIND:
1179 if (PyUnicode_IS_ASCII(unicode))
1180 return "legacy ascii";
1181 else
1182 return "legacy latin1";
1183 case PyUnicode_2BYTE_KIND:
1184 return "legacy UCS2";
1185 case PyUnicode_4BYTE_KIND:
1186 return "legacy UCS4";
1187 default:
1188 return "<legacy invalid kind>";
1189 }
1190 }
1191 assert(PyUnicode_IS_READY(unicode));
1192 switch (PyUnicode_KIND(unicode)) {
1193 case PyUnicode_1BYTE_KIND:
1194 if (PyUnicode_IS_ASCII(unicode))
1195 return "ascii";
1196 else
1197 return "latin1";
1198 case PyUnicode_2BYTE_KIND:
1199 return "UCS2";
1200 case PyUnicode_4BYTE_KIND:
1201 return "UCS4";
1202 default:
1203 return "<invalid compact kind>";
1204 }
1205 }
1206
1207 #ifdef Py_DEBUG
1208 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1212
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
_PyUnicode_data(void * unicode)1216 void *_PyUnicode_data(void *unicode){
1217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224 }
1225
1226 void
_PyUnicode_Dump(PyObject * op)1227 _PyUnicode_Dump(PyObject *op)
1228 {
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
1230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
1233
1234 if (ascii->state.compact)
1235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
1241 else
1242 data = unicode->data.any;
1243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
1245
1246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
1249
1250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
1254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
1256 }
1257 printf(", data=%p\n", data);
1258 }
1259 #endif
1260
1261 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263 {
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
1267 enum PyUnicode_Kind kind;
1268 int is_sharing, is_ascii;
1269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
1275 return unicode_empty;
1276 }
1277
1278 is_ascii = 0;
1279 is_sharing = 0;
1280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
1282 kind = PyUnicode_1BYTE_KIND;
1283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
1288 kind = PyUnicode_1BYTE_KIND;
1289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
1292 kind = PyUnicode_2BYTE_KIND;
1293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
1298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
1303 kind = PyUnicode_4BYTE_KIND;
1304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
1337 _PyUnicode_STATE(unicode).kind = kind;
1338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
1345 else if (kind == PyUnicode_1BYTE_KIND) {
1346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1349 unicode->utf8 = NULL;
1350 unicode->utf8_length = 0;
1351 }
1352 else {
1353 unicode->utf8 = NULL;
1354 unicode->utf8_length = 0;
1355 if (kind == PyUnicode_2BYTE_KIND)
1356 ((Py_UCS2*)data)[size] = 0;
1357 else /* kind == PyUnicode_4BYTE_KIND */
1358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
1368 #ifdef Py_DEBUG
1369 unicode_fill_invalid((PyObject*)unicode, 0);
1370 #endif
1371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1372 return obj;
1373 }
1374
#if SIZEOF_WCHAR_T == 2
/* Helper for 16-bit wchar_t builds: copy [begin, end) into the UCS4 buffer
   of `unicode`, combining each high/low surrogate pair into one code point.
   Unpaired surrogates are copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Two input units produce a single output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1414
1415 static int
unicode_check_modifiable(PyObject * unicode)1416 unicode_check_modifiable(PyObject *unicode)
1417 {
1418 if (!unicode_modifiable(unicode)) {
1419 PyErr_SetString(PyExc_SystemError,
1420 "Cannot modify a string currently used");
1421 return -1;
1422 }
1423 return 0;
1424 }
1425
1426 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1427 _copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
1430 {
1431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
1433
1434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
1437 assert(PyUnicode_Check(from));
1438 assert(PyUnicode_IS_READY(from));
1439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1440
1441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
1445 if (how_many == 0)
1446 return 0;
1447
1448 from_kind = PyUnicode_KIND(from);
1449 from_data = PyUnicode_DATA(from);
1450 to_kind = PyUnicode_KIND(to);
1451 to_data = PyUnicode_DATA(to);
1452
1453 #ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465 #endif
1466
1467 if (from_kind == to_kind) {
1468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
1471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
1473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
1478 }
1479 memcpy((char*)to_data + to_kind * to_start,
1480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
1482 }
1483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
1485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
1492 }
1493 else if (from_kind == PyUnicode_1BYTE_KIND
1494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
1502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
1512 }
1513 else {
1514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
1516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
1548 Py_UNREACHABLE();
1549 }
1550 }
1551 else {
1552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1553 Py_UCS4 ch;
1554 Py_ssize_t i;
1555
1556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1558 if (ch > to_maxchar)
1559 return -1;
1560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
1562 }
1563 }
1564 return 0;
1565 }
1566
1567 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1568 _PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1571 {
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573 }
1574
1575 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579 {
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
1587 if (PyUnicode_READY(from) == -1)
1588 return -1;
1589 if (PyUnicode_READY(to) == -1)
1590 return -1;
1591
1592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
1596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
1600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
1607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
1609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
1616 if (unicode_check_modifiable(to))
1617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
1628 return how_many;
1629 }
1630
1631 /* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1639 {
1640 const wchar_t *iter;
1641 Py_UCS4 ch;
1642
1643 assert(num_surrogates != NULL && maxchar != NULL);
1644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
1648 #if SIZEOF_WCHAR_T == 2
1649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
1656 }
1657 else
1658 #endif
1659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
1672 }
1673 return 0;
1674 }
1675
/* Convert a legacy string that only has a wchar_t buffer into a ready
   string: scan the buffer for its largest code point, build the narrowest
   canonical representation able to hold it (1, 2 or 4 bytes per
   character), and free the wchar_t buffer unless the canonical data can
   share it.

   Returns 0 on success.  Returns -1 with an exception set when the buffer
   contains a code point above MAX_UNICODE (ValueError raised by
   find_maxchar_surrogates()) or when memory cannot be allocated. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the wchar_t buffer decides which representation is
       needed. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* Latin-1 or ASCII: one byte per character plus the terminator. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the utf8 pointer is set to the same buffer as
               the character data. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The characters have been copied; release the wchar_t buffer. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow every character into a freshly allocated 2-byte buffer,
           then drop the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into one code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) byte count against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the canonical data simply
           shares the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1804
1805 static void
unicode_dealloc(PyObject * unicode)1806 unicode_dealloc(PyObject *unicode)
1807 {
1808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1809 case SSTATE_NOT_INTERNED:
1810 break;
1811
1812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
1815 if (PyDict_DelItem(interned, unicode) != 0)
1816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
1819
1820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
1822 /* fall through */
1823
1824 default:
1825 Py_FatalError("Inconsistent interned string state.");
1826 }
1827
1828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1829 PyObject_DEL(_PyUnicode_WSTR(unicode));
1830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1831 PyObject_DEL(_PyUnicode_UTF8(unicode));
1832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1834
1835 Py_TYPE(unicode)->tp_free(unicode);
1836 }
1837
1838 #ifdef Py_DEBUG
1839 static int
unicode_is_singleton(PyObject * unicode)1840 unicode_is_singleton(PyObject *unicode)
1841 {
1842 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1843 if (unicode == unicode_empty)
1844 return 1;
1845 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1846 {
1847 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1848 if (ch < 256 && unicode_latin1[ch] == unicode)
1849 return 1;
1850 }
1851 return 0;
1852 }
1853 #endif
1854
1855 static int
unicode_modifiable(PyObject * unicode)1856 unicode_modifiable(PyObject *unicode)
1857 {
1858 assert(_PyUnicode_CHECK(unicode));
1859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
1861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
1863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
1865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
1867 #ifdef Py_DEBUG
1868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
1870 #endif
1871 return 1;
1872 }
1873
1874 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876 {
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
1887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
1894 if (length == 0) {
1895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
1897 return -1;
1898 Py_SETREF(*p_unicode, unicode_empty);
1899 return 0;
1900 }
1901
1902 if (!unicode_modifiable(unicode)) {
1903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
1905 return -1;
1906 Py_SETREF(*p_unicode, copy);
1907 return 0;
1908 }
1909
1910 if (PyUnicode_IS_COMPACT(unicode)) {
1911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
1913 return -1;
1914 *p_unicode = new_unicode;
1915 return 0;
1916 }
1917 return resize_inplace(unicode, length);
1918 }
1919
1920 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1922 {
1923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
1929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
1935 }
1936
1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1938
1939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
1942 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
1945 {
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
1948 const char *end = str + len;
1949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
1952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1953 #ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960 #endif
1961 memcpy((char *) data + index, str, len);
1962 break;
1963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
1969 for (; str < end; ++ucs2, ++str)
1970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1973 break;
1974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
1981 for (; str < end; ++ucs4, ++str)
1982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1985 }
1986 }
1987 }
1988
1989 static PyObject*
get_latin1_char(unsigned char ch)1990 get_latin1_char(unsigned char ch)
1991 {
1992 PyObject *unicode = unicode_latin1[ch];
1993 if (!unicode) {
1994 unicode = PyUnicode_New(1, ch);
1995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1998 assert(_PyUnicode_CheckConsistency(unicode, 1));
1999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
2002 return unicode;
2003 }
2004
2005 static PyObject*
unicode_char(Py_UCS4 ch)2006 unicode_char(Py_UCS4 ch)
2007 {
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
2012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
2015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
2018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2022 } else {
2023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028 }
2029
2030 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2032 {
2033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042 }
2043
2044 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046 {
2047 PyObject *unicode;
2048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
2051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
2059
2060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
2062
2063 /* Optimization for empty strings */
2064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
2066
2067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
2069 if (size == 1 && (Py_UCS4)*u < 256)
2070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
2074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
2076 return NULL;
2077
2078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2079 if (!unicode)
2080 return NULL;
2081
2082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
2084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088 #if Py_UNICODE_SIZE == 2
2089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2090 #else
2091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093 #endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096 #if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
2099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2100 #else
2101 assert(num_surrogates == 0);
2102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2103 #endif
2104 break;
2105 default:
2106 Py_UNREACHABLE();
2107 }
2108
2109 return unicode_result(unicode);
2110 }
2111
2112 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2113 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2114 {
2115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
2117 "Negative size passed to PyUnicode_FromStringAndSize");
2118 return NULL;
2119 }
2120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
2124 }
2125
2126 PyObject *
PyUnicode_FromString(const char * u)2127 PyUnicode_FromString(const char *u)
2128 {
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
2134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2135 }
2136
2137 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2138 _PyUnicode_FromId(_Py_Identifier *id)
2139 {
2140 if (!id->object) {
2141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
2144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
2151 return id->object;
2152 }
2153
2154 void
_PyUnicode_ClearStaticStrings()2155 _PyUnicode_ClearStaticStrings()
2156 {
2157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
2159 Py_CLEAR(s->object);
2160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
2163 }
2164 static_strings = NULL;
2165 }
2166
2167 /* Internal function, doesn't check maximum character */
2168
2169 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2170 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2171 {
2172 const unsigned char *s = (const unsigned char *)buffer;
2173 PyObject *unicode;
2174 if (size == 1) {
2175 #ifdef Py_DEBUG
2176 assert((unsigned char)s[0] < 128);
2177 #endif
2178 return get_latin1_char(s[0]);
2179 }
2180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
2182 return NULL;
2183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
2186 }
2187
2188 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2189 kind_maxchar_limit(unsigned int kind)
2190 {
2191 switch (kind) {
2192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
2199 Py_UNREACHABLE();
2200 }
2201 }
2202
2203 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2204 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2205 {
2206 PyObject *res;
2207 unsigned char max_char;
2208
2209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
2211 assert(size > 0);
2212 if (size == 1)
2213 return get_latin1_char(u[0]);
2214
2215 max_char = ucs1lib_find_max_char(u, u + size);
2216 res = PyUnicode_New(size, max_char);
2217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2220 assert(_PyUnicode_CheckConsistency(res, 1));
2221 return res;
2222 }
2223
2224 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2225 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2226 {
2227 PyObject *res;
2228 Py_UCS2 max_char;
2229
2230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
2232 assert(size > 0);
2233 if (size == 1)
2234 return unicode_char(u[0]);
2235
2236 max_char = ucs2lib_find_max_char(u, u + size);
2237 res = PyUnicode_New(size, max_char);
2238 if (!res)
2239 return NULL;
2240 if (max_char >= 256)
2241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
2246 assert(_PyUnicode_CheckConsistency(res, 1));
2247 return res;
2248 }
2249
2250 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2251 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2252 {
2253 PyObject *res;
2254 Py_UCS4 max_char;
2255
2256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
2258 assert(size > 0);
2259 if (size == 1)
2260 return unicode_char(u[0]);
2261
2262 max_char = ucs4lib_find_max_char(u, u + size);
2263 res = PyUnicode_New(size, max_char);
2264 if (!res)
2265 return NULL;
2266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
2273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2274 assert(_PyUnicode_CheckConsistency(res, 1));
2275 return res;
2276 }
2277
2278 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2279 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280 {
2281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
2285 switch (kind) {
2286 case PyUnicode_1BYTE_KIND:
2287 return _PyUnicode_FromUCS1(buffer, size);
2288 case PyUnicode_2BYTE_KIND:
2289 return _PyUnicode_FromUCS2(buffer, size);
2290 case PyUnicode_4BYTE_KIND:
2291 return _PyUnicode_FromUCS4(buffer, size);
2292 default:
2293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
2295 }
2296 }
2297
2298 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2299 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300 {
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
2315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
2318 kind = PyUnicode_KIND(unicode);
2319 startptr = PyUnicode_DATA(unicode);
2320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
2322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
2329 default:
2330 Py_UNREACHABLE();
2331 }
2332 }
2333
/* Ensure that a string uses the most efficient storage. If it does not,
   create a new string of the right kind and store it in *p_unicode.
   Write NULL into *p_unicode on error. */
2337 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2338 unicode_adjust_maxchar(PyObject **p_unicode)
2339 {
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
2342 Py_ssize_t len;
2343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
2358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
2364 }
2365 else {
2366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2367 assert(kind == PyUnicode_4BYTE_KIND);
2368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
2371 }
2372 copy = PyUnicode_New(len, max_char);
2373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377 }
2378
2379 PyObject*
_PyUnicode_Copy(PyObject * unicode)2380 _PyUnicode_Copy(PyObject *unicode)
2381 {
2382 Py_ssize_t length;
2383 PyObject *copy;
2384
2385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
2389 if (PyUnicode_READY(unicode) == -1)
2390 return NULL;
2391
2392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
2398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2399 length * PyUnicode_KIND(unicode));
2400 assert(_PyUnicode_CheckConsistency(copy, 1));
2401 return copy;
2402 }
2403
2404
2405 /* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
2407
2408 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2409 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410 {
2411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
2415 if (PyUnicode_READY(s) == -1)
2416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
2421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2422 return NULL;
2423 }
2424 switch (kind) {
2425 case PyUnicode_2BYTE_KIND:
2426 result = PyMem_New(Py_UCS2, len);
2427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
2435 return result;
2436 case PyUnicode_4BYTE_KIND:
2437 result = PyMem_New(Py_UCS4, len);
2438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
2446 }
2447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
2455 return result;
2456 default:
2457 break;
2458 }
2459 PyErr_SetString(PyExc_SystemError, "invalid kind");
2460 return NULL;
2461 }
2462
2463 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2464 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466 {
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
2479 target = PyMem_New(Py_UCS4, targetlen);
2480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
2494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2497 }
2498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
2504 memcpy(target, data, len * sizeof(Py_UCS4));
2505 }
2506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509 }
2510
2511 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2512 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514 {
2515 if (target == NULL || targetsize < 0) {
2516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520 }
2521
2522 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2523 PyUnicode_AsUCS4Copy(PyObject *string)
2524 {
2525 return as_ucs4(string, NULL, 0, 1);
2526 }
2527
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).

   In the expression below, (SIZEOF_LONG_LONG*53-1) / 22 + 1 is the
   integer ceiling of SIZEOF_LONG_LONG*53/22 (the digit bound) and the
   remaining + 1 accounts for the sign.  The macro is used as the size of
   the sprintf() scratch buffers in unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2532
2533 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2534 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536 {
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571 }
2572
2573 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2574 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575 Py_ssize_t width, Py_ssize_t precision)
2576 {
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
2582 if (precision == -1) {
2583 length = strlen(str);
2584 }
2585 else {
2586 length = 0;
2587 while (length < precision && str[length]) {
2588 length++;
2589 }
2590 }
2591 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2592 if (unicode == NULL)
2593 return -1;
2594
2595 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2596 Py_DECREF(unicode);
2597 return res;
2598 }
2599
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' character.  The function parses an optional '0'
   flag, an optional width, an optional ".precision" and an optional
   l / ll / z length modifier, fetches the matching argument(s) from
   *vargs and appends the formatted text to writer.

   Returns a pointer to the first format character after the directive,
   or NULL on error.  For an unknown conversion character the rest of the
   format string, starting at the '%', is copied to the output and the
   returned pointer addresses the terminating NUL. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts: needed by the default case */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* overflow check before multiplying by 10 and adding the digit */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow check as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A modifier is only consumed when it is directly followed by
       d, u or i; otherwise f stays on the modifier character and the
       switch below ends up in its default case. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format:
       stop over-allocating the writer's buffer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number first (no width/precision is passed to
           sprintf); padding is applied afterwards. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* NOTE(review): 'x' always reads an int; the modifier parsing
               above never accepts l/ll/z in front of 'x'. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is at least the number of characters
           produced by sprintf (this also covers precision == -1). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Width padding: '0' when the zero flag was given, else ' '.
           NOTE(review): all padding is written before the sprintf output,
           so for a negative number the fill precedes the '-' sign. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Precision padding is always done with '0'. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the digits (and the NUL) right to make room for "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a Unicode object, written as it is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Consumes two arguments: a Unicode object (may be NULL) and a
           C string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* a literal percent sign */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2897
2898 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2899 PyUnicode_FromFormatV(const char *format, va_list vargs)
2900 {
2901 va_list vargs2;
2902 const char *f;
2903 _PyUnicodeWriter writer;
2904
2905 _PyUnicodeWriter_Init(&writer);
2906 writer.min_length = strlen(format) + 100;
2907 writer.overallocate = 1;
2908
2909 // Copy varags to be able to pass a reference to a subfunction.
2910 va_copy(vargs2, vargs);
2911
2912 for (f = format; *f; ) {
2913 if (*f == '%') {
2914 f = unicode_fromformat_arg(&writer, f, &vargs2);
2915 if (f == NULL)
2916 goto fail;
2917 }
2918 else {
2919 const char *p;
2920 Py_ssize_t len;
2921
2922 p = f;
2923 do
2924 {
2925 if ((unsigned char)*p > 127) {
2926 PyErr_Format(PyExc_ValueError,
2927 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2928 "string, got a non-ASCII byte: 0x%02x",
2929 (unsigned char)*p);
2930 goto fail;
2931 }
2932 p++;
2933 }
2934 while (*p != '\0' && *p != '%');
2935 len = p - f;
2936
2937 if (*p == '\0')
2938 writer.overallocate = 0;
2939
2940 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2941 goto fail;
2942
2943 f = p;
2944 }
2945 }
2946 va_end(vargs2);
2947 return _PyUnicodeWriter_Finish(&writer);
2948
2949 fail:
2950 va_end(vargs2);
2951 _PyUnicodeWriter_Dealloc(&writer);
2952 return NULL;
2953 }
2954
2955 PyObject *
PyUnicode_FromFormat(const char * format,...)2956 PyUnicode_FromFormat(const char *format, ...)
2957 {
2958 PyObject* ret;
2959 va_list vargs;
2960
2961 #ifdef HAVE_STDARG_PROTOTYPES
2962 va_start(vargs, format);
2963 #else
2964 va_start(vargs);
2965 #endif
2966 ret = PyUnicode_FromFormatV(format, vargs);
2967 va_end(vargs);
2968 return ret;
2969 }
2970
2971 #ifdef HAVE_WCHAR_H
2972
2973 /* Convert a Unicode object to a wide character string.
2974
2975 - If w is NULL: return the number of wide characters (including the null
2976 character) required to convert the unicode object. Ignore size argument.
2977
2978 - Otherwise: return the number of wide characters (excluding the null
2979 character) written into w. Write at most size wide characters (including
2980 the null character). */
2981 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2982 PyUnicode_AsWideChar(PyObject *unicode,
2983 wchar_t *w,
2984 Py_ssize_t size)
2985 {
2986 Py_ssize_t res;
2987 const wchar_t *wstr;
2988
2989 if (unicode == NULL) {
2990 PyErr_BadInternalCall();
2991 return -1;
2992 }
2993 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2994 if (wstr == NULL)
2995 return -1;
2996
2997 if (w != NULL) {
2998 if (size > res)
2999 size = res + 1;
3000 else
3001 res = size;
3002 memcpy(w, wstr, size * sizeof(wchar_t));
3003 return res;
3004 }
3005 else
3006 return res + 1;
3007 }
3008
3009 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3010 PyUnicode_AsWideCharString(PyObject *unicode,
3011 Py_ssize_t *size)
3012 {
3013 const wchar_t *wstr;
3014 wchar_t *buffer;
3015 Py_ssize_t buflen;
3016
3017 if (unicode == NULL) {
3018 PyErr_BadInternalCall();
3019 return NULL;
3020 }
3021
3022 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3023 if (wstr == NULL) {
3024 return NULL;
3025 }
3026 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3027 PyErr_SetString(PyExc_ValueError,
3028 "embedded null character");
3029 return NULL;
3030 }
3031
3032 buffer = PyMem_NEW(wchar_t, buflen + 1);
3033 if (buffer == NULL) {
3034 PyErr_NoMemory();
3035 return NULL;
3036 }
3037 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3038 if (size != NULL)
3039 *size = buflen;
3040 return buffer;
3041 }
3042
3043 #endif /* HAVE_WCHAR_H */
3044
3045 PyObject *
PyUnicode_FromOrdinal(int ordinal)3046 PyUnicode_FromOrdinal(int ordinal)
3047 {
3048 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3049 PyErr_SetString(PyExc_ValueError,
3050 "chr() arg not in range(0x110000)");
3051 return NULL;
3052 }
3053
3054 return unicode_char((Py_UCS4)ordinal);
3055 }
3056
3057 PyObject *
PyUnicode_FromObject(PyObject * obj)3058 PyUnicode_FromObject(PyObject *obj)
3059 {
3060 /* XXX Perhaps we should make this API an alias of
3061 PyObject_Str() instead ?! */
3062 if (PyUnicode_CheckExact(obj)) {
3063 if (PyUnicode_READY(obj) == -1)
3064 return NULL;
3065 Py_INCREF(obj);
3066 return obj;
3067 }
3068 if (PyUnicode_Check(obj)) {
3069 /* For a Unicode subtype that's not a Unicode object,
3070 return a true Unicode object with the same data. */
3071 return _PyUnicode_Copy(obj);
3072 }
3073 PyErr_Format(PyExc_TypeError,
3074 "Can't convert '%.100s' object to str implicitly",
3075 Py_TYPE(obj)->tp_name);
3076 return NULL;
3077 }
3078
3079 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3080 PyUnicode_FromEncodedObject(PyObject *obj,
3081 const char *encoding,
3082 const char *errors)
3083 {
3084 Py_buffer buffer;
3085 PyObject *v;
3086
3087 if (obj == NULL) {
3088 PyErr_BadInternalCall();
3089 return NULL;
3090 }
3091
3092 /* Decoding bytes objects is the most common case and should be fast */
3093 if (PyBytes_Check(obj)) {
3094 if (PyBytes_GET_SIZE(obj) == 0)
3095 _Py_RETURN_UNICODE_EMPTY();
3096 v = PyUnicode_Decode(
3097 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3098 encoding, errors);
3099 return v;
3100 }
3101
3102 if (PyUnicode_Check(obj)) {
3103 PyErr_SetString(PyExc_TypeError,
3104 "decoding str is not supported");
3105 return NULL;
3106 }
3107
3108 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3109 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3110 PyErr_Format(PyExc_TypeError,
3111 "decoding to str: need a bytes-like object, %.80s found",
3112 Py_TYPE(obj)->tp_name);
3113 return NULL;
3114 }
3115
3116 if (buffer.len == 0) {
3117 PyBuffer_Release(&buffer);
3118 _Py_RETURN_UNICODE_EMPTY();
3119 }
3120
3121 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3122 PyBuffer_Release(&buffer);
3123 return v;
3124 }
3125
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased); every run of
   other characters is replaced by a single '_', except at the very start
   of the output and at the end of the name, where it is dropped.

   encoding:  NUL-terminated input name (must not be NULL).
   lower:     output buffer, receives the NUL-terminated normalized name.
   lower_len: size of the output buffer in bytes.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1, which includes the case lower_len == 0).  On error the
   contents of lower are not NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without room for even the terminating NUL nothing can be written,
       and &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the NUL */
    punct = 0;                       /* pending separator run? */
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the separator run seen since the previous
               kept character, unless nothing has been written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3174
3175 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3176 PyUnicode_Decode(const char *s,
3177 Py_ssize_t size,
3178 const char *encoding,
3179 const char *errors)
3180 {
3181 PyObject *buffer = NULL, *unicode;
3182 Py_buffer info;
3183 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3184
3185 if (encoding == NULL) {
3186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187 }
3188
3189 /* Shortcuts for common default encodings */
3190 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3191 char *lower = buflower;
3192
3193 /* Fast paths */
3194 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3195 lower += 3;
3196 if (*lower == '_') {
3197 /* Match "utf8" and "utf_8" */
3198 lower++;
3199 }
3200
3201 if (lower[0] == '8' && lower[1] == 0) {
3202 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3203 }
3204 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3205 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3206 }
3207 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3208 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3209 }
3210 }
3211 else {
3212 if (strcmp(lower, "ascii") == 0
3213 || strcmp(lower, "us_ascii") == 0) {
3214 return PyUnicode_DecodeASCII(s, size, errors);
3215 }
3216 #ifdef MS_WINDOWS
3217 else if (strcmp(lower, "mbcs") == 0) {
3218 return PyUnicode_DecodeMBCS(s, size, errors);
3219 }
3220 #endif
3221 else if (strcmp(lower, "latin1") == 0
3222 || strcmp(lower, "latin_1") == 0
3223 || strcmp(lower, "iso_8859_1") == 0
3224 || strcmp(lower, "iso8859_1") == 0) {
3225 return PyUnicode_DecodeLatin1(s, size, errors);
3226 }
3227 }
3228 }
3229
3230 /* Decode via the codec registry */
3231 buffer = NULL;
3232 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3233 goto onError;
3234 buffer = PyMemoryView_FromBuffer(&info);
3235 if (buffer == NULL)
3236 goto onError;
3237 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3238 if (unicode == NULL)
3239 goto onError;
3240 if (!PyUnicode_Check(unicode)) {
3241 PyErr_Format(PyExc_TypeError,
3242 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3243 "use codecs.decode() to decode to arbitrary types",
3244 encoding,
3245 Py_TYPE(unicode)->tp_name);
3246 Py_DECREF(unicode);
3247 goto onError;
3248 }
3249 Py_DECREF(buffer);
3250 return unicode_result(unicode);
3251
3252 onError:
3253 Py_XDECREF(buffer);
3254 return NULL;
3255 }
3256
3257 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3258 PyUnicode_AsDecodedObject(PyObject *unicode,
3259 const char *encoding,
3260 const char *errors)
3261 {
3262 if (!PyUnicode_Check(unicode)) {
3263 PyErr_BadArgument();
3264 return NULL;
3265 }
3266
3267 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3268 "PyUnicode_AsDecodedObject() is deprecated; "
3269 "use PyCodec_Decode() to decode from str", 1) < 0)
3270 return NULL;
3271
3272 if (encoding == NULL)
3273 encoding = PyUnicode_GetDefaultEncoding();
3274
3275 /* Decode via the codec registry */
3276 return PyCodec_Decode(unicode, encoding, errors);
3277 }
3278
3279 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3280 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3281 const char *encoding,
3282 const char *errors)
3283 {
3284 PyObject *v;
3285
3286 if (!PyUnicode_Check(unicode)) {
3287 PyErr_BadArgument();
3288 goto onError;
3289 }
3290
3291 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3292 "PyUnicode_AsDecodedUnicode() is deprecated; "
3293 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3294 return NULL;
3295
3296 if (encoding == NULL)
3297 encoding = PyUnicode_GetDefaultEncoding();
3298
3299 /* Decode via the codec registry */
3300 v = PyCodec_Decode(unicode, encoding, errors);
3301 if (v == NULL)
3302 goto onError;
3303 if (!PyUnicode_Check(v)) {
3304 PyErr_Format(PyExc_TypeError,
3305 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3306 "use codecs.decode() to decode to arbitrary types",
3307 encoding,
3308 Py_TYPE(unicode)->tp_name);
3309 Py_DECREF(v);
3310 goto onError;
3311 }
3312 return unicode_result(v);
3313
3314 onError:
3315 return NULL;
3316 }
3317
3318 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3319 PyUnicode_Encode(const Py_UNICODE *s,
3320 Py_ssize_t size,
3321 const char *encoding,
3322 const char *errors)
3323 {
3324 PyObject *v, *unicode;
3325
3326 unicode = PyUnicode_FromWideChar(s, size);
3327 if (unicode == NULL)
3328 return NULL;
3329 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3330 Py_DECREF(unicode);
3331 return v;
3332 }
3333
3334 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3335 PyUnicode_AsEncodedObject(PyObject *unicode,
3336 const char *encoding,
3337 const char *errors)
3338 {
3339 PyObject *v;
3340
3341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
3343 goto onError;
3344 }
3345
3346 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3347 "PyUnicode_AsEncodedObject() is deprecated; "
3348 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3349 "or PyCodec_Encode() for generic encoding", 1) < 0)
3350 return NULL;
3351
3352 if (encoding == NULL)
3353 encoding = PyUnicode_GetDefaultEncoding();
3354
3355 /* Encode via the codec registry */
3356 v = PyCodec_Encode(unicode, encoding, errors);
3357 if (v == NULL)
3358 goto onError;
3359 return v;
3360
3361 onError:
3362 return NULL;
3363 }
3364
3365 static int
locale_error_handler(const char * errors,int * surrogateescape)3366 locale_error_handler(const char *errors, int *surrogateescape)
3367 {
3368 _Py_error_handler error_handler = get_error_handler(errors);
3369 switch (error_handler)
3370 {
3371 case _Py_ERROR_STRICT:
3372 *surrogateescape = 0;
3373 return 0;
3374 case _Py_ERROR_SURROGATEESCAPE:
3375 *surrogateescape = 1;
3376 return 0;
3377 default:
3378 PyErr_Format(PyExc_ValueError,
3379 "only 'strict' and 'surrogateescape' error handlers "
3380 "are supported, not '%s'",
3381 errors);
3382 return -1;
3383 }
3384 }
3385
3386 static PyObject *
unicode_encode_locale(PyObject * unicode,const char * errors,int current_locale)3387 unicode_encode_locale(PyObject *unicode, const char *errors,
3388 int current_locale)
3389 {
3390 int surrogateescape;
3391 if (locale_error_handler(errors, &surrogateescape) < 0)
3392 return NULL;
3393
3394 Py_ssize_t wlen;
3395 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3396 if (wstr == NULL) {
3397 return NULL;
3398 }
3399
3400 if ((size_t)wlen != wcslen(wstr)) {
3401 PyErr_SetString(PyExc_ValueError, "embedded null character");
3402 PyMem_Free(wstr);
3403 return NULL;
3404 }
3405
3406 char *str;
3407 size_t error_pos;
3408 const char *reason;
3409 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3410 current_locale, surrogateescape);
3411 PyMem_Free(wstr);
3412
3413 if (res != 0) {
3414 if (res == -2) {
3415 PyObject *exc;
3416 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3417 "locale", unicode,
3418 (Py_ssize_t)error_pos,
3419 (Py_ssize_t)(error_pos+1),
3420 reason);
3421 if (exc != NULL) {
3422 PyCodec_StrictErrors(exc);
3423 Py_DECREF(exc);
3424 }
3425 }
3426 else {
3427 PyErr_NoMemory();
3428 }
3429 return NULL;
3430 }
3431
3432 PyObject *bytes = PyBytes_FromString(str);
3433 PyMem_RawFree(str);
3434 return bytes;
3435 }
3436
3437 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3439 {
3440 return unicode_encode_locale(unicode, errors, 1);
3441 }
3442
3443 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3444 PyUnicode_EncodeFSDefault(PyObject *unicode)
3445 {
3446 #if defined(__APPLE__)
3447 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3448 #else
3449 PyInterpreterState *interp = PyThreadState_GET()->interp;
3450 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3451 cannot use it to encode and decode filenames before it is loaded. Load
3452 the Python codec requires to encode at least its own filename. Use the C
3453 version of the locale codec until the codec registry is initialized and
3454 the Python codec is loaded.
3455
3456 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3457 cannot only rely on it: check also interp->fscodec_initialized for
3458 subinterpreters. */
3459 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3460 return PyUnicode_AsEncodedString(unicode,
3461 Py_FileSystemDefaultEncoding,
3462 Py_FileSystemDefaultEncodeErrors);
3463 }
3464 else {
3465 return unicode_encode_locale(unicode,
3466 Py_FileSystemDefaultEncodeErrors, 0);
3467 }
3468 #endif
3469 }
3470
3471 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3472 PyUnicode_AsEncodedString(PyObject *unicode,
3473 const char *encoding,
3474 const char *errors)
3475 {
3476 PyObject *v;
3477 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3478
3479 if (!PyUnicode_Check(unicode)) {
3480 PyErr_BadArgument();
3481 return NULL;
3482 }
3483
3484 if (encoding == NULL) {
3485 return _PyUnicode_AsUTF8String(unicode, errors);
3486 }
3487
3488 /* Shortcuts for common default encodings */
3489 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3490 char *lower = buflower;
3491
3492 /* Fast paths */
3493 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3494 lower += 3;
3495 if (*lower == '_') {
3496 /* Match "utf8" and "utf_8" */
3497 lower++;
3498 }
3499
3500 if (lower[0] == '8' && lower[1] == 0) {
3501 return _PyUnicode_AsUTF8String(unicode, errors);
3502 }
3503 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3504 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3505 }
3506 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3507 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3508 }
3509 }
3510 else {
3511 if (strcmp(lower, "ascii") == 0
3512 || strcmp(lower, "us_ascii") == 0) {
3513 return _PyUnicode_AsASCIIString(unicode, errors);
3514 }
3515 #ifdef MS_WINDOWS
3516 else if (strcmp(lower, "mbcs") == 0) {
3517 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3518 }
3519 #endif
3520 else if (strcmp(lower, "latin1") == 0 ||
3521 strcmp(lower, "latin_1") == 0 ||
3522 strcmp(lower, "iso_8859_1") == 0 ||
3523 strcmp(lower, "iso8859_1") == 0) {
3524 return _PyUnicode_AsLatin1String(unicode, errors);
3525 }
3526 }
3527 }
3528
3529 /* Encode via the codec registry */
3530 v = _PyCodec_EncodeText(unicode, encoding, errors);
3531 if (v == NULL)
3532 return NULL;
3533
3534 /* The normal path */
3535 if (PyBytes_Check(v))
3536 return v;
3537
3538 /* If the codec returns a buffer, raise a warning and convert to bytes */
3539 if (PyByteArray_Check(v)) {
3540 int error;
3541 PyObject *b;
3542
3543 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3544 "encoder %s returned bytearray instead of bytes; "
3545 "use codecs.encode() to encode to arbitrary types",
3546 encoding);
3547 if (error) {
3548 Py_DECREF(v);
3549 return NULL;
3550 }
3551
3552 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3553 PyByteArray_GET_SIZE(v));
3554 Py_DECREF(v);
3555 return b;
3556 }
3557
3558 PyErr_Format(PyExc_TypeError,
3559 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3560 "use codecs.encode() to encode to arbitrary types",
3561 encoding,
3562 Py_TYPE(v)->tp_name);
3563 Py_DECREF(v);
3564 return NULL;
3565 }
3566
3567 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3568 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3569 const char *encoding,
3570 const char *errors)
3571 {
3572 PyObject *v;
3573
3574 if (!PyUnicode_Check(unicode)) {
3575 PyErr_BadArgument();
3576 goto onError;
3577 }
3578
3579 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3580 "PyUnicode_AsEncodedUnicode() is deprecated; "
3581 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3582 return NULL;
3583
3584 if (encoding == NULL)
3585 encoding = PyUnicode_GetDefaultEncoding();
3586
3587 /* Encode via the codec registry */
3588 v = PyCodec_Encode(unicode, encoding, errors);
3589 if (v == NULL)
3590 goto onError;
3591 if (!PyUnicode_Check(v)) {
3592 PyErr_Format(PyExc_TypeError,
3593 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3594 "use codecs.encode() to encode to arbitrary types",
3595 encoding,
3596 Py_TYPE(v)->tp_name);
3597 Py_DECREF(v);
3598 goto onError;
3599 }
3600 return v;
3601
3602 onError:
3603 return NULL;
3604 }
3605
3606 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,const char * errors,int current_locale)3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3608 int current_locale)
3609 {
3610 int surrogateescape;
3611 if (locale_error_handler(errors, &surrogateescape) < 0)
3612 return NULL;
3613
3614 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3615 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3616 return NULL;
3617 }
3618
3619 wchar_t *wstr;
3620 size_t wlen;
3621 const char *reason;
3622 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3623 current_locale, surrogateescape);
3624 if (res != 0) {
3625 if (res == -2) {
3626 PyObject *exc;
3627 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3628 "locale", str, len,
3629 (Py_ssize_t)wlen,
3630 (Py_ssize_t)(wlen + 1),
3631 reason);
3632 if (exc != NULL) {
3633 PyCodec_StrictErrors(exc);
3634 Py_DECREF(exc);
3635 }
3636 }
3637 else {
3638 PyErr_NoMemory();
3639 }
3640 return NULL;
3641 }
3642
3643 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3644 PyMem_RawFree(wstr);
3645 return unicode;
3646 }
3647
3648 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3650 const char *errors)
3651 {
3652 return unicode_decode_locale(str, len, errors, 1);
3653 }
3654
3655 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3656 PyUnicode_DecodeLocale(const char *str, const char *errors)
3657 {
3658 Py_ssize_t size = (Py_ssize_t)strlen(str);
3659 return unicode_decode_locale(str, size, errors, 1);
3660 }
3661
3662
3663 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3664 PyUnicode_DecodeFSDefault(const char *s) {
3665 Py_ssize_t size = (Py_ssize_t)strlen(s);
3666 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3667 }
3668
3669 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3671 {
3672 #if defined(__APPLE__)
3673 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3674 #else
3675 PyInterpreterState *interp = PyThreadState_GET()->interp;
3676 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3677 cannot use it to encode and decode filenames before it is loaded. Load
3678 the Python codec requires to encode at least its own filename. Use the C
3679 version of the locale codec until the codec registry is initialized and
3680 the Python codec is loaded.
3681
3682 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3683 cannot only rely on it: check also interp->fscodec_initialized for
3684 subinterpreters. */
3685 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3686 return PyUnicode_Decode(s, size,
3687 Py_FileSystemDefaultEncoding,
3688 Py_FileSystemDefaultEncodeErrors);
3689 }
3690 else {
3691 return unicode_decode_locale(s, size,
3692 Py_FileSystemDefaultEncodeErrors, 0);
3693 }
3694 #endif
3695 }
3696
3697
3698 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3699 PyUnicode_FSConverter(PyObject* arg, void* addr)
3700 {
3701 PyObject *path = NULL;
3702 PyObject *output = NULL;
3703 Py_ssize_t size;
3704 void *data;
3705 if (arg == NULL) {
3706 Py_DECREF(*(PyObject**)addr);
3707 *(PyObject**)addr = NULL;
3708 return 1;
3709 }
3710 path = PyOS_FSPath(arg);
3711 if (path == NULL) {
3712 return 0;
3713 }
3714 if (PyBytes_Check(path)) {
3715 output = path;
3716 }
3717 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3718 output = PyUnicode_EncodeFSDefault(path);
3719 Py_DECREF(path);
3720 if (!output) {
3721 return 0;
3722 }
3723 assert(PyBytes_Check(output));
3724 }
3725
3726 size = PyBytes_GET_SIZE(output);
3727 data = PyBytes_AS_STRING(output);
3728 if ((size_t)size != strlen(data)) {
3729 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3730 Py_DECREF(output);
3731 return 0;
3732 }
3733 *(PyObject**)addr = output;
3734 return Py_CLEANUP_SUPPORTED;
3735 }
3736
3737
/* Converter that stores a str path object through `addr`.

   When `arg` is NULL, the object currently stored at `*addr` is released,
   the slot is reset to NULL and 1 is returned.  Otherwise `arg` is resolved
   with PyOS_FSPath() unless PyObject_CheckBuffer() reports that it supports
   the buffer protocol, in which case it is used directly.  A str is used as
   is; bytes, or any other buffer object after a DeprecationWarning, is
   copied with PyBytes_FromObject() and decoded with
   PyUnicode_DecodeFSDefaultAndSize().  A result containing a null character
   is rejected with ValueError.

   Returns Py_CLEANUP_SUPPORTED after storing a new reference in `*addr`, or
   0 on failure. */
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
    int is_buffer = 0;
    PyObject *path = NULL;
    PyObject *output = NULL;
    if (arg == NULL) {
        /* Release the previously stored result and clear the slot. */
        Py_DECREF(*(PyObject**)addr);
        *(PyObject**)addr = NULL;
        return 1;
    }

    is_buffer = PyObject_CheckBuffer(arg);
    if (!is_buffer) {
        path = PyOS_FSPath(arg);
        if (path == NULL) {
            return 0;
        }
    }
    else {
        /* Take our own reference so both branches own `path`. */
        path = arg;
        Py_INCREF(arg);
    }

    if (PyUnicode_Check(path)) {
        if (PyUnicode_READY(path) == -1) {
            Py_DECREF(path);
            return 0;
        }
        /* The reference held on `path` is transferred to `output`. */
        output = path;
    }
    else if (PyBytes_Check(path) || is_buffer) {
        PyObject *path_bytes = NULL;

        /* Buffer objects other than bytes are still accepted, but only
           after emitting a DeprecationWarning; a non-zero result from
           PyErr_WarnFormat() aborts the conversion. */
        if (!PyBytes_Check(path) &&
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
            "path should be string, bytes, or os.PathLike, not %.200s",
            Py_TYPE(arg)->tp_name)) {
            Py_DECREF(path);
            return 0;
        }
        path_bytes = PyBytes_FromObject(path);
        Py_DECREF(path);
        if (!path_bytes) {
            return 0;
        }
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
                                                  PyBytes_GET_SIZE(path_bytes));
        Py_DECREF(path_bytes);
        if (!output) {
            return 0;
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "path should be string, bytes, or os.PathLike, not %.200s",
                     Py_TYPE(arg)->tp_name);
        Py_DECREF(path);
        return 0;
    }
    if (PyUnicode_READY(output) == -1) {
        Py_DECREF(output);
        return 0;
    }
    /* Search the whole string for a null character. */
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        Py_DECREF(output);
        return 0;
    }
    *(PyObject**)addr = output;
    return Py_CLEANUP_SUPPORTED;
}
3811
3812
3813 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3815 {
3816 PyObject *bytes;
3817
3818 if (!PyUnicode_Check(unicode)) {
3819 PyErr_BadArgument();
3820 return NULL;
3821 }
3822 if (PyUnicode_READY(unicode) == -1)
3823 return NULL;
3824
3825 if (PyUnicode_UTF8(unicode) == NULL) {
3826 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3827 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3828 if (bytes == NULL)
3829 return NULL;
3830 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3831 if (_PyUnicode_UTF8(unicode) == NULL) {
3832 PyErr_NoMemory();
3833 Py_DECREF(bytes);
3834 return NULL;
3835 }
3836 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3837 memcpy(_PyUnicode_UTF8(unicode),
3838 PyBytes_AS_STRING(bytes),
3839 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3840 Py_DECREF(bytes);
3841 }
3842
3843 if (psize)
3844 *psize = PyUnicode_UTF8_LENGTH(unicode);
3845 return PyUnicode_UTF8(unicode);
3846 }
3847
3848 const char *
PyUnicode_AsUTF8(PyObject * unicode)3849 PyUnicode_AsUTF8(PyObject *unicode)
3850 {
3851 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3852 }
3853
/* Return the wchar_t representation held in a str object's wstr slot,
   building and storing it first when the slot is empty.

   If `size` is not NULL it receives PyUnicode_WSTR_LENGTH(unicode).  The
   pointer returned is the one stored in the object.

   Returns NULL with an exception set when `unicode` is not a str, when the
   required size would overflow, or when memory allocation fails. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF, each of
               which needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One extra unit per surrogate pair plus one for the
               terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Writing past the computed end would mean the two passes
                   disagree. */
                if (w > wchar_end) {
                    Py_UNREACHABLE();
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the multiplication below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only written for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to one wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to one wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                Py_UNREACHABLE();
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3970
3971 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)3972 PyUnicode_AsUnicode(PyObject *unicode)
3973 {
3974 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3975 }
3976
3977 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)3978 _PyUnicode_AsUnicode(PyObject *unicode)
3979 {
3980 Py_ssize_t size;
3981 const Py_UNICODE *wstr;
3982
3983 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3984 if (wstr && wcslen(wstr) != (size_t)size) {
3985 PyErr_SetString(PyExc_ValueError, "embedded null character");
3986 return NULL;
3987 }
3988 return wstr;
3989 }
3990
3991
3992 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)3993 PyUnicode_GetSize(PyObject *unicode)
3994 {
3995 if (!PyUnicode_Check(unicode)) {
3996 PyErr_BadArgument();
3997 goto onError;
3998 }
3999 if (_PyUnicode_WSTR(unicode) == NULL) {
4000 if (PyUnicode_AsUnicode(unicode) == NULL)
4001 goto onError;
4002 }
4003 return PyUnicode_WSTR_LENGTH(unicode);
4004
4005 onError:
4006 return -1;
4007 }
4008
4009 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4010 PyUnicode_GetLength(PyObject *unicode)
4011 {
4012 if (!PyUnicode_Check(unicode)) {
4013 PyErr_BadArgument();
4014 return -1;
4015 }
4016 if (PyUnicode_READY(unicode) == -1)
4017 return -1;
4018 return PyUnicode_GET_LENGTH(unicode);
4019 }
4020
4021 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4023 {
4024 void *data;
4025 int kind;
4026
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return (Py_UCS4)-1;
4030 }
4031 if (PyUnicode_READY(unicode) == -1) {
4032 return (Py_UCS4)-1;
4033 }
4034 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4035 PyErr_SetString(PyExc_IndexError, "string index out of range");
4036 return (Py_UCS4)-1;
4037 }
4038 data = PyUnicode_DATA(unicode);
4039 kind = PyUnicode_KIND(unicode);
4040 return PyUnicode_READ(kind, data, index);
4041 }
4042
4043 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4045 {
4046 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4047 PyErr_BadArgument();
4048 return -1;
4049 }
4050 assert(PyUnicode_IS_READY(unicode));
4051 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4052 PyErr_SetString(PyExc_IndexError, "string index out of range");
4053 return -1;
4054 }
4055 if (unicode_check_modifiable(unicode))
4056 return -1;
4057 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4058 PyErr_SetString(PyExc_ValueError, "character out of range");
4059 return -1;
4060 }
4061 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4062 index, ch);
4063 return 0;
4064 }
4065
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4071
4072 /* create or adjust a UnicodeDecodeError */
4073 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4074 make_decode_exception(PyObject **exceptionObject,
4075 const char *encoding,
4076 const char *input, Py_ssize_t length,
4077 Py_ssize_t startpos, Py_ssize_t endpos,
4078 const char *reason)
4079 {
4080 if (*exceptionObject == NULL) {
4081 *exceptionObject = PyUnicodeDecodeError_Create(
4082 encoding, input, length, startpos, endpos, reason);
4083 }
4084 else {
4085 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4086 goto onError;
4087 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4088 goto onError;
4089 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4090 goto onError;
4091 }
4092 return;
4093
4094 onError:
4095 Py_CLEAR(*exceptionObject);
4096 }
4097
#ifdef MS_WINDOWS
/* Error handling callback helper for decoders that write into the wstr
   buffer of *output:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors / errorHandler   handler name and cached handler object; the
                           handler is looked up on first use.
   encoding / reason       used to create or update *exceptionObject.
   input / inend           current input buffer bounds; reloaded from the
                           exception object after the callback ran.
   startinpos / endinpos   error range passed to the exception; *endinpos is
                           set to the position returned by the handler.
   inptr                   set to the input position to resume decoding at.
   output / outpos         output object (resized if needed) and the write
                           position, advanced by the replacement length.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format; the text after "Un;" (index 3) doubles as
       the TypeError message used when the result is not a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

    /* the overflow path falls through into the common cleanup */
  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4210
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter: create or update the exception, call the error handler,
   validate its (str, int) result, write the replacement string to `writer`
   and update the input position.

   errors / errorHandler   handler name and cached handler object; the
                           handler is looked up on first use.
   encoding / reason       used to create or update *exceptionObject.
   input / inend           current input buffer bounds; reloaded from the
                           exception object after the callback ran.
   startinpos / endinpos   error range passed to the exception; *endinpos is
                           set to the position returned by the handler.
   inptr                   set to the input position to resume decoding at.
   writer                  receives the replacement; its min_length is
                           raised when more output room may be needed.

   Returns 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" (index 3) doubles as
       the TypeError message used when the result is not a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Input bytes left after the error range, measured on the old buffer
       before *input and *inend are replaced below. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        /* Only the characters beyond the first one are added to the
           writer's minimum length. */
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4310
4311 /* --- UTF-7 Codec -------------------------------------------------------- */
4312
4313 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4314
4315 /* Three simple macros defining base-64. */
4316
4317 /* Is c a base-64 character? */
4318
/* Evaluates to 1 when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/') and to 0 otherwise.  The argument is expanded
   several times, so it must not have side effects. */
#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))
4324
4325 /* given that c is a base-64 character, what is its base-64 value? */
4326
/* Maps 'A'-'Z' to 0..25, 'a'-'z' to 26..51, '0'-'9' to 52..61 and '+' to 62.
   Every other argument (the caller is expected to pass only '/') yields 63.
   The argument is expanded several times, so it must not have side effects. */
#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
4332
4333 /* What is the base-64 character of the bottom 6 bits of n? */
4334
/* Only the low six bits of n select the character; higher bits are
   masked off. */
#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 63])
4337
4338 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4339 * decoded as itself. We are permissive on decoding; the only ASCII
4340 * byte not decoding to itself is the + which begins a base64
4341 * string. */
4342
/* True for every value in 0..127 except '+'.  The argument is expanded
   twice, so it must not have side effects. */
#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4345
4346 /* The UTF-7 encoder treats ASCII characters differently according to
4347 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4348 * the above). See RFC2152. This array identifies these different
4349 * sets:
4350 * 0 : "Set D"
4351 * alphanumeric and '(),-./:?
4352 * 1 : "Set O"
4353 * !"#$%&*;<=>@[]^_`{|}
4354 * 2 : "whitespace"
4355 * ht nl cr sp
4356 * 3 : special (must be base64 encoded)
4357 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4358 */
4359
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   ASCII code, indexed by the code itself.  The table is only ever read,
   so it is declared const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4379
4380 /* ENCODE_DIRECT: this character should be encoded as itself. The
4381 * answer depends on whether we are encoding set O as itself, and also
4382 * on whether we are encoding whitespace as itself. RFC2152 makes it
4383 * clear that the answers to these questions vary between
4384 * applications, so this code needs to be flexible. */
4385
/* True when c (1..127) may be emitted unencoded: always for Set D
   (category 0), for whitespace (category 2) when directWS is true, and for
   Set O (category 1) when directO is true.  NUL and anything >= 128 are
   never direct.  The flag arguments are parenthesized so that compound
   expressions (e.g. `a || b`) keep their meaning inside the expansion.
   c is expanded several times and must not have side effects. */
#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
4391
4392 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4393 PyUnicode_DecodeUTF7(const char *s,
4394 Py_ssize_t size,
4395 const char *errors)
4396 {
4397 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4398 }
4399
4400 /* The decoder. The only state we preserve is our read position,
4401 * i.e. how many characters we have consumed. So if we end in the
4402 * middle of a shift sequence we have to back off the read position
4403 * and the output to the beginning of the sequence, otherwise we lose
4404 * all the shift state (seen bits, number of bits seen, high
4405 * surrogate). */
4406
/* Decode the size bytes at s as UTF-7 and return a new str object, or NULL
   with an exception set.

   errors   - error handler name, forwarded unchanged to
              unicode_decode_call_errorhandler_writer().
   consumed - if NULL, a shift sequence still open at the end of the input
              and left in an inconsistent state is reported through the
              error handler.  If non-NULL, an open shift sequence is not an
              error: *consumed receives the offset of the '+' that opened it
              and the output is truncated to what had been written before
              that sequence; otherwise *consumed receives the number of
              bytes read. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;         /* beginning of the input buffer */
    Py_ssize_t startinpos;          /* offset of the '+' / offending byte */
    Py_ssize_t endinpos;            /* offset just past the error */
    const char *e;                  /* end of the input buffer */
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the section began */
    unsigned int base64bits = 0;    /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0; /* bits collected but not yet emitted */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered from the end-of-input error path below when the
           error handler moved s back before e. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as-is and
                               fall through to handle outCh normally */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes as itself; in every
                   case the pending surrogate is then cleared. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler receives the addresses of starts, e and s and may
           update them; decoding then resumes at the top of the loop. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input up to the '+' as consumed */
            *consumed = startinpos;
            /* NOTE(review): presumably characters written inside the
               unfinished shift sequence may have raised writer.maxchar, so
               the truncated result is rebuilt from the raw data instead of
               being finished in place -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4598
4599
4600 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4601 _PyUnicode_EncodeUTF7(PyObject *str,
4602 int base64SetO,
4603 int base64WhiteSpace,
4604 const char *errors)
4605 {
4606 int kind;
4607 void *data;
4608 Py_ssize_t len;
4609 PyObject *v;
4610 int inShift = 0;
4611 Py_ssize_t i;
4612 unsigned int base64bits = 0;
4613 unsigned long base64buffer = 0;
4614 char * out;
4615 char * start;
4616
4617 if (PyUnicode_READY(str) == -1)
4618 return NULL;
4619 kind = PyUnicode_KIND(str);
4620 data = PyUnicode_DATA(str);
4621 len = PyUnicode_GET_LENGTH(str);
4622
4623 if (len == 0)
4624 return PyBytes_FromStringAndSize(NULL, 0);
4625
4626 /* It might be possible to tighten this worst case */
4627 if (len > PY_SSIZE_T_MAX / 8)
4628 return PyErr_NoMemory();
4629 v = PyBytes_FromStringAndSize(NULL, len * 8);
4630 if (v == NULL)
4631 return NULL;
4632
4633 start = out = PyBytes_AS_STRING(v);
4634 for (i = 0; i < len; ++i) {
4635 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4636
4637 if (inShift) {
4638 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4639 /* shifting out */
4640 if (base64bits) { /* output remaining bits */
4641 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4642 base64buffer = 0;
4643 base64bits = 0;
4644 }
4645 inShift = 0;
4646 /* Characters not in the BASE64 set implicitly unshift the sequence
4647 so no '-' is required, except if the character is itself a '-' */
4648 if (IS_BASE64(ch) || ch == '-') {
4649 *out++ = '-';
4650 }
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 goto encode_char;
4655 }
4656 }
4657 else { /* not in a shift sequence */
4658 if (ch == '+') {
4659 *out++ = '+';
4660 *out++ = '-';
4661 }
4662 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4663 *out++ = (char) ch;
4664 }
4665 else {
4666 *out++ = '+';
4667 inShift = 1;
4668 goto encode_char;
4669 }
4670 }
4671 continue;
4672 encode_char:
4673 if (ch >= 0x10000) {
4674 assert(ch <= MAX_UNICODE);
4675
4676 /* code first surrogate */
4677 base64bits += 16;
4678 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4679 while (base64bits >= 6) {
4680 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4681 base64bits -= 6;
4682 }
4683 /* prepare second surrogate */
4684 ch = Py_UNICODE_LOW_SURROGATE(ch);
4685 }
4686 base64bits += 16;
4687 base64buffer = (base64buffer << 16) | ch;
4688 while (base64bits >= 6) {
4689 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4690 base64bits -= 6;
4691 }
4692 }
4693 if (base64bits)
4694 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4695 if (inShift)
4696 *out++ = '-';
4697 if (_PyBytes_Resize(&v, out - start) < 0)
4698 return NULL;
4699 return v;
4700 }
4701 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4702 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4703 Py_ssize_t size,
4704 int base64SetO,
4705 int base64WhiteSpace,
4706 const char *errors)
4707 {
4708 PyObject *result;
4709 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4710 if (tmp == NULL)
4711 return NULL;
4712 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4713 base64WhiteSpace, errors);
4714 Py_DECREF(tmp);
4715 return result;
4716 }
4717
4718 #undef IS_BASE64
4719 #undef FROM_BASE64
4720 #undef TO_BASE64
4721 #undef DECODE_DIRECT
4722 #undef ENCODE_DIRECT
4723
4724 /* --- UTF-8 Codec -------------------------------------------------------- */
4725
4726 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4727 PyUnicode_DecodeUTF8(const char *s,
4728 Py_ssize_t size,
4729 const char *errors)
4730 {
4731 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4732 }
4733
4734 #include "stringlib/asciilib.h"
4735 #include "stringlib/codecs.h"
4736 #include "stringlib/undef.h"
4737
4738 #include "stringlib/ucs1lib.h"
4739 #include "stringlib/codecs.h"
4740 #include "stringlib/undef.h"
4741
4742 #include "stringlib/ucs2lib.h"
4743 #include "stringlib/codecs.h"
4744 #include "stringlib/undef.h"
4745
4746 #include "stringlib/ucs4lib.h"
4747 #include "stringlib/codecs.h"
4748 #include "stringlib/undef.h"
4749
4750 /* Mask to quickly check whether a C 'long' contains a
4751 non-ASCII, UTF8-encoded char. */
4752 #if (SIZEOF_LONG == 8)
4753 # define ASCII_CHAR_MASK 0x8080808080808080UL
4754 #elif (SIZEOF_LONG == 4)
4755 # define ASCII_CHAR_MASK 0x80808080UL
4756 #else
4757 # error C 'long' size should be either 4 or 8!
4758 #endif
4759
4760 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4761 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4762 {
4763 const char *p = start;
4764 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4765
4766 /*
4767 * Issue #17237: m68k is a bit different from most architectures in
4768 * that objects do not use "natural alignment" - for example, int and
4769 * long are only aligned at 2-byte boundaries. Therefore the assert()
4770 * won't work; also, tests have shown that skipping the "optimised
4771 * version" will even speed up m68k.
4772 */
4773 #if !defined(__m68k__)
4774 #if SIZEOF_LONG <= SIZEOF_VOID_P
4775 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4776 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4777 /* Fast path, see in STRINGLIB(utf8_decode) for
4778 an explanation. */
4779 /* Help allocation */
4780 const char *_p = p;
4781 Py_UCS1 * q = dest;
4782 while (_p < aligned_end) {
4783 unsigned long value = *(const unsigned long *) _p;
4784 if (value & ASCII_CHAR_MASK)
4785 break;
4786 *((unsigned long *)q) = value;
4787 _p += SIZEOF_LONG;
4788 q += SIZEOF_LONG;
4789 }
4790 p = _p;
4791 while (p < end) {
4792 if ((unsigned char)*p & 0x80)
4793 break;
4794 *q++ = *p++;
4795 }
4796 return p - start;
4797 }
4798 #endif
4799 #endif
4800 while (p < end) {
4801 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4802 for an explanation. */
4803 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4804 /* Help allocation */
4805 const char *_p = p;
4806 while (_p < aligned_end) {
4807 unsigned long value = *(unsigned long *) _p;
4808 if (value & ASCII_CHAR_MASK)
4809 break;
4810 _p += SIZEOF_LONG;
4811 }
4812 p = _p;
4813 if (_p == end)
4814 break;
4815 }
4816 if ((unsigned char)*p & 0x80)
4817 break;
4818 ++p;
4819 }
4820 memcpy(dest, start, p - start);
4821 return p - start;
4822 }
4823
4824 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4825 PyUnicode_DecodeUTF8Stateful(const char *s,
4826 Py_ssize_t size,
4827 const char *errors,
4828 Py_ssize_t *consumed)
4829 {
4830 _PyUnicodeWriter writer;
4831 const char *starts = s;
4832 const char *end = s + size;
4833
4834 Py_ssize_t startinpos;
4835 Py_ssize_t endinpos;
4836 const char *errmsg = "";
4837 PyObject *error_handler_obj = NULL;
4838 PyObject *exc = NULL;
4839 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
4840
4841 if (size == 0) {
4842 if (consumed)
4843 *consumed = 0;
4844 _Py_RETURN_UNICODE_EMPTY();
4845 }
4846
4847 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4848 if (size == 1 && (unsigned char)s[0] < 128) {
4849 if (consumed)
4850 *consumed = 1;
4851 return get_latin1_char((unsigned char)s[0]);
4852 }
4853
4854 _PyUnicodeWriter_Init(&writer);
4855 writer.min_length = size;
4856 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4857 goto onError;
4858
4859 writer.pos = ascii_decode(s, end, writer.data);
4860 s += writer.pos;
4861 while (s < end) {
4862 Py_UCS4 ch;
4863 int kind = writer.kind;
4864
4865 if (kind == PyUnicode_1BYTE_KIND) {
4866 if (PyUnicode_IS_ASCII(writer.buffer))
4867 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4868 else
4869 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4870 } else if (kind == PyUnicode_2BYTE_KIND) {
4871 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4872 } else {
4873 assert(kind == PyUnicode_4BYTE_KIND);
4874 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4875 }
4876
4877 switch (ch) {
4878 case 0:
4879 if (s == end || consumed)
4880 goto End;
4881 errmsg = "unexpected end of data";
4882 startinpos = s - starts;
4883 endinpos = end - starts;
4884 break;
4885 case 1:
4886 errmsg = "invalid start byte";
4887 startinpos = s - starts;
4888 endinpos = startinpos + 1;
4889 break;
4890 case 2:
4891 case 3:
4892 case 4:
4893 errmsg = "invalid continuation byte";
4894 startinpos = s - starts;
4895 endinpos = startinpos + ch - 1;
4896 break;
4897 default:
4898 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4899 goto onError;
4900 continue;
4901 }
4902
4903 if (error_handler == _Py_ERROR_UNKNOWN)
4904 error_handler = get_error_handler(errors);
4905
4906 switch (error_handler) {
4907 case _Py_ERROR_IGNORE:
4908 s += (endinpos - startinpos);
4909 break;
4910
4911 case _Py_ERROR_REPLACE:
4912 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4913 goto onError;
4914 s += (endinpos - startinpos);
4915 break;
4916
4917 case _Py_ERROR_SURROGATEESCAPE:
4918 {
4919 Py_ssize_t i;
4920
4921 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4922 goto onError;
4923 for (i=startinpos; i<endinpos; i++) {
4924 ch = (Py_UCS4)(unsigned char)(starts[i]);
4925 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4926 ch + 0xdc00);
4927 writer.pos++;
4928 }
4929 s += (endinpos - startinpos);
4930 break;
4931 }
4932
4933 default:
4934 if (unicode_decode_call_errorhandler_writer(
4935 errors, &error_handler_obj,
4936 "utf-8", errmsg,
4937 &starts, &end, &startinpos, &endinpos, &exc, &s,
4938 &writer))
4939 goto onError;
4940 }
4941 }
4942
4943 End:
4944 if (consumed)
4945 *consumed = s - starts;
4946
4947 Py_XDECREF(error_handler_obj);
4948 Py_XDECREF(exc);
4949 return _PyUnicodeWriter_Finish(&writer);
4950
4951 onError:
4952 Py_XDECREF(error_handler_obj);
4953 Py_XDECREF(exc);
4954 _PyUnicodeWriter_Dealloc(&writer);
4955 return NULL;
4956 }
4957
4958
4959 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4960 non-zero, use strict error handler otherwise.
4961
4962 On success, write a pointer to a newly allocated wide character string into
4963 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4964 (in number of wchar_t units) into *wlen (if wlen is set).
4965
4966 On memory allocation failure, return -1.
4967
4968 On decoding error (if surrogateescape is zero), return -2. If wlen is
4969 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4970 is not NULL, write the decoding error message into *reason. */
4971 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,int surrogateescape)4972 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4973 const char **reason, int surrogateescape)
4974 {
4975 const char *orig_s = s;
4976 const char *e;
4977 wchar_t *unicode;
4978 Py_ssize_t outpos;
4979
4980 /* Note: size will always be longer than the resulting Unicode
4981 character count */
4982 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
4983 return -1;
4984 }
4985
4986 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4987 if (!unicode) {
4988 return -1;
4989 }
4990
4991 /* Unpack UTF-8 encoded data */
4992 e = s + size;
4993 outpos = 0;
4994 while (s < e) {
4995 Py_UCS4 ch;
4996 #if SIZEOF_WCHAR_T == 4
4997 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4998 #else
4999 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5000 #endif
5001 if (ch > 0xFF) {
5002 #if SIZEOF_WCHAR_T == 4
5003 Py_UNREACHABLE();
5004 #else
5005 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5006 /* write a surrogate pair */
5007 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5008 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5009 #endif
5010 }
5011 else {
5012 if (!ch && s == e)
5013 break;
5014 if (!surrogateescape) {
5015 PyMem_RawFree(unicode );
5016 if (reason != NULL) {
5017 switch (ch) {
5018 case 0:
5019 *reason = "unexpected end of data";
5020 break;
5021 case 1:
5022 *reason = "invalid start byte";
5023 break;
5024 /* 2, 3, 4 */
5025 default:
5026 *reason = "invalid continuation byte";
5027 break;
5028 }
5029 }
5030 if (wlen != NULL) {
5031 *wlen = s - orig_s;
5032 }
5033 return -2;
5034 }
5035 /* surrogateescape */
5036 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5037 }
5038 }
5039 unicode[outpos] = L'\0';
5040 if (wlen) {
5041 *wlen = outpos;
5042 }
5043 *wstr = unicode;
5044 return 0;
5045 }
5046
5047 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen)5048 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5049 {
5050 wchar_t *wstr;
5051 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5052 if (res != 0) {
5053 return NULL;
5054 }
5055 return wstr;
5056 }
5057
5058
5059 /* UTF-8 encoder using the surrogateescape error handler .
5060
5061 On success, return 0 and write the newly allocated character string (use
5062 PyMem_Free() to free the memory) into *str.
5063
5064 On encoding failure, return -2 and write the position of the invalid
5065 surrogate character into *error_pos (if error_pos is set) and the decoding
5066 error message into *reason (if reason is set).
5067
5068 On memory allocation failure, return -1. */
5069 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,int surrogateescape)5070 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5071 const char **reason, int raw_malloc, int surrogateescape)
5072 {
5073 const Py_ssize_t max_char_size = 4;
5074 Py_ssize_t len = wcslen(text);
5075
5076 assert(len >= 0);
5077
5078 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5079 return -1;
5080 }
5081 char *bytes;
5082 if (raw_malloc) {
5083 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5084 }
5085 else {
5086 bytes = PyMem_Malloc((len + 1) * max_char_size);
5087 }
5088 if (bytes == NULL) {
5089 return -1;
5090 }
5091
5092 char *p = bytes;
5093 Py_ssize_t i;
5094 for (i = 0; i < len; i++) {
5095 Py_UCS4 ch = text[i];
5096
5097 if (ch < 0x80) {
5098 /* Encode ASCII */
5099 *p++ = (char) ch;
5100
5101 }
5102 else if (ch < 0x0800) {
5103 /* Encode Latin-1 */
5104 *p++ = (char)(0xc0 | (ch >> 6));
5105 *p++ = (char)(0x80 | (ch & 0x3f));
5106 }
5107 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5108 /* surrogateescape error handler */
5109 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5110 if (error_pos != NULL) {
5111 *error_pos = (size_t)i;
5112 }
5113 if (reason != NULL) {
5114 *reason = "encoding error";
5115 }
5116 if (raw_malloc) {
5117 PyMem_RawFree(bytes);
5118 }
5119 else {
5120 PyMem_Free(bytes);
5121 }
5122 return -2;
5123 }
5124 *p++ = (char)(ch & 0xff);
5125 }
5126 else if (ch < 0x10000) {
5127 *p++ = (char)(0xe0 | (ch >> 12));
5128 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5129 *p++ = (char)(0x80 | (ch & 0x3f));
5130 }
5131 else { /* ch >= 0x10000 */
5132 assert(ch <= MAX_UNICODE);
5133 /* Encode UCS4 Unicode ordinals */
5134 *p++ = (char)(0xf0 | (ch >> 18));
5135 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5137 *p++ = (char)(0x80 | (ch & 0x3f));
5138 }
5139 }
5140 *p++ = '\0';
5141
5142 size_t final_size = (p - bytes);
5143 char *bytes2;
5144 if (raw_malloc) {
5145 bytes2 = PyMem_RawRealloc(bytes, final_size);
5146 }
5147 else {
5148 bytes2 = PyMem_Realloc(bytes, final_size);
5149 }
5150 if (bytes2 == NULL) {
5151 if (error_pos != NULL) {
5152 *error_pos = (size_t)-1;
5153 }
5154 if (raw_malloc) {
5155 PyMem_RawFree(bytes);
5156 }
5157 else {
5158 PyMem_Free(bytes);
5159 }
5160 return -1;
5161 }
5162 *str = bytes2;
5163 return 0;
5164 }
5165
5166
5167 /* Primary internal function which creates utf8 encoded bytes objects.
5168
5169 Allocation strategy: if the string is short, convert into a stack buffer
5170 and allocate exactly as much space needed at the end. Else allocate the
5171 maximum possible needed (4 result bytes per Unicode character), and return
5172 the excess memory at the end.
5173 */
5174 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5175 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5176 {
5177 enum PyUnicode_Kind kind;
5178 void *data;
5179 Py_ssize_t size;
5180
5181 if (!PyUnicode_Check(unicode)) {
5182 PyErr_BadArgument();
5183 return NULL;
5184 }
5185
5186 if (PyUnicode_READY(unicode) == -1)
5187 return NULL;
5188
5189 if (PyUnicode_UTF8(unicode))
5190 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5191 PyUnicode_UTF8_LENGTH(unicode));
5192
5193 kind = PyUnicode_KIND(unicode);
5194 data = PyUnicode_DATA(unicode);
5195 size = PyUnicode_GET_LENGTH(unicode);
5196
5197 switch (kind) {
5198 default:
5199 Py_UNREACHABLE();
5200 case PyUnicode_1BYTE_KIND:
5201 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5202 assert(!PyUnicode_IS_ASCII(unicode));
5203 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5204 case PyUnicode_2BYTE_KIND:
5205 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5206 case PyUnicode_4BYTE_KIND:
5207 return ucs4lib_utf8_encoder(unicode, data, size, errors);
5208 }
5209 }
5210
5211 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5212 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5213 Py_ssize_t size,
5214 const char *errors)
5215 {
5216 PyObject *v, *unicode;
5217
5218 unicode = PyUnicode_FromWideChar(s, size);
5219 if (unicode == NULL)
5220 return NULL;
5221 v = _PyUnicode_AsUTF8String(unicode, errors);
5222 Py_DECREF(unicode);
5223 return v;
5224 }
5225
5226 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5227 PyUnicode_AsUTF8String(PyObject *unicode)
5228 {
5229 return _PyUnicode_AsUTF8String(unicode, NULL);
5230 }
5231
5232 /* --- UTF-32 Codec ------------------------------------------------------- */
5233
5234 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5235 PyUnicode_DecodeUTF32(const char *s,
5236 Py_ssize_t size,
5237 const char *errors,
5238 int *byteorder)
5239 {
5240 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5241 }
5242
/* Decode the size bytes at s as UTF-32 and return a new str object, or NULL
   with an exception set.

   errors    - error handler name, forwarded unchanged to
               unicode_decode_call_errorhandler_writer().
   byteorder - if non-NULL, *byteorder selects the byte order on entry
               (negative: little endian, positive: big endian, 0: detect
               from a leading BOM, falling back to native order).  When it
               is 0 on entry and at least 4 bytes are available, the order
               that was determined (still 0 if no BOM was found) is written
               back.
   consumed  - if non-NULL, trailing bytes that do not form a complete
               4-byte unit are not an error; decoding stops before them and
               *consumed receives the number of bytes processed. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* read cursor and end of input */
    int le, bo = 0;               /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* the first unit is always assembled as little endian here, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000 */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means "native": little endian unless
       WORDS_BIGENDIAN is defined */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        /* Fast path: store units directly for as long as they fit the
           writer's current maximum character and are not surrogates.  On
           an early exit, ch holds the unit that stopped the loop and q
           still points at it. */
        if (e - q >= 4) {
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* the fast path stopped because fewer than 4 bytes remain
               (ch is still 0 if it never ran) */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch did not fit the current buffer: write it through the
               writer (which can widen the buffer), unless it is beyond the
               Unicode range */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5387
5388 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5389 _PyUnicode_EncodeUTF32(PyObject *str,
5390 const char *errors,
5391 int byteorder)
5392 {
5393 enum PyUnicode_Kind kind;
5394 const void *data;
5395 Py_ssize_t len;
5396 PyObject *v;
5397 uint32_t *out;
5398 #if PY_LITTLE_ENDIAN
5399 int native_ordering = byteorder <= 0;
5400 #else
5401 int native_ordering = byteorder >= 0;
5402 #endif
5403 const char *encoding;
5404 Py_ssize_t nsize, pos;
5405 PyObject *errorHandler = NULL;
5406 PyObject *exc = NULL;
5407 PyObject *rep = NULL;
5408
5409 if (!PyUnicode_Check(str)) {
5410 PyErr_BadArgument();
5411 return NULL;
5412 }
5413 if (PyUnicode_READY(str) == -1)
5414 return NULL;
5415 kind = PyUnicode_KIND(str);
5416 data = PyUnicode_DATA(str);
5417 len = PyUnicode_GET_LENGTH(str);
5418
5419 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5420 return PyErr_NoMemory();
5421 nsize = len + (byteorder == 0);
5422 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5423 if (v == NULL)
5424 return NULL;
5425
5426 /* output buffer is 4-bytes aligned */
5427 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5428 out = (uint32_t *)PyBytes_AS_STRING(v);
5429 if (byteorder == 0)
5430 *out++ = 0xFEFF;
5431 if (len == 0)
5432 goto done;
5433
5434 if (byteorder == -1)
5435 encoding = "utf-32-le";
5436 else if (byteorder == 1)
5437 encoding = "utf-32-be";
5438 else
5439 encoding = "utf-32";
5440
5441 if (kind == PyUnicode_1BYTE_KIND) {
5442 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5443 goto done;
5444 }
5445
5446 pos = 0;
5447 while (pos < len) {
5448 Py_ssize_t repsize, moreunits;
5449
5450 if (kind == PyUnicode_2BYTE_KIND) {
5451 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5452 &out, native_ordering);
5453 }
5454 else {
5455 assert(kind == PyUnicode_4BYTE_KIND);
5456 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5457 &out, native_ordering);
5458 }
5459 if (pos == len)
5460 break;
5461
5462 rep = unicode_encode_call_errorhandler(
5463 errors, &errorHandler,
5464 encoding, "surrogates not allowed",
5465 str, &exc, pos, pos + 1, &pos);
5466 if (!rep)
5467 goto error;
5468
5469 if (PyBytes_Check(rep)) {
5470 repsize = PyBytes_GET_SIZE(rep);
5471 if (repsize & 3) {
5472 raise_encode_exception(&exc, encoding,
5473 str, pos - 1, pos,
5474 "surrogates not allowed");
5475 goto error;
5476 }
5477 moreunits = repsize / 4;
5478 }
5479 else {
5480 assert(PyUnicode_Check(rep));
5481 if (PyUnicode_READY(rep) < 0)
5482 goto error;
5483 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5484 if (!PyUnicode_IS_ASCII(rep)) {
5485 raise_encode_exception(&exc, encoding,
5486 str, pos - 1, pos,
5487 "surrogates not allowed");
5488 goto error;
5489 }
5490 }
5491
5492 /* four bytes are reserved for each surrogate */
5493 if (moreunits > 1) {
5494 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5495 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5496 /* integer overflow */
5497 PyErr_NoMemory();
5498 goto error;
5499 }
5500 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5501 goto error;
5502 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5503 }
5504
5505 if (PyBytes_Check(rep)) {
5506 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5507 out += moreunits;
5508 } else /* rep is unicode */ {
5509 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5510 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5511 &out, native_ordering);
5512 }
5513
5514 Py_CLEAR(rep);
5515 }
5516
5517 /* Cut back to size actually needed. This is necessary for, for example,
5518 encoding of a string containing isolated surrogates and the 'ignore'
5519 handler is used. */
5520 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5521 if (nsize != PyBytes_GET_SIZE(v))
5522 _PyBytes_Resize(&v, nsize);
5523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
5525 done:
5526 return v;
5527 error:
5528 Py_XDECREF(rep);
5529 Py_XDECREF(errorHandler);
5530 Py_XDECREF(exc);
5531 Py_XDECREF(v);
5532 return NULL;
5533 }
5534
5535 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5536 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5537 Py_ssize_t size,
5538 const char *errors,
5539 int byteorder)
5540 {
5541 PyObject *result;
5542 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5543 if (tmp == NULL)
5544 return NULL;
5545 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5546 Py_DECREF(tmp);
5547 return result;
5548 }
5549
5550 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5551 PyUnicode_AsUTF32String(PyObject *unicode)
5552 {
5553 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5554 }
5555
5556 /* --- UTF-16 Codec ------------------------------------------------------- */
5557
5558 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5559 PyUnicode_DecodeUTF16(const char *s,
5560 Py_ssize_t size,
5561 const char *errors,
5562 int *byteorder)
5563 {
5564 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5565 }
5566
/* Decode the `size` bytes at `s` as UTF-16 and return a new str object, or
   NULL with an exception set.

   errors    - error-handler name, forwarded to
               unicode_decode_call_errorhandler_writer().
   byteorder - optional in/out byte order.  When it is NULL or *byteorder is
               0, the first two bytes are inspected for a BOM.  If the input
               is at least two bytes long, *byteorder is then updated with
               the resulting value (-1, 0 or 1).
   consumed  - optional out parameter.  When non-NULL, incomplete trailing
               data is not reported as an error: decoding stops in front of
               it and *consumed receives the number of bytes used. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* current read position / end of input */
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first two bytes, read as a little-endian 16-bit value */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the (optional) BOM: the whole input was consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo still 0 here, the platform's own byte order is used and also
       determines the encoding name reported in errors. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than two bytes remain */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Pick the decode helper matching the writer's current buffer
               kind.  Its return value is dispatched on below: 0..3 are
               treated as status codes, anything else is written to the
               output as a code point. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            /* finished cleanly, or stateful mode: leave the rest unread */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* step back two bytes so that the error range (or *consumed in
               stateful mode) starts at the incomplete sequence */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the error covers the two bytes just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the error covers two bytes starting four bytes before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Only the error cases above reach this point.  The call receives
           the addresses of starts, e and q, so the handler may update them
           before the loop continues. */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5721
5722 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5723 _PyUnicode_EncodeUTF16(PyObject *str,
5724 const char *errors,
5725 int byteorder)
5726 {
5727 enum PyUnicode_Kind kind;
5728 const void *data;
5729 Py_ssize_t len;
5730 PyObject *v;
5731 unsigned short *out;
5732 Py_ssize_t pairs;
5733 #if PY_BIG_ENDIAN
5734 int native_ordering = byteorder >= 0;
5735 #else
5736 int native_ordering = byteorder <= 0;
5737 #endif
5738 const char *encoding;
5739 Py_ssize_t nsize, pos;
5740 PyObject *errorHandler = NULL;
5741 PyObject *exc = NULL;
5742 PyObject *rep = NULL;
5743
5744 if (!PyUnicode_Check(str)) {
5745 PyErr_BadArgument();
5746 return NULL;
5747 }
5748 if (PyUnicode_READY(str) == -1)
5749 return NULL;
5750 kind = PyUnicode_KIND(str);
5751 data = PyUnicode_DATA(str);
5752 len = PyUnicode_GET_LENGTH(str);
5753
5754 pairs = 0;
5755 if (kind == PyUnicode_4BYTE_KIND) {
5756 const Py_UCS4 *in = (const Py_UCS4 *)data;
5757 const Py_UCS4 *end = in + len;
5758 while (in < end) {
5759 if (*in++ >= 0x10000) {
5760 pairs++;
5761 }
5762 }
5763 }
5764 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5765 return PyErr_NoMemory();
5766 }
5767 nsize = len + pairs + (byteorder == 0);
5768 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5769 if (v == NULL) {
5770 return NULL;
5771 }
5772
5773 /* output buffer is 2-bytes aligned */
5774 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5775 out = (unsigned short *)PyBytes_AS_STRING(v);
5776 if (byteorder == 0) {
5777 *out++ = 0xFEFF;
5778 }
5779 if (len == 0) {
5780 goto done;
5781 }
5782
5783 if (kind == PyUnicode_1BYTE_KIND) {
5784 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5785 goto done;
5786 }
5787
5788 if (byteorder < 0) {
5789 encoding = "utf-16-le";
5790 }
5791 else if (byteorder > 0) {
5792 encoding = "utf-16-be";
5793 }
5794 else {
5795 encoding = "utf-16";
5796 }
5797
5798 pos = 0;
5799 while (pos < len) {
5800 Py_ssize_t repsize, moreunits;
5801
5802 if (kind == PyUnicode_2BYTE_KIND) {
5803 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5804 &out, native_ordering);
5805 }
5806 else {
5807 assert(kind == PyUnicode_4BYTE_KIND);
5808 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5809 &out, native_ordering);
5810 }
5811 if (pos == len)
5812 break;
5813
5814 rep = unicode_encode_call_errorhandler(
5815 errors, &errorHandler,
5816 encoding, "surrogates not allowed",
5817 str, &exc, pos, pos + 1, &pos);
5818 if (!rep)
5819 goto error;
5820
5821 if (PyBytes_Check(rep)) {
5822 repsize = PyBytes_GET_SIZE(rep);
5823 if (repsize & 1) {
5824 raise_encode_exception(&exc, encoding,
5825 str, pos - 1, pos,
5826 "surrogates not allowed");
5827 goto error;
5828 }
5829 moreunits = repsize / 2;
5830 }
5831 else {
5832 assert(PyUnicode_Check(rep));
5833 if (PyUnicode_READY(rep) < 0)
5834 goto error;
5835 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5836 if (!PyUnicode_IS_ASCII(rep)) {
5837 raise_encode_exception(&exc, encoding,
5838 str, pos - 1, pos,
5839 "surrogates not allowed");
5840 goto error;
5841 }
5842 }
5843
5844 /* two bytes are reserved for each surrogate */
5845 if (moreunits > 1) {
5846 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5847 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5848 /* integer overflow */
5849 PyErr_NoMemory();
5850 goto error;
5851 }
5852 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
5853 goto error;
5854 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5855 }
5856
5857 if (PyBytes_Check(rep)) {
5858 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5859 out += moreunits;
5860 } else /* rep is unicode */ {
5861 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5862 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5863 &out, native_ordering);
5864 }
5865
5866 Py_CLEAR(rep);
5867 }
5868
5869 /* Cut back to size actually needed. This is necessary for, for example,
5870 encoding of a string containing isolated surrogates and the 'ignore' handler
5871 is used. */
5872 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5873 if (nsize != PyBytes_GET_SIZE(v))
5874 _PyBytes_Resize(&v, nsize);
5875 Py_XDECREF(errorHandler);
5876 Py_XDECREF(exc);
5877 done:
5878 return v;
5879 error:
5880 Py_XDECREF(rep);
5881 Py_XDECREF(errorHandler);
5882 Py_XDECREF(exc);
5883 Py_XDECREF(v);
5884 return NULL;
5885 #undef STORECHAR
5886 }
5887
5888 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5889 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5890 Py_ssize_t size,
5891 const char *errors,
5892 int byteorder)
5893 {
5894 PyObject *result;
5895 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5896 if (tmp == NULL)
5897 return NULL;
5898 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5899 Py_DECREF(tmp);
5900 return result;
5901 }
5902
5903 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5904 PyUnicode_AsUTF16String(PyObject *unicode)
5905 {
5906 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5907 }
5908
5909 /* --- Unicode Escape Codec ----------------------------------------------- */
5910
/* Character-name lookup API used for \N{...} escapes.  Starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() fills it in through PyCapsule_Import()
   the first time it meets a \N escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5912
/* Decode the `size` bytes at `s` with the "unicodeescape" rules and return
   a new str object, or NULL with an exception set.

   Bytes other than a backslash are written out as the code point with the
   same ordinal.  Recognised escapes are: backslash-newline (dropped), the
   single-character escapes for backslash, both quote characters and
   b f t n r v a, one to three octal digits, \xXX, \uXXXX, \UXXXXXXXX and
   \N{name}.  An unrecognised escape is copied through unchanged, backslash
   included.

   errors               - error-handler name, forwarded to
                          unicode_decode_call_errorhandler_writer().
   first_invalid_escape - written unconditionally (must not be NULL): set to
                          NULL on entry, then to the character that follows
                          the backslash of the first unrecognised escape,
                          if there is one. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly into the writer's buffer.  No resize is
   attempted; the asserts document that room must already be available. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it does not exceed the writer's
   current maxchar, otherwise through _PyUnicodeWriter_WriteCharInline().
   Jumps to onError when that call fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* position of the backslash, used as the start of any error range */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* single-character escapes; backslash-newline produces nothing */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            /* one digit is mandatory, up to two more are taken if present */
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits; stop early at the end
               of the input or at the first non-hex character. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            /* digits missing: report the "truncated ..." message set above */
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* a non-empty name that is terminated by '}' */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            /* keep the unrecognised escape as it was written */
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* The error range runs from the backslash up to the current read
           position.  min_length is updated so that it covers the output so
           far plus the input that is still unread. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6137
6138 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6139 PyUnicode_DecodeUnicodeEscape(const char *s,
6140 Py_ssize_t size,
6141 const char *errors)
6142 {
6143 const char *first_invalid_escape;
6144 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6145 &first_invalid_escape);
6146 if (result == NULL)
6147 return NULL;
6148 if (first_invalid_escape != NULL) {
6149 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6150 "invalid escape sequence '\\%c'",
6151 (unsigned char)*first_invalid_escape) < 0) {
6152 Py_DECREF(result);
6153 return NULL;
6154 }
6155 }
6156 return result;
6157 }
6158
6159 /* Return a Unicode-Escape string version of the Unicode object. */
6160
6161 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6162 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6163 {
6164 Py_ssize_t i, len;
6165 PyObject *repr;
6166 char *p;
6167 enum PyUnicode_Kind kind;
6168 void *data;
6169 Py_ssize_t expandsize;
6170
6171 /* Initial allocation is based on the longest-possible character
6172 escape.
6173
6174 For UCS1 strings it's '\xxx', 4 bytes per source character.
6175 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6176 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6177 */
6178
6179 if (!PyUnicode_Check(unicode)) {
6180 PyErr_BadArgument();
6181 return NULL;
6182 }
6183 if (PyUnicode_READY(unicode) == -1) {
6184 return NULL;
6185 }
6186
6187 len = PyUnicode_GET_LENGTH(unicode);
6188 if (len == 0) {
6189 return PyBytes_FromStringAndSize(NULL, 0);
6190 }
6191
6192 kind = PyUnicode_KIND(unicode);
6193 data = PyUnicode_DATA(unicode);
6194 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6195 bytes, and 1 byte characters 4. */
6196 expandsize = kind * 2 + 2;
6197 if (len > PY_SSIZE_T_MAX / expandsize) {
6198 return PyErr_NoMemory();
6199 }
6200 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6201 if (repr == NULL) {
6202 return NULL;
6203 }
6204
6205 p = PyBytes_AS_STRING(repr);
6206 for (i = 0; i < len; i++) {
6207 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6208
6209 /* U+0000-U+00ff range */
6210 if (ch < 0x100) {
6211 if (ch >= ' ' && ch < 127) {
6212 if (ch != '\\') {
6213 /* Copy printable US ASCII as-is */
6214 *p++ = (char) ch;
6215 }
6216 /* Escape backslashes */
6217 else {
6218 *p++ = '\\';
6219 *p++ = '\\';
6220 }
6221 }
6222
6223 /* Map special whitespace to '\t', \n', '\r' */
6224 else if (ch == '\t') {
6225 *p++ = '\\';
6226 *p++ = 't';
6227 }
6228 else if (ch == '\n') {
6229 *p++ = '\\';
6230 *p++ = 'n';
6231 }
6232 else if (ch == '\r') {
6233 *p++ = '\\';
6234 *p++ = 'r';
6235 }
6236
6237 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6238 else {
6239 *p++ = '\\';
6240 *p++ = 'x';
6241 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6242 *p++ = Py_hexdigits[ch & 0x000F];
6243 }
6244 }
6245 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6246 else if (ch < 0x10000) {
6247 *p++ = '\\';
6248 *p++ = 'u';
6249 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6250 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6251 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6252 *p++ = Py_hexdigits[ch & 0x000F];
6253 }
6254 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6255 else {
6256
6257 /* Make sure that the first two digits are zero */
6258 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6259 *p++ = '\\';
6260 *p++ = 'U';
6261 *p++ = '0';
6262 *p++ = '0';
6263 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6264 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6265 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6266 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6267 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6268 *p++ = Py_hexdigits[ch & 0x0000000F];
6269 }
6270 }
6271
6272 assert(p - PyBytes_AS_STRING(repr) > 0);
6273 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6274 return NULL;
6275 }
6276 return repr;
6277 }
6278
6279 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6280 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6281 Py_ssize_t size)
6282 {
6283 PyObject *result;
6284 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6285 if (tmp == NULL) {
6286 return NULL;
6287 }
6288
6289 result = PyUnicode_AsUnicodeEscapeString(tmp);
6290 Py_DECREF(tmp);
6291 return result;
6292 }
6293
6294 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6295
/* Decode the `size` bytes at `s` with the "rawunicodeescape" rules and
   return a new str object, or NULL with an exception set.

   Only \uXXXX and \UXXXXXXXX are interpreted.  Every other byte is written
   out as the code point with the same ordinal; this includes a backslash
   that is the last input byte, and a backslash followed by anything other
   than 'u' or 'U' (both characters are kept).

   errors - error-handler name, forwarded to
            unicode_decode_call_errorhandler_writer(). */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store a character: directly when it does not exceed the writer's current
   maxchar, otherwise through _PyUnicodeWriter_WriteCharInline().  Jumps to
   onError when that call fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        /* (a backslash that is the last input byte is treated the same) */
        if (c != '\\' || s >= end) {
            WRITE_CHAR(c);
            continue;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* not an escape this codec knows: keep both characters */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }
        /* position of the backslash (s is now two bytes past it) */
        startinpos = s - starts - 2;

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count && s < end; ++s, --count) {
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                break;
            }
        }
        /* count == 0: all digits were present.  Otherwise fall through to
           the error handler with the "truncated ..." message set above. */
        if (!count) {
            if (ch <= MAX_UNICODE) {
                WRITE_CHAR(ch);
                continue;
            }
            message = "\\Uxxxxxxxx out of range";
        }

        /* The error range runs from the backslash up to the current read
           position.  min_length is updated so that it covers the output so
           far plus the input that is still unread. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;

}
6413
6414
6415 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6416 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6417 {
6418 PyObject *repr;
6419 char *p;
6420 Py_ssize_t expandsize, pos;
6421 int kind;
6422 void *data;
6423 Py_ssize_t len;
6424
6425 if (!PyUnicode_Check(unicode)) {
6426 PyErr_BadArgument();
6427 return NULL;
6428 }
6429 if (PyUnicode_READY(unicode) == -1) {
6430 return NULL;
6431 }
6432 kind = PyUnicode_KIND(unicode);
6433 data = PyUnicode_DATA(unicode);
6434 len = PyUnicode_GET_LENGTH(unicode);
6435 if (kind == PyUnicode_1BYTE_KIND) {
6436 return PyBytes_FromStringAndSize(data, len);
6437 }
6438
6439 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6440 bytes, and 1 byte characters 4. */
6441 expandsize = kind * 2 + 2;
6442
6443 if (len > PY_SSIZE_T_MAX / expandsize) {
6444 return PyErr_NoMemory();
6445 }
6446 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6447 if (repr == NULL) {
6448 return NULL;
6449 }
6450 if (len == 0) {
6451 return repr;
6452 }
6453
6454 p = PyBytes_AS_STRING(repr);
6455 for (pos = 0; pos < len; pos++) {
6456 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6457
6458 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6459 if (ch < 0x100) {
6460 *p++ = (char) ch;
6461 }
6462 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6463 else if (ch < 0x10000) {
6464 *p++ = '\\';
6465 *p++ = 'u';
6466 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6467 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6468 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6469 *p++ = Py_hexdigits[ch & 15];
6470 }
6471 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6472 else {
6473 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6474 *p++ = '\\';
6475 *p++ = 'U';
6476 *p++ = '0';
6477 *p++ = '0';
6478 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6479 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6480 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6481 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6482 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6483 *p++ = Py_hexdigits[ch & 15];
6484 }
6485 }
6486
6487 assert(p > PyBytes_AS_STRING(repr));
6488 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6489 return NULL;
6490 }
6491 return repr;
6492 }
6493
6494 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6495 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6496 Py_ssize_t size)
6497 {
6498 PyObject *result;
6499 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6500 if (tmp == NULL)
6501 return NULL;
6502 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6503 Py_DECREF(tmp);
6504 return result;
6505 }
6506
6507 /* --- Unicode Internal Codec ------------------------------------------- */
6508
/* Decode the `size` bytes at `s` as a raw array of Py_UNICODE values and
   return a new str object, or NULL with an exception set.

   A DeprecationWarning is issued first; if that call reports failure the
   function returns NULL without decoding.  A negative `size` is rejected
   with PyErr_BadInternalCall().

   errors - error-handler name, forwarded to
            unicode_decode_call_errorhandler_writer() for truncated input
            and (wide builds only) code points above 0x10FFFF. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* number of Py_UNICODE units in the input, rounded up */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        /* fewer bytes left than one Py_UNICODE unit */
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            /* s has not been advanced yet, so the error range is this unit */
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow build: a high surrogate directly followed by a low
           surrogate is joined into a single code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

        /* Reached only through the gotos above; endinpos and reason have
           already been set, and s still points at the offending unit. */
    error:
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6608
6609 /* --- Latin-1 Codec ------------------------------------------------------ */
6610
6611 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6612 PyUnicode_DecodeLatin1(const char *s,
6613 Py_ssize_t size,
6614 const char *errors)
6615 {
6616 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6617 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6618 }
6619
6620 /* create or adjust a UnicodeEncodeError */
6621 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6622 make_encode_exception(PyObject **exceptionObject,
6623 const char *encoding,
6624 PyObject *unicode,
6625 Py_ssize_t startpos, Py_ssize_t endpos,
6626 const char *reason)
6627 {
6628 if (*exceptionObject == NULL) {
6629 *exceptionObject = PyObject_CallFunction(
6630 PyExc_UnicodeEncodeError, "sOnns",
6631 encoding, unicode, startpos, endpos, reason);
6632 }
6633 else {
6634 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6635 goto onError;
6636 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6637 goto onError;
6638 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6639 goto onError;
6640 return;
6641 onError:
6642 Py_CLEAR(*exceptionObject);
6643 }
6644 }
6645
6646 /* raises a UnicodeEncodeError */
6647 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6648 raise_encode_exception(PyObject **exceptionObject,
6649 const char *encoding,
6650 PyObject *unicode,
6651 Py_ssize_t startpos, Py_ssize_t endpos,
6652 const char *reason)
6653 {
6654 make_encode_exception(exceptionObject,
6655 encoding, unicode, startpos, endpos, reason);
6656 if (*exceptionObject != NULL)
6657 PyCodec_StrictErrors(*exceptionObject);
6658 }
6659
6660 /* error handling callback helper:
6661 build arguments, call the callback and check the arguments,
6662 put the result into newpos and return the replacement string, which
6663 has to be freed by the caller */
6664 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6665 unicode_encode_call_errorhandler(const char *errors,
6666 PyObject **errorHandler,
6667 const char *encoding, const char *reason,
6668 PyObject *unicode, PyObject **exceptionObject,
6669 Py_ssize_t startpos, Py_ssize_t endpos,
6670 Py_ssize_t *newpos)
6671 {
6672 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6673 Py_ssize_t len;
6674 PyObject *restuple;
6675 PyObject *resunicode;
6676
6677 if (*errorHandler == NULL) {
6678 *errorHandler = PyCodec_LookupError(errors);
6679 if (*errorHandler == NULL)
6680 return NULL;
6681 }
6682
6683 if (PyUnicode_READY(unicode) == -1)
6684 return NULL;
6685 len = PyUnicode_GET_LENGTH(unicode);
6686
6687 make_encode_exception(exceptionObject,
6688 encoding, unicode, startpos, endpos, reason);
6689 if (*exceptionObject == NULL)
6690 return NULL;
6691
6692 restuple = PyObject_CallFunctionObjArgs(
6693 *errorHandler, *exceptionObject, NULL);
6694 if (restuple == NULL)
6695 return NULL;
6696 if (!PyTuple_Check(restuple)) {
6697 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698 Py_DECREF(restuple);
6699 return NULL;
6700 }
6701 if (!PyArg_ParseTuple(restuple, argparse,
6702 &resunicode, newpos)) {
6703 Py_DECREF(restuple);
6704 return NULL;
6705 }
6706 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6707 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6708 Py_DECREF(restuple);
6709 return NULL;
6710 }
6711 if (*newpos<0)
6712 *newpos = len + *newpos;
6713 if (*newpos<0 || *newpos>len) {
6714 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6715 Py_DECREF(restuple);
6716 return NULL;
6717 }
6718 Py_INCREF(resunicode);
6719 Py_DECREF(restuple);
6720 return resunicode;
6721 }
6722
6723 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6724 unicode_encode_ucs1(PyObject *unicode,
6725 const char *errors,
6726 const Py_UCS4 limit)
6727 {
6728 /* input state */
6729 Py_ssize_t pos=0, size;
6730 int kind;
6731 void *data;
6732 /* pointer into the output */
6733 char *str;
6734 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6735 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6736 PyObject *error_handler_obj = NULL;
6737 PyObject *exc = NULL;
6738 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6739 PyObject *rep = NULL;
6740 /* output object */
6741 _PyBytesWriter writer;
6742
6743 if (PyUnicode_READY(unicode) == -1)
6744 return NULL;
6745 size = PyUnicode_GET_LENGTH(unicode);
6746 kind = PyUnicode_KIND(unicode);
6747 data = PyUnicode_DATA(unicode);
6748 /* allocate enough for a simple encoding without
6749 replacements, if we need more, we'll resize */
6750 if (size == 0)
6751 return PyBytes_FromStringAndSize(NULL, 0);
6752
6753 _PyBytesWriter_Init(&writer);
6754 str = _PyBytesWriter_Alloc(&writer, size);
6755 if (str == NULL)
6756 return NULL;
6757
6758 while (pos < size) {
6759 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6760
6761 /* can we encode this? */
6762 if (ch < limit) {
6763 /* no overflow check, because we know that the space is enough */
6764 *str++ = (char)ch;
6765 ++pos;
6766 }
6767 else {
6768 Py_ssize_t newpos, i;
6769 /* startpos for collecting unencodable chars */
6770 Py_ssize_t collstart = pos;
6771 Py_ssize_t collend = collstart + 1;
6772 /* find all unecodable characters */
6773
6774 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6775 ++collend;
6776
6777 /* Only overallocate the buffer if it's not the last write */
6778 writer.overallocate = (collend < size);
6779
6780 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6781 if (error_handler == _Py_ERROR_UNKNOWN)
6782 error_handler = get_error_handler(errors);
6783
6784 switch (error_handler) {
6785 case _Py_ERROR_STRICT:
6786 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6787 goto onError;
6788
6789 case _Py_ERROR_REPLACE:
6790 memset(str, '?', collend - collstart);
6791 str += (collend - collstart);
6792 /* fall through */
6793 case _Py_ERROR_IGNORE:
6794 pos = collend;
6795 break;
6796
6797 case _Py_ERROR_BACKSLASHREPLACE:
6798 /* subtract preallocated bytes */
6799 writer.min_size -= (collend - collstart);
6800 str = backslashreplace(&writer, str,
6801 unicode, collstart, collend);
6802 if (str == NULL)
6803 goto onError;
6804 pos = collend;
6805 break;
6806
6807 case _Py_ERROR_XMLCHARREFREPLACE:
6808 /* subtract preallocated bytes */
6809 writer.min_size -= (collend - collstart);
6810 str = xmlcharrefreplace(&writer, str,
6811 unicode, collstart, collend);
6812 if (str == NULL)
6813 goto onError;
6814 pos = collend;
6815 break;
6816
6817 case _Py_ERROR_SURROGATEESCAPE:
6818 for (i = collstart; i < collend; ++i) {
6819 ch = PyUnicode_READ(kind, data, i);
6820 if (ch < 0xdc80 || 0xdcff < ch) {
6821 /* Not a UTF-8b surrogate */
6822 break;
6823 }
6824 *str++ = (char)(ch - 0xdc00);
6825 ++pos;
6826 }
6827 if (i >= collend)
6828 break;
6829 collstart = pos;
6830 assert(collstart != collend);
6831 /* fall through */
6832
6833 default:
6834 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6835 encoding, reason, unicode, &exc,
6836 collstart, collend, &newpos);
6837 if (rep == NULL)
6838 goto onError;
6839
6840 /* subtract preallocated bytes */
6841 writer.min_size -= newpos - collstart;
6842
6843 if (PyBytes_Check(rep)) {
6844 /* Directly copy bytes result to output. */
6845 str = _PyBytesWriter_WriteBytes(&writer, str,
6846 PyBytes_AS_STRING(rep),
6847 PyBytes_GET_SIZE(rep));
6848 }
6849 else {
6850 assert(PyUnicode_Check(rep));
6851
6852 if (PyUnicode_READY(rep) < 0)
6853 goto onError;
6854
6855 if (limit == 256 ?
6856 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6857 !PyUnicode_IS_ASCII(rep))
6858 {
6859 /* Not all characters are smaller than limit */
6860 raise_encode_exception(&exc, encoding, unicode,
6861 collstart, collend, reason);
6862 goto onError;
6863 }
6864 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6865 str = _PyBytesWriter_WriteBytes(&writer, str,
6866 PyUnicode_DATA(rep),
6867 PyUnicode_GET_LENGTH(rep));
6868 }
6869 if (str == NULL)
6870 goto onError;
6871
6872 pos = newpos;
6873 Py_CLEAR(rep);
6874 }
6875
6876 /* If overallocation was disabled, ensure that it was the last
6877 write. Otherwise, we missed an optimization */
6878 assert(writer.overallocate || pos == size);
6879 }
6880 }
6881
6882 Py_XDECREF(error_handler_obj);
6883 Py_XDECREF(exc);
6884 return _PyBytesWriter_Finish(&writer, str);
6885
6886 onError:
6887 Py_XDECREF(rep);
6888 _PyBytesWriter_Dealloc(&writer);
6889 Py_XDECREF(error_handler_obj);
6890 Py_XDECREF(exc);
6891 return NULL;
6892 }
6893
6894 /* Deprecated */
6895 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6896 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6897 Py_ssize_t size,
6898 const char *errors)
6899 {
6900 PyObject *result;
6901 PyObject *unicode = PyUnicode_FromWideChar(p, size);
6902 if (unicode == NULL)
6903 return NULL;
6904 result = unicode_encode_ucs1(unicode, errors, 256);
6905 Py_DECREF(unicode);
6906 return result;
6907 }
6908
6909 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6910 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6911 {
6912 if (!PyUnicode_Check(unicode)) {
6913 PyErr_BadArgument();
6914 return NULL;
6915 }
6916 if (PyUnicode_READY(unicode) == -1)
6917 return NULL;
6918 /* Fast path: if it is a one-byte string, construct
6919 bytes object directly. */
6920 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6921 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6922 PyUnicode_GET_LENGTH(unicode));
6923 /* Non-Latin-1 characters present. Defer to above function to
6924 raise the exception. */
6925 return unicode_encode_ucs1(unicode, errors, 256);
6926 }
6927
6928 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6929 PyUnicode_AsLatin1String(PyObject *unicode)
6930 {
6931 return _PyUnicode_AsLatin1String(unicode, NULL);
6932 }
6933
6934 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6935
/* Decode `size` bytes at `s` as 7-bit ASCII into a str object.

   Bytes >= 128 are handled according to `errors`.  The "replace",
   "surrogateescape" and "ignore" handlers are processed inline; any other
   handler goes through unicode_decode_call_errorhandler_writer().

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    int kind;
    void *data;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128)
        return get_latin1_char((unsigned char)s[0]);

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
        return NULL;

    e = s + size;
    data = writer.data;
    /* Bulk-decode the leading run; the result is used as the number of
       characters written.  If it covers the whole input we are done. */
    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
    writer.pos = outpos;
    if (writer.pos == size)
        return _PyUnicodeWriter_Finish(&writer);

    s += writer.pos;
    /* `kind` and `data` cache the writer's buffer; they are re-read below
       after every call that may have changed it. */
    kind = writer.kind;
    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        /* Resolve the handler name once, on the first error. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* Drop the offending byte. */
            ++s;
            break;

        default:
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* Refresh the cached buffer description after the handler
               wrote through the writer. */
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7034
7035 /* Deprecated */
7036 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7037 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7038 Py_ssize_t size,
7039 const char *errors)
7040 {
7041 PyObject *result;
7042 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7043 if (unicode == NULL)
7044 return NULL;
7045 result = unicode_encode_ucs1(unicode, errors, 128);
7046 Py_DECREF(unicode);
7047 return result;
7048 }
7049
7050 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7051 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7052 {
7053 if (!PyUnicode_Check(unicode)) {
7054 PyErr_BadArgument();
7055 return NULL;
7056 }
7057 if (PyUnicode_READY(unicode) == -1)
7058 return NULL;
7059 /* Fast path: if it is an ASCII-only string, construct bytes object
7060 directly. Else defer to above function to raise the exception. */
7061 if (PyUnicode_IS_ASCII(unicode))
7062 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7063 PyUnicode_GET_LENGTH(unicode));
7064 return unicode_encode_ucs1(unicode, errors, 128);
7065 }
7066
7067 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7068 PyUnicode_AsASCIIString(PyObject *unicode)
7069 {
7070 return _PyUnicode_AsASCIIString(unicode, NULL);
7071 }
7072
7073 #ifdef MS_WINDOWS
7074
7075 /* --- MBCS codecs for Windows -------------------------------------------- */
7076
/* When int is narrower than size_t, a Py_ssize_t length may not fit the
   int-sized parameters used by the code page helpers below; NEED_RETRY
   enables the code that splits such input into chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for SDK headers that do not provide this flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7084
7085 static const char*
code_page_name(UINT code_page,PyObject ** obj)7086 code_page_name(UINT code_page, PyObject **obj)
7087 {
7088 *obj = NULL;
7089 if (code_page == CP_ACP)
7090 return "mbcs";
7091 if (code_page == CP_UTF7)
7092 return "CP_UTF7";
7093 if (code_page == CP_UTF8)
7094 return "CP_UTF8";
7095
7096 *obj = PyBytes_FromFormat("cp%u", code_page);
7097 if (*obj == NULL)
7098 return NULL;
7099 return PyBytes_AS_STRING(*obj);
7100 }
7101
7102 static DWORD
decode_code_page_flags(UINT code_page)7103 decode_code_page_flags(UINT code_page)
7104 {
7105 if (code_page == CP_UTF7) {
7106 /* The CP_UTF7 decoder only supports flags=0 */
7107 return 0;
7108 }
7109 else
7110 return MB_ERR_INVALID_CHARS;
7111 }
7112
/*
 * Decode a byte string from a Windows code page into unicode object in strict
 * mode.
 *
 * If *v is NULL a new unicode object is created, otherwise the decoded
 * characters are appended to the existing object (which is resized).
 *
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
 * OSError and returns -1 on other error.
 */
static int
decode_code_page_strict(UINT code_page,
                        PyObject **v,
                        const char *in,
                        int insize)
{
    const DWORD flags = decode_code_page_flags(code_page);
    wchar_t *out;
    DWORD outsize;

    /* First get the size of the result */
    assert(insize > 0);
    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
    /* outsize is unsigned, so this test is effectively "== 0". */
    if (outsize <= 0)
        goto error;

    if (*v == NULL) {
        /* Create unicode object */
        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
        *v = (PyObject*)_PyUnicode_New(outsize);
        if (*v == NULL)
            return -1;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        /* NOTE(review): n + outsize is not checked for overflow here --
           confirm that callers keep the total within Py_ssize_t range. */
        if (unicode_resize(v, n + outsize) < 0)
            return -1;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Do the conversion */
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
    if (outsize <= 0)
        goto error;
    return insize;

error:
    /* The last-error value is read directly after the failed call; an
       undecodable sequence is reported as -2 without setting a Python
       exception so the caller can fall back to an error handler. */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
        return -2;
    PyErr_SetFromWindowsErr(0);
    return -1;
}
7164
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time so that the exact
 * undecodable bytes can be passed to the error handler.  If *v is NULL a
 * new unicode object is created, otherwise output is appended to it.
 * When `final` is false, an undecodable sequence at the very end of the
 * input is left unconsumed instead of being treated as an error.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        PyObject **v,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    const DWORD flags = decode_code_page_flags(code_page);
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Reserve the worst case up front: Py_ARRAY_LENGTH(buffer) wchar_t
       units for every input byte. */
    if (*v == NULL) {
        /* Create unicode object */
        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
        if (*v == NULL)
            goto error;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
            goto error;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* Try 1 byte first, then grow the window one byte at a time until
           the conversion succeeds or the window limit is reached. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - PyUnicode_AS_UNICODE(*v);
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    v, &outpos))
            {
                goto error;
            }
            /* Recompute the output pointer from *v and the updated
               position reported back by the handler call. */
            out = PyUnicode_AS_UNICODE(*v) + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* write a NUL character at the end */
    *out = 0;

    /* Shrink the unicode object to the number of units actually written */
    outsize = out - PyUnicode_AS_UNICODE(*v);
    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
    if (unicode_resize(v, outsize) < 0)
        goto error;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared exit: also reached on success, with ret already set. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7306
/* Decode `size` bytes at `s` from the Windows code page `code_page`.

   Each chunk is first decoded in strict mode; if that reports a decode
   error (-2) the chunk is decoded again through the error handler named
   by `errors`.  When NEED_RETRY is defined, input longer than INT_MAX is
   processed in INT_MAX-sized chunks.

   If `consumed` is not NULL, decoding is not final (an incomplete trailing
   sequence is left unconsumed) and *consumed receives the number of bytes
   used.

   Returns a new str object, or NULL with an exception set. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    PyObject *v = NULL;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        /* More data than fits in an int: take an INT_MAX chunk and loop. */
        if (size > INT_MAX) {
            chunk_size = INT_MAX;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk. */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        /* Nothing left to decode: return what was accumulated so far, or
           the empty string if there was no input at all. */
        if (chunk_size == 0 && done) {
            if (v != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &v,
                                            s, chunk_size);
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &v,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            Py_XDECREF(v);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        s += converted;
        size -= converted;
    } while (!done);

    return unicode_result(v);
}
7371
7372 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7373 PyUnicode_DecodeCodePageStateful(int code_page,
7374 const char *s,
7375 Py_ssize_t size,
7376 const char *errors,
7377 Py_ssize_t *consumed)
7378 {
7379 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7380 }
7381
7382 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7383 PyUnicode_DecodeMBCSStateful(const char *s,
7384 Py_ssize_t size,
7385 const char *errors,
7386 Py_ssize_t *consumed)
7387 {
7388 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7389 }
7390
7391 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7392 PyUnicode_DecodeMBCS(const char *s,
7393 Py_ssize_t size,
7394 const char *errors)
7395 {
7396 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7397 }
7398
7399 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7400 encode_code_page_flags(UINT code_page, const char *errors)
7401 {
7402 if (code_page == CP_UTF8) {
7403 return WC_ERR_INVALID_CHARS;
7404 }
7405 else if (code_page == CP_UTF7) {
7406 /* CP_UTF7 only supports flags=0 */
7407 return 0;
7408 }
7409 else {
7410 if (errors != NULL && strcmp(errors, "replace") == 0)
7411 return 0;
7412 else
7413 return WC_NO_BEST_FIT_CHARS;
7414 }
7415 }
7416
7417 /*
7418 * Encode a Unicode string to a Windows code page into a byte string in strict
7419 * mode.
7420 *
7421 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7422 * an OSError and returns -1 on other error.
7423 */
7424 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7425 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7426 PyObject *unicode, Py_ssize_t offset, int len,
7427 const char* errors)
7428 {
7429 BOOL usedDefaultChar = FALSE;
7430 BOOL *pusedDefaultChar = &usedDefaultChar;
7431 int outsize;
7432 wchar_t *p;
7433 Py_ssize_t size;
7434 const DWORD flags = encode_code_page_flags(code_page, NULL);
7435 char *out;
7436 /* Create a substring so that we can get the UTF-16 representation
7437 of just the slice under consideration. */
7438 PyObject *substring;
7439
7440 assert(len > 0);
7441
7442 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7443 pusedDefaultChar = &usedDefaultChar;
7444 else
7445 pusedDefaultChar = NULL;
7446
7447 substring = PyUnicode_Substring(unicode, offset, offset+len);
7448 if (substring == NULL)
7449 return -1;
7450 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7451 if (p == NULL) {
7452 Py_DECREF(substring);
7453 return -1;
7454 }
7455 assert(size <= INT_MAX);
7456
7457 /* First get the size of the result */
7458 outsize = WideCharToMultiByte(code_page, flags,
7459 p, (int)size,
7460 NULL, 0,
7461 NULL, pusedDefaultChar);
7462 if (outsize <= 0)
7463 goto error;
7464 /* If we used a default char, then we failed! */
7465 if (pusedDefaultChar && *pusedDefaultChar) {
7466 Py_DECREF(substring);
7467 return -2;
7468 }
7469
7470 if (*outbytes == NULL) {
7471 /* Create string object */
7472 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7473 if (*outbytes == NULL) {
7474 Py_DECREF(substring);
7475 return -1;
7476 }
7477 out = PyBytes_AS_STRING(*outbytes);
7478 }
7479 else {
7480 /* Extend string object */
7481 const Py_ssize_t n = PyBytes_Size(*outbytes);
7482 if (outsize > PY_SSIZE_T_MAX - n) {
7483 PyErr_NoMemory();
7484 Py_DECREF(substring);
7485 return -1;
7486 }
7487 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7488 Py_DECREF(substring);
7489 return -1;
7490 }
7491 out = PyBytes_AS_STRING(*outbytes) + n;
7492 }
7493
7494 /* Do the conversion */
7495 outsize = WideCharToMultiByte(code_page, flags,
7496 p, (int)size,
7497 out, outsize,
7498 NULL, pusedDefaultChar);
7499 Py_CLEAR(substring);
7500 if (outsize <= 0)
7501 goto error;
7502 if (pusedDefaultChar && *pusedDefaultChar)
7503 return -2;
7504 return 0;
7505
7506 error:
7507 Py_XDECREF(substring);
7508 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7509 return -2;
7510 PyErr_SetFromWindowsErr(0);
7511 return -1;
7512 }
7513
7514 /*
7515 * Encode a Unicode string to a Windows code page into a byte string using an
7516 * error handler.
7517 *
7518 * Returns consumed characters if succeed, or raise an OSError and returns
7519 * -1 on other error.
7520 */
7521 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7522 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7523 PyObject *unicode, Py_ssize_t unicode_offset,
7524 Py_ssize_t insize, const char* errors)
7525 {
7526 const DWORD flags = encode_code_page_flags(code_page, errors);
7527 Py_ssize_t pos = unicode_offset;
7528 Py_ssize_t endin = unicode_offset + insize;
7529 /* Ideally, we should get reason from FormatMessage. This is the Windows
7530 2000 English version of the message. */
7531 const char *reason = "invalid character";
7532 /* 4=maximum length of a UTF-8 sequence */
7533 char buffer[4];
7534 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7535 Py_ssize_t outsize;
7536 char *out;
7537 PyObject *errorHandler = NULL;
7538 PyObject *exc = NULL;
7539 PyObject *encoding_obj = NULL;
7540 const char *encoding;
7541 Py_ssize_t newpos, newoutsize;
7542 PyObject *rep;
7543 int ret = -1;
7544
7545 assert(insize > 0);
7546
7547 encoding = code_page_name(code_page, &encoding_obj);
7548 if (encoding == NULL)
7549 return -1;
7550
7551 if (errors == NULL || strcmp(errors, "strict") == 0) {
7552 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7553 then we raise a UnicodeEncodeError. */
7554 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7555 if (exc != NULL) {
7556 PyCodec_StrictErrors(exc);
7557 Py_DECREF(exc);
7558 }
7559 Py_XDECREF(encoding_obj);
7560 return -1;
7561 }
7562
7563 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7564 pusedDefaultChar = &usedDefaultChar;
7565 else
7566 pusedDefaultChar = NULL;
7567
7568 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7569 PyErr_NoMemory();
7570 goto error;
7571 }
7572 outsize = insize * Py_ARRAY_LENGTH(buffer);
7573
7574 if (*outbytes == NULL) {
7575 /* Create string object */
7576 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7577 if (*outbytes == NULL)
7578 goto error;
7579 out = PyBytes_AS_STRING(*outbytes);
7580 }
7581 else {
7582 /* Extend string object */
7583 Py_ssize_t n = PyBytes_Size(*outbytes);
7584 if (n > PY_SSIZE_T_MAX - outsize) {
7585 PyErr_NoMemory();
7586 goto error;
7587 }
7588 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7589 goto error;
7590 out = PyBytes_AS_STRING(*outbytes) + n;
7591 }
7592
7593 /* Encode the string character per character */
7594 while (pos < endin)
7595 {
7596 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7597 wchar_t chars[2];
7598 int charsize;
7599 if (ch < 0x10000) {
7600 chars[0] = (wchar_t)ch;
7601 charsize = 1;
7602 }
7603 else {
7604 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7605 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7606 charsize = 2;
7607 }
7608
7609 outsize = WideCharToMultiByte(code_page, flags,
7610 chars, charsize,
7611 buffer, Py_ARRAY_LENGTH(buffer),
7612 NULL, pusedDefaultChar);
7613 if (outsize > 0) {
7614 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7615 {
7616 pos++;
7617 memcpy(out, buffer, outsize);
7618 out += outsize;
7619 continue;
7620 }
7621 }
7622 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7623 PyErr_SetFromWindowsErr(0);
7624 goto error;
7625 }
7626
7627 rep = unicode_encode_call_errorhandler(
7628 errors, &errorHandler, encoding, reason,
7629 unicode, &exc,
7630 pos, pos + 1, &newpos);
7631 if (rep == NULL)
7632 goto error;
7633 pos = newpos;
7634
7635 if (PyBytes_Check(rep)) {
7636 outsize = PyBytes_GET_SIZE(rep);
7637 if (outsize != 1) {
7638 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7639 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7640 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7641 Py_DECREF(rep);
7642 goto error;
7643 }
7644 out = PyBytes_AS_STRING(*outbytes) + offset;
7645 }
7646 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7647 out += outsize;
7648 }
7649 else {
7650 Py_ssize_t i;
7651 enum PyUnicode_Kind kind;
7652 void *data;
7653
7654 if (PyUnicode_READY(rep) == -1) {
7655 Py_DECREF(rep);
7656 goto error;
7657 }
7658
7659 outsize = PyUnicode_GET_LENGTH(rep);
7660 if (outsize != 1) {
7661 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7662 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7663 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7664 Py_DECREF(rep);
7665 goto error;
7666 }
7667 out = PyBytes_AS_STRING(*outbytes) + offset;
7668 }
7669 kind = PyUnicode_KIND(rep);
7670 data = PyUnicode_DATA(rep);
7671 for (i=0; i < outsize; i++) {
7672 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7673 if (ch > 127) {
7674 raise_encode_exception(&exc,
7675 encoding, unicode,
7676 pos, pos + 1,
7677 "unable to encode error handler result to ASCII");
7678 Py_DECREF(rep);
7679 goto error;
7680 }
7681 *out = (unsigned char)ch;
7682 out++;
7683 }
7684 }
7685 Py_DECREF(rep);
7686 }
7687 /* write a NUL byte */
7688 *out = 0;
7689 outsize = out - PyBytes_AS_STRING(*outbytes);
7690 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7691 if (_PyBytes_Resize(outbytes, outsize) < 0)
7692 goto error;
7693 ret = 0;
7694
7695 error:
7696 Py_XDECREF(encoding_obj);
7697 Py_XDECREF(errorHandler);
7698 Py_XDECREF(exc);
7699 return ret;
7700 }
7701
7702 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7703 encode_code_page(int code_page,
7704 PyObject *unicode,
7705 const char *errors)
7706 {
7707 Py_ssize_t len;
7708 PyObject *outbytes = NULL;
7709 Py_ssize_t offset;
7710 int chunk_len, ret, done;
7711
7712 if (!PyUnicode_Check(unicode)) {
7713 PyErr_BadArgument();
7714 return NULL;
7715 }
7716
7717 if (PyUnicode_READY(unicode) == -1)
7718 return NULL;
7719 len = PyUnicode_GET_LENGTH(unicode);
7720
7721 if (code_page < 0) {
7722 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7723 return NULL;
7724 }
7725
7726 if (len == 0)
7727 return PyBytes_FromStringAndSize(NULL, 0);
7728
7729 offset = 0;
7730 do
7731 {
7732 #ifdef NEED_RETRY
7733 /* UTF-16 encoding may double the size, so use only INT_MAX/2
7734 chunks. */
7735 if (len > INT_MAX/2) {
7736 chunk_len = INT_MAX/2;
7737 done = 0;
7738 }
7739 else
7740 #endif
7741 {
7742 chunk_len = (int)len;
7743 done = 1;
7744 }
7745
7746 ret = encode_code_page_strict(code_page, &outbytes,
7747 unicode, offset, chunk_len,
7748 errors);
7749 if (ret == -2)
7750 ret = encode_code_page_errors(code_page, &outbytes,
7751 unicode, offset,
7752 chunk_len, errors);
7753 if (ret < 0) {
7754 Py_XDECREF(outbytes);
7755 return NULL;
7756 }
7757
7758 offset += chunk_len;
7759 len -= chunk_len;
7760 } while (!done);
7761
7762 return outbytes;
7763 }
7764
7765 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7766 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7767 Py_ssize_t size,
7768 const char *errors)
7769 {
7770 PyObject *unicode, *res;
7771 unicode = PyUnicode_FromWideChar(p, size);
7772 if (unicode == NULL)
7773 return NULL;
7774 res = encode_code_page(CP_ACP, unicode, errors);
7775 Py_DECREF(unicode);
7776 return res;
7777 }
7778
7779 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7780 PyUnicode_EncodeCodePage(int code_page,
7781 PyObject *unicode,
7782 const char *errors)
7783 {
7784 return encode_code_page(code_page, unicode, errors);
7785 }
7786
7787 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7788 PyUnicode_AsMBCSString(PyObject *unicode)
7789 {
7790 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7791 }
7792
7793 #undef NEED_RETRY
7794
7795 #endif /* MS_WINDOWS */
7796
7797 /* --- Character Mapping Codec -------------------------------------------- */
7798
7799 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7800 charmap_decode_string(const char *s,
7801 Py_ssize_t size,
7802 PyObject *mapping,
7803 const char *errors,
7804 _PyUnicodeWriter *writer)
7805 {
7806 const char *starts = s;
7807 const char *e;
7808 Py_ssize_t startinpos, endinpos;
7809 PyObject *errorHandler = NULL, *exc = NULL;
7810 Py_ssize_t maplen;
7811 enum PyUnicode_Kind mapkind;
7812 void *mapdata;
7813 Py_UCS4 x;
7814 unsigned char ch;
7815
7816 if (PyUnicode_READY(mapping) == -1)
7817 return -1;
7818
7819 maplen = PyUnicode_GET_LENGTH(mapping);
7820 mapdata = PyUnicode_DATA(mapping);
7821 mapkind = PyUnicode_KIND(mapping);
7822
7823 e = s + size;
7824
7825 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7826 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7827 * is disabled in encoding aliases, latin1 is preferred because
7828 * its implementation is faster. */
7829 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7830 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7831 Py_UCS4 maxchar = writer->maxchar;
7832
7833 assert (writer->kind == PyUnicode_1BYTE_KIND);
7834 while (s < e) {
7835 ch = *s;
7836 x = mapdata_ucs1[ch];
7837 if (x > maxchar) {
7838 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7839 goto onError;
7840 maxchar = writer->maxchar;
7841 outdata = (Py_UCS1 *)writer->data;
7842 }
7843 outdata[writer->pos] = x;
7844 writer->pos++;
7845 ++s;
7846 }
7847 return 0;
7848 }
7849
7850 while (s < e) {
7851 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7852 enum PyUnicode_Kind outkind = writer->kind;
7853 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7854 if (outkind == PyUnicode_1BYTE_KIND) {
7855 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7856 Py_UCS4 maxchar = writer->maxchar;
7857 while (s < e) {
7858 ch = *s;
7859 x = mapdata_ucs2[ch];
7860 if (x > maxchar)
7861 goto Error;
7862 outdata[writer->pos] = x;
7863 writer->pos++;
7864 ++s;
7865 }
7866 break;
7867 }
7868 else if (outkind == PyUnicode_2BYTE_KIND) {
7869 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7870 while (s < e) {
7871 ch = *s;
7872 x = mapdata_ucs2[ch];
7873 if (x == 0xFFFE)
7874 goto Error;
7875 outdata[writer->pos] = x;
7876 writer->pos++;
7877 ++s;
7878 }
7879 break;
7880 }
7881 }
7882 ch = *s;
7883
7884 if (ch < maplen)
7885 x = PyUnicode_READ(mapkind, mapdata, ch);
7886 else
7887 x = 0xfffe; /* invalid value */
7888 Error:
7889 if (x == 0xfffe)
7890 {
7891 /* undefined mapping */
7892 startinpos = s-starts;
7893 endinpos = startinpos+1;
7894 if (unicode_decode_call_errorhandler_writer(
7895 errors, &errorHandler,
7896 "charmap", "character maps to <undefined>",
7897 &starts, &e, &startinpos, &endinpos, &exc, &s,
7898 writer)) {
7899 goto onError;
7900 }
7901 continue;
7902 }
7903
7904 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7905 goto onError;
7906 ++s;
7907 }
7908 Py_XDECREF(errorHandler);
7909 Py_XDECREF(exc);
7910 return 0;
7911
7912 onError:
7913 Py_XDECREF(errorHandler);
7914 Py_XDECREF(exc);
7915 return -1;
7916 }
7917
7918 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7919 charmap_decode_mapping(const char *s,
7920 Py_ssize_t size,
7921 PyObject *mapping,
7922 const char *errors,
7923 _PyUnicodeWriter *writer)
7924 {
7925 const char *starts = s;
7926 const char *e;
7927 Py_ssize_t startinpos, endinpos;
7928 PyObject *errorHandler = NULL, *exc = NULL;
7929 unsigned char ch;
7930 PyObject *key, *item = NULL;
7931
7932 e = s + size;
7933
7934 while (s < e) {
7935 ch = *s;
7936
7937 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7938 key = PyLong_FromLong((long)ch);
7939 if (key == NULL)
7940 goto onError;
7941
7942 item = PyObject_GetItem(mapping, key);
7943 Py_DECREF(key);
7944 if (item == NULL) {
7945 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7946 /* No mapping found means: mapping is undefined. */
7947 PyErr_Clear();
7948 goto Undefined;
7949 } else
7950 goto onError;
7951 }
7952
7953 /* Apply mapping */
7954 if (item == Py_None)
7955 goto Undefined;
7956 if (PyLong_Check(item)) {
7957 long value = PyLong_AS_LONG(item);
7958 if (value == 0xFFFE)
7959 goto Undefined;
7960 if (value < 0 || value > MAX_UNICODE) {
7961 PyErr_Format(PyExc_TypeError,
7962 "character mapping must be in range(0x%lx)",
7963 (unsigned long)MAX_UNICODE + 1);
7964 goto onError;
7965 }
7966
7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968 goto onError;
7969 }
7970 else if (PyUnicode_Check(item)) {
7971 if (PyUnicode_READY(item) == -1)
7972 goto onError;
7973 if (PyUnicode_GET_LENGTH(item) == 1) {
7974 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7975 if (value == 0xFFFE)
7976 goto Undefined;
7977 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7978 goto onError;
7979 }
7980 else {
7981 writer->overallocate = 1;
7982 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7983 goto onError;
7984 }
7985 }
7986 else {
7987 /* wrong return value */
7988 PyErr_SetString(PyExc_TypeError,
7989 "character mapping must return integer, None or str");
7990 goto onError;
7991 }
7992 Py_CLEAR(item);
7993 ++s;
7994 continue;
7995
7996 Undefined:
7997 /* undefined mapping */
7998 Py_CLEAR(item);
7999 startinpos = s-starts;
8000 endinpos = startinpos+1;
8001 if (unicode_decode_call_errorhandler_writer(
8002 errors, &errorHandler,
8003 "charmap", "character maps to <undefined>",
8004 &starts, &e, &startinpos, &endinpos, &exc, &s,
8005 writer)) {
8006 goto onError;
8007 }
8008 }
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return 0;
8012
8013 onError:
8014 Py_XDECREF(item);
8015 Py_XDECREF(errorHandler);
8016 Py_XDECREF(exc);
8017 return -1;
8018 }
8019
8020 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8021 PyUnicode_DecodeCharmap(const char *s,
8022 Py_ssize_t size,
8023 PyObject *mapping,
8024 const char *errors)
8025 {
8026 _PyUnicodeWriter writer;
8027
8028 /* Default to Latin-1 */
8029 if (mapping == NULL)
8030 return PyUnicode_DecodeLatin1(s, size, errors);
8031
8032 if (size == 0)
8033 _Py_RETURN_UNICODE_EMPTY();
8034 _PyUnicodeWriter_Init(&writer);
8035 writer.min_length = size;
8036 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8037 goto onError;
8038
8039 if (PyUnicode_CheckExact(mapping)) {
8040 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8041 goto onError;
8042 }
8043 else {
8044 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8045 goto onError;
8046 }
8047 return _PyUnicodeWriter_Finish(&writer);
8048
8049 onError:
8050 _PyUnicodeWriter_Dealloc(&writer);
8051 return NULL;
8052 }
8053
8054 /* Charmap encoding: the lookup table */
8055
8056 struct encoding_map {
8057 PyObject_HEAD
8058 unsigned char level1[32];
8059 int count2, count3;
8060 unsigned char level23[1];
8061 };
8062
8063 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8064 encoding_map_size(PyObject *obj, PyObject* args)
8065 {
8066 struct encoding_map *map = (struct encoding_map*)obj;
8067 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8068 128*map->count3);
8069 }
8070
8071 static PyMethodDef encoding_map_methods[] = {
8072 {"size", encoding_map_size, METH_NOARGS,
8073 PyDoc_STR("Return the size (in bytes) of this object") },
8074 { 0 }
8075 };
8076
8077 static void
encoding_map_dealloc(PyObject * o)8078 encoding_map_dealloc(PyObject* o)
8079 {
8080 PyObject_FREE(o);
8081 }
8082
8083 static PyTypeObject EncodingMapType = {
8084 PyVarObject_HEAD_INIT(NULL, 0)
8085 "EncodingMap", /*tp_name*/
8086 sizeof(struct encoding_map), /*tp_basicsize*/
8087 0, /*tp_itemsize*/
8088 /* methods */
8089 encoding_map_dealloc, /*tp_dealloc*/
8090 0, /*tp_print*/
8091 0, /*tp_getattr*/
8092 0, /*tp_setattr*/
8093 0, /*tp_reserved*/
8094 0, /*tp_repr*/
8095 0, /*tp_as_number*/
8096 0, /*tp_as_sequence*/
8097 0, /*tp_as_mapping*/
8098 0, /*tp_hash*/
8099 0, /*tp_call*/
8100 0, /*tp_str*/
8101 0, /*tp_getattro*/
8102 0, /*tp_setattro*/
8103 0, /*tp_as_buffer*/
8104 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8105 0, /*tp_doc*/
8106 0, /*tp_traverse*/
8107 0, /*tp_clear*/
8108 0, /*tp_richcompare*/
8109 0, /*tp_weaklistoffset*/
8110 0, /*tp_iter*/
8111 0, /*tp_iternext*/
8112 encoding_map_methods, /*tp_methods*/
8113 0, /*tp_members*/
8114 0, /*tp_getset*/
8115 0, /*tp_base*/
8116 0, /*tp_dict*/
8117 0, /*tp_descr_get*/
8118 0, /*tp_descr_set*/
8119 0, /*tp_dictoffset*/
8120 0, /*tp_init*/
8121 0, /*tp_alloc*/
8122 0, /*tp_new*/
8123 0, /*tp_free*/
8124 0, /*tp_is_gc*/
8125 };
8126
8127 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8128 PyUnicode_BuildEncodingMap(PyObject* string)
8129 {
8130 PyObject *result;
8131 struct encoding_map *mresult;
8132 int i;
8133 int need_dict = 0;
8134 unsigned char level1[32];
8135 unsigned char level2[512];
8136 unsigned char *mlevel1, *mlevel2, *mlevel3;
8137 int count2 = 0, count3 = 0;
8138 int kind;
8139 void *data;
8140 Py_ssize_t length;
8141 Py_UCS4 ch;
8142
8143 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8144 PyErr_BadArgument();
8145 return NULL;
8146 }
8147 kind = PyUnicode_KIND(string);
8148 data = PyUnicode_DATA(string);
8149 length = PyUnicode_GET_LENGTH(string);
8150 length = Py_MIN(length, 256);
8151 memset(level1, 0xFF, sizeof level1);
8152 memset(level2, 0xFF, sizeof level2);
8153
8154 /* If there isn't a one-to-one mapping of NULL to \0,
8155 or if there are non-BMP characters, we need to use
8156 a mapping dictionary. */
8157 if (PyUnicode_READ(kind, data, 0) != 0)
8158 need_dict = 1;
8159 for (i = 1; i < length; i++) {
8160 int l1, l2;
8161 ch = PyUnicode_READ(kind, data, i);
8162 if (ch == 0 || ch > 0xFFFF) {
8163 need_dict = 1;
8164 break;
8165 }
8166 if (ch == 0xFFFE)
8167 /* unmapped character */
8168 continue;
8169 l1 = ch >> 11;
8170 l2 = ch >> 7;
8171 if (level1[l1] == 0xFF)
8172 level1[l1] = count2++;
8173 if (level2[l2] == 0xFF)
8174 level2[l2] = count3++;
8175 }
8176
8177 if (count2 >= 0xFF || count3 >= 0xFF)
8178 need_dict = 1;
8179
8180 if (need_dict) {
8181 PyObject *result = PyDict_New();
8182 PyObject *key, *value;
8183 if (!result)
8184 return NULL;
8185 for (i = 0; i < length; i++) {
8186 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8187 value = PyLong_FromLong(i);
8188 if (!key || !value)
8189 goto failed1;
8190 if (PyDict_SetItem(result, key, value) == -1)
8191 goto failed1;
8192 Py_DECREF(key);
8193 Py_DECREF(value);
8194 }
8195 return result;
8196 failed1:
8197 Py_XDECREF(key);
8198 Py_XDECREF(value);
8199 Py_DECREF(result);
8200 return NULL;
8201 }
8202
8203 /* Create a three-level trie */
8204 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8205 16*count2 + 128*count3 - 1);
8206 if (!result)
8207 return PyErr_NoMemory();
8208 PyObject_Init(result, &EncodingMapType);
8209 mresult = (struct encoding_map*)result;
8210 mresult->count2 = count2;
8211 mresult->count3 = count3;
8212 mlevel1 = mresult->level1;
8213 mlevel2 = mresult->level23;
8214 mlevel3 = mresult->level23 + 16*count2;
8215 memcpy(mlevel1, level1, 32);
8216 memset(mlevel2, 0xFF, 16*count2);
8217 memset(mlevel3, 0, 128*count3);
8218 count3 = 0;
8219 for (i = 1; i < length; i++) {
8220 int o1, o2, o3, i2, i3;
8221 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8222 if (ch == 0xFFFE)
8223 /* unmapped character */
8224 continue;
8225 o1 = ch>>11;
8226 o2 = (ch>>7) & 0xF;
8227 i2 = 16*mlevel1[o1] + o2;
8228 if (mlevel2[i2] == 0xFF)
8229 mlevel2[i2] = count3++;
8230 o3 = ch & 0x7F;
8231 i3 = 128*mlevel2[i2] + o3;
8232 mlevel3[i3] = i;
8233 }
8234 return result;
8235 }
8236
8237 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8238 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8239 {
8240 struct encoding_map *map = (struct encoding_map*)mapping;
8241 int l1 = c>>11;
8242 int l2 = (c>>7) & 0xF;
8243 int l3 = c & 0x7F;
8244 int i;
8245
8246 if (c > 0xFFFF)
8247 return -1;
8248 if (c == 0)
8249 return 0;
8250 /* level 1*/
8251 i = map->level1[l1];
8252 if (i == 0xFF) {
8253 return -1;
8254 }
8255 /* level 2*/
8256 i = map->level23[16*i+l2];
8257 if (i == 0xFF) {
8258 return -1;
8259 }
8260 /* level 3 */
8261 i = map->level23[16*map->count2 + 128*i + l3];
8262 if (i == 0) {
8263 return -1;
8264 }
8265 return i;
8266 }
8267
8268 /* Lookup the character ch in the mapping. If the character
8269 can't be found, Py_None is returned (or NULL, if another
8270 error occurred). */
8271 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8272 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8273 {
8274 PyObject *w = PyLong_FromLong((long)c);
8275 PyObject *x;
8276
8277 if (w == NULL)
8278 return NULL;
8279 x = PyObject_GetItem(mapping, w);
8280 Py_DECREF(w);
8281 if (x == NULL) {
8282 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8283 /* No mapping found means: mapping is undefined. */
8284 PyErr_Clear();
8285 Py_RETURN_NONE;
8286 } else
8287 return NULL;
8288 }
8289 else if (x == Py_None)
8290 return x;
8291 else if (PyLong_Check(x)) {
8292 long value = PyLong_AS_LONG(x);
8293 if (value < 0 || value > 255) {
8294 PyErr_SetString(PyExc_TypeError,
8295 "character mapping must be in range(256)");
8296 Py_DECREF(x);
8297 return NULL;
8298 }
8299 return x;
8300 }
8301 else if (PyBytes_Check(x))
8302 return x;
8303 else {
8304 /* wrong return value */
8305 PyErr_Format(PyExc_TypeError,
8306 "character mapping must return integer, bytes or None, not %.400s",
8307 x->ob_type->tp_name);
8308 Py_DECREF(x);
8309 return NULL;
8310 }
8311 }
8312
8313 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8314 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8315 {
8316 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8317 /* exponentially overallocate to minimize reallocations */
8318 if (requiredsize < 2*outsize)
8319 requiredsize = 2*outsize;
8320 if (_PyBytes_Resize(outobj, requiredsize))
8321 return -1;
8322 return 0;
8323 }
8324
/* Outcome of trying to encode one character with a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8328 /* lookup the character, put the result in the output string and adjust
8329 various state variables. Resize the output bytes object if not enough
8330 space is available. Return a new reference to the object that
8331 was put in the output buffer, or Py_None, if the mapping was undefined
8332 (in which case no character was written) or NULL, if a
8333 reallocation error occurred. The caller must decref the result */
8334 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8335 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8336 PyObject **outobj, Py_ssize_t *outpos)
8337 {
8338 PyObject *rep;
8339 char *outstart;
8340 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8341
8342 if (Py_TYPE(mapping) == &EncodingMapType) {
8343 int res = encoding_map_lookup(c, mapping);
8344 Py_ssize_t requiredsize = *outpos+1;
8345 if (res == -1)
8346 return enc_FAILED;
8347 if (outsize<requiredsize)
8348 if (charmapencode_resize(outobj, outpos, requiredsize))
8349 return enc_EXCEPTION;
8350 outstart = PyBytes_AS_STRING(*outobj);
8351 outstart[(*outpos)++] = (char)res;
8352 return enc_SUCCESS;
8353 }
8354
8355 rep = charmapencode_lookup(c, mapping);
8356 if (rep==NULL)
8357 return enc_EXCEPTION;
8358 else if (rep==Py_None) {
8359 Py_DECREF(rep);
8360 return enc_FAILED;
8361 } else {
8362 if (PyLong_Check(rep)) {
8363 Py_ssize_t requiredsize = *outpos+1;
8364 if (outsize<requiredsize)
8365 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8366 Py_DECREF(rep);
8367 return enc_EXCEPTION;
8368 }
8369 outstart = PyBytes_AS_STRING(*outobj);
8370 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8371 }
8372 else {
8373 const char *repchars = PyBytes_AS_STRING(rep);
8374 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8375 Py_ssize_t requiredsize = *outpos+repsize;
8376 if (outsize<requiredsize)
8377 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8378 Py_DECREF(rep);
8379 return enc_EXCEPTION;
8380 }
8381 outstart = PyBytes_AS_STRING(*outobj);
8382 memcpy(outstart + *outpos, repchars, repsize);
8383 *outpos += repsize;
8384 }
8385 }
8386 Py_DECREF(rep);
8387 return enc_SUCCESS;
8388 }
8389
8390 /* handle an error in PyUnicode_EncodeCharmap
8391 Return 0 on success, -1 on error */
8392 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8393 charmap_encoding_error(
8394 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8395 PyObject **exceptionObject,
8396 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8397 PyObject **res, Py_ssize_t *respos)
8398 {
8399 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8400 Py_ssize_t size, repsize;
8401 Py_ssize_t newpos;
8402 enum PyUnicode_Kind kind;
8403 void *data;
8404 Py_ssize_t index;
8405 /* startpos for collecting unencodable chars */
8406 Py_ssize_t collstartpos = *inpos;
8407 Py_ssize_t collendpos = *inpos+1;
8408 Py_ssize_t collpos;
8409 const char *encoding = "charmap";
8410 const char *reason = "character maps to <undefined>";
8411 charmapencode_result x;
8412 Py_UCS4 ch;
8413 int val;
8414
8415 if (PyUnicode_READY(unicode) == -1)
8416 return -1;
8417 size = PyUnicode_GET_LENGTH(unicode);
8418 /* find all unencodable characters */
8419 while (collendpos < size) {
8420 PyObject *rep;
8421 if (Py_TYPE(mapping) == &EncodingMapType) {
8422 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8423 val = encoding_map_lookup(ch, mapping);
8424 if (val != -1)
8425 break;
8426 ++collendpos;
8427 continue;
8428 }
8429
8430 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8431 rep = charmapencode_lookup(ch, mapping);
8432 if (rep==NULL)
8433 return -1;
8434 else if (rep!=Py_None) {
8435 Py_DECREF(rep);
8436 break;
8437 }
8438 Py_DECREF(rep);
8439 ++collendpos;
8440 }
8441 /* cache callback name lookup
8442 * (if not done yet, i.e. it's the first error) */
8443 if (*error_handler == _Py_ERROR_UNKNOWN)
8444 *error_handler = get_error_handler(errors);
8445
8446 switch (*error_handler) {
8447 case _Py_ERROR_STRICT:
8448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8449 return -1;
8450
8451 case _Py_ERROR_REPLACE:
8452 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8453 x = charmapencode_output('?', mapping, res, respos);
8454 if (x==enc_EXCEPTION) {
8455 return -1;
8456 }
8457 else if (x==enc_FAILED) {
8458 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8459 return -1;
8460 }
8461 }
8462 /* fall through */
8463 case _Py_ERROR_IGNORE:
8464 *inpos = collendpos;
8465 break;
8466
8467 case _Py_ERROR_XMLCHARREFREPLACE:
8468 /* generate replacement (temporarily (mis)uses p) */
8469 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8470 char buffer[2+29+1+1];
8471 char *cp;
8472 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8473 for (cp = buffer; *cp; ++cp) {
8474 x = charmapencode_output(*cp, mapping, res, respos);
8475 if (x==enc_EXCEPTION)
8476 return -1;
8477 else if (x==enc_FAILED) {
8478 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8479 return -1;
8480 }
8481 }
8482 }
8483 *inpos = collendpos;
8484 break;
8485
8486 default:
8487 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8488 encoding, reason, unicode, exceptionObject,
8489 collstartpos, collendpos, &newpos);
8490 if (repunicode == NULL)
8491 return -1;
8492 if (PyBytes_Check(repunicode)) {
8493 /* Directly copy bytes result to output. */
8494 Py_ssize_t outsize = PyBytes_Size(*res);
8495 Py_ssize_t requiredsize;
8496 repsize = PyBytes_Size(repunicode);
8497 requiredsize = *respos + repsize;
8498 if (requiredsize > outsize)
8499 /* Make room for all additional bytes. */
8500 if (charmapencode_resize(res, respos, requiredsize)) {
8501 Py_DECREF(repunicode);
8502 return -1;
8503 }
8504 memcpy(PyBytes_AsString(*res) + *respos,
8505 PyBytes_AsString(repunicode), repsize);
8506 *respos += repsize;
8507 *inpos = newpos;
8508 Py_DECREF(repunicode);
8509 break;
8510 }
8511 /* generate replacement */
8512 if (PyUnicode_READY(repunicode) == -1) {
8513 Py_DECREF(repunicode);
8514 return -1;
8515 }
8516 repsize = PyUnicode_GET_LENGTH(repunicode);
8517 data = PyUnicode_DATA(repunicode);
8518 kind = PyUnicode_KIND(repunicode);
8519 for (index = 0; index < repsize; index++) {
8520 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8521 x = charmapencode_output(repch, mapping, res, respos);
8522 if (x==enc_EXCEPTION) {
8523 Py_DECREF(repunicode);
8524 return -1;
8525 }
8526 else if (x==enc_FAILED) {
8527 Py_DECREF(repunicode);
8528 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8529 return -1;
8530 }
8531 }
8532 *inpos = newpos;
8533 Py_DECREF(repunicode);
8534 }
8535 return 0;
8536 }
8537
8538 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8539 _PyUnicode_EncodeCharmap(PyObject *unicode,
8540 PyObject *mapping,
8541 const char *errors)
8542 {
8543 /* output object */
8544 PyObject *res = NULL;
8545 /* current input position */
8546 Py_ssize_t inpos = 0;
8547 Py_ssize_t size;
8548 /* current output position */
8549 Py_ssize_t respos = 0;
8550 PyObject *error_handler_obj = NULL;
8551 PyObject *exc = NULL;
8552 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8553 void *data;
8554 int kind;
8555
8556 if (PyUnicode_READY(unicode) == -1)
8557 return NULL;
8558 size = PyUnicode_GET_LENGTH(unicode);
8559 data = PyUnicode_DATA(unicode);
8560 kind = PyUnicode_KIND(unicode);
8561
8562 /* Default to Latin-1 */
8563 if (mapping == NULL)
8564 return unicode_encode_ucs1(unicode, errors, 256);
8565
8566 /* allocate enough for a simple encoding without
8567 replacements, if we need more, we'll resize */
8568 res = PyBytes_FromStringAndSize(NULL, size);
8569 if (res == NULL)
8570 goto onError;
8571 if (size == 0)
8572 return res;
8573
8574 while (inpos<size) {
8575 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8576 /* try to encode it */
8577 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8578 if (x==enc_EXCEPTION) /* error */
8579 goto onError;
8580 if (x==enc_FAILED) { /* unencodable character */
8581 if (charmap_encoding_error(unicode, &inpos, mapping,
8582 &exc,
8583 &error_handler, &error_handler_obj, errors,
8584 &res, &respos)) {
8585 goto onError;
8586 }
8587 }
8588 else
8589 /* done with this character => adjust input position */
8590 ++inpos;
8591 }
8592
8593 /* Resize if we allocated to much */
8594 if (respos<PyBytes_GET_SIZE(res))
8595 if (_PyBytes_Resize(&res, respos) < 0)
8596 goto onError;
8597
8598 Py_XDECREF(exc);
8599 Py_XDECREF(error_handler_obj);
8600 return res;
8601
8602 onError:
8603 Py_XDECREF(res);
8604 Py_XDECREF(exc);
8605 Py_XDECREF(error_handler_obj);
8606 return NULL;
8607 }
8608
8609 /* Deprecated */
8610 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8611 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8612 Py_ssize_t size,
8613 PyObject *mapping,
8614 const char *errors)
8615 {
8616 PyObject *result;
8617 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8618 if (unicode == NULL)
8619 return NULL;
8620 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8621 Py_DECREF(unicode);
8622 return result;
8623 }
8624
8625 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8626 PyUnicode_AsCharmapString(PyObject *unicode,
8627 PyObject *mapping)
8628 {
8629 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8630 PyErr_BadArgument();
8631 return NULL;
8632 }
8633 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8634 }
8635
8636 /* create or adjust a UnicodeTranslateError */
8637 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8638 make_translate_exception(PyObject **exceptionObject,
8639 PyObject *unicode,
8640 Py_ssize_t startpos, Py_ssize_t endpos,
8641 const char *reason)
8642 {
8643 if (*exceptionObject == NULL) {
8644 *exceptionObject = _PyUnicodeTranslateError_Create(
8645 unicode, startpos, endpos, reason);
8646 }
8647 else {
8648 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8649 goto onError;
8650 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8651 goto onError;
8652 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8653 goto onError;
8654 return;
8655 onError:
8656 Py_CLEAR(*exceptionObject);
8657 }
8658 }
8659
8660 /* error handling callback helper:
8661 build arguments, call the callback and check the arguments,
8662 put the result into newpos and return the replacement string, which
8663 has to be freed by the caller */
8664 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8665 unicode_translate_call_errorhandler(const char *errors,
8666 PyObject **errorHandler,
8667 const char *reason,
8668 PyObject *unicode, PyObject **exceptionObject,
8669 Py_ssize_t startpos, Py_ssize_t endpos,
8670 Py_ssize_t *newpos)
8671 {
8672 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8673
8674 Py_ssize_t i_newpos;
8675 PyObject *restuple;
8676 PyObject *resunicode;
8677
8678 if (*errorHandler == NULL) {
8679 *errorHandler = PyCodec_LookupError(errors);
8680 if (*errorHandler == NULL)
8681 return NULL;
8682 }
8683
8684 make_translate_exception(exceptionObject,
8685 unicode, startpos, endpos, reason);
8686 if (*exceptionObject == NULL)
8687 return NULL;
8688
8689 restuple = PyObject_CallFunctionObjArgs(
8690 *errorHandler, *exceptionObject, NULL);
8691 if (restuple == NULL)
8692 return NULL;
8693 if (!PyTuple_Check(restuple)) {
8694 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8695 Py_DECREF(restuple);
8696 return NULL;
8697 }
8698 if (!PyArg_ParseTuple(restuple, argparse,
8699 &resunicode, &i_newpos)) {
8700 Py_DECREF(restuple);
8701 return NULL;
8702 }
8703 if (i_newpos<0)
8704 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8705 else
8706 *newpos = i_newpos;
8707 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8708 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8709 Py_DECREF(restuple);
8710 return NULL;
8711 }
8712 Py_INCREF(resunicode);
8713 Py_DECREF(restuple);
8714 return resunicode;
8715 }
8716
8717 /* Lookup the character ch in the mapping and put the result in result,
8718 which must be decrefed by the caller.
8719 Return 0 on success, -1 on error */
8720 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8721 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8722 {
8723 PyObject *w = PyLong_FromLong((long)c);
8724 PyObject *x;
8725
8726 if (w == NULL)
8727 return -1;
8728 x = PyObject_GetItem(mapping, w);
8729 Py_DECREF(w);
8730 if (x == NULL) {
8731 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8732 /* No mapping found means: use 1:1 mapping. */
8733 PyErr_Clear();
8734 *result = NULL;
8735 return 0;
8736 } else
8737 return -1;
8738 }
8739 else if (x == Py_None) {
8740 *result = x;
8741 return 0;
8742 }
8743 else if (PyLong_Check(x)) {
8744 long value = PyLong_AS_LONG(x);
8745 if (value < 0 || value > MAX_UNICODE) {
8746 PyErr_Format(PyExc_ValueError,
8747 "character mapping must be in range(0x%x)",
8748 MAX_UNICODE+1);
8749 Py_DECREF(x);
8750 return -1;
8751 }
8752 *result = x;
8753 return 0;
8754 }
8755 else if (PyUnicode_Check(x)) {
8756 *result = x;
8757 return 0;
8758 }
8759 else {
8760 /* wrong return value */
8761 PyErr_SetString(PyExc_TypeError,
8762 "character mapping must return integer, None or str");
8763 Py_DECREF(x);
8764 return -1;
8765 }
8766 }
8767
8768 /* lookup the character, write the result into the writer.
8769 Return 1 if the result was written into the writer, return 0 if the mapping
8770 was undefined, raise an exception return -1 on error. */
8771 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8772 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8773 _PyUnicodeWriter *writer)
8774 {
8775 PyObject *item;
8776
8777 if (charmaptranslate_lookup(ch, mapping, &item))
8778 return -1;
8779
8780 if (item == NULL) {
8781 /* not found => default to 1:1 mapping */
8782 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8783 return -1;
8784 }
8785 return 1;
8786 }
8787
8788 if (item == Py_None) {
8789 Py_DECREF(item);
8790 return 0;
8791 }
8792
8793 if (PyLong_Check(item)) {
8794 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8795 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8796 used it */
8797 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8798 Py_DECREF(item);
8799 return -1;
8800 }
8801 Py_DECREF(item);
8802 return 1;
8803 }
8804
8805 if (!PyUnicode_Check(item)) {
8806 Py_DECREF(item);
8807 return -1;
8808 }
8809
8810 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8811 Py_DECREF(item);
8812 return -1;
8813 }
8814
8815 Py_DECREF(item);
8816 return 1;
8817 }
8818
8819 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8820 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8821 Py_UCS1 *translate)
8822 {
8823 PyObject *item = NULL;
8824 int ret = 0;
8825
8826 if (charmaptranslate_lookup(ch, mapping, &item)) {
8827 return -1;
8828 }
8829
8830 if (item == Py_None) {
8831 /* deletion */
8832 translate[ch] = 0xfe;
8833 }
8834 else if (item == NULL) {
8835 /* not found => default to 1:1 mapping */
8836 translate[ch] = ch;
8837 return 1;
8838 }
8839 else if (PyLong_Check(item)) {
8840 long replace = PyLong_AS_LONG(item);
8841 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8842 used it */
8843 if (127 < replace) {
8844 /* invalid character or character outside ASCII:
8845 skip the fast translate */
8846 goto exit;
8847 }
8848 translate[ch] = (Py_UCS1)replace;
8849 }
8850 else if (PyUnicode_Check(item)) {
8851 Py_UCS4 replace;
8852
8853 if (PyUnicode_READY(item) == -1) {
8854 Py_DECREF(item);
8855 return -1;
8856 }
8857 if (PyUnicode_GET_LENGTH(item) != 1)
8858 goto exit;
8859
8860 replace = PyUnicode_READ_CHAR(item, 0);
8861 if (replace > 127)
8862 goto exit;
8863 translate[ch] = (Py_UCS1)replace;
8864 }
8865 else {
8866 /* not None, NULL, long or unicode */
8867 goto exit;
8868 }
8869 ret = 1;
8870
8871 exit:
8872 Py_DECREF(item);
8873 return ret;
8874 }
8875
8876 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8877 was translated into writer, return 0 if the input string was partially
8878 translated into writer, raise an exception and return -1 on error. */
8879 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8880 unicode_fast_translate(PyObject *input, PyObject *mapping,
8881 _PyUnicodeWriter *writer, int ignore,
8882 Py_ssize_t *input_pos)
8883 {
8884 Py_UCS1 ascii_table[128], ch, ch2;
8885 Py_ssize_t len;
8886 Py_UCS1 *in, *end, *out;
8887 int res = 0;
8888
8889 len = PyUnicode_GET_LENGTH(input);
8890
8891 memset(ascii_table, 0xff, 128);
8892
8893 in = PyUnicode_1BYTE_DATA(input);
8894 end = in + len;
8895
8896 assert(PyUnicode_IS_ASCII(writer->buffer));
8897 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8898 out = PyUnicode_1BYTE_DATA(writer->buffer);
8899
8900 for (; in < end; in++) {
8901 ch = *in;
8902 ch2 = ascii_table[ch];
8903 if (ch2 == 0xff) {
8904 int translate = unicode_fast_translate_lookup(mapping, ch,
8905 ascii_table);
8906 if (translate < 0)
8907 return -1;
8908 if (translate == 0)
8909 goto exit;
8910 ch2 = ascii_table[ch];
8911 }
8912 if (ch2 == 0xfe) {
8913 if (ignore)
8914 continue;
8915 goto exit;
8916 }
8917 assert(ch2 < 128);
8918 *out = ch2;
8919 out++;
8920 }
8921 res = 1;
8922
8923 exit:
8924 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8925 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8926 return res;
8927 }
8928
8929 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8930 _PyUnicode_TranslateCharmap(PyObject *input,
8931 PyObject *mapping,
8932 const char *errors)
8933 {
8934 /* input object */
8935 char *data;
8936 Py_ssize_t size, i;
8937 int kind;
8938 /* output buffer */
8939 _PyUnicodeWriter writer;
8940 /* error handler */
8941 const char *reason = "character maps to <undefined>";
8942 PyObject *errorHandler = NULL;
8943 PyObject *exc = NULL;
8944 int ignore;
8945 int res;
8946
8947 if (mapping == NULL) {
8948 PyErr_BadArgument();
8949 return NULL;
8950 }
8951
8952 if (PyUnicode_READY(input) == -1)
8953 return NULL;
8954 data = (char*)PyUnicode_DATA(input);
8955 kind = PyUnicode_KIND(input);
8956 size = PyUnicode_GET_LENGTH(input);
8957
8958 if (size == 0)
8959 return PyUnicode_FromObject(input);
8960
8961 /* allocate enough for a simple 1:1 translation without
8962 replacements, if we need more, we'll resize */
8963 _PyUnicodeWriter_Init(&writer);
8964 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8965 goto onError;
8966
8967 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8968
8969 if (PyUnicode_READY(input) == -1)
8970 return NULL;
8971 if (PyUnicode_IS_ASCII(input)) {
8972 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8973 if (res < 0) {
8974 _PyUnicodeWriter_Dealloc(&writer);
8975 return NULL;
8976 }
8977 if (res == 1)
8978 return _PyUnicodeWriter_Finish(&writer);
8979 }
8980 else {
8981 i = 0;
8982 }
8983
8984 while (i<size) {
8985 /* try to encode it */
8986 int translate;
8987 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8988 Py_ssize_t newpos;
8989 /* startpos for collecting untranslatable chars */
8990 Py_ssize_t collstart;
8991 Py_ssize_t collend;
8992 Py_UCS4 ch;
8993
8994 ch = PyUnicode_READ(kind, data, i);
8995 translate = charmaptranslate_output(ch, mapping, &writer);
8996 if (translate < 0)
8997 goto onError;
8998
8999 if (translate != 0) {
9000 /* it worked => adjust input pointer */
9001 ++i;
9002 continue;
9003 }
9004
9005 /* untranslatable character */
9006 collstart = i;
9007 collend = i+1;
9008
9009 /* find all untranslatable characters */
9010 while (collend < size) {
9011 PyObject *x;
9012 ch = PyUnicode_READ(kind, data, collend);
9013 if (charmaptranslate_lookup(ch, mapping, &x))
9014 goto onError;
9015 Py_XDECREF(x);
9016 if (x != Py_None)
9017 break;
9018 ++collend;
9019 }
9020
9021 if (ignore) {
9022 i = collend;
9023 }
9024 else {
9025 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9026 reason, input, &exc,
9027 collstart, collend, &newpos);
9028 if (repunicode == NULL)
9029 goto onError;
9030 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9031 Py_DECREF(repunicode);
9032 goto onError;
9033 }
9034 Py_DECREF(repunicode);
9035 i = newpos;
9036 }
9037 }
9038 Py_XDECREF(exc);
9039 Py_XDECREF(errorHandler);
9040 return _PyUnicodeWriter_Finish(&writer);
9041
9042 onError:
9043 _PyUnicodeWriter_Dealloc(&writer);
9044 Py_XDECREF(exc);
9045 Py_XDECREF(errorHandler);
9046 return NULL;
9047 }
9048
9049 /* Deprecated. Use PyUnicode_Translate instead. */
9050 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9051 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9052 Py_ssize_t size,
9053 PyObject *mapping,
9054 const char *errors)
9055 {
9056 PyObject *result;
9057 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9058 if (!unicode)
9059 return NULL;
9060 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9061 Py_DECREF(unicode);
9062 return result;
9063 }
9064
9065 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9066 PyUnicode_Translate(PyObject *str,
9067 PyObject *mapping,
9068 const char *errors)
9069 {
9070 if (ensure_unicode(str) < 0)
9071 return NULL;
9072 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9073 }
9074
9075 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9076 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9077 {
9078 if (!PyUnicode_Check(unicode)) {
9079 PyErr_BadInternalCall();
9080 return NULL;
9081 }
9082 if (PyUnicode_READY(unicode) == -1)
9083 return NULL;
9084 if (PyUnicode_IS_ASCII(unicode)) {
9085 /* If the string is already ASCII, just return the same string */
9086 Py_INCREF(unicode);
9087 return unicode;
9088 }
9089
9090 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9091 PyObject *result = PyUnicode_New(len, 127);
9092 if (result == NULL) {
9093 return NULL;
9094 }
9095
9096 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9097 int kind = PyUnicode_KIND(unicode);
9098 const void *data = PyUnicode_DATA(unicode);
9099 Py_ssize_t i;
9100 for (i = 0; i < len; ++i) {
9101 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9102 if (ch < 127) {
9103 out[i] = ch;
9104 }
9105 else if (Py_UNICODE_ISSPACE(ch)) {
9106 out[i] = ' ';
9107 }
9108 else {
9109 int decimal = Py_UNICODE_TODECIMAL(ch);
9110 if (decimal < 0) {
9111 out[i] = '?';
9112 out[i+1] = '\0';
9113 _PyUnicode_LENGTH(result) = i + 1;
9114 break;
9115 }
9116 out[i] = '0' + decimal;
9117 }
9118 }
9119
9120 assert(_PyUnicode_CheckConsistency(result, 1));
9121 return result;
9122 }
9123
9124 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9125 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9126 Py_ssize_t length)
9127 {
9128 PyObject *decimal;
9129 Py_ssize_t i;
9130 Py_UCS4 maxchar;
9131 enum PyUnicode_Kind kind;
9132 void *data;
9133
9134 maxchar = 127;
9135 for (i = 0; i < length; i++) {
9136 Py_UCS4 ch = s[i];
9137 if (ch > 127) {
9138 int decimal = Py_UNICODE_TODECIMAL(ch);
9139 if (decimal >= 0)
9140 ch = '0' + decimal;
9141 maxchar = Py_MAX(maxchar, ch);
9142 }
9143 }
9144
9145 /* Copy to a new string */
9146 decimal = PyUnicode_New(length, maxchar);
9147 if (decimal == NULL)
9148 return decimal;
9149 kind = PyUnicode_KIND(decimal);
9150 data = PyUnicode_DATA(decimal);
9151 /* Iterate over code points */
9152 for (i = 0; i < length; i++) {
9153 Py_UCS4 ch = s[i];
9154 if (ch > 127) {
9155 int decimal = Py_UNICODE_TODECIMAL(ch);
9156 if (decimal >= 0)
9157 ch = '0' + decimal;
9158 }
9159 PyUnicode_WRITE(kind, data, i, ch);
9160 }
9161 return unicode_result(decimal);
9162 }
9163 /* --- Decimal Encoder ---------------------------------------------------- */
9164
9165 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9166 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9167 Py_ssize_t length,
9168 char *output,
9169 const char *errors)
9170 {
9171 PyObject *unicode;
9172 Py_ssize_t i;
9173 enum PyUnicode_Kind kind;
9174 void *data;
9175
9176 if (output == NULL) {
9177 PyErr_BadArgument();
9178 return -1;
9179 }
9180
9181 unicode = PyUnicode_FromWideChar(s, length);
9182 if (unicode == NULL)
9183 return -1;
9184
9185 kind = PyUnicode_KIND(unicode);
9186 data = PyUnicode_DATA(unicode);
9187
9188 for (i=0; i < length; ) {
9189 PyObject *exc;
9190 Py_UCS4 ch;
9191 int decimal;
9192 Py_ssize_t startpos;
9193
9194 ch = PyUnicode_READ(kind, data, i);
9195
9196 if (Py_UNICODE_ISSPACE(ch)) {
9197 *output++ = ' ';
9198 i++;
9199 continue;
9200 }
9201 decimal = Py_UNICODE_TODECIMAL(ch);
9202 if (decimal >= 0) {
9203 *output++ = '0' + decimal;
9204 i++;
9205 continue;
9206 }
9207 if (0 < ch && ch < 256) {
9208 *output++ = (char)ch;
9209 i++;
9210 continue;
9211 }
9212
9213 startpos = i;
9214 exc = NULL;
9215 raise_encode_exception(&exc, "decimal", unicode,
9216 startpos, startpos+1,
9217 "invalid decimal Unicode string");
9218 Py_XDECREF(exc);
9219 Py_DECREF(unicode);
9220 return -1;
9221 }
9222 /* 0-terminate the output string */
9223 *output++ = '\0';
9224 Py_DECREF(unicode);
9225 return 0;
9226 }
9227
9228 /* --- Helpers ------------------------------------------------------------ */
9229
/* Helper macro to fix up start/end slice values in place.

   - end is clipped to len; a negative end is taken relative to len and
     then clipped to 0.
   - a negative start is taken relative to len and then clipped to 0.
     A start larger than len is left alone; callers compare start and end
     afterwards.

   start and end must be modifiable lvalues.  len is evaluated several
   times, so it must not have side effects.  The body is wrapped in
   do { } while (0) so that the macro behaves like a single statement,
   including inside an unbraced if/else; it must be followed by ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9244
9245 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9246 any_find_slice(PyObject* s1, PyObject* s2,
9247 Py_ssize_t start,
9248 Py_ssize_t end,
9249 int direction)
9250 {
9251 int kind1, kind2;
9252 void *buf1, *buf2;
9253 Py_ssize_t len1, len2, result;
9254
9255 kind1 = PyUnicode_KIND(s1);
9256 kind2 = PyUnicode_KIND(s2);
9257 if (kind1 < kind2)
9258 return -1;
9259
9260 len1 = PyUnicode_GET_LENGTH(s1);
9261 len2 = PyUnicode_GET_LENGTH(s2);
9262 ADJUST_INDICES(start, end, len1);
9263 if (end - start < len2)
9264 return -1;
9265
9266 buf1 = PyUnicode_DATA(s1);
9267 buf2 = PyUnicode_DATA(s2);
9268 if (len2 == 1) {
9269 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9270 result = findchar((const char *)buf1 + kind1*start,
9271 kind1, end - start, ch, direction);
9272 if (result == -1)
9273 return -1;
9274 else
9275 return start + result;
9276 }
9277
9278 if (kind2 != kind1) {
9279 buf2 = _PyUnicode_AsKind(s2, kind1);
9280 if (!buf2)
9281 return -2;
9282 }
9283
9284 if (direction > 0) {
9285 switch (kind1) {
9286 case PyUnicode_1BYTE_KIND:
9287 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9288 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9289 else
9290 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 case PyUnicode_2BYTE_KIND:
9293 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9294 break;
9295 case PyUnicode_4BYTE_KIND:
9296 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9297 break;
9298 default:
9299 Py_UNREACHABLE();
9300 }
9301 }
9302 else {
9303 switch (kind1) {
9304 case PyUnicode_1BYTE_KIND:
9305 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9306 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9307 else
9308 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 case PyUnicode_2BYTE_KIND:
9311 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9312 break;
9313 case PyUnicode_4BYTE_KIND:
9314 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9315 break;
9316 default:
9317 Py_UNREACHABLE();
9318 }
9319 }
9320
9321 if (kind2 != kind1)
9322 PyMem_Free(buf2);
9323
9324 return result;
9325 }
9326
9327 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9328 #include "stringlib/localeutil.h"
9329
9330 /**
9331 * InsertThousandsGrouping:
9332 * @writer: Unicode writer.
9333 * @n_buffer: Number of characters in @buffer.
9334 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9335 * @d_pos: Start of digits string.
9336 * @n_digits: The number of digits in the string, in which we want
9337 * to put the grouping chars.
9338 * @min_width: The minimum width of the digits in the output string.
9339 * Output will be zero-padded on the left to fill.
9340 * @grouping: see definition in localeconv().
9341 * @thousands_sep: see definition in localeconv().
9342 *
9343 * There are 2 modes: counting and filling. If @writer is NULL,
9344 * we are in counting mode, else filling mode.
9345 * If counting, the required buffer size is returned.
9346 * If filling, we know the buffer will be large enough, so we don't
9347 * need to pass in the buffer size.
9348 * Inserts thousand grouping characters (as defined by grouping and
9349 * thousands_sep) into @writer.
9350 *
9351 * Return value: -1 on error, number of characters otherwise.
9352 **/
9353 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9354 _PyUnicode_InsertThousandsGrouping(
9355 _PyUnicodeWriter *writer,
9356 Py_ssize_t n_buffer,
9357 PyObject *digits,
9358 Py_ssize_t d_pos,
9359 Py_ssize_t n_digits,
9360 Py_ssize_t min_width,
9361 const char *grouping,
9362 PyObject *thousands_sep,
9363 Py_UCS4 *maxchar)
9364 {
9365 min_width = Py_MAX(0, min_width);
9366 if (writer) {
9367 assert(digits != NULL);
9368 assert(maxchar == NULL);
9369 }
9370 else {
9371 assert(digits == NULL);
9372 assert(maxchar != NULL);
9373 }
9374 assert(0 <= d_pos);
9375 assert(0 <= n_digits);
9376 assert(grouping != NULL);
9377
9378 if (digits != NULL) {
9379 if (PyUnicode_READY(digits) == -1) {
9380 return -1;
9381 }
9382 }
9383 if (PyUnicode_READY(thousands_sep) == -1) {
9384 return -1;
9385 }
9386
9387 Py_ssize_t count = 0;
9388 Py_ssize_t n_zeros;
9389 int loop_broken = 0;
9390 int use_separator = 0; /* First time through, don't append the
9391 separator. They only go between
9392 groups. */
9393 Py_ssize_t buffer_pos;
9394 Py_ssize_t digits_pos;
9395 Py_ssize_t len;
9396 Py_ssize_t n_chars;
9397 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9398 be looked at */
9399 /* A generator that returns all of the grouping widths, until it
9400 returns 0. */
9401 GroupGenerator groupgen;
9402 GroupGenerator_init(&groupgen, grouping);
9403 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9404
9405 /* if digits are not grouped, thousands separator
9406 should be an empty string */
9407 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9408
9409 digits_pos = d_pos + n_digits;
9410 if (writer) {
9411 buffer_pos = writer->pos + n_buffer;
9412 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9413 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9414 }
9415 else {
9416 buffer_pos = n_buffer;
9417 }
9418
9419 if (!writer) {
9420 *maxchar = 127;
9421 }
9422
9423 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9424 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9425 n_zeros = Py_MAX(0, len - remaining);
9426 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9427
9428 /* Use n_zero zero's and n_chars chars */
9429
9430 /* Count only, don't do anything. */
9431 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9432
9433 /* Copy into the writer. */
9434 InsertThousandsGrouping_fill(writer, &buffer_pos,
9435 digits, &digits_pos,
9436 n_chars, n_zeros,
9437 use_separator ? thousands_sep : NULL,
9438 thousands_sep_len, maxchar);
9439
9440 /* Use a separator next time. */
9441 use_separator = 1;
9442
9443 remaining -= n_chars;
9444 min_width -= len;
9445
9446 if (remaining <= 0 && min_width <= 0) {
9447 loop_broken = 1;
9448 break;
9449 }
9450 min_width -= thousands_sep_len;
9451 }
9452 if (!loop_broken) {
9453 /* We left the loop without using a break statement. */
9454
9455 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9456 n_zeros = Py_MAX(0, len - remaining);
9457 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9458
9459 /* Use n_zero zero's and n_chars chars */
9460 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9461
9462 /* Copy into the writer. */
9463 InsertThousandsGrouping_fill(writer, &buffer_pos,
9464 digits, &digits_pos,
9465 n_chars, n_zeros,
9466 use_separator ? thousands_sep : NULL,
9467 thousands_sep_len, maxchar);
9468 }
9469 return count;
9470 }
9471
9472
9473 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9474 PyUnicode_Count(PyObject *str,
9475 PyObject *substr,
9476 Py_ssize_t start,
9477 Py_ssize_t end)
9478 {
9479 Py_ssize_t result;
9480 int kind1, kind2;
9481 void *buf1 = NULL, *buf2 = NULL;
9482 Py_ssize_t len1, len2;
9483
9484 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9485 return -1;
9486
9487 kind1 = PyUnicode_KIND(str);
9488 kind2 = PyUnicode_KIND(substr);
9489 if (kind1 < kind2)
9490 return 0;
9491
9492 len1 = PyUnicode_GET_LENGTH(str);
9493 len2 = PyUnicode_GET_LENGTH(substr);
9494 ADJUST_INDICES(start, end, len1);
9495 if (end - start < len2)
9496 return 0;
9497
9498 buf1 = PyUnicode_DATA(str);
9499 buf2 = PyUnicode_DATA(substr);
9500 if (kind2 != kind1) {
9501 buf2 = _PyUnicode_AsKind(substr, kind1);
9502 if (!buf2)
9503 goto onError;
9504 }
9505
9506 switch (kind1) {
9507 case PyUnicode_1BYTE_KIND:
9508 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9509 result = asciilib_count(
9510 ((Py_UCS1*)buf1) + start, end - start,
9511 buf2, len2, PY_SSIZE_T_MAX
9512 );
9513 else
9514 result = ucs1lib_count(
9515 ((Py_UCS1*)buf1) + start, end - start,
9516 buf2, len2, PY_SSIZE_T_MAX
9517 );
9518 break;
9519 case PyUnicode_2BYTE_KIND:
9520 result = ucs2lib_count(
9521 ((Py_UCS2*)buf1) + start, end - start,
9522 buf2, len2, PY_SSIZE_T_MAX
9523 );
9524 break;
9525 case PyUnicode_4BYTE_KIND:
9526 result = ucs4lib_count(
9527 ((Py_UCS4*)buf1) + start, end - start,
9528 buf2, len2, PY_SSIZE_T_MAX
9529 );
9530 break;
9531 default:
9532 Py_UNREACHABLE();
9533 }
9534
9535 if (kind2 != kind1)
9536 PyMem_Free(buf2);
9537
9538 return result;
9539 onError:
9540 if (kind2 != kind1 && buf2)
9541 PyMem_Free(buf2);
9542 return -1;
9543 }
9544
9545 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9546 PyUnicode_Find(PyObject *str,
9547 PyObject *substr,
9548 Py_ssize_t start,
9549 Py_ssize_t end,
9550 int direction)
9551 {
9552 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9553 return -2;
9554
9555 return any_find_slice(str, substr, start, end, direction);
9556 }
9557
9558 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9559 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9560 Py_ssize_t start, Py_ssize_t end,
9561 int direction)
9562 {
9563 int kind;
9564 Py_ssize_t len, result;
9565 if (PyUnicode_READY(str) == -1)
9566 return -2;
9567 len = PyUnicode_GET_LENGTH(str);
9568 ADJUST_INDICES(start, end, len);
9569 if (end - start < 1)
9570 return -1;
9571 kind = PyUnicode_KIND(str);
9572 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9573 kind, end-start, ch, direction);
9574 if (result == -1)
9575 return -1;
9576 else
9577 return start + result;
9578 }
9579
9580 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9581 tailmatch(PyObject *self,
9582 PyObject *substring,
9583 Py_ssize_t start,
9584 Py_ssize_t end,
9585 int direction)
9586 {
9587 int kind_self;
9588 int kind_sub;
9589 void *data_self;
9590 void *data_sub;
9591 Py_ssize_t offset;
9592 Py_ssize_t i;
9593 Py_ssize_t end_sub;
9594
9595 if (PyUnicode_READY(self) == -1 ||
9596 PyUnicode_READY(substring) == -1)
9597 return -1;
9598
9599 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9600 end -= PyUnicode_GET_LENGTH(substring);
9601 if (end < start)
9602 return 0;
9603
9604 if (PyUnicode_GET_LENGTH(substring) == 0)
9605 return 1;
9606
9607 kind_self = PyUnicode_KIND(self);
9608 data_self = PyUnicode_DATA(self);
9609 kind_sub = PyUnicode_KIND(substring);
9610 data_sub = PyUnicode_DATA(substring);
9611 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9612
9613 if (direction > 0)
9614 offset = end;
9615 else
9616 offset = start;
9617
9618 if (PyUnicode_READ(kind_self, data_self, offset) ==
9619 PyUnicode_READ(kind_sub, data_sub, 0) &&
9620 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9621 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9622 /* If both are of the same kind, memcmp is sufficient */
9623 if (kind_self == kind_sub) {
9624 return ! memcmp((char *)data_self +
9625 (offset * PyUnicode_KIND(substring)),
9626 data_sub,
9627 PyUnicode_GET_LENGTH(substring) *
9628 PyUnicode_KIND(substring));
9629 }
9630 /* otherwise we have to compare each character by first accessing it */
9631 else {
9632 /* We do not need to compare 0 and len(substring)-1 because
9633 the if statement above ensured already that they are equal
9634 when we end up here. */
9635 for (i = 1; i < end_sub; ++i) {
9636 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9637 PyUnicode_READ(kind_sub, data_sub, i))
9638 return 0;
9639 }
9640 return 1;
9641 }
9642 }
9643
9644 return 0;
9645 }
9646
9647 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9648 PyUnicode_Tailmatch(PyObject *str,
9649 PyObject *substr,
9650 Py_ssize_t start,
9651 Py_ssize_t end,
9652 int direction)
9653 {
9654 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9655 return -1;
9656
9657 return tailmatch(str, substr, start, end, direction);
9658 }
9659
9660 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9661 ascii_upper_or_lower(PyObject *self, int lower)
9662 {
9663 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9664 char *resdata, *data = PyUnicode_DATA(self);
9665 PyObject *res;
9666
9667 res = PyUnicode_New(len, 127);
9668 if (res == NULL)
9669 return NULL;
9670 resdata = PyUnicode_DATA(res);
9671 if (lower)
9672 _Py_bytes_lower(resdata, data, len);
9673 else
9674 _Py_bytes_upper(resdata, data, len);
9675 return res;
9676 }
9677
9678 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9679 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9680 {
9681 Py_ssize_t j;
9682 int final_sigma;
9683 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9684 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9685
9686 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9687
9688 where ! is a negation and \p{xxx} is a character with property xxx.
9689 */
9690 for (j = i - 1; j >= 0; j--) {
9691 c = PyUnicode_READ(kind, data, j);
9692 if (!_PyUnicode_IsCaseIgnorable(c))
9693 break;
9694 }
9695 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9696 if (final_sigma) {
9697 for (j = i + 1; j < length; j++) {
9698 c = PyUnicode_READ(kind, data, j);
9699 if (!_PyUnicode_IsCaseIgnorable(c))
9700 break;
9701 }
9702 final_sigma = j == length || !_PyUnicode_IsCased(c);
9703 }
9704 return (final_sigma) ? 0x3C2 : 0x3C3;
9705 }
9706
9707 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9708 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9709 Py_UCS4 c, Py_UCS4 *mapped)
9710 {
9711 /* Obscure special case. */
9712 if (c == 0x3A3) {
9713 mapped[0] = handle_capital_sigma(kind, data, length, i);
9714 return 1;
9715 }
9716 return _PyUnicode_ToLowerFull(c, mapped);
9717 }
9718
9719 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9720 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9721 {
9722 Py_ssize_t i, k = 0;
9723 int n_res, j;
9724 Py_UCS4 c, mapped[3];
9725
9726 c = PyUnicode_READ(kind, data, 0);
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 for (j = 0; j < n_res; j++) {
9729 *maxchar = Py_MAX(*maxchar, mapped[j]);
9730 res[k++] = mapped[j];
9731 }
9732 for (i = 1; i < length; i++) {
9733 c = PyUnicode_READ(kind, data, i);
9734 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9735 for (j = 0; j < n_res; j++) {
9736 *maxchar = Py_MAX(*maxchar, mapped[j]);
9737 res[k++] = mapped[j];
9738 }
9739 }
9740 return k;
9741 }
9742
9743 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9744 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9745 Py_ssize_t i, k = 0;
9746
9747 for (i = 0; i < length; i++) {
9748 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9749 int n_res, j;
9750 if (Py_UNICODE_ISUPPER(c)) {
9751 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9752 }
9753 else if (Py_UNICODE_ISLOWER(c)) {
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 }
9756 else {
9757 n_res = 1;
9758 mapped[0] = c;
9759 }
9760 for (j = 0; j < n_res; j++) {
9761 *maxchar = Py_MAX(*maxchar, mapped[j]);
9762 res[k++] = mapped[j];
9763 }
9764 }
9765 return k;
9766 }
9767
9768 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9769 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9770 Py_UCS4 *maxchar, int lower)
9771 {
9772 Py_ssize_t i, k = 0;
9773
9774 for (i = 0; i < length; i++) {
9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776 int n_res, j;
9777 if (lower)
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 else
9780 n_res = _PyUnicode_ToUpperFull(c, mapped);
9781 for (j = 0; j < n_res; j++) {
9782 *maxchar = Py_MAX(*maxchar, mapped[j]);
9783 res[k++] = mapped[j];
9784 }
9785 }
9786 return k;
9787 }
9788
9789 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9790 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791 {
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9793 }
9794
9795 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9796 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797 {
9798 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9799 }
9800
9801 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9802 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9803 {
9804 Py_ssize_t i, k = 0;
9805
9806 for (i = 0; i < length; i++) {
9807 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808 Py_UCS4 mapped[3];
9809 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9810 for (j = 0; j < n_res; j++) {
9811 *maxchar = Py_MAX(*maxchar, mapped[j]);
9812 res[k++] = mapped[j];
9813 }
9814 }
9815 return k;
9816 }
9817
9818 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9819 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820 {
9821 Py_ssize_t i, k = 0;
9822 int previous_is_cased;
9823
9824 previous_is_cased = 0;
9825 for (i = 0; i < length; i++) {
9826 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int n_res, j;
9829
9830 if (previous_is_cased)
9831 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9832 else
9833 n_res = _PyUnicode_ToTitleFull(c, mapped);
9834
9835 for (j = 0; j < n_res; j++) {
9836 *maxchar = Py_MAX(*maxchar, mapped[j]);
9837 res[k++] = mapped[j];
9838 }
9839
9840 previous_is_cased = _PyUnicode_IsCased(c);
9841 }
9842 return k;
9843 }
9844
9845 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9846 case_operation(PyObject *self,
9847 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9848 {
9849 PyObject *res = NULL;
9850 Py_ssize_t length, newlength = 0;
9851 int kind, outkind;
9852 void *data, *outdata;
9853 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9854
9855 assert(PyUnicode_IS_READY(self));
9856
9857 kind = PyUnicode_KIND(self);
9858 data = PyUnicode_DATA(self);
9859 length = PyUnicode_GET_LENGTH(self);
9860 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9861 PyErr_SetString(PyExc_OverflowError, "string is too long");
9862 return NULL;
9863 }
9864 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9865 if (tmp == NULL)
9866 return PyErr_NoMemory();
9867 newlength = perform(kind, data, length, tmp, &maxchar);
9868 res = PyUnicode_New(newlength, maxchar);
9869 if (res == NULL)
9870 goto leave;
9871 tmpend = tmp + newlength;
9872 outdata = PyUnicode_DATA(res);
9873 outkind = PyUnicode_KIND(res);
9874 switch (outkind) {
9875 case PyUnicode_1BYTE_KIND:
9876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9877 break;
9878 case PyUnicode_2BYTE_KIND:
9879 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9880 break;
9881 case PyUnicode_4BYTE_KIND:
9882 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9883 break;
9884 default:
9885 Py_UNREACHABLE();
9886 }
9887 leave:
9888 PyMem_FREE(tmp);
9889 return res;
9890 }
9891
9892 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9893 PyUnicode_Join(PyObject *separator, PyObject *seq)
9894 {
9895 PyObject *res;
9896 PyObject *fseq;
9897 Py_ssize_t seqlen;
9898 PyObject **items;
9899
9900 fseq = PySequence_Fast(seq, "can only join an iterable");
9901 if (fseq == NULL) {
9902 return NULL;
9903 }
9904
9905 /* NOTE: the following code can't call back into Python code,
9906 * so we are sure that fseq won't be mutated.
9907 */
9908
9909 items = PySequence_Fast_ITEMS(fseq);
9910 seqlen = PySequence_Fast_GET_SIZE(fseq);
9911 res = _PyUnicode_JoinArray(separator, items, seqlen);
9912 Py_DECREF(fseq);
9913 return res;
9914 }
9915
9916 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9917 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9918 {
9919 PyObject *res = NULL; /* the result */
9920 PyObject *sep = NULL;
9921 Py_ssize_t seplen;
9922 PyObject *item;
9923 Py_ssize_t sz, i, res_offset;
9924 Py_UCS4 maxchar;
9925 Py_UCS4 item_maxchar;
9926 int use_memcpy;
9927 unsigned char *res_data = NULL, *sep_data = NULL;
9928 PyObject *last_obj;
9929 unsigned int kind = 0;
9930
9931 /* If empty sequence, return u"". */
9932 if (seqlen == 0) {
9933 _Py_RETURN_UNICODE_EMPTY();
9934 }
9935
9936 /* If singleton sequence with an exact Unicode, return that. */
9937 last_obj = NULL;
9938 if (seqlen == 1) {
9939 if (PyUnicode_CheckExact(items[0])) {
9940 res = items[0];
9941 Py_INCREF(res);
9942 return res;
9943 }
9944 seplen = 0;
9945 maxchar = 0;
9946 }
9947 else {
9948 /* Set up sep and seplen */
9949 if (separator == NULL) {
9950 /* fall back to a blank space separator */
9951 sep = PyUnicode_FromOrdinal(' ');
9952 if (!sep)
9953 goto onError;
9954 seplen = 1;
9955 maxchar = 32;
9956 }
9957 else {
9958 if (!PyUnicode_Check(separator)) {
9959 PyErr_Format(PyExc_TypeError,
9960 "separator: expected str instance,"
9961 " %.80s found",
9962 Py_TYPE(separator)->tp_name);
9963 goto onError;
9964 }
9965 if (PyUnicode_READY(separator))
9966 goto onError;
9967 sep = separator;
9968 seplen = PyUnicode_GET_LENGTH(separator);
9969 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9970 /* inc refcount to keep this code path symmetric with the
9971 above case of a blank separator */
9972 Py_INCREF(sep);
9973 }
9974 last_obj = sep;
9975 }
9976
9977 /* There are at least two things to join, or else we have a subclass
9978 * of str in the sequence.
9979 * Do a pre-pass to figure out the total amount of space we'll
9980 * need (sz), and see whether all argument are strings.
9981 */
9982 sz = 0;
9983 #ifdef Py_DEBUG
9984 use_memcpy = 0;
9985 #else
9986 use_memcpy = 1;
9987 #endif
9988 for (i = 0; i < seqlen; i++) {
9989 size_t add_sz;
9990 item = items[i];
9991 if (!PyUnicode_Check(item)) {
9992 PyErr_Format(PyExc_TypeError,
9993 "sequence item %zd: expected str instance,"
9994 " %.80s found",
9995 i, Py_TYPE(item)->tp_name);
9996 goto onError;
9997 }
9998 if (PyUnicode_READY(item) == -1)
9999 goto onError;
10000 add_sz = PyUnicode_GET_LENGTH(item);
10001 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10002 maxchar = Py_MAX(maxchar, item_maxchar);
10003 if (i != 0) {
10004 add_sz += seplen;
10005 }
10006 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10007 PyErr_SetString(PyExc_OverflowError,
10008 "join() result is too long for a Python string");
10009 goto onError;
10010 }
10011 sz += add_sz;
10012 if (use_memcpy && last_obj != NULL) {
10013 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10014 use_memcpy = 0;
10015 }
10016 last_obj = item;
10017 }
10018
10019 res = PyUnicode_New(sz, maxchar);
10020 if (res == NULL)
10021 goto onError;
10022
10023 /* Catenate everything. */
10024 #ifdef Py_DEBUG
10025 use_memcpy = 0;
10026 #else
10027 if (use_memcpy) {
10028 res_data = PyUnicode_1BYTE_DATA(res);
10029 kind = PyUnicode_KIND(res);
10030 if (seplen != 0)
10031 sep_data = PyUnicode_1BYTE_DATA(sep);
10032 }
10033 #endif
10034 if (use_memcpy) {
10035 for (i = 0; i < seqlen; ++i) {
10036 Py_ssize_t itemlen;
10037 item = items[i];
10038
10039 /* Copy item, and maybe the separator. */
10040 if (i && seplen != 0) {
10041 memcpy(res_data,
10042 sep_data,
10043 kind * seplen);
10044 res_data += kind * seplen;
10045 }
10046
10047 itemlen = PyUnicode_GET_LENGTH(item);
10048 if (itemlen != 0) {
10049 memcpy(res_data,
10050 PyUnicode_DATA(item),
10051 kind * itemlen);
10052 res_data += kind * itemlen;
10053 }
10054 }
10055 assert(res_data == PyUnicode_1BYTE_DATA(res)
10056 + kind * PyUnicode_GET_LENGTH(res));
10057 }
10058 else {
10059 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10060 Py_ssize_t itemlen;
10061 item = items[i];
10062
10063 /* Copy item, and maybe the separator. */
10064 if (i && seplen != 0) {
10065 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10066 res_offset += seplen;
10067 }
10068
10069 itemlen = PyUnicode_GET_LENGTH(item);
10070 if (itemlen != 0) {
10071 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10072 res_offset += itemlen;
10073 }
10074 }
10075 assert(res_offset == PyUnicode_GET_LENGTH(res));
10076 }
10077
10078 Py_XDECREF(sep);
10079 assert(_PyUnicode_CheckConsistency(res, 1));
10080 return res;
10081
10082 onError:
10083 Py_XDECREF(sep);
10084 Py_XDECREF(res);
10085 return NULL;
10086 }
10087
10088 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10089 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10090 Py_UCS4 fill_char)
10091 {
10092 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10093 void *data = PyUnicode_DATA(unicode);
10094 assert(PyUnicode_IS_READY(unicode));
10095 assert(unicode_modifiable(unicode));
10096 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10097 assert(start >= 0);
10098 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10099 FILL(kind, data, fill_char, start, length);
10100 }
10101
10102 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10103 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10104 Py_UCS4 fill_char)
10105 {
10106 Py_ssize_t maxlen;
10107
10108 if (!PyUnicode_Check(unicode)) {
10109 PyErr_BadInternalCall();
10110 return -1;
10111 }
10112 if (PyUnicode_READY(unicode) == -1)
10113 return -1;
10114 if (unicode_check_modifiable(unicode))
10115 return -1;
10116
10117 if (start < 0) {
10118 PyErr_SetString(PyExc_IndexError, "string index out of range");
10119 return -1;
10120 }
10121 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10122 PyErr_SetString(PyExc_ValueError,
10123 "fill character is bigger than "
10124 "the string maximum character");
10125 return -1;
10126 }
10127
10128 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10129 length = Py_MIN(maxlen, length);
10130 if (length <= 0)
10131 return 0;
10132
10133 _PyUnicode_FastFill(unicode, start, length, fill_char);
10134 return length;
10135 }
10136
10137 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10138 pad(PyObject *self,
10139 Py_ssize_t left,
10140 Py_ssize_t right,
10141 Py_UCS4 fill)
10142 {
10143 PyObject *u;
10144 Py_UCS4 maxchar;
10145 int kind;
10146 void *data;
10147
10148 if (left < 0)
10149 left = 0;
10150 if (right < 0)
10151 right = 0;
10152
10153 if (left == 0 && right == 0)
10154 return unicode_result_unchanged(self);
10155
10156 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10157 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10158 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10159 return NULL;
10160 }
10161 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10162 maxchar = Py_MAX(maxchar, fill);
10163 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10164 if (!u)
10165 return NULL;
10166
10167 kind = PyUnicode_KIND(u);
10168 data = PyUnicode_DATA(u);
10169 if (left)
10170 FILL(kind, data, fill, 0, left);
10171 if (right)
10172 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10173 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10174 assert(_PyUnicode_CheckConsistency(u, 1));
10175 return u;
10176 }
10177
10178 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10179 PyUnicode_Splitlines(PyObject *string, int keepends)
10180 {
10181 PyObject *list;
10182
10183 if (ensure_unicode(string) < 0)
10184 return NULL;
10185
10186 switch (PyUnicode_KIND(string)) {
10187 case PyUnicode_1BYTE_KIND:
10188 if (PyUnicode_IS_ASCII(string))
10189 list = asciilib_splitlines(
10190 string, PyUnicode_1BYTE_DATA(string),
10191 PyUnicode_GET_LENGTH(string), keepends);
10192 else
10193 list = ucs1lib_splitlines(
10194 string, PyUnicode_1BYTE_DATA(string),
10195 PyUnicode_GET_LENGTH(string), keepends);
10196 break;
10197 case PyUnicode_2BYTE_KIND:
10198 list = ucs2lib_splitlines(
10199 string, PyUnicode_2BYTE_DATA(string),
10200 PyUnicode_GET_LENGTH(string), keepends);
10201 break;
10202 case PyUnicode_4BYTE_KIND:
10203 list = ucs4lib_splitlines(
10204 string, PyUnicode_4BYTE_DATA(string),
10205 PyUnicode_GET_LENGTH(string), keepends);
10206 break;
10207 default:
10208 Py_UNREACHABLE();
10209 }
10210 return list;
10211 }
10212
10213 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10214 split(PyObject *self,
10215 PyObject *substring,
10216 Py_ssize_t maxcount)
10217 {
10218 int kind1, kind2;
10219 void *buf1, *buf2;
10220 Py_ssize_t len1, len2;
10221 PyObject* out;
10222
10223 if (maxcount < 0)
10224 maxcount = PY_SSIZE_T_MAX;
10225
10226 if (PyUnicode_READY(self) == -1)
10227 return NULL;
10228
10229 if (substring == NULL)
10230 switch (PyUnicode_KIND(self)) {
10231 case PyUnicode_1BYTE_KIND:
10232 if (PyUnicode_IS_ASCII(self))
10233 return asciilib_split_whitespace(
10234 self, PyUnicode_1BYTE_DATA(self),
10235 PyUnicode_GET_LENGTH(self), maxcount
10236 );
10237 else
10238 return ucs1lib_split_whitespace(
10239 self, PyUnicode_1BYTE_DATA(self),
10240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
10242 case PyUnicode_2BYTE_KIND:
10243 return ucs2lib_split_whitespace(
10244 self, PyUnicode_2BYTE_DATA(self),
10245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
10247 case PyUnicode_4BYTE_KIND:
10248 return ucs4lib_split_whitespace(
10249 self, PyUnicode_4BYTE_DATA(self),
10250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 default:
10253 Py_UNREACHABLE();
10254 }
10255
10256 if (PyUnicode_READY(substring) == -1)
10257 return NULL;
10258
10259 kind1 = PyUnicode_KIND(self);
10260 kind2 = PyUnicode_KIND(substring);
10261 len1 = PyUnicode_GET_LENGTH(self);
10262 len2 = PyUnicode_GET_LENGTH(substring);
10263 if (kind1 < kind2 || len1 < len2) {
10264 out = PyList_New(1);
10265 if (out == NULL)
10266 return NULL;
10267 Py_INCREF(self);
10268 PyList_SET_ITEM(out, 0, self);
10269 return out;
10270 }
10271 buf1 = PyUnicode_DATA(self);
10272 buf2 = PyUnicode_DATA(substring);
10273 if (kind2 != kind1) {
10274 buf2 = _PyUnicode_AsKind(substring, kind1);
10275 if (!buf2)
10276 return NULL;
10277 }
10278
10279 switch (kind1) {
10280 case PyUnicode_1BYTE_KIND:
10281 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10282 out = asciilib_split(
10283 self, buf1, len1, buf2, len2, maxcount);
10284 else
10285 out = ucs1lib_split(
10286 self, buf1, len1, buf2, len2, maxcount);
10287 break;
10288 case PyUnicode_2BYTE_KIND:
10289 out = ucs2lib_split(
10290 self, buf1, len1, buf2, len2, maxcount);
10291 break;
10292 case PyUnicode_4BYTE_KIND:
10293 out = ucs4lib_split(
10294 self, buf1, len1, buf2, len2, maxcount);
10295 break;
10296 default:
10297 out = NULL;
10298 }
10299 if (kind2 != kind1)
10300 PyMem_Free(buf2);
10301 return out;
10302 }
10303
10304 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10305 rsplit(PyObject *self,
10306 PyObject *substring,
10307 Py_ssize_t maxcount)
10308 {
10309 int kind1, kind2;
10310 void *buf1, *buf2;
10311 Py_ssize_t len1, len2;
10312 PyObject* out;
10313
10314 if (maxcount < 0)
10315 maxcount = PY_SSIZE_T_MAX;
10316
10317 if (PyUnicode_READY(self) == -1)
10318 return NULL;
10319
10320 if (substring == NULL)
10321 switch (PyUnicode_KIND(self)) {
10322 case PyUnicode_1BYTE_KIND:
10323 if (PyUnicode_IS_ASCII(self))
10324 return asciilib_rsplit_whitespace(
10325 self, PyUnicode_1BYTE_DATA(self),
10326 PyUnicode_GET_LENGTH(self), maxcount
10327 );
10328 else
10329 return ucs1lib_rsplit_whitespace(
10330 self, PyUnicode_1BYTE_DATA(self),
10331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
10333 case PyUnicode_2BYTE_KIND:
10334 return ucs2lib_rsplit_whitespace(
10335 self, PyUnicode_2BYTE_DATA(self),
10336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
10338 case PyUnicode_4BYTE_KIND:
10339 return ucs4lib_rsplit_whitespace(
10340 self, PyUnicode_4BYTE_DATA(self),
10341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 default:
10344 Py_UNREACHABLE();
10345 }
10346
10347 if (PyUnicode_READY(substring) == -1)
10348 return NULL;
10349
10350 kind1 = PyUnicode_KIND(self);
10351 kind2 = PyUnicode_KIND(substring);
10352 len1 = PyUnicode_GET_LENGTH(self);
10353 len2 = PyUnicode_GET_LENGTH(substring);
10354 if (kind1 < kind2 || len1 < len2) {
10355 out = PyList_New(1);
10356 if (out == NULL)
10357 return NULL;
10358 Py_INCREF(self);
10359 PyList_SET_ITEM(out, 0, self);
10360 return out;
10361 }
10362 buf1 = PyUnicode_DATA(self);
10363 buf2 = PyUnicode_DATA(substring);
10364 if (kind2 != kind1) {
10365 buf2 = _PyUnicode_AsKind(substring, kind1);
10366 if (!buf2)
10367 return NULL;
10368 }
10369
10370 switch (kind1) {
10371 case PyUnicode_1BYTE_KIND:
10372 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10373 out = asciilib_rsplit(
10374 self, buf1, len1, buf2, len2, maxcount);
10375 else
10376 out = ucs1lib_rsplit(
10377 self, buf1, len1, buf2, len2, maxcount);
10378 break;
10379 case PyUnicode_2BYTE_KIND:
10380 out = ucs2lib_rsplit(
10381 self, buf1, len1, buf2, len2, maxcount);
10382 break;
10383 case PyUnicode_4BYTE_KIND:
10384 out = ucs4lib_rsplit(
10385 self, buf1, len1, buf2, len2, maxcount);
10386 break;
10387 default:
10388 out = NULL;
10389 }
10390 if (kind2 != kind1)
10391 PyMem_Free(buf2);
10392 return out;
10393 }
10394
10395 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10396 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10397 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10398 {
10399 switch (kind) {
10400 case PyUnicode_1BYTE_KIND:
10401 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10402 return asciilib_find(buf1, len1, buf2, len2, offset);
10403 else
10404 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10405 case PyUnicode_2BYTE_KIND:
10406 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10407 case PyUnicode_4BYTE_KIND:
10408 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10409 }
10410 Py_UNREACHABLE();
10411 }
10412
10413 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10414 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10415 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10416 {
10417 switch (kind) {
10418 case PyUnicode_1BYTE_KIND:
10419 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10420 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10421 else
10422 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10423 case PyUnicode_2BYTE_KIND:
10424 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10425 case PyUnicode_4BYTE_KIND:
10426 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10427 }
10428 Py_UNREACHABLE();
10429 }
10430
10431 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10432 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10433 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10434 {
10435 int kind = PyUnicode_KIND(u);
10436 void *data = PyUnicode_DATA(u);
10437 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10438 if (kind == PyUnicode_1BYTE_KIND) {
10439 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10440 (Py_UCS1 *)data + len,
10441 u1, u2, maxcount);
10442 }
10443 else if (kind == PyUnicode_2BYTE_KIND) {
10444 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10445 (Py_UCS2 *)data + len,
10446 u1, u2, maxcount);
10447 }
10448 else {
10449 assert(kind == PyUnicode_4BYTE_KIND);
10450 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10451 (Py_UCS4 *)data + len,
10452 u1, u2, maxcount);
10453 }
10454 }
10455
10456 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10457 replace(PyObject *self, PyObject *str1,
10458 PyObject *str2, Py_ssize_t maxcount)
10459 {
10460 PyObject *u;
10461 char *sbuf = PyUnicode_DATA(self);
10462 char *buf1 = PyUnicode_DATA(str1);
10463 char *buf2 = PyUnicode_DATA(str2);
10464 int srelease = 0, release1 = 0, release2 = 0;
10465 int skind = PyUnicode_KIND(self);
10466 int kind1 = PyUnicode_KIND(str1);
10467 int kind2 = PyUnicode_KIND(str2);
10468 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10469 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10470 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10471 int mayshrink;
10472 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10473
10474 if (maxcount < 0)
10475 maxcount = PY_SSIZE_T_MAX;
10476 else if (maxcount == 0 || slen == 0)
10477 goto nothing;
10478
10479 if (str1 == str2)
10480 goto nothing;
10481
10482 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10483 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10484 if (maxchar < maxchar_str1)
10485 /* substring too wide to be present */
10486 goto nothing;
10487 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10488 /* Replacing str1 with str2 may cause a maxchar reduction in the
10489 result string. */
10490 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10491 maxchar = Py_MAX(maxchar, maxchar_str2);
10492
10493 if (len1 == len2) {
10494 /* same length */
10495 if (len1 == 0)
10496 goto nothing;
10497 if (len1 == 1) {
10498 /* replace characters */
10499 Py_UCS4 u1, u2;
10500 Py_ssize_t pos;
10501
10502 u1 = PyUnicode_READ(kind1, buf1, 0);
10503 pos = findchar(sbuf, skind, slen, u1, 1);
10504 if (pos < 0)
10505 goto nothing;
10506 u2 = PyUnicode_READ(kind2, buf2, 0);
10507 u = PyUnicode_New(slen, maxchar);
10508 if (!u)
10509 goto error;
10510
10511 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10512 replace_1char_inplace(u, pos, u1, u2, maxcount);
10513 }
10514 else {
10515 int rkind = skind;
10516 char *res;
10517 Py_ssize_t i;
10518
10519 if (kind1 < rkind) {
10520 /* widen substring */
10521 buf1 = _PyUnicode_AsKind(str1, rkind);
10522 if (!buf1) goto error;
10523 release1 = 1;
10524 }
10525 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10526 if (i < 0)
10527 goto nothing;
10528 if (rkind > kind2) {
10529 /* widen replacement */
10530 buf2 = _PyUnicode_AsKind(str2, rkind);
10531 if (!buf2) goto error;
10532 release2 = 1;
10533 }
10534 else if (rkind < kind2) {
10535 /* widen self and buf1 */
10536 rkind = kind2;
10537 if (release1) PyMem_Free(buf1);
10538 release1 = 0;
10539 sbuf = _PyUnicode_AsKind(self, rkind);
10540 if (!sbuf) goto error;
10541 srelease = 1;
10542 buf1 = _PyUnicode_AsKind(str1, rkind);
10543 if (!buf1) goto error;
10544 release1 = 1;
10545 }
10546 u = PyUnicode_New(slen, maxchar);
10547 if (!u)
10548 goto error;
10549 assert(PyUnicode_KIND(u) == rkind);
10550 res = PyUnicode_DATA(u);
10551
10552 memcpy(res, sbuf, rkind * slen);
10553 /* change everything in-place, starting with this one */
10554 memcpy(res + rkind * i,
10555 buf2,
10556 rkind * len2);
10557 i += len1;
10558
10559 while ( --maxcount > 0) {
10560 i = anylib_find(rkind, self,
10561 sbuf+rkind*i, slen-i,
10562 str1, buf1, len1, i);
10563 if (i == -1)
10564 break;
10565 memcpy(res + rkind * i,
10566 buf2,
10567 rkind * len2);
10568 i += len1;
10569 }
10570 }
10571 }
10572 else {
10573 Py_ssize_t n, i, j, ires;
10574 Py_ssize_t new_size;
10575 int rkind = skind;
10576 char *res;
10577
10578 if (kind1 < rkind) {
10579 /* widen substring */
10580 buf1 = _PyUnicode_AsKind(str1, rkind);
10581 if (!buf1) goto error;
10582 release1 = 1;
10583 }
10584 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10585 if (n == 0)
10586 goto nothing;
10587 if (kind2 < rkind) {
10588 /* widen replacement */
10589 buf2 = _PyUnicode_AsKind(str2, rkind);
10590 if (!buf2) goto error;
10591 release2 = 1;
10592 }
10593 else if (kind2 > rkind) {
10594 /* widen self and buf1 */
10595 rkind = kind2;
10596 sbuf = _PyUnicode_AsKind(self, rkind);
10597 if (!sbuf) goto error;
10598 srelease = 1;
10599 if (release1) PyMem_Free(buf1);
10600 release1 = 0;
10601 buf1 = _PyUnicode_AsKind(str1, rkind);
10602 if (!buf1) goto error;
10603 release1 = 1;
10604 }
10605 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10606 PyUnicode_GET_LENGTH(str1))); */
10607 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10608 PyErr_SetString(PyExc_OverflowError,
10609 "replace string is too long");
10610 goto error;
10611 }
10612 new_size = slen + n * (len2 - len1);
10613 if (new_size == 0) {
10614 _Py_INCREF_UNICODE_EMPTY();
10615 if (!unicode_empty)
10616 goto error;
10617 u = unicode_empty;
10618 goto done;
10619 }
10620 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10621 PyErr_SetString(PyExc_OverflowError,
10622 "replace string is too long");
10623 goto error;
10624 }
10625 u = PyUnicode_New(new_size, maxchar);
10626 if (!u)
10627 goto error;
10628 assert(PyUnicode_KIND(u) == rkind);
10629 res = PyUnicode_DATA(u);
10630 ires = i = 0;
10631 if (len1 > 0) {
10632 while (n-- > 0) {
10633 /* look for next match */
10634 j = anylib_find(rkind, self,
10635 sbuf + rkind * i, slen-i,
10636 str1, buf1, len1, i);
10637 if (j == -1)
10638 break;
10639 else if (j > i) {
10640 /* copy unchanged part [i:j] */
10641 memcpy(res + rkind * ires,
10642 sbuf + rkind * i,
10643 rkind * (j-i));
10644 ires += j - i;
10645 }
10646 /* copy substitution string */
10647 if (len2 > 0) {
10648 memcpy(res + rkind * ires,
10649 buf2,
10650 rkind * len2);
10651 ires += len2;
10652 }
10653 i = j + len1;
10654 }
10655 if (i < slen)
10656 /* copy tail [i:] */
10657 memcpy(res + rkind * ires,
10658 sbuf + rkind * i,
10659 rkind * (slen-i));
10660 }
10661 else {
10662 /* interleave */
10663 while (n > 0) {
10664 memcpy(res + rkind * ires,
10665 buf2,
10666 rkind * len2);
10667 ires += len2;
10668 if (--n <= 0)
10669 break;
10670 memcpy(res + rkind * ires,
10671 sbuf + rkind * i,
10672 rkind);
10673 ires++;
10674 i++;
10675 }
10676 memcpy(res + rkind * ires,
10677 sbuf + rkind * i,
10678 rkind * (slen-i));
10679 }
10680 }
10681
10682 if (mayshrink) {
10683 unicode_adjust_maxchar(&u);
10684 if (u == NULL)
10685 goto error;
10686 }
10687
10688 done:
10689 if (srelease)
10690 PyMem_FREE(sbuf);
10691 if (release1)
10692 PyMem_FREE(buf1);
10693 if (release2)
10694 PyMem_FREE(buf2);
10695 assert(_PyUnicode_CheckConsistency(u, 1));
10696 return u;
10697
10698 nothing:
10699 /* nothing to replace; return original string (when possible) */
10700 if (srelease)
10701 PyMem_FREE(sbuf);
10702 if (release1)
10703 PyMem_FREE(buf1);
10704 if (release2)
10705 PyMem_FREE(buf2);
10706 return unicode_result_unchanged(self);
10707
10708 error:
10709 if (srelease && sbuf)
10710 PyMem_FREE(sbuf);
10711 if (release1 && buf1)
10712 PyMem_FREE(buf1);
10713 if (release2 && buf2)
10714 PyMem_FREE(buf2);
10715 return NULL;
10716 }
10717
10718 /* --- Unicode Object Methods --------------------------------------------- */
10719
10720 /*[clinic input]
10721 str.title as unicode_title
10722
10723 Return a version of the string where each word is titlecased.
10724
10725 More specifically, words start with uppercased characters and all remaining
10726 cased characters have lower case.
10727 [clinic start generated code]*/
10728
10729 static PyObject *
unicode_title_impl(PyObject * self)10730 unicode_title_impl(PyObject *self)
10731 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10732 {
10733 if (PyUnicode_READY(self) == -1)
10734 return NULL;
10735 return case_operation(self, do_title);
10736 }
10737
10738 /*[clinic input]
10739 str.capitalize as unicode_capitalize
10740
10741 Return a capitalized version of the string.
10742
10743 More specifically, make the first character have upper case and the rest lower
10744 case.
10745 [clinic start generated code]*/
10746
10747 static PyObject *
unicode_capitalize_impl(PyObject * self)10748 unicode_capitalize_impl(PyObject *self)
10749 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10750 {
10751 if (PyUnicode_READY(self) == -1)
10752 return NULL;
10753 if (PyUnicode_GET_LENGTH(self) == 0)
10754 return unicode_result_unchanged(self);
10755 return case_operation(self, do_capitalize);
10756 }
10757
10758 /*[clinic input]
10759 str.casefold as unicode_casefold
10760
10761 Return a version of the string suitable for caseless comparisons.
10762 [clinic start generated code]*/
10763
10764 static PyObject *
unicode_casefold_impl(PyObject * self)10765 unicode_casefold_impl(PyObject *self)
10766 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10767 {
10768 if (PyUnicode_READY(self) == -1)
10769 return NULL;
10770 if (PyUnicode_IS_ASCII(self))
10771 return ascii_upper_or_lower(self, 1);
10772 return case_operation(self, do_casefold);
10773 }
10774
10775
10776 /* Argument converter. Accepts a single Unicode character. */
10777
10778 static int
convert_uc(PyObject * obj,void * addr)10779 convert_uc(PyObject *obj, void *addr)
10780 {
10781 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10782
10783 if (!PyUnicode_Check(obj)) {
10784 PyErr_Format(PyExc_TypeError,
10785 "The fill character must be a unicode character, "
10786 "not %.100s", Py_TYPE(obj)->tp_name);
10787 return 0;
10788 }
10789 if (PyUnicode_READY(obj) < 0)
10790 return 0;
10791 if (PyUnicode_GET_LENGTH(obj) != 1) {
10792 PyErr_SetString(PyExc_TypeError,
10793 "The fill character must be exactly one character long");
10794 return 0;
10795 }
10796 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10797 return 1;
10798 }
10799
10800 /*[clinic input]
10801 str.center as unicode_center
10802
10803 width: Py_ssize_t
10804 fillchar: Py_UCS4 = ' '
10805 /
10806
10807 Return a centered string of length width.
10808
10809 Padding is done using the specified fill character (default is a space).
10810 [clinic start generated code]*/
10811
10812 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10813 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10814 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10815 {
10816 Py_ssize_t marg, left;
10817
10818 if (PyUnicode_READY(self) == -1)
10819 return NULL;
10820
10821 if (PyUnicode_GET_LENGTH(self) >= width)
10822 return unicode_result_unchanged(self);
10823
10824 marg = width - PyUnicode_GET_LENGTH(self);
10825 left = marg / 2 + (marg & width & 1);
10826
10827 return pad(self, left, marg - left, fillchar);
10828 }
10829
10830 /* This function assumes that str1 and str2 are readied by the caller. */
10831
10832 static int
unicode_compare(PyObject * str1,PyObject * str2)10833 unicode_compare(PyObject *str1, PyObject *str2)
10834 {
10835 #define COMPARE(TYPE1, TYPE2) \
10836 do { \
10837 TYPE1* p1 = (TYPE1 *)data1; \
10838 TYPE2* p2 = (TYPE2 *)data2; \
10839 TYPE1* end = p1 + len; \
10840 Py_UCS4 c1, c2; \
10841 for (; p1 != end; p1++, p2++) { \
10842 c1 = *p1; \
10843 c2 = *p2; \
10844 if (c1 != c2) \
10845 return (c1 < c2) ? -1 : 1; \
10846 } \
10847 } \
10848 while (0)
10849
10850 int kind1, kind2;
10851 void *data1, *data2;
10852 Py_ssize_t len1, len2, len;
10853
10854 kind1 = PyUnicode_KIND(str1);
10855 kind2 = PyUnicode_KIND(str2);
10856 data1 = PyUnicode_DATA(str1);
10857 data2 = PyUnicode_DATA(str2);
10858 len1 = PyUnicode_GET_LENGTH(str1);
10859 len2 = PyUnicode_GET_LENGTH(str2);
10860 len = Py_MIN(len1, len2);
10861
10862 switch(kind1) {
10863 case PyUnicode_1BYTE_KIND:
10864 {
10865 switch(kind2) {
10866 case PyUnicode_1BYTE_KIND:
10867 {
10868 int cmp = memcmp(data1, data2, len);
10869 /* normalize result of memcmp() into the range [-1; 1] */
10870 if (cmp < 0)
10871 return -1;
10872 if (cmp > 0)
10873 return 1;
10874 break;
10875 }
10876 case PyUnicode_2BYTE_KIND:
10877 COMPARE(Py_UCS1, Py_UCS2);
10878 break;
10879 case PyUnicode_4BYTE_KIND:
10880 COMPARE(Py_UCS1, Py_UCS4);
10881 break;
10882 default:
10883 Py_UNREACHABLE();
10884 }
10885 break;
10886 }
10887 case PyUnicode_2BYTE_KIND:
10888 {
10889 switch(kind2) {
10890 case PyUnicode_1BYTE_KIND:
10891 COMPARE(Py_UCS2, Py_UCS1);
10892 break;
10893 case PyUnicode_2BYTE_KIND:
10894 {
10895 COMPARE(Py_UCS2, Py_UCS2);
10896 break;
10897 }
10898 case PyUnicode_4BYTE_KIND:
10899 COMPARE(Py_UCS2, Py_UCS4);
10900 break;
10901 default:
10902 Py_UNREACHABLE();
10903 }
10904 break;
10905 }
10906 case PyUnicode_4BYTE_KIND:
10907 {
10908 switch(kind2) {
10909 case PyUnicode_1BYTE_KIND:
10910 COMPARE(Py_UCS4, Py_UCS1);
10911 break;
10912 case PyUnicode_2BYTE_KIND:
10913 COMPARE(Py_UCS4, Py_UCS2);
10914 break;
10915 case PyUnicode_4BYTE_KIND:
10916 {
10917 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10918 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10919 /* normalize result of wmemcmp() into the range [-1; 1] */
10920 if (cmp < 0)
10921 return -1;
10922 if (cmp > 0)
10923 return 1;
10924 #else
10925 COMPARE(Py_UCS4, Py_UCS4);
10926 #endif
10927 break;
10928 }
10929 default:
10930 Py_UNREACHABLE();
10931 }
10932 break;
10933 }
10934 default:
10935 Py_UNREACHABLE();
10936 }
10937
10938 if (len1 == len2)
10939 return 0;
10940 if (len1 < len2)
10941 return -1;
10942 else
10943 return 1;
10944
10945 #undef COMPARE
10946 }
10947
10948 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10949 unicode_compare_eq(PyObject *str1, PyObject *str2)
10950 {
10951 int kind;
10952 void *data1, *data2;
10953 Py_ssize_t len;
10954 int cmp;
10955
10956 len = PyUnicode_GET_LENGTH(str1);
10957 if (PyUnicode_GET_LENGTH(str2) != len)
10958 return 0;
10959 kind = PyUnicode_KIND(str1);
10960 if (PyUnicode_KIND(str2) != kind)
10961 return 0;
10962 data1 = PyUnicode_DATA(str1);
10963 data2 = PyUnicode_DATA(str2);
10964
10965 cmp = memcmp(data1, data2, len * kind);
10966 return (cmp == 0);
10967 }
10968
10969
10970 int
PyUnicode_Compare(PyObject * left,PyObject * right)10971 PyUnicode_Compare(PyObject *left, PyObject *right)
10972 {
10973 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10974 if (PyUnicode_READY(left) == -1 ||
10975 PyUnicode_READY(right) == -1)
10976 return -1;
10977
10978 /* a string is equal to itself */
10979 if (left == right)
10980 return 0;
10981
10982 return unicode_compare(left, right);
10983 }
10984 PyErr_Format(PyExc_TypeError,
10985 "Can't compare %.100s and %.100s",
10986 left->ob_type->tp_name,
10987 right->ob_type->tp_name);
10988 return -1;
10989 }
10990
10991 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)10992 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10993 {
10994 Py_ssize_t i;
10995 int kind;
10996 Py_UCS4 chr;
10997 const unsigned char *ustr = (const unsigned char *)str;
10998
10999 assert(_PyUnicode_CHECK(uni));
11000 if (!PyUnicode_IS_READY(uni)) {
11001 const wchar_t *ws = _PyUnicode_WSTR(uni);
11002 /* Compare Unicode string and source character set string */
11003 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11004 if (chr != ustr[i])
11005 return (chr < ustr[i]) ? -1 : 1;
11006 }
11007 /* This check keeps Python strings that end in '\0' from comparing equal
11008 to C strings identical up to that point. */
11009 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11010 return 1; /* uni is longer */
11011 if (ustr[i])
11012 return -1; /* str is longer */
11013 return 0;
11014 }
11015 kind = PyUnicode_KIND(uni);
11016 if (kind == PyUnicode_1BYTE_KIND) {
11017 const void *data = PyUnicode_1BYTE_DATA(uni);
11018 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11019 size_t len, len2 = strlen(str);
11020 int cmp;
11021
11022 len = Py_MIN(len1, len2);
11023 cmp = memcmp(data, str, len);
11024 if (cmp != 0) {
11025 if (cmp < 0)
11026 return -1;
11027 else
11028 return 1;
11029 }
11030 if (len1 > len2)
11031 return 1; /* uni is longer */
11032 if (len1 < len2)
11033 return -1; /* str is longer */
11034 return 0;
11035 }
11036 else {
11037 void *data = PyUnicode_DATA(uni);
11038 /* Compare Unicode string and source character set string */
11039 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11040 if (chr != (unsigned char)str[i])
11041 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11042 /* This check keeps Python strings that end in '\0' from comparing equal
11043 to C strings identical up to that point. */
11044 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11045 return 1; /* uni is longer */
11046 if (str[i])
11047 return -1; /* str is longer */
11048 return 0;
11049 }
11050 }
11051
11052 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11053 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11054 {
11055 size_t i, len;
11056 const wchar_t *p;
11057 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11058 if (strlen(str) != len)
11059 return 0;
11060 p = _PyUnicode_WSTR(unicode);
11061 assert(p);
11062 for (i = 0; i < len; i++) {
11063 unsigned char c = (unsigned char)str[i];
11064 if (c >= 128 || p[i] != (wchar_t)c)
11065 return 0;
11066 }
11067 return 1;
11068 }
11069
11070 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11071 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11072 {
11073 size_t len;
11074 assert(_PyUnicode_CHECK(unicode));
11075 assert(str);
11076 #ifndef NDEBUG
11077 for (const char *p = str; *p; p++) {
11078 assert((unsigned char)*p < 128);
11079 }
11080 #endif
11081 if (PyUnicode_READY(unicode) == -1) {
11082 /* Memory error or bad data */
11083 PyErr_Clear();
11084 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11085 }
11086 if (!PyUnicode_IS_ASCII(unicode))
11087 return 0;
11088 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11089 return strlen(str) == len &&
11090 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11091 }
11092
11093 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11094 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11095 {
11096 PyObject *right_uni;
11097 Py_hash_t hash;
11098
11099 assert(_PyUnicode_CHECK(left));
11100 assert(right->string);
11101 #ifndef NDEBUG
11102 for (const char *p = right->string; *p; p++) {
11103 assert((unsigned char)*p < 128);
11104 }
11105 #endif
11106
11107 if (PyUnicode_READY(left) == -1) {
11108 /* memory error or bad data */
11109 PyErr_Clear();
11110 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11111 }
11112
11113 if (!PyUnicode_IS_ASCII(left))
11114 return 0;
11115
11116 right_uni = _PyUnicode_FromId(right); /* borrowed */
11117 if (right_uni == NULL) {
11118 /* memory error or bad data */
11119 PyErr_Clear();
11120 return _PyUnicode_EqualToASCIIString(left, right->string);
11121 }
11122
11123 if (left == right_uni)
11124 return 1;
11125
11126 if (PyUnicode_CHECK_INTERNED(left))
11127 return 0;
11128
11129 assert(_PyUnicode_HASH(right_uni) != -1);
11130 hash = _PyUnicode_HASH(left);
11131 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11132 return 0;
11133
11134 return unicode_compare_eq(left, right_uni);
11135 }
11136
11137 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11138 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11139 {
11140 int result;
11141
11142 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11143 Py_RETURN_NOTIMPLEMENTED;
11144
11145 if (PyUnicode_READY(left) == -1 ||
11146 PyUnicode_READY(right) == -1)
11147 return NULL;
11148
11149 if (left == right) {
11150 switch (op) {
11151 case Py_EQ:
11152 case Py_LE:
11153 case Py_GE:
11154 /* a string is equal to itself */
11155 Py_RETURN_TRUE;
11156 case Py_NE:
11157 case Py_LT:
11158 case Py_GT:
11159 Py_RETURN_FALSE;
11160 default:
11161 PyErr_BadArgument();
11162 return NULL;
11163 }
11164 }
11165 else if (op == Py_EQ || op == Py_NE) {
11166 result = unicode_compare_eq(left, right);
11167 result ^= (op == Py_NE);
11168 return PyBool_FromLong(result);
11169 }
11170 else {
11171 result = unicode_compare(left, right);
11172 Py_RETURN_RICHCOMPARE(result, 0, op);
11173 }
11174 }
11175
11176 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11177 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11178 {
11179 return unicode_eq(aa, bb);
11180 }
11181
11182 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11183 PyUnicode_Contains(PyObject *str, PyObject *substr)
11184 {
11185 int kind1, kind2;
11186 void *buf1, *buf2;
11187 Py_ssize_t len1, len2;
11188 int result;
11189
11190 if (!PyUnicode_Check(substr)) {
11191 PyErr_Format(PyExc_TypeError,
11192 "'in <string>' requires string as left operand, not %.100s",
11193 Py_TYPE(substr)->tp_name);
11194 return -1;
11195 }
11196 if (PyUnicode_READY(substr) == -1)
11197 return -1;
11198 if (ensure_unicode(str) < 0)
11199 return -1;
11200
11201 kind1 = PyUnicode_KIND(str);
11202 kind2 = PyUnicode_KIND(substr);
11203 if (kind1 < kind2)
11204 return 0;
11205 len1 = PyUnicode_GET_LENGTH(str);
11206 len2 = PyUnicode_GET_LENGTH(substr);
11207 if (len1 < len2)
11208 return 0;
11209 buf1 = PyUnicode_DATA(str);
11210 buf2 = PyUnicode_DATA(substr);
11211 if (len2 == 1) {
11212 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11213 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11214 return result;
11215 }
11216 if (kind2 != kind1) {
11217 buf2 = _PyUnicode_AsKind(substr, kind1);
11218 if (!buf2)
11219 return -1;
11220 }
11221
11222 switch (kind1) {
11223 case PyUnicode_1BYTE_KIND:
11224 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11225 break;
11226 case PyUnicode_2BYTE_KIND:
11227 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11228 break;
11229 case PyUnicode_4BYTE_KIND:
11230 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11231 break;
11232 default:
11233 Py_UNREACHABLE();
11234 }
11235
11236 if (kind2 != kind1)
11237 PyMem_Free(buf2);
11238
11239 return result;
11240 }
11241
11242 /* Concat to string or Unicode object giving a new Unicode object. */
11243
11244 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11245 PyUnicode_Concat(PyObject *left, PyObject *right)
11246 {
11247 PyObject *result;
11248 Py_UCS4 maxchar, maxchar2;
11249 Py_ssize_t left_len, right_len, new_len;
11250
11251 if (ensure_unicode(left) < 0)
11252 return NULL;
11253
11254 if (!PyUnicode_Check(right)) {
11255 PyErr_Format(PyExc_TypeError,
11256 "can only concatenate str (not \"%.200s\") to str",
11257 right->ob_type->tp_name);
11258 return NULL;
11259 }
11260 if (PyUnicode_READY(right) < 0)
11261 return NULL;
11262
11263 /* Shortcuts */
11264 if (left == unicode_empty)
11265 return PyUnicode_FromObject(right);
11266 if (right == unicode_empty)
11267 return PyUnicode_FromObject(left);
11268
11269 left_len = PyUnicode_GET_LENGTH(left);
11270 right_len = PyUnicode_GET_LENGTH(right);
11271 if (left_len > PY_SSIZE_T_MAX - right_len) {
11272 PyErr_SetString(PyExc_OverflowError,
11273 "strings are too large to concat");
11274 return NULL;
11275 }
11276 new_len = left_len + right_len;
11277
11278 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11279 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11280 maxchar = Py_MAX(maxchar, maxchar2);
11281
11282 /* Concat the two Unicode strings */
11283 result = PyUnicode_New(new_len, maxchar);
11284 if (result == NULL)
11285 return NULL;
11286 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11287 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11288 assert(_PyUnicode_CheckConsistency(result, 1));
11289 return result;
11290 }
11291
11292 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11293 PyUnicode_Append(PyObject **p_left, PyObject *right)
11294 {
11295 PyObject *left, *res;
11296 Py_UCS4 maxchar, maxchar2;
11297 Py_ssize_t left_len, right_len, new_len;
11298
11299 if (p_left == NULL) {
11300 if (!PyErr_Occurred())
11301 PyErr_BadInternalCall();
11302 return;
11303 }
11304 left = *p_left;
11305 if (right == NULL || left == NULL
11306 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11307 if (!PyErr_Occurred())
11308 PyErr_BadInternalCall();
11309 goto error;
11310 }
11311
11312 if (PyUnicode_READY(left) == -1)
11313 goto error;
11314 if (PyUnicode_READY(right) == -1)
11315 goto error;
11316
11317 /* Shortcuts */
11318 if (left == unicode_empty) {
11319 Py_DECREF(left);
11320 Py_INCREF(right);
11321 *p_left = right;
11322 return;
11323 }
11324 if (right == unicode_empty)
11325 return;
11326
11327 left_len = PyUnicode_GET_LENGTH(left);
11328 right_len = PyUnicode_GET_LENGTH(right);
11329 if (left_len > PY_SSIZE_T_MAX - right_len) {
11330 PyErr_SetString(PyExc_OverflowError,
11331 "strings are too large to concat");
11332 goto error;
11333 }
11334 new_len = left_len + right_len;
11335
11336 if (unicode_modifiable(left)
11337 && PyUnicode_CheckExact(right)
11338 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11339 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11340 to change the structure size, but characters are stored just after
11341 the structure, and so it requires to move all characters which is
11342 not so different than duplicating the string. */
11343 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11344 {
11345 /* append inplace */
11346 if (unicode_resize(p_left, new_len) != 0)
11347 goto error;
11348
11349 /* copy 'right' into the newly allocated area of 'left' */
11350 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11351 }
11352 else {
11353 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11354 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11355 maxchar = Py_MAX(maxchar, maxchar2);
11356
11357 /* Concat the two Unicode strings */
11358 res = PyUnicode_New(new_len, maxchar);
11359 if (res == NULL)
11360 goto error;
11361 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11362 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11363 Py_DECREF(left);
11364 *p_left = res;
11365 }
11366 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11367 return;
11368
11369 error:
11370 Py_CLEAR(*p_left);
11371 }
11372
11373 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11374 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11375 {
11376 PyUnicode_Append(pleft, right);
11377 Py_XDECREF(right);
11378 }
11379
11380 /*
11381 Wraps stringlib_parse_args_finds() and additionally ensures that the
11382 first argument is a unicode object.
11383 */
11384
11385 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11386 parse_args_finds_unicode(const char * function_name, PyObject *args,
11387 PyObject **substring,
11388 Py_ssize_t *start, Py_ssize_t *end)
11389 {
11390 if(stringlib_parse_args_finds(function_name, args, substring,
11391 start, end)) {
11392 if (ensure_unicode(*substring) < 0)
11393 return 0;
11394 return 1;
11395 }
11396 return 0;
11397 }
11398
11399 PyDoc_STRVAR(count__doc__,
11400 "S.count(sub[, start[, end]]) -> int\n\
11401 \n\
11402 Return the number of non-overlapping occurrences of substring sub in\n\
11403 string S[start:end]. Optional arguments start and end are\n\
11404 interpreted as in slice notation.");
11405
11406 static PyObject *
unicode_count(PyObject * self,PyObject * args)11407 unicode_count(PyObject *self, PyObject *args)
11408 {
11409 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11410 Py_ssize_t start = 0;
11411 Py_ssize_t end = PY_SSIZE_T_MAX;
11412 PyObject *result;
11413 int kind1, kind2;
11414 void *buf1, *buf2;
11415 Py_ssize_t len1, len2, iresult;
11416
11417 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11418 return NULL;
11419
11420 kind1 = PyUnicode_KIND(self);
11421 kind2 = PyUnicode_KIND(substring);
11422 if (kind1 < kind2)
11423 return PyLong_FromLong(0);
11424
11425 len1 = PyUnicode_GET_LENGTH(self);
11426 len2 = PyUnicode_GET_LENGTH(substring);
11427 ADJUST_INDICES(start, end, len1);
11428 if (end - start < len2)
11429 return PyLong_FromLong(0);
11430
11431 buf1 = PyUnicode_DATA(self);
11432 buf2 = PyUnicode_DATA(substring);
11433 if (kind2 != kind1) {
11434 buf2 = _PyUnicode_AsKind(substring, kind1);
11435 if (!buf2)
11436 return NULL;
11437 }
11438 switch (kind1) {
11439 case PyUnicode_1BYTE_KIND:
11440 iresult = ucs1lib_count(
11441 ((Py_UCS1*)buf1) + start, end - start,
11442 buf2, len2, PY_SSIZE_T_MAX
11443 );
11444 break;
11445 case PyUnicode_2BYTE_KIND:
11446 iresult = ucs2lib_count(
11447 ((Py_UCS2*)buf1) + start, end - start,
11448 buf2, len2, PY_SSIZE_T_MAX
11449 );
11450 break;
11451 case PyUnicode_4BYTE_KIND:
11452 iresult = ucs4lib_count(
11453 ((Py_UCS4*)buf1) + start, end - start,
11454 buf2, len2, PY_SSIZE_T_MAX
11455 );
11456 break;
11457 default:
11458 Py_UNREACHABLE();
11459 }
11460
11461 result = PyLong_FromSsize_t(iresult);
11462
11463 if (kind2 != kind1)
11464 PyMem_Free(buf2);
11465
11466 return result;
11467 }
11468
11469 /*[clinic input]
11470 str.encode as unicode_encode
11471
11472 encoding: str(c_default="NULL") = 'utf-8'
11473 The encoding in which to encode the string.
11474 errors: str(c_default="NULL") = 'strict'
11475 The error handling scheme to use for encoding errors.
11476 The default is 'strict' meaning that encoding errors raise a
11477 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11478 'xmlcharrefreplace' as well as any other name registered with
11479 codecs.register_error that can handle UnicodeEncodeErrors.
11480
11481 Encode the string using the codec registered for encoding.
11482 [clinic start generated code]*/
11483
11484 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11485 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11486 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11487 {
11488 return PyUnicode_AsEncodedString(self, encoding, errors);
11489 }
11490
11491 /*[clinic input]
11492 str.expandtabs as unicode_expandtabs
11493
11494 tabsize: int = 8
11495
11496 Return a copy where all tab characters are expanded using spaces.
11497
11498 If tabsize is not given, a tab size of 8 characters is assumed.
11499 [clinic start generated code]*/
11500
11501 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11502 unicode_expandtabs_impl(PyObject *self, int tabsize)
11503 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11504 {
11505 Py_ssize_t i, j, line_pos, src_len, incr;
11506 Py_UCS4 ch;
11507 PyObject *u;
11508 void *src_data, *dest_data;
11509 int kind;
11510 int found;
11511
11512 if (PyUnicode_READY(self) == -1)
11513 return NULL;
11514
11515 /* First pass: determine size of output string */
11516 src_len = PyUnicode_GET_LENGTH(self);
11517 i = j = line_pos = 0;
11518 kind = PyUnicode_KIND(self);
11519 src_data = PyUnicode_DATA(self);
11520 found = 0;
11521 for (; i < src_len; i++) {
11522 ch = PyUnicode_READ(kind, src_data, i);
11523 if (ch == '\t') {
11524 found = 1;
11525 if (tabsize > 0) {
11526 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11527 if (j > PY_SSIZE_T_MAX - incr)
11528 goto overflow;
11529 line_pos += incr;
11530 j += incr;
11531 }
11532 }
11533 else {
11534 if (j > PY_SSIZE_T_MAX - 1)
11535 goto overflow;
11536 line_pos++;
11537 j++;
11538 if (ch == '\n' || ch == '\r')
11539 line_pos = 0;
11540 }
11541 }
11542 if (!found)
11543 return unicode_result_unchanged(self);
11544
11545 /* Second pass: create output string and fill it */
11546 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11547 if (!u)
11548 return NULL;
11549 dest_data = PyUnicode_DATA(u);
11550
11551 i = j = line_pos = 0;
11552
11553 for (; i < src_len; i++) {
11554 ch = PyUnicode_READ(kind, src_data, i);
11555 if (ch == '\t') {
11556 if (tabsize > 0) {
11557 incr = tabsize - (line_pos % tabsize);
11558 line_pos += incr;
11559 FILL(kind, dest_data, ' ', j, incr);
11560 j += incr;
11561 }
11562 }
11563 else {
11564 line_pos++;
11565 PyUnicode_WRITE(kind, dest_data, j, ch);
11566 j++;
11567 if (ch == '\n' || ch == '\r')
11568 line_pos = 0;
11569 }
11570 }
11571 assert (j == PyUnicode_GET_LENGTH(u));
11572 return unicode_result(u);
11573
11574 overflow:
11575 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11576 return NULL;
11577 }
11578
11579 PyDoc_STRVAR(find__doc__,
11580 "S.find(sub[, start[, end]]) -> int\n\
11581 \n\
11582 Return the lowest index in S where substring sub is found,\n\
11583 such that sub is contained within S[start:end]. Optional\n\
11584 arguments start and end are interpreted as in slice notation.\n\
11585 \n\
11586 Return -1 on failure.");
11587
11588 static PyObject *
unicode_find(PyObject * self,PyObject * args)11589 unicode_find(PyObject *self, PyObject *args)
11590 {
11591 /* initialize variables to prevent gcc warning */
11592 PyObject *substring = NULL;
11593 Py_ssize_t start = 0;
11594 Py_ssize_t end = 0;
11595 Py_ssize_t result;
11596
11597 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11598 return NULL;
11599
11600 if (PyUnicode_READY(self) == -1)
11601 return NULL;
11602
11603 result = any_find_slice(self, substring, start, end, 1);
11604
11605 if (result == -2)
11606 return NULL;
11607
11608 return PyLong_FromSsize_t(result);
11609 }
11610
11611 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11612 unicode_getitem(PyObject *self, Py_ssize_t index)
11613 {
11614 void *data;
11615 enum PyUnicode_Kind kind;
11616 Py_UCS4 ch;
11617
11618 if (!PyUnicode_Check(self)) {
11619 PyErr_BadArgument();
11620 return NULL;
11621 }
11622 if (PyUnicode_READY(self) == -1) {
11623 return NULL;
11624 }
11625 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11626 PyErr_SetString(PyExc_IndexError, "string index out of range");
11627 return NULL;
11628 }
11629 kind = PyUnicode_KIND(self);
11630 data = PyUnicode_DATA(self);
11631 ch = PyUnicode_READ(kind, data, index);
11632 return unicode_char(ch);
11633 }
11634
11635 /* Believe it or not, this produces the same value for ASCII strings
11636 as bytes_hash(). */
11637 static Py_hash_t
unicode_hash(PyObject * self)11638 unicode_hash(PyObject *self)
11639 {
11640 Py_ssize_t len;
11641 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11642
11643 #ifdef Py_DEBUG
11644 assert(_Py_HashSecret_Initialized);
11645 #endif
11646 if (_PyUnicode_HASH(self) != -1)
11647 return _PyUnicode_HASH(self);
11648 if (PyUnicode_READY(self) == -1)
11649 return -1;
11650 len = PyUnicode_GET_LENGTH(self);
11651 /*
11652 We make the hash of the empty string be 0, rather than using
11653 (prefix ^ suffix), since this slightly obfuscates the hash secret
11654 */
11655 if (len == 0) {
11656 _PyUnicode_HASH(self) = 0;
11657 return 0;
11658 }
11659 x = _Py_HashBytes(PyUnicode_DATA(self),
11660 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11661 _PyUnicode_HASH(self) = x;
11662 return x;
11663 }
11664
11665 PyDoc_STRVAR(index__doc__,
11666 "S.index(sub[, start[, end]]) -> int\n\
11667 \n\
11668 Return the lowest index in S where substring sub is found, \n\
11669 such that sub is contained within S[start:end]. Optional\n\
11670 arguments start and end are interpreted as in slice notation.\n\
11671 \n\
11672 Raises ValueError when the substring is not found.");
11673
11674 static PyObject *
unicode_index(PyObject * self,PyObject * args)11675 unicode_index(PyObject *self, PyObject *args)
11676 {
11677 /* initialize variables to prevent gcc warning */
11678 Py_ssize_t result;
11679 PyObject *substring = NULL;
11680 Py_ssize_t start = 0;
11681 Py_ssize_t end = 0;
11682
11683 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11684 return NULL;
11685
11686 if (PyUnicode_READY(self) == -1)
11687 return NULL;
11688
11689 result = any_find_slice(self, substring, start, end, 1);
11690
11691 if (result == -2)
11692 return NULL;
11693
11694 if (result < 0) {
11695 PyErr_SetString(PyExc_ValueError, "substring not found");
11696 return NULL;
11697 }
11698
11699 return PyLong_FromSsize_t(result);
11700 }
11701
11702 /*[clinic input]
11703 str.isascii as unicode_isascii
11704
11705 Return True if all characters in the string are ASCII, False otherwise.
11706
11707 ASCII characters have code points in the range U+0000-U+007F.
11708 Empty string is ASCII too.
11709 [clinic start generated code]*/
11710
11711 static PyObject *
unicode_isascii_impl(PyObject * self)11712 unicode_isascii_impl(PyObject *self)
11713 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11714 {
11715 if (PyUnicode_READY(self) == -1) {
11716 return NULL;
11717 }
11718 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11719 }
11720
11721 /*[clinic input]
11722 str.islower as unicode_islower
11723
11724 Return True if the string is a lowercase string, False otherwise.
11725
11726 A string is lowercase if all cased characters in the string are lowercase and
11727 there is at least one cased character in the string.
11728 [clinic start generated code]*/
11729
11730 static PyObject *
unicode_islower_impl(PyObject * self)11731 unicode_islower_impl(PyObject *self)
11732 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11733 {
11734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
11737 int cased;
11738
11739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741 length = PyUnicode_GET_LENGTH(self);
11742 kind = PyUnicode_KIND(self);
11743 data = PyUnicode_DATA(self);
11744
11745 /* Shortcut for single character strings */
11746 if (length == 1)
11747 return PyBool_FromLong(
11748 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11749
11750 /* Special case for empty strings */
11751 if (length == 0)
11752 Py_RETURN_FALSE;
11753
11754 cased = 0;
11755 for (i = 0; i < length; i++) {
11756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11757
11758 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11759 Py_RETURN_FALSE;
11760 else if (!cased && Py_UNICODE_ISLOWER(ch))
11761 cased = 1;
11762 }
11763 return PyBool_FromLong(cased);
11764 }
11765
11766 /*[clinic input]
11767 str.isupper as unicode_isupper
11768
11769 Return True if the string is an uppercase string, False otherwise.
11770
11771 A string is uppercase if all cased characters in the string are uppercase and
11772 there is at least one cased character in the string.
11773 [clinic start generated code]*/
11774
11775 static PyObject *
unicode_isupper_impl(PyObject * self)11776 unicode_isupper_impl(PyObject *self)
11777 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11778 {
11779 Py_ssize_t i, length;
11780 int kind;
11781 void *data;
11782 int cased;
11783
11784 if (PyUnicode_READY(self) == -1)
11785 return NULL;
11786 length = PyUnicode_GET_LENGTH(self);
11787 kind = PyUnicode_KIND(self);
11788 data = PyUnicode_DATA(self);
11789
11790 /* Shortcut for single character strings */
11791 if (length == 1)
11792 return PyBool_FromLong(
11793 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11794
11795 /* Special case for empty strings */
11796 if (length == 0)
11797 Py_RETURN_FALSE;
11798
11799 cased = 0;
11800 for (i = 0; i < length; i++) {
11801 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11802
11803 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11804 Py_RETURN_FALSE;
11805 else if (!cased && Py_UNICODE_ISUPPER(ch))
11806 cased = 1;
11807 }
11808 return PyBool_FromLong(cased);
11809 }
11810
11811 /*[clinic input]
11812 str.istitle as unicode_istitle
11813
11814 Return True if the string is a title-cased string, False otherwise.
11815
11816 In a title-cased string, upper- and title-case characters may only
11817 follow uncased characters and lowercase characters only cased ones.
11818 [clinic start generated code]*/
11819
11820 static PyObject *
unicode_istitle_impl(PyObject * self)11821 unicode_istitle_impl(PyObject *self)
11822 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11823 {
11824 Py_ssize_t i, length;
11825 int kind;
11826 void *data;
11827 int cased, previous_is_cased;
11828
11829 if (PyUnicode_READY(self) == -1)
11830 return NULL;
11831 length = PyUnicode_GET_LENGTH(self);
11832 kind = PyUnicode_KIND(self);
11833 data = PyUnicode_DATA(self);
11834
11835 /* Shortcut for single character strings */
11836 if (length == 1) {
11837 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11838 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11839 (Py_UNICODE_ISUPPER(ch) != 0));
11840 }
11841
11842 /* Special case for empty strings */
11843 if (length == 0)
11844 Py_RETURN_FALSE;
11845
11846 cased = 0;
11847 previous_is_cased = 0;
11848 for (i = 0; i < length; i++) {
11849 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11850
11851 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11852 if (previous_is_cased)
11853 Py_RETURN_FALSE;
11854 previous_is_cased = 1;
11855 cased = 1;
11856 }
11857 else if (Py_UNICODE_ISLOWER(ch)) {
11858 if (!previous_is_cased)
11859 Py_RETURN_FALSE;
11860 previous_is_cased = 1;
11861 cased = 1;
11862 }
11863 else
11864 previous_is_cased = 0;
11865 }
11866 return PyBool_FromLong(cased);
11867 }
11868
11869 /*[clinic input]
11870 str.isspace as unicode_isspace
11871
11872 Return True if the string is a whitespace string, False otherwise.
11873
11874 A string is whitespace if all characters in the string are whitespace and there
11875 is at least one character in the string.
11876 [clinic start generated code]*/
11877
11878 static PyObject *
unicode_isspace_impl(PyObject * self)11879 unicode_isspace_impl(PyObject *self)
11880 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11881 {
11882 Py_ssize_t i, length;
11883 int kind;
11884 void *data;
11885
11886 if (PyUnicode_READY(self) == -1)
11887 return NULL;
11888 length = PyUnicode_GET_LENGTH(self);
11889 kind = PyUnicode_KIND(self);
11890 data = PyUnicode_DATA(self);
11891
11892 /* Shortcut for single character strings */
11893 if (length == 1)
11894 return PyBool_FromLong(
11895 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11896
11897 /* Special case for empty strings */
11898 if (length == 0)
11899 Py_RETURN_FALSE;
11900
11901 for (i = 0; i < length; i++) {
11902 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11903 if (!Py_UNICODE_ISSPACE(ch))
11904 Py_RETURN_FALSE;
11905 }
11906 Py_RETURN_TRUE;
11907 }
11908
11909 /*[clinic input]
11910 str.isalpha as unicode_isalpha
11911
11912 Return True if the string is an alphabetic string, False otherwise.
11913
11914 A string is alphabetic if all characters in the string are alphabetic and there
11915 is at least one character in the string.
11916 [clinic start generated code]*/
11917
11918 static PyObject *
unicode_isalpha_impl(PyObject * self)11919 unicode_isalpha_impl(PyObject *self)
11920 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11921 {
11922 Py_ssize_t i, length;
11923 int kind;
11924 void *data;
11925
11926 if (PyUnicode_READY(self) == -1)
11927 return NULL;
11928 length = PyUnicode_GET_LENGTH(self);
11929 kind = PyUnicode_KIND(self);
11930 data = PyUnicode_DATA(self);
11931
11932 /* Shortcut for single character strings */
11933 if (length == 1)
11934 return PyBool_FromLong(
11935 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11936
11937 /* Special case for empty strings */
11938 if (length == 0)
11939 Py_RETURN_FALSE;
11940
11941 for (i = 0; i < length; i++) {
11942 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11943 Py_RETURN_FALSE;
11944 }
11945 Py_RETURN_TRUE;
11946 }
11947
11948 /*[clinic input]
11949 str.isalnum as unicode_isalnum
11950
11951 Return True if the string is an alpha-numeric string, False otherwise.
11952
11953 A string is alpha-numeric if all characters in the string are alpha-numeric and
11954 there is at least one character in the string.
11955 [clinic start generated code]*/
11956
11957 static PyObject *
unicode_isalnum_impl(PyObject * self)11958 unicode_isalnum_impl(PyObject *self)
11959 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11960 {
11961 int kind;
11962 void *data;
11963 Py_ssize_t len, i;
11964
11965 if (PyUnicode_READY(self) == -1)
11966 return NULL;
11967
11968 kind = PyUnicode_KIND(self);
11969 data = PyUnicode_DATA(self);
11970 len = PyUnicode_GET_LENGTH(self);
11971
11972 /* Shortcut for single character strings */
11973 if (len == 1) {
11974 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11975 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11976 }
11977
11978 /* Special case for empty strings */
11979 if (len == 0)
11980 Py_RETURN_FALSE;
11981
11982 for (i = 0; i < len; i++) {
11983 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11984 if (!Py_UNICODE_ISALNUM(ch))
11985 Py_RETURN_FALSE;
11986 }
11987 Py_RETURN_TRUE;
11988 }
11989
11990 /*[clinic input]
11991 str.isdecimal as unicode_isdecimal
11992
11993 Return True if the string is a decimal string, False otherwise.
11994
11995 A string is a decimal string if all characters in the string are decimal and
11996 there is at least one character in the string.
11997 [clinic start generated code]*/
11998
11999 static PyObject *
unicode_isdecimal_impl(PyObject * self)12000 unicode_isdecimal_impl(PyObject *self)
12001 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12002 {
12003 Py_ssize_t i, length;
12004 int kind;
12005 void *data;
12006
12007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009 length = PyUnicode_GET_LENGTH(self);
12010 kind = PyUnicode_KIND(self);
12011 data = PyUnicode_DATA(self);
12012
12013 /* Shortcut for single character strings */
12014 if (length == 1)
12015 return PyBool_FromLong(
12016 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12017
12018 /* Special case for empty strings */
12019 if (length == 0)
12020 Py_RETURN_FALSE;
12021
12022 for (i = 0; i < length; i++) {
12023 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12024 Py_RETURN_FALSE;
12025 }
12026 Py_RETURN_TRUE;
12027 }
12028
12029 /*[clinic input]
12030 str.isdigit as unicode_isdigit
12031
12032 Return True if the string is a digit string, False otherwise.
12033
12034 A string is a digit string if all characters in the string are digits and there
12035 is at least one character in the string.
12036 [clinic start generated code]*/
12037
12038 static PyObject *
unicode_isdigit_impl(PyObject * self)12039 unicode_isdigit_impl(PyObject *self)
12040 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12041 {
12042 Py_ssize_t i, length;
12043 int kind;
12044 void *data;
12045
12046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048 length = PyUnicode_GET_LENGTH(self);
12049 kind = PyUnicode_KIND(self);
12050 data = PyUnicode_DATA(self);
12051
12052 /* Shortcut for single character strings */
12053 if (length == 1) {
12054 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12055 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12056 }
12057
12058 /* Special case for empty strings */
12059 if (length == 0)
12060 Py_RETURN_FALSE;
12061
12062 for (i = 0; i < length; i++) {
12063 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12064 Py_RETURN_FALSE;
12065 }
12066 Py_RETURN_TRUE;
12067 }
12068
12069 /*[clinic input]
12070 str.isnumeric as unicode_isnumeric
12071
12072 Return True if the string is a numeric string, False otherwise.
12073
12074 A string is numeric if all characters in the string are numeric and there is at
12075 least one character in the string.
12076 [clinic start generated code]*/
12077
12078 static PyObject *
unicode_isnumeric_impl(PyObject * self)12079 unicode_isnumeric_impl(PyObject *self)
12080 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12081 {
12082 Py_ssize_t i, length;
12083 int kind;
12084 void *data;
12085
12086 if (PyUnicode_READY(self) == -1)
12087 return NULL;
12088 length = PyUnicode_GET_LENGTH(self);
12089 kind = PyUnicode_KIND(self);
12090 data = PyUnicode_DATA(self);
12091
12092 /* Shortcut for single character strings */
12093 if (length == 1)
12094 return PyBool_FromLong(
12095 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12096
12097 /* Special case for empty strings */
12098 if (length == 0)
12099 Py_RETURN_FALSE;
12100
12101 for (i = 0; i < length; i++) {
12102 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12103 Py_RETURN_FALSE;
12104 }
12105 Py_RETURN_TRUE;
12106 }
12107
12108 int
PyUnicode_IsIdentifier(PyObject * self)12109 PyUnicode_IsIdentifier(PyObject *self)
12110 {
12111 int kind;
12112 void *data;
12113 Py_ssize_t i;
12114 Py_UCS4 first;
12115
12116 if (PyUnicode_READY(self) == -1) {
12117 Py_FatalError("identifier not ready");
12118 return 0;
12119 }
12120
12121 /* Special case for empty strings */
12122 if (PyUnicode_GET_LENGTH(self) == 0)
12123 return 0;
12124 kind = PyUnicode_KIND(self);
12125 data = PyUnicode_DATA(self);
12126
12127 /* PEP 3131 says that the first character must be in
12128 XID_Start and subsequent characters in XID_Continue,
12129 and for the ASCII range, the 2.x rules apply (i.e
12130 start with letters and underscore, continue with
12131 letters, digits, underscore). However, given the current
12132 definition of XID_Start and XID_Continue, it is sufficient
12133 to check just for these, except that _ must be allowed
12134 as starting an identifier. */
12135 first = PyUnicode_READ(kind, data, 0);
12136 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12137 return 0;
12138
12139 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12140 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12141 return 0;
12142 return 1;
12143 }
12144
12145 /*[clinic input]
12146 str.isidentifier as unicode_isidentifier
12147
12148 Return True if the string is a valid Python identifier, False otherwise.
12149
12150 Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12151 "class".
12152 [clinic start generated code]*/
12153
12154 static PyObject *
unicode_isidentifier_impl(PyObject * self)12155 unicode_isidentifier_impl(PyObject *self)
12156 /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
12157 {
12158 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12159 }
12160
12161 /*[clinic input]
12162 str.isprintable as unicode_isprintable
12163
12164 Return True if the string is printable, False otherwise.
12165
12166 A string is printable if all of its characters are considered printable in
12167 repr() or if it is empty.
12168 [clinic start generated code]*/
12169
12170 static PyObject *
unicode_isprintable_impl(PyObject * self)12171 unicode_isprintable_impl(PyObject *self)
12172 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12173 {
12174 Py_ssize_t i, length;
12175 int kind;
12176 void *data;
12177
12178 if (PyUnicode_READY(self) == -1)
12179 return NULL;
12180 length = PyUnicode_GET_LENGTH(self);
12181 kind = PyUnicode_KIND(self);
12182 data = PyUnicode_DATA(self);
12183
12184 /* Shortcut for single character strings */
12185 if (length == 1)
12186 return PyBool_FromLong(
12187 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12188
12189 for (i = 0; i < length; i++) {
12190 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12191 Py_RETURN_FALSE;
12192 }
12193 }
12194 Py_RETURN_TRUE;
12195 }
12196
12197 /*[clinic input]
12198 str.join as unicode_join
12199
12200 iterable: object
12201 /
12202
12203 Concatenate any number of strings.
12204
12205 The string whose method is called is inserted in between each given string.
12206 The result is returned as a new string.
12207
12208 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12209 [clinic start generated code]*/
12210
12211 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12212 unicode_join(PyObject *self, PyObject *iterable)
12213 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12214 {
12215 return PyUnicode_Join(self, iterable);
12216 }
12217
12218 static Py_ssize_t
unicode_length(PyObject * self)12219 unicode_length(PyObject *self)
12220 {
12221 if (PyUnicode_READY(self) == -1)
12222 return -1;
12223 return PyUnicode_GET_LENGTH(self);
12224 }
12225
12226 /*[clinic input]
12227 str.ljust as unicode_ljust
12228
12229 width: Py_ssize_t
12230 fillchar: Py_UCS4 = ' '
12231 /
12232
12233 Return a left-justified string of length width.
12234
12235 Padding is done using the specified fill character (default is a space).
12236 [clinic start generated code]*/
12237
12238 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12239 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12240 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12241 {
12242 if (PyUnicode_READY(self) == -1)
12243 return NULL;
12244
12245 if (PyUnicode_GET_LENGTH(self) >= width)
12246 return unicode_result_unchanged(self);
12247
12248 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12249 }
12250
12251 /*[clinic input]
12252 str.lower as unicode_lower
12253
12254 Return a copy of the string converted to lowercase.
12255 [clinic start generated code]*/
12256
12257 static PyObject *
unicode_lower_impl(PyObject * self)12258 unicode_lower_impl(PyObject *self)
12259 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12260 {
12261 if (PyUnicode_READY(self) == -1)
12262 return NULL;
12263 if (PyUnicode_IS_ASCII(self))
12264 return ascii_upper_or_lower(self, 1);
12265 return case_operation(self, do_lower);
12266 }
12267
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each selector above, indexed by the selector value. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name that belongs to strip selector i. */
#define STRIPNAME(i) (stripfuncnames[i])
12276
12277 /* externally visible for str.strip(unicode) */
12278 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12279 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12280 {
12281 void *data;
12282 int kind;
12283 Py_ssize_t i, j, len;
12284 BLOOM_MASK sepmask;
12285 Py_ssize_t seplen;
12286
12287 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12288 return NULL;
12289
12290 kind = PyUnicode_KIND(self);
12291 data = PyUnicode_DATA(self);
12292 len = PyUnicode_GET_LENGTH(self);
12293 seplen = PyUnicode_GET_LENGTH(sepobj);
12294 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12295 PyUnicode_DATA(sepobj),
12296 seplen);
12297
12298 i = 0;
12299 if (striptype != RIGHTSTRIP) {
12300 while (i < len) {
12301 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12302 if (!BLOOM(sepmask, ch))
12303 break;
12304 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12305 break;
12306 i++;
12307 }
12308 }
12309
12310 j = len;
12311 if (striptype != LEFTSTRIP) {
12312 j--;
12313 while (j >= i) {
12314 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12315 if (!BLOOM(sepmask, ch))
12316 break;
12317 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12318 break;
12319 j--;
12320 }
12321
12322 j++;
12323 }
12324
12325 return PyUnicode_Substring(self, i, j);
12326 }
12327
12328 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12329 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12330 {
12331 unsigned char *data;
12332 int kind;
12333 Py_ssize_t length;
12334
12335 if (PyUnicode_READY(self) == -1)
12336 return NULL;
12337
12338 length = PyUnicode_GET_LENGTH(self);
12339 end = Py_MIN(end, length);
12340
12341 if (start == 0 && end == length)
12342 return unicode_result_unchanged(self);
12343
12344 if (start < 0 || end < 0) {
12345 PyErr_SetString(PyExc_IndexError, "string index out of range");
12346 return NULL;
12347 }
12348 if (start >= length || end < start)
12349 _Py_RETURN_UNICODE_EMPTY();
12350
12351 length = end - start;
12352 if (PyUnicode_IS_ASCII(self)) {
12353 data = PyUnicode_1BYTE_DATA(self);
12354 return _PyUnicode_FromASCII((char*)(data + start), length);
12355 }
12356 else {
12357 kind = PyUnicode_KIND(self);
12358 data = PyUnicode_1BYTE_DATA(self);
12359 return PyUnicode_FromKindAndData(kind,
12360 data + kind * start,
12361 length);
12362 }
12363 }
12364
12365 static PyObject *
do_strip(PyObject * self,int striptype)12366 do_strip(PyObject *self, int striptype)
12367 {
12368 Py_ssize_t len, i, j;
12369
12370 if (PyUnicode_READY(self) == -1)
12371 return NULL;
12372
12373 len = PyUnicode_GET_LENGTH(self);
12374
12375 if (PyUnicode_IS_ASCII(self)) {
12376 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12377
12378 i = 0;
12379 if (striptype != RIGHTSTRIP) {
12380 while (i < len) {
12381 Py_UCS1 ch = data[i];
12382 if (!_Py_ascii_whitespace[ch])
12383 break;
12384 i++;
12385 }
12386 }
12387
12388 j = len;
12389 if (striptype != LEFTSTRIP) {
12390 j--;
12391 while (j >= i) {
12392 Py_UCS1 ch = data[j];
12393 if (!_Py_ascii_whitespace[ch])
12394 break;
12395 j--;
12396 }
12397 j++;
12398 }
12399 }
12400 else {
12401 int kind = PyUnicode_KIND(self);
12402 void *data = PyUnicode_DATA(self);
12403
12404 i = 0;
12405 if (striptype != RIGHTSTRIP) {
12406 while (i < len) {
12407 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12408 if (!Py_UNICODE_ISSPACE(ch))
12409 break;
12410 i++;
12411 }
12412 }
12413
12414 j = len;
12415 if (striptype != LEFTSTRIP) {
12416 j--;
12417 while (j >= i) {
12418 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12419 if (!Py_UNICODE_ISSPACE(ch))
12420 break;
12421 j--;
12422 }
12423 j++;
12424 }
12425 }
12426
12427 return PyUnicode_Substring(self, i, j);
12428 }
12429
12430
12431 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12432 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12433 {
12434 if (sep != NULL && sep != Py_None) {
12435 if (PyUnicode_Check(sep))
12436 return _PyUnicode_XStrip(self, striptype, sep);
12437 else {
12438 PyErr_Format(PyExc_TypeError,
12439 "%s arg must be None or str",
12440 STRIPNAME(striptype));
12441 return NULL;
12442 }
12443 }
12444
12445 return do_strip(self, striptype);
12446 }
12447
12448
12449 /*[clinic input]
12450 str.strip as unicode_strip
12451
12452 chars: object = None
12453 /
12454
Return a copy of the string with leading and trailing whitespace removed.
12456
12457 If chars is given and not None, remove characters in chars instead.
12458 [clinic start generated code]*/
12459
12460 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12461 unicode_strip_impl(PyObject *self, PyObject *chars)
12462 /*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
12463 {
12464 return do_argstrip(self, BOTHSTRIP, chars);
12465 }
12466
12467
12468 /*[clinic input]
12469 str.lstrip as unicode_lstrip
12470
12471 chars: object = NULL
12472 /
12473
12474 Return a copy of the string with leading whitespace removed.
12475
12476 If chars is given and not None, remove characters in chars instead.
12477 [clinic start generated code]*/
12478
12479 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12480 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12481 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12482 {
12483 return do_argstrip(self, LEFTSTRIP, chars);
12484 }
12485
12486
12487 /*[clinic input]
12488 str.rstrip as unicode_rstrip
12489
12490 chars: object = NULL
12491 /
12492
12493 Return a copy of the string with trailing whitespace removed.
12494
12495 If chars is given and not None, remove characters in chars instead.
12496 [clinic start generated code]*/
12497
12498 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12499 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12500 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12501 {
12502 return do_argstrip(self, RIGHTSTRIP, chars);
12503 }
12504
12505
12506 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12507 unicode_repeat(PyObject *str, Py_ssize_t len)
12508 {
12509 PyObject *u;
12510 Py_ssize_t nchars, n;
12511
12512 if (len < 1)
12513 _Py_RETURN_UNICODE_EMPTY();
12514
12515 /* no repeat, return original string */
12516 if (len == 1)
12517 return unicode_result_unchanged(str);
12518
12519 if (PyUnicode_READY(str) == -1)
12520 return NULL;
12521
12522 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12523 PyErr_SetString(PyExc_OverflowError,
12524 "repeated string is too long");
12525 return NULL;
12526 }
12527 nchars = len * PyUnicode_GET_LENGTH(str);
12528
12529 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12530 if (!u)
12531 return NULL;
12532 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12533
12534 if (PyUnicode_GET_LENGTH(str) == 1) {
12535 const int kind = PyUnicode_KIND(str);
12536 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12537 if (kind == PyUnicode_1BYTE_KIND) {
12538 void *to = PyUnicode_DATA(u);
12539 memset(to, (unsigned char)fill_char, len);
12540 }
12541 else if (kind == PyUnicode_2BYTE_KIND) {
12542 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12543 for (n = 0; n < len; ++n)
12544 ucs2[n] = fill_char;
12545 } else {
12546 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12547 assert(kind == PyUnicode_4BYTE_KIND);
12548 for (n = 0; n < len; ++n)
12549 ucs4[n] = fill_char;
12550 }
12551 }
12552 else {
12553 /* number of characters copied this far */
12554 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12555 const Py_ssize_t char_size = PyUnicode_KIND(str);
12556 char *to = (char *) PyUnicode_DATA(u);
12557 memcpy(to, PyUnicode_DATA(str),
12558 PyUnicode_GET_LENGTH(str) * char_size);
12559 while (done < nchars) {
12560 n = (done <= nchars-done) ? done : nchars-done;
12561 memcpy(to + (done * char_size), to, n * char_size);
12562 done += n;
12563 }
12564 }
12565
12566 assert(_PyUnicode_CheckConsistency(u, 1));
12567 return u;
12568 }
12569
12570 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12571 PyUnicode_Replace(PyObject *str,
12572 PyObject *substr,
12573 PyObject *replstr,
12574 Py_ssize_t maxcount)
12575 {
12576 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12577 ensure_unicode(replstr) < 0)
12578 return NULL;
12579 return replace(str, substr, replstr, maxcount);
12580 }
12581
12582 /*[clinic input]
12583 str.replace as unicode_replace
12584
12585 old: unicode
12586 new: unicode
12587 count: Py_ssize_t = -1
12588 Maximum number of occurrences to replace.
12589 -1 (the default value) means replace all occurrences.
12590 /
12591
12592 Return a copy with all occurrences of substring old replaced by new.
12593
12594 If the optional argument count is given, only the first count occurrences are
12595 replaced.
12596 [clinic start generated code]*/
12597
12598 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12599 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12600 Py_ssize_t count)
12601 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12602 {
12603 if (PyUnicode_READY(self) == -1)
12604 return NULL;
12605 return replace(self, old, new, count);
12606 }
12607
12608 static PyObject *
unicode_repr(PyObject * unicode)12609 unicode_repr(PyObject *unicode)
12610 {
12611 PyObject *repr;
12612 Py_ssize_t isize;
12613 Py_ssize_t osize, squote, dquote, i, o;
12614 Py_UCS4 max, quote;
12615 int ikind, okind, unchanged;
12616 void *idata, *odata;
12617
12618 if (PyUnicode_READY(unicode) == -1)
12619 return NULL;
12620
12621 isize = PyUnicode_GET_LENGTH(unicode);
12622 idata = PyUnicode_DATA(unicode);
12623
12624 /* Compute length of output, quote characters, and
12625 maximum character */
12626 osize = 0;
12627 max = 127;
12628 squote = dquote = 0;
12629 ikind = PyUnicode_KIND(unicode);
12630 for (i = 0; i < isize; i++) {
12631 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12632 Py_ssize_t incr = 1;
12633 switch (ch) {
12634 case '\'': squote++; break;
12635 case '"': dquote++; break;
12636 case '\\': case '\t': case '\r': case '\n':
12637 incr = 2;
12638 break;
12639 default:
12640 /* Fast-path ASCII */
12641 if (ch < ' ' || ch == 0x7f)
12642 incr = 4; /* \xHH */
12643 else if (ch < 0x7f)
12644 ;
12645 else if (Py_UNICODE_ISPRINTABLE(ch))
12646 max = ch > max ? ch : max;
12647 else if (ch < 0x100)
12648 incr = 4; /* \xHH */
12649 else if (ch < 0x10000)
12650 incr = 6; /* \uHHHH */
12651 else
12652 incr = 10; /* \uHHHHHHHH */
12653 }
12654 if (osize > PY_SSIZE_T_MAX - incr) {
12655 PyErr_SetString(PyExc_OverflowError,
12656 "string is too long to generate repr");
12657 return NULL;
12658 }
12659 osize += incr;
12660 }
12661
12662 quote = '\'';
12663 unchanged = (osize == isize);
12664 if (squote) {
12665 unchanged = 0;
12666 if (dquote)
12667 /* Both squote and dquote present. Use squote,
12668 and escape them */
12669 osize += squote;
12670 else
12671 quote = '"';
12672 }
12673 osize += 2; /* quotes */
12674
12675 repr = PyUnicode_New(osize, max);
12676 if (repr == NULL)
12677 return NULL;
12678 okind = PyUnicode_KIND(repr);
12679 odata = PyUnicode_DATA(repr);
12680
12681 PyUnicode_WRITE(okind, odata, 0, quote);
12682 PyUnicode_WRITE(okind, odata, osize-1, quote);
12683 if (unchanged) {
12684 _PyUnicode_FastCopyCharacters(repr, 1,
12685 unicode, 0,
12686 isize);
12687 }
12688 else {
12689 for (i = 0, o = 1; i < isize; i++) {
12690 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12691
12692 /* Escape quotes and backslashes */
12693 if ((ch == quote) || (ch == '\\')) {
12694 PyUnicode_WRITE(okind, odata, o++, '\\');
12695 PyUnicode_WRITE(okind, odata, o++, ch);
12696 continue;
12697 }
12698
12699 /* Map special whitespace to '\t', \n', '\r' */
12700 if (ch == '\t') {
12701 PyUnicode_WRITE(okind, odata, o++, '\\');
12702 PyUnicode_WRITE(okind, odata, o++, 't');
12703 }
12704 else if (ch == '\n') {
12705 PyUnicode_WRITE(okind, odata, o++, '\\');
12706 PyUnicode_WRITE(okind, odata, o++, 'n');
12707 }
12708 else if (ch == '\r') {
12709 PyUnicode_WRITE(okind, odata, o++, '\\');
12710 PyUnicode_WRITE(okind, odata, o++, 'r');
12711 }
12712
12713 /* Map non-printable US ASCII to '\xhh' */
12714 else if (ch < ' ' || ch == 0x7F) {
12715 PyUnicode_WRITE(okind, odata, o++, '\\');
12716 PyUnicode_WRITE(okind, odata, o++, 'x');
12717 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12718 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12719 }
12720
12721 /* Copy ASCII characters as-is */
12722 else if (ch < 0x7F) {
12723 PyUnicode_WRITE(okind, odata, o++, ch);
12724 }
12725
12726 /* Non-ASCII characters */
12727 else {
12728 /* Map Unicode whitespace and control characters
12729 (categories Z* and C* except ASCII space)
12730 */
12731 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12732 PyUnicode_WRITE(okind, odata, o++, '\\');
12733 /* Map 8-bit characters to '\xhh' */
12734 if (ch <= 0xff) {
12735 PyUnicode_WRITE(okind, odata, o++, 'x');
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12737 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12738 }
12739 /* Map 16-bit characters to '\uxxxx' */
12740 else if (ch <= 0xffff) {
12741 PyUnicode_WRITE(okind, odata, o++, 'u');
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12745 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12746 }
12747 /* Map 21-bit characters to '\U00xxxxxx' */
12748 else {
12749 PyUnicode_WRITE(okind, odata, o++, 'U');
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12757 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12758 }
12759 }
12760 /* Copy characters as-is */
12761 else {
12762 PyUnicode_WRITE(okind, odata, o++, ch);
12763 }
12764 }
12765 }
12766 }
12767 /* Closing quote already added at the beginning */
12768 assert(_PyUnicode_CheckConsistency(repr, 1));
12769 return repr;
12770 }
12771
12772 PyDoc_STRVAR(rfind__doc__,
12773 "S.rfind(sub[, start[, end]]) -> int\n\
12774 \n\
12775 Return the highest index in S where substring sub is found,\n\
12776 such that sub is contained within S[start:end]. Optional\n\
12777 arguments start and end are interpreted as in slice notation.\n\
12778 \n\
12779 Return -1 on failure.");
12780
12781 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12782 unicode_rfind(PyObject *self, PyObject *args)
12783 {
12784 /* initialize variables to prevent gcc warning */
12785 PyObject *substring = NULL;
12786 Py_ssize_t start = 0;
12787 Py_ssize_t end = 0;
12788 Py_ssize_t result;
12789
12790 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12791 return NULL;
12792
12793 if (PyUnicode_READY(self) == -1)
12794 return NULL;
12795
12796 result = any_find_slice(self, substring, start, end, -1);
12797
12798 if (result == -2)
12799 return NULL;
12800
12801 return PyLong_FromSsize_t(result);
12802 }
12803
12804 PyDoc_STRVAR(rindex__doc__,
12805 "S.rindex(sub[, start[, end]]) -> int\n\
12806 \n\
12807 Return the highest index in S where substring sub is found,\n\
12808 such that sub is contained within S[start:end]. Optional\n\
12809 arguments start and end are interpreted as in slice notation.\n\
12810 \n\
12811 Raises ValueError when the substring is not found.");
12812
12813 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12814 unicode_rindex(PyObject *self, PyObject *args)
12815 {
12816 /* initialize variables to prevent gcc warning */
12817 PyObject *substring = NULL;
12818 Py_ssize_t start = 0;
12819 Py_ssize_t end = 0;
12820 Py_ssize_t result;
12821
12822 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12823 return NULL;
12824
12825 if (PyUnicode_READY(self) == -1)
12826 return NULL;
12827
12828 result = any_find_slice(self, substring, start, end, -1);
12829
12830 if (result == -2)
12831 return NULL;
12832
12833 if (result < 0) {
12834 PyErr_SetString(PyExc_ValueError, "substring not found");
12835 return NULL;
12836 }
12837
12838 return PyLong_FromSsize_t(result);
12839 }
12840
12841 /*[clinic input]
12842 str.rjust as unicode_rjust
12843
12844 width: Py_ssize_t
12845 fillchar: Py_UCS4 = ' '
12846 /
12847
12848 Return a right-justified string of length width.
12849
12850 Padding is done using the specified fill character (default is a space).
12851 [clinic start generated code]*/
12852
12853 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12854 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12855 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12856 {
12857 if (PyUnicode_READY(self) == -1)
12858 return NULL;
12859
12860 if (PyUnicode_GET_LENGTH(self) >= width)
12861 return unicode_result_unchanged(self);
12862
12863 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12864 }
12865
12866 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12867 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12868 {
12869 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12870 return NULL;
12871
12872 return split(s, sep, maxsplit);
12873 }
12874
12875 /*[clinic input]
12876 str.split as unicode_split
12877
12878 sep: object = None
The delimiter according to which to split the string.
12880 None (the default value) means split according to any whitespace,
12881 and discard empty strings from the result.
12882 maxsplit: Py_ssize_t = -1
12883 Maximum number of splits to do.
12884 -1 (the default value) means no limit.
12885
12886 Return a list of the words in the string, using sep as the delimiter string.
12887 [clinic start generated code]*/
12888
12889 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12890 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12891 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12892 {
12893 if (sep == Py_None)
12894 return split(self, NULL, maxsplit);
12895 if (PyUnicode_Check(sep))
12896 return split(self, sep, maxsplit);
12897
12898 PyErr_Format(PyExc_TypeError,
12899 "must be str or None, not %.100s",
12900 Py_TYPE(sep)->tp_name);
12901 return NULL;
12902 }
12903
12904 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12905 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12906 {
12907 PyObject* out;
12908 int kind1, kind2;
12909 void *buf1, *buf2;
12910 Py_ssize_t len1, len2;
12911
12912 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12913 return NULL;
12914
12915 kind1 = PyUnicode_KIND(str_obj);
12916 kind2 = PyUnicode_KIND(sep_obj);
12917 len1 = PyUnicode_GET_LENGTH(str_obj);
12918 len2 = PyUnicode_GET_LENGTH(sep_obj);
12919 if (kind1 < kind2 || len1 < len2) {
12920 _Py_INCREF_UNICODE_EMPTY();
12921 if (!unicode_empty)
12922 out = NULL;
12923 else {
12924 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12925 Py_DECREF(unicode_empty);
12926 }
12927 return out;
12928 }
12929 buf1 = PyUnicode_DATA(str_obj);
12930 buf2 = PyUnicode_DATA(sep_obj);
12931 if (kind2 != kind1) {
12932 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12933 if (!buf2)
12934 return NULL;
12935 }
12936
12937 switch (kind1) {
12938 case PyUnicode_1BYTE_KIND:
12939 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12940 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12941 else
12942 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12943 break;
12944 case PyUnicode_2BYTE_KIND:
12945 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12946 break;
12947 case PyUnicode_4BYTE_KIND:
12948 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12949 break;
12950 default:
12951 Py_UNREACHABLE();
12952 }
12953
12954 if (kind2 != kind1)
12955 PyMem_Free(buf2);
12956
12957 return out;
12958 }
12959
12960
12961 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12962 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12963 {
12964 PyObject* out;
12965 int kind1, kind2;
12966 void *buf1, *buf2;
12967 Py_ssize_t len1, len2;
12968
12969 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12970 return NULL;
12971
12972 kind1 = PyUnicode_KIND(str_obj);
12973 kind2 = PyUnicode_KIND(sep_obj);
12974 len1 = PyUnicode_GET_LENGTH(str_obj);
12975 len2 = PyUnicode_GET_LENGTH(sep_obj);
12976 if (kind1 < kind2 || len1 < len2) {
12977 _Py_INCREF_UNICODE_EMPTY();
12978 if (!unicode_empty)
12979 out = NULL;
12980 else {
12981 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12982 Py_DECREF(unicode_empty);
12983 }
12984 return out;
12985 }
12986 buf1 = PyUnicode_DATA(str_obj);
12987 buf2 = PyUnicode_DATA(sep_obj);
12988 if (kind2 != kind1) {
12989 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12990 if (!buf2)
12991 return NULL;
12992 }
12993
12994 switch (kind1) {
12995 case PyUnicode_1BYTE_KIND:
12996 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12997 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12998 else
12999 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13000 break;
13001 case PyUnicode_2BYTE_KIND:
13002 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13003 break;
13004 case PyUnicode_4BYTE_KIND:
13005 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13006 break;
13007 default:
13008 Py_UNREACHABLE();
13009 }
13010
13011 if (kind2 != kind1)
13012 PyMem_Free(buf2);
13013
13014 return out;
13015 }
13016
13017 /*[clinic input]
13018 str.partition as unicode_partition
13019
13020 sep: object
13021 /
13022
13023 Partition the string into three parts using the given separator.
13024
13025 This will search for the separator in the string. If the separator is found,
13026 returns a 3-tuple containing the part before the separator, the separator
13027 itself, and the part after it.
13028
13029 If the separator is not found, returns a 3-tuple containing the original string
13030 and two empty strings.
13031 [clinic start generated code]*/
13032
13033 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13034 unicode_partition(PyObject *self, PyObject *sep)
13035 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13036 {
13037 return PyUnicode_Partition(self, sep);
13038 }
13039
13040 /*[clinic input]
13041 str.rpartition as unicode_rpartition = str.partition
13042
13043 Partition the string into three parts using the given separator.
13044
13045 This will search for the separator in the string, starting at the end. If
13046 the separator is found, returns a 3-tuple containing the part before the
13047 separator, the separator itself, and the part after it.
13048
13049 If the separator is not found, returns a 3-tuple containing two empty strings
13050 and the original string.
13051 [clinic start generated code]*/
13052
13053 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13054 unicode_rpartition(PyObject *self, PyObject *sep)
13055 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13056 {
13057 return PyUnicode_RPartition(self, sep);
13058 }
13059
13060 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13061 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13062 {
13063 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13064 return NULL;
13065
13066 return rsplit(s, sep, maxsplit);
13067 }
13068
13069 /*[clinic input]
13070 str.rsplit as unicode_rsplit = str.split
13071
13072 Return a list of the words in the string, using sep as the delimiter string.
13073
13074 Splits are done starting at the end of the string and working to the front.
13075 [clinic start generated code]*/
13076
13077 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13078 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13079 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13080 {
13081 if (sep == Py_None)
13082 return rsplit(self, NULL, maxsplit);
13083 if (PyUnicode_Check(sep))
13084 return rsplit(self, sep, maxsplit);
13085
13086 PyErr_Format(PyExc_TypeError,
13087 "must be str or None, not %.100s",
13088 Py_TYPE(sep)->tp_name);
13089 return NULL;
13090 }
13091
13092 /*[clinic input]
13093 str.splitlines as unicode_splitlines
13094
13095 keepends: bool(accept={int}) = False
13096
13097 Return a list of the lines in the string, breaking at line boundaries.
13098
13099 Line breaks are not included in the resulting list unless keepends is given and
13100 true.
13101 [clinic start generated code]*/
13102
13103 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13104 unicode_splitlines_impl(PyObject *self, int keepends)
13105 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13106 {
13107 return PyUnicode_Splitlines(self, keepends);
13108 }
13109
13110 static
unicode_str(PyObject * self)13111 PyObject *unicode_str(PyObject *self)
13112 {
13113 return unicode_result_unchanged(self);
13114 }
13115
13116 /*[clinic input]
13117 str.swapcase as unicode_swapcase
13118
13119 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13120 [clinic start generated code]*/
13121
13122 static PyObject *
unicode_swapcase_impl(PyObject * self)13123 unicode_swapcase_impl(PyObject *self)
13124 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13125 {
13126 if (PyUnicode_READY(self) == -1)
13127 return NULL;
13128 return case_operation(self, do_swapcase);
13129 }
13130
13131 /*[clinic input]
13132
13133 @staticmethod
13134 str.maketrans as unicode_maketrans
13135
13136 x: object
13137
13138 y: unicode=NULL
13139
13140 z: unicode=NULL
13141
13142 /
13143
13144 Return a translation table usable for str.translate().
13145
13146 If there is only one argument, it must be a dictionary mapping Unicode
13147 ordinals (integers) or characters to Unicode ordinals, strings or None.
13148 Character keys will be then converted to ordinals.
13149 If there are two arguments, they must be strings of equal length, and
13150 in the resulting dictionary, each character in x will be mapped to the
13151 character at the same position in y. If there is a third argument, it
13152 must be a string, whose characters will be mapped to None in the result.
13153 [clinic start generated code]*/
13154
13155 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13156 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13157 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13158 {
13159 PyObject *new = NULL, *key, *value;
13160 Py_ssize_t i = 0;
13161 int res;
13162
13163 new = PyDict_New();
13164 if (!new)
13165 return NULL;
13166 if (y != NULL) {
13167 int x_kind, y_kind, z_kind;
13168 void *x_data, *y_data, *z_data;
13169
13170 /* x must be a string too, of equal length */
13171 if (!PyUnicode_Check(x)) {
13172 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13173 "be a string if there is a second argument");
13174 goto err;
13175 }
13176 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13177 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13178 "arguments must have equal length");
13179 goto err;
13180 }
13181 /* create entries for translating chars in x to those in y */
13182 x_kind = PyUnicode_KIND(x);
13183 y_kind = PyUnicode_KIND(y);
13184 x_data = PyUnicode_DATA(x);
13185 y_data = PyUnicode_DATA(y);
13186 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13187 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13188 if (!key)
13189 goto err;
13190 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13191 if (!value) {
13192 Py_DECREF(key);
13193 goto err;
13194 }
13195 res = PyDict_SetItem(new, key, value);
13196 Py_DECREF(key);
13197 Py_DECREF(value);
13198 if (res < 0)
13199 goto err;
13200 }
13201 /* create entries for deleting chars in z */
13202 if (z != NULL) {
13203 z_kind = PyUnicode_KIND(z);
13204 z_data = PyUnicode_DATA(z);
13205 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13206 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13207 if (!key)
13208 goto err;
13209 res = PyDict_SetItem(new, key, Py_None);
13210 Py_DECREF(key);
13211 if (res < 0)
13212 goto err;
13213 }
13214 }
13215 } else {
13216 int kind;
13217 void *data;
13218
13219 /* x must be a dict */
13220 if (!PyDict_CheckExact(x)) {
13221 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13222 "to maketrans it must be a dict");
13223 goto err;
13224 }
13225 /* copy entries into the new dict, converting string keys to int keys */
13226 while (PyDict_Next(x, &i, &key, &value)) {
13227 if (PyUnicode_Check(key)) {
13228 /* convert string keys to integer keys */
13229 PyObject *newkey;
13230 if (PyUnicode_GET_LENGTH(key) != 1) {
13231 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13232 "table must be of length 1");
13233 goto err;
13234 }
13235 kind = PyUnicode_KIND(key);
13236 data = PyUnicode_DATA(key);
13237 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13238 if (!newkey)
13239 goto err;
13240 res = PyDict_SetItem(new, newkey, value);
13241 Py_DECREF(newkey);
13242 if (res < 0)
13243 goto err;
13244 } else if (PyLong_Check(key)) {
13245 /* just keep integer keys */
13246 if (PyDict_SetItem(new, key, value) < 0)
13247 goto err;
13248 } else {
13249 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13250 "be strings or integers");
13251 goto err;
13252 }
13253 }
13254 }
13255 return new;
13256 err:
13257 Py_DECREF(new);
13258 return NULL;
13259 }
13260
13261 /*[clinic input]
13262 str.translate as unicode_translate
13263
13264 table: object
13265 Translation table, which must be a mapping of Unicode ordinals to
13266 Unicode ordinals, strings, or None.
13267 /
13268
13269 Replace each character in the string using the given translation table.
13270
13271 The table must implement lookup/indexing via __getitem__, for instance a
13272 dictionary or list. If this operation raises LookupError, the character is
13273 left untouched. Characters mapped to None are deleted.
13274 [clinic start generated code]*/
13275
13276 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13277 unicode_translate(PyObject *self, PyObject *table)
13278 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13279 {
13280 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13281 }
13282
13283 /*[clinic input]
13284 str.upper as unicode_upper
13285
13286 Return a copy of the string converted to uppercase.
13287 [clinic start generated code]*/
13288
13289 static PyObject *
unicode_upper_impl(PyObject * self)13290 unicode_upper_impl(PyObject *self)
13291 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13292 {
13293 if (PyUnicode_READY(self) == -1)
13294 return NULL;
13295 if (PyUnicode_IS_ASCII(self))
13296 return ascii_upper_or_lower(self, 0);
13297 return case_operation(self, do_upper);
13298 }
13299
13300 /*[clinic input]
13301 str.zfill as unicode_zfill
13302
13303 width: Py_ssize_t
13304 /
13305
13306 Pad a numeric string with zeros on the left, to fill a field of the given width.
13307
13308 The string is never truncated.
13309 [clinic start generated code]*/
13310
13311 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13312 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13313 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13314 {
13315 Py_ssize_t fill;
13316 PyObject *u;
13317 int kind;
13318 void *data;
13319 Py_UCS4 chr;
13320
13321 if (PyUnicode_READY(self) == -1)
13322 return NULL;
13323
13324 if (PyUnicode_GET_LENGTH(self) >= width)
13325 return unicode_result_unchanged(self);
13326
13327 fill = width - PyUnicode_GET_LENGTH(self);
13328
13329 u = pad(self, fill, 0, '0');
13330
13331 if (u == NULL)
13332 return NULL;
13333
13334 kind = PyUnicode_KIND(u);
13335 data = PyUnicode_DATA(u);
13336 chr = PyUnicode_READ(kind, data, fill);
13337
13338 if (chr == '+' || chr == '-') {
13339 /* move sign to beginning of string */
13340 PyUnicode_WRITE(kind, data, 0, chr);
13341 PyUnicode_WRITE(kind, data, fill, '0');
13342 }
13343
13344 assert(_PyUnicode_CheckConsistency(u, 1));
13345 return u;
13346 }
13347
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, simply returns
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13355
/* Docstring text for the "startswith" entry of the method table. */
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
13363
13364 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13365 unicode_startswith(PyObject *self,
13366 PyObject *args)
13367 {
13368 PyObject *subobj;
13369 PyObject *substring;
13370 Py_ssize_t start = 0;
13371 Py_ssize_t end = PY_SSIZE_T_MAX;
13372 int result;
13373
13374 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13375 return NULL;
13376 if (PyTuple_Check(subobj)) {
13377 Py_ssize_t i;
13378 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13379 substring = PyTuple_GET_ITEM(subobj, i);
13380 if (!PyUnicode_Check(substring)) {
13381 PyErr_Format(PyExc_TypeError,
13382 "tuple for startswith must only contain str, "
13383 "not %.100s",
13384 Py_TYPE(substring)->tp_name);
13385 return NULL;
13386 }
13387 result = tailmatch(self, substring, start, end, -1);
13388 if (result == -1)
13389 return NULL;
13390 if (result) {
13391 Py_RETURN_TRUE;
13392 }
13393 }
13394 /* nothing matched */
13395 Py_RETURN_FALSE;
13396 }
13397 if (!PyUnicode_Check(subobj)) {
13398 PyErr_Format(PyExc_TypeError,
13399 "startswith first arg must be str or "
13400 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13401 return NULL;
13402 }
13403 result = tailmatch(self, subobj, start, end, -1);
13404 if (result == -1)
13405 return NULL;
13406 return PyBool_FromLong(result);
13407 }
13408
13409
/* Docstring text for the "endswith" entry of the method table. */
PyDoc_STRVAR(endswith__doc__,
"S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13417
13418 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13419 unicode_endswith(PyObject *self,
13420 PyObject *args)
13421 {
13422 PyObject *subobj;
13423 PyObject *substring;
13424 Py_ssize_t start = 0;
13425 Py_ssize_t end = PY_SSIZE_T_MAX;
13426 int result;
13427
13428 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13429 return NULL;
13430 if (PyTuple_Check(subobj)) {
13431 Py_ssize_t i;
13432 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13433 substring = PyTuple_GET_ITEM(subobj, i);
13434 if (!PyUnicode_Check(substring)) {
13435 PyErr_Format(PyExc_TypeError,
13436 "tuple for endswith must only contain str, "
13437 "not %.100s",
13438 Py_TYPE(substring)->tp_name);
13439 return NULL;
13440 }
13441 result = tailmatch(self, substring, start, end, +1);
13442 if (result == -1)
13443 return NULL;
13444 if (result) {
13445 Py_RETURN_TRUE;
13446 }
13447 }
13448 Py_RETURN_FALSE;
13449 }
13450 if (!PyUnicode_Check(subobj)) {
13451 PyErr_Format(PyExc_TypeError,
13452 "endswith first arg must be str or "
13453 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13454 return NULL;
13455 }
13456 result = tailmatch(self, subobj, start, end, +1);
13457 if (result == -1)
13458 return NULL;
13459 return PyBool_FromLong(result);
13460 }
13461
13462 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13463 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13464 {
13465 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13466 writer->data = PyUnicode_DATA(writer->buffer);
13467
13468 if (!writer->readonly) {
13469 writer->kind = PyUnicode_KIND(writer->buffer);
13470 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13471 }
13472 else {
13473 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13474 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13475 writer->kind = PyUnicode_WCHAR_KIND;
13476 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13477
13478 /* Copy-on-write mode: set buffer size to 0 so
13479 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13480 * next write. */
13481 writer->size = 0;
13482 }
13483 }
13484
13485 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13486 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13487 {
13488 memset(writer, 0, sizeof(*writer));
13489
13490 /* ASCII is the bare minimum */
13491 writer->min_char = 127;
13492
13493 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13494 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13495 writer->kind = PyUnicode_WCHAR_KIND;
13496 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13497 }
13498
13499 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13500 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13501 Py_ssize_t length, Py_UCS4 maxchar)
13502 {
13503 Py_ssize_t newlen;
13504 PyObject *newbuffer;
13505
13506 assert(maxchar <= MAX_UNICODE);
13507
13508 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13509 assert((maxchar > writer->maxchar && length >= 0)
13510 || length > 0);
13511
13512 if (length > PY_SSIZE_T_MAX - writer->pos) {
13513 PyErr_NoMemory();
13514 return -1;
13515 }
13516 newlen = writer->pos + length;
13517
13518 maxchar = Py_MAX(maxchar, writer->min_char);
13519
13520 if (writer->buffer == NULL) {
13521 assert(!writer->readonly);
13522 if (writer->overallocate
13523 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13524 /* overallocate to limit the number of realloc() */
13525 newlen += newlen / OVERALLOCATE_FACTOR;
13526 }
13527 if (newlen < writer->min_length)
13528 newlen = writer->min_length;
13529
13530 writer->buffer = PyUnicode_New(newlen, maxchar);
13531 if (writer->buffer == NULL)
13532 return -1;
13533 }
13534 else if (newlen > writer->size) {
13535 if (writer->overallocate
13536 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13537 /* overallocate to limit the number of realloc() */
13538 newlen += newlen / OVERALLOCATE_FACTOR;
13539 }
13540 if (newlen < writer->min_length)
13541 newlen = writer->min_length;
13542
13543 if (maxchar > writer->maxchar || writer->readonly) {
13544 /* resize + widen */
13545 maxchar = Py_MAX(maxchar, writer->maxchar);
13546 newbuffer = PyUnicode_New(newlen, maxchar);
13547 if (newbuffer == NULL)
13548 return -1;
13549 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13550 writer->buffer, 0, writer->pos);
13551 Py_DECREF(writer->buffer);
13552 writer->readonly = 0;
13553 }
13554 else {
13555 newbuffer = resize_compact(writer->buffer, newlen);
13556 if (newbuffer == NULL)
13557 return -1;
13558 }
13559 writer->buffer = newbuffer;
13560 }
13561 else if (maxchar > writer->maxchar) {
13562 assert(!writer->readonly);
13563 newbuffer = PyUnicode_New(writer->size, maxchar);
13564 if (newbuffer == NULL)
13565 return -1;
13566 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13567 writer->buffer, 0, writer->pos);
13568 Py_SETREF(writer->buffer, newbuffer);
13569 }
13570 _PyUnicodeWriter_Update(writer);
13571 return 0;
13572
13573 #undef OVERALLOCATE_FACTOR
13574 }
13575
13576 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13577 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13578 enum PyUnicode_Kind kind)
13579 {
13580 Py_UCS4 maxchar;
13581
13582 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13583 assert(writer->kind < kind);
13584
13585 switch (kind)
13586 {
13587 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13588 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13589 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13590 default:
13591 Py_UNREACHABLE();
13592 }
13593
13594 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13595 }
13596
13597 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13598 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13599 {
13600 assert(ch <= MAX_UNICODE);
13601 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13602 return -1;
13603 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13604 writer->pos++;
13605 return 0;
13606 }
13607
13608 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13609 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13610 {
13611 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13612 }
13613
13614 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13615 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13616 {
13617 Py_UCS4 maxchar;
13618 Py_ssize_t len;
13619
13620 if (PyUnicode_READY(str) == -1)
13621 return -1;
13622 len = PyUnicode_GET_LENGTH(str);
13623 if (len == 0)
13624 return 0;
13625 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13626 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13627 if (writer->buffer == NULL && !writer->overallocate) {
13628 assert(_PyUnicode_CheckConsistency(str, 1));
13629 writer->readonly = 1;
13630 Py_INCREF(str);
13631 writer->buffer = str;
13632 _PyUnicodeWriter_Update(writer);
13633 writer->pos += len;
13634 return 0;
13635 }
13636 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13637 return -1;
13638 }
13639 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13640 str, 0, len);
13641 writer->pos += len;
13642 return 0;
13643 }
13644
13645 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13646 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13647 Py_ssize_t start, Py_ssize_t end)
13648 {
13649 Py_UCS4 maxchar;
13650 Py_ssize_t len;
13651
13652 if (PyUnicode_READY(str) == -1)
13653 return -1;
13654
13655 assert(0 <= start);
13656 assert(end <= PyUnicode_GET_LENGTH(str));
13657 assert(start <= end);
13658
13659 if (end == 0)
13660 return 0;
13661
13662 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13663 return _PyUnicodeWriter_WriteStr(writer, str);
13664
13665 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13666 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13667 else
13668 maxchar = writer->maxchar;
13669 len = end - start;
13670
13671 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13672 return -1;
13673
13674 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13675 str, start, len);
13676 writer->pos += len;
13677 return 0;
13678 }
13679
13680 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13681 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13682 const char *ascii, Py_ssize_t len)
13683 {
13684 if (len == -1)
13685 len = strlen(ascii);
13686
13687 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13688
13689 if (writer->buffer == NULL && !writer->overallocate) {
13690 PyObject *str;
13691
13692 str = _PyUnicode_FromASCII(ascii, len);
13693 if (str == NULL)
13694 return -1;
13695
13696 writer->readonly = 1;
13697 writer->buffer = str;
13698 _PyUnicodeWriter_Update(writer);
13699 writer->pos += len;
13700 return 0;
13701 }
13702
13703 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13704 return -1;
13705
13706 switch (writer->kind)
13707 {
13708 case PyUnicode_1BYTE_KIND:
13709 {
13710 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13711 Py_UCS1 *data = writer->data;
13712
13713 memcpy(data + writer->pos, str, len);
13714 break;
13715 }
13716 case PyUnicode_2BYTE_KIND:
13717 {
13718 _PyUnicode_CONVERT_BYTES(
13719 Py_UCS1, Py_UCS2,
13720 ascii, ascii + len,
13721 (Py_UCS2 *)writer->data + writer->pos);
13722 break;
13723 }
13724 case PyUnicode_4BYTE_KIND:
13725 {
13726 _PyUnicode_CONVERT_BYTES(
13727 Py_UCS1, Py_UCS4,
13728 ascii, ascii + len,
13729 (Py_UCS4 *)writer->data + writer->pos);
13730 break;
13731 }
13732 default:
13733 Py_UNREACHABLE();
13734 }
13735
13736 writer->pos += len;
13737 return 0;
13738 }
13739
13740 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13741 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13742 const char *str, Py_ssize_t len)
13743 {
13744 Py_UCS4 maxchar;
13745
13746 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13747 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13748 return -1;
13749 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13750 writer->pos += len;
13751 return 0;
13752 }
13753
13754 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13755 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13756 {
13757 PyObject *str;
13758
13759 if (writer->pos == 0) {
13760 Py_CLEAR(writer->buffer);
13761 _Py_RETURN_UNICODE_EMPTY();
13762 }
13763
13764 str = writer->buffer;
13765 writer->buffer = NULL;
13766
13767 if (writer->readonly) {
13768 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13769 return str;
13770 }
13771
13772 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13773 PyObject *str2;
13774 str2 = resize_compact(str, writer->pos);
13775 if (str2 == NULL) {
13776 Py_DECREF(str);
13777 return NULL;
13778 }
13779 str = str2;
13780 }
13781
13782 assert(_PyUnicode_CheckConsistency(str, 1));
13783 return unicode_result_ready(str);
13784 }
13785
13786 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13787 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13788 {
13789 Py_CLEAR(writer->buffer);
13790 }
13791
13792 #include "stringlib/unicode_format.h"
13793
/* Docstring text for the "format" entry of the method table. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring text for the "format_map" entry of the method table. */
PyDoc_STRVAR(format_map__doc__,
"S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13805
13806 /*[clinic input]
13807 str.__format__ as unicode___format__
13808
13809 format_spec: unicode
13810 /
13811
13812 Return a formatted version of the string as described by format_spec.
13813 [clinic start generated code]*/
13814
13815 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13816 unicode___format___impl(PyObject *self, PyObject *format_spec)
13817 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13818 {
13819 _PyUnicodeWriter writer;
13820 int ret;
13821
13822 if (PyUnicode_READY(self) == -1)
13823 return NULL;
13824 _PyUnicodeWriter_Init(&writer);
13825 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13826 self, format_spec, 0,
13827 PyUnicode_GET_LENGTH(format_spec));
13828 if (ret == -1) {
13829 _PyUnicodeWriter_Dealloc(&writer);
13830 return NULL;
13831 }
13832 return _PyUnicodeWriter_Finish(&writer);
13833 }
13834
13835 /*[clinic input]
13836 str.__sizeof__ as unicode_sizeof
13837
13838 Return the size of the string in memory, in bytes.
13839 [clinic start generated code]*/
13840
13841 static PyObject *
unicode_sizeof_impl(PyObject * self)13842 unicode_sizeof_impl(PyObject *self)
13843 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13844 {
13845 Py_ssize_t size;
13846
13847 /* If it's a compact object, account for base structure +
13848 character data. */
13849 if (PyUnicode_IS_COMPACT_ASCII(self))
13850 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13851 else if (PyUnicode_IS_COMPACT(self))
13852 size = sizeof(PyCompactUnicodeObject) +
13853 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13854 else {
13855 /* If it is a two-block object, account for base object, and
13856 for character block if present. */
13857 size = sizeof(PyUnicodeObject);
13858 if (_PyUnicode_DATA_ANY(self))
13859 size += (PyUnicode_GET_LENGTH(self) + 1) *
13860 PyUnicode_KIND(self);
13861 }
13862 /* If the wstr pointer is present, account for it unless it is shared
13863 with the data pointer. Check if the data is not shared. */
13864 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13865 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13866 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13867 size += PyUnicode_UTF8_LENGTH(self) + 1;
13868
13869 return PyLong_FromSsize_t(size);
13870 }
13871
13872 static PyObject *
unicode_getnewargs(PyObject * v)13873 unicode_getnewargs(PyObject *v)
13874 {
13875 PyObject *copy = _PyUnicode_Copy(v);
13876 if (!copy)
13877 return NULL;
13878 return Py_BuildValue("(N)", copy);
13879 }
13880
/* Method table for the str type.  The UNICODE_*_METHODDEF entries are
   macros defined outside this chunk (presumably by Argument Clinic --
   confirm against the generated header); the explicit entries are
   hand-written wrappers with their own docstrings.  The table ends with
   a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* __getnewargs__ is registered without a docstring */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL,              NULL}
};
13937
13938 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13939 unicode_mod(PyObject *v, PyObject *w)
13940 {
13941 if (!PyUnicode_Check(v))
13942 Py_RETURN_NOTIMPLEMENTED;
13943 return PyUnicode_Format(v, w);
13944 }
13945
/* Number protocol table: only nb_remainder is set, to unicode_mod.  The
   slots listed before it are 0, and the slots after it are left to
   static zero-initialisation. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13952
/* Sequence protocol table: length, concatenation, repetition, item
   access and containment are provided; the slice and item-assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13963
13964 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13965 unicode_subscript(PyObject* self, PyObject* item)
13966 {
13967 if (PyUnicode_READY(self) == -1)
13968 return NULL;
13969
13970 if (PyIndex_Check(item)) {
13971 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13972 if (i == -1 && PyErr_Occurred())
13973 return NULL;
13974 if (i < 0)
13975 i += PyUnicode_GET_LENGTH(self);
13976 return unicode_getitem(self, i);
13977 } else if (PySlice_Check(item)) {
13978 Py_ssize_t start, stop, step, slicelength, cur, i;
13979 PyObject *result;
13980 void *src_data, *dest_data;
13981 int src_kind, dest_kind;
13982 Py_UCS4 ch, max_char, kind_limit;
13983
13984 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13985 return NULL;
13986 }
13987 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13988 &start, &stop, step);
13989
13990 if (slicelength <= 0) {
13991 _Py_RETURN_UNICODE_EMPTY();
13992 } else if (start == 0 && step == 1 &&
13993 slicelength == PyUnicode_GET_LENGTH(self)) {
13994 return unicode_result_unchanged(self);
13995 } else if (step == 1) {
13996 return PyUnicode_Substring(self,
13997 start, start + slicelength);
13998 }
13999 /* General case */
14000 src_kind = PyUnicode_KIND(self);
14001 src_data = PyUnicode_DATA(self);
14002 if (!PyUnicode_IS_ASCII(self)) {
14003 kind_limit = kind_maxchar_limit(src_kind);
14004 max_char = 0;
14005 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14006 ch = PyUnicode_READ(src_kind, src_data, cur);
14007 if (ch > max_char) {
14008 max_char = ch;
14009 if (max_char >= kind_limit)
14010 break;
14011 }
14012 }
14013 }
14014 else
14015 max_char = 127;
14016 result = PyUnicode_New(slicelength, max_char);
14017 if (result == NULL)
14018 return NULL;
14019 dest_kind = PyUnicode_KIND(result);
14020 dest_data = PyUnicode_DATA(result);
14021
14022 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14023 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14024 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14025 }
14026 assert(_PyUnicode_CheckConsistency(result, 1));
14027 return result;
14028 } else {
14029 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14030 return NULL;
14031 }
14032 }
14033
/* Mapping protocol table: length and subscript are provided; the
   subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14039
14040
14041 /* Helpers for PyUnicode_Format() */
14042
/* State shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source.  unicode_format_getnextarg() returns args itself
       when arglen < 0 and otherwise reads item argidx with
       PyTuple_GetItem(). */
    PyObject *args;
    int args_owned;     /* NOTE(review): presumably set when args must be
                           released by the formatter -- confirm */
    /* argidx is the next argument to hand out; arguments are available
       while argidx < arglen. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* NOTE(review): presumably the mapping used for
                           %(name)s lookups -- confirm */

    /* NOTE(review): the fmt* fields presumably describe the format
       string (kind, remaining count, position, data, object) --
       confirm against PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;
};
14056
/* One parsed format directive, as passed to the per-type formatters. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it
                           to PyOS_double_to_string() as the format code */
    int flags;          /* bit flags; formatfloat() tests F_ALT */
    Py_ssize_t width;   /* NOTE(review): presumably the minimum field
                           width -- confirm */
    int prec;           /* precision; formatfloat() treats a negative
                           value as "use 6" */
    int sign;           /* NOTE(review): meaning not visible in this
                           chunk -- confirm */
};
14064
14065 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14066 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14067 {
14068 Py_ssize_t argidx = ctx->argidx;
14069
14070 if (argidx < ctx->arglen) {
14071 ctx->argidx++;
14072 if (ctx->arglen < 0)
14073 return ctx->args;
14074 else
14075 return PyTuple_GetItem(ctx->args, argidx);
14076 }
14077 PyErr_SetString(PyExc_TypeError,
14078 "not enough arguments for format string");
14079 return NULL;
14080 }
14081
14082 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14083
14084 /* Format a float into the writer if the writer is not NULL, or into *p_output
14085 otherwise.
14086
14087 Return 0 on success, raise an exception and return -1 on error. */
14088 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14089 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14090 PyObject **p_output,
14091 _PyUnicodeWriter *writer)
14092 {
14093 char *p;
14094 double x;
14095 Py_ssize_t len;
14096 int prec;
14097 int dtoa_flags;
14098
14099 x = PyFloat_AsDouble(v);
14100 if (x == -1.0 && PyErr_Occurred())
14101 return -1;
14102
14103 prec = arg->prec;
14104 if (prec < 0)
14105 prec = 6;
14106
14107 if (arg->flags & F_ALT)
14108 dtoa_flags = Py_DTSF_ALT;
14109 else
14110 dtoa_flags = 0;
14111 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14112 if (p == NULL)
14113 return -1;
14114 len = strlen(p);
14115 if (writer) {
14116 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14117 PyMem_Free(p);
14118 return -1;
14119 }
14120 }
14121 else
14122 *p_output = _PyUnicode_FromASCII(p, len);
14123 PyMem_Free(p);
14124 return 0;
14125 }
14126
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate ("#") form, for Python ints.
 * Return value:  a new reference to a str object, or NULL if error.
 * The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 * The base marker is present only for o, x and X conversions when alt is
 * non-zero.  The case of hex digits (and of the marker) follows the type.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *      val         object to be converted; must satisfy PyLong_Check()
 *      alt         non-zero to keep the base marker
 *      prec        minimum number of digits; 0-fill on left if needed
 *      type        a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 *           values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep the later int addition `numnondigits + prec` in range:
       numnondigits is at most 3 (two marker characters plus a sign). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the textual form, including any base marker; the marker is
       stripped further down when alt is zero. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below uses int, so reject anything longer. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; if there was a sign it now has to be
           rewritten at the new start of the text. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: first the
           non-digit prefix, then the zeroes, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        /* From here on `result` is the bytes object and `buf` its data. */
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text no longer lives at the start of a str object (it was
       padded into bytes, or the marker was skipped), copy it into a fresh
       str; otherwise only shrink the str when its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14272
14273 /* Format an integer or a float as an integer.
14274 * Return 1 if the number has been formatted into the writer,
14275 * 0 if the number has been formatted into *p_output
14276 * -1 and raise an exception on error */
14277 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14278 mainformatlong(PyObject *v,
14279 struct unicode_format_arg_t *arg,
14280 PyObject **p_output,
14281 _PyUnicodeWriter *writer)
14282 {
14283 PyObject *iobj, *res;
14284 char type = (char)arg->ch;
14285
14286 if (!PyNumber_Check(v))
14287 goto wrongtype;
14288
14289 /* make sure number is a type of integer for o, x, and X */
14290 if (!PyLong_Check(v)) {
14291 if (type == 'o' || type == 'x' || type == 'X') {
14292 iobj = PyNumber_Index(v);
14293 if (iobj == NULL) {
14294 if (PyErr_ExceptionMatches(PyExc_TypeError))
14295 goto wrongtype;
14296 return -1;
14297 }
14298 }
14299 else {
14300 iobj = PyNumber_Long(v);
14301 if (iobj == NULL ) {
14302 if (PyErr_ExceptionMatches(PyExc_TypeError))
14303 goto wrongtype;
14304 return -1;
14305 }
14306 }
14307 assert(PyLong_Check(iobj));
14308 }
14309 else {
14310 iobj = v;
14311 Py_INCREF(iobj);
14312 }
14313
14314 if (PyLong_CheckExact(v)
14315 && arg->width == -1 && arg->prec == -1
14316 && !(arg->flags & (F_SIGN | F_BLANK))
14317 && type != 'X')
14318 {
14319 /* Fast path */
14320 int alternate = arg->flags & F_ALT;
14321 int base;
14322
14323 switch(type)
14324 {
14325 default:
14326 Py_UNREACHABLE();
14327 case 'd':
14328 case 'i':
14329 case 'u':
14330 base = 10;
14331 break;
14332 case 'o':
14333 base = 8;
14334 break;
14335 case 'x':
14336 case 'X':
14337 base = 16;
14338 break;
14339 }
14340
14341 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14342 Py_DECREF(iobj);
14343 return -1;
14344 }
14345 Py_DECREF(iobj);
14346 return 1;
14347 }
14348
14349 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14350 Py_DECREF(iobj);
14351 if (res == NULL)
14352 return -1;
14353 *p_output = res;
14354 return 0;
14355
14356 wrongtype:
14357 switch(type)
14358 {
14359 case 'o':
14360 case 'x':
14361 case 'X':
14362 PyErr_Format(PyExc_TypeError,
14363 "%%%c format: an integer is required, "
14364 "not %.200s",
14365 type, Py_TYPE(v)->tp_name);
14366 break;
14367 default:
14368 PyErr_Format(PyExc_TypeError,
14369 "%%%c format: a number is required, "
14370 "not %.200s",
14371 type, Py_TYPE(v)->tp_name);
14372 break;
14373 }
14374 return -1;
14375 }
14376
14377 static Py_UCS4
formatchar(PyObject * v)14378 formatchar(PyObject *v)
14379 {
14380 /* presume that the buffer is at least 3 characters long */
14381 if (PyUnicode_Check(v)) {
14382 if (PyUnicode_GET_LENGTH(v) == 1) {
14383 return PyUnicode_READ_CHAR(v, 0);
14384 }
14385 goto onError;
14386 }
14387 else {
14388 PyObject *iobj;
14389 long x;
14390 /* make sure number is a type of integer */
14391 if (!PyLong_Check(v)) {
14392 iobj = PyNumber_Index(v);
14393 if (iobj == NULL) {
14394 goto onError;
14395 }
14396 x = PyLong_AsLong(iobj);
14397 Py_DECREF(iobj);
14398 }
14399 else {
14400 x = PyLong_AsLong(v);
14401 }
14402 if (x == -1 && PyErr_Occurred())
14403 goto onError;
14404
14405 if (x < 0 || x > MAX_UNICODE) {
14406 PyErr_SetString(PyExc_OverflowError,
14407 "%c arg not in range(0x110000)");
14408 return (Py_UCS4) -1;
14409 }
14410
14411 return (Py_UCS4) x;
14412 }
14413
14414 onError:
14415 PyErr_SetString(PyExc_TypeError,
14416 "%c requires int or char");
14417 return (Py_UCS4) -1;
14418 }
14419
14420 /* Parse options of an argument: flags, width, precision.
14421 Handle also "%(name)" syntax.
14422
14423 Return 0 if the argument has been formatted into arg->str.
14424 Return 1 if the argument has been written into ctx->writer,
14425 Raise an exception and return -1 on error. */
14426 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14427 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14428 struct unicode_format_arg_t *arg)
14429 {
14430 #define FORMAT_READ(ctx) \
14431 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14432
14433 PyObject *v;
14434
14435 if (arg->ch == '(') {
14436 /* Get argument value from a dictionary. Example: "%(name)s". */
14437 Py_ssize_t keystart;
14438 Py_ssize_t keylen;
14439 PyObject *key;
14440 int pcount = 1;
14441
14442 if (ctx->dict == NULL) {
14443 PyErr_SetString(PyExc_TypeError,
14444 "format requires a mapping");
14445 return -1;
14446 }
14447 ++ctx->fmtpos;
14448 --ctx->fmtcnt;
14449 keystart = ctx->fmtpos;
14450 /* Skip over balanced parentheses */
14451 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14452 arg->ch = FORMAT_READ(ctx);
14453 if (arg->ch == ')')
14454 --pcount;
14455 else if (arg->ch == '(')
14456 ++pcount;
14457 ctx->fmtpos++;
14458 }
14459 keylen = ctx->fmtpos - keystart - 1;
14460 if (ctx->fmtcnt < 0 || pcount > 0) {
14461 PyErr_SetString(PyExc_ValueError,
14462 "incomplete format key");
14463 return -1;
14464 }
14465 key = PyUnicode_Substring(ctx->fmtstr,
14466 keystart, keystart + keylen);
14467 if (key == NULL)
14468 return -1;
14469 if (ctx->args_owned) {
14470 ctx->args_owned = 0;
14471 Py_DECREF(ctx->args);
14472 }
14473 ctx->args = PyObject_GetItem(ctx->dict, key);
14474 Py_DECREF(key);
14475 if (ctx->args == NULL)
14476 return -1;
14477 ctx->args_owned = 1;
14478 ctx->arglen = -1;
14479 ctx->argidx = -2;
14480 }
14481
14482 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14483 while (--ctx->fmtcnt >= 0) {
14484 arg->ch = FORMAT_READ(ctx);
14485 ctx->fmtpos++;
14486 switch (arg->ch) {
14487 case '-': arg->flags |= F_LJUST; continue;
14488 case '+': arg->flags |= F_SIGN; continue;
14489 case ' ': arg->flags |= F_BLANK; continue;
14490 case '#': arg->flags |= F_ALT; continue;
14491 case '0': arg->flags |= F_ZERO; continue;
14492 }
14493 break;
14494 }
14495
14496 /* Parse width. Example: "%10s" => width=10 */
14497 if (arg->ch == '*') {
14498 v = unicode_format_getnextarg(ctx);
14499 if (v == NULL)
14500 return -1;
14501 if (!PyLong_Check(v)) {
14502 PyErr_SetString(PyExc_TypeError,
14503 "* wants int");
14504 return -1;
14505 }
14506 arg->width = PyLong_AsSsize_t(v);
14507 if (arg->width == -1 && PyErr_Occurred())
14508 return -1;
14509 if (arg->width < 0) {
14510 arg->flags |= F_LJUST;
14511 arg->width = -arg->width;
14512 }
14513 if (--ctx->fmtcnt >= 0) {
14514 arg->ch = FORMAT_READ(ctx);
14515 ctx->fmtpos++;
14516 }
14517 }
14518 else if (arg->ch >= '0' && arg->ch <= '9') {
14519 arg->width = arg->ch - '0';
14520 while (--ctx->fmtcnt >= 0) {
14521 arg->ch = FORMAT_READ(ctx);
14522 ctx->fmtpos++;
14523 if (arg->ch < '0' || arg->ch > '9')
14524 break;
14525 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14526 mixing signed and unsigned comparison. Since arg->ch is between
14527 '0' and '9', casting to int is safe. */
14528 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14529 PyErr_SetString(PyExc_ValueError,
14530 "width too big");
14531 return -1;
14532 }
14533 arg->width = arg->width*10 + (arg->ch - '0');
14534 }
14535 }
14536
14537 /* Parse precision. Example: "%.3f" => prec=3 */
14538 if (arg->ch == '.') {
14539 arg->prec = 0;
14540 if (--ctx->fmtcnt >= 0) {
14541 arg->ch = FORMAT_READ(ctx);
14542 ctx->fmtpos++;
14543 }
14544 if (arg->ch == '*') {
14545 v = unicode_format_getnextarg(ctx);
14546 if (v == NULL)
14547 return -1;
14548 if (!PyLong_Check(v)) {
14549 PyErr_SetString(PyExc_TypeError,
14550 "* wants int");
14551 return -1;
14552 }
14553 arg->prec = _PyLong_AsInt(v);
14554 if (arg->prec == -1 && PyErr_Occurred())
14555 return -1;
14556 if (arg->prec < 0)
14557 arg->prec = 0;
14558 if (--ctx->fmtcnt >= 0) {
14559 arg->ch = FORMAT_READ(ctx);
14560 ctx->fmtpos++;
14561 }
14562 }
14563 else if (arg->ch >= '0' && arg->ch <= '9') {
14564 arg->prec = arg->ch - '0';
14565 while (--ctx->fmtcnt >= 0) {
14566 arg->ch = FORMAT_READ(ctx);
14567 ctx->fmtpos++;
14568 if (arg->ch < '0' || arg->ch > '9')
14569 break;
14570 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14571 PyErr_SetString(PyExc_ValueError,
14572 "precision too big");
14573 return -1;
14574 }
14575 arg->prec = arg->prec*10 + (arg->ch - '0');
14576 }
14577 }
14578 }
14579
14580 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14581 if (ctx->fmtcnt >= 0) {
14582 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14583 if (--ctx->fmtcnt >= 0) {
14584 arg->ch = FORMAT_READ(ctx);
14585 ctx->fmtpos++;
14586 }
14587 }
14588 }
14589 if (ctx->fmtcnt < 0) {
14590 PyErr_SetString(PyExc_ValueError,
14591 "incomplete format");
14592 return -1;
14593 }
14594 return 0;
14595
14596 #undef FORMAT_READ
14597 }
14598
14599 /* Format one argument. Supported conversion specifiers:
14600
14601 - "s", "r", "a": any type
14602 - "i", "d", "u": int or float
14603 - "o", "x", "X": int
14604 - "e", "E", "f", "F", "g", "G": float
14605 - "c": int or str (1 character)
14606
14607 When possible, the output is written directly into the Unicode writer
14608 (ctx->writer). A string is created when padding is required.
14609
14610 Return 0 if the argument has been formatted into *p_str,
14611 1 if the argument has been written into ctx->writer,
14612 -1 on error. */
14613 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14614 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14615 struct unicode_format_arg_t *arg,
14616 PyObject **p_str)
14617 {
14618 PyObject *v;
14619 _PyUnicodeWriter *writer = &ctx->writer;
14620
14621 if (ctx->fmtcnt == 0)
14622 ctx->writer.overallocate = 0;
14623
14624 v = unicode_format_getnextarg(ctx);
14625 if (v == NULL)
14626 return -1;
14627
14628
14629 switch (arg->ch) {
14630 case 's':
14631 case 'r':
14632 case 'a':
14633 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14634 /* Fast path */
14635 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14636 return -1;
14637 return 1;
14638 }
14639
14640 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14641 *p_str = v;
14642 Py_INCREF(*p_str);
14643 }
14644 else {
14645 if (arg->ch == 's')
14646 *p_str = PyObject_Str(v);
14647 else if (arg->ch == 'r')
14648 *p_str = PyObject_Repr(v);
14649 else
14650 *p_str = PyObject_ASCII(v);
14651 }
14652 break;
14653
14654 case 'i':
14655 case 'd':
14656 case 'u':
14657 case 'o':
14658 case 'x':
14659 case 'X':
14660 {
14661 int ret = mainformatlong(v, arg, p_str, writer);
14662 if (ret != 0)
14663 return ret;
14664 arg->sign = 1;
14665 break;
14666 }
14667
14668 case 'e':
14669 case 'E':
14670 case 'f':
14671 case 'F':
14672 case 'g':
14673 case 'G':
14674 if (arg->width == -1 && arg->prec == -1
14675 && !(arg->flags & (F_SIGN | F_BLANK)))
14676 {
14677 /* Fast path */
14678 if (formatfloat(v, arg, NULL, writer) == -1)
14679 return -1;
14680 return 1;
14681 }
14682
14683 arg->sign = 1;
14684 if (formatfloat(v, arg, p_str, NULL) == -1)
14685 return -1;
14686 break;
14687
14688 case 'c':
14689 {
14690 Py_UCS4 ch = formatchar(v);
14691 if (ch == (Py_UCS4) -1)
14692 return -1;
14693 if (arg->width == -1 && arg->prec == -1) {
14694 /* Fast path */
14695 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14696 return -1;
14697 return 1;
14698 }
14699 *p_str = PyUnicode_FromOrdinal(ch);
14700 break;
14701 }
14702
14703 default:
14704 PyErr_Format(PyExc_ValueError,
14705 "unsupported format character '%c' (0x%x) "
14706 "at index %zd",
14707 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14708 (int)arg->ch,
14709 ctx->fmtpos - 1);
14710 return -1;
14711 }
14712 if (*p_str == NULL)
14713 return -1;
14714 assert (PyUnicode_Check(*p_str));
14715 return 0;
14716 }
14717
/* Write an already formatted argument `str` into ctx->writer, applying the
   width, precision (truncation for "s", "r" and "a"), sign and zero/space
   padding described by `arg`.

   `arg` is used as scratch space: arg->width and arg->sign are modified.
   `str` is only read; the caller keeps ownership of its reference.

   Return 0 on success, -1 with an exception set on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding only applies to numeric conversions (arg->sign set). */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or sign insertion needed. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The text carries its own sign: detach it so that it can be
               written separately from the digits. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to emit. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character only when left padding will be
       written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve one extra slot when the sign does not fit in the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding, further down. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are excluded from width and len
           whichever fill is in use. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left over for right padding. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with spaces if needed (the right side is never filled
       with '0') */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14872
14873 /* Helper of PyUnicode_Format(): format one arg.
14874 Return 0 on success, raise an exception and return -1 on error. */
14875 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14876 unicode_format_arg(struct unicode_formatter_t *ctx)
14877 {
14878 struct unicode_format_arg_t arg;
14879 PyObject *str;
14880 int ret;
14881
14882 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14883 if (arg.ch == '%') {
14884 ctx->fmtpos++;
14885 ctx->fmtcnt--;
14886 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14887 return -1;
14888 return 0;
14889 }
14890 arg.flags = 0;
14891 arg.width = -1;
14892 arg.prec = -1;
14893 arg.sign = 0;
14894 str = NULL;
14895
14896 ret = unicode_format_arg_parse(ctx, &arg);
14897 if (ret == -1)
14898 return -1;
14899
14900 ret = unicode_format_arg_format(ctx, &arg, &str);
14901 if (ret == -1)
14902 return -1;
14903
14904 if (ret != 1) {
14905 ret = unicode_format_arg_output(ctx, &arg, str);
14906 Py_DECREF(str);
14907 if (ret == -1)
14908 return -1;
14909 }
14910
14911 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14912 PyErr_SetString(PyExc_TypeError,
14913 "not all arguments converted during string formatting");
14914 return -1;
14915 }
14916 return 0;
14917 }
14918
14919 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14920 PyUnicode_Format(PyObject *format, PyObject *args)
14921 {
14922 struct unicode_formatter_t ctx;
14923
14924 if (format == NULL || args == NULL) {
14925 PyErr_BadInternalCall();
14926 return NULL;
14927 }
14928
14929 if (ensure_unicode(format) < 0)
14930 return NULL;
14931
14932 ctx.fmtstr = format;
14933 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14934 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14935 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14936 ctx.fmtpos = 0;
14937
14938 _PyUnicodeWriter_Init(&ctx.writer);
14939 ctx.writer.min_length = ctx.fmtcnt + 100;
14940 ctx.writer.overallocate = 1;
14941
14942 if (PyTuple_Check(args)) {
14943 ctx.arglen = PyTuple_Size(args);
14944 ctx.argidx = 0;
14945 }
14946 else {
14947 ctx.arglen = -1;
14948 ctx.argidx = -2;
14949 }
14950 ctx.args_owned = 0;
14951 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14952 ctx.dict = args;
14953 else
14954 ctx.dict = NULL;
14955 ctx.args = args;
14956
14957 while (--ctx.fmtcnt >= 0) {
14958 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14959 Py_ssize_t nonfmtpos;
14960
14961 nonfmtpos = ctx.fmtpos++;
14962 while (ctx.fmtcnt >= 0 &&
14963 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14964 ctx.fmtpos++;
14965 ctx.fmtcnt--;
14966 }
14967 if (ctx.fmtcnt < 0) {
14968 ctx.fmtpos--;
14969 ctx.writer.overallocate = 0;
14970 }
14971
14972 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14973 nonfmtpos, ctx.fmtpos) < 0)
14974 goto onError;
14975 }
14976 else {
14977 ctx.fmtpos++;
14978 if (unicode_format_arg(&ctx) == -1)
14979 goto onError;
14980 }
14981 }
14982
14983 if (ctx.argidx < ctx.arglen && !ctx.dict) {
14984 PyErr_SetString(PyExc_TypeError,
14985 "not all arguments converted during string formatting");
14986 goto onError;
14987 }
14988
14989 if (ctx.args_owned) {
14990 Py_DECREF(ctx.args);
14991 }
14992 return _PyUnicodeWriter_Finish(&ctx.writer);
14993
14994 onError:
14995 _PyUnicodeWriter_Dealloc(&ctx.writer);
14996 if (ctx.args_owned) {
14997 Py_DECREF(ctx.args);
14998 }
14999 return NULL;
15000 }
15001
15002 static PyObject *
15003 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15004
15005 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15006 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15007 {
15008 PyObject *x = NULL;
15009 static char *kwlist[] = {"object", "encoding", "errors", 0};
15010 char *encoding = NULL;
15011 char *errors = NULL;
15012
15013 if (type != &PyUnicode_Type)
15014 return unicode_subtype_new(type, args, kwds);
15015 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15016 kwlist, &x, &encoding, &errors))
15017 return NULL;
15018 if (x == NULL)
15019 _Py_RETURN_UNICODE_EMPTY();
15020 if (encoding == NULL && errors == NULL)
15021 return PyObject_Str(x);
15022 else
15023 return PyUnicode_FromEncodedObject(x, encoding, errors);
15024 }
15025
15026 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15027 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15028 {
15029 PyObject *unicode, *self;
15030 Py_ssize_t length, char_size;
15031 int share_wstr, share_utf8;
15032 unsigned int kind;
15033 void *data;
15034
15035 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15036
15037 unicode = unicode_new(&PyUnicode_Type, args, kwds);
15038 if (unicode == NULL)
15039 return NULL;
15040 assert(_PyUnicode_CHECK(unicode));
15041 if (PyUnicode_READY(unicode) == -1) {
15042 Py_DECREF(unicode);
15043 return NULL;
15044 }
15045
15046 self = type->tp_alloc(type, 0);
15047 if (self == NULL) {
15048 Py_DECREF(unicode);
15049 return NULL;
15050 }
15051 kind = PyUnicode_KIND(unicode);
15052 length = PyUnicode_GET_LENGTH(unicode);
15053
15054 _PyUnicode_LENGTH(self) = length;
15055 #ifdef Py_DEBUG
15056 _PyUnicode_HASH(self) = -1;
15057 #else
15058 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15059 #endif
15060 _PyUnicode_STATE(self).interned = 0;
15061 _PyUnicode_STATE(self).kind = kind;
15062 _PyUnicode_STATE(self).compact = 0;
15063 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15064 _PyUnicode_STATE(self).ready = 1;
15065 _PyUnicode_WSTR(self) = NULL;
15066 _PyUnicode_UTF8_LENGTH(self) = 0;
15067 _PyUnicode_UTF8(self) = NULL;
15068 _PyUnicode_WSTR_LENGTH(self) = 0;
15069 _PyUnicode_DATA_ANY(self) = NULL;
15070
15071 share_utf8 = 0;
15072 share_wstr = 0;
15073 if (kind == PyUnicode_1BYTE_KIND) {
15074 char_size = 1;
15075 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15076 share_utf8 = 1;
15077 }
15078 else if (kind == PyUnicode_2BYTE_KIND) {
15079 char_size = 2;
15080 if (sizeof(wchar_t) == 2)
15081 share_wstr = 1;
15082 }
15083 else {
15084 assert(kind == PyUnicode_4BYTE_KIND);
15085 char_size = 4;
15086 if (sizeof(wchar_t) == 4)
15087 share_wstr = 1;
15088 }
15089
15090 /* Ensure we won't overflow the length. */
15091 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15092 PyErr_NoMemory();
15093 goto onError;
15094 }
15095 data = PyObject_MALLOC((length + 1) * char_size);
15096 if (data == NULL) {
15097 PyErr_NoMemory();
15098 goto onError;
15099 }
15100
15101 _PyUnicode_DATA_ANY(self) = data;
15102 if (share_utf8) {
15103 _PyUnicode_UTF8_LENGTH(self) = length;
15104 _PyUnicode_UTF8(self) = data;
15105 }
15106 if (share_wstr) {
15107 _PyUnicode_WSTR_LENGTH(self) = length;
15108 _PyUnicode_WSTR(self) = (wchar_t *)data;
15109 }
15110
15111 memcpy(data, PyUnicode_DATA(unicode),
15112 kind * (length + 1));
15113 assert(_PyUnicode_CheckConsistency(self, 1));
15114 #ifdef Py_DEBUG
15115 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15116 #endif
15117 Py_DECREF(unicode);
15118 return self;
15119
15120 onError:
15121 Py_DECREF(unicode);
15122 Py_DECREF(self);
15123 return NULL;
15124 }
15125
15126 PyDoc_STRVAR(unicode_doc,
15127 "str(object='') -> str\n\
15128 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15129 \n\
15130 Create a new string object from the given object. If encoding or\n\
15131 errors is specified, then the object must expose a data buffer\n\
15132 that will be decoded using the given encoding and error handler.\n\
15133 Otherwise, returns the result of object.__str__() (if defined)\n\
15134 or repr(object).\n\
15135 encoding defaults to sys.getdefaultencoding().\n\
15136 errors defaults to 'strict'.");
15137
15138 static PyObject *unicode_iter(PyObject *seq);
15139
/* Type object of str.  Slots are given positionally, so each entry must
   stay on its line in this exact order; the trailing comment names the
   slot being filled. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    /* Subclassable, and flagged as a str (sub)class. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15183
15184 /* Initialize the Unicode implementation */
15185
_PyUnicode_Init(void)15186 int _PyUnicode_Init(void)
15187 {
15188 /* XXX - move this array to unicodectype.c ? */
15189 Py_UCS2 linebreak[] = {
15190 0x000A, /* LINE FEED */
15191 0x000D, /* CARRIAGE RETURN */
15192 0x001C, /* FILE SEPARATOR */
15193 0x001D, /* GROUP SEPARATOR */
15194 0x001E, /* RECORD SEPARATOR */
15195 0x0085, /* NEXT LINE */
15196 0x2028, /* LINE SEPARATOR */
15197 0x2029, /* PARAGRAPH SEPARATOR */
15198 };
15199
15200 /* Init the implementation */
15201 _Py_INCREF_UNICODE_EMPTY();
15202 if (!unicode_empty)
15203 Py_FatalError("Can't create empty string");
15204 Py_DECREF(unicode_empty);
15205
15206 if (PyType_Ready(&PyUnicode_Type) < 0)
15207 Py_FatalError("Can't initialize 'unicode'");
15208
15209 /* initialize the linebreak bloom filter */
15210 bloom_linebreak = make_bloom_mask(
15211 PyUnicode_2BYTE_KIND, linebreak,
15212 Py_ARRAY_LENGTH(linebreak));
15213
15214 if (PyType_Ready(&EncodingMapType) < 0)
15215 Py_FatalError("Can't initialize encoding map type");
15216
15217 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15218 Py_FatalError("Can't initialize field name iterator type");
15219
15220 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15221 Py_FatalError("Can't initialize formatter iter type");
15222
15223 return 0;
15224 }
15225
15226 /* Finalize the Unicode implementation */
15227
/* Nothing is released here; the function always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15233
15234 void
_PyUnicode_Fini(void)15235 _PyUnicode_Fini(void)
15236 {
15237 int i;
15238
15239 Py_CLEAR(unicode_empty);
15240
15241 for (i = 0; i < 256; i++)
15242 Py_CLEAR(unicode_latin1[i]);
15243 _PyUnicode_ClearStaticStrings();
15244 (void)PyUnicode_ClearFreeList();
15245 }
15246
15247 void
PyUnicode_InternInPlace(PyObject ** p)15248 PyUnicode_InternInPlace(PyObject **p)
15249 {
15250 PyObject *s = *p;
15251 PyObject *t;
15252 #ifdef Py_DEBUG
15253 assert(s != NULL);
15254 assert(_PyUnicode_CHECK(s));
15255 #else
15256 if (s == NULL || !PyUnicode_Check(s))
15257 return;
15258 #endif
15259 /* If it's a subclass, we don't really know what putting
15260 it in the interned dict might do. */
15261 if (!PyUnicode_CheckExact(s))
15262 return;
15263 if (PyUnicode_CHECK_INTERNED(s))
15264 return;
15265 if (interned == NULL) {
15266 interned = PyDict_New();
15267 if (interned == NULL) {
15268 PyErr_Clear(); /* Don't leave an exception */
15269 return;
15270 }
15271 }
15272 Py_ALLOW_RECURSION
15273 t = PyDict_SetDefault(interned, s, s);
15274 Py_END_ALLOW_RECURSION
15275 if (t == NULL) {
15276 PyErr_Clear();
15277 return;
15278 }
15279 if (t != s) {
15280 Py_INCREF(t);
15281 Py_SETREF(*p, t);
15282 return;
15283 }
15284 /* The two references in interned are not counted by refcnt.
15285 The deallocator will take care of this */
15286 Py_REFCNT(s) -= 2;
15287 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15288 }
15289
15290 void
PyUnicode_InternImmortal(PyObject ** p)15291 PyUnicode_InternImmortal(PyObject **p)
15292 {
15293 PyUnicode_InternInPlace(p);
15294 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15295 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15296 Py_INCREF(*p);
15297 }
15298 }
15299
15300 PyObject *
PyUnicode_InternFromString(const char * cp)15301 PyUnicode_InternFromString(const char *cp)
15302 {
15303 PyObject *s = PyUnicode_FromString(cp);
15304 if (s == NULL)
15305 return NULL;
15306 PyUnicode_InternInPlace(&s);
15307 return s;
15308 }
15309
15310 void
_Py_ReleaseInternedUnicodeStrings(void)15311 _Py_ReleaseInternedUnicodeStrings(void)
15312 {
15313 PyObject *keys;
15314 PyObject *s;
15315 Py_ssize_t i, n;
15316 Py_ssize_t immortal_size = 0, mortal_size = 0;
15317
15318 if (interned == NULL || !PyDict_Check(interned))
15319 return;
15320 keys = PyDict_Keys(interned);
15321 if (keys == NULL || !PyList_Check(keys)) {
15322 PyErr_Clear();
15323 return;
15324 }
15325
15326 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15327 detector, interned unicode strings are not forcibly deallocated;
15328 rather, we give them their stolen references back, and then clear
15329 and DECREF the interned dict. */
15330
15331 n = PyList_GET_SIZE(keys);
15332 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15333 n);
15334 for (i = 0; i < n; i++) {
15335 s = PyList_GET_ITEM(keys, i);
15336 if (PyUnicode_READY(s) == -1) {
15337 Py_UNREACHABLE();
15338 }
15339 switch (PyUnicode_CHECK_INTERNED(s)) {
15340 case SSTATE_NOT_INTERNED:
15341 /* XXX Shouldn't happen */
15342 break;
15343 case SSTATE_INTERNED_IMMORTAL:
15344 Py_REFCNT(s) += 1;
15345 immortal_size += PyUnicode_GET_LENGTH(s);
15346 break;
15347 case SSTATE_INTERNED_MORTAL:
15348 Py_REFCNT(s) += 2;
15349 mortal_size += PyUnicode_GET_LENGTH(s);
15350 break;
15351 default:
15352 Py_FatalError("Inconsistent interned string state.");
15353 }
15354 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15355 }
15356 fprintf(stderr, "total size of all interned strings: "
15357 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15358 "mortal/immortal\n", mortal_size, immortal_size);
15359 Py_DECREF(keys);
15360 PyDict_Clear(interned);
15361 Py_CLEAR(interned);
15362 }
15363
15364
15365 /********************* Unicode Iterator **************************/
15366
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    /* Index of the next character unicodeiter_next() will return. */
    Py_ssize_t it_index;
    /* The str being iterated; unicode_iter() stores a new reference here. */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15372
15373 static void
unicodeiter_dealloc(unicodeiterobject * it)15374 unicodeiter_dealloc(unicodeiterobject *it)
15375 {
15376 _PyObject_GC_UNTRACK(it);
15377 Py_XDECREF(it->it_seq);
15378 PyObject_GC_Del(it);
15379 }
15380
15381 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15382 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15383 {
15384 Py_VISIT(it->it_seq);
15385 return 0;
15386 }
15387
15388 static PyObject *
unicodeiter_next(unicodeiterobject * it)15389 unicodeiter_next(unicodeiterobject *it)
15390 {
15391 PyObject *seq, *item;
15392
15393 assert(it != NULL);
15394 seq = it->it_seq;
15395 if (seq == NULL)
15396 return NULL;
15397 assert(_PyUnicode_CHECK(seq));
15398
15399 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15400 int kind = PyUnicode_KIND(seq);
15401 void *data = PyUnicode_DATA(seq);
15402 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15403 item = PyUnicode_FromOrdinal(chr);
15404 if (item != NULL)
15405 ++it->it_index;
15406 return item;
15407 }
15408
15409 it->it_seq = NULL;
15410 Py_DECREF(seq);
15411 return NULL;
15412 }
15413
15414 static PyObject *
unicodeiter_len(unicodeiterobject * it)15415 unicodeiter_len(unicodeiterobject *it)
15416 {
15417 Py_ssize_t len = 0;
15418 if (it->it_seq)
15419 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15420 return PyLong_FromSsize_t(len);
15421 }
15422
15423 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15424
15425 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15426 unicodeiter_reduce(unicodeiterobject *it)
15427 {
15428 if (it->it_seq != NULL) {
15429 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15430 it->it_seq, it->it_index);
15431 } else {
15432 PyObject *u = (PyObject *)_PyUnicode_New(0);
15433 if (u == NULL)
15434 return NULL;
15435 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15436 }
15437 }
15438
15439 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15440
15441 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15442 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15443 {
15444 Py_ssize_t index = PyLong_AsSsize_t(state);
15445 if (index == -1 && PyErr_Occurred())
15446 return NULL;
15447 if (it->it_seq != NULL) {
15448 if (index < 0)
15449 index = 0;
15450 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15451 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15452 it->it_index = index;
15453 }
15454 Py_RETURN_NONE;
15455 }
15456
15457 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15458
/* Method table of the str iterator type: the length hint plus the two
   pickling hooks implemented above. */
static PyMethodDef unicodeiter_methods[] = {
    /* Takes no arguments; reports how many characters are left. */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* Takes no arguments; returns the tuple used for pickling. */
    {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    /* Takes one object: the index to resume from. */
    {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL, NULL} /* sentinel */
};
15468
/* Type object of the iterator created by unicode_iter().  Instances are
   GC-aware (Py_TPFLAGS_HAVE_GC with a traverse function) and the type is
   its own iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
15501
15502 static PyObject *
unicode_iter(PyObject * seq)15503 unicode_iter(PyObject *seq)
15504 {
15505 unicodeiterobject *it;
15506
15507 if (!PyUnicode_Check(seq)) {
15508 PyErr_BadInternalCall();
15509 return NULL;
15510 }
15511 if (PyUnicode_READY(seq) == -1)
15512 return NULL;
15513 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15514 if (it == NULL)
15515 return NULL;
15516 it->it_index = 0;
15517 Py_INCREF(seq);
15518 it->it_seq = seq;
15519 _PyObject_GC_TRACK(it);
15520 return (PyObject *)it;
15521 }
15522
15523
15524 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15525 Py_UNICODE_strlen(const Py_UNICODE *u)
15526 {
15527 return wcslen(u);
15528 }
15529
15530 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15531 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15532 {
15533 Py_UNICODE *u = s1;
15534 while ((*u++ = *s2++));
15535 return s1;
15536 }
15537
15538 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15539 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15540 {
15541 Py_UNICODE *u = s1;
15542 while ((*u++ = *s2++))
15543 if (n-- == 0)
15544 break;
15545 return s1;
15546 }
15547
15548 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15549 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15550 {
15551 Py_UNICODE *u1 = s1;
15552 u1 += wcslen(u1);
15553 while ((*u1++ = *s2++));
15554 return s1;
15555 }
15556
15557 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15558 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15559 {
15560 while (*s1 && *s2 && *s1 == *s2)
15561 s1++, s2++;
15562 if (*s1 && *s2)
15563 return (*s1 < *s2) ? -1 : +1;
15564 if (*s1)
15565 return 1;
15566 if (*s2)
15567 return -1;
15568 return 0;
15569 }
15570
15571 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15572 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15573 {
15574 Py_UNICODE u1, u2;
15575 for (; n != 0; n--) {
15576 u1 = *s1;
15577 u2 = *s2;
15578 if (u1 != u2)
15579 return (u1 < u2) ? -1 : +1;
15580 if (u1 == '\0')
15581 return 0;
15582 s1++;
15583 s2++;
15584 }
15585 return 0;
15586 }
15587
15588 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15589 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15590 {
15591 const Py_UNICODE *p;
15592 for (p = s; *p; p++)
15593 if (*p == c)
15594 return (Py_UNICODE*)p;
15595 return NULL;
15596 }
15597
15598 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15599 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15600 {
15601 const Py_UNICODE *p;
15602 p = s + wcslen(s);
15603 while (p != s) {
15604 p--;
15605 if (*p == c)
15606 return (Py_UNICODE*)p;
15607 }
15608 return NULL;
15609 }
15610
15611 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15612 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15613 {
15614 Py_UNICODE *u, *copy;
15615 Py_ssize_t len, size;
15616
15617 if (!PyUnicode_Check(unicode)) {
15618 PyErr_BadArgument();
15619 return NULL;
15620 }
15621 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15622 if (u == NULL)
15623 return NULL;
15624 /* Ensure we won't overflow the size. */
15625 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15626 PyErr_NoMemory();
15627 return NULL;
15628 }
15629 size = len + 1; /* copy the null character */
15630 size *= sizeof(Py_UNICODE);
15631 copy = PyMem_Malloc(size);
15632 if (copy == NULL) {
15633 PyErr_NoMemory();
15634 return NULL;
15635 }
15636 memcpy(copy, u, size);
15637 return copy;
15638 }
15639
15640 /* A _string module, to export formatter_parser and formatter_field_name_split
15641 to the string.Formatter class implemented in Python. */
15642
/* Functions exported by the _string module.  Both entries use METH_O, i.e.
   each takes exactly one argument object. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL} /* sentinel */
};
15650
/* Module definition handed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* exported functions */
    /* The remaining entries are left unset. */
    NULL,
    NULL,
    NULL,
    NULL
};
15662
15663 PyMODINIT_FUNC
PyInit__string(void)15664 PyInit__string(void)
15665 {
15666 return PyModule_Create(&_string_module);
15667 }
15668
15669
15670 #ifdef __cplusplus
15671 }
15672 #endif
15673