1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_initconfig.h"
44 #include "pycore_fileutils.h"
45 #include "pycore_object.h"
46 #include "pycore_pylifecycle.h"
47 #include "pycore_pystate.h"
48 #include "ucnhash.h"
49 #include "bytes_methods.h"
50 #include "stringlib/eq.h"
51
52 #ifdef MS_WINDOWS
53 #include <windows.h>
54 #endif
55
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58 /* #define INTERNED_STATS 1 */
59
60
61 /*[clinic input]
62 class str "PyObject *" "&PyUnicode_Type"
63 [clinic start generated code]*/
64 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66 /*[python input]
67 class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77 [python start generated code]*/
78 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
79
80 /* --- Globals ------------------------------------------------------------
81
82 NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
85
86 */
87
88
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92
/* Largest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* _PyUnicode_CHECK(op): the type test used by the assertions in this file.
   Debug builds run the structural consistency checker with the content scan
   disabled (second argument 0); other builds only do the plain type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
101
/* Field accessors for the string object layouts.

   Macros that are a bare member access (no assert) can also be used as
   assignment targets; the ones that start with assert() are read-only
   comma expressions. */

/* utf8 member of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 bytes of a ready string: for a compact ASCII string this is the
   memory located directly after the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char *)((PyASCIIObject *)(op) + 1)) :            \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 form of a ready string: the character length for a
   compact ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject *)(op))->length :                  \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked reads of state.kind and length. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
136
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
143
/* True if the utf8 pointer of `op` aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` aliases its character data.
   Both sides of the comparison use the macro argument; the previous
   definition read the wstr pointer of whatever variable happened to be
   called `unicode` at the point of use. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the string owns a separate UTF-8 buffer: it is not compact
   ASCII, the utf8 pointer is set, and that pointer does not alias the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the string owns a separate wstr buffer: the wstr pointer is set
   and either the string is not ready yet or the pointer does not alias the
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    ((_PyUnicode_WSTR(op) &&                                \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Copy the characters in [begin, end) to `to`, casting each one from
   from_type to to_type.

   from_type / to_type  valid type names
   begin, end           pointers to the source characters ("from_type *")
   to                   destination pointer ("to_type *")

   Each argument is evaluated once.  The bulk of the range is copied four
   characters per iteration; a second loop handles the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
189
/* Divisor used when over-allocating buffers: a factor of N corresponds to
   1/N extra space.  The best value was found to be 4 (25% extra) on Linux
   and 2 (50% extra) on Windows. */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
197
198 /* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
203 Another way to look at this is that to say that the actual reference
204 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
205 */
206 static PyObject *interned = NULL;
207
208 /* The empty Unicode object is shared to improve performance. */
209 static PyObject *unicode_empty = NULL;
210
211 #define _Py_INCREF_UNICODE_EMPTY() \
212 do { \
213 if (unicode_empty != NULL) \
214 Py_INCREF(unicode_empty); \
215 else { \
216 unicode_empty = PyUnicode_New(0, 0); \
217 if (unicode_empty != NULL) { \
218 Py_INCREF(unicode_empty); \
219 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
220 } \
221 } \
222 } while (0)
223
224 #define _Py_RETURN_UNICODE_EMPTY() \
225 do { \
226 _Py_INCREF_UNICODE_EMPTY(); \
227 return unicode_empty; \
228 } while (0)
229
230 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)231 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233 {
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
238 assert(value <= 0xff);
239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
245 assert(value <= 0xffff);
246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
253 assert(value <= MAX_UNICODE);
254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262 }
263
264
265 /* Forward declaration */
266 static inline int
267 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
268 static PyObject *
269 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271 static PyObject *
272 unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
275
276 /* List of static strings. */
277 static _Py_Identifier *static_strings = NULL;
278
279 /* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
281 static PyObject *unicode_latin1[256] = {NULL};
282
/* Lookup table for the most frequent whitespace characters: entry c is 1
   when the ASCII code c is whitespace, 0 otherwise (128 entries).

   Whitespace entries:
     0x09 CHARACTER TABULATION   0x1C FILE SEPARATOR
     0x0A LINE FEED              0x1D GROUP SEPARATOR
     0x0B LINE TABULATION        0x1E RECORD SEPARATOR
     0x0C FORM FEED              0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN        0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
313
314 /* forward */
315 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
316 static PyObject* get_latin1_char(unsigned char ch);
317 static int unicode_modifiable(PyObject *unicode);
318
319
320 static PyObject *
321 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
322 static PyObject *
323 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324 static PyObject *
325 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327 static PyObject *
328 unicode_encode_call_errorhandler(const char *errors,
329 PyObject **errorHandler,const char *encoding, const char *reason,
330 PyObject *unicode, PyObject **exceptionObject,
331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
333 static void
334 raise_encode_exception(PyObject **exceptionObject,
335 const char *encoding,
336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
339
/* Lookup table for ASCII line breaks: entry c is 1 when the ASCII code c is
   a line break, 0 otherwise (128 entries).

   Line break entries:
     0x0A LINE FEED              0x1C FILE SEPARATOR
     0x0B LINE TABULATION        0x1D GROUP SEPARATOR
     0x0C FORM FEED              0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
367
368 static int convert_uc(PyObject *obj, void *addr);
369
370 #include "clinic/unicodeobject.c.h"
371
372 _Py_error_handler
_Py_GetErrorHandler(const char * errors)373 _Py_GetErrorHandler(const char *errors)
374 {
375 if (errors == NULL || strcmp(errors, "strict") == 0) {
376 return _Py_ERROR_STRICT;
377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
379 return _Py_ERROR_SURROGATEESCAPE;
380 }
381 if (strcmp(errors, "replace") == 0) {
382 return _Py_ERROR_REPLACE;
383 }
384 if (strcmp(errors, "ignore") == 0) {
385 return _Py_ERROR_IGNORE;
386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
388 return _Py_ERROR_BACKSLASHREPLACE;
389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
391 return _Py_ERROR_SURROGATEPASS;
392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
394 return _Py_ERROR_XMLCHARREFREPLACE;
395 }
396 return _Py_ERROR_OTHER;
397 }
398
399
400 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)401 get_error_handler_wide(const wchar_t *errors)
402 {
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425 }
426
427
428 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
430 Py_UNICODE
PyUnicode_GetMax(void)431 PyUnicode_GetMax(void)
432 {
433 #ifdef Py_UNICODE_WIDE
434 return 0x10FFFF;
435 #else
436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
439 #endif
440 }
441
442 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)443 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
444 {
445 #define CHECK(expr) \
446 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
447
448 PyASCIIObject *ascii;
449 unsigned int kind;
450
451 assert(op != NULL);
452 CHECK(PyUnicode_Check(op));
453
454 ascii = (PyASCIIObject *)op;
455 kind = ascii->state.kind;
456
457 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
458 CHECK(kind == PyUnicode_1BYTE_KIND);
459 CHECK(ascii->state.ready == 1);
460 }
461 else {
462 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
463 void *data;
464
465 if (ascii->state.compact == 1) {
466 data = compact + 1;
467 CHECK(kind == PyUnicode_1BYTE_KIND
468 || kind == PyUnicode_2BYTE_KIND
469 || kind == PyUnicode_4BYTE_KIND);
470 CHECK(ascii->state.ascii == 0);
471 CHECK(ascii->state.ready == 1);
472 CHECK(compact->utf8 != data);
473 }
474 else {
475 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
476
477 data = unicode->data.any;
478 if (kind == PyUnicode_WCHAR_KIND) {
479 CHECK(ascii->length == 0);
480 CHECK(ascii->hash == -1);
481 CHECK(ascii->state.compact == 0);
482 CHECK(ascii->state.ascii == 0);
483 CHECK(ascii->state.ready == 0);
484 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
485 CHECK(ascii->wstr != NULL);
486 CHECK(data == NULL);
487 CHECK(compact->utf8 == NULL);
488 }
489 else {
490 CHECK(kind == PyUnicode_1BYTE_KIND
491 || kind == PyUnicode_2BYTE_KIND
492 || kind == PyUnicode_4BYTE_KIND);
493 CHECK(ascii->state.compact == 0);
494 CHECK(ascii->state.ready == 1);
495 CHECK(data != NULL);
496 if (ascii->state.ascii) {
497 CHECK(compact->utf8 == data);
498 CHECK(compact->utf8_length == ascii->length);
499 }
500 else
501 CHECK(compact->utf8 != data);
502 }
503 }
504 if (kind != PyUnicode_WCHAR_KIND) {
505 if (
506 #if SIZEOF_WCHAR_T == 2
507 kind == PyUnicode_2BYTE_KIND
508 #else
509 kind == PyUnicode_4BYTE_KIND
510 #endif
511 )
512 {
513 CHECK(ascii->wstr == data);
514 CHECK(compact->wstr_length == ascii->length);
515 } else
516 CHECK(ascii->wstr != data);
517 }
518
519 if (compact->utf8 == NULL)
520 CHECK(compact->utf8_length == 0);
521 if (ascii->wstr == NULL)
522 CHECK(compact->wstr_length == 0);
523 }
524
525 /* check that the best kind is used: O(n) operation */
526 if (check_content && kind != PyUnicode_WCHAR_KIND) {
527 Py_ssize_t i;
528 Py_UCS4 maxchar = 0;
529 void *data;
530 Py_UCS4 ch;
531
532 data = PyUnicode_DATA(ascii);
533 for (i=0; i < ascii->length; i++)
534 {
535 ch = PyUnicode_READ(kind, data, i);
536 if (ch > maxchar)
537 maxchar = ch;
538 }
539 if (kind == PyUnicode_1BYTE_KIND) {
540 if (ascii->state.ascii == 0) {
541 CHECK(maxchar >= 128);
542 CHECK(maxchar <= 255);
543 }
544 else
545 CHECK(maxchar < 128);
546 }
547 else if (kind == PyUnicode_2BYTE_KIND) {
548 CHECK(maxchar >= 0x100);
549 CHECK(maxchar <= 0xFFFF);
550 }
551 else {
552 CHECK(maxchar >= 0x10000);
553 CHECK(maxchar <= MAX_UNICODE);
554 }
555 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
556 }
557 return 1;
558
559 #undef CHECK
560 }
561
562
563 static PyObject*
unicode_result_wchar(PyObject * unicode)564 unicode_result_wchar(PyObject *unicode)
565 {
566 #ifndef Py_DEBUG
567 Py_ssize_t len;
568
569 len = _PyUnicode_WSTR_LENGTH(unicode);
570 if (len == 0) {
571 Py_DECREF(unicode);
572 _Py_RETURN_UNICODE_EMPTY();
573 }
574
575 if (len == 1) {
576 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
577 if ((Py_UCS4)ch < 256) {
578 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
579 Py_DECREF(unicode);
580 return latin1_char;
581 }
582 }
583
584 if (_PyUnicode_Ready(unicode) < 0) {
585 Py_DECREF(unicode);
586 return NULL;
587 }
588 #else
589 assert(Py_REFCNT(unicode) == 1);
590
591 /* don't make the result ready in debug mode to ensure that the caller
592 makes the string ready before using it */
593 assert(_PyUnicode_CheckConsistency(unicode, 1));
594 #endif
595 return unicode;
596 }
597
598 static PyObject*
unicode_result_ready(PyObject * unicode)599 unicode_result_ready(PyObject *unicode)
600 {
601 Py_ssize_t length;
602
603 length = PyUnicode_GET_LENGTH(unicode);
604 if (length == 0) {
605 if (unicode != unicode_empty) {
606 Py_DECREF(unicode);
607 _Py_RETURN_UNICODE_EMPTY();
608 }
609 return unicode_empty;
610 }
611
612 if (length == 1) {
613 void *data = PyUnicode_DATA(unicode);
614 int kind = PyUnicode_KIND(unicode);
615 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
616 if (ch < 256) {
617 PyObject *latin1_char = unicode_latin1[ch];
618 if (latin1_char != NULL) {
619 if (unicode != latin1_char) {
620 Py_INCREF(latin1_char);
621 Py_DECREF(unicode);
622 }
623 return latin1_char;
624 }
625 else {
626 assert(_PyUnicode_CheckConsistency(unicode, 1));
627 Py_INCREF(unicode);
628 unicode_latin1[ch] = unicode;
629 return unicode;
630 }
631 }
632 }
633
634 assert(_PyUnicode_CheckConsistency(unicode, 1));
635 return unicode;
636 }
637
638 static PyObject*
unicode_result(PyObject * unicode)639 unicode_result(PyObject *unicode)
640 {
641 assert(_PyUnicode_CHECK(unicode));
642 if (PyUnicode_IS_READY(unicode))
643 return unicode_result_ready(unicode);
644 else
645 return unicode_result_wchar(unicode);
646 }
647
648 static PyObject*
unicode_result_unchanged(PyObject * unicode)649 unicode_result_unchanged(PyObject *unicode)
650 {
651 if (PyUnicode_CheckExact(unicode)) {
652 if (PyUnicode_READY(unicode) == -1)
653 return NULL;
654 Py_INCREF(unicode);
655 return unicode;
656 }
657 else
658 /* Subtype -- return genuine unicode string with the same value. */
659 return _PyUnicode_Copy(unicode);
660 }
661
662 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
663 ASCII, Latin1, UTF-8, etc. */
664 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)665 backslashreplace(_PyBytesWriter *writer, char *str,
666 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
667 {
668 Py_ssize_t size, i;
669 Py_UCS4 ch;
670 enum PyUnicode_Kind kind;
671 void *data;
672
673 assert(PyUnicode_IS_READY(unicode));
674 kind = PyUnicode_KIND(unicode);
675 data = PyUnicode_DATA(unicode);
676
677 size = 0;
678 /* determine replacement size */
679 for (i = collstart; i < collend; ++i) {
680 Py_ssize_t incr;
681
682 ch = PyUnicode_READ(kind, data, i);
683 if (ch < 0x100)
684 incr = 2+2;
685 else if (ch < 0x10000)
686 incr = 2+4;
687 else {
688 assert(ch <= MAX_UNICODE);
689 incr = 2+8;
690 }
691 if (size > PY_SSIZE_T_MAX - incr) {
692 PyErr_SetString(PyExc_OverflowError,
693 "encoded result is too long for a Python string");
694 return NULL;
695 }
696 size += incr;
697 }
698
699 str = _PyBytesWriter_Prepare(writer, str, size);
700 if (str == NULL)
701 return NULL;
702
703 /* generate replacement */
704 for (i = collstart; i < collend; ++i) {
705 ch = PyUnicode_READ(kind, data, i);
706 *str++ = '\\';
707 if (ch >= 0x00010000) {
708 *str++ = 'U';
709 *str++ = Py_hexdigits[(ch>>28)&0xf];
710 *str++ = Py_hexdigits[(ch>>24)&0xf];
711 *str++ = Py_hexdigits[(ch>>20)&0xf];
712 *str++ = Py_hexdigits[(ch>>16)&0xf];
713 *str++ = Py_hexdigits[(ch>>12)&0xf];
714 *str++ = Py_hexdigits[(ch>>8)&0xf];
715 }
716 else if (ch >= 0x100) {
717 *str++ = 'u';
718 *str++ = Py_hexdigits[(ch>>12)&0xf];
719 *str++ = Py_hexdigits[(ch>>8)&0xf];
720 }
721 else
722 *str++ = 'x';
723 *str++ = Py_hexdigits[(ch>>4)&0xf];
724 *str++ = Py_hexdigits[ch&0xf];
725 }
726 return str;
727 }
728
729 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
730 ASCII, Latin1, UTF-8, etc. */
731 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)732 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
733 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
734 {
735 Py_ssize_t size, i;
736 Py_UCS4 ch;
737 enum PyUnicode_Kind kind;
738 void *data;
739
740 assert(PyUnicode_IS_READY(unicode));
741 kind = PyUnicode_KIND(unicode);
742 data = PyUnicode_DATA(unicode);
743
744 size = 0;
745 /* determine replacement size */
746 for (i = collstart; i < collend; ++i) {
747 Py_ssize_t incr;
748
749 ch = PyUnicode_READ(kind, data, i);
750 if (ch < 10)
751 incr = 2+1+1;
752 else if (ch < 100)
753 incr = 2+2+1;
754 else if (ch < 1000)
755 incr = 2+3+1;
756 else if (ch < 10000)
757 incr = 2+4+1;
758 else if (ch < 100000)
759 incr = 2+5+1;
760 else if (ch < 1000000)
761 incr = 2+6+1;
762 else {
763 assert(ch <= MAX_UNICODE);
764 incr = 2+7+1;
765 }
766 if (size > PY_SSIZE_T_MAX - incr) {
767 PyErr_SetString(PyExc_OverflowError,
768 "encoded result is too long for a Python string");
769 return NULL;
770 }
771 size += incr;
772 }
773
774 str = _PyBytesWriter_Prepare(writer, str, size);
775 if (str == NULL)
776 return NULL;
777
778 /* generate replacement */
779 for (i = collstart; i < collend; ++i) {
780 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
781 }
782 return str;
783 }
784
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things simple a
   single bitmask of BLOOM_WIDTH bits is used; the bit index of a character
   is its low log2(BLOOM_WIDTH) bits (5, 6 or 7 bits depending on the width
   of unsigned long).  A clear bit proves that a character is absent; a set
   bit only means that it may be present. */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Mask for line break characters.  It starts with every bit set, so every
   character is treated as a possible line break until the mask has been
   computed (the original comment says this is done by Unicode_Init below). */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit of character `ch` is set in `mask`.  Both arguments
   are parenthesized so that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   consulted first to skip the Py_UNICODE_ISLINEBREAK() call when possible. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
812
813 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)814 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
815 {
816 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
817 do { \
818 TYPE *data = (TYPE *)PTR; \
819 TYPE *end = data + LEN; \
820 Py_UCS4 ch; \
821 for (; data != end; data++) { \
822 ch = *data; \
823 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
824 } \
825 break; \
826 } while (0)
827
828 /* calculate simple bloom-style bitmask for a given unicode string */
829
830 BLOOM_MASK mask;
831
832 mask = 0;
833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
836 break;
837 case PyUnicode_2BYTE_KIND:
838 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
839 break;
840 case PyUnicode_4BYTE_KIND:
841 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
842 break;
843 default:
844 Py_UNREACHABLE();
845 }
846 return mask;
847
848 #undef BLOOM_UPDATE
849 }
850
851 static int
ensure_unicode(PyObject * obj)852 ensure_unicode(PyObject *obj)
853 {
854 if (!PyUnicode_Check(obj)) {
855 PyErr_Format(PyExc_TypeError,
856 "must be str, not %.100s",
857 Py_TYPE(obj)->tp_name);
858 return -1;
859 }
860 return PyUnicode_READY(obj);
861 }
862
863 /* Compilation of templated routines */
864
865 #include "stringlib/asciilib.h"
866 #include "stringlib/fastsearch.h"
867 #include "stringlib/partition.h"
868 #include "stringlib/split.h"
869 #include "stringlib/count.h"
870 #include "stringlib/find.h"
871 #include "stringlib/find_max_char.h"
872 #include "stringlib/undef.h"
873
874 #include "stringlib/ucs1lib.h"
875 #include "stringlib/fastsearch.h"
876 #include "stringlib/partition.h"
877 #include "stringlib/split.h"
878 #include "stringlib/count.h"
879 #include "stringlib/find.h"
880 #include "stringlib/replace.h"
881 #include "stringlib/find_max_char.h"
882 #include "stringlib/undef.h"
883
884 #include "stringlib/ucs2lib.h"
885 #include "stringlib/fastsearch.h"
886 #include "stringlib/partition.h"
887 #include "stringlib/split.h"
888 #include "stringlib/count.h"
889 #include "stringlib/find.h"
890 #include "stringlib/replace.h"
891 #include "stringlib/find_max_char.h"
892 #include "stringlib/undef.h"
893
894 #include "stringlib/ucs4lib.h"
895 #include "stringlib/fastsearch.h"
896 #include "stringlib/partition.h"
897 #include "stringlib/split.h"
898 #include "stringlib/count.h"
899 #include "stringlib/find.h"
900 #include "stringlib/replace.h"
901 #include "stringlib/find_max_char.h"
902 #include "stringlib/undef.h"
903
904 #include "stringlib/unicodedefs.h"
905 #include "stringlib/fastsearch.h"
906 #include "stringlib/count.h"
907 #include "stringlib/find.h"
908 #include "stringlib/undef.h"
909
910 /* --- Unicode Object ----------------------------------------------------- */
911
912 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)913 findchar(const void *s, int kind,
914 Py_ssize_t size, Py_UCS4 ch,
915 int direction)
916 {
917 switch (kind) {
918 case PyUnicode_1BYTE_KIND:
919 if ((Py_UCS1) ch != ch)
920 return -1;
921 if (direction > 0)
922 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
923 else
924 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
925 case PyUnicode_2BYTE_KIND:
926 if ((Py_UCS2) ch != ch)
927 return -1;
928 if (direction > 0)
929 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
930 else
931 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
932 case PyUnicode_4BYTE_KIND:
933 if (direction > 0)
934 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
935 else
936 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
937 default:
938 Py_UNREACHABLE();
939 }
940 }
941
#ifdef Py_DEBUG
/* Debug helper: after a string has grown, overwrite the newly added
   characters (indexes old_length .. length-1) with 0xff bytes so that code
   which forgets to initialize them is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind is also the size of one character in bytes */
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    }
}
#endif
960
961 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)962 resize_compact(PyObject *unicode, Py_ssize_t length)
963 {
964 Py_ssize_t char_size;
965 Py_ssize_t struct_size;
966 Py_ssize_t new_size;
967 int share_wstr;
968 PyObject *new_unicode;
969 #ifdef Py_DEBUG
970 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
971 #endif
972
973 assert(unicode_modifiable(unicode));
974 assert(PyUnicode_IS_READY(unicode));
975 assert(PyUnicode_IS_COMPACT(unicode));
976
977 char_size = PyUnicode_KIND(unicode);
978 if (PyUnicode_IS_ASCII(unicode))
979 struct_size = sizeof(PyASCIIObject);
980 else
981 struct_size = sizeof(PyCompactUnicodeObject);
982 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
983
984 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
985 PyErr_NoMemory();
986 return NULL;
987 }
988 new_size = (struct_size + (length + 1) * char_size);
989
990 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995 _Py_DEC_REFTOTAL;
996 _Py_ForgetReference(unicode);
997
998 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
999 if (new_unicode == NULL) {
1000 _Py_NewReference(unicode);
1001 PyErr_NoMemory();
1002 return NULL;
1003 }
1004 unicode = new_unicode;
1005 _Py_NewReference(unicode);
1006
1007 _PyUnicode_LENGTH(unicode) = length;
1008 if (share_wstr) {
1009 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1010 if (!PyUnicode_IS_ASCII(unicode))
1011 _PyUnicode_WSTR_LENGTH(unicode) = length;
1012 }
1013 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1014 PyObject_DEL(_PyUnicode_WSTR(unicode));
1015 _PyUnicode_WSTR(unicode) = NULL;
1016 if (!PyUnicode_IS_ASCII(unicode))
1017 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1018 }
1019 #ifdef Py_DEBUG
1020 unicode_fill_invalid(unicode, old_length);
1021 #endif
1022 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1023 length, 0);
1024 assert(_PyUnicode_CheckConsistency(unicode, 0));
1025 return unicode;
1026 }
1027
1028 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1029 resize_inplace(PyObject *unicode, Py_ssize_t length)
1030 {
1031 wchar_t *wstr;
1032 Py_ssize_t new_size;
1033 assert(!PyUnicode_IS_COMPACT(unicode));
1034 assert(Py_REFCNT(unicode) == 1);
1035
1036 if (PyUnicode_IS_READY(unicode)) {
1037 Py_ssize_t char_size;
1038 int share_wstr, share_utf8;
1039 void *data;
1040 #ifdef Py_DEBUG
1041 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1042 #endif
1043
1044 data = _PyUnicode_DATA_ANY(unicode);
1045 char_size = PyUnicode_KIND(unicode);
1046 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1047 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1048
1049 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1050 PyErr_NoMemory();
1051 return -1;
1052 }
1053 new_size = (length + 1) * char_size;
1054
1055 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1056 {
1057 PyObject_DEL(_PyUnicode_UTF8(unicode));
1058 _PyUnicode_UTF8(unicode) = NULL;
1059 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1060 }
1061
1062 data = (PyObject *)PyObject_REALLOC(data, new_size);
1063 if (data == NULL) {
1064 PyErr_NoMemory();
1065 return -1;
1066 }
1067 _PyUnicode_DATA_ANY(unicode) = data;
1068 if (share_wstr) {
1069 _PyUnicode_WSTR(unicode) = data;
1070 _PyUnicode_WSTR_LENGTH(unicode) = length;
1071 }
1072 if (share_utf8) {
1073 _PyUnicode_UTF8(unicode) = data;
1074 _PyUnicode_UTF8_LENGTH(unicode) = length;
1075 }
1076 _PyUnicode_LENGTH(unicode) = length;
1077 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1078 #ifdef Py_DEBUG
1079 unicode_fill_invalid(unicode, old_length);
1080 #endif
1081 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1082 assert(_PyUnicode_CheckConsistency(unicode, 0));
1083 return 0;
1084 }
1085 }
1086 assert(_PyUnicode_WSTR(unicode) != NULL);
1087
1088 /* check for integer overflow */
1089 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1090 PyErr_NoMemory();
1091 return -1;
1092 }
1093 new_size = sizeof(wchar_t) * (length + 1);
1094 wstr = _PyUnicode_WSTR(unicode);
1095 wstr = PyObject_REALLOC(wstr, new_size);
1096 if (!wstr) {
1097 PyErr_NoMemory();
1098 return -1;
1099 }
1100 _PyUnicode_WSTR(unicode) = wstr;
1101 _PyUnicode_WSTR(unicode)[length] = 0;
1102 _PyUnicode_WSTR_LENGTH(unicode) = length;
1103 assert(_PyUnicode_CheckConsistency(unicode, 0));
1104 return 0;
1105 }
1106
1107 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1108 resize_copy(PyObject *unicode, Py_ssize_t length)
1109 {
1110 Py_ssize_t copy_length;
1111 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1112 PyObject *copy;
1113
1114 assert(PyUnicode_IS_READY(unicode));
1115
1116 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1117 if (copy == NULL)
1118 return NULL;
1119
1120 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1122 return copy;
1123 }
1124 else {
1125 PyObject *w;
1126
1127 w = (PyObject*)_PyUnicode_New(length);
1128 if (w == NULL)
1129 return NULL;
1130 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1131 copy_length = Py_MIN(copy_length, length);
1132 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1133 copy_length * sizeof(wchar_t));
1134 return w;
1135 }
1136 }
1137
1138 /* We allocate one more byte to make sure the string is
1139 Ux0000 terminated; some code (e.g. new_identifier)
1140 relies on that.
1141
1142 XXX This allocator could further be enhanced by assuring that the
1143 free list never reduces its size below 1.
1144
1145 */
1146
1147 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1148 _PyUnicode_New(Py_ssize_t length)
1149 {
1150 PyUnicodeObject *unicode;
1151 size_t new_size;
1152
1153 /* Optimization for empty strings */
1154 if (length == 0 && unicode_empty != NULL) {
1155 Py_INCREF(unicode_empty);
1156 return (PyUnicodeObject*)unicode_empty;
1157 }
1158
1159 /* Ensure we won't overflow the size. */
1160 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1161 return (PyUnicodeObject *)PyErr_NoMemory();
1162 }
1163 if (length < 0) {
1164 PyErr_SetString(PyExc_SystemError,
1165 "Negative size passed to _PyUnicode_New");
1166 return NULL;
1167 }
1168
1169 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1170 if (unicode == NULL)
1171 return NULL;
1172 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1173
1174 _PyUnicode_WSTR_LENGTH(unicode) = length;
1175 _PyUnicode_HASH(unicode) = -1;
1176 _PyUnicode_STATE(unicode).interned = 0;
1177 _PyUnicode_STATE(unicode).kind = 0;
1178 _PyUnicode_STATE(unicode).compact = 0;
1179 _PyUnicode_STATE(unicode).ready = 0;
1180 _PyUnicode_STATE(unicode).ascii = 0;
1181 _PyUnicode_DATA_ANY(unicode) = NULL;
1182 _PyUnicode_LENGTH(unicode) = 0;
1183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1185
1186 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1187 if (!_PyUnicode_WSTR(unicode)) {
1188 Py_DECREF(unicode);
1189 PyErr_NoMemory();
1190 return NULL;
1191 }
1192
1193 /* Initialize the first element to guard against cases where
1194 * the caller fails before initializing str -- unicode_resize()
1195 * reads str[0], and the Keep-Alive optimization can keep memory
1196 * allocated for str alive across a call to unicode_dealloc(unicode).
1197 * We don't want unicode_resize to read uninitialized memory in
1198 * that case.
1199 */
1200 _PyUnicode_WSTR(unicode)[0] = 0;
1201 _PyUnicode_WSTR(unicode)[length] = 0;
1202
1203 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1204 return unicode;
1205 }
1206
1207 static const char*
unicode_kind_name(PyObject * unicode)1208 unicode_kind_name(PyObject *unicode)
1209 {
1210 /* don't check consistency: unicode_kind_name() is called from
1211 _PyUnicode_Dump() */
1212 if (!PyUnicode_IS_COMPACT(unicode))
1213 {
1214 if (!PyUnicode_IS_READY(unicode))
1215 return "wstr";
1216 switch (PyUnicode_KIND(unicode))
1217 {
1218 case PyUnicode_1BYTE_KIND:
1219 if (PyUnicode_IS_ASCII(unicode))
1220 return "legacy ascii";
1221 else
1222 return "legacy latin1";
1223 case PyUnicode_2BYTE_KIND:
1224 return "legacy UCS2";
1225 case PyUnicode_4BYTE_KIND:
1226 return "legacy UCS4";
1227 default:
1228 return "<legacy invalid kind>";
1229 }
1230 }
1231 assert(PyUnicode_IS_READY(unicode));
1232 switch (PyUnicode_KIND(unicode)) {
1233 case PyUnicode_1BYTE_KIND:
1234 if (PyUnicode_IS_ASCII(unicode))
1235 return "ascii";
1236 else
1237 return "latin1";
1238 case PyUnicode_2BYTE_KIND:
1239 return "UCS2";
1240 case PyUnicode_4BYTE_KIND:
1241 return "UCS4";
1242 default:
1243 return "<invalid compact kind>";
1244 }
1245 }
1246
1247 #ifdef Py_DEBUG
1248 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1249 char *_PyUnicode_utf8(void *unicode_raw){
1250 PyObject *unicode = _PyObject_CAST(unicode_raw);
1251 return PyUnicode_UTF8(unicode);
1252 }
1253
_PyUnicode_compact_data(void * unicode_raw)1254 void *_PyUnicode_compact_data(void *unicode_raw) {
1255 PyObject *unicode = _PyObject_CAST(unicode_raw);
1256 return _PyUnicode_COMPACT_DATA(unicode);
1257 }
_PyUnicode_data(void * unicode_raw)1258 void *_PyUnicode_data(void *unicode_raw) {
1259 PyObject *unicode = _PyObject_CAST(unicode_raw);
1260 printf("obj %p\n", (void*)unicode);
1261 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1262 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1263 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1264 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1265 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1266 return PyUnicode_DATA(unicode);
1267 }
1268
1269 void
_PyUnicode_Dump(PyObject * op)1270 _PyUnicode_Dump(PyObject *op)
1271 {
1272 PyASCIIObject *ascii = (PyASCIIObject *)op;
1273 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1274 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1275 void *data;
1276
1277 if (ascii->state.compact)
1278 {
1279 if (ascii->state.ascii)
1280 data = (ascii + 1);
1281 else
1282 data = (compact + 1);
1283 }
1284 else
1285 data = unicode->data.any;
1286 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1287 unicode_kind_name(op), ascii->length);
1288
1289 if (ascii->wstr == data)
1290 printf("shared ");
1291 printf("wstr=%p", (void *)ascii->wstr);
1292
1293 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1294 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1295 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1296 printf("shared ");
1297 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1298 (void *)compact->utf8, compact->utf8_length);
1299 }
1300 printf(", data=%p\n", data);
1301 }
1302 #endif
1303
1304 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1305 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1306 {
1307 PyObject *obj;
1308 PyCompactUnicodeObject *unicode;
1309 void *data;
1310 enum PyUnicode_Kind kind;
1311 int is_sharing, is_ascii;
1312 Py_ssize_t char_size;
1313 Py_ssize_t struct_size;
1314
1315 /* Optimization for empty strings */
1316 if (size == 0 && unicode_empty != NULL) {
1317 Py_INCREF(unicode_empty);
1318 return unicode_empty;
1319 }
1320
1321 is_ascii = 0;
1322 is_sharing = 0;
1323 struct_size = sizeof(PyCompactUnicodeObject);
1324 if (maxchar < 128) {
1325 kind = PyUnicode_1BYTE_KIND;
1326 char_size = 1;
1327 is_ascii = 1;
1328 struct_size = sizeof(PyASCIIObject);
1329 }
1330 else if (maxchar < 256) {
1331 kind = PyUnicode_1BYTE_KIND;
1332 char_size = 1;
1333 }
1334 else if (maxchar < 65536) {
1335 kind = PyUnicode_2BYTE_KIND;
1336 char_size = 2;
1337 if (sizeof(wchar_t) == 2)
1338 is_sharing = 1;
1339 }
1340 else {
1341 if (maxchar > MAX_UNICODE) {
1342 PyErr_SetString(PyExc_SystemError,
1343 "invalid maximum character passed to PyUnicode_New");
1344 return NULL;
1345 }
1346 kind = PyUnicode_4BYTE_KIND;
1347 char_size = 4;
1348 if (sizeof(wchar_t) == 4)
1349 is_sharing = 1;
1350 }
1351
1352 /* Ensure we won't overflow the size. */
1353 if (size < 0) {
1354 PyErr_SetString(PyExc_SystemError,
1355 "Negative size passed to PyUnicode_New");
1356 return NULL;
1357 }
1358 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1359 return PyErr_NoMemory();
1360
1361 /* Duplicated allocation code from _PyObject_New() instead of a call to
1362 * PyObject_New() so we are able to allocate space for the object and
1363 * it's data buffer.
1364 */
1365 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1366 if (obj == NULL)
1367 return PyErr_NoMemory();
1368 obj = PyObject_INIT(obj, &PyUnicode_Type);
1369 if (obj == NULL)
1370 return NULL;
1371
1372 unicode = (PyCompactUnicodeObject *)obj;
1373 if (is_ascii)
1374 data = ((PyASCIIObject*)obj) + 1;
1375 else
1376 data = unicode + 1;
1377 _PyUnicode_LENGTH(unicode) = size;
1378 _PyUnicode_HASH(unicode) = -1;
1379 _PyUnicode_STATE(unicode).interned = 0;
1380 _PyUnicode_STATE(unicode).kind = kind;
1381 _PyUnicode_STATE(unicode).compact = 1;
1382 _PyUnicode_STATE(unicode).ready = 1;
1383 _PyUnicode_STATE(unicode).ascii = is_ascii;
1384 if (is_ascii) {
1385 ((char*)data)[size] = 0;
1386 _PyUnicode_WSTR(unicode) = NULL;
1387 }
1388 else if (kind == PyUnicode_1BYTE_KIND) {
1389 ((char*)data)[size] = 0;
1390 _PyUnicode_WSTR(unicode) = NULL;
1391 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1392 unicode->utf8 = NULL;
1393 unicode->utf8_length = 0;
1394 }
1395 else {
1396 unicode->utf8 = NULL;
1397 unicode->utf8_length = 0;
1398 if (kind == PyUnicode_2BYTE_KIND)
1399 ((Py_UCS2*)data)[size] = 0;
1400 else /* kind == PyUnicode_4BYTE_KIND */
1401 ((Py_UCS4*)data)[size] = 0;
1402 if (is_sharing) {
1403 _PyUnicode_WSTR_LENGTH(unicode) = size;
1404 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1405 }
1406 else {
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 _PyUnicode_WSTR(unicode) = NULL;
1409 }
1410 }
1411 #ifdef Py_DEBUG
1412 unicode_fill_invalid((PyObject*)unicode, 0);
1413 #endif
1414 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1415 return obj;
1416 }
1417
1418 #if SIZEOF_WCHAR_T == 2
1419 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1420 will decode surrogate pairs, the other conversions are implemented as macros
1421 for efficiency.
1422
1423 This function assumes that unicode can hold one more code point than wstr
1424 characters for a terminating null character. */
1425 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1426 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1427 PyObject *unicode)
1428 {
1429 const wchar_t *iter;
1430 Py_UCS4 *ucs4_out;
1431
1432 assert(unicode != NULL);
1433 assert(_PyUnicode_CHECK(unicode));
1434 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1435 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1436
1437 for (iter = begin; iter < end; ) {
1438 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1439 _PyUnicode_GET_LENGTH(unicode)));
1440 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1441 && (iter+1) < end
1442 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1443 {
1444 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1445 iter += 2;
1446 }
1447 else {
1448 *ucs4_out++ = *iter;
1449 iter++;
1450 }
1451 }
1452 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1453 _PyUnicode_GET_LENGTH(unicode)));
1454
1455 }
1456 #endif
1457
1458 static int
unicode_check_modifiable(PyObject * unicode)1459 unicode_check_modifiable(PyObject *unicode)
1460 {
1461 if (!unicode_modifiable(unicode)) {
1462 PyErr_SetString(PyExc_SystemError,
1463 "Cannot modify a string currently used");
1464 return -1;
1465 }
1466 return 0;
1467 }
1468
1469 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1470 _copy_characters(PyObject *to, Py_ssize_t to_start,
1471 PyObject *from, Py_ssize_t from_start,
1472 Py_ssize_t how_many, int check_maxchar)
1473 {
1474 unsigned int from_kind, to_kind;
1475 void *from_data, *to_data;
1476
1477 assert(0 <= how_many);
1478 assert(0 <= from_start);
1479 assert(0 <= to_start);
1480 assert(PyUnicode_Check(from));
1481 assert(PyUnicode_IS_READY(from));
1482 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1483
1484 assert(PyUnicode_Check(to));
1485 assert(PyUnicode_IS_READY(to));
1486 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1487
1488 if (how_many == 0)
1489 return 0;
1490
1491 from_kind = PyUnicode_KIND(from);
1492 from_data = PyUnicode_DATA(from);
1493 to_kind = PyUnicode_KIND(to);
1494 to_data = PyUnicode_DATA(to);
1495
1496 #ifdef Py_DEBUG
1497 if (!check_maxchar
1498 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1499 {
1500 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501 Py_UCS4 ch;
1502 Py_ssize_t i;
1503 for (i=0; i < how_many; i++) {
1504 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505 assert(ch <= to_maxchar);
1506 }
1507 }
1508 #endif
1509
1510 if (from_kind == to_kind) {
1511 if (check_maxchar
1512 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1513 {
1514 /* Writing Latin-1 characters into an ASCII string requires to
1515 check that all written characters are pure ASCII */
1516 Py_UCS4 max_char;
1517 max_char = ucs1lib_find_max_char(from_data,
1518 (Py_UCS1*)from_data + how_many);
1519 if (max_char >= 128)
1520 return -1;
1521 }
1522 memcpy((char*)to_data + to_kind * to_start,
1523 (char*)from_data + from_kind * from_start,
1524 to_kind * how_many);
1525 }
1526 else if (from_kind == PyUnicode_1BYTE_KIND
1527 && to_kind == PyUnicode_2BYTE_KIND)
1528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS1, Py_UCS2,
1531 PyUnicode_1BYTE_DATA(from) + from_start,
1532 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_2BYTE_DATA(to) + to_start
1534 );
1535 }
1536 else if (from_kind == PyUnicode_1BYTE_KIND
1537 && to_kind == PyUnicode_4BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS1, Py_UCS4,
1541 PyUnicode_1BYTE_DATA(from) + from_start,
1542 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_4BYTE_DATA(to) + to_start
1544 );
1545 }
1546 else if (from_kind == PyUnicode_2BYTE_KIND
1547 && to_kind == PyUnicode_4BYTE_KIND)
1548 {
1549 _PyUnicode_CONVERT_BYTES(
1550 Py_UCS2, Py_UCS4,
1551 PyUnicode_2BYTE_DATA(from) + from_start,
1552 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1553 PyUnicode_4BYTE_DATA(to) + to_start
1554 );
1555 }
1556 else {
1557 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1558
1559 if (!check_maxchar) {
1560 if (from_kind == PyUnicode_2BYTE_KIND
1561 && to_kind == PyUnicode_1BYTE_KIND)
1562 {
1563 _PyUnicode_CONVERT_BYTES(
1564 Py_UCS2, Py_UCS1,
1565 PyUnicode_2BYTE_DATA(from) + from_start,
1566 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1567 PyUnicode_1BYTE_DATA(to) + to_start
1568 );
1569 }
1570 else if (from_kind == PyUnicode_4BYTE_KIND
1571 && to_kind == PyUnicode_1BYTE_KIND)
1572 {
1573 _PyUnicode_CONVERT_BYTES(
1574 Py_UCS4, Py_UCS1,
1575 PyUnicode_4BYTE_DATA(from) + from_start,
1576 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1577 PyUnicode_1BYTE_DATA(to) + to_start
1578 );
1579 }
1580 else if (from_kind == PyUnicode_4BYTE_KIND
1581 && to_kind == PyUnicode_2BYTE_KIND)
1582 {
1583 _PyUnicode_CONVERT_BYTES(
1584 Py_UCS4, Py_UCS2,
1585 PyUnicode_4BYTE_DATA(from) + from_start,
1586 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1587 PyUnicode_2BYTE_DATA(to) + to_start
1588 );
1589 }
1590 else {
1591 Py_UNREACHABLE();
1592 }
1593 }
1594 else {
1595 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1596 Py_UCS4 ch;
1597 Py_ssize_t i;
1598
1599 for (i=0; i < how_many; i++) {
1600 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1601 if (ch > to_maxchar)
1602 return -1;
1603 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1604 }
1605 }
1606 }
1607 return 0;
1608 }
1609
1610 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1611 _PyUnicode_FastCopyCharacters(
1612 PyObject *to, Py_ssize_t to_start,
1613 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1614 {
1615 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1616 }
1617
1618 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1619 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1620 PyObject *from, Py_ssize_t from_start,
1621 Py_ssize_t how_many)
1622 {
1623 int err;
1624
1625 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1626 PyErr_BadInternalCall();
1627 return -1;
1628 }
1629
1630 if (PyUnicode_READY(from) == -1)
1631 return -1;
1632 if (PyUnicode_READY(to) == -1)
1633 return -1;
1634
1635 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1636 PyErr_SetString(PyExc_IndexError, "string index out of range");
1637 return -1;
1638 }
1639 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1640 PyErr_SetString(PyExc_IndexError, "string index out of range");
1641 return -1;
1642 }
1643 if (how_many < 0) {
1644 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1645 return -1;
1646 }
1647 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1648 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1649 PyErr_Format(PyExc_SystemError,
1650 "Cannot write %zi characters at %zi "
1651 "in a string of %zi characters",
1652 how_many, to_start, PyUnicode_GET_LENGTH(to));
1653 return -1;
1654 }
1655
1656 if (how_many == 0)
1657 return 0;
1658
1659 if (unicode_check_modifiable(to))
1660 return -1;
1661
1662 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1663 if (err) {
1664 PyErr_Format(PyExc_SystemError,
1665 "Cannot copy %s characters "
1666 "into a string of %s characters",
1667 unicode_kind_name(from),
1668 unicode_kind_name(to));
1669 return -1;
1670 }
1671 return how_many;
1672 }
1673
1674 /* Find the maximum code point and count the number of surrogate pairs so a
1675 correct string length can be computed before converting a string to UCS4.
1676 This function counts single surrogates as a character and not as a pair.
1677
1678 Return 0 on success, or -1 on error. */
1679 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1680 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1681 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1682 {
1683 const wchar_t *iter;
1684 Py_UCS4 ch;
1685
1686 assert(num_surrogates != NULL && maxchar != NULL);
1687 *num_surrogates = 0;
1688 *maxchar = 0;
1689
1690 for (iter = begin; iter < end; ) {
1691 #if SIZEOF_WCHAR_T == 2
1692 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1693 && (iter+1) < end
1694 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1695 {
1696 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1697 ++(*num_surrogates);
1698 iter += 2;
1699 }
1700 else
1701 #endif
1702 {
1703 ch = *iter;
1704 iter++;
1705 }
1706 if (ch > *maxchar) {
1707 *maxchar = ch;
1708 if (*maxchar > MAX_UNICODE) {
1709 PyErr_Format(PyExc_ValueError,
1710 "character U+%x is not in range [U+0000; U+10ffff]",
1711 ch);
1712 return -1;
1713 }
1714 }
1715 }
1716 return 0;
1717 }
1718
/* Convert a legacy string that only has a wstr buffer into its canonical
   ("ready") form.

   The wstr buffer is scanned for its largest code point, then the
   narrowest of the 1-, 2- or 4-byte representations that can hold it is
   installed as the data buffer.  When wchar_t has the same width as the
   chosen representation the wstr buffer itself becomes the data buffer;
   otherwise a new buffer is allocated, filled, and the wstr buffer is
   freed.

   Returns 0 on success, or -1 with an exception set (ValueError from
   find_maxchar_surrogates(), MemoryError on allocation failure). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One past the last wstr element. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* One byte per character: narrow into a fresh buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wide buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into one character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1847
1848 static void
unicode_dealloc(PyObject * unicode)1849 unicode_dealloc(PyObject *unicode)
1850 {
1851 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1852 case SSTATE_NOT_INTERNED:
1853 break;
1854
1855 case SSTATE_INTERNED_MORTAL:
1856 /* revive dead object temporarily for DelItem */
1857 Py_REFCNT(unicode) = 3;
1858 if (PyDict_DelItem(interned, unicode) != 0)
1859 Py_FatalError(
1860 "deletion of interned string failed");
1861 break;
1862
1863 case SSTATE_INTERNED_IMMORTAL:
1864 Py_FatalError("Immortal interned string died.");
1865 /* fall through */
1866
1867 default:
1868 Py_FatalError("Inconsistent interned string state.");
1869 }
1870
1871 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1872 PyObject_DEL(_PyUnicode_WSTR(unicode));
1873 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1874 PyObject_DEL(_PyUnicode_UTF8(unicode));
1875 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1876 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1877
1878 Py_TYPE(unicode)->tp_free(unicode);
1879 }
1880
1881 #ifdef Py_DEBUG
1882 static int
unicode_is_singleton(PyObject * unicode)1883 unicode_is_singleton(PyObject *unicode)
1884 {
1885 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1886 if (unicode == unicode_empty)
1887 return 1;
1888 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1889 {
1890 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1891 if (ch < 256 && unicode_latin1[ch] == unicode)
1892 return 1;
1893 }
1894 return 0;
1895 }
1896 #endif
1897
1898 static int
unicode_modifiable(PyObject * unicode)1899 unicode_modifiable(PyObject *unicode)
1900 {
1901 assert(_PyUnicode_CHECK(unicode));
1902 if (Py_REFCNT(unicode) != 1)
1903 return 0;
1904 if (_PyUnicode_HASH(unicode) != -1)
1905 return 0;
1906 if (PyUnicode_CHECK_INTERNED(unicode))
1907 return 0;
1908 if (!PyUnicode_CheckExact(unicode))
1909 return 0;
1910 #ifdef Py_DEBUG
1911 /* singleton refcount is greater than 1 */
1912 assert(!unicode_is_singleton(unicode));
1913 #endif
1914 return 1;
1915 }
1916
1917 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1918 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1919 {
1920 PyObject *unicode;
1921 Py_ssize_t old_length;
1922
1923 assert(p_unicode != NULL);
1924 unicode = *p_unicode;
1925
1926 assert(unicode != NULL);
1927 assert(PyUnicode_Check(unicode));
1928 assert(0 <= length);
1929
1930 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1931 old_length = PyUnicode_WSTR_LENGTH(unicode);
1932 else
1933 old_length = PyUnicode_GET_LENGTH(unicode);
1934 if (old_length == length)
1935 return 0;
1936
1937 if (length == 0) {
1938 _Py_INCREF_UNICODE_EMPTY();
1939 if (!unicode_empty)
1940 return -1;
1941 Py_SETREF(*p_unicode, unicode_empty);
1942 return 0;
1943 }
1944
1945 if (!unicode_modifiable(unicode)) {
1946 PyObject *copy = resize_copy(unicode, length);
1947 if (copy == NULL)
1948 return -1;
1949 Py_SETREF(*p_unicode, copy);
1950 return 0;
1951 }
1952
1953 if (PyUnicode_IS_COMPACT(unicode)) {
1954 PyObject *new_unicode = resize_compact(unicode, length);
1955 if (new_unicode == NULL)
1956 return -1;
1957 *p_unicode = new_unicode;
1958 return 0;
1959 }
1960 return resize_inplace(unicode, length);
1961 }
1962
1963 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1964 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1965 {
1966 PyObject *unicode;
1967 if (p_unicode == NULL) {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 unicode = *p_unicode;
1972 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1973 {
1974 PyErr_BadInternalCall();
1975 return -1;
1976 }
1977 return unicode_resize(p_unicode, length);
1978 }
1979
1980 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1981
1982 WARNING: The function doesn't copy the terminating null character and
1983 doesn't check the maximum character (may write a latin1 character in an
1984 ASCII string). */
1985 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1986 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1987 const char *str, Py_ssize_t len)
1988 {
1989 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1990 void *data = PyUnicode_DATA(unicode);
1991 const char *end = str + len;
1992
1993 switch (kind) {
1994 case PyUnicode_1BYTE_KIND: {
1995 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1996 #ifdef Py_DEBUG
1997 if (PyUnicode_IS_ASCII(unicode)) {
1998 Py_UCS4 maxchar = ucs1lib_find_max_char(
1999 (const Py_UCS1*)str,
2000 (const Py_UCS1*)str + len);
2001 assert(maxchar < 128);
2002 }
2003 #endif
2004 memcpy((char *) data + index, str, len);
2005 break;
2006 }
2007 case PyUnicode_2BYTE_KIND: {
2008 Py_UCS2 *start = (Py_UCS2 *)data + index;
2009 Py_UCS2 *ucs2 = start;
2010 assert(index <= PyUnicode_GET_LENGTH(unicode));
2011
2012 for (; str < end; ++ucs2, ++str)
2013 *ucs2 = (Py_UCS2)*str;
2014
2015 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2016 break;
2017 }
2018 default: {
2019 Py_UCS4 *start = (Py_UCS4 *)data + index;
2020 Py_UCS4 *ucs4 = start;
2021 assert(kind == PyUnicode_4BYTE_KIND);
2022 assert(index <= PyUnicode_GET_LENGTH(unicode));
2023
2024 for (; str < end; ++ucs4, ++str)
2025 *ucs4 = (Py_UCS4)*str;
2026
2027 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2028 }
2029 }
2030 }
2031
2032 static PyObject*
get_latin1_char(unsigned char ch)2033 get_latin1_char(unsigned char ch)
2034 {
2035 PyObject *unicode = unicode_latin1[ch];
2036 if (!unicode) {
2037 unicode = PyUnicode_New(1, ch);
2038 if (!unicode)
2039 return NULL;
2040 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2041 assert(_PyUnicode_CheckConsistency(unicode, 1));
2042 unicode_latin1[ch] = unicode;
2043 }
2044 Py_INCREF(unicode);
2045 return unicode;
2046 }
2047
2048 static PyObject*
unicode_char(Py_UCS4 ch)2049 unicode_char(Py_UCS4 ch)
2050 {
2051 PyObject *unicode;
2052
2053 assert(ch <= MAX_UNICODE);
2054
2055 if (ch < 256)
2056 return get_latin1_char(ch);
2057
2058 unicode = PyUnicode_New(1, ch);
2059 if (unicode == NULL)
2060 return NULL;
2061
2062 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2063 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2064 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2065 } else {
2066 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2067 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2068 }
2069 assert(_PyUnicode_CheckConsistency(unicode, 1));
2070 return unicode;
2071 }
2072
2073 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2074 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2075 {
2076 if (u == NULL)
2077 return (PyObject*)_PyUnicode_New(size);
2078
2079 if (size < 0) {
2080 PyErr_BadInternalCall();
2081 return NULL;
2082 }
2083
2084 return PyUnicode_FromWideChar(u, size);
2085 }
2086
2087 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2088 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2089 {
2090 PyObject *unicode;
2091 Py_UCS4 maxchar = 0;
2092 Py_ssize_t num_surrogates;
2093
2094 if (u == NULL && size != 0) {
2095 PyErr_BadInternalCall();
2096 return NULL;
2097 }
2098
2099 if (size == -1) {
2100 size = wcslen(u);
2101 }
2102
2103 /* If the Unicode data is known at construction time, we can apply
2104 some optimizations which share commonly used objects. */
2105
2106 /* Optimization for empty strings */
2107 if (size == 0)
2108 _Py_RETURN_UNICODE_EMPTY();
2109
2110 /* Single character Unicode objects in the Latin-1 range are
2111 shared when using this constructor */
2112 if (size == 1 && (Py_UCS4)*u < 256)
2113 return get_latin1_char((unsigned char)*u);
2114
2115 /* If not empty and not single character, copy the Unicode data
2116 into the new object */
2117 if (find_maxchar_surrogates(u, u + size,
2118 &maxchar, &num_surrogates) == -1)
2119 return NULL;
2120
2121 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2122 if (!unicode)
2123 return NULL;
2124
2125 switch (PyUnicode_KIND(unicode)) {
2126 case PyUnicode_1BYTE_KIND:
2127 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2128 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2129 break;
2130 case PyUnicode_2BYTE_KIND:
2131 #if Py_UNICODE_SIZE == 2
2132 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2133 #else
2134 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2135 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2136 #endif
2137 break;
2138 case PyUnicode_4BYTE_KIND:
2139 #if SIZEOF_WCHAR_T == 2
2140 /* This is the only case which has to process surrogates, thus
2141 a simple copy loop is not enough and we need a function. */
2142 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2143 #else
2144 assert(num_surrogates == 0);
2145 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2146 #endif
2147 break;
2148 default:
2149 Py_UNREACHABLE();
2150 }
2151
2152 return unicode_result(unicode);
2153 }
2154
2155 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2156 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2157 {
2158 if (size < 0) {
2159 PyErr_SetString(PyExc_SystemError,
2160 "Negative size passed to PyUnicode_FromStringAndSize");
2161 return NULL;
2162 }
2163 if (u != NULL)
2164 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2165 else
2166 return (PyObject *)_PyUnicode_New(size);
2167 }
2168
2169 PyObject *
PyUnicode_FromString(const char * u)2170 PyUnicode_FromString(const char *u)
2171 {
2172 size_t size = strlen(u);
2173 if (size > PY_SSIZE_T_MAX) {
2174 PyErr_SetString(PyExc_OverflowError, "input too long");
2175 return NULL;
2176 }
2177 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2178 }
2179
2180 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2181 _PyUnicode_FromId(_Py_Identifier *id)
2182 {
2183 if (!id->object) {
2184 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2185 strlen(id->string),
2186 NULL, NULL);
2187 if (!id->object)
2188 return NULL;
2189 PyUnicode_InternInPlace(&id->object);
2190 assert(!id->next);
2191 id->next = static_strings;
2192 static_strings = id;
2193 }
2194 return id->object;
2195 }
2196
2197 void
_PyUnicode_ClearStaticStrings()2198 _PyUnicode_ClearStaticStrings()
2199 {
2200 _Py_Identifier *tmp, *s = static_strings;
2201 while (s) {
2202 Py_CLEAR(s->object);
2203 tmp = s->next;
2204 s->next = NULL;
2205 s = tmp;
2206 }
2207 static_strings = NULL;
2208 }
2209
2210 /* Internal function, doesn't check maximum character */
2211
2212 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2213 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2214 {
2215 const unsigned char *s = (const unsigned char *)buffer;
2216 PyObject *unicode;
2217 if (size == 1) {
2218 #ifdef Py_DEBUG
2219 assert((unsigned char)s[0] < 128);
2220 #endif
2221 return get_latin1_char(s[0]);
2222 }
2223 unicode = PyUnicode_New(size, 127);
2224 if (!unicode)
2225 return NULL;
2226 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2227 assert(_PyUnicode_CheckConsistency(unicode, 1));
2228 return unicode;
2229 }
2230
2231 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2232 kind_maxchar_limit(unsigned int kind)
2233 {
2234 switch (kind) {
2235 case PyUnicode_1BYTE_KIND:
2236 return 0x80;
2237 case PyUnicode_2BYTE_KIND:
2238 return 0x100;
2239 case PyUnicode_4BYTE_KIND:
2240 return 0x10000;
2241 default:
2242 Py_UNREACHABLE();
2243 }
2244 }
2245
2246 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2247 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2248 {
2249 PyObject *res;
2250 unsigned char max_char;
2251
2252 if (size == 0)
2253 _Py_RETURN_UNICODE_EMPTY();
2254 assert(size > 0);
2255 if (size == 1)
2256 return get_latin1_char(u[0]);
2257
2258 max_char = ucs1lib_find_max_char(u, u + size);
2259 res = PyUnicode_New(size, max_char);
2260 if (!res)
2261 return NULL;
2262 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2263 assert(_PyUnicode_CheckConsistency(res, 1));
2264 return res;
2265 }
2266
2267 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2268 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2269 {
2270 PyObject *res;
2271 Py_UCS2 max_char;
2272
2273 if (size == 0)
2274 _Py_RETURN_UNICODE_EMPTY();
2275 assert(size > 0);
2276 if (size == 1)
2277 return unicode_char(u[0]);
2278
2279 max_char = ucs2lib_find_max_char(u, u + size);
2280 res = PyUnicode_New(size, max_char);
2281 if (!res)
2282 return NULL;
2283 if (max_char >= 256)
2284 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2285 else {
2286 _PyUnicode_CONVERT_BYTES(
2287 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2288 }
2289 assert(_PyUnicode_CheckConsistency(res, 1));
2290 return res;
2291 }
2292
2293 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2294 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2295 {
2296 PyObject *res;
2297 Py_UCS4 max_char;
2298
2299 if (size == 0)
2300 _Py_RETURN_UNICODE_EMPTY();
2301 assert(size > 0);
2302 if (size == 1)
2303 return unicode_char(u[0]);
2304
2305 max_char = ucs4lib_find_max_char(u, u + size);
2306 res = PyUnicode_New(size, max_char);
2307 if (!res)
2308 return NULL;
2309 if (max_char < 256)
2310 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2311 PyUnicode_1BYTE_DATA(res));
2312 else if (max_char < 0x10000)
2313 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2314 PyUnicode_2BYTE_DATA(res));
2315 else
2316 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2317 assert(_PyUnicode_CheckConsistency(res, 1));
2318 return res;
2319 }
2320
2321 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2322 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2323 {
2324 if (size < 0) {
2325 PyErr_SetString(PyExc_ValueError, "size must be positive");
2326 return NULL;
2327 }
2328 switch (kind) {
2329 case PyUnicode_1BYTE_KIND:
2330 return _PyUnicode_FromUCS1(buffer, size);
2331 case PyUnicode_2BYTE_KIND:
2332 return _PyUnicode_FromUCS2(buffer, size);
2333 case PyUnicode_4BYTE_KIND:
2334 return _PyUnicode_FromUCS4(buffer, size);
2335 default:
2336 PyErr_SetString(PyExc_SystemError, "invalid kind");
2337 return NULL;
2338 }
2339 }
2340
2341 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2342 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2343 {
2344 enum PyUnicode_Kind kind;
2345 void *startptr, *endptr;
2346
2347 assert(PyUnicode_IS_READY(unicode));
2348 assert(0 <= start);
2349 assert(end <= PyUnicode_GET_LENGTH(unicode));
2350 assert(start <= end);
2351
2352 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2353 return PyUnicode_MAX_CHAR_VALUE(unicode);
2354
2355 if (start == end)
2356 return 127;
2357
2358 if (PyUnicode_IS_ASCII(unicode))
2359 return 127;
2360
2361 kind = PyUnicode_KIND(unicode);
2362 startptr = PyUnicode_DATA(unicode);
2363 endptr = (char *)startptr + end * kind;
2364 startptr = (char *)startptr + start * kind;
2365 switch(kind) {
2366 case PyUnicode_1BYTE_KIND:
2367 return ucs1lib_find_max_char(startptr, endptr);
2368 case PyUnicode_2BYTE_KIND:
2369 return ucs2lib_find_max_char(startptr, endptr);
2370 case PyUnicode_4BYTE_KIND:
2371 return ucs4lib_find_max_char(startptr, endptr);
2372 default:
2373 Py_UNREACHABLE();
2374 }
2375 }
2376
2377 /* Ensure that a string uses the most efficient storage, if it is not the
2378 case: create a new string with of the right kind. Write NULL into *p_unicode
2379 on error. */
2380 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2381 unicode_adjust_maxchar(PyObject **p_unicode)
2382 {
2383 PyObject *unicode, *copy;
2384 Py_UCS4 max_char;
2385 Py_ssize_t len;
2386 unsigned int kind;
2387
2388 assert(p_unicode != NULL);
2389 unicode = *p_unicode;
2390 assert(PyUnicode_IS_READY(unicode));
2391 if (PyUnicode_IS_ASCII(unicode))
2392 return;
2393
2394 len = PyUnicode_GET_LENGTH(unicode);
2395 kind = PyUnicode_KIND(unicode);
2396 if (kind == PyUnicode_1BYTE_KIND) {
2397 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2398 max_char = ucs1lib_find_max_char(u, u + len);
2399 if (max_char >= 128)
2400 return;
2401 }
2402 else if (kind == PyUnicode_2BYTE_KIND) {
2403 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2404 max_char = ucs2lib_find_max_char(u, u + len);
2405 if (max_char >= 256)
2406 return;
2407 }
2408 else {
2409 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2410 assert(kind == PyUnicode_4BYTE_KIND);
2411 max_char = ucs4lib_find_max_char(u, u + len);
2412 if (max_char >= 0x10000)
2413 return;
2414 }
2415 copy = PyUnicode_New(len, max_char);
2416 if (copy != NULL)
2417 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2418 Py_DECREF(unicode);
2419 *p_unicode = copy;
2420 }
2421
2422 PyObject*
_PyUnicode_Copy(PyObject * unicode)2423 _PyUnicode_Copy(PyObject *unicode)
2424 {
2425 Py_ssize_t length;
2426 PyObject *copy;
2427
2428 if (!PyUnicode_Check(unicode)) {
2429 PyErr_BadInternalCall();
2430 return NULL;
2431 }
2432 if (PyUnicode_READY(unicode) == -1)
2433 return NULL;
2434
2435 length = PyUnicode_GET_LENGTH(unicode);
2436 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2437 if (!copy)
2438 return NULL;
2439 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2440
2441 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2442 length * PyUnicode_KIND(unicode));
2443 assert(_PyUnicode_CheckConsistency(copy, 1));
2444 return copy;
2445 }
2446
2447
2448 /* Widen Unicode objects to larger buffers. Don't write terminating null
2449 character. Return NULL on error. */
2450
2451 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2452 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2453 {
2454 Py_ssize_t len;
2455 void *result;
2456 unsigned int skind;
2457
2458 if (PyUnicode_READY(s) == -1)
2459 return NULL;
2460
2461 len = PyUnicode_GET_LENGTH(s);
2462 skind = PyUnicode_KIND(s);
2463 if (skind >= kind) {
2464 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2465 return NULL;
2466 }
2467 switch (kind) {
2468 case PyUnicode_2BYTE_KIND:
2469 result = PyMem_New(Py_UCS2, len);
2470 if (!result)
2471 return PyErr_NoMemory();
2472 assert(skind == PyUnicode_1BYTE_KIND);
2473 _PyUnicode_CONVERT_BYTES(
2474 Py_UCS1, Py_UCS2,
2475 PyUnicode_1BYTE_DATA(s),
2476 PyUnicode_1BYTE_DATA(s) + len,
2477 result);
2478 return result;
2479 case PyUnicode_4BYTE_KIND:
2480 result = PyMem_New(Py_UCS4, len);
2481 if (!result)
2482 return PyErr_NoMemory();
2483 if (skind == PyUnicode_2BYTE_KIND) {
2484 _PyUnicode_CONVERT_BYTES(
2485 Py_UCS2, Py_UCS4,
2486 PyUnicode_2BYTE_DATA(s),
2487 PyUnicode_2BYTE_DATA(s) + len,
2488 result);
2489 }
2490 else {
2491 assert(skind == PyUnicode_1BYTE_KIND);
2492 _PyUnicode_CONVERT_BYTES(
2493 Py_UCS1, Py_UCS4,
2494 PyUnicode_1BYTE_DATA(s),
2495 PyUnicode_1BYTE_DATA(s) + len,
2496 result);
2497 }
2498 return result;
2499 default:
2500 break;
2501 }
2502 PyErr_SetString(PyExc_SystemError, "invalid kind");
2503 return NULL;
2504 }
2505
2506 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2507 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2508 int copy_null)
2509 {
2510 int kind;
2511 void *data;
2512 Py_ssize_t len, targetlen;
2513 if (PyUnicode_READY(string) == -1)
2514 return NULL;
2515 kind = PyUnicode_KIND(string);
2516 data = PyUnicode_DATA(string);
2517 len = PyUnicode_GET_LENGTH(string);
2518 targetlen = len;
2519 if (copy_null)
2520 targetlen++;
2521 if (!target) {
2522 target = PyMem_New(Py_UCS4, targetlen);
2523 if (!target) {
2524 PyErr_NoMemory();
2525 return NULL;
2526 }
2527 }
2528 else {
2529 if (targetsize < targetlen) {
2530 PyErr_Format(PyExc_SystemError,
2531 "string is longer than the buffer");
2532 if (copy_null && 0 < targetsize)
2533 target[0] = 0;
2534 return NULL;
2535 }
2536 }
2537 if (kind == PyUnicode_1BYTE_KIND) {
2538 Py_UCS1 *start = (Py_UCS1 *) data;
2539 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2540 }
2541 else if (kind == PyUnicode_2BYTE_KIND) {
2542 Py_UCS2 *start = (Py_UCS2 *) data;
2543 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2544 }
2545 else {
2546 assert(kind == PyUnicode_4BYTE_KIND);
2547 memcpy(target, data, len * sizeof(Py_UCS4));
2548 }
2549 if (copy_null)
2550 target[len] = 0;
2551 return target;
2552 }
2553
2554 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2555 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2556 int copy_null)
2557 {
2558 if (target == NULL || targetsize < 0) {
2559 PyErr_BadInternalCall();
2560 return NULL;
2561 }
2562 return as_ucs4(string, target, targetsize, copy_null);
2563 }
2564
2565 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2566 PyUnicode_AsUCS4Copy(PyObject *string)
2567 {
2568 return as_ucs4(string, NULL, 0, 1);
2569 }
2570
/* Size of the scratch buffers used to format %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits;
   53/22 is an upper bound for log10(256).  Two more characters are added
   on top of the digits (one is for the sign). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG) * 53 - 1) / 22 + 2)
2575
2576 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2577 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2578 Py_ssize_t width, Py_ssize_t precision)
2579 {
2580 Py_ssize_t length, fill, arglen;
2581 Py_UCS4 maxchar;
2582
2583 if (PyUnicode_READY(str) == -1)
2584 return -1;
2585
2586 length = PyUnicode_GET_LENGTH(str);
2587 if ((precision == -1 || precision >= length)
2588 && width <= length)
2589 return _PyUnicodeWriter_WriteStr(writer, str);
2590
2591 if (precision != -1)
2592 length = Py_MIN(precision, length);
2593
2594 arglen = Py_MAX(length, width);
2595 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2596 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2597 else
2598 maxchar = writer->maxchar;
2599
2600 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2601 return -1;
2602
2603 if (width > length) {
2604 fill = width - length;
2605 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2606 return -1;
2607 writer->pos += fill;
2608 }
2609
2610 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2611 str, 0, length);
2612 writer->pos += length;
2613 return 0;
2614 }
2615
2616 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2617 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2618 Py_ssize_t width, Py_ssize_t precision)
2619 {
2620 /* UTF-8 */
2621 Py_ssize_t length;
2622 PyObject *unicode;
2623 int res;
2624
2625 if (precision == -1) {
2626 length = strlen(str);
2627 }
2628 else {
2629 length = 0;
2630 while (length < precision && str[length]) {
2631 length++;
2632 }
2633 }
2634 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2635 if (unicode == NULL)
2636 return -1;
2637
2638 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2639 Py_DECREF(unicode);
2640 return res;
2641 }
2642
2643 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2644 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2645 const char *f, va_list *vargs)
2646 {
2647 const char *p;
2648 Py_ssize_t len;
2649 int zeropad;
2650 Py_ssize_t width;
2651 Py_ssize_t precision;
2652 int longflag;
2653 int longlongflag;
2654 int size_tflag;
2655 Py_ssize_t fill;
2656
2657 p = f;
2658 f++;
2659 zeropad = 0;
2660 if (*f == '0') {
2661 zeropad = 1;
2662 f++;
2663 }
2664
2665 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2666 width = -1;
2667 if (Py_ISDIGIT((unsigned)*f)) {
2668 width = *f - '0';
2669 f++;
2670 while (Py_ISDIGIT((unsigned)*f)) {
2671 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2672 PyErr_SetString(PyExc_ValueError,
2673 "width too big");
2674 return NULL;
2675 }
2676 width = (width * 10) + (*f - '0');
2677 f++;
2678 }
2679 }
2680 precision = -1;
2681 if (*f == '.') {
2682 f++;
2683 if (Py_ISDIGIT((unsigned)*f)) {
2684 precision = (*f - '0');
2685 f++;
2686 while (Py_ISDIGIT((unsigned)*f)) {
2687 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2688 PyErr_SetString(PyExc_ValueError,
2689 "precision too big");
2690 return NULL;
2691 }
2692 precision = (precision * 10) + (*f - '0');
2693 f++;
2694 }
2695 }
2696 if (*f == '%') {
2697 /* "%.3%s" => f points to "3" */
2698 f--;
2699 }
2700 }
2701 if (*f == '\0') {
2702 /* bogus format "%.123" => go backward, f points to "3" */
2703 f--;
2704 }
2705
2706 /* Handle %ld, %lu, %lld and %llu. */
2707 longflag = 0;
2708 longlongflag = 0;
2709 size_tflag = 0;
2710 if (*f == 'l') {
2711 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2712 longflag = 1;
2713 ++f;
2714 }
2715 else if (f[1] == 'l' &&
2716 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2717 longlongflag = 1;
2718 f += 2;
2719 }
2720 }
2721 /* handle the size_t flag. */
2722 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2723 size_tflag = 1;
2724 ++f;
2725 }
2726
2727 if (f[1] == '\0')
2728 writer->overallocate = 0;
2729
2730 switch (*f) {
2731 case 'c':
2732 {
2733 int ordinal = va_arg(*vargs, int);
2734 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2735 PyErr_SetString(PyExc_OverflowError,
2736 "character argument not in range(0x110000)");
2737 return NULL;
2738 }
2739 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2740 return NULL;
2741 break;
2742 }
2743
2744 case 'i':
2745 case 'd':
2746 case 'u':
2747 case 'x':
2748 {
2749 /* used by sprintf */
2750 char buffer[MAX_LONG_LONG_CHARS];
2751 Py_ssize_t arglen;
2752
2753 if (*f == 'u') {
2754 if (longflag)
2755 len = sprintf(buffer, "%lu",
2756 va_arg(*vargs, unsigned long));
2757 else if (longlongflag)
2758 len = sprintf(buffer, "%llu",
2759 va_arg(*vargs, unsigned long long));
2760 else if (size_tflag)
2761 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2762 va_arg(*vargs, size_t));
2763 else
2764 len = sprintf(buffer, "%u",
2765 va_arg(*vargs, unsigned int));
2766 }
2767 else if (*f == 'x') {
2768 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2769 }
2770 else {
2771 if (longflag)
2772 len = sprintf(buffer, "%li",
2773 va_arg(*vargs, long));
2774 else if (longlongflag)
2775 len = sprintf(buffer, "%lli",
2776 va_arg(*vargs, long long));
2777 else if (size_tflag)
2778 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2779 va_arg(*vargs, Py_ssize_t));
2780 else
2781 len = sprintf(buffer, "%i",
2782 va_arg(*vargs, int));
2783 }
2784 assert(len >= 0);
2785
2786 if (precision < len)
2787 precision = len;
2788
2789 arglen = Py_MAX(precision, width);
2790 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2791 return NULL;
2792
2793 if (width > precision) {
2794 Py_UCS4 fillchar;
2795 fill = width - precision;
2796 fillchar = zeropad?'0':' ';
2797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2798 return NULL;
2799 writer->pos += fill;
2800 }
2801 if (precision > len) {
2802 fill = precision - len;
2803 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2804 return NULL;
2805 writer->pos += fill;
2806 }
2807
2808 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2809 return NULL;
2810 break;
2811 }
2812
2813 case 'p':
2814 {
2815 char number[MAX_LONG_LONG_CHARS];
2816
2817 len = sprintf(number, "%p", va_arg(*vargs, void*));
2818 assert(len >= 0);
2819
2820 /* %p is ill-defined: ensure leading 0x. */
2821 if (number[1] == 'X')
2822 number[1] = 'x';
2823 else if (number[1] != 'x') {
2824 memmove(number + 2, number,
2825 strlen(number) + 1);
2826 number[0] = '0';
2827 number[1] = 'x';
2828 len += 2;
2829 }
2830
2831 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2832 return NULL;
2833 break;
2834 }
2835
2836 case 's':
2837 {
2838 /* UTF-8 */
2839 const char *s = va_arg(*vargs, const char*);
2840 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2841 return NULL;
2842 break;
2843 }
2844
2845 case 'U':
2846 {
2847 PyObject *obj = va_arg(*vargs, PyObject *);
2848 assert(obj && _PyUnicode_CHECK(obj));
2849
2850 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2851 return NULL;
2852 break;
2853 }
2854
2855 case 'V':
2856 {
2857 PyObject *obj = va_arg(*vargs, PyObject *);
2858 const char *str = va_arg(*vargs, const char *);
2859 if (obj) {
2860 assert(_PyUnicode_CHECK(obj));
2861 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2862 return NULL;
2863 }
2864 else {
2865 assert(str != NULL);
2866 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2867 return NULL;
2868 }
2869 break;
2870 }
2871
2872 case 'S':
2873 {
2874 PyObject *obj = va_arg(*vargs, PyObject *);
2875 PyObject *str;
2876 assert(obj);
2877 str = PyObject_Str(obj);
2878 if (!str)
2879 return NULL;
2880 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2881 Py_DECREF(str);
2882 return NULL;
2883 }
2884 Py_DECREF(str);
2885 break;
2886 }
2887
2888 case 'R':
2889 {
2890 PyObject *obj = va_arg(*vargs, PyObject *);
2891 PyObject *repr;
2892 assert(obj);
2893 repr = PyObject_Repr(obj);
2894 if (!repr)
2895 return NULL;
2896 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2897 Py_DECREF(repr);
2898 return NULL;
2899 }
2900 Py_DECREF(repr);
2901 break;
2902 }
2903
2904 case 'A':
2905 {
2906 PyObject *obj = va_arg(*vargs, PyObject *);
2907 PyObject *ascii;
2908 assert(obj);
2909 ascii = PyObject_ASCII(obj);
2910 if (!ascii)
2911 return NULL;
2912 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2913 Py_DECREF(ascii);
2914 return NULL;
2915 }
2916 Py_DECREF(ascii);
2917 break;
2918 }
2919
2920 case '%':
2921 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2922 return NULL;
2923 break;
2924
2925 default:
2926 /* if we stumble upon an unknown formatting code, copy the rest
2927 of the format string to the output string. (we cannot just
2928 skip the code, since there's no way to know what's in the
2929 argument list) */
2930 len = strlen(p);
2931 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2932 return NULL;
2933 f = p+len;
2934 return f;
2935 }
2936
2937 f++;
2938 return f;
2939 }
2940
2941 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2942 PyUnicode_FromFormatV(const char *format, va_list vargs)
2943 {
2944 va_list vargs2;
2945 const char *f;
2946 _PyUnicodeWriter writer;
2947
2948 _PyUnicodeWriter_Init(&writer);
2949 writer.min_length = strlen(format) + 100;
2950 writer.overallocate = 1;
2951
2952 // Copy varags to be able to pass a reference to a subfunction.
2953 va_copy(vargs2, vargs);
2954
2955 for (f = format; *f; ) {
2956 if (*f == '%') {
2957 f = unicode_fromformat_arg(&writer, f, &vargs2);
2958 if (f == NULL)
2959 goto fail;
2960 }
2961 else {
2962 const char *p;
2963 Py_ssize_t len;
2964
2965 p = f;
2966 do
2967 {
2968 if ((unsigned char)*p > 127) {
2969 PyErr_Format(PyExc_ValueError,
2970 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2971 "string, got a non-ASCII byte: 0x%02x",
2972 (unsigned char)*p);
2973 goto fail;
2974 }
2975 p++;
2976 }
2977 while (*p != '\0' && *p != '%');
2978 len = p - f;
2979
2980 if (*p == '\0')
2981 writer.overallocate = 0;
2982
2983 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2984 goto fail;
2985
2986 f = p;
2987 }
2988 }
2989 va_end(vargs2);
2990 return _PyUnicodeWriter_Finish(&writer);
2991
2992 fail:
2993 va_end(vargs2);
2994 _PyUnicodeWriter_Dealloc(&writer);
2995 return NULL;
2996 }
2997
2998 PyObject *
PyUnicode_FromFormat(const char * format,...)2999 PyUnicode_FromFormat(const char *format, ...)
3000 {
3001 PyObject* ret;
3002 va_list vargs;
3003
3004 #ifdef HAVE_STDARG_PROTOTYPES
3005 va_start(vargs, format);
3006 #else
3007 va_start(vargs);
3008 #endif
3009 ret = PyUnicode_FromFormatV(format, vargs);
3010 va_end(vargs);
3011 return ret;
3012 }
3013
3014 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3015 unicode_get_widechar_size(PyObject *unicode)
3016 {
3017 Py_ssize_t res;
3018
3019 assert(unicode != NULL);
3020 assert(_PyUnicode_CHECK(unicode));
3021
3022 if (_PyUnicode_WSTR(unicode) != NULL) {
3023 return PyUnicode_WSTR_LENGTH(unicode);
3024 }
3025 assert(PyUnicode_IS_READY(unicode));
3026
3027 res = _PyUnicode_LENGTH(unicode);
3028 #if SIZEOF_WCHAR_T == 2
3029 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3030 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3031 const Py_UCS4 *end = s + res;
3032 for (; s < end; ++s) {
3033 if (*s > 0xFFFF) {
3034 ++res;
3035 }
3036 }
3037 }
3038 #endif
3039 return res;
3040 }
3041
3042 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3043 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3044 {
3045 const wchar_t *wstr;
3046
3047 assert(unicode != NULL);
3048 assert(_PyUnicode_CHECK(unicode));
3049
3050 wstr = _PyUnicode_WSTR(unicode);
3051 if (wstr != NULL) {
3052 memcpy(w, wstr, size * sizeof(wchar_t));
3053 return;
3054 }
3055 assert(PyUnicode_IS_READY(unicode));
3056
3057 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3058 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3059 for (; size--; ++s, ++w) {
3060 *w = *s;
3061 }
3062 }
3063 else {
3064 #if SIZEOF_WCHAR_T == 4
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3066 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 *w = *s;
3069 }
3070 #else
3071 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3072 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3073 for (; size--; ++s, ++w) {
3074 Py_UCS4 ch = *s;
3075 if (ch > 0xFFFF) {
3076 assert(ch <= MAX_UNICODE);
3077 /* encode surrogate pair in this case */
3078 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3079 if (!size--)
3080 break;
3081 *w = Py_UNICODE_LOW_SURROGATE(ch);
3082 }
3083 else {
3084 *w = ch;
3085 }
3086 }
3087 #endif
3088 }
3089 }
3090
3091 #ifdef HAVE_WCHAR_H
3092
3093 /* Convert a Unicode object to a wide character string.
3094
3095 - If w is NULL: return the number of wide characters (including the null
3096 character) required to convert the unicode object. Ignore size argument.
3097
3098 - Otherwise: return the number of wide characters (excluding the null
3099 character) written into w. Write at most size wide characters (including
3100 the null character). */
3101 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3102 PyUnicode_AsWideChar(PyObject *unicode,
3103 wchar_t *w,
3104 Py_ssize_t size)
3105 {
3106 Py_ssize_t res;
3107
3108 if (unicode == NULL) {
3109 PyErr_BadInternalCall();
3110 return -1;
3111 }
3112 if (!PyUnicode_Check(unicode)) {
3113 PyErr_BadArgument();
3114 return -1;
3115 }
3116
3117 res = unicode_get_widechar_size(unicode);
3118 if (w == NULL) {
3119 return res + 1;
3120 }
3121
3122 if (size > res) {
3123 size = res + 1;
3124 }
3125 else {
3126 res = size;
3127 }
3128 unicode_copy_as_widechar(unicode, w, size);
3129 return res;
3130 }
3131
3132 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3133 PyUnicode_AsWideCharString(PyObject *unicode,
3134 Py_ssize_t *size)
3135 {
3136 wchar_t *buffer;
3137 Py_ssize_t buflen;
3138
3139 if (unicode == NULL) {
3140 PyErr_BadInternalCall();
3141 return NULL;
3142 }
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 return NULL;
3146 }
3147
3148 buflen = unicode_get_widechar_size(unicode);
3149 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3150 if (buffer == NULL) {
3151 PyErr_NoMemory();
3152 return NULL;
3153 }
3154 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3155 if (size != NULL) {
3156 *size = buflen;
3157 }
3158 else if (wcslen(buffer) != (size_t)buflen) {
3159 PyMem_FREE(buffer);
3160 PyErr_SetString(PyExc_ValueError,
3161 "embedded null character");
3162 return NULL;
3163 }
3164 return buffer;
3165 }
3166
3167 #endif /* HAVE_WCHAR_H */
3168
3169 PyObject *
PyUnicode_FromOrdinal(int ordinal)3170 PyUnicode_FromOrdinal(int ordinal)
3171 {
3172 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3173 PyErr_SetString(PyExc_ValueError,
3174 "chr() arg not in range(0x110000)");
3175 return NULL;
3176 }
3177
3178 return unicode_char((Py_UCS4)ordinal);
3179 }
3180
3181 PyObject *
PyUnicode_FromObject(PyObject * obj)3182 PyUnicode_FromObject(PyObject *obj)
3183 {
3184 /* XXX Perhaps we should make this API an alias of
3185 PyObject_Str() instead ?! */
3186 if (PyUnicode_CheckExact(obj)) {
3187 if (PyUnicode_READY(obj) == -1)
3188 return NULL;
3189 Py_INCREF(obj);
3190 return obj;
3191 }
3192 if (PyUnicode_Check(obj)) {
3193 /* For a Unicode subtype that's not a Unicode object,
3194 return a true Unicode object with the same data. */
3195 return _PyUnicode_Copy(obj);
3196 }
3197 PyErr_Format(PyExc_TypeError,
3198 "Can't convert '%.100s' object to str implicitly",
3199 Py_TYPE(obj)->tp_name);
3200 return NULL;
3201 }
3202
3203 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3204 PyUnicode_FromEncodedObject(PyObject *obj,
3205 const char *encoding,
3206 const char *errors)
3207 {
3208 Py_buffer buffer;
3209 PyObject *v;
3210
3211 if (obj == NULL) {
3212 PyErr_BadInternalCall();
3213 return NULL;
3214 }
3215
3216 /* Decoding bytes objects is the most common case and should be fast */
3217 if (PyBytes_Check(obj)) {
3218 if (PyBytes_GET_SIZE(obj) == 0)
3219 _Py_RETURN_UNICODE_EMPTY();
3220 v = PyUnicode_Decode(
3221 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3222 encoding, errors);
3223 return v;
3224 }
3225
3226 if (PyUnicode_Check(obj)) {
3227 PyErr_SetString(PyExc_TypeError,
3228 "decoding str is not supported");
3229 return NULL;
3230 }
3231
3232 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3233 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3234 PyErr_Format(PyExc_TypeError,
3235 "decoding to str: need a bytes-like object, %.80s found",
3236 Py_TYPE(obj)->tp_name);
3237 return NULL;
3238 }
3239
3240 if (buffer.len == 0) {
3241 PyBuffer_Release(&buffer);
3242 _Py_RETURN_UNICODE_EMPTY();
3243 }
3244
3245 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3246 PyBuffer_Release(&buffer);
3247 return v;
3248 }
3249
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters becomes a single '_' when it sits between two copied
   characters, and is dropped at the start or end of the name.

   The result is written to `lower` (capacity `lower_len`, including the
   terminating null byte).  Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    char *const out_limit = &lower[lower_len - 1];  /* slot kept for '\0' */
    int pending_sep = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3298
3299 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3300 PyUnicode_Decode(const char *s,
3301 Py_ssize_t size,
3302 const char *encoding,
3303 const char *errors)
3304 {
3305 PyObject *buffer = NULL, *unicode;
3306 Py_buffer info;
3307 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3308
3309 if (encoding == NULL) {
3310 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3311 }
3312
3313 /* Shortcuts for common default encodings */
3314 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3315 char *lower = buflower;
3316
3317 /* Fast paths */
3318 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3319 lower += 3;
3320 if (*lower == '_') {
3321 /* Match "utf8" and "utf_8" */
3322 lower++;
3323 }
3324
3325 if (lower[0] == '8' && lower[1] == 0) {
3326 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3327 }
3328 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3329 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3330 }
3331 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3332 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3333 }
3334 }
3335 else {
3336 if (strcmp(lower, "ascii") == 0
3337 || strcmp(lower, "us_ascii") == 0) {
3338 return PyUnicode_DecodeASCII(s, size, errors);
3339 }
3340 #ifdef MS_WINDOWS
3341 else if (strcmp(lower, "mbcs") == 0) {
3342 return PyUnicode_DecodeMBCS(s, size, errors);
3343 }
3344 #endif
3345 else if (strcmp(lower, "latin1") == 0
3346 || strcmp(lower, "latin_1") == 0
3347 || strcmp(lower, "iso_8859_1") == 0
3348 || strcmp(lower, "iso8859_1") == 0) {
3349 return PyUnicode_DecodeLatin1(s, size, errors);
3350 }
3351 }
3352 }
3353
3354 /* Decode via the codec registry */
3355 buffer = NULL;
3356 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3357 goto onError;
3358 buffer = PyMemoryView_FromBuffer(&info);
3359 if (buffer == NULL)
3360 goto onError;
3361 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3362 if (unicode == NULL)
3363 goto onError;
3364 if (!PyUnicode_Check(unicode)) {
3365 PyErr_Format(PyExc_TypeError,
3366 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3367 "use codecs.decode() to decode to arbitrary types",
3368 encoding,
3369 Py_TYPE(unicode)->tp_name);
3370 Py_DECREF(unicode);
3371 goto onError;
3372 }
3373 Py_DECREF(buffer);
3374 return unicode_result(unicode);
3375
3376 onError:
3377 Py_XDECREF(buffer);
3378 return NULL;
3379 }
3380
3381 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3382 PyUnicode_AsDecodedObject(PyObject *unicode,
3383 const char *encoding,
3384 const char *errors)
3385 {
3386 if (!PyUnicode_Check(unicode)) {
3387 PyErr_BadArgument();
3388 return NULL;
3389 }
3390
3391 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3392 "PyUnicode_AsDecodedObject() is deprecated; "
3393 "use PyCodec_Decode() to decode from str", 1) < 0)
3394 return NULL;
3395
3396 if (encoding == NULL)
3397 encoding = PyUnicode_GetDefaultEncoding();
3398
3399 /* Decode via the codec registry */
3400 return PyCodec_Decode(unicode, encoding, errors);
3401 }
3402
3403 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3404 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3405 const char *encoding,
3406 const char *errors)
3407 {
3408 PyObject *v;
3409
3410 if (!PyUnicode_Check(unicode)) {
3411 PyErr_BadArgument();
3412 goto onError;
3413 }
3414
3415 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3416 "PyUnicode_AsDecodedUnicode() is deprecated; "
3417 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3418 return NULL;
3419
3420 if (encoding == NULL)
3421 encoding = PyUnicode_GetDefaultEncoding();
3422
3423 /* Decode via the codec registry */
3424 v = PyCodec_Decode(unicode, encoding, errors);
3425 if (v == NULL)
3426 goto onError;
3427 if (!PyUnicode_Check(v)) {
3428 PyErr_Format(PyExc_TypeError,
3429 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3430 "use codecs.decode() to decode to arbitrary types",
3431 encoding,
3432 Py_TYPE(unicode)->tp_name);
3433 Py_DECREF(v);
3434 goto onError;
3435 }
3436 return unicode_result(v);
3437
3438 onError:
3439 return NULL;
3440 }
3441
3442 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3443 PyUnicode_Encode(const Py_UNICODE *s,
3444 Py_ssize_t size,
3445 const char *encoding,
3446 const char *errors)
3447 {
3448 PyObject *v, *unicode;
3449
3450 unicode = PyUnicode_FromWideChar(s, size);
3451 if (unicode == NULL)
3452 return NULL;
3453 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3454 Py_DECREF(unicode);
3455 return v;
3456 }
3457
3458 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3459 PyUnicode_AsEncodedObject(PyObject *unicode,
3460 const char *encoding,
3461 const char *errors)
3462 {
3463 PyObject *v;
3464
3465 if (!PyUnicode_Check(unicode)) {
3466 PyErr_BadArgument();
3467 goto onError;
3468 }
3469
3470 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3471 "PyUnicode_AsEncodedObject() is deprecated; "
3472 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3473 "or PyCodec_Encode() for generic encoding", 1) < 0)
3474 return NULL;
3475
3476 if (encoding == NULL)
3477 encoding = PyUnicode_GetDefaultEncoding();
3478
3479 /* Encode via the codec registry */
3480 v = PyCodec_Encode(unicode, encoding, errors);
3481 if (v == NULL)
3482 goto onError;
3483 return v;
3484
3485 onError:
3486 return NULL;
3487 }
3488
3489
3490 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3491 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3492 int current_locale)
3493 {
3494 Py_ssize_t wlen;
3495 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3496 if (wstr == NULL) {
3497 return NULL;
3498 }
3499
3500 if ((size_t)wlen != wcslen(wstr)) {
3501 PyErr_SetString(PyExc_ValueError, "embedded null character");
3502 PyMem_Free(wstr);
3503 return NULL;
3504 }
3505
3506 char *str;
3507 size_t error_pos;
3508 const char *reason;
3509 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3510 current_locale, error_handler);
3511 PyMem_Free(wstr);
3512
3513 if (res != 0) {
3514 if (res == -2) {
3515 PyObject *exc;
3516 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3517 "locale", unicode,
3518 (Py_ssize_t)error_pos,
3519 (Py_ssize_t)(error_pos+1),
3520 reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
3523 Py_DECREF(exc);
3524 }
3525 }
3526 else if (res == -3) {
3527 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3528 }
3529 else {
3530 PyErr_NoMemory();
3531 }
3532 return NULL;
3533 }
3534
3535 PyObject *bytes = PyBytes_FromString(str);
3536 PyMem_RawFree(str);
3537 return bytes;
3538 }
3539
3540 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3541 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3542 {
3543 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3544 return unicode_encode_locale(unicode, error_handler, 1);
3545 }
3546
3547 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3548 PyUnicode_EncodeFSDefault(PyObject *unicode)
3549 {
3550 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3551 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3552 if (interp->fs_codec.encoding) {
3553 return unicode_encode_utf8(unicode,
3554 interp->fs_codec.error_handler,
3555 interp->fs_codec.errors);
3556 }
3557 else {
3558 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3559 _Py_error_handler errors;
3560 errors = get_error_handler_wide(filesystem_errors);
3561 assert(errors != _Py_ERROR_UNKNOWN);
3562 return unicode_encode_utf8(unicode, errors, NULL);
3563 }
3564 #else
3565 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3566 cannot use it to encode and decode filenames before it is loaded. Load
3567 the Python codec requires to encode at least its own filename. Use the C
3568 implementation of the locale codec until the codec registry is
3569 initialized and the Python codec is loaded. See initfsencoding(). */
3570 if (interp->fs_codec.encoding) {
3571 return PyUnicode_AsEncodedString(unicode,
3572 interp->fs_codec.encoding,
3573 interp->fs_codec.errors);
3574 }
3575 else {
3576 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3577 _Py_error_handler errors;
3578 errors = get_error_handler_wide(filesystem_errors);
3579 assert(errors != _Py_ERROR_UNKNOWN);
3580 return unicode_encode_locale(unicode, errors, 0);
3581 }
3582 #endif
3583 }
3584
3585 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3586 PyUnicode_AsEncodedString(PyObject *unicode,
3587 const char *encoding,
3588 const char *errors)
3589 {
3590 PyObject *v;
3591 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3592
3593 if (!PyUnicode_Check(unicode)) {
3594 PyErr_BadArgument();
3595 return NULL;
3596 }
3597
3598 if (encoding == NULL) {
3599 return _PyUnicode_AsUTF8String(unicode, errors);
3600 }
3601
3602 /* Shortcuts for common default encodings */
3603 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3604 char *lower = buflower;
3605
3606 /* Fast paths */
3607 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3608 lower += 3;
3609 if (*lower == '_') {
3610 /* Match "utf8" and "utf_8" */
3611 lower++;
3612 }
3613
3614 if (lower[0] == '8' && lower[1] == 0) {
3615 return _PyUnicode_AsUTF8String(unicode, errors);
3616 }
3617 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3618 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3619 }
3620 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3621 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3622 }
3623 }
3624 else {
3625 if (strcmp(lower, "ascii") == 0
3626 || strcmp(lower, "us_ascii") == 0) {
3627 return _PyUnicode_AsASCIIString(unicode, errors);
3628 }
3629 #ifdef MS_WINDOWS
3630 else if (strcmp(lower, "mbcs") == 0) {
3631 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3632 }
3633 #endif
3634 else if (strcmp(lower, "latin1") == 0 ||
3635 strcmp(lower, "latin_1") == 0 ||
3636 strcmp(lower, "iso_8859_1") == 0 ||
3637 strcmp(lower, "iso8859_1") == 0) {
3638 return _PyUnicode_AsLatin1String(unicode, errors);
3639 }
3640 }
3641 }
3642
3643 /* Encode via the codec registry */
3644 v = _PyCodec_EncodeText(unicode, encoding, errors);
3645 if (v == NULL)
3646 return NULL;
3647
3648 /* The normal path */
3649 if (PyBytes_Check(v))
3650 return v;
3651
3652 /* If the codec returns a buffer, raise a warning and convert to bytes */
3653 if (PyByteArray_Check(v)) {
3654 int error;
3655 PyObject *b;
3656
3657 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3658 "encoder %s returned bytearray instead of bytes; "
3659 "use codecs.encode() to encode to arbitrary types",
3660 encoding);
3661 if (error) {
3662 Py_DECREF(v);
3663 return NULL;
3664 }
3665
3666 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3667 PyByteArray_GET_SIZE(v));
3668 Py_DECREF(v);
3669 return b;
3670 }
3671
3672 PyErr_Format(PyExc_TypeError,
3673 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3674 "use codecs.encode() to encode to arbitrary types",
3675 encoding,
3676 Py_TYPE(v)->tp_name);
3677 Py_DECREF(v);
3678 return NULL;
3679 }
3680
3681 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3682 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3683 const char *encoding,
3684 const char *errors)
3685 {
3686 PyObject *v;
3687
3688 if (!PyUnicode_Check(unicode)) {
3689 PyErr_BadArgument();
3690 goto onError;
3691 }
3692
3693 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3694 "PyUnicode_AsEncodedUnicode() is deprecated; "
3695 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3696 return NULL;
3697
3698 if (encoding == NULL)
3699 encoding = PyUnicode_GetDefaultEncoding();
3700
3701 /* Encode via the codec registry */
3702 v = PyCodec_Encode(unicode, encoding, errors);
3703 if (v == NULL)
3704 goto onError;
3705 if (!PyUnicode_Check(v)) {
3706 PyErr_Format(PyExc_TypeError,
3707 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3708 "use codecs.encode() to encode to arbitrary types",
3709 encoding,
3710 Py_TYPE(v)->tp_name);
3711 Py_DECREF(v);
3712 goto onError;
3713 }
3714 return v;
3715
3716 onError:
3717 return NULL;
3718 }
3719
3720 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3721 unicode_decode_locale(const char *str, Py_ssize_t len,
3722 _Py_error_handler errors, int current_locale)
3723 {
3724 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3725 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3726 return NULL;
3727 }
3728
3729 wchar_t *wstr;
3730 size_t wlen;
3731 const char *reason;
3732 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3733 current_locale, errors);
3734 if (res != 0) {
3735 if (res == -2) {
3736 PyObject *exc;
3737 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3738 "locale", str, len,
3739 (Py_ssize_t)wlen,
3740 (Py_ssize_t)(wlen + 1),
3741 reason);
3742 if (exc != NULL) {
3743 PyCodec_StrictErrors(exc);
3744 Py_DECREF(exc);
3745 }
3746 }
3747 else if (res == -3) {
3748 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3749 }
3750 else {
3751 PyErr_NoMemory();
3752 }
3753 return NULL;
3754 }
3755
3756 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3757 PyMem_RawFree(wstr);
3758 return unicode;
3759 }
3760
3761 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3762 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3763 const char *errors)
3764 {
3765 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3766 return unicode_decode_locale(str, len, error_handler, 1);
3767 }
3768
3769 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3770 PyUnicode_DecodeLocale(const char *str, const char *errors)
3771 {
3772 Py_ssize_t size = (Py_ssize_t)strlen(str);
3773 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3774 return unicode_decode_locale(str, size, error_handler, 1);
3775 }
3776
3777
3778 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3779 PyUnicode_DecodeFSDefault(const char *s) {
3780 Py_ssize_t size = (Py_ssize_t)strlen(s);
3781 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3782 }
3783
3784 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3785 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3786 {
3787 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3788 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3789 if (interp->fs_codec.encoding) {
3790 return unicode_decode_utf8(s, size,
3791 interp->fs_codec.error_handler,
3792 interp->fs_codec.errors,
3793 NULL);
3794 }
3795 else {
3796 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3797 _Py_error_handler errors;
3798 errors = get_error_handler_wide(filesystem_errors);
3799 assert(errors != _Py_ERROR_UNKNOWN);
3800 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3801 }
3802 #else
3803 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3804 cannot use it to encode and decode filenames before it is loaded. Load
3805 the Python codec requires to encode at least its own filename. Use the C
3806 implementation of the locale codec until the codec registry is
3807 initialized and the Python codec is loaded. See initfsencoding(). */
3808 if (interp->fs_codec.encoding) {
3809 return PyUnicode_Decode(s, size,
3810 interp->fs_codec.encoding,
3811 interp->fs_codec.errors);
3812 }
3813 else {
3814 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3815 _Py_error_handler errors;
3816 errors = get_error_handler_wide(filesystem_errors);
3817 return unicode_decode_locale(s, size, errors, 0);
3818 }
3819 #endif
3820 }
3821
3822
3823 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3824 PyUnicode_FSConverter(PyObject* arg, void* addr)
3825 {
3826 PyObject *path = NULL;
3827 PyObject *output = NULL;
3828 Py_ssize_t size;
3829 void *data;
3830 if (arg == NULL) {
3831 Py_DECREF(*(PyObject**)addr);
3832 *(PyObject**)addr = NULL;
3833 return 1;
3834 }
3835 path = PyOS_FSPath(arg);
3836 if (path == NULL) {
3837 return 0;
3838 }
3839 if (PyBytes_Check(path)) {
3840 output = path;
3841 }
3842 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3843 output = PyUnicode_EncodeFSDefault(path);
3844 Py_DECREF(path);
3845 if (!output) {
3846 return 0;
3847 }
3848 assert(PyBytes_Check(output));
3849 }
3850
3851 size = PyBytes_GET_SIZE(output);
3852 data = PyBytes_AS_STRING(output);
3853 if ((size_t)size != strlen(data)) {
3854 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3855 Py_DECREF(output);
3856 return 0;
3857 }
3858 *(PyObject**)addr = output;
3859 return Py_CLEANUP_SUPPORTED;
3860 }
3861
3862
3863 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3864 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3865 {
3866 int is_buffer = 0;
3867 PyObject *path = NULL;
3868 PyObject *output = NULL;
3869 if (arg == NULL) {
3870 Py_DECREF(*(PyObject**)addr);
3871 *(PyObject**)addr = NULL;
3872 return 1;
3873 }
3874
3875 is_buffer = PyObject_CheckBuffer(arg);
3876 if (!is_buffer) {
3877 path = PyOS_FSPath(arg);
3878 if (path == NULL) {
3879 return 0;
3880 }
3881 }
3882 else {
3883 path = arg;
3884 Py_INCREF(arg);
3885 }
3886
3887 if (PyUnicode_Check(path)) {
3888 output = path;
3889 }
3890 else if (PyBytes_Check(path) || is_buffer) {
3891 PyObject *path_bytes = NULL;
3892
3893 if (!PyBytes_Check(path) &&
3894 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3895 "path should be string, bytes, or os.PathLike, not %.200s",
3896 Py_TYPE(arg)->tp_name)) {
3897 Py_DECREF(path);
3898 return 0;
3899 }
3900 path_bytes = PyBytes_FromObject(path);
3901 Py_DECREF(path);
3902 if (!path_bytes) {
3903 return 0;
3904 }
3905 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3906 PyBytes_GET_SIZE(path_bytes));
3907 Py_DECREF(path_bytes);
3908 if (!output) {
3909 return 0;
3910 }
3911 }
3912 else {
3913 PyErr_Format(PyExc_TypeError,
3914 "path should be string, bytes, or os.PathLike, not %.200s",
3915 Py_TYPE(arg)->tp_name);
3916 Py_DECREF(path);
3917 return 0;
3918 }
3919 if (PyUnicode_READY(output) == -1) {
3920 Py_DECREF(output);
3921 return 0;
3922 }
3923 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3924 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3925 PyErr_SetString(PyExc_ValueError, "embedded null character");
3926 Py_DECREF(output);
3927 return 0;
3928 }
3929 *(PyObject**)addr = output;
3930 return Py_CLEANUP_SUPPORTED;
3931 }
3932
3933
3934 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3935 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3936 {
3937 PyObject *bytes;
3938
3939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 return NULL;
3942 }
3943 if (PyUnicode_READY(unicode) == -1)
3944 return NULL;
3945
3946 if (PyUnicode_UTF8(unicode) == NULL) {
3947 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3948 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3949 if (bytes == NULL)
3950 return NULL;
3951 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3952 if (_PyUnicode_UTF8(unicode) == NULL) {
3953 PyErr_NoMemory();
3954 Py_DECREF(bytes);
3955 return NULL;
3956 }
3957 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3958 memcpy(_PyUnicode_UTF8(unicode),
3959 PyBytes_AS_STRING(bytes),
3960 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3961 Py_DECREF(bytes);
3962 }
3963
3964 if (psize)
3965 *psize = PyUnicode_UTF8_LENGTH(unicode);
3966 return PyUnicode_UTF8(unicode);
3967 }
3968
3969 const char *
PyUnicode_AsUTF8(PyObject * unicode)3970 PyUnicode_AsUTF8(PyObject *unicode)
3971 {
3972 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3973 }
3974
3975 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)3976 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3977 {
3978 if (!PyUnicode_Check(unicode)) {
3979 PyErr_BadArgument();
3980 return NULL;
3981 }
3982 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3983 if (w == NULL) {
3984 /* Non-ASCII compact unicode object */
3985 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
3986 assert(PyUnicode_IS_READY(unicode));
3987
3988 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3989 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3990 PyErr_NoMemory();
3991 return NULL;
3992 }
3993 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3994 if (w == NULL) {
3995 PyErr_NoMemory();
3996 return NULL;
3997 }
3998 unicode_copy_as_widechar(unicode, w, wlen + 1);
3999 _PyUnicode_WSTR(unicode) = w;
4000 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4001 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4002 }
4003 }
4004 if (size != NULL)
4005 *size = PyUnicode_WSTR_LENGTH(unicode);
4006 return w;
4007 }
4008
4009 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4010 PyUnicode_AsUnicode(PyObject *unicode)
4011 {
4012 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4013 }
4014
4015 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4016 _PyUnicode_AsUnicode(PyObject *unicode)
4017 {
4018 Py_ssize_t size;
4019 const Py_UNICODE *wstr;
4020
4021 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4022 if (wstr && wcslen(wstr) != (size_t)size) {
4023 PyErr_SetString(PyExc_ValueError, "embedded null character");
4024 return NULL;
4025 }
4026 return wstr;
4027 }
4028
4029
4030 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4031 PyUnicode_GetSize(PyObject *unicode)
4032 {
4033 if (!PyUnicode_Check(unicode)) {
4034 PyErr_BadArgument();
4035 goto onError;
4036 }
4037 if (_PyUnicode_WSTR(unicode) == NULL) {
4038 if (PyUnicode_AsUnicode(unicode) == NULL)
4039 goto onError;
4040 }
4041 return PyUnicode_WSTR_LENGTH(unicode);
4042
4043 onError:
4044 return -1;
4045 }
4046
4047 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4048 PyUnicode_GetLength(PyObject *unicode)
4049 {
4050 if (!PyUnicode_Check(unicode)) {
4051 PyErr_BadArgument();
4052 return -1;
4053 }
4054 if (PyUnicode_READY(unicode) == -1)
4055 return -1;
4056 return PyUnicode_GET_LENGTH(unicode);
4057 }
4058
4059 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4060 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4061 {
4062 void *data;
4063 int kind;
4064
4065 if (!PyUnicode_Check(unicode)) {
4066 PyErr_BadArgument();
4067 return (Py_UCS4)-1;
4068 }
4069 if (PyUnicode_READY(unicode) == -1) {
4070 return (Py_UCS4)-1;
4071 }
4072 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4073 PyErr_SetString(PyExc_IndexError, "string index out of range");
4074 return (Py_UCS4)-1;
4075 }
4076 data = PyUnicode_DATA(unicode);
4077 kind = PyUnicode_KIND(unicode);
4078 return PyUnicode_READ(kind, data, index);
4079 }
4080
4081 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4082 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4083 {
4084 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4085 PyErr_BadArgument();
4086 return -1;
4087 }
4088 assert(PyUnicode_IS_READY(unicode));
4089 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4090 PyErr_SetString(PyExc_IndexError, "string index out of range");
4091 return -1;
4092 }
4093 if (unicode_check_modifiable(unicode))
4094 return -1;
4095 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4096 PyErr_SetString(PyExc_ValueError, "character out of range");
4097 return -1;
4098 }
4099 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4100 index, ch);
4101 return 0;
4102 }
4103
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4109
4110 /* create or adjust a UnicodeDecodeError */
4111 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4112 make_decode_exception(PyObject **exceptionObject,
4113 const char *encoding,
4114 const char *input, Py_ssize_t length,
4115 Py_ssize_t startpos, Py_ssize_t endpos,
4116 const char *reason)
4117 {
4118 if (*exceptionObject == NULL) {
4119 *exceptionObject = PyUnicodeDecodeError_Create(
4120 encoding, input, length, startpos, endpos, reason);
4121 }
4122 else {
4123 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4124 goto onError;
4125 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4126 goto onError;
4127 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4128 goto onError;
4129 }
4130 return;
4131
4132 onError:
4133 Py_CLEAR(*exceptionObject);
4134 }
4135
4136 #ifdef MS_WINDOWS
4137 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4138 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4139 {
4140 if (newsize > *size) {
4141 wchar_t *newbuf = *buf;
4142 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4143 PyErr_NoMemory();
4144 return -1;
4145 }
4146 *buf = newbuf;
4147 }
4148 *size = newsize;
4149 return 0;
4150 }
4151
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   In/out state, all updated only through the pointers passed in:
   - *errorHandler: looked up from "errors" on first use and cached.
   - *exceptionObject: created or updated by make_decode_exception().
   - *input, *inend: re-pointed at the bytes object held by the exception
     after the callback returns.
   - *endinpos, *inptr: set to the resume position chosen by the callback.
   - *buf, *bufsize, *outpos: wchar_t output buffer, grown when needed,
     and the write position, advanced past the replacement.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* The text after "Un;" doubles as the error message used below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    /* Look the handler up lazily; later calls reuse the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked against PY_SSIZE_T_MAX before it is made. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least double the current size when that is possible
           without overflowing. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    wcsncpy(*buf + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the shared cleanup below */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4261 #endif /* MS_WINDOWS */
4262
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter: build the exception, call the error handler, check
   its (str, int) result, append the replacement string to the writer and
   update the input state.  Return 0 on success, -1 on error.

   In/out state, all updated only through the pointers passed in:
   - *errorHandler: looked up from "errors" on first use and cached.
   - *exceptionObject: created or updated by make_decode_exception().
   - *input, *inend: re-pointed at the bytes object held by the exception
     after the callback returns.
   - *endinpos, *inptr: set to the resume position chosen by the callback.
   - writer: min_length may be raised and overallocate set before the
     replacement is written. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* The text after "Un;" doubles as the error message used below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up lazily; later calls reuse the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Bytes left after the error, measured on the input as it was before
       *input and *inend are replaced below. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    /* Only a replacement longer than one character raises min_length. */
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then. Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* The caller's position is only updated once the write succeeded. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4362
4363 /* --- UTF-7 Codec -------------------------------------------------------- */
4364
4365 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4366
/* Base-64 helpers for the UTF-7 codec.  All of them are macros that may
   evaluate their argument more than once, so pass side-effect free
   expressions only. */

/* IS_BASE64: non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of base-64 character c.  The caller must
   have checked IS_BASE64(c); anything unrecognised maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4397
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table gives the class of
 * each of the 128 ASCII code points:
 *   0 : "Set D"
 *        alphanumeric and '(),-./:?
 *   1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *        ht nl cr sp
 *   3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         code point to classify (evaluated several times)
 *   directO   non-zero: Set O characters are emitted directly
 *   directWS  non-zero: whitespace characters are emitted directly
 *
 * Only code points 1..127 can ever be direct; the range test also keeps
 * the table index in bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4443
4444 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4445 PyUnicode_DecodeUTF7(const char *s,
4446 Py_ssize_t size,
4447 const char *errors)
4448 {
4449 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4450 }
4451
4452 /* The decoder. The only state we preserve is our read position,
4453 * i.e. how many characters we have consumed. So if we end in the
4454 * middle of a shift sequence we have to back off the read position
4455 * and the output to the beginning of the sequence, otherwise we lose
4456 * all the shift state (seen bits, number of bits seen, high
4457 * surrogate). */
4458
4459 PyObject *
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)4460 PyUnicode_DecodeUTF7Stateful(const char *s,
4461 Py_ssize_t size,
4462 const char *errors,
4463 Py_ssize_t *consumed)
4464 {
4465 const char *starts = s;
4466 Py_ssize_t startinpos;
4467 Py_ssize_t endinpos;
4468 const char *e;
4469 _PyUnicodeWriter writer;
4470 const char *errmsg = "";
4471 int inShift = 0;
4472 Py_ssize_t shiftOutStart;
4473 unsigned int base64bits = 0;
4474 unsigned long base64buffer = 0;
4475 Py_UCS4 surrogate = 0;
4476 PyObject *errorHandler = NULL;
4477 PyObject *exc = NULL;
4478
4479 if (size == 0) {
4480 if (consumed)
4481 *consumed = 0;
4482 _Py_RETURN_UNICODE_EMPTY();
4483 }
4484
4485 /* Start off assuming it's all ASCII. Widen later as necessary. */
4486 _PyUnicodeWriter_Init(&writer);
4487 writer.min_length = size;
4488
4489 shiftOutStart = 0;
4490 e = s + size;
4491
4492 while (s < e) {
4493 Py_UCS4 ch;
4494 restart:
4495 ch = (unsigned char) *s;
4496
4497 if (inShift) { /* in a base-64 section */
4498 if (IS_BASE64(ch)) { /* consume a base-64 character */
4499 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4500 base64bits += 6;
4501 s++;
4502 if (base64bits >= 16) {
4503 /* we have enough bits for a UTF-16 value */
4504 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4505 base64bits -= 16;
4506 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4507 assert(outCh <= 0xffff);
4508 if (surrogate) {
4509 /* expecting a second surrogate */
4510 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4511 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4512 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4513 goto onError;
4514 surrogate = 0;
4515 continue;
4516 }
4517 else {
4518 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4519 goto onError;
4520 surrogate = 0;
4521 }
4522 }
4523 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4524 /* first surrogate */
4525 surrogate = outCh;
4526 }
4527 else {
4528 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4529 goto onError;
4530 }
4531 }
4532 }
4533 else { /* now leaving a base-64 section */
4534 inShift = 0;
4535 if (base64bits > 0) { /* left-over bits */
4536 if (base64bits >= 6) {
4537 /* We've seen at least one base-64 character */
4538 s++;
4539 errmsg = "partial character in shift sequence";
4540 goto utf7Error;
4541 }
4542 else {
4543 /* Some bits remain; they should be zero */
4544 if (base64buffer != 0) {
4545 s++;
4546 errmsg = "non-zero padding bits in shift sequence";
4547 goto utf7Error;
4548 }
4549 }
4550 }
4551 if (surrogate && DECODE_DIRECT(ch)) {
4552 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4553 goto onError;
4554 }
4555 surrogate = 0;
4556 if (ch == '-') {
4557 /* '-' is absorbed; other terminating
4558 characters are preserved */
4559 s++;
4560 }
4561 }
4562 }
4563 else if ( ch == '+' ) {
4564 startinpos = s-starts;
4565 s++; /* consume '+' */
4566 if (s < e && *s == '-') { /* '+-' encodes '+' */
4567 s++;
4568 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4569 goto onError;
4570 }
4571 else if (s < e && !IS_BASE64(*s)) {
4572 s++;
4573 errmsg = "ill-formed sequence";
4574 goto utf7Error;
4575 }
4576 else { /* begin base64-encoded section */
4577 inShift = 1;
4578 surrogate = 0;
4579 shiftOutStart = writer.pos;
4580 base64bits = 0;
4581 base64buffer = 0;
4582 }
4583 }
4584 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4585 s++;
4586 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4587 goto onError;
4588 }
4589 else {
4590 startinpos = s-starts;
4591 s++;
4592 errmsg = "unexpected special character";
4593 goto utf7Error;
4594 }
4595 continue;
4596 utf7Error:
4597 endinpos = s-starts;
4598 if (unicode_decode_call_errorhandler_writer(
4599 errors, &errorHandler,
4600 "utf7", errmsg,
4601 &starts, &e, &startinpos, &endinpos, &exc, &s,
4602 &writer))
4603 goto onError;
4604 }
4605
4606 /* end of string */
4607
4608 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4609 /* if we're in an inconsistent state, that's an error */
4610 inShift = 0;
4611 if (surrogate ||
4612 (base64bits >= 6) ||
4613 (base64bits > 0 && base64buffer != 0)) {
4614 endinpos = size;
4615 if (unicode_decode_call_errorhandler_writer(
4616 errors, &errorHandler,
4617 "utf7", "unterminated shift sequence",
4618 &starts, &e, &startinpos, &endinpos, &exc, &s,
4619 &writer))
4620 goto onError;
4621 if (s < e)
4622 goto restart;
4623 }
4624 }
4625
4626 /* return state */
4627 if (consumed) {
4628 if (inShift) {
4629 *consumed = startinpos;
4630 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4631 PyObject *result = PyUnicode_FromKindAndData(
4632 writer.kind, writer.data, shiftOutStart);
4633 Py_XDECREF(errorHandler);
4634 Py_XDECREF(exc);
4635 _PyUnicodeWriter_Dealloc(&writer);
4636 return result;
4637 }
4638 writer.pos = shiftOutStart; /* back off output */
4639 }
4640 else {
4641 *consumed = s-starts;
4642 }
4643 }
4644
4645 Py_XDECREF(errorHandler);
4646 Py_XDECREF(exc);
4647 return _PyUnicodeWriter_Finish(&writer);
4648
4649 onError:
4650 Py_XDECREF(errorHandler);
4651 Py_XDECREF(exc);
4652 _PyUnicodeWriter_Dealloc(&writer);
4653 return NULL;
4654 }
4655
4656
4657 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4658 _PyUnicode_EncodeUTF7(PyObject *str,
4659 int base64SetO,
4660 int base64WhiteSpace,
4661 const char *errors)
4662 {
4663 int kind;
4664 void *data;
4665 Py_ssize_t len;
4666 PyObject *v;
4667 int inShift = 0;
4668 Py_ssize_t i;
4669 unsigned int base64bits = 0;
4670 unsigned long base64buffer = 0;
4671 char * out;
4672 char * start;
4673
4674 if (PyUnicode_READY(str) == -1)
4675 return NULL;
4676 kind = PyUnicode_KIND(str);
4677 data = PyUnicode_DATA(str);
4678 len = PyUnicode_GET_LENGTH(str);
4679
4680 if (len == 0)
4681 return PyBytes_FromStringAndSize(NULL, 0);
4682
4683 /* It might be possible to tighten this worst case */
4684 if (len > PY_SSIZE_T_MAX / 8)
4685 return PyErr_NoMemory();
4686 v = PyBytes_FromStringAndSize(NULL, len * 8);
4687 if (v == NULL)
4688 return NULL;
4689
4690 start = out = PyBytes_AS_STRING(v);
4691 for (i = 0; i < len; ++i) {
4692 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4693
4694 if (inShift) {
4695 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4696 /* shifting out */
4697 if (base64bits) { /* output remaining bits */
4698 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4699 base64buffer = 0;
4700 base64bits = 0;
4701 }
4702 inShift = 0;
4703 /* Characters not in the BASE64 set implicitly unshift the sequence
4704 so no '-' is required, except if the character is itself a '-' */
4705 if (IS_BASE64(ch) || ch == '-') {
4706 *out++ = '-';
4707 }
4708 *out++ = (char) ch;
4709 }
4710 else {
4711 goto encode_char;
4712 }
4713 }
4714 else { /* not in a shift sequence */
4715 if (ch == '+') {
4716 *out++ = '+';
4717 *out++ = '-';
4718 }
4719 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4720 *out++ = (char) ch;
4721 }
4722 else {
4723 *out++ = '+';
4724 inShift = 1;
4725 goto encode_char;
4726 }
4727 }
4728 continue;
4729 encode_char:
4730 if (ch >= 0x10000) {
4731 assert(ch <= MAX_UNICODE);
4732
4733 /* code first surrogate */
4734 base64bits += 16;
4735 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4736 while (base64bits >= 6) {
4737 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4738 base64bits -= 6;
4739 }
4740 /* prepare second surrogate */
4741 ch = Py_UNICODE_LOW_SURROGATE(ch);
4742 }
4743 base64bits += 16;
4744 base64buffer = (base64buffer << 16) | ch;
4745 while (base64bits >= 6) {
4746 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4747 base64bits -= 6;
4748 }
4749 }
4750 if (base64bits)
4751 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4752 if (inShift)
4753 *out++ = '-';
4754 if (_PyBytes_Resize(&v, out - start) < 0)
4755 return NULL;
4756 return v;
4757 }
4758 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4759 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4760 Py_ssize_t size,
4761 int base64SetO,
4762 int base64WhiteSpace,
4763 const char *errors)
4764 {
4765 PyObject *result;
4766 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4767 if (tmp == NULL)
4768 return NULL;
4769 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4770 base64WhiteSpace, errors);
4771 Py_DECREF(tmp);
4772 return result;
4773 }
4774
4775 #undef IS_BASE64
4776 #undef FROM_BASE64
4777 #undef TO_BASE64
4778 #undef DECODE_DIRECT
4779 #undef ENCODE_DIRECT
4780
4781 /* --- UTF-8 Codec -------------------------------------------------------- */
4782
4783 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4784 PyUnicode_DecodeUTF8(const char *s,
4785 Py_ssize_t size,
4786 const char *errors)
4787 {
4788 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4789 }
4790
4791 #include "stringlib/asciilib.h"
4792 #include "stringlib/codecs.h"
4793 #include "stringlib/undef.h"
4794
4795 #include "stringlib/ucs1lib.h"
4796 #include "stringlib/codecs.h"
4797 #include "stringlib/undef.h"
4798
4799 #include "stringlib/ucs2lib.h"
4800 #include "stringlib/codecs.h"
4801 #include "stringlib/undef.h"
4802
4803 #include "stringlib/ucs4lib.h"
4804 #include "stringlib/codecs.h"
4805 #include "stringlib/undef.h"
4806
4807 /* Mask to quickly check whether a C 'long' contains a
4808 non-ASCII, UTF8-encoded char. */
4809 #if (SIZEOF_LONG == 8)
4810 # define ASCII_CHAR_MASK 0x8080808080808080UL
4811 #elif (SIZEOF_LONG == 4)
4812 # define ASCII_CHAR_MASK 0x80808080UL
4813 #else
4814 # error C 'long' size should be either 4 or 8!
4815 #endif
4816
4817 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4818 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4819 {
4820 const char *p = start;
4821 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4822
4823 /*
4824 * Issue #17237: m68k is a bit different from most architectures in
4825 * that objects do not use "natural alignment" - for example, int and
4826 * long are only aligned at 2-byte boundaries. Therefore the assert()
4827 * won't work; also, tests have shown that skipping the "optimised
4828 * version" will even speed up m68k.
4829 */
4830 #if !defined(__m68k__)
4831 #if SIZEOF_LONG <= SIZEOF_VOID_P
4832 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4833 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4834 /* Fast path, see in STRINGLIB(utf8_decode) for
4835 an explanation. */
4836 /* Help allocation */
4837 const char *_p = p;
4838 Py_UCS1 * q = dest;
4839 while (_p < aligned_end) {
4840 unsigned long value = *(const unsigned long *) _p;
4841 if (value & ASCII_CHAR_MASK)
4842 break;
4843 *((unsigned long *)q) = value;
4844 _p += SIZEOF_LONG;
4845 q += SIZEOF_LONG;
4846 }
4847 p = _p;
4848 while (p < end) {
4849 if ((unsigned char)*p & 0x80)
4850 break;
4851 *q++ = *p++;
4852 }
4853 return p - start;
4854 }
4855 #endif
4856 #endif
4857 while (p < end) {
4858 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4859 for an explanation. */
4860 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4861 /* Help allocation */
4862 const char *_p = p;
4863 while (_p < aligned_end) {
4864 unsigned long value = *(unsigned long *) _p;
4865 if (value & ASCII_CHAR_MASK)
4866 break;
4867 _p += SIZEOF_LONG;
4868 }
4869 p = _p;
4870 if (_p == end)
4871 break;
4872 }
4873 if ((unsigned char)*p & 0x80)
4874 break;
4875 ++p;
4876 }
4877 memcpy(dest, start, p - start);
4878 return p - start;
4879 }
4880
4881 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)4882 unicode_decode_utf8(const char *s, Py_ssize_t size,
4883 _Py_error_handler error_handler, const char *errors,
4884 Py_ssize_t *consumed)
4885 {
4886 _PyUnicodeWriter writer;
4887 const char *starts = s;
4888 const char *end = s + size;
4889
4890 Py_ssize_t startinpos;
4891 Py_ssize_t endinpos;
4892 const char *errmsg = "";
4893 PyObject *error_handler_obj = NULL;
4894 PyObject *exc = NULL;
4895
4896 if (size == 0) {
4897 if (consumed)
4898 *consumed = 0;
4899 _Py_RETURN_UNICODE_EMPTY();
4900 }
4901
4902 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4903 if (size == 1 && (unsigned char)s[0] < 128) {
4904 if (consumed)
4905 *consumed = 1;
4906 return get_latin1_char((unsigned char)s[0]);
4907 }
4908
4909 _PyUnicodeWriter_Init(&writer);
4910 writer.min_length = size;
4911 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4912 goto onError;
4913
4914 writer.pos = ascii_decode(s, end, writer.data);
4915 s += writer.pos;
4916 while (s < end) {
4917 Py_UCS4 ch;
4918 int kind = writer.kind;
4919
4920 if (kind == PyUnicode_1BYTE_KIND) {
4921 if (PyUnicode_IS_ASCII(writer.buffer))
4922 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4923 else
4924 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4925 } else if (kind == PyUnicode_2BYTE_KIND) {
4926 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4927 } else {
4928 assert(kind == PyUnicode_4BYTE_KIND);
4929 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4930 }
4931
4932 switch (ch) {
4933 case 0:
4934 if (s == end || consumed)
4935 goto End;
4936 errmsg = "unexpected end of data";
4937 startinpos = s - starts;
4938 endinpos = end - starts;
4939 break;
4940 case 1:
4941 errmsg = "invalid start byte";
4942 startinpos = s - starts;
4943 endinpos = startinpos + 1;
4944 break;
4945 case 2:
4946 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4947 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4948 {
4949 /* Truncated surrogate code in range D800-DFFF */
4950 goto End;
4951 }
4952 /* fall through */
4953 case 3:
4954 case 4:
4955 errmsg = "invalid continuation byte";
4956 startinpos = s - starts;
4957 endinpos = startinpos + ch - 1;
4958 break;
4959 default:
4960 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4961 goto onError;
4962 continue;
4963 }
4964
4965 if (error_handler == _Py_ERROR_UNKNOWN)
4966 error_handler = _Py_GetErrorHandler(errors);
4967
4968 switch (error_handler) {
4969 case _Py_ERROR_IGNORE:
4970 s += (endinpos - startinpos);
4971 break;
4972
4973 case _Py_ERROR_REPLACE:
4974 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4975 goto onError;
4976 s += (endinpos - startinpos);
4977 break;
4978
4979 case _Py_ERROR_SURROGATEESCAPE:
4980 {
4981 Py_ssize_t i;
4982
4983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4984 goto onError;
4985 for (i=startinpos; i<endinpos; i++) {
4986 ch = (Py_UCS4)(unsigned char)(starts[i]);
4987 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4988 ch + 0xdc00);
4989 writer.pos++;
4990 }
4991 s += (endinpos - startinpos);
4992 break;
4993 }
4994
4995 default:
4996 if (unicode_decode_call_errorhandler_writer(
4997 errors, &error_handler_obj,
4998 "utf-8", errmsg,
4999 &starts, &end, &startinpos, &endinpos, &exc, &s,
5000 &writer))
5001 goto onError;
5002 }
5003 }
5004
5005 End:
5006 if (consumed)
5007 *consumed = s - starts;
5008
5009 Py_XDECREF(error_handler_obj);
5010 Py_XDECREF(exc);
5011 return _PyUnicodeWriter_Finish(&writer);
5012
5013 onError:
5014 Py_XDECREF(error_handler_obj);
5015 Py_XDECREF(exc);
5016 _PyUnicodeWriter_Dealloc(&writer);
5017 return NULL;
5018 }
5019
5020
5021 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5022 PyUnicode_DecodeUTF8Stateful(const char *s,
5023 Py_ssize_t size,
5024 const char *errors,
5025 Py_ssize_t *consumed)
5026 {
5027 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5028 }
5029
5030
5031 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5032 non-zero, use strict error handler otherwise.
5033
5034 On success, write a pointer to a newly allocated wide character string into
5035 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5036 (in number of wchar_t units) into *wlen (if wlen is set).
5037
5038 On memory allocation failure, return -1.
5039
5040 On decoding error (if surrogateescape is zero), return -2. If wlen is
5041 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5042 is not NULL, write the decoding error message into *reason. */
5043 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5044 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5045 const char **reason, _Py_error_handler errors)
5046 {
5047 const char *orig_s = s;
5048 const char *e;
5049 wchar_t *unicode;
5050 Py_ssize_t outpos;
5051
5052 int surrogateescape = 0;
5053 int surrogatepass = 0;
5054 switch (errors)
5055 {
5056 case _Py_ERROR_STRICT:
5057 break;
5058 case _Py_ERROR_SURROGATEESCAPE:
5059 surrogateescape = 1;
5060 break;
5061 case _Py_ERROR_SURROGATEPASS:
5062 surrogatepass = 1;
5063 break;
5064 default:
5065 return -3;
5066 }
5067
5068 /* Note: size will always be longer than the resulting Unicode
5069 character count */
5070 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5071 return -1;
5072 }
5073
5074 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5075 if (!unicode) {
5076 return -1;
5077 }
5078
5079 /* Unpack UTF-8 encoded data */
5080 e = s + size;
5081 outpos = 0;
5082 while (s < e) {
5083 Py_UCS4 ch;
5084 #if SIZEOF_WCHAR_T == 4
5085 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5086 #else
5087 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5088 #endif
5089 if (ch > 0xFF) {
5090 #if SIZEOF_WCHAR_T == 4
5091 Py_UNREACHABLE();
5092 #else
5093 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5094 /* write a surrogate pair */
5095 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5096 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5097 #endif
5098 }
5099 else {
5100 if (!ch && s == e) {
5101 break;
5102 }
5103
5104 if (surrogateescape) {
5105 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5106 }
5107 else {
5108 /* Is it a valid three-byte code? */
5109 if (surrogatepass
5110 && (e - s) >= 3
5111 && (s[0] & 0xf0) == 0xe0
5112 && (s[1] & 0xc0) == 0x80
5113 && (s[2] & 0xc0) == 0x80)
5114 {
5115 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5116 s += 3;
5117 unicode[outpos++] = ch;
5118 }
5119 else {
5120 PyMem_RawFree(unicode );
5121 if (reason != NULL) {
5122 switch (ch) {
5123 case 0:
5124 *reason = "unexpected end of data";
5125 break;
5126 case 1:
5127 *reason = "invalid start byte";
5128 break;
5129 /* 2, 3, 4 */
5130 default:
5131 *reason = "invalid continuation byte";
5132 break;
5133 }
5134 }
5135 if (wlen != NULL) {
5136 *wlen = s - orig_s;
5137 }
5138 return -2;
5139 }
5140 }
5141 }
5142 }
5143 unicode[outpos] = L'\0';
5144 if (wlen) {
5145 *wlen = outpos;
5146 }
5147 *wstr = unicode;
5148 return 0;
5149 }
5150
5151
5152 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5153 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5154 size_t *wlen)
5155 {
5156 wchar_t *wstr;
5157 int res = _Py_DecodeUTF8Ex(arg, arglen,
5158 &wstr, wlen,
5159 NULL, _Py_ERROR_SURROGATEESCAPE);
5160 if (res != 0) {
5161 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5162 assert(res != -3);
5163 if (wlen) {
5164 *wlen = (size_t)res;
5165 }
5166 return NULL;
5167 }
5168 return wstr;
5169 }
5170
5171
5172 /* UTF-8 encoder using the surrogateescape error handler .
5173
5174 On success, return 0 and write the newly allocated character string (use
5175 PyMem_Free() to free the memory) into *str.
5176
5177 On encoding failure, return -2 and write the position of the invalid
5178 surrogate character into *error_pos (if error_pos is set) and the decoding
5179 error message into *reason (if reason is set).
5180
5181 On memory allocation failure, return -1. */
5182 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5183 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5184 const char **reason, int raw_malloc, _Py_error_handler errors)
5185 {
5186 const Py_ssize_t max_char_size = 4;
5187 Py_ssize_t len = wcslen(text);
5188
5189 assert(len >= 0);
5190
5191 int surrogateescape = 0;
5192 int surrogatepass = 0;
5193 switch (errors)
5194 {
5195 case _Py_ERROR_STRICT:
5196 break;
5197 case _Py_ERROR_SURROGATEESCAPE:
5198 surrogateescape = 1;
5199 break;
5200 case _Py_ERROR_SURROGATEPASS:
5201 surrogatepass = 1;
5202 break;
5203 default:
5204 return -3;
5205 }
5206
5207 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5208 return -1;
5209 }
5210 char *bytes;
5211 if (raw_malloc) {
5212 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5213 }
5214 else {
5215 bytes = PyMem_Malloc((len + 1) * max_char_size);
5216 }
5217 if (bytes == NULL) {
5218 return -1;
5219 }
5220
5221 char *p = bytes;
5222 Py_ssize_t i;
5223 for (i = 0; i < len; ) {
5224 Py_ssize_t ch_pos = i;
5225 Py_UCS4 ch = text[i];
5226 i++;
5227 #if Py_UNICODE_SIZE == 2
5228 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5229 && i < len
5230 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5231 {
5232 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5233 i++;
5234 }
5235 #endif
5236
5237 if (ch < 0x80) {
5238 /* Encode ASCII */
5239 *p++ = (char) ch;
5240
5241 }
5242 else if (ch < 0x0800) {
5243 /* Encode Latin-1 */
5244 *p++ = (char)(0xc0 | (ch >> 6));
5245 *p++ = (char)(0x80 | (ch & 0x3f));
5246 }
5247 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5248 /* surrogateescape error handler */
5249 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5250 if (error_pos != NULL) {
5251 *error_pos = (size_t)ch_pos;
5252 }
5253 if (reason != NULL) {
5254 *reason = "encoding error";
5255 }
5256 if (raw_malloc) {
5257 PyMem_RawFree(bytes);
5258 }
5259 else {
5260 PyMem_Free(bytes);
5261 }
5262 return -2;
5263 }
5264 *p++ = (char)(ch & 0xff);
5265 }
5266 else if (ch < 0x10000) {
5267 *p++ = (char)(0xe0 | (ch >> 12));
5268 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5269 *p++ = (char)(0x80 | (ch & 0x3f));
5270 }
5271 else { /* ch >= 0x10000 */
5272 assert(ch <= MAX_UNICODE);
5273 /* Encode UCS4 Unicode ordinals */
5274 *p++ = (char)(0xf0 | (ch >> 18));
5275 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5276 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5277 *p++ = (char)(0x80 | (ch & 0x3f));
5278 }
5279 }
5280 *p++ = '\0';
5281
5282 size_t final_size = (p - bytes);
5283 char *bytes2;
5284 if (raw_malloc) {
5285 bytes2 = PyMem_RawRealloc(bytes, final_size);
5286 }
5287 else {
5288 bytes2 = PyMem_Realloc(bytes, final_size);
5289 }
5290 if (bytes2 == NULL) {
5291 if (error_pos != NULL) {
5292 *error_pos = (size_t)-1;
5293 }
5294 if (raw_malloc) {
5295 PyMem_RawFree(bytes);
5296 }
5297 else {
5298 PyMem_Free(bytes);
5299 }
5300 return -1;
5301 }
5302 *str = bytes2;
5303 return 0;
5304 }
5305
5306
5307 /* Primary internal function which creates utf8 encoded bytes objects.
5308
5309 Allocation strategy: if the string is short, convert into a stack buffer
5310 and allocate exactly as much space needed at the end. Else allocate the
5311 maximum possible needed (4 result bytes per Unicode character), and return
5312 the excess memory at the end.
5313 */
5314 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5315 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5316 const char *errors)
5317 {
5318 enum PyUnicode_Kind kind;
5319 void *data;
5320 Py_ssize_t size;
5321
5322 if (!PyUnicode_Check(unicode)) {
5323 PyErr_BadArgument();
5324 return NULL;
5325 }
5326
5327 if (PyUnicode_READY(unicode) == -1)
5328 return NULL;
5329
5330 if (PyUnicode_UTF8(unicode))
5331 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5332 PyUnicode_UTF8_LENGTH(unicode));
5333
5334 kind = PyUnicode_KIND(unicode);
5335 data = PyUnicode_DATA(unicode);
5336 size = PyUnicode_GET_LENGTH(unicode);
5337
5338 switch (kind) {
5339 default:
5340 Py_UNREACHABLE();
5341 case PyUnicode_1BYTE_KIND:
5342 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5343 assert(!PyUnicode_IS_ASCII(unicode));
5344 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5345 case PyUnicode_2BYTE_KIND:
5346 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5347 case PyUnicode_4BYTE_KIND:
5348 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5349 }
5350 }
5351
5352 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5353 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5354 {
5355 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5356 }
5357
5358
5359 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5360 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5361 Py_ssize_t size,
5362 const char *errors)
5363 {
5364 PyObject *v, *unicode;
5365
5366 unicode = PyUnicode_FromWideChar(s, size);
5367 if (unicode == NULL)
5368 return NULL;
5369 v = _PyUnicode_AsUTF8String(unicode, errors);
5370 Py_DECREF(unicode);
5371 return v;
5372 }
5373
5374 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5375 PyUnicode_AsUTF8String(PyObject *unicode)
5376 {
5377 return _PyUnicode_AsUTF8String(unicode, NULL);
5378 }
5379
5380 /* --- UTF-32 Codec ------------------------------------------------------- */
5381
5382 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5383 PyUnicode_DecodeUTF32(const char *s,
5384 Py_ssize_t size,
5385 const char *errors,
5386 int *byteorder)
5387 {
5388 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5389 }
5390
5391 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5392 PyUnicode_DecodeUTF32Stateful(const char *s,
5393 Py_ssize_t size,
5394 const char *errors,
5395 int *byteorder,
5396 Py_ssize_t *consumed)
5397 {
5398 const char *starts = s;
5399 Py_ssize_t startinpos;
5400 Py_ssize_t endinpos;
5401 _PyUnicodeWriter writer;
5402 const unsigned char *q, *e;
5403 int le, bo = 0; /* assume native ordering by default */
5404 const char *encoding;
5405 const char *errmsg = "";
5406 PyObject *errorHandler = NULL;
5407 PyObject *exc = NULL;
5408
5409 q = (unsigned char *)s;
5410 e = q + size;
5411
5412 if (byteorder)
5413 bo = *byteorder;
5414
5415 /* Check for BOM marks (U+FEFF) in the input and adjust current
5416 byte order setting accordingly. In native mode, the leading BOM
5417 mark is skipped, in all other modes, it is copied to the output
5418 stream as-is (giving a ZWNBSP character). */
5419 if (bo == 0 && size >= 4) {
5420 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5421 if (bom == 0x0000FEFF) {
5422 bo = -1;
5423 q += 4;
5424 }
5425 else if (bom == 0xFFFE0000) {
5426 bo = 1;
5427 q += 4;
5428 }
5429 if (byteorder)
5430 *byteorder = bo;
5431 }
5432
5433 if (q == e) {
5434 if (consumed)
5435 *consumed = size;
5436 _Py_RETURN_UNICODE_EMPTY();
5437 }
5438
5439 #ifdef WORDS_BIGENDIAN
5440 le = bo < 0;
5441 #else
5442 le = bo <= 0;
5443 #endif
5444 encoding = le ? "utf-32-le" : "utf-32-be";
5445
5446 _PyUnicodeWriter_Init(&writer);
5447 writer.min_length = (e - q + 3) / 4;
5448 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5449 goto onError;
5450
5451 while (1) {
5452 Py_UCS4 ch = 0;
5453 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5454
5455 if (e - q >= 4) {
5456 enum PyUnicode_Kind kind = writer.kind;
5457 void *data = writer.data;
5458 const unsigned char *last = e - 4;
5459 Py_ssize_t pos = writer.pos;
5460 if (le) {
5461 do {
5462 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5463 if (ch > maxch)
5464 break;
5465 if (kind != PyUnicode_1BYTE_KIND &&
5466 Py_UNICODE_IS_SURROGATE(ch))
5467 break;
5468 PyUnicode_WRITE(kind, data, pos++, ch);
5469 q += 4;
5470 } while (q <= last);
5471 }
5472 else {
5473 do {
5474 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5475 if (ch > maxch)
5476 break;
5477 if (kind != PyUnicode_1BYTE_KIND &&
5478 Py_UNICODE_IS_SURROGATE(ch))
5479 break;
5480 PyUnicode_WRITE(kind, data, pos++, ch);
5481 q += 4;
5482 } while (q <= last);
5483 }
5484 writer.pos = pos;
5485 }
5486
5487 if (Py_UNICODE_IS_SURROGATE(ch)) {
5488 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5489 startinpos = ((const char *)q) - starts;
5490 endinpos = startinpos + 4;
5491 }
5492 else if (ch <= maxch) {
5493 if (q == e || consumed)
5494 break;
5495 /* remaining bytes at the end? (size should be divisible by 4) */
5496 errmsg = "truncated data";
5497 startinpos = ((const char *)q) - starts;
5498 endinpos = ((const char *)e) - starts;
5499 }
5500 else {
5501 if (ch < 0x110000) {
5502 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5503 goto onError;
5504 q += 4;
5505 continue;
5506 }
5507 errmsg = "code point not in range(0x110000)";
5508 startinpos = ((const char *)q) - starts;
5509 endinpos = startinpos + 4;
5510 }
5511
5512 /* The remaining input chars are ignored if the callback
5513 chooses to skip the input */
5514 if (unicode_decode_call_errorhandler_writer(
5515 errors, &errorHandler,
5516 encoding, errmsg,
5517 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5518 &writer))
5519 goto onError;
5520 }
5521
5522 if (consumed)
5523 *consumed = (const char *)q-starts;
5524
5525 Py_XDECREF(errorHandler);
5526 Py_XDECREF(exc);
5527 return _PyUnicodeWriter_Finish(&writer);
5528
5529 onError:
5530 _PyUnicodeWriter_Dealloc(&writer);
5531 Py_XDECREF(errorHandler);
5532 Py_XDECREF(exc);
5533 return NULL;
5534 }
5535
5536 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5537 _PyUnicode_EncodeUTF32(PyObject *str,
5538 const char *errors,
5539 int byteorder)
5540 {
5541 enum PyUnicode_Kind kind;
5542 const void *data;
5543 Py_ssize_t len;
5544 PyObject *v;
5545 uint32_t *out;
5546 #if PY_LITTLE_ENDIAN
5547 int native_ordering = byteorder <= 0;
5548 #else
5549 int native_ordering = byteorder >= 0;
5550 #endif
5551 const char *encoding;
5552 Py_ssize_t nsize, pos;
5553 PyObject *errorHandler = NULL;
5554 PyObject *exc = NULL;
5555 PyObject *rep = NULL;
5556
5557 if (!PyUnicode_Check(str)) {
5558 PyErr_BadArgument();
5559 return NULL;
5560 }
5561 if (PyUnicode_READY(str) == -1)
5562 return NULL;
5563 kind = PyUnicode_KIND(str);
5564 data = PyUnicode_DATA(str);
5565 len = PyUnicode_GET_LENGTH(str);
5566
5567 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5568 return PyErr_NoMemory();
5569 nsize = len + (byteorder == 0);
5570 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5571 if (v == NULL)
5572 return NULL;
5573
5574 /* output buffer is 4-bytes aligned */
5575 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5576 out = (uint32_t *)PyBytes_AS_STRING(v);
5577 if (byteorder == 0)
5578 *out++ = 0xFEFF;
5579 if (len == 0)
5580 goto done;
5581
5582 if (byteorder == -1)
5583 encoding = "utf-32-le";
5584 else if (byteorder == 1)
5585 encoding = "utf-32-be";
5586 else
5587 encoding = "utf-32";
5588
5589 if (kind == PyUnicode_1BYTE_KIND) {
5590 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5591 goto done;
5592 }
5593
5594 pos = 0;
5595 while (pos < len) {
5596 Py_ssize_t repsize, moreunits;
5597
5598 if (kind == PyUnicode_2BYTE_KIND) {
5599 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5600 &out, native_ordering);
5601 }
5602 else {
5603 assert(kind == PyUnicode_4BYTE_KIND);
5604 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5605 &out, native_ordering);
5606 }
5607 if (pos == len)
5608 break;
5609
5610 rep = unicode_encode_call_errorhandler(
5611 errors, &errorHandler,
5612 encoding, "surrogates not allowed",
5613 str, &exc, pos, pos + 1, &pos);
5614 if (!rep)
5615 goto error;
5616
5617 if (PyBytes_Check(rep)) {
5618 repsize = PyBytes_GET_SIZE(rep);
5619 if (repsize & 3) {
5620 raise_encode_exception(&exc, encoding,
5621 str, pos - 1, pos,
5622 "surrogates not allowed");
5623 goto error;
5624 }
5625 moreunits = repsize / 4;
5626 }
5627 else {
5628 assert(PyUnicode_Check(rep));
5629 if (PyUnicode_READY(rep) < 0)
5630 goto error;
5631 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5632 if (!PyUnicode_IS_ASCII(rep)) {
5633 raise_encode_exception(&exc, encoding,
5634 str, pos - 1, pos,
5635 "surrogates not allowed");
5636 goto error;
5637 }
5638 }
5639
5640 /* four bytes are reserved for each surrogate */
5641 if (moreunits > 1) {
5642 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5643 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5644 /* integer overflow */
5645 PyErr_NoMemory();
5646 goto error;
5647 }
5648 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5649 goto error;
5650 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5651 }
5652
5653 if (PyBytes_Check(rep)) {
5654 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5655 out += moreunits;
5656 } else /* rep is unicode */ {
5657 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5658 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5659 &out, native_ordering);
5660 }
5661
5662 Py_CLEAR(rep);
5663 }
5664
5665 /* Cut back to size actually needed. This is necessary for, for example,
5666 encoding of a string containing isolated surrogates and the 'ignore'
5667 handler is used. */
5668 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5669 if (nsize != PyBytes_GET_SIZE(v))
5670 _PyBytes_Resize(&v, nsize);
5671 Py_XDECREF(errorHandler);
5672 Py_XDECREF(exc);
5673 done:
5674 return v;
5675 error:
5676 Py_XDECREF(rep);
5677 Py_XDECREF(errorHandler);
5678 Py_XDECREF(exc);
5679 Py_XDECREF(v);
5680 return NULL;
5681 }
5682
5683 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5684 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5685 Py_ssize_t size,
5686 const char *errors,
5687 int byteorder)
5688 {
5689 PyObject *result;
5690 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5691 if (tmp == NULL)
5692 return NULL;
5693 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5694 Py_DECREF(tmp);
5695 return result;
5696 }
5697
5698 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5699 PyUnicode_AsUTF32String(PyObject *unicode)
5700 {
5701 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5702 }
5703
5704 /* --- UTF-16 Codec ------------------------------------------------------- */
5705
5706 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5707 PyUnicode_DecodeUTF16(const char *s,
5708 Py_ssize_t size,
5709 const char *errors,
5710 int *byteorder)
5711 {
5712 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5713 }
5714
5715 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5716 PyUnicode_DecodeUTF16Stateful(const char *s,
5717 Py_ssize_t size,
5718 const char *errors,
5719 int *byteorder,
5720 Py_ssize_t *consumed)
5721 {
5722 const char *starts = s;
5723 Py_ssize_t startinpos;
5724 Py_ssize_t endinpos;
5725 _PyUnicodeWriter writer;
5726 const unsigned char *q, *e;
5727 int bo = 0; /* assume native ordering by default */
5728 int native_ordering;
5729 const char *errmsg = "";
5730 PyObject *errorHandler = NULL;
5731 PyObject *exc = NULL;
5732 const char *encoding;
5733
5734 q = (unsigned char *)s;
5735 e = q + size;
5736
5737 if (byteorder)
5738 bo = *byteorder;
5739
5740 /* Check for BOM marks (U+FEFF) in the input and adjust current
5741 byte order setting accordingly. In native mode, the leading BOM
5742 mark is skipped, in all other modes, it is copied to the output
5743 stream as-is (giving a ZWNBSP character). */
5744 if (bo == 0 && size >= 2) {
5745 const Py_UCS4 bom = (q[1] << 8) | q[0];
5746 if (bom == 0xFEFF) {
5747 q += 2;
5748 bo = -1;
5749 }
5750 else if (bom == 0xFFFE) {
5751 q += 2;
5752 bo = 1;
5753 }
5754 if (byteorder)
5755 *byteorder = bo;
5756 }
5757
5758 if (q == e) {
5759 if (consumed)
5760 *consumed = size;
5761 _Py_RETURN_UNICODE_EMPTY();
5762 }
5763
5764 #if PY_LITTLE_ENDIAN
5765 native_ordering = bo <= 0;
5766 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5767 #else
5768 native_ordering = bo >= 0;
5769 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5770 #endif
5771
5772 /* Note: size will always be longer than the resulting Unicode
5773 character count normally. Error handler will take care of
5774 resizing when needed. */
5775 _PyUnicodeWriter_Init(&writer);
5776 writer.min_length = (e - q + 1) / 2;
5777 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5778 goto onError;
5779
5780 while (1) {
5781 Py_UCS4 ch = 0;
5782 if (e - q >= 2) {
5783 int kind = writer.kind;
5784 if (kind == PyUnicode_1BYTE_KIND) {
5785 if (PyUnicode_IS_ASCII(writer.buffer))
5786 ch = asciilib_utf16_decode(&q, e,
5787 (Py_UCS1*)writer.data, &writer.pos,
5788 native_ordering);
5789 else
5790 ch = ucs1lib_utf16_decode(&q, e,
5791 (Py_UCS1*)writer.data, &writer.pos,
5792 native_ordering);
5793 } else if (kind == PyUnicode_2BYTE_KIND) {
5794 ch = ucs2lib_utf16_decode(&q, e,
5795 (Py_UCS2*)writer.data, &writer.pos,
5796 native_ordering);
5797 } else {
5798 assert(kind == PyUnicode_4BYTE_KIND);
5799 ch = ucs4lib_utf16_decode(&q, e,
5800 (Py_UCS4*)writer.data, &writer.pos,
5801 native_ordering);
5802 }
5803 }
5804
5805 switch (ch)
5806 {
5807 case 0:
5808 /* remaining byte at the end? (size should be even) */
5809 if (q == e || consumed)
5810 goto End;
5811 errmsg = "truncated data";
5812 startinpos = ((const char *)q) - starts;
5813 endinpos = ((const char *)e) - starts;
5814 break;
5815 /* The remaining input chars are ignored if the callback
5816 chooses to skip the input */
5817 case 1:
5818 q -= 2;
5819 if (consumed)
5820 goto End;
5821 errmsg = "unexpected end of data";
5822 startinpos = ((const char *)q) - starts;
5823 endinpos = ((const char *)e) - starts;
5824 break;
5825 case 2:
5826 errmsg = "illegal encoding";
5827 startinpos = ((const char *)q) - 2 - starts;
5828 endinpos = startinpos + 2;
5829 break;
5830 case 3:
5831 errmsg = "illegal UTF-16 surrogate";
5832 startinpos = ((const char *)q) - 4 - starts;
5833 endinpos = startinpos + 2;
5834 break;
5835 default:
5836 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5837 goto onError;
5838 continue;
5839 }
5840
5841 if (unicode_decode_call_errorhandler_writer(
5842 errors,
5843 &errorHandler,
5844 encoding, errmsg,
5845 &starts,
5846 (const char **)&e,
5847 &startinpos,
5848 &endinpos,
5849 &exc,
5850 (const char **)&q,
5851 &writer))
5852 goto onError;
5853 }
5854
5855 End:
5856 if (consumed)
5857 *consumed = (const char *)q-starts;
5858
5859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
5861 return _PyUnicodeWriter_Finish(&writer);
5862
5863 onError:
5864 _PyUnicodeWriter_Dealloc(&writer);
5865 Py_XDECREF(errorHandler);
5866 Py_XDECREF(exc);
5867 return NULL;
5868 }
5869
5870 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5871 _PyUnicode_EncodeUTF16(PyObject *str,
5872 const char *errors,
5873 int byteorder)
5874 {
5875 enum PyUnicode_Kind kind;
5876 const void *data;
5877 Py_ssize_t len;
5878 PyObject *v;
5879 unsigned short *out;
5880 Py_ssize_t pairs;
5881 #if PY_BIG_ENDIAN
5882 int native_ordering = byteorder >= 0;
5883 #else
5884 int native_ordering = byteorder <= 0;
5885 #endif
5886 const char *encoding;
5887 Py_ssize_t nsize, pos;
5888 PyObject *errorHandler = NULL;
5889 PyObject *exc = NULL;
5890 PyObject *rep = NULL;
5891
5892 if (!PyUnicode_Check(str)) {
5893 PyErr_BadArgument();
5894 return NULL;
5895 }
5896 if (PyUnicode_READY(str) == -1)
5897 return NULL;
5898 kind = PyUnicode_KIND(str);
5899 data = PyUnicode_DATA(str);
5900 len = PyUnicode_GET_LENGTH(str);
5901
5902 pairs = 0;
5903 if (kind == PyUnicode_4BYTE_KIND) {
5904 const Py_UCS4 *in = (const Py_UCS4 *)data;
5905 const Py_UCS4 *end = in + len;
5906 while (in < end) {
5907 if (*in++ >= 0x10000) {
5908 pairs++;
5909 }
5910 }
5911 }
5912 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5913 return PyErr_NoMemory();
5914 }
5915 nsize = len + pairs + (byteorder == 0);
5916 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5917 if (v == NULL) {
5918 return NULL;
5919 }
5920
5921 /* output buffer is 2-bytes aligned */
5922 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5923 out = (unsigned short *)PyBytes_AS_STRING(v);
5924 if (byteorder == 0) {
5925 *out++ = 0xFEFF;
5926 }
5927 if (len == 0) {
5928 goto done;
5929 }
5930
5931 if (kind == PyUnicode_1BYTE_KIND) {
5932 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5933 goto done;
5934 }
5935
5936 if (byteorder < 0) {
5937 encoding = "utf-16-le";
5938 }
5939 else if (byteorder > 0) {
5940 encoding = "utf-16-be";
5941 }
5942 else {
5943 encoding = "utf-16";
5944 }
5945
5946 pos = 0;
5947 while (pos < len) {
5948 Py_ssize_t repsize, moreunits;
5949
5950 if (kind == PyUnicode_2BYTE_KIND) {
5951 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5952 &out, native_ordering);
5953 }
5954 else {
5955 assert(kind == PyUnicode_4BYTE_KIND);
5956 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5957 &out, native_ordering);
5958 }
5959 if (pos == len)
5960 break;
5961
5962 rep = unicode_encode_call_errorhandler(
5963 errors, &errorHandler,
5964 encoding, "surrogates not allowed",
5965 str, &exc, pos, pos + 1, &pos);
5966 if (!rep)
5967 goto error;
5968
5969 if (PyBytes_Check(rep)) {
5970 repsize = PyBytes_GET_SIZE(rep);
5971 if (repsize & 1) {
5972 raise_encode_exception(&exc, encoding,
5973 str, pos - 1, pos,
5974 "surrogates not allowed");
5975 goto error;
5976 }
5977 moreunits = repsize / 2;
5978 }
5979 else {
5980 assert(PyUnicode_Check(rep));
5981 if (PyUnicode_READY(rep) < 0)
5982 goto error;
5983 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5984 if (!PyUnicode_IS_ASCII(rep)) {
5985 raise_encode_exception(&exc, encoding,
5986 str, pos - 1, pos,
5987 "surrogates not allowed");
5988 goto error;
5989 }
5990 }
5991
5992 /* two bytes are reserved for each surrogate */
5993 if (moreunits > 1) {
5994 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5995 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5996 /* integer overflow */
5997 PyErr_NoMemory();
5998 goto error;
5999 }
6000 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6001 goto error;
6002 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6003 }
6004
6005 if (PyBytes_Check(rep)) {
6006 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6007 out += moreunits;
6008 } else /* rep is unicode */ {
6009 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6010 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6011 &out, native_ordering);
6012 }
6013
6014 Py_CLEAR(rep);
6015 }
6016
6017 /* Cut back to size actually needed. This is necessary for, for example,
6018 encoding of a string containing isolated surrogates and the 'ignore' handler
6019 is used. */
6020 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6021 if (nsize != PyBytes_GET_SIZE(v))
6022 _PyBytes_Resize(&v, nsize);
6023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
6025 done:
6026 return v;
6027 error:
6028 Py_XDECREF(rep);
6029 Py_XDECREF(errorHandler);
6030 Py_XDECREF(exc);
6031 Py_XDECREF(v);
6032 return NULL;
6033 #undef STORECHAR
6034 }
6035
6036 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6037 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6038 Py_ssize_t size,
6039 const char *errors,
6040 int byteorder)
6041 {
6042 PyObject *result;
6043 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6044 if (tmp == NULL)
6045 return NULL;
6046 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6047 Py_DECREF(tmp);
6048 return result;
6049 }
6050
6051 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6052 PyUnicode_AsUTF16String(PyObject *unicode)
6053 {
6054 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6055 }
6056
6057 /* --- Unicode Escape Codec ----------------------------------------------- */
6058
6059 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6060
6061 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)6062 _PyUnicode_DecodeUnicodeEscape(const char *s,
6063 Py_ssize_t size,
6064 const char *errors,
6065 const char **first_invalid_escape)
6066 {
6067 const char *starts = s;
6068 _PyUnicodeWriter writer;
6069 const char *end;
6070 PyObject *errorHandler = NULL;
6071 PyObject *exc = NULL;
6072
6073 // so we can remember if we've seen an invalid escape char or not
6074 *first_invalid_escape = NULL;
6075
6076 if (size == 0) {
6077 _Py_RETURN_UNICODE_EMPTY();
6078 }
6079 /* Escaped strings will always be longer than the resulting
6080 Unicode string, so we start with size here and then reduce the
6081 length after conversion to the true value.
6082 (but if the error callback returns a long replacement string
6083 we'll have to allocate more space) */
6084 _PyUnicodeWriter_Init(&writer);
6085 writer.min_length = size;
6086 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6087 goto onError;
6088 }
6089
6090 end = s + size;
6091 while (s < end) {
6092 unsigned char c = (unsigned char) *s++;
6093 Py_UCS4 ch;
6094 int count;
6095 Py_ssize_t startinpos;
6096 Py_ssize_t endinpos;
6097 const char *message;
6098
6099 #define WRITE_ASCII_CHAR(ch) \
6100 do { \
6101 assert(ch <= 127); \
6102 assert(writer.pos < writer.size); \
6103 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6104 } while(0)
6105
6106 #define WRITE_CHAR(ch) \
6107 do { \
6108 if (ch <= writer.maxchar) { \
6109 assert(writer.pos < writer.size); \
6110 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6111 } \
6112 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6113 goto onError; \
6114 } \
6115 } while(0)
6116
6117 /* Non-escape characters are interpreted as Unicode ordinals */
6118 if (c != '\\') {
6119 WRITE_CHAR(c);
6120 continue;
6121 }
6122
6123 startinpos = s - starts - 1;
6124 /* \ - Escapes */
6125 if (s >= end) {
6126 message = "\\ at end of string";
6127 goto error;
6128 }
6129 c = (unsigned char) *s++;
6130
6131 assert(writer.pos < writer.size);
6132 switch (c) {
6133
6134 /* \x escapes */
6135 case '\n': continue;
6136 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6137 case '\'': WRITE_ASCII_CHAR('\''); continue;
6138 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6139 case 'b': WRITE_ASCII_CHAR('\b'); continue;
6140 /* FF */
6141 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6142 case 't': WRITE_ASCII_CHAR('\t'); continue;
6143 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6144 case 'r': WRITE_ASCII_CHAR('\r'); continue;
6145 /* VT */
6146 case 'v': WRITE_ASCII_CHAR('\013'); continue;
6147 /* BEL, not classic C */
6148 case 'a': WRITE_ASCII_CHAR('\007'); continue;
6149
6150 /* \OOO (octal) escapes */
6151 case '0': case '1': case '2': case '3':
6152 case '4': case '5': case '6': case '7':
6153 ch = c - '0';
6154 if (s < end && '0' <= *s && *s <= '7') {
6155 ch = (ch<<3) + *s++ - '0';
6156 if (s < end && '0' <= *s && *s <= '7') {
6157 ch = (ch<<3) + *s++ - '0';
6158 }
6159 }
6160 WRITE_CHAR(ch);
6161 continue;
6162
6163 /* hex escapes */
6164 /* \xXX */
6165 case 'x':
6166 count = 2;
6167 message = "truncated \\xXX escape";
6168 goto hexescape;
6169
6170 /* \uXXXX */
6171 case 'u':
6172 count = 4;
6173 message = "truncated \\uXXXX escape";
6174 goto hexescape;
6175
6176 /* \UXXXXXXXX */
6177 case 'U':
6178 count = 8;
6179 message = "truncated \\UXXXXXXXX escape";
6180 hexescape:
6181 for (ch = 0; count && s < end; ++s, --count) {
6182 c = (unsigned char)*s;
6183 ch <<= 4;
6184 if (c >= '0' && c <= '9') {
6185 ch += c - '0';
6186 }
6187 else if (c >= 'a' && c <= 'f') {
6188 ch += c - ('a' - 10);
6189 }
6190 else if (c >= 'A' && c <= 'F') {
6191 ch += c - ('A' - 10);
6192 }
6193 else {
6194 break;
6195 }
6196 }
6197 if (count) {
6198 goto error;
6199 }
6200
6201 /* when we get here, ch is a 32-bit unicode character */
6202 if (ch > MAX_UNICODE) {
6203 message = "illegal Unicode character";
6204 goto error;
6205 }
6206
6207 WRITE_CHAR(ch);
6208 continue;
6209
6210 /* \N{name} */
6211 case 'N':
6212 if (ucnhash_CAPI == NULL) {
6213 /* load the unicode data module */
6214 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6215 PyUnicodeData_CAPSULE_NAME, 1);
6216 if (ucnhash_CAPI == NULL) {
6217 PyErr_SetString(
6218 PyExc_UnicodeError,
6219 "\\N escapes not supported (can't load unicodedata module)"
6220 );
6221 goto onError;
6222 }
6223 }
6224
6225 message = "malformed \\N character escape";
6226 if (s < end && *s == '{') {
6227 const char *start = ++s;
6228 size_t namelen;
6229 /* look for the closing brace */
6230 while (s < end && *s != '}')
6231 s++;
6232 namelen = s - start;
6233 if (namelen && s < end) {
6234 /* found a name. look it up in the unicode database */
6235 s++;
6236 ch = 0xffffffff; /* in case 'getcode' messes up */
6237 if (namelen <= INT_MAX &&
6238 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6239 &ch, 0)) {
6240 assert(ch <= MAX_UNICODE);
6241 WRITE_CHAR(ch);
6242 continue;
6243 }
6244 message = "unknown Unicode character name";
6245 }
6246 }
6247 goto error;
6248
6249 default:
6250 if (*first_invalid_escape == NULL) {
6251 *first_invalid_escape = s-1; /* Back up one char, since we've
6252 already incremented s. */
6253 }
6254 WRITE_ASCII_CHAR('\\');
6255 WRITE_CHAR(c);
6256 continue;
6257 }
6258
6259 error:
6260 endinpos = s-starts;
6261 writer.min_length = end - s + writer.pos;
6262 if (unicode_decode_call_errorhandler_writer(
6263 errors, &errorHandler,
6264 "unicodeescape", message,
6265 &starts, &end, &startinpos, &endinpos, &exc, &s,
6266 &writer)) {
6267 goto onError;
6268 }
6269 assert(end - s <= writer.size - writer.pos);
6270
6271 #undef WRITE_ASCII_CHAR
6272 #undef WRITE_CHAR
6273 }
6274
6275 Py_XDECREF(errorHandler);
6276 Py_XDECREF(exc);
6277 return _PyUnicodeWriter_Finish(&writer);
6278
6279 onError:
6280 _PyUnicodeWriter_Dealloc(&writer);
6281 Py_XDECREF(errorHandler);
6282 Py_XDECREF(exc);
6283 return NULL;
6284 }
6285
6286 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6287 PyUnicode_DecodeUnicodeEscape(const char *s,
6288 Py_ssize_t size,
6289 const char *errors)
6290 {
6291 const char *first_invalid_escape;
6292 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6293 &first_invalid_escape);
6294 if (result == NULL)
6295 return NULL;
6296 if (first_invalid_escape != NULL) {
6297 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6298 "invalid escape sequence '\\%c'",
6299 (unsigned char)*first_invalid_escape) < 0) {
6300 Py_DECREF(result);
6301 return NULL;
6302 }
6303 }
6304 return result;
6305 }
6306
6307 /* Return a Unicode-Escape string version of the Unicode object. */
6308
6309 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6310 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6311 {
6312 Py_ssize_t i, len;
6313 PyObject *repr;
6314 char *p;
6315 enum PyUnicode_Kind kind;
6316 void *data;
6317 Py_ssize_t expandsize;
6318
6319 /* Initial allocation is based on the longest-possible character
6320 escape.
6321
6322 For UCS1 strings it's '\xxx', 4 bytes per source character.
6323 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6324 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6325 */
6326
6327 if (!PyUnicode_Check(unicode)) {
6328 PyErr_BadArgument();
6329 return NULL;
6330 }
6331 if (PyUnicode_READY(unicode) == -1) {
6332 return NULL;
6333 }
6334
6335 len = PyUnicode_GET_LENGTH(unicode);
6336 if (len == 0) {
6337 return PyBytes_FromStringAndSize(NULL, 0);
6338 }
6339
6340 kind = PyUnicode_KIND(unicode);
6341 data = PyUnicode_DATA(unicode);
6342 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6343 bytes, and 1 byte characters 4. */
6344 expandsize = kind * 2 + 2;
6345 if (len > PY_SSIZE_T_MAX / expandsize) {
6346 return PyErr_NoMemory();
6347 }
6348 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6349 if (repr == NULL) {
6350 return NULL;
6351 }
6352
6353 p = PyBytes_AS_STRING(repr);
6354 for (i = 0; i < len; i++) {
6355 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6356
6357 /* U+0000-U+00ff range */
6358 if (ch < 0x100) {
6359 if (ch >= ' ' && ch < 127) {
6360 if (ch != '\\') {
6361 /* Copy printable US ASCII as-is */
6362 *p++ = (char) ch;
6363 }
6364 /* Escape backslashes */
6365 else {
6366 *p++ = '\\';
6367 *p++ = '\\';
6368 }
6369 }
6370
6371 /* Map special whitespace to '\t', \n', '\r' */
6372 else if (ch == '\t') {
6373 *p++ = '\\';
6374 *p++ = 't';
6375 }
6376 else if (ch == '\n') {
6377 *p++ = '\\';
6378 *p++ = 'n';
6379 }
6380 else if (ch == '\r') {
6381 *p++ = '\\';
6382 *p++ = 'r';
6383 }
6384
6385 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6386 else {
6387 *p++ = '\\';
6388 *p++ = 'x';
6389 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390 *p++ = Py_hexdigits[ch & 0x000F];
6391 }
6392 }
6393 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6394 else if (ch < 0x10000) {
6395 *p++ = '\\';
6396 *p++ = 'u';
6397 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6398 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6399 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6400 *p++ = Py_hexdigits[ch & 0x000F];
6401 }
6402 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6403 else {
6404
6405 /* Make sure that the first two digits are zero */
6406 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6407 *p++ = '\\';
6408 *p++ = 'U';
6409 *p++ = '0';
6410 *p++ = '0';
6411 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6412 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6413 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6414 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6415 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6416 *p++ = Py_hexdigits[ch & 0x0000000F];
6417 }
6418 }
6419
6420 assert(p - PyBytes_AS_STRING(repr) > 0);
6421 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6422 return NULL;
6423 }
6424 return repr;
6425 }
6426
6427 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6428 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6429 Py_ssize_t size)
6430 {
6431 PyObject *result;
6432 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6433 if (tmp == NULL) {
6434 return NULL;
6435 }
6436
6437 result = PyUnicode_AsUnicodeEscapeString(tmp);
6438 Py_DECREF(tmp);
6439 return result;
6440 }
6441
6442 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6443
6444 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6445 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6446 Py_ssize_t size,
6447 const char *errors)
6448 {
6449 const char *starts = s;
6450 _PyUnicodeWriter writer;
6451 const char *end;
6452 PyObject *errorHandler = NULL;
6453 PyObject *exc = NULL;
6454
6455 if (size == 0) {
6456 _Py_RETURN_UNICODE_EMPTY();
6457 }
6458
6459 /* Escaped strings will always be longer than the resulting
6460 Unicode string, so we start with size here and then reduce the
6461 length after conversion to the true value. (But decoding error
6462 handler might have to resize the string) */
6463 _PyUnicodeWriter_Init(&writer);
6464 writer.min_length = size;
6465 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6466 goto onError;
6467 }
6468
6469 end = s + size;
6470 while (s < end) {
6471 unsigned char c = (unsigned char) *s++;
6472 Py_UCS4 ch;
6473 int count;
6474 Py_ssize_t startinpos;
6475 Py_ssize_t endinpos;
6476 const char *message;
6477
6478 #define WRITE_CHAR(ch) \
6479 do { \
6480 if (ch <= writer.maxchar) { \
6481 assert(writer.pos < writer.size); \
6482 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483 } \
6484 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485 goto onError; \
6486 } \
6487 } while(0)
6488
6489 /* Non-escape characters are interpreted as Unicode ordinals */
6490 if (c != '\\' || s >= end) {
6491 WRITE_CHAR(c);
6492 continue;
6493 }
6494
6495 c = (unsigned char) *s++;
6496 if (c == 'u') {
6497 count = 4;
6498 message = "truncated \\uXXXX escape";
6499 }
6500 else if (c == 'U') {
6501 count = 8;
6502 message = "truncated \\UXXXXXXXX escape";
6503 }
6504 else {
6505 assert(writer.pos < writer.size);
6506 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6507 WRITE_CHAR(c);
6508 continue;
6509 }
6510 startinpos = s - starts - 2;
6511
6512 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6513 for (ch = 0; count && s < end; ++s, --count) {
6514 c = (unsigned char)*s;
6515 ch <<= 4;
6516 if (c >= '0' && c <= '9') {
6517 ch += c - '0';
6518 }
6519 else if (c >= 'a' && c <= 'f') {
6520 ch += c - ('a' - 10);
6521 }
6522 else if (c >= 'A' && c <= 'F') {
6523 ch += c - ('A' - 10);
6524 }
6525 else {
6526 break;
6527 }
6528 }
6529 if (!count) {
6530 if (ch <= MAX_UNICODE) {
6531 WRITE_CHAR(ch);
6532 continue;
6533 }
6534 message = "\\Uxxxxxxxx out of range";
6535 }
6536
6537 endinpos = s-starts;
6538 writer.min_length = end - s + writer.pos;
6539 if (unicode_decode_call_errorhandler_writer(
6540 errors, &errorHandler,
6541 "rawunicodeescape", message,
6542 &starts, &end, &startinpos, &endinpos, &exc, &s,
6543 &writer)) {
6544 goto onError;
6545 }
6546 assert(end - s <= writer.size - writer.pos);
6547
6548 #undef WRITE_CHAR
6549 }
6550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
6552 return _PyUnicodeWriter_Finish(&writer);
6553
6554 onError:
6555 _PyUnicodeWriter_Dealloc(&writer);
6556 Py_XDECREF(errorHandler);
6557 Py_XDECREF(exc);
6558 return NULL;
6559
6560 }
6561
6562
6563 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6564 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6565 {
6566 PyObject *repr;
6567 char *p;
6568 Py_ssize_t expandsize, pos;
6569 int kind;
6570 void *data;
6571 Py_ssize_t len;
6572
6573 if (!PyUnicode_Check(unicode)) {
6574 PyErr_BadArgument();
6575 return NULL;
6576 }
6577 if (PyUnicode_READY(unicode) == -1) {
6578 return NULL;
6579 }
6580 kind = PyUnicode_KIND(unicode);
6581 data = PyUnicode_DATA(unicode);
6582 len = PyUnicode_GET_LENGTH(unicode);
6583 if (kind == PyUnicode_1BYTE_KIND) {
6584 return PyBytes_FromStringAndSize(data, len);
6585 }
6586
6587 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6588 bytes, and 1 byte characters 4. */
6589 expandsize = kind * 2 + 2;
6590
6591 if (len > PY_SSIZE_T_MAX / expandsize) {
6592 return PyErr_NoMemory();
6593 }
6594 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6595 if (repr == NULL) {
6596 return NULL;
6597 }
6598 if (len == 0) {
6599 return repr;
6600 }
6601
6602 p = PyBytes_AS_STRING(repr);
6603 for (pos = 0; pos < len; pos++) {
6604 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6605
6606 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6607 if (ch < 0x100) {
6608 *p++ = (char) ch;
6609 }
6610 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6611 else if (ch < 0x10000) {
6612 *p++ = '\\';
6613 *p++ = 'u';
6614 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6615 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6616 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6617 *p++ = Py_hexdigits[ch & 15];
6618 }
6619 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6620 else {
6621 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6622 *p++ = '\\';
6623 *p++ = 'U';
6624 *p++ = '0';
6625 *p++ = '0';
6626 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6627 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6628 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6629 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6630 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6631 *p++ = Py_hexdigits[ch & 15];
6632 }
6633 }
6634
6635 assert(p > PyBytes_AS_STRING(repr));
6636 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6637 return NULL;
6638 }
6639 return repr;
6640 }
6641
6642 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6643 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6644 Py_ssize_t size)
6645 {
6646 PyObject *result;
6647 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6648 if (tmp == NULL)
6649 return NULL;
6650 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6651 Py_DECREF(tmp);
6652 return result;
6653 }
6654
6655 /* --- Latin-1 Codec ------------------------------------------------------ */
6656
6657 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6658 PyUnicode_DecodeLatin1(const char *s,
6659 Py_ssize_t size,
6660 const char *errors)
6661 {
6662 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6663 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6664 }
6665
6666 /* create or adjust a UnicodeEncodeError */
6667 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6668 make_encode_exception(PyObject **exceptionObject,
6669 const char *encoding,
6670 PyObject *unicode,
6671 Py_ssize_t startpos, Py_ssize_t endpos,
6672 const char *reason)
6673 {
6674 if (*exceptionObject == NULL) {
6675 *exceptionObject = PyObject_CallFunction(
6676 PyExc_UnicodeEncodeError, "sOnns",
6677 encoding, unicode, startpos, endpos, reason);
6678 }
6679 else {
6680 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6681 goto onError;
6682 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6683 goto onError;
6684 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6685 goto onError;
6686 return;
6687 onError:
6688 Py_CLEAR(*exceptionObject);
6689 }
6690 }
6691
6692 /* raises a UnicodeEncodeError */
6693 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6694 raise_encode_exception(PyObject **exceptionObject,
6695 const char *encoding,
6696 PyObject *unicode,
6697 Py_ssize_t startpos, Py_ssize_t endpos,
6698 const char *reason)
6699 {
6700 make_encode_exception(exceptionObject,
6701 encoding, unicode, startpos, endpos, reason);
6702 if (*exceptionObject != NULL)
6703 PyCodec_StrictErrors(*exceptionObject);
6704 }
6705
6706 /* error handling callback helper:
6707 build arguments, call the callback and check the arguments,
6708 put the result into newpos and return the replacement string, which
6709 has to be freed by the caller */
6710 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6711 unicode_encode_call_errorhandler(const char *errors,
6712 PyObject **errorHandler,
6713 const char *encoding, const char *reason,
6714 PyObject *unicode, PyObject **exceptionObject,
6715 Py_ssize_t startpos, Py_ssize_t endpos,
6716 Py_ssize_t *newpos)
6717 {
6718 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6719 Py_ssize_t len;
6720 PyObject *restuple;
6721 PyObject *resunicode;
6722
6723 if (*errorHandler == NULL) {
6724 *errorHandler = PyCodec_LookupError(errors);
6725 if (*errorHandler == NULL)
6726 return NULL;
6727 }
6728
6729 if (PyUnicode_READY(unicode) == -1)
6730 return NULL;
6731 len = PyUnicode_GET_LENGTH(unicode);
6732
6733 make_encode_exception(exceptionObject,
6734 encoding, unicode, startpos, endpos, reason);
6735 if (*exceptionObject == NULL)
6736 return NULL;
6737
6738 restuple = PyObject_CallFunctionObjArgs(
6739 *errorHandler, *exceptionObject, NULL);
6740 if (restuple == NULL)
6741 return NULL;
6742 if (!PyTuple_Check(restuple)) {
6743 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6744 Py_DECREF(restuple);
6745 return NULL;
6746 }
6747 if (!PyArg_ParseTuple(restuple, argparse,
6748 &resunicode, newpos)) {
6749 Py_DECREF(restuple);
6750 return NULL;
6751 }
6752 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6753 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6754 Py_DECREF(restuple);
6755 return NULL;
6756 }
6757 if (*newpos<0)
6758 *newpos = len + *newpos;
6759 if (*newpos<0 || *newpos>len) {
6760 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6761 Py_DECREF(restuple);
6762 return NULL;
6763 }
6764 Py_INCREF(resunicode);
6765 Py_DECREF(restuple);
6766 return resunicode;
6767 }
6768
6769 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6770 unicode_encode_ucs1(PyObject *unicode,
6771 const char *errors,
6772 const Py_UCS4 limit)
6773 {
6774 /* input state */
6775 Py_ssize_t pos=0, size;
6776 int kind;
6777 void *data;
6778 /* pointer into the output */
6779 char *str;
6780 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6781 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6782 PyObject *error_handler_obj = NULL;
6783 PyObject *exc = NULL;
6784 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6785 PyObject *rep = NULL;
6786 /* output object */
6787 _PyBytesWriter writer;
6788
6789 if (PyUnicode_READY(unicode) == -1)
6790 return NULL;
6791 size = PyUnicode_GET_LENGTH(unicode);
6792 kind = PyUnicode_KIND(unicode);
6793 data = PyUnicode_DATA(unicode);
6794 /* allocate enough for a simple encoding without
6795 replacements, if we need more, we'll resize */
6796 if (size == 0)
6797 return PyBytes_FromStringAndSize(NULL, 0);
6798
6799 _PyBytesWriter_Init(&writer);
6800 str = _PyBytesWriter_Alloc(&writer, size);
6801 if (str == NULL)
6802 return NULL;
6803
6804 while (pos < size) {
6805 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6806
6807 /* can we encode this? */
6808 if (ch < limit) {
6809 /* no overflow check, because we know that the space is enough */
6810 *str++ = (char)ch;
6811 ++pos;
6812 }
6813 else {
6814 Py_ssize_t newpos, i;
6815 /* startpos for collecting unencodable chars */
6816 Py_ssize_t collstart = pos;
6817 Py_ssize_t collend = collstart + 1;
6818 /* find all unecodable characters */
6819
6820 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6821 ++collend;
6822
6823 /* Only overallocate the buffer if it's not the last write */
6824 writer.overallocate = (collend < size);
6825
6826 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6827 if (error_handler == _Py_ERROR_UNKNOWN)
6828 error_handler = _Py_GetErrorHandler(errors);
6829
6830 switch (error_handler) {
6831 case _Py_ERROR_STRICT:
6832 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6833 goto onError;
6834
6835 case _Py_ERROR_REPLACE:
6836 memset(str, '?', collend - collstart);
6837 str += (collend - collstart);
6838 /* fall through */
6839 case _Py_ERROR_IGNORE:
6840 pos = collend;
6841 break;
6842
6843 case _Py_ERROR_BACKSLASHREPLACE:
6844 /* subtract preallocated bytes */
6845 writer.min_size -= (collend - collstart);
6846 str = backslashreplace(&writer, str,
6847 unicode, collstart, collend);
6848 if (str == NULL)
6849 goto onError;
6850 pos = collend;
6851 break;
6852
6853 case _Py_ERROR_XMLCHARREFREPLACE:
6854 /* subtract preallocated bytes */
6855 writer.min_size -= (collend - collstart);
6856 str = xmlcharrefreplace(&writer, str,
6857 unicode, collstart, collend);
6858 if (str == NULL)
6859 goto onError;
6860 pos = collend;
6861 break;
6862
6863 case _Py_ERROR_SURROGATEESCAPE:
6864 for (i = collstart; i < collend; ++i) {
6865 ch = PyUnicode_READ(kind, data, i);
6866 if (ch < 0xdc80 || 0xdcff < ch) {
6867 /* Not a UTF-8b surrogate */
6868 break;
6869 }
6870 *str++ = (char)(ch - 0xdc00);
6871 ++pos;
6872 }
6873 if (i >= collend)
6874 break;
6875 collstart = pos;
6876 assert(collstart != collend);
6877 /* fall through */
6878
6879 default:
6880 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6881 encoding, reason, unicode, &exc,
6882 collstart, collend, &newpos);
6883 if (rep == NULL)
6884 goto onError;
6885
6886 /* subtract preallocated bytes */
6887 writer.min_size -= newpos - collstart;
6888
6889 if (PyBytes_Check(rep)) {
6890 /* Directly copy bytes result to output. */
6891 str = _PyBytesWriter_WriteBytes(&writer, str,
6892 PyBytes_AS_STRING(rep),
6893 PyBytes_GET_SIZE(rep));
6894 }
6895 else {
6896 assert(PyUnicode_Check(rep));
6897
6898 if (PyUnicode_READY(rep) < 0)
6899 goto onError;
6900
6901 if (limit == 256 ?
6902 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6903 !PyUnicode_IS_ASCII(rep))
6904 {
6905 /* Not all characters are smaller than limit */
6906 raise_encode_exception(&exc, encoding, unicode,
6907 collstart, collend, reason);
6908 goto onError;
6909 }
6910 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6911 str = _PyBytesWriter_WriteBytes(&writer, str,
6912 PyUnicode_DATA(rep),
6913 PyUnicode_GET_LENGTH(rep));
6914 }
6915 if (str == NULL)
6916 goto onError;
6917
6918 pos = newpos;
6919 Py_CLEAR(rep);
6920 }
6921
6922 /* If overallocation was disabled, ensure that it was the last
6923 write. Otherwise, we missed an optimization */
6924 assert(writer.overallocate || pos == size);
6925 }
6926 }
6927
6928 Py_XDECREF(error_handler_obj);
6929 Py_XDECREF(exc);
6930 return _PyBytesWriter_Finish(&writer, str);
6931
6932 onError:
6933 Py_XDECREF(rep);
6934 _PyBytesWriter_Dealloc(&writer);
6935 Py_XDECREF(error_handler_obj);
6936 Py_XDECREF(exc);
6937 return NULL;
6938 }
6939
6940 /* Deprecated */
6941 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6942 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6943 Py_ssize_t size,
6944 const char *errors)
6945 {
6946 PyObject *result;
6947 PyObject *unicode = PyUnicode_FromWideChar(p, size);
6948 if (unicode == NULL)
6949 return NULL;
6950 result = unicode_encode_ucs1(unicode, errors, 256);
6951 Py_DECREF(unicode);
6952 return result;
6953 }
6954
6955 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6956 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6957 {
6958 if (!PyUnicode_Check(unicode)) {
6959 PyErr_BadArgument();
6960 return NULL;
6961 }
6962 if (PyUnicode_READY(unicode) == -1)
6963 return NULL;
6964 /* Fast path: if it is a one-byte string, construct
6965 bytes object directly. */
6966 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6967 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6968 PyUnicode_GET_LENGTH(unicode));
6969 /* Non-Latin-1 characters present. Defer to above function to
6970 raise the exception. */
6971 return unicode_encode_ucs1(unicode, errors, 256);
6972 }
6973
6974 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6975 PyUnicode_AsLatin1String(PyObject *unicode)
6976 {
6977 return _PyUnicode_AsLatin1String(unicode, NULL);
6978 }
6979
6980 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6981
6982 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6983 PyUnicode_DecodeASCII(const char *s,
6984 Py_ssize_t size,
6985 const char *errors)
6986 {
6987 const char *starts = s;
6988 _PyUnicodeWriter writer;
6989 int kind;
6990 void *data;
6991 Py_ssize_t startinpos;
6992 Py_ssize_t endinpos;
6993 Py_ssize_t outpos;
6994 const char *e;
6995 PyObject *error_handler_obj = NULL;
6996 PyObject *exc = NULL;
6997 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6998
6999 if (size == 0)
7000 _Py_RETURN_UNICODE_EMPTY();
7001
7002 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7003 if (size == 1 && (unsigned char)s[0] < 128)
7004 return get_latin1_char((unsigned char)s[0]);
7005
7006 _PyUnicodeWriter_Init(&writer);
7007 writer.min_length = size;
7008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
7009 return NULL;
7010
7011 e = s + size;
7012 data = writer.data;
7013 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
7014 writer.pos = outpos;
7015 if (writer.pos == size)
7016 return _PyUnicodeWriter_Finish(&writer);
7017
7018 s += writer.pos;
7019 kind = writer.kind;
7020 while (s < e) {
7021 unsigned char c = (unsigned char)*s;
7022 if (c < 128) {
7023 PyUnicode_WRITE(kind, data, writer.pos, c);
7024 writer.pos++;
7025 ++s;
7026 continue;
7027 }
7028
7029 /* byte outsize range 0x00..0x7f: call the error handler */
7030
7031 if (error_handler == _Py_ERROR_UNKNOWN)
7032 error_handler = _Py_GetErrorHandler(errors);
7033
7034 switch (error_handler)
7035 {
7036 case _Py_ERROR_REPLACE:
7037 case _Py_ERROR_SURROGATEESCAPE:
7038 /* Fast-path: the error handler only writes one character,
7039 but we may switch to UCS2 at the first write */
7040 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7041 goto onError;
7042 kind = writer.kind;
7043 data = writer.data;
7044
7045 if (error_handler == _Py_ERROR_REPLACE)
7046 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7047 else
7048 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7049 writer.pos++;
7050 ++s;
7051 break;
7052
7053 case _Py_ERROR_IGNORE:
7054 ++s;
7055 break;
7056
7057 default:
7058 startinpos = s-starts;
7059 endinpos = startinpos + 1;
7060 if (unicode_decode_call_errorhandler_writer(
7061 errors, &error_handler_obj,
7062 "ascii", "ordinal not in range(128)",
7063 &starts, &e, &startinpos, &endinpos, &exc, &s,
7064 &writer))
7065 goto onError;
7066 kind = writer.kind;
7067 data = writer.data;
7068 }
7069 }
7070 Py_XDECREF(error_handler_obj);
7071 Py_XDECREF(exc);
7072 return _PyUnicodeWriter_Finish(&writer);
7073
7074 onError:
7075 _PyUnicodeWriter_Dealloc(&writer);
7076 Py_XDECREF(error_handler_obj);
7077 Py_XDECREF(exc);
7078 return NULL;
7079 }
7080
7081 /* Deprecated */
7082 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7083 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7084 Py_ssize_t size,
7085 const char *errors)
7086 {
7087 PyObject *result;
7088 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7089 if (unicode == NULL)
7090 return NULL;
7091 result = unicode_encode_ucs1(unicode, errors, 128);
7092 Py_DECREF(unicode);
7093 return result;
7094 }
7095
7096 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7097 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7098 {
7099 if (!PyUnicode_Check(unicode)) {
7100 PyErr_BadArgument();
7101 return NULL;
7102 }
7103 if (PyUnicode_READY(unicode) == -1)
7104 return NULL;
7105 /* Fast path: if it is an ASCII-only string, construct bytes object
7106 directly. Else defer to above function to raise the exception. */
7107 if (PyUnicode_IS_ASCII(unicode))
7108 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7109 PyUnicode_GET_LENGTH(unicode));
7110 return unicode_encode_ucs1(unicode, errors, 128);
7111 }
7112
7113 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7114 PyUnicode_AsASCIIString(PyObject *unicode)
7115 {
7116 return _PyUnicode_AsASCIIString(unicode, NULL);
7117 }
7118
7119 #ifdef MS_WINDOWS
7120
7121 /* --- MBCS codecs for Windows -------------------------------------------- */
7122
7123 #if SIZEOF_INT < SIZEOF_SIZE_T
7124 #define NEED_RETRY
7125 #endif
7126
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7131 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7132
7133 #ifndef WC_ERR_INVALID_CHARS
7134 # define WC_ERR_INVALID_CHARS 0x0080
7135 #endif
7136
7137 static const char*
code_page_name(UINT code_page,PyObject ** obj)7138 code_page_name(UINT code_page, PyObject **obj)
7139 {
7140 *obj = NULL;
7141 if (code_page == CP_ACP)
7142 return "mbcs";
7143 if (code_page == CP_UTF7)
7144 return "CP_UTF7";
7145 if (code_page == CP_UTF8)
7146 return "CP_UTF8";
7147
7148 *obj = PyBytes_FromFormat("cp%u", code_page);
7149 if (*obj == NULL)
7150 return NULL;
7151 return PyBytes_AS_STRING(*obj);
7152 }
7153
7154 static DWORD
decode_code_page_flags(UINT code_page)7155 decode_code_page_flags(UINT code_page)
7156 {
7157 if (code_page == CP_UTF7) {
7158 /* The CP_UTF7 decoder only supports flags=0 */
7159 return 0;
7160 }
7161 else
7162 return MB_ERR_INVALID_CHARS;
7163 }
7164
7165 /*
7166 * Decode a byte string from a Windows code page into unicode object in strict
7167 * mode.
7168 *
7169 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7170 * OSError and returns -1 on other error.
7171 */
7172 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7173 decode_code_page_strict(UINT code_page,
7174 wchar_t **buf,
7175 Py_ssize_t *bufsize,
7176 const char *in,
7177 int insize)
7178 {
7179 DWORD flags = MB_ERR_INVALID_CHARS;
7180 wchar_t *out;
7181 DWORD outsize;
7182
7183 /* First get the size of the result */
7184 assert(insize > 0);
7185 while ((outsize = MultiByteToWideChar(code_page, flags,
7186 in, insize, NULL, 0)) <= 0)
7187 {
7188 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7189 goto error;
7190 }
7191 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7192 flags = 0;
7193 }
7194
7195 /* Extend a wchar_t* buffer */
7196 Py_ssize_t n = *bufsize; /* Get the current length */
7197 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7198 return -1;
7199 }
7200 out = *buf + n;
7201
7202 /* Do the conversion */
7203 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7204 if (outsize <= 0)
7205 goto error;
7206 return insize;
7207
7208 error:
7209 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7210 return -2;
7211 PyErr_SetFromWindowsErr(0);
7212 return -1;
7213 }
7214
7215 /*
7216 * Decode a byte string from a code page into unicode object with an error
7217 * handler.
7218 *
7219 * Returns consumed size if succeed, or raise an OSError or
7220 * UnicodeDecodeError exception and returns -1 on error.
7221 */
7222 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7223 decode_code_page_errors(UINT code_page,
7224 wchar_t **buf,
7225 Py_ssize_t *bufsize,
7226 const char *in, const int size,
7227 const char *errors, int final)
7228 {
7229 const char *startin = in;
7230 const char *endin = in + size;
7231 DWORD flags = MB_ERR_INVALID_CHARS;
7232 /* Ideally, we should get reason from FormatMessage. This is the Windows
7233 2000 English version of the message. */
7234 const char *reason = "No mapping for the Unicode character exists "
7235 "in the target code page.";
7236 /* each step cannot decode more than 1 character, but a character can be
7237 represented as a surrogate pair */
7238 wchar_t buffer[2], *out;
7239 int insize;
7240 Py_ssize_t outsize;
7241 PyObject *errorHandler = NULL;
7242 PyObject *exc = NULL;
7243 PyObject *encoding_obj = NULL;
7244 const char *encoding;
7245 DWORD err;
7246 int ret = -1;
7247
7248 assert(size > 0);
7249
7250 encoding = code_page_name(code_page, &encoding_obj);
7251 if (encoding == NULL)
7252 return -1;
7253
7254 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7255 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7256 UnicodeDecodeError. */
7257 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7258 if (exc != NULL) {
7259 PyCodec_StrictErrors(exc);
7260 Py_CLEAR(exc);
7261 }
7262 goto error;
7263 }
7264
7265 /* Extend a wchar_t* buffer */
7266 Py_ssize_t n = *bufsize; /* Get the current length */
7267 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7268 PyErr_NoMemory();
7269 goto error;
7270 }
7271 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7272 goto error;
7273 }
7274 out = *buf + n;
7275
7276 /* Decode the byte string character per character */
7277 while (in < endin)
7278 {
7279 /* Decode a character */
7280 insize = 1;
7281 do
7282 {
7283 outsize = MultiByteToWideChar(code_page, flags,
7284 in, insize,
7285 buffer, Py_ARRAY_LENGTH(buffer));
7286 if (outsize > 0)
7287 break;
7288 err = GetLastError();
7289 if (err == ERROR_INVALID_FLAGS && flags) {
7290 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7291 flags = 0;
7292 continue;
7293 }
7294 if (err != ERROR_NO_UNICODE_TRANSLATION
7295 && err != ERROR_INSUFFICIENT_BUFFER)
7296 {
7297 PyErr_SetFromWindowsErr(0);
7298 goto error;
7299 }
7300 insize++;
7301 }
7302 /* 4=maximum length of a UTF-8 sequence */
7303 while (insize <= 4 && (in + insize) <= endin);
7304
7305 if (outsize <= 0) {
7306 Py_ssize_t startinpos, endinpos, outpos;
7307
7308 /* last character in partial decode? */
7309 if (in + insize >= endin && !final)
7310 break;
7311
7312 startinpos = in - startin;
7313 endinpos = startinpos + 1;
7314 outpos = out - *buf;
7315 if (unicode_decode_call_errorhandler_wchar(
7316 errors, &errorHandler,
7317 encoding, reason,
7318 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7319 buf, bufsize, &outpos))
7320 {
7321 goto error;
7322 }
7323 out = *buf + outpos;
7324 }
7325 else {
7326 in += insize;
7327 memcpy(out, buffer, outsize * sizeof(wchar_t));
7328 out += outsize;
7329 }
7330 }
7331
7332 /* Shrink the buffer */
7333 assert(out - *buf <= *bufsize);
7334 *bufsize = out - *buf;
7335 /* (in - startin) <= size and size is an int */
7336 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7337
7338 error:
7339 Py_XDECREF(encoding_obj);
7340 Py_XDECREF(errorHandler);
7341 Py_XDECREF(exc);
7342 return ret;
7343 }
7344
7345 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7346 decode_code_page_stateful(int code_page,
7347 const char *s, Py_ssize_t size,
7348 const char *errors, Py_ssize_t *consumed)
7349 {
7350 wchar_t *buf = NULL;
7351 Py_ssize_t bufsize = 0;
7352 int chunk_size, final, converted, done;
7353
7354 if (code_page < 0) {
7355 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7356 return NULL;
7357 }
7358 if (size < 0) {
7359 PyErr_BadInternalCall();
7360 return NULL;
7361 }
7362
7363 if (consumed)
7364 *consumed = 0;
7365
7366 do
7367 {
7368 #ifdef NEED_RETRY
7369 if (size > DECODING_CHUNK_SIZE) {
7370 chunk_size = DECODING_CHUNK_SIZE;
7371 final = 0;
7372 done = 0;
7373 }
7374 else
7375 #endif
7376 {
7377 chunk_size = (int)size;
7378 final = (consumed == NULL);
7379 done = 1;
7380 }
7381
7382 if (chunk_size == 0 && done) {
7383 if (buf != NULL)
7384 break;
7385 _Py_RETURN_UNICODE_EMPTY();
7386 }
7387
7388 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7389 s, chunk_size);
7390 if (converted == -2)
7391 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7392 s, chunk_size,
7393 errors, final);
7394 assert(converted != 0 || done);
7395
7396 if (converted < 0) {
7397 PyMem_Free(buf);
7398 return NULL;
7399 }
7400
7401 if (consumed)
7402 *consumed += converted;
7403
7404 s += converted;
7405 size -= converted;
7406 } while (!done);
7407
7408 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7409 PyMem_Free(buf);
7410 return v;
7411 }
7412
7413 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7414 PyUnicode_DecodeCodePageStateful(int code_page,
7415 const char *s,
7416 Py_ssize_t size,
7417 const char *errors,
7418 Py_ssize_t *consumed)
7419 {
7420 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7421 }
7422
7423 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7424 PyUnicode_DecodeMBCSStateful(const char *s,
7425 Py_ssize_t size,
7426 const char *errors,
7427 Py_ssize_t *consumed)
7428 {
7429 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7430 }
7431
7432 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7433 PyUnicode_DecodeMBCS(const char *s,
7434 Py_ssize_t size,
7435 const char *errors)
7436 {
7437 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7438 }
7439
7440 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7441 encode_code_page_flags(UINT code_page, const char *errors)
7442 {
7443 if (code_page == CP_UTF8) {
7444 return WC_ERR_INVALID_CHARS;
7445 }
7446 else if (code_page == CP_UTF7) {
7447 /* CP_UTF7 only supports flags=0 */
7448 return 0;
7449 }
7450 else {
7451 if (errors != NULL && strcmp(errors, "replace") == 0)
7452 return 0;
7453 else
7454 return WC_NO_BEST_FIT_CHARS;
7455 }
7456 }
7457
7458 /*
7459 * Encode a Unicode string to a Windows code page into a byte string in strict
7460 * mode.
7461 *
7462 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7463 * an OSError and returns -1 on other error.
7464 */
7465 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7466 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7467 PyObject *unicode, Py_ssize_t offset, int len,
7468 const char* errors)
7469 {
7470 BOOL usedDefaultChar = FALSE;
7471 BOOL *pusedDefaultChar = &usedDefaultChar;
7472 int outsize;
7473 wchar_t *p;
7474 Py_ssize_t size;
7475 const DWORD flags = encode_code_page_flags(code_page, NULL);
7476 char *out;
7477 /* Create a substring so that we can get the UTF-16 representation
7478 of just the slice under consideration. */
7479 PyObject *substring;
7480
7481 assert(len > 0);
7482
7483 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7484 pusedDefaultChar = &usedDefaultChar;
7485 else
7486 pusedDefaultChar = NULL;
7487
7488 substring = PyUnicode_Substring(unicode, offset, offset+len);
7489 if (substring == NULL)
7490 return -1;
7491 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7492 if (p == NULL) {
7493 Py_DECREF(substring);
7494 return -1;
7495 }
7496 assert(size <= INT_MAX);
7497
7498 /* First get the size of the result */
7499 outsize = WideCharToMultiByte(code_page, flags,
7500 p, (int)size,
7501 NULL, 0,
7502 NULL, pusedDefaultChar);
7503 if (outsize <= 0)
7504 goto error;
7505 /* If we used a default char, then we failed! */
7506 if (pusedDefaultChar && *pusedDefaultChar) {
7507 Py_DECREF(substring);
7508 return -2;
7509 }
7510
7511 if (*outbytes == NULL) {
7512 /* Create string object */
7513 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7514 if (*outbytes == NULL) {
7515 Py_DECREF(substring);
7516 return -1;
7517 }
7518 out = PyBytes_AS_STRING(*outbytes);
7519 }
7520 else {
7521 /* Extend string object */
7522 const Py_ssize_t n = PyBytes_Size(*outbytes);
7523 if (outsize > PY_SSIZE_T_MAX - n) {
7524 PyErr_NoMemory();
7525 Py_DECREF(substring);
7526 return -1;
7527 }
7528 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7529 Py_DECREF(substring);
7530 return -1;
7531 }
7532 out = PyBytes_AS_STRING(*outbytes) + n;
7533 }
7534
7535 /* Do the conversion */
7536 outsize = WideCharToMultiByte(code_page, flags,
7537 p, (int)size,
7538 out, outsize,
7539 NULL, pusedDefaultChar);
7540 Py_CLEAR(substring);
7541 if (outsize <= 0)
7542 goto error;
7543 if (pusedDefaultChar && *pusedDefaultChar)
7544 return -2;
7545 return 0;
7546
7547 error:
7548 Py_XDECREF(substring);
7549 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7550 return -2;
7551 PyErr_SetFromWindowsErr(0);
7552 return -1;
7553 }
7554
7555 /*
7556 * Encode a Unicode string to a Windows code page into a byte string using an
7557 * error handler.
7558 *
7559 * Returns consumed characters if succeed, or raise an OSError and returns
7560 * -1 on other error.
7561 */
7562 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7563 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7564 PyObject *unicode, Py_ssize_t unicode_offset,
7565 Py_ssize_t insize, const char* errors)
7566 {
7567 const DWORD flags = encode_code_page_flags(code_page, errors);
7568 Py_ssize_t pos = unicode_offset;
7569 Py_ssize_t endin = unicode_offset + insize;
7570 /* Ideally, we should get reason from FormatMessage. This is the Windows
7571 2000 English version of the message. */
7572 const char *reason = "invalid character";
7573 /* 4=maximum length of a UTF-8 sequence */
7574 char buffer[4];
7575 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7576 Py_ssize_t outsize;
7577 char *out;
7578 PyObject *errorHandler = NULL;
7579 PyObject *exc = NULL;
7580 PyObject *encoding_obj = NULL;
7581 const char *encoding;
7582 Py_ssize_t newpos, newoutsize;
7583 PyObject *rep;
7584 int ret = -1;
7585
7586 assert(insize > 0);
7587
7588 encoding = code_page_name(code_page, &encoding_obj);
7589 if (encoding == NULL)
7590 return -1;
7591
7592 if (errors == NULL || strcmp(errors, "strict") == 0) {
7593 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7594 then we raise a UnicodeEncodeError. */
7595 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7596 if (exc != NULL) {
7597 PyCodec_StrictErrors(exc);
7598 Py_DECREF(exc);
7599 }
7600 Py_XDECREF(encoding_obj);
7601 return -1;
7602 }
7603
7604 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7605 pusedDefaultChar = &usedDefaultChar;
7606 else
7607 pusedDefaultChar = NULL;
7608
7609 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7610 PyErr_NoMemory();
7611 goto error;
7612 }
7613 outsize = insize * Py_ARRAY_LENGTH(buffer);
7614
7615 if (*outbytes == NULL) {
7616 /* Create string object */
7617 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7618 if (*outbytes == NULL)
7619 goto error;
7620 out = PyBytes_AS_STRING(*outbytes);
7621 }
7622 else {
7623 /* Extend string object */
7624 Py_ssize_t n = PyBytes_Size(*outbytes);
7625 if (n > PY_SSIZE_T_MAX - outsize) {
7626 PyErr_NoMemory();
7627 goto error;
7628 }
7629 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7630 goto error;
7631 out = PyBytes_AS_STRING(*outbytes) + n;
7632 }
7633
7634 /* Encode the string character per character */
7635 while (pos < endin)
7636 {
7637 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7638 wchar_t chars[2];
7639 int charsize;
7640 if (ch < 0x10000) {
7641 chars[0] = (wchar_t)ch;
7642 charsize = 1;
7643 }
7644 else {
7645 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7646 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7647 charsize = 2;
7648 }
7649
7650 outsize = WideCharToMultiByte(code_page, flags,
7651 chars, charsize,
7652 buffer, Py_ARRAY_LENGTH(buffer),
7653 NULL, pusedDefaultChar);
7654 if (outsize > 0) {
7655 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7656 {
7657 pos++;
7658 memcpy(out, buffer, outsize);
7659 out += outsize;
7660 continue;
7661 }
7662 }
7663 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7664 PyErr_SetFromWindowsErr(0);
7665 goto error;
7666 }
7667
7668 rep = unicode_encode_call_errorhandler(
7669 errors, &errorHandler, encoding, reason,
7670 unicode, &exc,
7671 pos, pos + 1, &newpos);
7672 if (rep == NULL)
7673 goto error;
7674 pos = newpos;
7675
7676 if (PyBytes_Check(rep)) {
7677 outsize = PyBytes_GET_SIZE(rep);
7678 if (outsize != 1) {
7679 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7680 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7681 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7682 Py_DECREF(rep);
7683 goto error;
7684 }
7685 out = PyBytes_AS_STRING(*outbytes) + offset;
7686 }
7687 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7688 out += outsize;
7689 }
7690 else {
7691 Py_ssize_t i;
7692 enum PyUnicode_Kind kind;
7693 void *data;
7694
7695 if (PyUnicode_READY(rep) == -1) {
7696 Py_DECREF(rep);
7697 goto error;
7698 }
7699
7700 outsize = PyUnicode_GET_LENGTH(rep);
7701 if (outsize != 1) {
7702 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7703 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7704 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7705 Py_DECREF(rep);
7706 goto error;
7707 }
7708 out = PyBytes_AS_STRING(*outbytes) + offset;
7709 }
7710 kind = PyUnicode_KIND(rep);
7711 data = PyUnicode_DATA(rep);
7712 for (i=0; i < outsize; i++) {
7713 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7714 if (ch > 127) {
7715 raise_encode_exception(&exc,
7716 encoding, unicode,
7717 pos, pos + 1,
7718 "unable to encode error handler result to ASCII");
7719 Py_DECREF(rep);
7720 goto error;
7721 }
7722 *out = (unsigned char)ch;
7723 out++;
7724 }
7725 }
7726 Py_DECREF(rep);
7727 }
7728 /* write a NUL byte */
7729 *out = 0;
7730 outsize = out - PyBytes_AS_STRING(*outbytes);
7731 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7732 if (_PyBytes_Resize(outbytes, outsize) < 0)
7733 goto error;
7734 ret = 0;
7735
7736 error:
7737 Py_XDECREF(encoding_obj);
7738 Py_XDECREF(errorHandler);
7739 Py_XDECREF(exc);
7740 return ret;
7741 }
7742
7743 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7744 encode_code_page(int code_page,
7745 PyObject *unicode,
7746 const char *errors)
7747 {
7748 Py_ssize_t len;
7749 PyObject *outbytes = NULL;
7750 Py_ssize_t offset;
7751 int chunk_len, ret, done;
7752
7753 if (!PyUnicode_Check(unicode)) {
7754 PyErr_BadArgument();
7755 return NULL;
7756 }
7757
7758 if (PyUnicode_READY(unicode) == -1)
7759 return NULL;
7760 len = PyUnicode_GET_LENGTH(unicode);
7761
7762 if (code_page < 0) {
7763 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7764 return NULL;
7765 }
7766
7767 if (len == 0)
7768 return PyBytes_FromStringAndSize(NULL, 0);
7769
7770 offset = 0;
7771 do
7772 {
7773 #ifdef NEED_RETRY
7774 if (len > DECODING_CHUNK_SIZE) {
7775 chunk_len = DECODING_CHUNK_SIZE;
7776 done = 0;
7777 }
7778 else
7779 #endif
7780 {
7781 chunk_len = (int)len;
7782 done = 1;
7783 }
7784
7785 ret = encode_code_page_strict(code_page, &outbytes,
7786 unicode, offset, chunk_len,
7787 errors);
7788 if (ret == -2)
7789 ret = encode_code_page_errors(code_page, &outbytes,
7790 unicode, offset,
7791 chunk_len, errors);
7792 if (ret < 0) {
7793 Py_XDECREF(outbytes);
7794 return NULL;
7795 }
7796
7797 offset += chunk_len;
7798 len -= chunk_len;
7799 } while (!done);
7800
7801 return outbytes;
7802 }
7803
7804 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7805 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7806 Py_ssize_t size,
7807 const char *errors)
7808 {
7809 PyObject *unicode, *res;
7810 unicode = PyUnicode_FromWideChar(p, size);
7811 if (unicode == NULL)
7812 return NULL;
7813 res = encode_code_page(CP_ACP, unicode, errors);
7814 Py_DECREF(unicode);
7815 return res;
7816 }
7817
7818 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7819 PyUnicode_EncodeCodePage(int code_page,
7820 PyObject *unicode,
7821 const char *errors)
7822 {
7823 return encode_code_page(code_page, unicode, errors);
7824 }
7825
7826 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7827 PyUnicode_AsMBCSString(PyObject *unicode)
7828 {
7829 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7830 }
7831
7832 #undef NEED_RETRY
7833
7834 #endif /* MS_WINDOWS */
7835
7836 /* --- Character Mapping Codec -------------------------------------------- */
7837
/* Decode the bytes s[0:size] through a str lookup table.

   `mapping` is a str indexed by byte value; the character found at that
   index is appended to `writer`.  A byte whose index is not below the
   table length, or whose entry is U+FFFE, is an undefined mapping and is
   passed to the `errors` handler.

   Return 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* Every byte has a table entry here (maplen >= 256) and a one-byte
         * entry cannot hold 0xFFFE, so no undefined-mapping check is made
         * in this loop. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Ask the writer to accept characters up to 0xff, then
                 * re-read the fields cached above. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a two-byte table covering all byte values.
             * They store directly into the writer buffer and jump to the
             * generic code at Error for any character they cannot store.
             * The writer kind is re-read on every pass of the outer loop. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Too wide for the current buffer: let the generic
                     * code below deal with x. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* Input exhausted: leave the outer loop. */
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Only the undefined marker needs special handling. */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* Input exhausted: leave the outer loop. */
                break;
            }
        }
        /* Generic path: bounds-checked table lookup of one byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached from the tight loops above with x already set. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call, so it is not
             * advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7956
/* Decode the bytes s[0:size] through an arbitrary mapping object.

   Each byte value is looked up with mapping[int].  The item may be:
     - an int in range(MAX_UNICODE + 1): written as one character;
     - a str: a single character is written as such, any other length is
       written as a whole string;
     - None, the value 0xFFFE, or a missing key (LookupError): undefined
       mapping, passed to the `errors` handler.
   Any other item type raises TypeError.

   Return 0 on success, -1 on error. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            /* 0xFFFE is the "undefined" marker, as in string tables. */
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* The output length no longer matches the input length,
                   so turn on the writer's overallocation. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        /* Release the item and move on to the next input byte. */
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        /* item may be NULL here (missing key); Py_CLEAR copes with that.
           s is passed by address to the handler call, so it is not
           advanced here. */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    /* item is still owned here when the failure happened mid-iteration. */
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8058
8059 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8060 PyUnicode_DecodeCharmap(const char *s,
8061 Py_ssize_t size,
8062 PyObject *mapping,
8063 const char *errors)
8064 {
8065 _PyUnicodeWriter writer;
8066
8067 /* Default to Latin-1 */
8068 if (mapping == NULL)
8069 return PyUnicode_DecodeLatin1(s, size, errors);
8070
8071 if (size == 0)
8072 _Py_RETURN_UNICODE_EMPTY();
8073 _PyUnicodeWriter_Init(&writer);
8074 writer.min_length = size;
8075 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8076 goto onError;
8077
8078 if (PyUnicode_CheckExact(mapping)) {
8079 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8080 goto onError;
8081 }
8082 else {
8083 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8084 goto onError;
8085 }
8086 return _PyUnicodeWriter_Finish(&writer);
8087
8088 onError:
8089 _PyUnicodeWriter_Dealloc(&writer);
8090 return NULL;
8091 }
8092
8093 /* Charmap encoding: the lookup table */
8094
/* Charmap encoding trie as built by PyUnicode_BuildEncodingMap() and read
   by encoding_map_lookup().

   A code point c <= 0xFFFF is resolved in three steps: level1[c >> 11]
   selects a 16-entry level-2 block, entry (c >> 7) & 0xF of that block
   selects a 128-entry level-3 block, and entry c & 0x7F of that block holds
   the encoded byte.  0xFF marks an unused level-1 or level-2 entry; 0 marks
   an unmapped level-3 entry. */
struct encoding_map {
    PyObject_HEAD
    /* Level-2 block number for each value of c >> 11. */
    unsigned char level1[32];
    /* Number of level-2 blocks and of level-3 blocks stored in level23. */
    int count2, count3;
    /* 16*count2 level-2 bytes followed by 128*count3 level-3 bytes.
       Declared with one element; the object is allocated with the extra
       room it needs. */
    unsigned char level23[1];
};
8101
8102 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8103 encoding_map_size(PyObject *obj, PyObject* args)
8104 {
8105 struct encoding_map *map = (struct encoding_map*)obj;
8106 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8107 128*map->count3);
8108 }
8109
/* Method table of the EncodingMap type: a single no-argument size()
   method, followed by the zeroed sentinel entry that ends the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8115
/* Type object for struct encoding_map instances.  Apart from the name, the
   basic size, the default flags and the method table, every slot is left
   at zero.  Instances are allocated by hand in
   PyUnicode_BuildEncodingMap(), so tp_itemsize stays 0 even though the
   objects carry trailing data. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    0,                      /*tp_dealloc*/
    0,                      /*tp_vectorcall_offset*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_as_async*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8159
8160 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8161 PyUnicode_BuildEncodingMap(PyObject* string)
8162 {
8163 PyObject *result;
8164 struct encoding_map *mresult;
8165 int i;
8166 int need_dict = 0;
8167 unsigned char level1[32];
8168 unsigned char level2[512];
8169 unsigned char *mlevel1, *mlevel2, *mlevel3;
8170 int count2 = 0, count3 = 0;
8171 int kind;
8172 void *data;
8173 Py_ssize_t length;
8174 Py_UCS4 ch;
8175
8176 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8177 PyErr_BadArgument();
8178 return NULL;
8179 }
8180 kind = PyUnicode_KIND(string);
8181 data = PyUnicode_DATA(string);
8182 length = PyUnicode_GET_LENGTH(string);
8183 length = Py_MIN(length, 256);
8184 memset(level1, 0xFF, sizeof level1);
8185 memset(level2, 0xFF, sizeof level2);
8186
8187 /* If there isn't a one-to-one mapping of NULL to \0,
8188 or if there are non-BMP characters, we need to use
8189 a mapping dictionary. */
8190 if (PyUnicode_READ(kind, data, 0) != 0)
8191 need_dict = 1;
8192 for (i = 1; i < length; i++) {
8193 int l1, l2;
8194 ch = PyUnicode_READ(kind, data, i);
8195 if (ch == 0 || ch > 0xFFFF) {
8196 need_dict = 1;
8197 break;
8198 }
8199 if (ch == 0xFFFE)
8200 /* unmapped character */
8201 continue;
8202 l1 = ch >> 11;
8203 l2 = ch >> 7;
8204 if (level1[l1] == 0xFF)
8205 level1[l1] = count2++;
8206 if (level2[l2] == 0xFF)
8207 level2[l2] = count3++;
8208 }
8209
8210 if (count2 >= 0xFF || count3 >= 0xFF)
8211 need_dict = 1;
8212
8213 if (need_dict) {
8214 PyObject *result = PyDict_New();
8215 PyObject *key, *value;
8216 if (!result)
8217 return NULL;
8218 for (i = 0; i < length; i++) {
8219 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8220 value = PyLong_FromLong(i);
8221 if (!key || !value)
8222 goto failed1;
8223 if (PyDict_SetItem(result, key, value) == -1)
8224 goto failed1;
8225 Py_DECREF(key);
8226 Py_DECREF(value);
8227 }
8228 return result;
8229 failed1:
8230 Py_XDECREF(key);
8231 Py_XDECREF(value);
8232 Py_DECREF(result);
8233 return NULL;
8234 }
8235
8236 /* Create a three-level trie */
8237 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8238 16*count2 + 128*count3 - 1);
8239 if (!result)
8240 return PyErr_NoMemory();
8241 PyObject_Init(result, &EncodingMapType);
8242 mresult = (struct encoding_map*)result;
8243 mresult->count2 = count2;
8244 mresult->count3 = count3;
8245 mlevel1 = mresult->level1;
8246 mlevel2 = mresult->level23;
8247 mlevel3 = mresult->level23 + 16*count2;
8248 memcpy(mlevel1, level1, 32);
8249 memset(mlevel2, 0xFF, 16*count2);
8250 memset(mlevel3, 0, 128*count3);
8251 count3 = 0;
8252 for (i = 1; i < length; i++) {
8253 int o1, o2, o3, i2, i3;
8254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8255 if (ch == 0xFFFE)
8256 /* unmapped character */
8257 continue;
8258 o1 = ch>>11;
8259 o2 = (ch>>7) & 0xF;
8260 i2 = 16*mlevel1[o1] + o2;
8261 if (mlevel2[i2] == 0xFF)
8262 mlevel2[i2] = count3++;
8263 o3 = ch & 0x7F;
8264 i3 = 128*mlevel2[i2] + o3;
8265 mlevel3[i3] = i;
8266 }
8267 return result;
8268 }
8269
8270 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8271 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8272 {
8273 struct encoding_map *map = (struct encoding_map*)mapping;
8274 int l1 = c>>11;
8275 int l2 = (c>>7) & 0xF;
8276 int l3 = c & 0x7F;
8277 int i;
8278
8279 if (c > 0xFFFF)
8280 return -1;
8281 if (c == 0)
8282 return 0;
8283 /* level 1*/
8284 i = map->level1[l1];
8285 if (i == 0xFF) {
8286 return -1;
8287 }
8288 /* level 2*/
8289 i = map->level23[16*i+l2];
8290 if (i == 0xFF) {
8291 return -1;
8292 }
8293 /* level 3 */
8294 i = map->level23[16*map->count2 + 128*i + l3];
8295 if (i == 0) {
8296 return -1;
8297 }
8298 return i;
8299 }
8300
8301 /* Lookup the character ch in the mapping. If the character
8302 can't be found, Py_None is returned (or NULL, if another
8303 error occurred). */
8304 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8305 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8306 {
8307 PyObject *w = PyLong_FromLong((long)c);
8308 PyObject *x;
8309
8310 if (w == NULL)
8311 return NULL;
8312 x = PyObject_GetItem(mapping, w);
8313 Py_DECREF(w);
8314 if (x == NULL) {
8315 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8316 /* No mapping found means: mapping is undefined. */
8317 PyErr_Clear();
8318 Py_RETURN_NONE;
8319 } else
8320 return NULL;
8321 }
8322 else if (x == Py_None)
8323 return x;
8324 else if (PyLong_Check(x)) {
8325 long value = PyLong_AS_LONG(x);
8326 if (value < 0 || value > 255) {
8327 PyErr_SetString(PyExc_TypeError,
8328 "character mapping must be in range(256)");
8329 Py_DECREF(x);
8330 return NULL;
8331 }
8332 return x;
8333 }
8334 else if (PyBytes_Check(x))
8335 return x;
8336 else {
8337 /* wrong return value */
8338 PyErr_Format(PyExc_TypeError,
8339 "character mapping must return integer, bytes or None, not %.400s",
8340 x->ob_type->tp_name);
8341 Py_DECREF(x);
8342 return NULL;
8343 }
8344 }
8345
8346 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8347 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8348 {
8349 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8350 /* exponentially overallocate to minimize reallocations */
8351 if (requiredsize < 2*outsize)
8352 requiredsize = 2*outsize;
8353 if (_PyBytes_Resize(outobj, requiredsize))
8354 return -1;
8355 return 0;
8356 }
8357
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the encoded byte(s) were stored in the output;
     enc_FAILED    - the character has no mapping, nothing was stored;
     enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8361 /* lookup the character, put the result in the output string and adjust
8362 various state variables. Resize the output bytes object if not enough
8363 space is available. Return a new reference to the object that
8364 was put in the output buffer, or Py_None, if the mapping was undefined
8365 (in which case no character was written) or NULL, if a
8366 reallocation error occurred. The caller must decref the result */
8367 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8368 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8369 PyObject **outobj, Py_ssize_t *outpos)
8370 {
8371 PyObject *rep;
8372 char *outstart;
8373 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8374
8375 if (Py_TYPE(mapping) == &EncodingMapType) {
8376 int res = encoding_map_lookup(c, mapping);
8377 Py_ssize_t requiredsize = *outpos+1;
8378 if (res == -1)
8379 return enc_FAILED;
8380 if (outsize<requiredsize)
8381 if (charmapencode_resize(outobj, outpos, requiredsize))
8382 return enc_EXCEPTION;
8383 outstart = PyBytes_AS_STRING(*outobj);
8384 outstart[(*outpos)++] = (char)res;
8385 return enc_SUCCESS;
8386 }
8387
8388 rep = charmapencode_lookup(c, mapping);
8389 if (rep==NULL)
8390 return enc_EXCEPTION;
8391 else if (rep==Py_None) {
8392 Py_DECREF(rep);
8393 return enc_FAILED;
8394 } else {
8395 if (PyLong_Check(rep)) {
8396 Py_ssize_t requiredsize = *outpos+1;
8397 if (outsize<requiredsize)
8398 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8399 Py_DECREF(rep);
8400 return enc_EXCEPTION;
8401 }
8402 outstart = PyBytes_AS_STRING(*outobj);
8403 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8404 }
8405 else {
8406 const char *repchars = PyBytes_AS_STRING(rep);
8407 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8408 Py_ssize_t requiredsize = *outpos+repsize;
8409 if (outsize<requiredsize)
8410 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8411 Py_DECREF(rep);
8412 return enc_EXCEPTION;
8413 }
8414 outstart = PyBytes_AS_STRING(*outobj);
8415 memcpy(outstart + *outpos, repchars, repsize);
8416 *outpos += repsize;
8417 }
8418 }
8419 Py_DECREF(rep);
8420 return enc_SUCCESS;
8421 }
8422
/* Handle an unencodable character found by _PyUnicode_EncodeCharmap().

   On entry *inpos is the index of a character of `unicode` that has no
   entry in `mapping`.  The function first extends the range to cover all
   directly following characters that are unencodable too, then applies
   the error handler named by `errors` to the whole range.

   unicode            the string being encoded
   inpos              in: index of the failing character;
                      out (on success): index at which to resume encoding
   mapping            the EncodingMap or mapping object in use
   exceptionObject    exception object reused between calls
   error_handler      cached handler kind; resolved from `errors` while it
                      is still _Py_ERROR_UNKNOWN
   error_handler_obj  cached handler callback for the generic case
   res, respos        output bytes object and current write position

   Return 0 on success, -1 on error. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* Trie mapping: -1 means "no entry". */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* Generic mapping: Py_None means "no entry". */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* Emit one '?' per collected character; the '?' itself has to be
           encodable through the mapping. */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* Skip the whole collected range. */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: "&#N;" for each collected character, with
           every character of that text encoded through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* Any other handler: call the registered callback. */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is itself encoded through
           the mapping, character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* Resume where the handler asked us to. */
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8570
8571 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8572 _PyUnicode_EncodeCharmap(PyObject *unicode,
8573 PyObject *mapping,
8574 const char *errors)
8575 {
8576 /* output object */
8577 PyObject *res = NULL;
8578 /* current input position */
8579 Py_ssize_t inpos = 0;
8580 Py_ssize_t size;
8581 /* current output position */
8582 Py_ssize_t respos = 0;
8583 PyObject *error_handler_obj = NULL;
8584 PyObject *exc = NULL;
8585 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8586 void *data;
8587 int kind;
8588
8589 if (PyUnicode_READY(unicode) == -1)
8590 return NULL;
8591 size = PyUnicode_GET_LENGTH(unicode);
8592 data = PyUnicode_DATA(unicode);
8593 kind = PyUnicode_KIND(unicode);
8594
8595 /* Default to Latin-1 */
8596 if (mapping == NULL)
8597 return unicode_encode_ucs1(unicode, errors, 256);
8598
8599 /* allocate enough for a simple encoding without
8600 replacements, if we need more, we'll resize */
8601 res = PyBytes_FromStringAndSize(NULL, size);
8602 if (res == NULL)
8603 goto onError;
8604 if (size == 0)
8605 return res;
8606
8607 while (inpos<size) {
8608 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8609 /* try to encode it */
8610 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8611 if (x==enc_EXCEPTION) /* error */
8612 goto onError;
8613 if (x==enc_FAILED) { /* unencodable character */
8614 if (charmap_encoding_error(unicode, &inpos, mapping,
8615 &exc,
8616 &error_handler, &error_handler_obj, errors,
8617 &res, &respos)) {
8618 goto onError;
8619 }
8620 }
8621 else
8622 /* done with this character => adjust input position */
8623 ++inpos;
8624 }
8625
8626 /* Resize if we allocated to much */
8627 if (respos<PyBytes_GET_SIZE(res))
8628 if (_PyBytes_Resize(&res, respos) < 0)
8629 goto onError;
8630
8631 Py_XDECREF(exc);
8632 Py_XDECREF(error_handler_obj);
8633 return res;
8634
8635 onError:
8636 Py_XDECREF(res);
8637 Py_XDECREF(exc);
8638 Py_XDECREF(error_handler_obj);
8639 return NULL;
8640 }
8641
8642 /* Deprecated */
8643 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8644 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8645 Py_ssize_t size,
8646 PyObject *mapping,
8647 const char *errors)
8648 {
8649 PyObject *result;
8650 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8651 if (unicode == NULL)
8652 return NULL;
8653 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8654 Py_DECREF(unicode);
8655 return result;
8656 }
8657
8658 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8659 PyUnicode_AsCharmapString(PyObject *unicode,
8660 PyObject *mapping)
8661 {
8662 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8663 PyErr_BadArgument();
8664 return NULL;
8665 }
8666 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8667 }
8668
8669 /* create or adjust a UnicodeTranslateError */
8670 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8671 make_translate_exception(PyObject **exceptionObject,
8672 PyObject *unicode,
8673 Py_ssize_t startpos, Py_ssize_t endpos,
8674 const char *reason)
8675 {
8676 if (*exceptionObject == NULL) {
8677 *exceptionObject = _PyUnicodeTranslateError_Create(
8678 unicode, startpos, endpos, reason);
8679 }
8680 else {
8681 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8682 goto onError;
8683 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8684 goto onError;
8685 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8686 goto onError;
8687 return;
8688 onError:
8689 Py_CLEAR(*exceptionObject);
8690 }
8691 }
8692
8693 /* error handling callback helper:
8694 build arguments, call the callback and check the arguments,
8695 put the result into newpos and return the replacement string, which
8696 has to be freed by the caller */
8697 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8698 unicode_translate_call_errorhandler(const char *errors,
8699 PyObject **errorHandler,
8700 const char *reason,
8701 PyObject *unicode, PyObject **exceptionObject,
8702 Py_ssize_t startpos, Py_ssize_t endpos,
8703 Py_ssize_t *newpos)
8704 {
8705 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8706
8707 Py_ssize_t i_newpos;
8708 PyObject *restuple;
8709 PyObject *resunicode;
8710
8711 if (*errorHandler == NULL) {
8712 *errorHandler = PyCodec_LookupError(errors);
8713 if (*errorHandler == NULL)
8714 return NULL;
8715 }
8716
8717 make_translate_exception(exceptionObject,
8718 unicode, startpos, endpos, reason);
8719 if (*exceptionObject == NULL)
8720 return NULL;
8721
8722 restuple = PyObject_CallFunctionObjArgs(
8723 *errorHandler, *exceptionObject, NULL);
8724 if (restuple == NULL)
8725 return NULL;
8726 if (!PyTuple_Check(restuple)) {
8727 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8728 Py_DECREF(restuple);
8729 return NULL;
8730 }
8731 if (!PyArg_ParseTuple(restuple, argparse,
8732 &resunicode, &i_newpos)) {
8733 Py_DECREF(restuple);
8734 return NULL;
8735 }
8736 if (i_newpos<0)
8737 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8738 else
8739 *newpos = i_newpos;
8740 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8741 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8742 Py_DECREF(restuple);
8743 return NULL;
8744 }
8745 Py_INCREF(resunicode);
8746 Py_DECREF(restuple);
8747 return resunicode;
8748 }
8749
8750 /* Lookup the character ch in the mapping and put the result in result,
8751 which must be decrefed by the caller.
8752 Return 0 on success, -1 on error */
8753 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8754 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8755 {
8756 PyObject *w = PyLong_FromLong((long)c);
8757 PyObject *x;
8758
8759 if (w == NULL)
8760 return -1;
8761 x = PyObject_GetItem(mapping, w);
8762 Py_DECREF(w);
8763 if (x == NULL) {
8764 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8765 /* No mapping found means: use 1:1 mapping. */
8766 PyErr_Clear();
8767 *result = NULL;
8768 return 0;
8769 } else
8770 return -1;
8771 }
8772 else if (x == Py_None) {
8773 *result = x;
8774 return 0;
8775 }
8776 else if (PyLong_Check(x)) {
8777 long value = PyLong_AS_LONG(x);
8778 if (value < 0 || value > MAX_UNICODE) {
8779 PyErr_Format(PyExc_ValueError,
8780 "character mapping must be in range(0x%x)",
8781 MAX_UNICODE+1);
8782 Py_DECREF(x);
8783 return -1;
8784 }
8785 *result = x;
8786 return 0;
8787 }
8788 else if (PyUnicode_Check(x)) {
8789 *result = x;
8790 return 0;
8791 }
8792 else {
8793 /* wrong return value */
8794 PyErr_SetString(PyExc_TypeError,
8795 "character mapping must return integer, None or str");
8796 Py_DECREF(x);
8797 return -1;
8798 }
8799 }
8800
8801 /* lookup the character, write the result into the writer.
8802 Return 1 if the result was written into the writer, return 0 if the mapping
8803 was undefined, raise an exception return -1 on error. */
8804 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8805 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8806 _PyUnicodeWriter *writer)
8807 {
8808 PyObject *item;
8809
8810 if (charmaptranslate_lookup(ch, mapping, &item))
8811 return -1;
8812
8813 if (item == NULL) {
8814 /* not found => default to 1:1 mapping */
8815 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8816 return -1;
8817 }
8818 return 1;
8819 }
8820
8821 if (item == Py_None) {
8822 Py_DECREF(item);
8823 return 0;
8824 }
8825
8826 if (PyLong_Check(item)) {
8827 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8828 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8829 used it */
8830 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8831 Py_DECREF(item);
8832 return -1;
8833 }
8834 Py_DECREF(item);
8835 return 1;
8836 }
8837
8838 if (!PyUnicode_Check(item)) {
8839 Py_DECREF(item);
8840 return -1;
8841 }
8842
8843 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847
8848 Py_DECREF(item);
8849 return 1;
8850 }
8851
8852 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8853 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8854 Py_UCS1 *translate)
8855 {
8856 PyObject *item = NULL;
8857 int ret = 0;
8858
8859 if (charmaptranslate_lookup(ch, mapping, &item)) {
8860 return -1;
8861 }
8862
8863 if (item == Py_None) {
8864 /* deletion */
8865 translate[ch] = 0xfe;
8866 }
8867 else if (item == NULL) {
8868 /* not found => default to 1:1 mapping */
8869 translate[ch] = ch;
8870 return 1;
8871 }
8872 else if (PyLong_Check(item)) {
8873 long replace = PyLong_AS_LONG(item);
8874 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8875 used it */
8876 if (127 < replace) {
8877 /* invalid character or character outside ASCII:
8878 skip the fast translate */
8879 goto exit;
8880 }
8881 translate[ch] = (Py_UCS1)replace;
8882 }
8883 else if (PyUnicode_Check(item)) {
8884 Py_UCS4 replace;
8885
8886 if (PyUnicode_READY(item) == -1) {
8887 Py_DECREF(item);
8888 return -1;
8889 }
8890 if (PyUnicode_GET_LENGTH(item) != 1)
8891 goto exit;
8892
8893 replace = PyUnicode_READ_CHAR(item, 0);
8894 if (replace > 127)
8895 goto exit;
8896 translate[ch] = (Py_UCS1)replace;
8897 }
8898 else {
8899 /* not None, NULL, long or unicode */
8900 goto exit;
8901 }
8902 ret = 1;
8903
8904 exit:
8905 Py_DECREF(item);
8906 return ret;
8907 }
8908
8909 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8910 was translated into writer, return 0 if the input string was partially
8911 translated into writer, raise an exception and return -1 on error. */
8912 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8913 unicode_fast_translate(PyObject *input, PyObject *mapping,
8914 _PyUnicodeWriter *writer, int ignore,
8915 Py_ssize_t *input_pos)
8916 {
8917 Py_UCS1 ascii_table[128], ch, ch2;
8918 Py_ssize_t len;
8919 Py_UCS1 *in, *end, *out;
8920 int res = 0;
8921
8922 len = PyUnicode_GET_LENGTH(input);
8923
8924 memset(ascii_table, 0xff, 128);
8925
8926 in = PyUnicode_1BYTE_DATA(input);
8927 end = in + len;
8928
8929 assert(PyUnicode_IS_ASCII(writer->buffer));
8930 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8931 out = PyUnicode_1BYTE_DATA(writer->buffer);
8932
8933 for (; in < end; in++) {
8934 ch = *in;
8935 ch2 = ascii_table[ch];
8936 if (ch2 == 0xff) {
8937 int translate = unicode_fast_translate_lookup(mapping, ch,
8938 ascii_table);
8939 if (translate < 0)
8940 return -1;
8941 if (translate == 0)
8942 goto exit;
8943 ch2 = ascii_table[ch];
8944 }
8945 if (ch2 == 0xfe) {
8946 if (ignore)
8947 continue;
8948 goto exit;
8949 }
8950 assert(ch2 < 128);
8951 *out = ch2;
8952 out++;
8953 }
8954 res = 1;
8955
8956 exit:
8957 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8958 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8959 return res;
8960 }
8961
8962 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8963 _PyUnicode_TranslateCharmap(PyObject *input,
8964 PyObject *mapping,
8965 const char *errors)
8966 {
8967 /* input object */
8968 char *data;
8969 Py_ssize_t size, i;
8970 int kind;
8971 /* output buffer */
8972 _PyUnicodeWriter writer;
8973 /* error handler */
8974 const char *reason = "character maps to <undefined>";
8975 PyObject *errorHandler = NULL;
8976 PyObject *exc = NULL;
8977 int ignore;
8978 int res;
8979
8980 if (mapping == NULL) {
8981 PyErr_BadArgument();
8982 return NULL;
8983 }
8984
8985 if (PyUnicode_READY(input) == -1)
8986 return NULL;
8987 data = (char*)PyUnicode_DATA(input);
8988 kind = PyUnicode_KIND(input);
8989 size = PyUnicode_GET_LENGTH(input);
8990
8991 if (size == 0)
8992 return PyUnicode_FromObject(input);
8993
8994 /* allocate enough for a simple 1:1 translation without
8995 replacements, if we need more, we'll resize */
8996 _PyUnicodeWriter_Init(&writer);
8997 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8998 goto onError;
8999
9000 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9001
9002 if (PyUnicode_READY(input) == -1)
9003 return NULL;
9004 if (PyUnicode_IS_ASCII(input)) {
9005 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9006 if (res < 0) {
9007 _PyUnicodeWriter_Dealloc(&writer);
9008 return NULL;
9009 }
9010 if (res == 1)
9011 return _PyUnicodeWriter_Finish(&writer);
9012 }
9013 else {
9014 i = 0;
9015 }
9016
9017 while (i<size) {
9018 /* try to encode it */
9019 int translate;
9020 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9021 Py_ssize_t newpos;
9022 /* startpos for collecting untranslatable chars */
9023 Py_ssize_t collstart;
9024 Py_ssize_t collend;
9025 Py_UCS4 ch;
9026
9027 ch = PyUnicode_READ(kind, data, i);
9028 translate = charmaptranslate_output(ch, mapping, &writer);
9029 if (translate < 0)
9030 goto onError;
9031
9032 if (translate != 0) {
9033 /* it worked => adjust input pointer */
9034 ++i;
9035 continue;
9036 }
9037
9038 /* untranslatable character */
9039 collstart = i;
9040 collend = i+1;
9041
9042 /* find all untranslatable characters */
9043 while (collend < size) {
9044 PyObject *x;
9045 ch = PyUnicode_READ(kind, data, collend);
9046 if (charmaptranslate_lookup(ch, mapping, &x))
9047 goto onError;
9048 Py_XDECREF(x);
9049 if (x != Py_None)
9050 break;
9051 ++collend;
9052 }
9053
9054 if (ignore) {
9055 i = collend;
9056 }
9057 else {
9058 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9059 reason, input, &exc,
9060 collstart, collend, &newpos);
9061 if (repunicode == NULL)
9062 goto onError;
9063 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9064 Py_DECREF(repunicode);
9065 goto onError;
9066 }
9067 Py_DECREF(repunicode);
9068 i = newpos;
9069 }
9070 }
9071 Py_XDECREF(exc);
9072 Py_XDECREF(errorHandler);
9073 return _PyUnicodeWriter_Finish(&writer);
9074
9075 onError:
9076 _PyUnicodeWriter_Dealloc(&writer);
9077 Py_XDECREF(exc);
9078 Py_XDECREF(errorHandler);
9079 return NULL;
9080 }
9081
9082 /* Deprecated. Use PyUnicode_Translate instead. */
9083 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9084 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9085 Py_ssize_t size,
9086 PyObject *mapping,
9087 const char *errors)
9088 {
9089 PyObject *result;
9090 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9091 if (!unicode)
9092 return NULL;
9093 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9094 Py_DECREF(unicode);
9095 return result;
9096 }
9097
9098 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9099 PyUnicode_Translate(PyObject *str,
9100 PyObject *mapping,
9101 const char *errors)
9102 {
9103 if (ensure_unicode(str) < 0)
9104 return NULL;
9105 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9106 }
9107
9108 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9109 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9110 {
9111 if (!PyUnicode_Check(unicode)) {
9112 PyErr_BadInternalCall();
9113 return NULL;
9114 }
9115 if (PyUnicode_READY(unicode) == -1)
9116 return NULL;
9117 if (PyUnicode_IS_ASCII(unicode)) {
9118 /* If the string is already ASCII, just return the same string */
9119 Py_INCREF(unicode);
9120 return unicode;
9121 }
9122
9123 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9124 PyObject *result = PyUnicode_New(len, 127);
9125 if (result == NULL) {
9126 return NULL;
9127 }
9128
9129 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9130 int kind = PyUnicode_KIND(unicode);
9131 const void *data = PyUnicode_DATA(unicode);
9132 Py_ssize_t i;
9133 for (i = 0; i < len; ++i) {
9134 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9135 if (ch < 127) {
9136 out[i] = ch;
9137 }
9138 else if (Py_UNICODE_ISSPACE(ch)) {
9139 out[i] = ' ';
9140 }
9141 else {
9142 int decimal = Py_UNICODE_TODECIMAL(ch);
9143 if (decimal < 0) {
9144 out[i] = '?';
9145 out[i+1] = '\0';
9146 _PyUnicode_LENGTH(result) = i + 1;
9147 break;
9148 }
9149 out[i] = '0' + decimal;
9150 }
9151 }
9152
9153 assert(_PyUnicode_CheckConsistency(result, 1));
9154 return result;
9155 }
9156
9157 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9158 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9159 Py_ssize_t length)
9160 {
9161 PyObject *decimal;
9162 Py_ssize_t i;
9163 Py_UCS4 maxchar;
9164 enum PyUnicode_Kind kind;
9165 void *data;
9166
9167 maxchar = 127;
9168 for (i = 0; i < length; i++) {
9169 Py_UCS4 ch = s[i];
9170 if (ch > 127) {
9171 int decimal = Py_UNICODE_TODECIMAL(ch);
9172 if (decimal >= 0)
9173 ch = '0' + decimal;
9174 maxchar = Py_MAX(maxchar, ch);
9175 }
9176 }
9177
9178 /* Copy to a new string */
9179 decimal = PyUnicode_New(length, maxchar);
9180 if (decimal == NULL)
9181 return decimal;
9182 kind = PyUnicode_KIND(decimal);
9183 data = PyUnicode_DATA(decimal);
9184 /* Iterate over code points */
9185 for (i = 0; i < length; i++) {
9186 Py_UCS4 ch = s[i];
9187 if (ch > 127) {
9188 int decimal = Py_UNICODE_TODECIMAL(ch);
9189 if (decimal >= 0)
9190 ch = '0' + decimal;
9191 }
9192 PyUnicode_WRITE(kind, data, i, ch);
9193 }
9194 return unicode_result(decimal);
9195 }
9196 /* --- Decimal Encoder ---------------------------------------------------- */
9197
9198 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9199 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9200 Py_ssize_t length,
9201 char *output,
9202 const char *errors)
9203 {
9204 PyObject *unicode;
9205 Py_ssize_t i;
9206 enum PyUnicode_Kind kind;
9207 void *data;
9208
9209 if (output == NULL) {
9210 PyErr_BadArgument();
9211 return -1;
9212 }
9213
9214 unicode = PyUnicode_FromWideChar(s, length);
9215 if (unicode == NULL)
9216 return -1;
9217
9218 kind = PyUnicode_KIND(unicode);
9219 data = PyUnicode_DATA(unicode);
9220
9221 for (i=0; i < length; ) {
9222 PyObject *exc;
9223 Py_UCS4 ch;
9224 int decimal;
9225 Py_ssize_t startpos;
9226
9227 ch = PyUnicode_READ(kind, data, i);
9228
9229 if (Py_UNICODE_ISSPACE(ch)) {
9230 *output++ = ' ';
9231 i++;
9232 continue;
9233 }
9234 decimal = Py_UNICODE_TODECIMAL(ch);
9235 if (decimal >= 0) {
9236 *output++ = '0' + decimal;
9237 i++;
9238 continue;
9239 }
9240 if (0 < ch && ch < 256) {
9241 *output++ = (char)ch;
9242 i++;
9243 continue;
9244 }
9245
9246 startpos = i;
9247 exc = NULL;
9248 raise_encode_exception(&exc, "decimal", unicode,
9249 startpos, startpos+1,
9250 "invalid decimal Unicode string");
9251 Py_XDECREF(exc);
9252 Py_DECREF(unicode);
9253 return -1;
9254 }
9255 /* 0-terminate the output string */
9256 *output++ = '\0';
9257 Py_DECREF(unicode);
9258 return 0;
9259 }
9260
9261 /* --- Helpers ------------------------------------------------------------ */
9262
/* Helper macro to fix up start/end slice values in place.

   end is capped at len; a negative start or end has len added to it and is
   then floored at 0.  start is not capped at len.

   start and end are assigned to, so they must be modifiable lvalues.  The
   macro expands to bare if statements and mentions each argument several
   times, so arguments should be free of side effects. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9277
9278 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9279 any_find_slice(PyObject* s1, PyObject* s2,
9280 Py_ssize_t start,
9281 Py_ssize_t end,
9282 int direction)
9283 {
9284 int kind1, kind2;
9285 void *buf1, *buf2;
9286 Py_ssize_t len1, len2, result;
9287
9288 kind1 = PyUnicode_KIND(s1);
9289 kind2 = PyUnicode_KIND(s2);
9290 if (kind1 < kind2)
9291 return -1;
9292
9293 len1 = PyUnicode_GET_LENGTH(s1);
9294 len2 = PyUnicode_GET_LENGTH(s2);
9295 ADJUST_INDICES(start, end, len1);
9296 if (end - start < len2)
9297 return -1;
9298
9299 buf1 = PyUnicode_DATA(s1);
9300 buf2 = PyUnicode_DATA(s2);
9301 if (len2 == 1) {
9302 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9303 result = findchar((const char *)buf1 + kind1*start,
9304 kind1, end - start, ch, direction);
9305 if (result == -1)
9306 return -1;
9307 else
9308 return start + result;
9309 }
9310
9311 if (kind2 != kind1) {
9312 buf2 = _PyUnicode_AsKind(s2, kind1);
9313 if (!buf2)
9314 return -2;
9315 }
9316
9317 if (direction > 0) {
9318 switch (kind1) {
9319 case PyUnicode_1BYTE_KIND:
9320 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9321 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9322 else
9323 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9324 break;
9325 case PyUnicode_2BYTE_KIND:
9326 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9327 break;
9328 case PyUnicode_4BYTE_KIND:
9329 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9330 break;
9331 default:
9332 Py_UNREACHABLE();
9333 }
9334 }
9335 else {
9336 switch (kind1) {
9337 case PyUnicode_1BYTE_KIND:
9338 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9339 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340 else
9341 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342 break;
9343 case PyUnicode_2BYTE_KIND:
9344 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345 break;
9346 case PyUnicode_4BYTE_KIND:
9347 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9348 break;
9349 default:
9350 Py_UNREACHABLE();
9351 }
9352 }
9353
9354 if (kind2 != kind1)
9355 PyMem_Free(buf2);
9356
9357 return result;
9358 }
9359
9360 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9361 #include "stringlib/localeutil.h"
9362
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to select counting mode.
 * @n_buffer: Offset used to compute the starting output position: in
 *            filling mode the position is writer->pos + @n_buffer, in
 *            counting mode it is @n_buffer itself.
 * @digits: Digits we're reading from.  Must be non-NULL in filling mode
 *          and NULL in counting mode.
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL in filling mode.  In counting mode it must be
 *           non-NULL; it is set to 127 and then handed to
 *           InsertThousandsGrouping_fill() (presumably so the helper can
 *           raise it for the separator's characters -- confirm in
 *           stringlib/localeutil.h).
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the END of their ranges; they are passed by
       address to InsertThousandsGrouping_fill() for every group. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Never take more than is still needed, but always at least 1. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Count only, don't do anything. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* Another group follows, and its separator counts towards the
           minimum width as well. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement. */

        /* The generator ran out of group widths: everything that is left
           goes into one final group. */
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9504
9505
9506 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9507 PyUnicode_Count(PyObject *str,
9508 PyObject *substr,
9509 Py_ssize_t start,
9510 Py_ssize_t end)
9511 {
9512 Py_ssize_t result;
9513 int kind1, kind2;
9514 void *buf1 = NULL, *buf2 = NULL;
9515 Py_ssize_t len1, len2;
9516
9517 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9518 return -1;
9519
9520 kind1 = PyUnicode_KIND(str);
9521 kind2 = PyUnicode_KIND(substr);
9522 if (kind1 < kind2)
9523 return 0;
9524
9525 len1 = PyUnicode_GET_LENGTH(str);
9526 len2 = PyUnicode_GET_LENGTH(substr);
9527 ADJUST_INDICES(start, end, len1);
9528 if (end - start < len2)
9529 return 0;
9530
9531 buf1 = PyUnicode_DATA(str);
9532 buf2 = PyUnicode_DATA(substr);
9533 if (kind2 != kind1) {
9534 buf2 = _PyUnicode_AsKind(substr, kind1);
9535 if (!buf2)
9536 goto onError;
9537 }
9538
9539 switch (kind1) {
9540 case PyUnicode_1BYTE_KIND:
9541 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9542 result = asciilib_count(
9543 ((Py_UCS1*)buf1) + start, end - start,
9544 buf2, len2, PY_SSIZE_T_MAX
9545 );
9546 else
9547 result = ucs1lib_count(
9548 ((Py_UCS1*)buf1) + start, end - start,
9549 buf2, len2, PY_SSIZE_T_MAX
9550 );
9551 break;
9552 case PyUnicode_2BYTE_KIND:
9553 result = ucs2lib_count(
9554 ((Py_UCS2*)buf1) + start, end - start,
9555 buf2, len2, PY_SSIZE_T_MAX
9556 );
9557 break;
9558 case PyUnicode_4BYTE_KIND:
9559 result = ucs4lib_count(
9560 ((Py_UCS4*)buf1) + start, end - start,
9561 buf2, len2, PY_SSIZE_T_MAX
9562 );
9563 break;
9564 default:
9565 Py_UNREACHABLE();
9566 }
9567
9568 if (kind2 != kind1)
9569 PyMem_Free(buf2);
9570
9571 return result;
9572 onError:
9573 if (kind2 != kind1 && buf2)
9574 PyMem_Free(buf2);
9575 return -1;
9576 }
9577
9578 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9579 PyUnicode_Find(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
9584 {
9585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9586 return -2;
9587
9588 return any_find_slice(str, substr, start, end, direction);
9589 }
9590
9591 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9592 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9593 Py_ssize_t start, Py_ssize_t end,
9594 int direction)
9595 {
9596 int kind;
9597 Py_ssize_t len, result;
9598 if (PyUnicode_READY(str) == -1)
9599 return -2;
9600 len = PyUnicode_GET_LENGTH(str);
9601 ADJUST_INDICES(start, end, len);
9602 if (end - start < 1)
9603 return -1;
9604 kind = PyUnicode_KIND(str);
9605 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9606 kind, end-start, ch, direction);
9607 if (result == -1)
9608 return -1;
9609 else
9610 return start + result;
9611 }
9612
9613 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9614 tailmatch(PyObject *self,
9615 PyObject *substring,
9616 Py_ssize_t start,
9617 Py_ssize_t end,
9618 int direction)
9619 {
9620 int kind_self;
9621 int kind_sub;
9622 void *data_self;
9623 void *data_sub;
9624 Py_ssize_t offset;
9625 Py_ssize_t i;
9626 Py_ssize_t end_sub;
9627
9628 if (PyUnicode_READY(self) == -1 ||
9629 PyUnicode_READY(substring) == -1)
9630 return -1;
9631
9632 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9633 end -= PyUnicode_GET_LENGTH(substring);
9634 if (end < start)
9635 return 0;
9636
9637 if (PyUnicode_GET_LENGTH(substring) == 0)
9638 return 1;
9639
9640 kind_self = PyUnicode_KIND(self);
9641 data_self = PyUnicode_DATA(self);
9642 kind_sub = PyUnicode_KIND(substring);
9643 data_sub = PyUnicode_DATA(substring);
9644 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9645
9646 if (direction > 0)
9647 offset = end;
9648 else
9649 offset = start;
9650
9651 if (PyUnicode_READ(kind_self, data_self, offset) ==
9652 PyUnicode_READ(kind_sub, data_sub, 0) &&
9653 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9654 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9655 /* If both are of the same kind, memcmp is sufficient */
9656 if (kind_self == kind_sub) {
9657 return ! memcmp((char *)data_self +
9658 (offset * PyUnicode_KIND(substring)),
9659 data_sub,
9660 PyUnicode_GET_LENGTH(substring) *
9661 PyUnicode_KIND(substring));
9662 }
9663 /* otherwise we have to compare each character by first accessing it */
9664 else {
9665 /* We do not need to compare 0 and len(substring)-1 because
9666 the if statement above ensured already that they are equal
9667 when we end up here. */
9668 for (i = 1; i < end_sub; ++i) {
9669 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9670 PyUnicode_READ(kind_sub, data_sub, i))
9671 return 0;
9672 }
9673 return 1;
9674 }
9675 }
9676
9677 return 0;
9678 }
9679
9680 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9681 PyUnicode_Tailmatch(PyObject *str,
9682 PyObject *substr,
9683 Py_ssize_t start,
9684 Py_ssize_t end,
9685 int direction)
9686 {
9687 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9688 return -1;
9689
9690 return tailmatch(str, substr, start, end, direction);
9691 }
9692
9693 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9694 ascii_upper_or_lower(PyObject *self, int lower)
9695 {
9696 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9697 char *resdata, *data = PyUnicode_DATA(self);
9698 PyObject *res;
9699
9700 res = PyUnicode_New(len, 127);
9701 if (res == NULL)
9702 return NULL;
9703 resdata = PyUnicode_DATA(res);
9704 if (lower)
9705 _Py_bytes_lower(resdata, data, len);
9706 else
9707 _Py_bytes_upper(resdata, data, len);
9708 return res;
9709 }
9710
9711 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9712 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9713 {
9714 Py_ssize_t j;
9715 int final_sigma;
9716 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9717 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9718
9719 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9720
9721 where ! is a negation and \p{xxx} is a character with property xxx.
9722 */
9723 for (j = i - 1; j >= 0; j--) {
9724 c = PyUnicode_READ(kind, data, j);
9725 if (!_PyUnicode_IsCaseIgnorable(c))
9726 break;
9727 }
9728 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9729 if (final_sigma) {
9730 for (j = i + 1; j < length; j++) {
9731 c = PyUnicode_READ(kind, data, j);
9732 if (!_PyUnicode_IsCaseIgnorable(c))
9733 break;
9734 }
9735 final_sigma = j == length || !_PyUnicode_IsCased(c);
9736 }
9737 return (final_sigma) ? 0x3C2 : 0x3C3;
9738 }
9739
9740 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9741 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9742 Py_UCS4 c, Py_UCS4 *mapped)
9743 {
9744 /* Obscure special case. */
9745 if (c == 0x3A3) {
9746 mapped[0] = handle_capital_sigma(kind, data, length, i);
9747 return 1;
9748 }
9749 return _PyUnicode_ToLowerFull(c, mapped);
9750 }
9751
9752 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9753 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9754 {
9755 Py_ssize_t i, k = 0;
9756 int n_res, j;
9757 Py_UCS4 c, mapped[3];
9758
9759 c = PyUnicode_READ(kind, data, 0);
9760 n_res = _PyUnicode_ToTitleFull(c, mapped);
9761 for (j = 0; j < n_res; j++) {
9762 *maxchar = Py_MAX(*maxchar, mapped[j]);
9763 res[k++] = mapped[j];
9764 }
9765 for (i = 1; i < length; i++) {
9766 c = PyUnicode_READ(kind, data, i);
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 for (j = 0; j < n_res; j++) {
9769 *maxchar = Py_MAX(*maxchar, mapped[j]);
9770 res[k++] = mapped[j];
9771 }
9772 }
9773 return k;
9774 }
9775
9776 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9777 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9778 Py_ssize_t i, k = 0;
9779
9780 for (i = 0; i < length; i++) {
9781 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9782 int n_res, j;
9783 if (Py_UNICODE_ISUPPER(c)) {
9784 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9785 }
9786 else if (Py_UNICODE_ISLOWER(c)) {
9787 n_res = _PyUnicode_ToUpperFull(c, mapped);
9788 }
9789 else {
9790 n_res = 1;
9791 mapped[0] = c;
9792 }
9793 for (j = 0; j < n_res; j++) {
9794 *maxchar = Py_MAX(*maxchar, mapped[j]);
9795 res[k++] = mapped[j];
9796 }
9797 }
9798 return k;
9799 }
9800
9801 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9802 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9803 Py_UCS4 *maxchar, int lower)
9804 {
9805 Py_ssize_t i, k = 0;
9806
9807 for (i = 0; i < length; i++) {
9808 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809 int n_res, j;
9810 if (lower)
9811 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812 else
9813 n_res = _PyUnicode_ToUpperFull(c, mapped);
9814 for (j = 0; j < n_res; j++) {
9815 *maxchar = Py_MAX(*maxchar, mapped[j]);
9816 res[k++] = mapped[j];
9817 }
9818 }
9819 return k;
9820 }
9821
9822 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9823 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824 {
9825 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9826 }
9827
9828 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9829 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830 {
9831 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9832 }
9833
9834 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9835 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836 {
9837 Py_ssize_t i, k = 0;
9838
9839 for (i = 0; i < length; i++) {
9840 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9841 Py_UCS4 mapped[3];
9842 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9843 for (j = 0; j < n_res; j++) {
9844 *maxchar = Py_MAX(*maxchar, mapped[j]);
9845 res[k++] = mapped[j];
9846 }
9847 }
9848 return k;
9849 }
9850
9851 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9852 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853 {
9854 Py_ssize_t i, k = 0;
9855 int previous_is_cased;
9856
9857 previous_is_cased = 0;
9858 for (i = 0; i < length; i++) {
9859 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9860 Py_UCS4 mapped[3];
9861 int n_res, j;
9862
9863 if (previous_is_cased)
9864 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9865 else
9866 n_res = _PyUnicode_ToTitleFull(c, mapped);
9867
9868 for (j = 0; j < n_res; j++) {
9869 *maxchar = Py_MAX(*maxchar, mapped[j]);
9870 res[k++] = mapped[j];
9871 }
9872
9873 previous_is_cased = _PyUnicode_IsCased(c);
9874 }
9875 return k;
9876 }
9877
9878 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9879 case_operation(PyObject *self,
9880 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9881 {
9882 PyObject *res = NULL;
9883 Py_ssize_t length, newlength = 0;
9884 int kind, outkind;
9885 void *data, *outdata;
9886 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9887
9888 assert(PyUnicode_IS_READY(self));
9889
9890 kind = PyUnicode_KIND(self);
9891 data = PyUnicode_DATA(self);
9892 length = PyUnicode_GET_LENGTH(self);
9893 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9894 PyErr_SetString(PyExc_OverflowError, "string is too long");
9895 return NULL;
9896 }
9897 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9898 if (tmp == NULL)
9899 return PyErr_NoMemory();
9900 newlength = perform(kind, data, length, tmp, &maxchar);
9901 res = PyUnicode_New(newlength, maxchar);
9902 if (res == NULL)
9903 goto leave;
9904 tmpend = tmp + newlength;
9905 outdata = PyUnicode_DATA(res);
9906 outkind = PyUnicode_KIND(res);
9907 switch (outkind) {
9908 case PyUnicode_1BYTE_KIND:
9909 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9910 break;
9911 case PyUnicode_2BYTE_KIND:
9912 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9913 break;
9914 case PyUnicode_4BYTE_KIND:
9915 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9916 break;
9917 default:
9918 Py_UNREACHABLE();
9919 }
9920 leave:
9921 PyMem_FREE(tmp);
9922 return res;
9923 }
9924
9925 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9926 PyUnicode_Join(PyObject *separator, PyObject *seq)
9927 {
9928 PyObject *res;
9929 PyObject *fseq;
9930 Py_ssize_t seqlen;
9931 PyObject **items;
9932
9933 fseq = PySequence_Fast(seq, "can only join an iterable");
9934 if (fseq == NULL) {
9935 return NULL;
9936 }
9937
9938 /* NOTE: the following code can't call back into Python code,
9939 * so we are sure that fseq won't be mutated.
9940 */
9941
9942 items = PySequence_Fast_ITEMS(fseq);
9943 seqlen = PySequence_Fast_GET_SIZE(fseq);
9944 res = _PyUnicode_JoinArray(separator, items, seqlen);
9945 Py_DECREF(fseq);
9946 return res;
9947 }
9948
/* Join the `seqlen` objects in `items` into one string, inserting
   `separator` between consecutive items.

   separator: a str, or NULL to use a single space.  It is only examined
              when seqlen is greater than one.
   items:     array of objects; every element must be a str (or subclass),
              otherwise TypeError is set.
   seqlen:    number of elements in `items`.

   Returns an owned reference to the result, or NULL with an exception set
   (TypeError for a non-str separator or item, OverflowError when the total
   length does not fit in Py_ssize_t, or whatever a failing helper set).

   The work is done in two passes: the first validates the items and
   computes the total length and maximum character, the second copies the
   data into a string allocated with exactly that size. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single non-exact item: no separator is needed, so `sep` stays
           NULL and contributes nothing to the length or maxchar. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the "all same kind" check below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
    /* use_memcpy stays set only while every object seen so far (separator
       included) has the same kind; it is forced off in Py_DEBUG builds. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* every item but the first is preceded by one separator */
        if (i != 0) {
            add_sz += seplen;
        }
        /* reject totals that would exceed PY_SSIZE_T_MAX before adding */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        /* byte-level cursor into the result plus the byte width of one
           character, used to scale the memcpy() sizes below */
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: raw byte copies, advancing res_data as we go. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: character-wise copies through
           _PyUnicode_FastCopyCharacters(), tracking a character offset. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* sep may be NULL here (single-item case), hence the X variant */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* shared failure exit: drop whatever has been acquired so far */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10120
10121 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10122 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123 Py_UCS4 fill_char)
10124 {
10125 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10126 void *data = PyUnicode_DATA(unicode);
10127 assert(PyUnicode_IS_READY(unicode));
10128 assert(unicode_modifiable(unicode));
10129 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10130 assert(start >= 0);
10131 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10132 unicode_fill(kind, data, fill_char, start, length);
10133 }
10134
10135 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10136 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10137 Py_UCS4 fill_char)
10138 {
10139 Py_ssize_t maxlen;
10140
10141 if (!PyUnicode_Check(unicode)) {
10142 PyErr_BadInternalCall();
10143 return -1;
10144 }
10145 if (PyUnicode_READY(unicode) == -1)
10146 return -1;
10147 if (unicode_check_modifiable(unicode))
10148 return -1;
10149
10150 if (start < 0) {
10151 PyErr_SetString(PyExc_IndexError, "string index out of range");
10152 return -1;
10153 }
10154 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10155 PyErr_SetString(PyExc_ValueError,
10156 "fill character is bigger than "
10157 "the string maximum character");
10158 return -1;
10159 }
10160
10161 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10162 length = Py_MIN(maxlen, length);
10163 if (length <= 0)
10164 return 0;
10165
10166 _PyUnicode_FastFill(unicode, start, length, fill_char);
10167 return length;
10168 }
10169
10170 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10171 pad(PyObject *self,
10172 Py_ssize_t left,
10173 Py_ssize_t right,
10174 Py_UCS4 fill)
10175 {
10176 PyObject *u;
10177 Py_UCS4 maxchar;
10178 int kind;
10179 void *data;
10180
10181 if (left < 0)
10182 left = 0;
10183 if (right < 0)
10184 right = 0;
10185
10186 if (left == 0 && right == 0)
10187 return unicode_result_unchanged(self);
10188
10189 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10190 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10191 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10192 return NULL;
10193 }
10194 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10195 maxchar = Py_MAX(maxchar, fill);
10196 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10197 if (!u)
10198 return NULL;
10199
10200 kind = PyUnicode_KIND(u);
10201 data = PyUnicode_DATA(u);
10202 if (left)
10203 unicode_fill(kind, data, fill, 0, left);
10204 if (right)
10205 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10206 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10207 assert(_PyUnicode_CheckConsistency(u, 1));
10208 return u;
10209 }
10210
10211 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10212 PyUnicode_Splitlines(PyObject *string, int keepends)
10213 {
10214 PyObject *list;
10215
10216 if (ensure_unicode(string) < 0)
10217 return NULL;
10218
10219 switch (PyUnicode_KIND(string)) {
10220 case PyUnicode_1BYTE_KIND:
10221 if (PyUnicode_IS_ASCII(string))
10222 list = asciilib_splitlines(
10223 string, PyUnicode_1BYTE_DATA(string),
10224 PyUnicode_GET_LENGTH(string), keepends);
10225 else
10226 list = ucs1lib_splitlines(
10227 string, PyUnicode_1BYTE_DATA(string),
10228 PyUnicode_GET_LENGTH(string), keepends);
10229 break;
10230 case PyUnicode_2BYTE_KIND:
10231 list = ucs2lib_splitlines(
10232 string, PyUnicode_2BYTE_DATA(string),
10233 PyUnicode_GET_LENGTH(string), keepends);
10234 break;
10235 case PyUnicode_4BYTE_KIND:
10236 list = ucs4lib_splitlines(
10237 string, PyUnicode_4BYTE_DATA(string),
10238 PyUnicode_GET_LENGTH(string), keepends);
10239 break;
10240 default:
10241 Py_UNREACHABLE();
10242 }
10243 return list;
10244 }
10245
10246 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10247 split(PyObject *self,
10248 PyObject *substring,
10249 Py_ssize_t maxcount)
10250 {
10251 int kind1, kind2;
10252 void *buf1, *buf2;
10253 Py_ssize_t len1, len2;
10254 PyObject* out;
10255
10256 if (maxcount < 0)
10257 maxcount = PY_SSIZE_T_MAX;
10258
10259 if (PyUnicode_READY(self) == -1)
10260 return NULL;
10261
10262 if (substring == NULL)
10263 switch (PyUnicode_KIND(self)) {
10264 case PyUnicode_1BYTE_KIND:
10265 if (PyUnicode_IS_ASCII(self))
10266 return asciilib_split_whitespace(
10267 self, PyUnicode_1BYTE_DATA(self),
10268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 else
10271 return ucs1lib_split_whitespace(
10272 self, PyUnicode_1BYTE_DATA(self),
10273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 case PyUnicode_2BYTE_KIND:
10276 return ucs2lib_split_whitespace(
10277 self, PyUnicode_2BYTE_DATA(self),
10278 PyUnicode_GET_LENGTH(self), maxcount
10279 );
10280 case PyUnicode_4BYTE_KIND:
10281 return ucs4lib_split_whitespace(
10282 self, PyUnicode_4BYTE_DATA(self),
10283 PyUnicode_GET_LENGTH(self), maxcount
10284 );
10285 default:
10286 Py_UNREACHABLE();
10287 }
10288
10289 if (PyUnicode_READY(substring) == -1)
10290 return NULL;
10291
10292 kind1 = PyUnicode_KIND(self);
10293 kind2 = PyUnicode_KIND(substring);
10294 len1 = PyUnicode_GET_LENGTH(self);
10295 len2 = PyUnicode_GET_LENGTH(substring);
10296 if (kind1 < kind2 || len1 < len2) {
10297 out = PyList_New(1);
10298 if (out == NULL)
10299 return NULL;
10300 Py_INCREF(self);
10301 PyList_SET_ITEM(out, 0, self);
10302 return out;
10303 }
10304 buf1 = PyUnicode_DATA(self);
10305 buf2 = PyUnicode_DATA(substring);
10306 if (kind2 != kind1) {
10307 buf2 = _PyUnicode_AsKind(substring, kind1);
10308 if (!buf2)
10309 return NULL;
10310 }
10311
10312 switch (kind1) {
10313 case PyUnicode_1BYTE_KIND:
10314 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10315 out = asciilib_split(
10316 self, buf1, len1, buf2, len2, maxcount);
10317 else
10318 out = ucs1lib_split(
10319 self, buf1, len1, buf2, len2, maxcount);
10320 break;
10321 case PyUnicode_2BYTE_KIND:
10322 out = ucs2lib_split(
10323 self, buf1, len1, buf2, len2, maxcount);
10324 break;
10325 case PyUnicode_4BYTE_KIND:
10326 out = ucs4lib_split(
10327 self, buf1, len1, buf2, len2, maxcount);
10328 break;
10329 default:
10330 out = NULL;
10331 }
10332 if (kind2 != kind1)
10333 PyMem_Free(buf2);
10334 return out;
10335 }
10336
10337 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10338 rsplit(PyObject *self,
10339 PyObject *substring,
10340 Py_ssize_t maxcount)
10341 {
10342 int kind1, kind2;
10343 void *buf1, *buf2;
10344 Py_ssize_t len1, len2;
10345 PyObject* out;
10346
10347 if (maxcount < 0)
10348 maxcount = PY_SSIZE_T_MAX;
10349
10350 if (PyUnicode_READY(self) == -1)
10351 return NULL;
10352
10353 if (substring == NULL)
10354 switch (PyUnicode_KIND(self)) {
10355 case PyUnicode_1BYTE_KIND:
10356 if (PyUnicode_IS_ASCII(self))
10357 return asciilib_rsplit_whitespace(
10358 self, PyUnicode_1BYTE_DATA(self),
10359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 else
10362 return ucs1lib_rsplit_whitespace(
10363 self, PyUnicode_1BYTE_DATA(self),
10364 PyUnicode_GET_LENGTH(self), maxcount
10365 );
10366 case PyUnicode_2BYTE_KIND:
10367 return ucs2lib_rsplit_whitespace(
10368 self, PyUnicode_2BYTE_DATA(self),
10369 PyUnicode_GET_LENGTH(self), maxcount
10370 );
10371 case PyUnicode_4BYTE_KIND:
10372 return ucs4lib_rsplit_whitespace(
10373 self, PyUnicode_4BYTE_DATA(self),
10374 PyUnicode_GET_LENGTH(self), maxcount
10375 );
10376 default:
10377 Py_UNREACHABLE();
10378 }
10379
10380 if (PyUnicode_READY(substring) == -1)
10381 return NULL;
10382
10383 kind1 = PyUnicode_KIND(self);
10384 kind2 = PyUnicode_KIND(substring);
10385 len1 = PyUnicode_GET_LENGTH(self);
10386 len2 = PyUnicode_GET_LENGTH(substring);
10387 if (kind1 < kind2 || len1 < len2) {
10388 out = PyList_New(1);
10389 if (out == NULL)
10390 return NULL;
10391 Py_INCREF(self);
10392 PyList_SET_ITEM(out, 0, self);
10393 return out;
10394 }
10395 buf1 = PyUnicode_DATA(self);
10396 buf2 = PyUnicode_DATA(substring);
10397 if (kind2 != kind1) {
10398 buf2 = _PyUnicode_AsKind(substring, kind1);
10399 if (!buf2)
10400 return NULL;
10401 }
10402
10403 switch (kind1) {
10404 case PyUnicode_1BYTE_KIND:
10405 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10406 out = asciilib_rsplit(
10407 self, buf1, len1, buf2, len2, maxcount);
10408 else
10409 out = ucs1lib_rsplit(
10410 self, buf1, len1, buf2, len2, maxcount);
10411 break;
10412 case PyUnicode_2BYTE_KIND:
10413 out = ucs2lib_rsplit(
10414 self, buf1, len1, buf2, len2, maxcount);
10415 break;
10416 case PyUnicode_4BYTE_KIND:
10417 out = ucs4lib_rsplit(
10418 self, buf1, len1, buf2, len2, maxcount);
10419 break;
10420 default:
10421 out = NULL;
10422 }
10423 if (kind2 != kind1)
10424 PyMem_Free(buf2);
10425 return out;
10426 }
10427
10428 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10429 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10430 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10431 {
10432 switch (kind) {
10433 case PyUnicode_1BYTE_KIND:
10434 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10435 return asciilib_find(buf1, len1, buf2, len2, offset);
10436 else
10437 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10438 case PyUnicode_2BYTE_KIND:
10439 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10440 case PyUnicode_4BYTE_KIND:
10441 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10442 }
10443 Py_UNREACHABLE();
10444 }
10445
10446 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10447 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10448 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10449 {
10450 switch (kind) {
10451 case PyUnicode_1BYTE_KIND:
10452 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10453 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10454 else
10455 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10456 case PyUnicode_2BYTE_KIND:
10457 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10458 case PyUnicode_4BYTE_KIND:
10459 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10460 }
10461 Py_UNREACHABLE();
10462 }
10463
10464 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10465 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10466 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10467 {
10468 int kind = PyUnicode_KIND(u);
10469 void *data = PyUnicode_DATA(u);
10470 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10471 if (kind == PyUnicode_1BYTE_KIND) {
10472 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10473 (Py_UCS1 *)data + len,
10474 u1, u2, maxcount);
10475 }
10476 else if (kind == PyUnicode_2BYTE_KIND) {
10477 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10478 (Py_UCS2 *)data + len,
10479 u1, u2, maxcount);
10480 }
10481 else {
10482 assert(kind == PyUnicode_4BYTE_KIND);
10483 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10484 (Py_UCS4 *)data + len,
10485 u1, u2, maxcount);
10486 }
10487 }
10488
/* Return `self` with occurrences of `str1` replaced by `str2`.

   self, str1, str2: str objects.  Their data, kind and length are read
                     right away without a PyUnicode_READY() call, so the
                     caller presumably readies them first -- TODO confirm.
   maxcount:         maximum number of replacements; negative means no limit
                     (it is replaced by PY_SSIZE_T_MAX).

   Returns a new string, or the result of unicode_result_unchanged(self) when
   nothing has to be replaced, or NULL on failure (OverflowError is set here
   when the result would be too long; other failures come from helpers).

   Three strategies are used:
     - len1 == len2 == 1: copy `self` and patch single characters in place;
     - len1 == len2 > 1:  copy `self` and overwrite each match in place;
     - len1 != len2:      count the matches, allocate the exact result size
                          and assemble it piece by piece (an empty `str1`
                          interleaves `str2` between the characters).

   sbuf/buf1/buf2 start out pointing into the string objects.  Whenever one
   of them is replaced by a temporary buffer from _PyUnicode_AsKind(), the
   matching srelease/release1/release2 flag is set so the exit paths know
   that the buffer must be freed. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* Note: the empty-self shortcut is only taken when maxcount is
       non-negative; with a negative maxcount an empty self continues. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* replacing something by the very same object changes nothing */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* copy everything, then patch from the first match onwards */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the kind all working buffers are brought to */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* drop a previously widened buf1 and clear its flag first,
                   so a failure below cannot free it a second time */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* offsets below are in bytes: character index times rkind */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* the first replacement is already done, hence the
               pre-decrement of maxcount */
            while ( --maxcount > 0) {
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* n: number of replacements that will be made (bounded by maxcount) */
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* drop a previously widened buf1 and clear its flag before
               re-widening, so a failure cannot free it a second time */
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* the growth n * (len2 - len1) must fit into what is left above slen */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* everything was replaced by nothing: return the empty string */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* the byte size rkind * new_size must fit as well */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        /* i: read position in sbuf, ires: write position in res
           (both counted in characters) */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self, and
               repeat until n copies of str2 have been written */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* whatever is left of self goes after the last str2 */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* the result may no longer need its current maxchar; the helper
           receives &u and may leave it NULL, which is treated as failure */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* success exit: free only the temporaries this function created */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* failure exit: a flagged pointer may be NULL here, hence the
       additional pointer checks */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10750
10751 /* --- Unicode Object Methods --------------------------------------------- */
10752
10753 /*[clinic input]
10754 str.title as unicode_title
10755
10756 Return a version of the string where each word is titlecased.
10757
10758 More specifically, words start with uppercased characters and all remaining
10759 cased characters have lower case.
10760 [clinic start generated code]*/
10761
10762 static PyObject *
unicode_title_impl(PyObject * self)10763 unicode_title_impl(PyObject *self)
10764 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10765 {
10766 if (PyUnicode_READY(self) == -1)
10767 return NULL;
10768 return case_operation(self, do_title);
10769 }
10770
10771 /*[clinic input]
10772 str.capitalize as unicode_capitalize
10773
10774 Return a capitalized version of the string.
10775
10776 More specifically, make the first character have upper case and the rest lower
10777 case.
10778 [clinic start generated code]*/
10779
10780 static PyObject *
unicode_capitalize_impl(PyObject * self)10781 unicode_capitalize_impl(PyObject *self)
10782 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10783 {
10784 if (PyUnicode_READY(self) == -1)
10785 return NULL;
10786 if (PyUnicode_GET_LENGTH(self) == 0)
10787 return unicode_result_unchanged(self);
10788 return case_operation(self, do_capitalize);
10789 }
10790
10791 /*[clinic input]
10792 str.casefold as unicode_casefold
10793
10794 Return a version of the string suitable for caseless comparisons.
10795 [clinic start generated code]*/
10796
10797 static PyObject *
unicode_casefold_impl(PyObject * self)10798 unicode_casefold_impl(PyObject *self)
10799 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10800 {
10801 if (PyUnicode_READY(self) == -1)
10802 return NULL;
10803 if (PyUnicode_IS_ASCII(self))
10804 return ascii_upper_or_lower(self, 1);
10805 return case_operation(self, do_casefold);
10806 }
10807
10808
10809 /* Argument converter. Accepts a single Unicode character. */
10810
10811 static int
convert_uc(PyObject * obj,void * addr)10812 convert_uc(PyObject *obj, void *addr)
10813 {
10814 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10815
10816 if (!PyUnicode_Check(obj)) {
10817 PyErr_Format(PyExc_TypeError,
10818 "The fill character must be a unicode character, "
10819 "not %.100s", Py_TYPE(obj)->tp_name);
10820 return 0;
10821 }
10822 if (PyUnicode_READY(obj) < 0)
10823 return 0;
10824 if (PyUnicode_GET_LENGTH(obj) != 1) {
10825 PyErr_SetString(PyExc_TypeError,
10826 "The fill character must be exactly one character long");
10827 return 0;
10828 }
10829 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10830 return 1;
10831 }
10832
10833 /*[clinic input]
10834 str.center as unicode_center
10835
10836 width: Py_ssize_t
10837 fillchar: Py_UCS4 = ' '
10838 /
10839
10840 Return a centered string of length width.
10841
10842 Padding is done using the specified fill character (default is a space).
10843 [clinic start generated code]*/
10844
10845 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10846 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10847 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10848 {
10849 Py_ssize_t marg, left;
10850
10851 if (PyUnicode_READY(self) == -1)
10852 return NULL;
10853
10854 if (PyUnicode_GET_LENGTH(self) >= width)
10855 return unicode_result_unchanged(self);
10856
10857 marg = width - PyUnicode_GET_LENGTH(self);
10858 left = marg / 2 + (marg & width & 1);
10859
10860 return pad(self, left, marg - left, fillchar);
10861 }
10862
/* Three-way comparison of two strings by code point.

   This function assumes that str1 and str2 are readied by the caller.

   The first min(len1, len2) characters are compared pairwise; the first
   difference decides the result.  If that common prefix is identical, the
   shorter string sorts first.  Returns -1, 0 or 1.

   The comparison is dispatched on the kind of both strings.  Two 1-byte
   strings are compared with memcmp(); two 4-byte strings with wmemcmp()
   when HAVE_WMEMCMP is defined and wchar_t is 4 bytes wide; every other
   combination uses the element-wise COMPARE loop. */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{
/* COMPARE(TYPE1, TYPE2): walk `len` elements of data1 (as TYPE1) and data2
   (as TYPE2) in parallel and return from the enclosing function at the
   first difference; falls through when all `len` elements are equal. */
#define COMPARE(TYPE1, TYPE2) \
    do { \
        TYPE1* p1 = (TYPE1 *)data1; \
        TYPE2* p2 = (TYPE2 *)data2; \
        TYPE1* end = p1 + len; \
        Py_UCS4 c1, c2; \
        for (; p1 != end; p1++, p2++) { \
            c1 = *p1; \
            c2 = *p2; \
            if (c1 != c2) \
                return (c1 < c2) ? -1 : 1; \
        } \
    } \
    while (0)

    int kind1, kind2;
    void *data1, *data2;
    Py_ssize_t len1, len2, len;

    kind1 = PyUnicode_KIND(str1);
    kind2 = PyUnicode_KIND(str2);
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
    len1 = PyUnicode_GET_LENGTH(str1);
    len2 = PyUnicode_GET_LENGTH(str2);
    /* only the common prefix can be compared element by element */
    len = Py_MIN(len1, len2);

    switch(kind1) {
    case PyUnicode_1BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
        {
            int cmp = memcmp(data1, data2, len);
            /* normalize result of memcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
            break;
        }
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
        {
            COMPARE(Py_UCS2, Py_UCS2);
            break;
        }
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
        {
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
            /* normalize result of wmemcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
#else
            COMPARE(Py_UCS4, Py_UCS4);
#endif
            break;
        }
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    /* The common prefix is identical: the lengths decide. */
    if (len1 == len2)
        return 0;
    if (len1 < len2)
        return -1;
    else
        return 1;

#undef COMPARE
}
10980
10981 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10982 unicode_compare_eq(PyObject *str1, PyObject *str2)
10983 {
10984 int kind;
10985 void *data1, *data2;
10986 Py_ssize_t len;
10987 int cmp;
10988
10989 len = PyUnicode_GET_LENGTH(str1);
10990 if (PyUnicode_GET_LENGTH(str2) != len)
10991 return 0;
10992 kind = PyUnicode_KIND(str1);
10993 if (PyUnicode_KIND(str2) != kind)
10994 return 0;
10995 data1 = PyUnicode_DATA(str1);
10996 data2 = PyUnicode_DATA(str2);
10997
10998 cmp = memcmp(data1, data2, len * kind);
10999 return (cmp == 0);
11000 }
11001
11002
11003 int
PyUnicode_Compare(PyObject * left,PyObject * right)11004 PyUnicode_Compare(PyObject *left, PyObject *right)
11005 {
11006 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11007 if (PyUnicode_READY(left) == -1 ||
11008 PyUnicode_READY(right) == -1)
11009 return -1;
11010
11011 /* a string is equal to itself */
11012 if (left == right)
11013 return 0;
11014
11015 return unicode_compare(left, right);
11016 }
11017 PyErr_Format(PyExc_TypeError,
11018 "Can't compare %.100s and %.100s",
11019 left->ob_type->tp_name,
11020 right->ob_type->tp_name);
11021 return -1;
11022 }
11023
11024 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11025 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11026 {
11027 Py_ssize_t i;
11028 int kind;
11029 Py_UCS4 chr;
11030 const unsigned char *ustr = (const unsigned char *)str;
11031
11032 assert(_PyUnicode_CHECK(uni));
11033 if (!PyUnicode_IS_READY(uni)) {
11034 const wchar_t *ws = _PyUnicode_WSTR(uni);
11035 /* Compare Unicode string and source character set string */
11036 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11037 if (chr != ustr[i])
11038 return (chr < ustr[i]) ? -1 : 1;
11039 }
11040 /* This check keeps Python strings that end in '\0' from comparing equal
11041 to C strings identical up to that point. */
11042 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11043 return 1; /* uni is longer */
11044 if (ustr[i])
11045 return -1; /* str is longer */
11046 return 0;
11047 }
11048 kind = PyUnicode_KIND(uni);
11049 if (kind == PyUnicode_1BYTE_KIND) {
11050 const void *data = PyUnicode_1BYTE_DATA(uni);
11051 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11052 size_t len, len2 = strlen(str);
11053 int cmp;
11054
11055 len = Py_MIN(len1, len2);
11056 cmp = memcmp(data, str, len);
11057 if (cmp != 0) {
11058 if (cmp < 0)
11059 return -1;
11060 else
11061 return 1;
11062 }
11063 if (len1 > len2)
11064 return 1; /* uni is longer */
11065 if (len1 < len2)
11066 return -1; /* str is longer */
11067 return 0;
11068 }
11069 else {
11070 void *data = PyUnicode_DATA(uni);
11071 /* Compare Unicode string and source character set string */
11072 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11073 if (chr != (unsigned char)str[i])
11074 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11075 /* This check keeps Python strings that end in '\0' from comparing equal
11076 to C strings identical up to that point. */
11077 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11078 return 1; /* uni is longer */
11079 if (str[i])
11080 return -1; /* str is longer */
11081 return 0;
11082 }
11083 }
11084
11085 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11086 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11087 {
11088 size_t i, len;
11089 const wchar_t *p;
11090 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11091 if (strlen(str) != len)
11092 return 0;
11093 p = _PyUnicode_WSTR(unicode);
11094 assert(p);
11095 for (i = 0; i < len; i++) {
11096 unsigned char c = (unsigned char)str[i];
11097 if (c >= 128 || p[i] != (wchar_t)c)
11098 return 0;
11099 }
11100 return 1;
11101 }
11102
11103 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11104 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11105 {
11106 size_t len;
11107 assert(_PyUnicode_CHECK(unicode));
11108 assert(str);
11109 #ifndef NDEBUG
11110 for (const char *p = str; *p; p++) {
11111 assert((unsigned char)*p < 128);
11112 }
11113 #endif
11114 if (PyUnicode_READY(unicode) == -1) {
11115 /* Memory error or bad data */
11116 PyErr_Clear();
11117 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11118 }
11119 if (!PyUnicode_IS_ASCII(unicode))
11120 return 0;
11121 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11122 return strlen(str) == len &&
11123 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11124 }
11125
11126 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11127 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11128 {
11129 PyObject *right_uni;
11130 Py_hash_t hash;
11131
11132 assert(_PyUnicode_CHECK(left));
11133 assert(right->string);
11134 #ifndef NDEBUG
11135 for (const char *p = right->string; *p; p++) {
11136 assert((unsigned char)*p < 128);
11137 }
11138 #endif
11139
11140 if (PyUnicode_READY(left) == -1) {
11141 /* memory error or bad data */
11142 PyErr_Clear();
11143 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11144 }
11145
11146 if (!PyUnicode_IS_ASCII(left))
11147 return 0;
11148
11149 right_uni = _PyUnicode_FromId(right); /* borrowed */
11150 if (right_uni == NULL) {
11151 /* memory error or bad data */
11152 PyErr_Clear();
11153 return _PyUnicode_EqualToASCIIString(left, right->string);
11154 }
11155
11156 if (left == right_uni)
11157 return 1;
11158
11159 if (PyUnicode_CHECK_INTERNED(left))
11160 return 0;
11161
11162 assert(_PyUnicode_HASH(right_uni) != -1);
11163 hash = _PyUnicode_HASH(left);
11164 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11165 return 0;
11166
11167 return unicode_compare_eq(left, right_uni);
11168 }
11169
11170 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11171 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11172 {
11173 int result;
11174
11175 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11176 Py_RETURN_NOTIMPLEMENTED;
11177
11178 if (PyUnicode_READY(left) == -1 ||
11179 PyUnicode_READY(right) == -1)
11180 return NULL;
11181
11182 if (left == right) {
11183 switch (op) {
11184 case Py_EQ:
11185 case Py_LE:
11186 case Py_GE:
11187 /* a string is equal to itself */
11188 Py_RETURN_TRUE;
11189 case Py_NE:
11190 case Py_LT:
11191 case Py_GT:
11192 Py_RETURN_FALSE;
11193 default:
11194 PyErr_BadArgument();
11195 return NULL;
11196 }
11197 }
11198 else if (op == Py_EQ || op == Py_NE) {
11199 result = unicode_compare_eq(left, right);
11200 result ^= (op == Py_NE);
11201 return PyBool_FromLong(result);
11202 }
11203 else {
11204 result = unicode_compare(left, right);
11205 Py_RETURN_RICHCOMPARE(result, 0, op);
11206 }
11207 }
11208
11209 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11210 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211 {
11212 return unicode_eq(aa, bb);
11213 }
11214
11215 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11216 PyUnicode_Contains(PyObject *str, PyObject *substr)
11217 {
11218 int kind1, kind2;
11219 void *buf1, *buf2;
11220 Py_ssize_t len1, len2;
11221 int result;
11222
11223 if (!PyUnicode_Check(substr)) {
11224 PyErr_Format(PyExc_TypeError,
11225 "'in <string>' requires string as left operand, not %.100s",
11226 Py_TYPE(substr)->tp_name);
11227 return -1;
11228 }
11229 if (PyUnicode_READY(substr) == -1)
11230 return -1;
11231 if (ensure_unicode(str) < 0)
11232 return -1;
11233
11234 kind1 = PyUnicode_KIND(str);
11235 kind2 = PyUnicode_KIND(substr);
11236 if (kind1 < kind2)
11237 return 0;
11238 len1 = PyUnicode_GET_LENGTH(str);
11239 len2 = PyUnicode_GET_LENGTH(substr);
11240 if (len1 < len2)
11241 return 0;
11242 buf1 = PyUnicode_DATA(str);
11243 buf2 = PyUnicode_DATA(substr);
11244 if (len2 == 1) {
11245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11247 return result;
11248 }
11249 if (kind2 != kind1) {
11250 buf2 = _PyUnicode_AsKind(substr, kind1);
11251 if (!buf2)
11252 return -1;
11253 }
11254
11255 switch (kind1) {
11256 case PyUnicode_1BYTE_KIND:
11257 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_2BYTE_KIND:
11260 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 case PyUnicode_4BYTE_KIND:
11263 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264 break;
11265 default:
11266 Py_UNREACHABLE();
11267 }
11268
11269 if (kind2 != kind1)
11270 PyMem_Free(buf2);
11271
11272 return result;
11273 }
11274
11275 /* Concat to string or Unicode object giving a new Unicode object. */
11276
11277 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11278 PyUnicode_Concat(PyObject *left, PyObject *right)
11279 {
11280 PyObject *result;
11281 Py_UCS4 maxchar, maxchar2;
11282 Py_ssize_t left_len, right_len, new_len;
11283
11284 if (ensure_unicode(left) < 0)
11285 return NULL;
11286
11287 if (!PyUnicode_Check(right)) {
11288 PyErr_Format(PyExc_TypeError,
11289 "can only concatenate str (not \"%.200s\") to str",
11290 right->ob_type->tp_name);
11291 return NULL;
11292 }
11293 if (PyUnicode_READY(right) < 0)
11294 return NULL;
11295
11296 /* Shortcuts */
11297 if (left == unicode_empty)
11298 return PyUnicode_FromObject(right);
11299 if (right == unicode_empty)
11300 return PyUnicode_FromObject(left);
11301
11302 left_len = PyUnicode_GET_LENGTH(left);
11303 right_len = PyUnicode_GET_LENGTH(right);
11304 if (left_len > PY_SSIZE_T_MAX - right_len) {
11305 PyErr_SetString(PyExc_OverflowError,
11306 "strings are too large to concat");
11307 return NULL;
11308 }
11309 new_len = left_len + right_len;
11310
11311 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11312 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11313 maxchar = Py_MAX(maxchar, maxchar2);
11314
11315 /* Concat the two Unicode strings */
11316 result = PyUnicode_New(new_len, maxchar);
11317 if (result == NULL)
11318 return NULL;
11319 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11320 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11321 assert(_PyUnicode_CheckConsistency(result, 1));
11322 return result;
11323 }
11324
11325 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11326 PyUnicode_Append(PyObject **p_left, PyObject *right)
11327 {
11328 PyObject *left, *res;
11329 Py_UCS4 maxchar, maxchar2;
11330 Py_ssize_t left_len, right_len, new_len;
11331
11332 if (p_left == NULL) {
11333 if (!PyErr_Occurred())
11334 PyErr_BadInternalCall();
11335 return;
11336 }
11337 left = *p_left;
11338 if (right == NULL || left == NULL
11339 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11340 if (!PyErr_Occurred())
11341 PyErr_BadInternalCall();
11342 goto error;
11343 }
11344
11345 if (PyUnicode_READY(left) == -1)
11346 goto error;
11347 if (PyUnicode_READY(right) == -1)
11348 goto error;
11349
11350 /* Shortcuts */
11351 if (left == unicode_empty) {
11352 Py_DECREF(left);
11353 Py_INCREF(right);
11354 *p_left = right;
11355 return;
11356 }
11357 if (right == unicode_empty)
11358 return;
11359
11360 left_len = PyUnicode_GET_LENGTH(left);
11361 right_len = PyUnicode_GET_LENGTH(right);
11362 if (left_len > PY_SSIZE_T_MAX - right_len) {
11363 PyErr_SetString(PyExc_OverflowError,
11364 "strings are too large to concat");
11365 goto error;
11366 }
11367 new_len = left_len + right_len;
11368
11369 if (unicode_modifiable(left)
11370 && PyUnicode_CheckExact(right)
11371 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11372 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11373 to change the structure size, but characters are stored just after
11374 the structure, and so it requires to move all characters which is
11375 not so different than duplicating the string. */
11376 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11377 {
11378 /* append inplace */
11379 if (unicode_resize(p_left, new_len) != 0)
11380 goto error;
11381
11382 /* copy 'right' into the newly allocated area of 'left' */
11383 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11384 }
11385 else {
11386 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11387 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11388 maxchar = Py_MAX(maxchar, maxchar2);
11389
11390 /* Concat the two Unicode strings */
11391 res = PyUnicode_New(new_len, maxchar);
11392 if (res == NULL)
11393 goto error;
11394 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11395 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11396 Py_DECREF(left);
11397 *p_left = res;
11398 }
11399 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11400 return;
11401
11402 error:
11403 Py_CLEAR(*p_left);
11404 }
11405
11406 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11407 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11408 {
11409 PyUnicode_Append(pleft, right);
11410 Py_XDECREF(right);
11411 }
11412
11413 /*
11414 Wraps stringlib_parse_args_finds() and additionally ensures that the
11415 first argument is a unicode object.
11416 */
11417
11418 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11419 parse_args_finds_unicode(const char * function_name, PyObject *args,
11420 PyObject **substring,
11421 Py_ssize_t *start, Py_ssize_t *end)
11422 {
11423 if(stringlib_parse_args_finds(function_name, args, substring,
11424 start, end)) {
11425 if (ensure_unicode(*substring) < 0)
11426 return 0;
11427 return 1;
11428 }
11429 return 0;
11430 }
11431
11432 PyDoc_STRVAR(count__doc__,
11433 "S.count(sub[, start[, end]]) -> int\n\
11434 \n\
11435 Return the number of non-overlapping occurrences of substring sub in\n\
11436 string S[start:end]. Optional arguments start and end are\n\
11437 interpreted as in slice notation.");
11438
11439 static PyObject *
unicode_count(PyObject * self,PyObject * args)11440 unicode_count(PyObject *self, PyObject *args)
11441 {
11442 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11443 Py_ssize_t start = 0;
11444 Py_ssize_t end = PY_SSIZE_T_MAX;
11445 PyObject *result;
11446 int kind1, kind2;
11447 void *buf1, *buf2;
11448 Py_ssize_t len1, len2, iresult;
11449
11450 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11451 return NULL;
11452
11453 kind1 = PyUnicode_KIND(self);
11454 kind2 = PyUnicode_KIND(substring);
11455 if (kind1 < kind2)
11456 return PyLong_FromLong(0);
11457
11458 len1 = PyUnicode_GET_LENGTH(self);
11459 len2 = PyUnicode_GET_LENGTH(substring);
11460 ADJUST_INDICES(start, end, len1);
11461 if (end - start < len2)
11462 return PyLong_FromLong(0);
11463
11464 buf1 = PyUnicode_DATA(self);
11465 buf2 = PyUnicode_DATA(substring);
11466 if (kind2 != kind1) {
11467 buf2 = _PyUnicode_AsKind(substring, kind1);
11468 if (!buf2)
11469 return NULL;
11470 }
11471 switch (kind1) {
11472 case PyUnicode_1BYTE_KIND:
11473 iresult = ucs1lib_count(
11474 ((Py_UCS1*)buf1) + start, end - start,
11475 buf2, len2, PY_SSIZE_T_MAX
11476 );
11477 break;
11478 case PyUnicode_2BYTE_KIND:
11479 iresult = ucs2lib_count(
11480 ((Py_UCS2*)buf1) + start, end - start,
11481 buf2, len2, PY_SSIZE_T_MAX
11482 );
11483 break;
11484 case PyUnicode_4BYTE_KIND:
11485 iresult = ucs4lib_count(
11486 ((Py_UCS4*)buf1) + start, end - start,
11487 buf2, len2, PY_SSIZE_T_MAX
11488 );
11489 break;
11490 default:
11491 Py_UNREACHABLE();
11492 }
11493
11494 result = PyLong_FromSsize_t(iresult);
11495
11496 if (kind2 != kind1)
11497 PyMem_Free(buf2);
11498
11499 return result;
11500 }
11501
11502 /*[clinic input]
11503 str.encode as unicode_encode
11504
11505 encoding: str(c_default="NULL") = 'utf-8'
11506 The encoding in which to encode the string.
11507 errors: str(c_default="NULL") = 'strict'
11508 The error handling scheme to use for encoding errors.
11509 The default is 'strict' meaning that encoding errors raise a
11510 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11511 'xmlcharrefreplace' as well as any other name registered with
11512 codecs.register_error that can handle UnicodeEncodeErrors.
11513
11514 Encode the string using the codec registered for encoding.
11515 [clinic start generated code]*/
11516
11517 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11518 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11519 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11520 {
11521 return PyUnicode_AsEncodedString(self, encoding, errors);
11522 }
11523
11524 /*[clinic input]
11525 str.expandtabs as unicode_expandtabs
11526
11527 tabsize: int = 8
11528
11529 Return a copy where all tab characters are expanded using spaces.
11530
11531 If tabsize is not given, a tab size of 8 characters is assumed.
11532 [clinic start generated code]*/
11533
11534 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11535 unicode_expandtabs_impl(PyObject *self, int tabsize)
11536 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11537 {
11538 Py_ssize_t i, j, line_pos, src_len, incr;
11539 Py_UCS4 ch;
11540 PyObject *u;
11541 void *src_data, *dest_data;
11542 int kind;
11543 int found;
11544
11545 if (PyUnicode_READY(self) == -1)
11546 return NULL;
11547
11548 /* First pass: determine size of output string */
11549 src_len = PyUnicode_GET_LENGTH(self);
11550 i = j = line_pos = 0;
11551 kind = PyUnicode_KIND(self);
11552 src_data = PyUnicode_DATA(self);
11553 found = 0;
11554 for (; i < src_len; i++) {
11555 ch = PyUnicode_READ(kind, src_data, i);
11556 if (ch == '\t') {
11557 found = 1;
11558 if (tabsize > 0) {
11559 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11560 if (j > PY_SSIZE_T_MAX - incr)
11561 goto overflow;
11562 line_pos += incr;
11563 j += incr;
11564 }
11565 }
11566 else {
11567 if (j > PY_SSIZE_T_MAX - 1)
11568 goto overflow;
11569 line_pos++;
11570 j++;
11571 if (ch == '\n' || ch == '\r')
11572 line_pos = 0;
11573 }
11574 }
11575 if (!found)
11576 return unicode_result_unchanged(self);
11577
11578 /* Second pass: create output string and fill it */
11579 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11580 if (!u)
11581 return NULL;
11582 dest_data = PyUnicode_DATA(u);
11583
11584 i = j = line_pos = 0;
11585
11586 for (; i < src_len; i++) {
11587 ch = PyUnicode_READ(kind, src_data, i);
11588 if (ch == '\t') {
11589 if (tabsize > 0) {
11590 incr = tabsize - (line_pos % tabsize);
11591 line_pos += incr;
11592 unicode_fill(kind, dest_data, ' ', j, incr);
11593 j += incr;
11594 }
11595 }
11596 else {
11597 line_pos++;
11598 PyUnicode_WRITE(kind, dest_data, j, ch);
11599 j++;
11600 if (ch == '\n' || ch == '\r')
11601 line_pos = 0;
11602 }
11603 }
11604 assert (j == PyUnicode_GET_LENGTH(u));
11605 return unicode_result(u);
11606
11607 overflow:
11608 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11609 return NULL;
11610 }
11611
11612 PyDoc_STRVAR(find__doc__,
11613 "S.find(sub[, start[, end]]) -> int\n\
11614 \n\
11615 Return the lowest index in S where substring sub is found,\n\
11616 such that sub is contained within S[start:end]. Optional\n\
11617 arguments start and end are interpreted as in slice notation.\n\
11618 \n\
11619 Return -1 on failure.");
11620
11621 static PyObject *
unicode_find(PyObject * self,PyObject * args)11622 unicode_find(PyObject *self, PyObject *args)
11623 {
11624 /* initialize variables to prevent gcc warning */
11625 PyObject *substring = NULL;
11626 Py_ssize_t start = 0;
11627 Py_ssize_t end = 0;
11628 Py_ssize_t result;
11629
11630 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11631 return NULL;
11632
11633 if (PyUnicode_READY(self) == -1)
11634 return NULL;
11635
11636 result = any_find_slice(self, substring, start, end, 1);
11637
11638 if (result == -2)
11639 return NULL;
11640
11641 return PyLong_FromSsize_t(result);
11642 }
11643
11644 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11645 unicode_getitem(PyObject *self, Py_ssize_t index)
11646 {
11647 void *data;
11648 enum PyUnicode_Kind kind;
11649 Py_UCS4 ch;
11650
11651 if (!PyUnicode_Check(self)) {
11652 PyErr_BadArgument();
11653 return NULL;
11654 }
11655 if (PyUnicode_READY(self) == -1) {
11656 return NULL;
11657 }
11658 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11659 PyErr_SetString(PyExc_IndexError, "string index out of range");
11660 return NULL;
11661 }
11662 kind = PyUnicode_KIND(self);
11663 data = PyUnicode_DATA(self);
11664 ch = PyUnicode_READ(kind, data, index);
11665 return unicode_char(ch);
11666 }
11667
11668 /* Believe it or not, this produces the same value for ASCII strings
11669 as bytes_hash(). */
11670 static Py_hash_t
unicode_hash(PyObject * self)11671 unicode_hash(PyObject *self)
11672 {
11673 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11674
11675 #ifdef Py_DEBUG
11676 assert(_Py_HashSecret_Initialized);
11677 #endif
11678 if (_PyUnicode_HASH(self) != -1)
11679 return _PyUnicode_HASH(self);
11680 if (PyUnicode_READY(self) == -1)
11681 return -1;
11682
11683 x = _Py_HashBytes(PyUnicode_DATA(self),
11684 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11685 _PyUnicode_HASH(self) = x;
11686 return x;
11687 }
11688
11689 PyDoc_STRVAR(index__doc__,
11690 "S.index(sub[, start[, end]]) -> int\n\
11691 \n\
11692 Return the lowest index in S where substring sub is found,\n\
11693 such that sub is contained within S[start:end]. Optional\n\
11694 arguments start and end are interpreted as in slice notation.\n\
11695 \n\
11696 Raises ValueError when the substring is not found.");
11697
11698 static PyObject *
unicode_index(PyObject * self,PyObject * args)11699 unicode_index(PyObject *self, PyObject *args)
11700 {
11701 /* initialize variables to prevent gcc warning */
11702 Py_ssize_t result;
11703 PyObject *substring = NULL;
11704 Py_ssize_t start = 0;
11705 Py_ssize_t end = 0;
11706
11707 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11708 return NULL;
11709
11710 if (PyUnicode_READY(self) == -1)
11711 return NULL;
11712
11713 result = any_find_slice(self, substring, start, end, 1);
11714
11715 if (result == -2)
11716 return NULL;
11717
11718 if (result < 0) {
11719 PyErr_SetString(PyExc_ValueError, "substring not found");
11720 return NULL;
11721 }
11722
11723 return PyLong_FromSsize_t(result);
11724 }
11725
11726 /*[clinic input]
11727 str.isascii as unicode_isascii
11728
11729 Return True if all characters in the string are ASCII, False otherwise.
11730
11731 ASCII characters have code points in the range U+0000-U+007F.
11732 Empty string is ASCII too.
11733 [clinic start generated code]*/
11734
11735 static PyObject *
unicode_isascii_impl(PyObject * self)11736 unicode_isascii_impl(PyObject *self)
11737 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11738 {
11739 if (PyUnicode_READY(self) == -1) {
11740 return NULL;
11741 }
11742 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11743 }
11744
11745 /*[clinic input]
11746 str.islower as unicode_islower
11747
11748 Return True if the string is a lowercase string, False otherwise.
11749
11750 A string is lowercase if all cased characters in the string are lowercase and
11751 there is at least one cased character in the string.
11752 [clinic start generated code]*/
11753
11754 static PyObject *
unicode_islower_impl(PyObject * self)11755 unicode_islower_impl(PyObject *self)
11756 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11757 {
11758 Py_ssize_t i, length;
11759 int kind;
11760 void *data;
11761 int cased;
11762
11763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765 length = PyUnicode_GET_LENGTH(self);
11766 kind = PyUnicode_KIND(self);
11767 data = PyUnicode_DATA(self);
11768
11769 /* Shortcut for single character strings */
11770 if (length == 1)
11771 return PyBool_FromLong(
11772 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11773
11774 /* Special case for empty strings */
11775 if (length == 0)
11776 Py_RETURN_FALSE;
11777
11778 cased = 0;
11779 for (i = 0; i < length; i++) {
11780 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11781
11782 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11783 Py_RETURN_FALSE;
11784 else if (!cased && Py_UNICODE_ISLOWER(ch))
11785 cased = 1;
11786 }
11787 return PyBool_FromLong(cased);
11788 }
11789
11790 /*[clinic input]
11791 str.isupper as unicode_isupper
11792
11793 Return True if the string is an uppercase string, False otherwise.
11794
11795 A string is uppercase if all cased characters in the string are uppercase and
11796 there is at least one cased character in the string.
11797 [clinic start generated code]*/
11798
11799 static PyObject *
unicode_isupper_impl(PyObject * self)11800 unicode_isupper_impl(PyObject *self)
11801 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11802 {
11803 Py_ssize_t i, length;
11804 int kind;
11805 void *data;
11806 int cased;
11807
11808 if (PyUnicode_READY(self) == -1)
11809 return NULL;
11810 length = PyUnicode_GET_LENGTH(self);
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
11813
11814 /* Shortcut for single character strings */
11815 if (length == 1)
11816 return PyBool_FromLong(
11817 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11818
11819 /* Special case for empty strings */
11820 if (length == 0)
11821 Py_RETURN_FALSE;
11822
11823 cased = 0;
11824 for (i = 0; i < length; i++) {
11825 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11826
11827 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11828 Py_RETURN_FALSE;
11829 else if (!cased && Py_UNICODE_ISUPPER(ch))
11830 cased = 1;
11831 }
11832 return PyBool_FromLong(cased);
11833 }
11834
11835 /*[clinic input]
11836 str.istitle as unicode_istitle
11837
11838 Return True if the string is a title-cased string, False otherwise.
11839
11840 In a title-cased string, upper- and title-case characters may only
11841 follow uncased characters and lowercase characters only cased ones.
11842 [clinic start generated code]*/
11843
11844 static PyObject *
unicode_istitle_impl(PyObject * self)11845 unicode_istitle_impl(PyObject *self)
11846 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11847 {
11848 Py_ssize_t i, length;
11849 int kind;
11850 void *data;
11851 int cased, previous_is_cased;
11852
11853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855 length = PyUnicode_GET_LENGTH(self);
11856 kind = PyUnicode_KIND(self);
11857 data = PyUnicode_DATA(self);
11858
11859 /* Shortcut for single character strings */
11860 if (length == 1) {
11861 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11862 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11863 (Py_UNICODE_ISUPPER(ch) != 0));
11864 }
11865
11866 /* Special case for empty strings */
11867 if (length == 0)
11868 Py_RETURN_FALSE;
11869
11870 cased = 0;
11871 previous_is_cased = 0;
11872 for (i = 0; i < length; i++) {
11873 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11874
11875 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11876 if (previous_is_cased)
11877 Py_RETURN_FALSE;
11878 previous_is_cased = 1;
11879 cased = 1;
11880 }
11881 else if (Py_UNICODE_ISLOWER(ch)) {
11882 if (!previous_is_cased)
11883 Py_RETURN_FALSE;
11884 previous_is_cased = 1;
11885 cased = 1;
11886 }
11887 else
11888 previous_is_cased = 0;
11889 }
11890 return PyBool_FromLong(cased);
11891 }
11892
11893 /*[clinic input]
11894 str.isspace as unicode_isspace
11895
11896 Return True if the string is a whitespace string, False otherwise.
11897
11898 A string is whitespace if all characters in the string are whitespace and there
11899 is at least one character in the string.
11900 [clinic start generated code]*/
11901
11902 static PyObject *
unicode_isspace_impl(PyObject * self)11903 unicode_isspace_impl(PyObject *self)
11904 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11905 {
11906 Py_ssize_t i, length;
11907 int kind;
11908 void *data;
11909
11910 if (PyUnicode_READY(self) == -1)
11911 return NULL;
11912 length = PyUnicode_GET_LENGTH(self);
11913 kind = PyUnicode_KIND(self);
11914 data = PyUnicode_DATA(self);
11915
11916 /* Shortcut for single character strings */
11917 if (length == 1)
11918 return PyBool_FromLong(
11919 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11920
11921 /* Special case for empty strings */
11922 if (length == 0)
11923 Py_RETURN_FALSE;
11924
11925 for (i = 0; i < length; i++) {
11926 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11927 if (!Py_UNICODE_ISSPACE(ch))
11928 Py_RETURN_FALSE;
11929 }
11930 Py_RETURN_TRUE;
11931 }
11932
11933 /*[clinic input]
11934 str.isalpha as unicode_isalpha
11935
11936 Return True if the string is an alphabetic string, False otherwise.
11937
11938 A string is alphabetic if all characters in the string are alphabetic and there
11939 is at least one character in the string.
11940 [clinic start generated code]*/
11941
11942 static PyObject *
unicode_isalpha_impl(PyObject * self)11943 unicode_isalpha_impl(PyObject *self)
11944 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11945 {
11946 Py_ssize_t i, length;
11947 int kind;
11948 void *data;
11949
11950 if (PyUnicode_READY(self) == -1)
11951 return NULL;
11952 length = PyUnicode_GET_LENGTH(self);
11953 kind = PyUnicode_KIND(self);
11954 data = PyUnicode_DATA(self);
11955
11956 /* Shortcut for single character strings */
11957 if (length == 1)
11958 return PyBool_FromLong(
11959 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11960
11961 /* Special case for empty strings */
11962 if (length == 0)
11963 Py_RETURN_FALSE;
11964
11965 for (i = 0; i < length; i++) {
11966 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11967 Py_RETURN_FALSE;
11968 }
11969 Py_RETURN_TRUE;
11970 }
11971
11972 /*[clinic input]
11973 str.isalnum as unicode_isalnum
11974
11975 Return True if the string is an alpha-numeric string, False otherwise.
11976
11977 A string is alpha-numeric if all characters in the string are alpha-numeric and
11978 there is at least one character in the string.
11979 [clinic start generated code]*/
11980
11981 static PyObject *
unicode_isalnum_impl(PyObject * self)11982 unicode_isalnum_impl(PyObject *self)
11983 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11984 {
11985 int kind;
11986 void *data;
11987 Py_ssize_t len, i;
11988
11989 if (PyUnicode_READY(self) == -1)
11990 return NULL;
11991
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
11994 len = PyUnicode_GET_LENGTH(self);
11995
11996 /* Shortcut for single character strings */
11997 if (len == 1) {
11998 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11999 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12000 }
12001
12002 /* Special case for empty strings */
12003 if (len == 0)
12004 Py_RETURN_FALSE;
12005
12006 for (i = 0; i < len; i++) {
12007 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12008 if (!Py_UNICODE_ISALNUM(ch))
12009 Py_RETURN_FALSE;
12010 }
12011 Py_RETURN_TRUE;
12012 }
12013
12014 /*[clinic input]
12015 str.isdecimal as unicode_isdecimal
12016
12017 Return True if the string is a decimal string, False otherwise.
12018
12019 A string is a decimal string if all characters in the string are decimal and
12020 there is at least one character in the string.
12021 [clinic start generated code]*/
12022
12023 static PyObject *
unicode_isdecimal_impl(PyObject * self)12024 unicode_isdecimal_impl(PyObject *self)
12025 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12026 {
12027 Py_ssize_t i, length;
12028 int kind;
12029 void *data;
12030
12031 if (PyUnicode_READY(self) == -1)
12032 return NULL;
12033 length = PyUnicode_GET_LENGTH(self);
12034 kind = PyUnicode_KIND(self);
12035 data = PyUnicode_DATA(self);
12036
12037 /* Shortcut for single character strings */
12038 if (length == 1)
12039 return PyBool_FromLong(
12040 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12041
12042 /* Special case for empty strings */
12043 if (length == 0)
12044 Py_RETURN_FALSE;
12045
12046 for (i = 0; i < length; i++) {
12047 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12048 Py_RETURN_FALSE;
12049 }
12050 Py_RETURN_TRUE;
12051 }
12052
12053 /*[clinic input]
12054 str.isdigit as unicode_isdigit
12055
12056 Return True if the string is a digit string, False otherwise.
12057
12058 A string is a digit string if all characters in the string are digits and there
12059 is at least one character in the string.
12060 [clinic start generated code]*/
12061
12062 static PyObject *
unicode_isdigit_impl(PyObject * self)12063 unicode_isdigit_impl(PyObject *self)
12064 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12065 {
12066 Py_ssize_t i, length;
12067 int kind;
12068 void *data;
12069
12070 if (PyUnicode_READY(self) == -1)
12071 return NULL;
12072 length = PyUnicode_GET_LENGTH(self);
12073 kind = PyUnicode_KIND(self);
12074 data = PyUnicode_DATA(self);
12075
12076 /* Shortcut for single character strings */
12077 if (length == 1) {
12078 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12079 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12080 }
12081
12082 /* Special case for empty strings */
12083 if (length == 0)
12084 Py_RETURN_FALSE;
12085
12086 for (i = 0; i < length; i++) {
12087 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12088 Py_RETURN_FALSE;
12089 }
12090 Py_RETURN_TRUE;
12091 }
12092
12093 /*[clinic input]
12094 str.isnumeric as unicode_isnumeric
12095
12096 Return True if the string is a numeric string, False otherwise.
12097
12098 A string is numeric if all characters in the string are numeric and there is at
12099 least one character in the string.
12100 [clinic start generated code]*/
12101
12102 static PyObject *
unicode_isnumeric_impl(PyObject * self)12103 unicode_isnumeric_impl(PyObject *self)
12104 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12105 {
12106 Py_ssize_t i, length;
12107 int kind;
12108 void *data;
12109
12110 if (PyUnicode_READY(self) == -1)
12111 return NULL;
12112 length = PyUnicode_GET_LENGTH(self);
12113 kind = PyUnicode_KIND(self);
12114 data = PyUnicode_DATA(self);
12115
12116 /* Shortcut for single character strings */
12117 if (length == 1)
12118 return PyBool_FromLong(
12119 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12120
12121 /* Special case for empty strings */
12122 if (length == 0)
12123 Py_RETURN_FALSE;
12124
12125 for (i = 0; i < length; i++) {
12126 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12127 Py_RETURN_FALSE;
12128 }
12129 Py_RETURN_TRUE;
12130 }
12131
12132 int
PyUnicode_IsIdentifier(PyObject * self)12133 PyUnicode_IsIdentifier(PyObject *self)
12134 {
12135 int kind;
12136 void *data;
12137 Py_ssize_t i;
12138 Py_UCS4 first;
12139
12140 if (PyUnicode_READY(self) == -1) {
12141 Py_FatalError("identifier not ready");
12142 return 0;
12143 }
12144
12145 /* Special case for empty strings */
12146 if (PyUnicode_GET_LENGTH(self) == 0)
12147 return 0;
12148 kind = PyUnicode_KIND(self);
12149 data = PyUnicode_DATA(self);
12150
12151 /* PEP 3131 says that the first character must be in
12152 XID_Start and subsequent characters in XID_Continue,
12153 and for the ASCII range, the 2.x rules apply (i.e
12154 start with letters and underscore, continue with
12155 letters, digits, underscore). However, given the current
12156 definition of XID_Start and XID_Continue, it is sufficient
12157 to check just for these, except that _ must be allowed
12158 as starting an identifier. */
12159 first = PyUnicode_READ(kind, data, 0);
12160 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12161 return 0;
12162
12163 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12164 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12165 return 0;
12166 return 1;
12167 }
12168
12169 /*[clinic input]
12170 str.isidentifier as unicode_isidentifier
12171
12172 Return True if the string is a valid Python identifier, False otherwise.
12173
12174 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12175 such as "def" or "class".
12176 [clinic start generated code]*/
12177
12178 static PyObject *
unicode_isidentifier_impl(PyObject * self)12179 unicode_isidentifier_impl(PyObject *self)
12180 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12181 {
12182 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12183 }
12184
12185 /*[clinic input]
12186 str.isprintable as unicode_isprintable
12187
12188 Return True if the string is printable, False otherwise.
12189
12190 A string is printable if all of its characters are considered printable in
12191 repr() or if it is empty.
12192 [clinic start generated code]*/
12193
12194 static PyObject *
unicode_isprintable_impl(PyObject * self)12195 unicode_isprintable_impl(PyObject *self)
12196 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12197 {
12198 Py_ssize_t i, length;
12199 int kind;
12200 void *data;
12201
12202 if (PyUnicode_READY(self) == -1)
12203 return NULL;
12204 length = PyUnicode_GET_LENGTH(self);
12205 kind = PyUnicode_KIND(self);
12206 data = PyUnicode_DATA(self);
12207
12208 /* Shortcut for single character strings */
12209 if (length == 1)
12210 return PyBool_FromLong(
12211 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12212
12213 for (i = 0; i < length; i++) {
12214 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12215 Py_RETURN_FALSE;
12216 }
12217 }
12218 Py_RETURN_TRUE;
12219 }
12220
12221 /*[clinic input]
12222 str.join as unicode_join
12223
12224 iterable: object
12225 /
12226
12227 Concatenate any number of strings.
12228
12229 The string whose method is called is inserted in between each given string.
12230 The result is returned as a new string.
12231
12232 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12233 [clinic start generated code]*/
12234
12235 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12236 unicode_join(PyObject *self, PyObject *iterable)
12237 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12238 {
12239 return PyUnicode_Join(self, iterable);
12240 }
12241
12242 static Py_ssize_t
unicode_length(PyObject * self)12243 unicode_length(PyObject *self)
12244 {
12245 if (PyUnicode_READY(self) == -1)
12246 return -1;
12247 return PyUnicode_GET_LENGTH(self);
12248 }
12249
12250 /*[clinic input]
12251 str.ljust as unicode_ljust
12252
12253 width: Py_ssize_t
12254 fillchar: Py_UCS4 = ' '
12255 /
12256
12257 Return a left-justified string of length width.
12258
12259 Padding is done using the specified fill character (default is a space).
12260 [clinic start generated code]*/
12261
12262 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12263 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12264 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12265 {
12266 if (PyUnicode_READY(self) == -1)
12267 return NULL;
12268
12269 if (PyUnicode_GET_LENGTH(self) >= width)
12270 return unicode_result_unchanged(self);
12271
12272 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12273 }
12274
12275 /*[clinic input]
12276 str.lower as unicode_lower
12277
12278 Return a copy of the string converted to lowercase.
12279 [clinic start generated code]*/
12280
12281 static PyObject *
unicode_lower_impl(PyObject * self)12282 unicode_lower_impl(PyObject *self)
12283 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12284 {
12285 if (PyUnicode_READY(self) == -1)
12286 return NULL;
12287 if (PyUnicode_IS_ASCII(self))
12288 return ascii_upper_or_lower(self, 1);
12289 return case_operation(self, do_lower);
12290 }
12291
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above.  The table is only ever
   read, so both the strings and the array of pointers are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip selector i. */
#define STRIPNAME(i) (stripfuncnames[i])
12300
12301 /* externally visible for str.strip(unicode) */
12302 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12303 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12304 {
12305 void *data;
12306 int kind;
12307 Py_ssize_t i, j, len;
12308 BLOOM_MASK sepmask;
12309 Py_ssize_t seplen;
12310
12311 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12312 return NULL;
12313
12314 kind = PyUnicode_KIND(self);
12315 data = PyUnicode_DATA(self);
12316 len = PyUnicode_GET_LENGTH(self);
12317 seplen = PyUnicode_GET_LENGTH(sepobj);
12318 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12319 PyUnicode_DATA(sepobj),
12320 seplen);
12321
12322 i = 0;
12323 if (striptype != RIGHTSTRIP) {
12324 while (i < len) {
12325 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12326 if (!BLOOM(sepmask, ch))
12327 break;
12328 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12329 break;
12330 i++;
12331 }
12332 }
12333
12334 j = len;
12335 if (striptype != LEFTSTRIP) {
12336 j--;
12337 while (j >= i) {
12338 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12339 if (!BLOOM(sepmask, ch))
12340 break;
12341 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12342 break;
12343 j--;
12344 }
12345
12346 j++;
12347 }
12348
12349 return PyUnicode_Substring(self, i, j);
12350 }
12351
12352 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12353 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12354 {
12355 unsigned char *data;
12356 int kind;
12357 Py_ssize_t length;
12358
12359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
12361
12362 length = PyUnicode_GET_LENGTH(self);
12363 end = Py_MIN(end, length);
12364
12365 if (start == 0 && end == length)
12366 return unicode_result_unchanged(self);
12367
12368 if (start < 0 || end < 0) {
12369 PyErr_SetString(PyExc_IndexError, "string index out of range");
12370 return NULL;
12371 }
12372 if (start >= length || end < start)
12373 _Py_RETURN_UNICODE_EMPTY();
12374
12375 length = end - start;
12376 if (PyUnicode_IS_ASCII(self)) {
12377 data = PyUnicode_1BYTE_DATA(self);
12378 return _PyUnicode_FromASCII((char*)(data + start), length);
12379 }
12380 else {
12381 kind = PyUnicode_KIND(self);
12382 data = PyUnicode_1BYTE_DATA(self);
12383 return PyUnicode_FromKindAndData(kind,
12384 data + kind * start,
12385 length);
12386 }
12387 }
12388
12389 static PyObject *
do_strip(PyObject * self,int striptype)12390 do_strip(PyObject *self, int striptype)
12391 {
12392 Py_ssize_t len, i, j;
12393
12394 if (PyUnicode_READY(self) == -1)
12395 return NULL;
12396
12397 len = PyUnicode_GET_LENGTH(self);
12398
12399 if (PyUnicode_IS_ASCII(self)) {
12400 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12401
12402 i = 0;
12403 if (striptype != RIGHTSTRIP) {
12404 while (i < len) {
12405 Py_UCS1 ch = data[i];
12406 if (!_Py_ascii_whitespace[ch])
12407 break;
12408 i++;
12409 }
12410 }
12411
12412 j = len;
12413 if (striptype != LEFTSTRIP) {
12414 j--;
12415 while (j >= i) {
12416 Py_UCS1 ch = data[j];
12417 if (!_Py_ascii_whitespace[ch])
12418 break;
12419 j--;
12420 }
12421 j++;
12422 }
12423 }
12424 else {
12425 int kind = PyUnicode_KIND(self);
12426 void *data = PyUnicode_DATA(self);
12427
12428 i = 0;
12429 if (striptype != RIGHTSTRIP) {
12430 while (i < len) {
12431 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12432 if (!Py_UNICODE_ISSPACE(ch))
12433 break;
12434 i++;
12435 }
12436 }
12437
12438 j = len;
12439 if (striptype != LEFTSTRIP) {
12440 j--;
12441 while (j >= i) {
12442 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12443 if (!Py_UNICODE_ISSPACE(ch))
12444 break;
12445 j--;
12446 }
12447 j++;
12448 }
12449 }
12450
12451 return PyUnicode_Substring(self, i, j);
12452 }
12453
12454
12455 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12456 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12457 {
12458 if (sep != Py_None) {
12459 if (PyUnicode_Check(sep))
12460 return _PyUnicode_XStrip(self, striptype, sep);
12461 else {
12462 PyErr_Format(PyExc_TypeError,
12463 "%s arg must be None or str",
12464 STRIPNAME(striptype));
12465 return NULL;
12466 }
12467 }
12468
12469 return do_strip(self, striptype);
12470 }
12471
12472
12473 /*[clinic input]
12474 str.strip as unicode_strip
12475
12476 chars: object = None
12477 /
12478
12479 Return a copy of the string with leading and trailing whitespace removed.
12480
12481 If chars is given and not None, remove characters in chars instead.
12482 [clinic start generated code]*/
12483
12484 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12485 unicode_strip_impl(PyObject *self, PyObject *chars)
12486 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12487 {
12488 return do_argstrip(self, BOTHSTRIP, chars);
12489 }
12490
12491
12492 /*[clinic input]
12493 str.lstrip as unicode_lstrip
12494
12495 chars: object = None
12496 /
12497
12498 Return a copy of the string with leading whitespace removed.
12499
12500 If chars is given and not None, remove characters in chars instead.
12501 [clinic start generated code]*/
12502
12503 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12504 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12505 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12506 {
12507 return do_argstrip(self, LEFTSTRIP, chars);
12508 }
12509
12510
12511 /*[clinic input]
12512 str.rstrip as unicode_rstrip
12513
12514 chars: object = None
12515 /
12516
12517 Return a copy of the string with trailing whitespace removed.
12518
12519 If chars is given and not None, remove characters in chars instead.
12520 [clinic start generated code]*/
12521
12522 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12523 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12524 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12525 {
12526 return do_argstrip(self, RIGHTSTRIP, chars);
12527 }
12528
12529
12530 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12531 unicode_repeat(PyObject *str, Py_ssize_t len)
12532 {
12533 PyObject *u;
12534 Py_ssize_t nchars, n;
12535
12536 if (len < 1)
12537 _Py_RETURN_UNICODE_EMPTY();
12538
12539 /* no repeat, return original string */
12540 if (len == 1)
12541 return unicode_result_unchanged(str);
12542
12543 if (PyUnicode_READY(str) == -1)
12544 return NULL;
12545
12546 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12547 PyErr_SetString(PyExc_OverflowError,
12548 "repeated string is too long");
12549 return NULL;
12550 }
12551 nchars = len * PyUnicode_GET_LENGTH(str);
12552
12553 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12554 if (!u)
12555 return NULL;
12556 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12557
12558 if (PyUnicode_GET_LENGTH(str) == 1) {
12559 const int kind = PyUnicode_KIND(str);
12560 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12561 if (kind == PyUnicode_1BYTE_KIND) {
12562 void *to = PyUnicode_DATA(u);
12563 memset(to, (unsigned char)fill_char, len);
12564 }
12565 else if (kind == PyUnicode_2BYTE_KIND) {
12566 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12567 for (n = 0; n < len; ++n)
12568 ucs2[n] = fill_char;
12569 } else {
12570 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12571 assert(kind == PyUnicode_4BYTE_KIND);
12572 for (n = 0; n < len; ++n)
12573 ucs4[n] = fill_char;
12574 }
12575 }
12576 else {
12577 /* number of characters copied this far */
12578 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12579 const Py_ssize_t char_size = PyUnicode_KIND(str);
12580 char *to = (char *) PyUnicode_DATA(u);
12581 memcpy(to, PyUnicode_DATA(str),
12582 PyUnicode_GET_LENGTH(str) * char_size);
12583 while (done < nchars) {
12584 n = (done <= nchars-done) ? done : nchars-done;
12585 memcpy(to + (done * char_size), to, n * char_size);
12586 done += n;
12587 }
12588 }
12589
12590 assert(_PyUnicode_CheckConsistency(u, 1));
12591 return u;
12592 }
12593
12594 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12595 PyUnicode_Replace(PyObject *str,
12596 PyObject *substr,
12597 PyObject *replstr,
12598 Py_ssize_t maxcount)
12599 {
12600 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12601 ensure_unicode(replstr) < 0)
12602 return NULL;
12603 return replace(str, substr, replstr, maxcount);
12604 }
12605
12606 /*[clinic input]
12607 str.replace as unicode_replace
12608
12609 old: unicode
12610 new: unicode
12611 count: Py_ssize_t = -1
12612 Maximum number of occurrences to replace.
12613 -1 (the default value) means replace all occurrences.
12614 /
12615
12616 Return a copy with all occurrences of substring old replaced by new.
12617
12618 If the optional argument count is given, only the first count occurrences are
12619 replaced.
12620 [clinic start generated code]*/
12621
12622 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12623 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12624 Py_ssize_t count)
12625 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12626 {
12627 if (PyUnicode_READY(self) == -1)
12628 return NULL;
12629 return replace(self, old, new, count);
12630 }
12631
12632 static PyObject *
unicode_repr(PyObject * unicode)12633 unicode_repr(PyObject *unicode)
12634 {
12635 PyObject *repr;
12636 Py_ssize_t isize;
12637 Py_ssize_t osize, squote, dquote, i, o;
12638 Py_UCS4 max, quote;
12639 int ikind, okind, unchanged;
12640 void *idata, *odata;
12641
12642 if (PyUnicode_READY(unicode) == -1)
12643 return NULL;
12644
12645 isize = PyUnicode_GET_LENGTH(unicode);
12646 idata = PyUnicode_DATA(unicode);
12647
12648 /* Compute length of output, quote characters, and
12649 maximum character */
12650 osize = 0;
12651 max = 127;
12652 squote = dquote = 0;
12653 ikind = PyUnicode_KIND(unicode);
12654 for (i = 0; i < isize; i++) {
12655 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12656 Py_ssize_t incr = 1;
12657 switch (ch) {
12658 case '\'': squote++; break;
12659 case '"': dquote++; break;
12660 case '\\': case '\t': case '\r': case '\n':
12661 incr = 2;
12662 break;
12663 default:
12664 /* Fast-path ASCII */
12665 if (ch < ' ' || ch == 0x7f)
12666 incr = 4; /* \xHH */
12667 else if (ch < 0x7f)
12668 ;
12669 else if (Py_UNICODE_ISPRINTABLE(ch))
12670 max = ch > max ? ch : max;
12671 else if (ch < 0x100)
12672 incr = 4; /* \xHH */
12673 else if (ch < 0x10000)
12674 incr = 6; /* \uHHHH */
12675 else
12676 incr = 10; /* \uHHHHHHHH */
12677 }
12678 if (osize > PY_SSIZE_T_MAX - incr) {
12679 PyErr_SetString(PyExc_OverflowError,
12680 "string is too long to generate repr");
12681 return NULL;
12682 }
12683 osize += incr;
12684 }
12685
12686 quote = '\'';
12687 unchanged = (osize == isize);
12688 if (squote) {
12689 unchanged = 0;
12690 if (dquote)
12691 /* Both squote and dquote present. Use squote,
12692 and escape them */
12693 osize += squote;
12694 else
12695 quote = '"';
12696 }
12697 osize += 2; /* quotes */
12698
12699 repr = PyUnicode_New(osize, max);
12700 if (repr == NULL)
12701 return NULL;
12702 okind = PyUnicode_KIND(repr);
12703 odata = PyUnicode_DATA(repr);
12704
12705 PyUnicode_WRITE(okind, odata, 0, quote);
12706 PyUnicode_WRITE(okind, odata, osize-1, quote);
12707 if (unchanged) {
12708 _PyUnicode_FastCopyCharacters(repr, 1,
12709 unicode, 0,
12710 isize);
12711 }
12712 else {
12713 for (i = 0, o = 1; i < isize; i++) {
12714 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12715
12716 /* Escape quotes and backslashes */
12717 if ((ch == quote) || (ch == '\\')) {
12718 PyUnicode_WRITE(okind, odata, o++, '\\');
12719 PyUnicode_WRITE(okind, odata, o++, ch);
12720 continue;
12721 }
12722
12723 /* Map special whitespace to '\t', \n', '\r' */
12724 if (ch == '\t') {
12725 PyUnicode_WRITE(okind, odata, o++, '\\');
12726 PyUnicode_WRITE(okind, odata, o++, 't');
12727 }
12728 else if (ch == '\n') {
12729 PyUnicode_WRITE(okind, odata, o++, '\\');
12730 PyUnicode_WRITE(okind, odata, o++, 'n');
12731 }
12732 else if (ch == '\r') {
12733 PyUnicode_WRITE(okind, odata, o++, '\\');
12734 PyUnicode_WRITE(okind, odata, o++, 'r');
12735 }
12736
12737 /* Map non-printable US ASCII to '\xhh' */
12738 else if (ch < ' ' || ch == 0x7F) {
12739 PyUnicode_WRITE(okind, odata, o++, '\\');
12740 PyUnicode_WRITE(okind, odata, o++, 'x');
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12743 }
12744
12745 /* Copy ASCII characters as-is */
12746 else if (ch < 0x7F) {
12747 PyUnicode_WRITE(okind, odata, o++, ch);
12748 }
12749
12750 /* Non-ASCII characters */
12751 else {
12752 /* Map Unicode whitespace and control characters
12753 (categories Z* and C* except ASCII space)
12754 */
12755 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12756 PyUnicode_WRITE(okind, odata, o++, '\\');
12757 /* Map 8-bit characters to '\xhh' */
12758 if (ch <= 0xff) {
12759 PyUnicode_WRITE(okind, odata, o++, 'x');
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12762 }
12763 /* Map 16-bit characters to '\uxxxx' */
12764 else if (ch <= 0xffff) {
12765 PyUnicode_WRITE(okind, odata, o++, 'u');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12770 }
12771 /* Map 21-bit characters to '\U00xxxxxx' */
12772 else {
12773 PyUnicode_WRITE(okind, odata, o++, 'U');
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12782 }
12783 }
12784 /* Copy characters as-is */
12785 else {
12786 PyUnicode_WRITE(okind, odata, o++, ch);
12787 }
12788 }
12789 }
12790 }
12791 /* Closing quote already added at the beginning */
12792 assert(_PyUnicode_CheckConsistency(repr, 1));
12793 return repr;
12794 }
12795
12796 PyDoc_STRVAR(rfind__doc__,
12797 "S.rfind(sub[, start[, end]]) -> int\n\
12798 \n\
12799 Return the highest index in S where substring sub is found,\n\
12800 such that sub is contained within S[start:end]. Optional\n\
12801 arguments start and end are interpreted as in slice notation.\n\
12802 \n\
12803 Return -1 on failure.");
12804
12805 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12806 unicode_rfind(PyObject *self, PyObject *args)
12807 {
12808 /* initialize variables to prevent gcc warning */
12809 PyObject *substring = NULL;
12810 Py_ssize_t start = 0;
12811 Py_ssize_t end = 0;
12812 Py_ssize_t result;
12813
12814 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12815 return NULL;
12816
12817 if (PyUnicode_READY(self) == -1)
12818 return NULL;
12819
12820 result = any_find_slice(self, substring, start, end, -1);
12821
12822 if (result == -2)
12823 return NULL;
12824
12825 return PyLong_FromSsize_t(result);
12826 }
12827
12828 PyDoc_STRVAR(rindex__doc__,
12829 "S.rindex(sub[, start[, end]]) -> int\n\
12830 \n\
12831 Return the highest index in S where substring sub is found,\n\
12832 such that sub is contained within S[start:end]. Optional\n\
12833 arguments start and end are interpreted as in slice notation.\n\
12834 \n\
12835 Raises ValueError when the substring is not found.");
12836
12837 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12838 unicode_rindex(PyObject *self, PyObject *args)
12839 {
12840 /* initialize variables to prevent gcc warning */
12841 PyObject *substring = NULL;
12842 Py_ssize_t start = 0;
12843 Py_ssize_t end = 0;
12844 Py_ssize_t result;
12845
12846 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12847 return NULL;
12848
12849 if (PyUnicode_READY(self) == -1)
12850 return NULL;
12851
12852 result = any_find_slice(self, substring, start, end, -1);
12853
12854 if (result == -2)
12855 return NULL;
12856
12857 if (result < 0) {
12858 PyErr_SetString(PyExc_ValueError, "substring not found");
12859 return NULL;
12860 }
12861
12862 return PyLong_FromSsize_t(result);
12863 }
12864
12865 /*[clinic input]
12866 str.rjust as unicode_rjust
12867
12868 width: Py_ssize_t
12869 fillchar: Py_UCS4 = ' '
12870 /
12871
12872 Return a right-justified string of length width.
12873
12874 Padding is done using the specified fill character (default is a space).
12875 [clinic start generated code]*/
12876
12877 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12878 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12879 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12880 {
12881 if (PyUnicode_READY(self) == -1)
12882 return NULL;
12883
12884 if (PyUnicode_GET_LENGTH(self) >= width)
12885 return unicode_result_unchanged(self);
12886
12887 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12888 }
12889
12890 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12891 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12892 {
12893 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12894 return NULL;
12895
12896 return split(s, sep, maxsplit);
12897 }
12898
12899 /*[clinic input]
12900 str.split as unicode_split
12901
12902 sep: object = None
12903 The delimiter according which to split the string.
12904 None (the default value) means split according to any whitespace,
12905 and discard empty strings from the result.
12906 maxsplit: Py_ssize_t = -1
12907 Maximum number of splits to do.
12908 -1 (the default value) means no limit.
12909
12910 Return a list of the words in the string, using sep as the delimiter string.
12911 [clinic start generated code]*/
12912
12913 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12914 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12915 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12916 {
12917 if (sep == Py_None)
12918 return split(self, NULL, maxsplit);
12919 if (PyUnicode_Check(sep))
12920 return split(self, sep, maxsplit);
12921
12922 PyErr_Format(PyExc_TypeError,
12923 "must be str or None, not %.100s",
12924 Py_TYPE(sep)->tp_name);
12925 return NULL;
12926 }
12927
12928 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12929 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12930 {
12931 PyObject* out;
12932 int kind1, kind2;
12933 void *buf1, *buf2;
12934 Py_ssize_t len1, len2;
12935
12936 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12937 return NULL;
12938
12939 kind1 = PyUnicode_KIND(str_obj);
12940 kind2 = PyUnicode_KIND(sep_obj);
12941 len1 = PyUnicode_GET_LENGTH(str_obj);
12942 len2 = PyUnicode_GET_LENGTH(sep_obj);
12943 if (kind1 < kind2 || len1 < len2) {
12944 _Py_INCREF_UNICODE_EMPTY();
12945 if (!unicode_empty)
12946 out = NULL;
12947 else {
12948 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12949 Py_DECREF(unicode_empty);
12950 }
12951 return out;
12952 }
12953 buf1 = PyUnicode_DATA(str_obj);
12954 buf2 = PyUnicode_DATA(sep_obj);
12955 if (kind2 != kind1) {
12956 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12957 if (!buf2)
12958 return NULL;
12959 }
12960
12961 switch (kind1) {
12962 case PyUnicode_1BYTE_KIND:
12963 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12964 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 else
12966 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12967 break;
12968 case PyUnicode_2BYTE_KIND:
12969 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970 break;
12971 case PyUnicode_4BYTE_KIND:
12972 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12973 break;
12974 default:
12975 Py_UNREACHABLE();
12976 }
12977
12978 if (kind2 != kind1)
12979 PyMem_Free(buf2);
12980
12981 return out;
12982 }
12983
12984
12985 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12986 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12987 {
12988 PyObject* out;
12989 int kind1, kind2;
12990 void *buf1, *buf2;
12991 Py_ssize_t len1, len2;
12992
12993 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12994 return NULL;
12995
12996 kind1 = PyUnicode_KIND(str_obj);
12997 kind2 = PyUnicode_KIND(sep_obj);
12998 len1 = PyUnicode_GET_LENGTH(str_obj);
12999 len2 = PyUnicode_GET_LENGTH(sep_obj);
13000 if (kind1 < kind2 || len1 < len2) {
13001 _Py_INCREF_UNICODE_EMPTY();
13002 if (!unicode_empty)
13003 out = NULL;
13004 else {
13005 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13006 Py_DECREF(unicode_empty);
13007 }
13008 return out;
13009 }
13010 buf1 = PyUnicode_DATA(str_obj);
13011 buf2 = PyUnicode_DATA(sep_obj);
13012 if (kind2 != kind1) {
13013 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014 if (!buf2)
13015 return NULL;
13016 }
13017
13018 switch (kind1) {
13019 case PyUnicode_1BYTE_KIND:
13020 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 else
13023 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024 break;
13025 case PyUnicode_2BYTE_KIND:
13026 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027 break;
13028 case PyUnicode_4BYTE_KIND:
13029 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030 break;
13031 default:
13032 Py_UNREACHABLE();
13033 }
13034
13035 if (kind2 != kind1)
13036 PyMem_Free(buf2);
13037
13038 return out;
13039 }
13040
13041 /*[clinic input]
13042 str.partition as unicode_partition
13043
13044 sep: object
13045 /
13046
13047 Partition the string into three parts using the given separator.
13048
13049 This will search for the separator in the string. If the separator is found,
13050 returns a 3-tuple containing the part before the separator, the separator
13051 itself, and the part after it.
13052
13053 If the separator is not found, returns a 3-tuple containing the original string
13054 and two empty strings.
13055 [clinic start generated code]*/
13056
13057 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13058 unicode_partition(PyObject *self, PyObject *sep)
13059 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13060 {
13061 return PyUnicode_Partition(self, sep);
13062 }
13063
13064 /*[clinic input]
13065 str.rpartition as unicode_rpartition = str.partition
13066
13067 Partition the string into three parts using the given separator.
13068
13069 This will search for the separator in the string, starting at the end. If
13070 the separator is found, returns a 3-tuple containing the part before the
13071 separator, the separator itself, and the part after it.
13072
13073 If the separator is not found, returns a 3-tuple containing two empty strings
13074 and the original string.
13075 [clinic start generated code]*/
13076
13077 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13078 unicode_rpartition(PyObject *self, PyObject *sep)
13079 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13080 {
13081 return PyUnicode_RPartition(self, sep);
13082 }
13083
13084 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13085 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13086 {
13087 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13088 return NULL;
13089
13090 return rsplit(s, sep, maxsplit);
13091 }
13092
13093 /*[clinic input]
13094 str.rsplit as unicode_rsplit = str.split
13095
13096 Return a list of the words in the string, using sep as the delimiter string.
13097
13098 Splits are done starting at the end of the string and working to the front.
13099 [clinic start generated code]*/
13100
13101 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13102 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13103 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13104 {
13105 if (sep == Py_None)
13106 return rsplit(self, NULL, maxsplit);
13107 if (PyUnicode_Check(sep))
13108 return rsplit(self, sep, maxsplit);
13109
13110 PyErr_Format(PyExc_TypeError,
13111 "must be str or None, not %.100s",
13112 Py_TYPE(sep)->tp_name);
13113 return NULL;
13114 }
13115
13116 /*[clinic input]
13117 str.splitlines as unicode_splitlines
13118
13119 keepends: bool(accept={int}) = False
13120
13121 Return a list of the lines in the string, breaking at line boundaries.
13122
13123 Line breaks are not included in the resulting list unless keepends is given and
13124 true.
13125 [clinic start generated code]*/
13126
13127 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13128 unicode_splitlines_impl(PyObject *self, int keepends)
13129 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13130 {
13131 return PyUnicode_Splitlines(self, keepends);
13132 }
13133
13134 static
unicode_str(PyObject * self)13135 PyObject *unicode_str(PyObject *self)
13136 {
13137 return unicode_result_unchanged(self);
13138 }
13139
13140 /*[clinic input]
13141 str.swapcase as unicode_swapcase
13142
13143 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13144 [clinic start generated code]*/
13145
13146 static PyObject *
unicode_swapcase_impl(PyObject * self)13147 unicode_swapcase_impl(PyObject *self)
13148 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13149 {
13150 if (PyUnicode_READY(self) == -1)
13151 return NULL;
13152 return case_operation(self, do_swapcase);
13153 }
13154
13155 /*[clinic input]
13156
13157 @staticmethod
13158 str.maketrans as unicode_maketrans
13159
13160 x: object
13161
13162 y: unicode=NULL
13163
13164 z: unicode=NULL
13165
13166 /
13167
13168 Return a translation table usable for str.translate().
13169
13170 If there is only one argument, it must be a dictionary mapping Unicode
13171 ordinals (integers) or characters to Unicode ordinals, strings or None.
13172 Character keys will be then converted to ordinals.
13173 If there are two arguments, they must be strings of equal length, and
13174 in the resulting dictionary, each character in x will be mapped to the
13175 character at the same position in y. If there is a third argument, it
13176 must be a string, whose characters will be mapped to None in the result.
13177 [clinic start generated code]*/
13178
13179 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13180 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13181 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13182 {
13183 PyObject *new = NULL, *key, *value;
13184 Py_ssize_t i = 0;
13185 int res;
13186
13187 new = PyDict_New();
13188 if (!new)
13189 return NULL;
13190 if (y != NULL) {
13191 int x_kind, y_kind, z_kind;
13192 void *x_data, *y_data, *z_data;
13193
13194 /* x must be a string too, of equal length */
13195 if (!PyUnicode_Check(x)) {
13196 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13197 "be a string if there is a second argument");
13198 goto err;
13199 }
13200 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13201 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13202 "arguments must have equal length");
13203 goto err;
13204 }
13205 /* create entries for translating chars in x to those in y */
13206 x_kind = PyUnicode_KIND(x);
13207 y_kind = PyUnicode_KIND(y);
13208 x_data = PyUnicode_DATA(x);
13209 y_data = PyUnicode_DATA(y);
13210 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13211 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13212 if (!key)
13213 goto err;
13214 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13215 if (!value) {
13216 Py_DECREF(key);
13217 goto err;
13218 }
13219 res = PyDict_SetItem(new, key, value);
13220 Py_DECREF(key);
13221 Py_DECREF(value);
13222 if (res < 0)
13223 goto err;
13224 }
13225 /* create entries for deleting chars in z */
13226 if (z != NULL) {
13227 z_kind = PyUnicode_KIND(z);
13228 z_data = PyUnicode_DATA(z);
13229 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13230 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13231 if (!key)
13232 goto err;
13233 res = PyDict_SetItem(new, key, Py_None);
13234 Py_DECREF(key);
13235 if (res < 0)
13236 goto err;
13237 }
13238 }
13239 } else {
13240 int kind;
13241 void *data;
13242
13243 /* x must be a dict */
13244 if (!PyDict_CheckExact(x)) {
13245 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13246 "to maketrans it must be a dict");
13247 goto err;
13248 }
13249 /* copy entries into the new dict, converting string keys to int keys */
13250 while (PyDict_Next(x, &i, &key, &value)) {
13251 if (PyUnicode_Check(key)) {
13252 /* convert string keys to integer keys */
13253 PyObject *newkey;
13254 if (PyUnicode_GET_LENGTH(key) != 1) {
13255 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13256 "table must be of length 1");
13257 goto err;
13258 }
13259 kind = PyUnicode_KIND(key);
13260 data = PyUnicode_DATA(key);
13261 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13262 if (!newkey)
13263 goto err;
13264 res = PyDict_SetItem(new, newkey, value);
13265 Py_DECREF(newkey);
13266 if (res < 0)
13267 goto err;
13268 } else if (PyLong_Check(key)) {
13269 /* just keep integer keys */
13270 if (PyDict_SetItem(new, key, value) < 0)
13271 goto err;
13272 } else {
13273 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13274 "be strings or integers");
13275 goto err;
13276 }
13277 }
13278 }
13279 return new;
13280 err:
13281 Py_DECREF(new);
13282 return NULL;
13283 }
13284
13285 /*[clinic input]
13286 str.translate as unicode_translate
13287
13288 table: object
13289 Translation table, which must be a mapping of Unicode ordinals to
13290 Unicode ordinals, strings, or None.
13291 /
13292
13293 Replace each character in the string using the given translation table.
13294
13295 The table must implement lookup/indexing via __getitem__, for instance a
13296 dictionary or list. If this operation raises LookupError, the character is
13297 left untouched. Characters mapped to None are deleted.
13298 [clinic start generated code]*/
13299
13300 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13301 unicode_translate(PyObject *self, PyObject *table)
13302 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13303 {
13304 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13305 }
13306
13307 /*[clinic input]
13308 str.upper as unicode_upper
13309
13310 Return a copy of the string converted to uppercase.
13311 [clinic start generated code]*/
13312
13313 static PyObject *
unicode_upper_impl(PyObject * self)13314 unicode_upper_impl(PyObject *self)
13315 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13316 {
13317 if (PyUnicode_READY(self) == -1)
13318 return NULL;
13319 if (PyUnicode_IS_ASCII(self))
13320 return ascii_upper_or_lower(self, 0);
13321 return case_operation(self, do_upper);
13322 }
13323
13324 /*[clinic input]
13325 str.zfill as unicode_zfill
13326
13327 width: Py_ssize_t
13328 /
13329
13330 Pad a numeric string with zeros on the left, to fill a field of the given width.
13331
13332 The string is never truncated.
13333 [clinic start generated code]*/
13334
13335 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13336 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13337 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13338 {
13339 Py_ssize_t fill;
13340 PyObject *u;
13341 int kind;
13342 void *data;
13343 Py_UCS4 chr;
13344
13345 if (PyUnicode_READY(self) == -1)
13346 return NULL;
13347
13348 if (PyUnicode_GET_LENGTH(self) >= width)
13349 return unicode_result_unchanged(self);
13350
13351 fill = width - PyUnicode_GET_LENGTH(self);
13352
13353 u = pad(self, fill, 0, '0');
13354
13355 if (u == NULL)
13356 return NULL;
13357
13358 kind = PyUnicode_KIND(u);
13359 data = PyUnicode_DATA(u);
13360 chr = PyUnicode_READ(kind, data, fill);
13361
13362 if (chr == '+' || chr == '-') {
13363 /* move sign to beginning of string */
13364 PyUnicode_WRITE(kind, data, 0, chr);
13365 PyUnicode_WRITE(kind, data, fill, '0');
13366 }
13367
13368 assert(_PyUnicode_CheckConsistency(u, 1));
13369 return u;
13370 }
13371
#if 0
/* Disabled helper: compiled out by this #if 0.  Its "_decimal2ascii" entry
   in unicode_methods[] is likewise wrapped in #if 0, where it is described
   as being used only for debugging the implementation.  When enabled it
   simply returns PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13379
/* Docstring referenced by the "startswith" entry of unicode_methods[]. */
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
13387
13388 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13389 unicode_startswith(PyObject *self,
13390 PyObject *args)
13391 {
13392 PyObject *subobj;
13393 PyObject *substring;
13394 Py_ssize_t start = 0;
13395 Py_ssize_t end = PY_SSIZE_T_MAX;
13396 int result;
13397
13398 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13399 return NULL;
13400 if (PyTuple_Check(subobj)) {
13401 Py_ssize_t i;
13402 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13403 substring = PyTuple_GET_ITEM(subobj, i);
13404 if (!PyUnicode_Check(substring)) {
13405 PyErr_Format(PyExc_TypeError,
13406 "tuple for startswith must only contain str, "
13407 "not %.100s",
13408 Py_TYPE(substring)->tp_name);
13409 return NULL;
13410 }
13411 result = tailmatch(self, substring, start, end, -1);
13412 if (result == -1)
13413 return NULL;
13414 if (result) {
13415 Py_RETURN_TRUE;
13416 }
13417 }
13418 /* nothing matched */
13419 Py_RETURN_FALSE;
13420 }
13421 if (!PyUnicode_Check(subobj)) {
13422 PyErr_Format(PyExc_TypeError,
13423 "startswith first arg must be str or "
13424 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13425 return NULL;
13426 }
13427 result = tailmatch(self, subobj, start, end, -1);
13428 if (result == -1)
13429 return NULL;
13430 return PyBool_FromLong(result);
13431 }
13432
13433
/* Docstring referenced by the "endswith" entry of unicode_methods[]. */
PyDoc_STRVAR(endswith__doc__,
"S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13441
13442 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13443 unicode_endswith(PyObject *self,
13444 PyObject *args)
13445 {
13446 PyObject *subobj;
13447 PyObject *substring;
13448 Py_ssize_t start = 0;
13449 Py_ssize_t end = PY_SSIZE_T_MAX;
13450 int result;
13451
13452 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13453 return NULL;
13454 if (PyTuple_Check(subobj)) {
13455 Py_ssize_t i;
13456 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13457 substring = PyTuple_GET_ITEM(subobj, i);
13458 if (!PyUnicode_Check(substring)) {
13459 PyErr_Format(PyExc_TypeError,
13460 "tuple for endswith must only contain str, "
13461 "not %.100s",
13462 Py_TYPE(substring)->tp_name);
13463 return NULL;
13464 }
13465 result = tailmatch(self, substring, start, end, +1);
13466 if (result == -1)
13467 return NULL;
13468 if (result) {
13469 Py_RETURN_TRUE;
13470 }
13471 }
13472 Py_RETURN_FALSE;
13473 }
13474 if (!PyUnicode_Check(subobj)) {
13475 PyErr_Format(PyExc_TypeError,
13476 "endswith first arg must be str or "
13477 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13478 return NULL;
13479 }
13480 result = tailmatch(self, subobj, start, end, +1);
13481 if (result == -1)
13482 return NULL;
13483 return PyBool_FromLong(result);
13484 }
13485
13486 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13487 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13488 {
13489 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13490 writer->data = PyUnicode_DATA(writer->buffer);
13491
13492 if (!writer->readonly) {
13493 writer->kind = PyUnicode_KIND(writer->buffer);
13494 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13495 }
13496 else {
13497 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13498 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13499 writer->kind = PyUnicode_WCHAR_KIND;
13500 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13501
13502 /* Copy-on-write mode: set buffer size to 0 so
13503 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13504 * next write. */
13505 writer->size = 0;
13506 }
13507 }
13508
13509 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13510 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13511 {
13512 memset(writer, 0, sizeof(*writer));
13513
13514 /* ASCII is the bare minimum */
13515 writer->min_char = 127;
13516
13517 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13518 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13519 writer->kind = PyUnicode_WCHAR_KIND;
13520 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13521 }
13522
/* Slow path behind the _PyUnicodeWriter_Prepare macro: make writer->buffer
   able to receive `length` more characters at writer->pos, each of them no
   larger than `maxchar`.

   Three situations are handled:
     - no buffer yet: a new string is allocated;
     - buffer too short: it is either replaced by a new, wider string (when
       a larger maxchar is needed or the buffer is read-only) or resized in
       place with resize_compact();
     - buffer long enough but too narrow: it is replaced by a wider string
       of the same size.
   In every case the first writer->pos characters are preserved and the
   cached fields are refreshed through _PyUnicodeWriter_Update().

   Return 0 on success, -1 on failure.  PyErr_NoMemory() is raised here when
   pos + length would overflow; for the other failures the error is the one
   left by the failing allocation call (presumably already set -- confirm
   against PyUnicode_New()/resize_compact()). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard the addition below against Py_ssize_t overflow. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate narrower than the writer's configured minimum. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: first allocation. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the buffer is too short. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            /* The writer now owns a private copy, so it is writable. */
            writer->readonly = 0;
        }
        else {
            /* Same width: let resize_compact() produce the longer buffer;
               on failure writer->buffer is left as it was. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: long enough, but a wider character width is needed. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    _PyUnicodeWriter_Update(writer);
    return 0;

    /* NOTE(review): the matching #define of OVERALLOCATE_FACTOR is not in
       this function; presumably it lives earlier in the file -- confirm
       that nothing after this point still needs the macro. */
#undef OVERALLOCATE_FACTOR
}
13599
13600 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13601 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13602 enum PyUnicode_Kind kind)
13603 {
13604 Py_UCS4 maxchar;
13605
13606 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13607 assert(writer->kind < kind);
13608
13609 switch (kind)
13610 {
13611 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13612 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13613 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13614 default:
13615 Py_UNREACHABLE();
13616 }
13617
13618 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13619 }
13620
13621 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13622 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13623 {
13624 assert(ch <= MAX_UNICODE);
13625 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13626 return -1;
13627 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13628 writer->pos++;
13629 return 0;
13630 }
13631
13632 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13633 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13634 {
13635 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13636 }
13637
13638 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13639 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13640 {
13641 Py_UCS4 maxchar;
13642 Py_ssize_t len;
13643
13644 if (PyUnicode_READY(str) == -1)
13645 return -1;
13646 len = PyUnicode_GET_LENGTH(str);
13647 if (len == 0)
13648 return 0;
13649 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13650 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13651 if (writer->buffer == NULL && !writer->overallocate) {
13652 assert(_PyUnicode_CheckConsistency(str, 1));
13653 writer->readonly = 1;
13654 Py_INCREF(str);
13655 writer->buffer = str;
13656 _PyUnicodeWriter_Update(writer);
13657 writer->pos += len;
13658 return 0;
13659 }
13660 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13661 return -1;
13662 }
13663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13664 str, 0, len);
13665 writer->pos += len;
13666 return 0;
13667 }
13668
13669 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13670 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13671 Py_ssize_t start, Py_ssize_t end)
13672 {
13673 Py_UCS4 maxchar;
13674 Py_ssize_t len;
13675
13676 if (PyUnicode_READY(str) == -1)
13677 return -1;
13678
13679 assert(0 <= start);
13680 assert(end <= PyUnicode_GET_LENGTH(str));
13681 assert(start <= end);
13682
13683 if (end == 0)
13684 return 0;
13685
13686 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13687 return _PyUnicodeWriter_WriteStr(writer, str);
13688
13689 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13690 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13691 else
13692 maxchar = writer->maxchar;
13693 len = end - start;
13694
13695 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13696 return -1;
13697
13698 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13699 str, start, len);
13700 writer->pos += len;
13701 return 0;
13702 }
13703
13704 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13705 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13706 const char *ascii, Py_ssize_t len)
13707 {
13708 if (len == -1)
13709 len = strlen(ascii);
13710
13711 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13712
13713 if (writer->buffer == NULL && !writer->overallocate) {
13714 PyObject *str;
13715
13716 str = _PyUnicode_FromASCII(ascii, len);
13717 if (str == NULL)
13718 return -1;
13719
13720 writer->readonly = 1;
13721 writer->buffer = str;
13722 _PyUnicodeWriter_Update(writer);
13723 writer->pos += len;
13724 return 0;
13725 }
13726
13727 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13728 return -1;
13729
13730 switch (writer->kind)
13731 {
13732 case PyUnicode_1BYTE_KIND:
13733 {
13734 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13735 Py_UCS1 *data = writer->data;
13736
13737 memcpy(data + writer->pos, str, len);
13738 break;
13739 }
13740 case PyUnicode_2BYTE_KIND:
13741 {
13742 _PyUnicode_CONVERT_BYTES(
13743 Py_UCS1, Py_UCS2,
13744 ascii, ascii + len,
13745 (Py_UCS2 *)writer->data + writer->pos);
13746 break;
13747 }
13748 case PyUnicode_4BYTE_KIND:
13749 {
13750 _PyUnicode_CONVERT_BYTES(
13751 Py_UCS1, Py_UCS4,
13752 ascii, ascii + len,
13753 (Py_UCS4 *)writer->data + writer->pos);
13754 break;
13755 }
13756 default:
13757 Py_UNREACHABLE();
13758 }
13759
13760 writer->pos += len;
13761 return 0;
13762 }
13763
13764 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13765 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13766 const char *str, Py_ssize_t len)
13767 {
13768 Py_UCS4 maxchar;
13769
13770 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13771 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13772 return -1;
13773 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13774 writer->pos += len;
13775 return 0;
13776 }
13777
13778 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13779 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13780 {
13781 PyObject *str;
13782
13783 if (writer->pos == 0) {
13784 Py_CLEAR(writer->buffer);
13785 _Py_RETURN_UNICODE_EMPTY();
13786 }
13787
13788 str = writer->buffer;
13789 writer->buffer = NULL;
13790
13791 if (writer->readonly) {
13792 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13793 return str;
13794 }
13795
13796 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13797 PyObject *str2;
13798 str2 = resize_compact(str, writer->pos);
13799 if (str2 == NULL) {
13800 Py_DECREF(str);
13801 return NULL;
13802 }
13803 str = str2;
13804 }
13805
13806 assert(_PyUnicode_CheckConsistency(str, 1));
13807 return unicode_result_ready(str);
13808 }
13809
13810 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13811 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13812 {
13813 Py_CLEAR(writer->buffer);
13814 }
13815
13816 #include "stringlib/unicode_format.h"
13817
/* Docstring referenced by the "format" entry of unicode_methods[]. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
13823
/* Docstring referenced by the "format_map" entry of unicode_methods[]. */
PyDoc_STRVAR(format_map__doc__,
"S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13829
13830 /*[clinic input]
13831 str.__format__ as unicode___format__
13832
13833 format_spec: unicode
13834 /
13835
13836 Return a formatted version of the string as described by format_spec.
13837 [clinic start generated code]*/
13838
13839 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13840 unicode___format___impl(PyObject *self, PyObject *format_spec)
13841 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13842 {
13843 _PyUnicodeWriter writer;
13844 int ret;
13845
13846 if (PyUnicode_READY(self) == -1)
13847 return NULL;
13848 _PyUnicodeWriter_Init(&writer);
13849 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13850 self, format_spec, 0,
13851 PyUnicode_GET_LENGTH(format_spec));
13852 if (ret == -1) {
13853 _PyUnicodeWriter_Dealloc(&writer);
13854 return NULL;
13855 }
13856 return _PyUnicodeWriter_Finish(&writer);
13857 }
13858
13859 /*[clinic input]
13860 str.__sizeof__ as unicode_sizeof
13861
13862 Return the size of the string in memory, in bytes.
13863 [clinic start generated code]*/
13864
13865 static PyObject *
unicode_sizeof_impl(PyObject * self)13866 unicode_sizeof_impl(PyObject *self)
13867 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13868 {
13869 Py_ssize_t size;
13870
13871 /* If it's a compact object, account for base structure +
13872 character data. */
13873 if (PyUnicode_IS_COMPACT_ASCII(self))
13874 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13875 else if (PyUnicode_IS_COMPACT(self))
13876 size = sizeof(PyCompactUnicodeObject) +
13877 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13878 else {
13879 /* If it is a two-block object, account for base object, and
13880 for character block if present. */
13881 size = sizeof(PyUnicodeObject);
13882 if (_PyUnicode_DATA_ANY(self))
13883 size += (PyUnicode_GET_LENGTH(self) + 1) *
13884 PyUnicode_KIND(self);
13885 }
13886 /* If the wstr pointer is present, account for it unless it is shared
13887 with the data pointer. Check if the data is not shared. */
13888 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13889 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13890 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13891 size += PyUnicode_UTF8_LENGTH(self) + 1;
13892
13893 return PyLong_FromSsize_t(size);
13894 }
13895
13896 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13897 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13898 {
13899 PyObject *copy = _PyUnicode_Copy(v);
13900 if (!copy)
13901 return NULL;
13902 return Py_BuildValue("(N)", copy);
13903 }
13904
/* Method table of the str type.

   Entries come in two forms:
     - UNICODE_*_METHODDEF macros, defined outside this part of the file
       (presumably emitted by Argument Clinic for the functions that carry
       a [clinic input] block -- confirm);
     - explicit {name, function, flags, docstring} initializers for the
       hand-written METH_VARARGS / METH_KEYWORDS / METH_O functions.
   The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments as well, hence the cast
       through void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for this entry. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13961
13962 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13963 unicode_mod(PyObject *v, PyObject *w)
13964 {
13965 if (!PyUnicode_Check(v))
13966 Py_RETURN_NOTIMPLEMENTED;
13967 return PyUnicode_Format(v, w);
13968 }
13969
/* Number protocol of str.  Only nb_remainder is provided (unicode_mod);
   the three slots before it are explicitly 0 and every slot after it is
   left to C's zero-initialization of omitted members. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13976
/* Sequence protocol of str: length, concatenation, repetition, item access
   and containment.  The slice and assignment slots are 0; members after
   sq_contains are left to C's zero-initialization of omitted members. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13987
/* mp_subscript slot: self[item].

   - index-like item: converted with PyNumber_AsSsize_t() (IndexError on
     overflow), negative values are offset by the length, and the lookup is
     done by unicode_getitem();
   - slice: empty result, whole-string and step == 1 cases are handled by
     shortcuts; otherwise a new string is built character by character;
   - anything else: TypeError.
   Returns NULL with an exception set on failure. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyIndex_Check(item)) {
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
        /* -1 is also a legitimate index, so an error is only reported
           when an exception is actually pending. */
        if (i == -1 && PyErr_Occurred())
            return NULL;
        if (i < 0)
            i += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, i);
    } else if (PySlice_Check(item)) {
        Py_ssize_t start, stop, step, slicelength, i;
        /* NOTE(review): cur is unsigned while step is a signed Py_ssize_t;
           presumably a negative step relies on unsigned wrap-around when
           it is added to cur -- confirm before changing the type. */
        size_t cur;
        PyObject *result;
        void *src_data, *dest_data;
        int src_kind, dest_kind;
        Py_UCS4 ch, max_char, kind_limit;

        if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
            return NULL;
        }
        slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                            &start, &stop, step);

        if (slicelength <= 0) {
            _Py_RETURN_UNICODE_EMPTY();
        } else if (start == 0 && step == 1 &&
                   slicelength == PyUnicode_GET_LENGTH(self)) {
            /* The slice covers the whole string. */
            return unicode_result_unchanged(self);
        } else if (step == 1) {
            /* Contiguous range: plain substring. */
            return PyUnicode_Substring(self,
                                       start, start + slicelength);
        }
        /* General case */
        src_kind = PyUnicode_KIND(self);
        src_data = PyUnicode_DATA(self);
        if (!PyUnicode_IS_ASCII(self)) {
            /* First pass: find the largest selected character so that the
               result can be allocated with a fitting maxchar.  The scan
               stops early once kind_maxchar_limit(src_kind) is reached. */
            kind_limit = kind_maxchar_limit(src_kind);
            max_char = 0;
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
                ch = PyUnicode_READ(src_kind, src_data, cur);
                if (ch > max_char) {
                    max_char = ch;
                    if (max_char >= kind_limit)
                        break;
                }
            }
        }
        else
            max_char = 127;
        result = PyUnicode_New(slicelength, max_char);
        if (result == NULL)
            return NULL;
        dest_kind = PyUnicode_KIND(result);
        dest_data = PyUnicode_DATA(result);

        /* Second pass: copy the selected characters.  Note that this `ch`
           shadows the one declared at the top of the slice branch. */
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
        }
        assert(_PyUnicode_CheckConsistency(result, 1));
        return result;
    } else {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }
}
14058
/* Mapping protocol of str: length and subscription (unicode_subscript);
   item assignment is not provided (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14064
14065
14066 /* Helpers for PyUnicode_Format() */
14067
/* State shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument(s) being consumed.  unicode_format_getnextarg() indexes it
       with PyTuple_GetItem() when arglen >= 0 and returns it as a whole
       when arglen < 0. */
    PyObject *args;
    int args_owned;             /* presumably: non-zero if `args` holds an
                                   owned reference -- confirm */
    /* arglen: number of arguments, negative for the "single object" case
       described above.  argidx: index of the next argument to hand out;
       it is incremented by unicode_format_getnextarg(). */
    Py_ssize_t arglen, argidx;
    PyObject *dict;             /* presumably the mapping used for %(name)s
                                   lookups -- confirm */

    /* Format string being scanned.  Presumably fmtkind/fmtdata describe
       its storage, fmtpos is the read position and fmtcnt the number of
       characters left -- confirm against the users of these fields. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* embedded writer; presumably collects
                                   the formatted output -- confirm */
};
14081
/* One parsed conversion specification of a %-format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it to
                           PyOS_double_to_string() as the format code */
    int flags;          /* F_* bit flags; formatfloat() tests F_ALT */
    Py_ssize_t width;   /* presumably the minimum field width -- confirm */
    int prec;           /* precision; formatfloat() replaces a negative
                           value with 6 */
    int sign;           /* presumably tells whether sign handling applies
                           to this conversion -- confirm */
};
14089
14090 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14091 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14092 {
14093 Py_ssize_t argidx = ctx->argidx;
14094
14095 if (argidx < ctx->arglen) {
14096 ctx->argidx++;
14097 if (ctx->arglen < 0)
14098 return ctx->args;
14099 else
14100 return PyTuple_GetItem(ctx->args, argidx);
14101 }
14102 PyErr_SetString(PyExc_TypeError,
14103 "not enough arguments for format string");
14104 return NULL;
14105 }
14106
14107 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14108
14109 /* Format a float into the writer if the writer is not NULL, or into *p_output
14110 otherwise.
14111
14112 Return 0 on success, raise an exception and return -1 on error. */
14113 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14114 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14115 PyObject **p_output,
14116 _PyUnicodeWriter *writer)
14117 {
14118 char *p;
14119 double x;
14120 Py_ssize_t len;
14121 int prec;
14122 int dtoa_flags;
14123
14124 x = PyFloat_AsDouble(v);
14125 if (x == -1.0 && PyErr_Occurred())
14126 return -1;
14127
14128 prec = arg->prec;
14129 if (prec < 0)
14130 prec = 6;
14131
14132 if (arg->flags & F_ALT)
14133 dtoa_flags = Py_DTSF_ALT;
14134 else
14135 dtoa_flags = 0;
14136 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14137 if (p == NULL)
14138 return -1;
14139 len = strlen(p);
14140 if (writer) {
14141 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14142 PyMem_Free(p);
14143 return -1;
14144 }
14145 }
14146 else
14147 *p_output = _PyUnicode_FromASCII(p, len);
14148 PyMem_Free(p);
14149 return 0;
14150 }
14151
14152 /* formatlong() emulates the format codes d, u, o, x and X, and
14153 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14154 * Python's regular ints.
14155 * Return value: a new PyUnicodeObject*, or NULL if error.
14156 * The output string is of the form
14157 * "-"? ("0x" | "0X")? digit+
14158 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14159 * set in flags. The case of hex digits will be correct,
14160 * There will be at least prec digits, zero-filled on the left if
14161 * necessary to get that many.
14162 * val object to be converted
14163 * flags bitmask of format flags; only F_ALT is looked at
14164 * prec minimum number of digits; 0-fill on left if needed
14165 * type a character in [duoxX]; u acts the same as d
14166 *
14167 * CAUTION: o, x and X conversions on regular ints can never
14168 * produce a '-' sign, but can for Python's unbounded ints.
14169 */
14170 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14171 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14172 {
14173 PyObject *result = NULL;
14174 char *buf;
14175 Py_ssize_t i;
14176 int sign; /* 1 if '-', else 0 */
14177 int len; /* number of characters */
14178 Py_ssize_t llen;
14179 int numdigits; /* len == numnondigits + numdigits */
14180 int numnondigits = 0;
14181
14182 /* Avoid exceeding SSIZE_T_MAX */
14183 if (prec > INT_MAX-3) {
14184 PyErr_SetString(PyExc_OverflowError,
14185 "precision too large");
14186 return NULL;
14187 }
14188
14189 assert(PyLong_Check(val));
14190
14191 switch (type) {
14192 default:
14193 Py_UNREACHABLE();
14194 case 'd':
14195 case 'i':
14196 case 'u':
14197 /* int and int subclasses should print numerically when a numeric */
14198 /* format code is used (see issue18780) */
14199 result = PyNumber_ToBase(val, 10);
14200 break;
14201 case 'o':
14202 numnondigits = 2;
14203 result = PyNumber_ToBase(val, 8);
14204 break;
14205 case 'x':
14206 case 'X':
14207 numnondigits = 2;
14208 result = PyNumber_ToBase(val, 16);
14209 break;
14210 }
14211 if (!result)
14212 return NULL;
14213
14214 assert(unicode_modifiable(result));
14215 assert(PyUnicode_IS_READY(result));
14216 assert(PyUnicode_IS_ASCII(result));
14217
14218 /* To modify the string in-place, there can only be one reference. */
14219 if (Py_REFCNT(result) != 1) {
14220 Py_DECREF(result);
14221 PyErr_BadInternalCall();
14222 return NULL;
14223 }
14224 buf = PyUnicode_DATA(result);
14225 llen = PyUnicode_GET_LENGTH(result);
14226 if (llen > INT_MAX) {
14227 Py_DECREF(result);
14228 PyErr_SetString(PyExc_ValueError,
14229 "string too large in _PyUnicode_FormatLong");
14230 return NULL;
14231 }
14232 len = (int)llen;
14233 sign = buf[0] == '-';
14234 numnondigits += sign;
14235 numdigits = len - numnondigits;
14236 assert(numdigits > 0);
14237
14238 /* Get rid of base marker unless F_ALT */
14239 if (((alt) == 0 &&
14240 (type == 'o' || type == 'x' || type == 'X'))) {
14241 assert(buf[sign] == '0');
14242 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14243 buf[sign+1] == 'o');
14244 numnondigits -= 2;
14245 buf += 2;
14246 len -= 2;
14247 if (sign)
14248 buf[0] = '-';
14249 assert(len == numnondigits + numdigits);
14250 assert(numdigits > 0);
14251 }
14252
14253 /* Fill with leading zeroes to meet minimum width. */
14254 if (prec > numdigits) {
14255 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14256 numnondigits + prec);
14257 char *b1;
14258 if (!r1) {
14259 Py_DECREF(result);
14260 return NULL;
14261 }
14262 b1 = PyBytes_AS_STRING(r1);
14263 for (i = 0; i < numnondigits; ++i)
14264 *b1++ = *buf++;
14265 for (i = 0; i < prec - numdigits; i++)
14266 *b1++ = '0';
14267 for (i = 0; i < numdigits; i++)
14268 *b1++ = *buf++;
14269 *b1 = '\0';
14270 Py_DECREF(result);
14271 result = r1;
14272 buf = PyBytes_AS_STRING(result);
14273 len = numnondigits + prec;
14274 }
14275
14276 /* Fix up case for hex conversions. */
14277 if (type == 'X') {
14278 /* Need to convert all lower case letters to upper case.
14279 and need to convert 0x to 0X (and -0x to -0X). */
14280 for (i = 0; i < len; i++)
14281 if (buf[i] >= 'a' && buf[i] <= 'x')
14282 buf[i] -= 'a'-'A';
14283 }
14284 if (!PyUnicode_Check(result)
14285 || buf != PyUnicode_DATA(result)) {
14286 PyObject *unicode;
14287 unicode = _PyUnicode_FromASCII(buf, len);
14288 Py_DECREF(result);
14289 result = unicode;
14290 }
14291 else if (len != PyUnicode_GET_LENGTH(result)) {
14292 if (PyUnicode_Resize(&result, len) < 0)
14293 Py_CLEAR(result);
14294 }
14295 return result;
14296 }
14297
14298 /* Format an integer or a float as an integer.
14299 * Return 1 if the number has been formatted into the writer,
14300 * 0 if the number has been formatted into *p_output
14301 * -1 and raise an exception on error */
14302 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14303 mainformatlong(PyObject *v,
14304 struct unicode_format_arg_t *arg,
14305 PyObject **p_output,
14306 _PyUnicodeWriter *writer)
14307 {
14308 PyObject *iobj, *res;
14309 char type = (char)arg->ch;
14310
14311 if (!PyNumber_Check(v))
14312 goto wrongtype;
14313
14314 /* make sure number is a type of integer for o, x, and X */
14315 if (!PyLong_Check(v)) {
14316 if (type == 'o' || type == 'x' || type == 'X') {
14317 iobj = PyNumber_Index(v);
14318 if (iobj == NULL) {
14319 if (PyErr_ExceptionMatches(PyExc_TypeError))
14320 goto wrongtype;
14321 return -1;
14322 }
14323 }
14324 else {
14325 iobj = PyNumber_Long(v);
14326 if (iobj == NULL ) {
14327 if (PyErr_ExceptionMatches(PyExc_TypeError))
14328 goto wrongtype;
14329 return -1;
14330 }
14331 }
14332 assert(PyLong_Check(iobj));
14333 }
14334 else {
14335 iobj = v;
14336 Py_INCREF(iobj);
14337 }
14338
14339 if (PyLong_CheckExact(v)
14340 && arg->width == -1 && arg->prec == -1
14341 && !(arg->flags & (F_SIGN | F_BLANK))
14342 && type != 'X')
14343 {
14344 /* Fast path */
14345 int alternate = arg->flags & F_ALT;
14346 int base;
14347
14348 switch(type)
14349 {
14350 default:
14351 Py_UNREACHABLE();
14352 case 'd':
14353 case 'i':
14354 case 'u':
14355 base = 10;
14356 break;
14357 case 'o':
14358 base = 8;
14359 break;
14360 case 'x':
14361 case 'X':
14362 base = 16;
14363 break;
14364 }
14365
14366 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14367 Py_DECREF(iobj);
14368 return -1;
14369 }
14370 Py_DECREF(iobj);
14371 return 1;
14372 }
14373
14374 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14375 Py_DECREF(iobj);
14376 if (res == NULL)
14377 return -1;
14378 *p_output = res;
14379 return 0;
14380
14381 wrongtype:
14382 switch(type)
14383 {
14384 case 'o':
14385 case 'x':
14386 case 'X':
14387 PyErr_Format(PyExc_TypeError,
14388 "%%%c format: an integer is required, "
14389 "not %.200s",
14390 type, Py_TYPE(v)->tp_name);
14391 break;
14392 default:
14393 PyErr_Format(PyExc_TypeError,
14394 "%%%c format: a number is required, "
14395 "not %.200s",
14396 type, Py_TYPE(v)->tp_name);
14397 break;
14398 }
14399 return -1;
14400 }
14401
14402 static Py_UCS4
formatchar(PyObject * v)14403 formatchar(PyObject *v)
14404 {
14405 /* presume that the buffer is at least 3 characters long */
14406 if (PyUnicode_Check(v)) {
14407 if (PyUnicode_GET_LENGTH(v) == 1) {
14408 return PyUnicode_READ_CHAR(v, 0);
14409 }
14410 goto onError;
14411 }
14412 else {
14413 PyObject *iobj;
14414 long x;
14415 /* make sure number is a type of integer */
14416 if (!PyLong_Check(v)) {
14417 iobj = PyNumber_Index(v);
14418 if (iobj == NULL) {
14419 goto onError;
14420 }
14421 x = PyLong_AsLong(iobj);
14422 Py_DECREF(iobj);
14423 }
14424 else {
14425 x = PyLong_AsLong(v);
14426 }
14427 if (x == -1 && PyErr_Occurred())
14428 goto onError;
14429
14430 if (x < 0 || x > MAX_UNICODE) {
14431 PyErr_SetString(PyExc_OverflowError,
14432 "%c arg not in range(0x110000)");
14433 return (Py_UCS4) -1;
14434 }
14435
14436 return (Py_UCS4) x;
14437 }
14438
14439 onError:
14440 PyErr_SetString(PyExc_TypeError,
14441 "%c requires int or char");
14442 return (Py_UCS4) -1;
14443 }
14444
14445 /* Parse options of an argument: flags, width, precision.
14446 Handle also "%(name)" syntax.
14447
14448 Return 0 if the argument has been formatted into arg->str.
14449 Return 1 if the argument has been written into ctx->writer,
14450 Raise an exception and return -1 on error. */
14451 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14452 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14453 struct unicode_format_arg_t *arg)
14454 {
14455 #define FORMAT_READ(ctx) \
14456 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14457
14458 PyObject *v;
14459
14460 if (arg->ch == '(') {
14461 /* Get argument value from a dictionary. Example: "%(name)s". */
14462 Py_ssize_t keystart;
14463 Py_ssize_t keylen;
14464 PyObject *key;
14465 int pcount = 1;
14466
14467 if (ctx->dict == NULL) {
14468 PyErr_SetString(PyExc_TypeError,
14469 "format requires a mapping");
14470 return -1;
14471 }
14472 ++ctx->fmtpos;
14473 --ctx->fmtcnt;
14474 keystart = ctx->fmtpos;
14475 /* Skip over balanced parentheses */
14476 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14477 arg->ch = FORMAT_READ(ctx);
14478 if (arg->ch == ')')
14479 --pcount;
14480 else if (arg->ch == '(')
14481 ++pcount;
14482 ctx->fmtpos++;
14483 }
14484 keylen = ctx->fmtpos - keystart - 1;
14485 if (ctx->fmtcnt < 0 || pcount > 0) {
14486 PyErr_SetString(PyExc_ValueError,
14487 "incomplete format key");
14488 return -1;
14489 }
14490 key = PyUnicode_Substring(ctx->fmtstr,
14491 keystart, keystart + keylen);
14492 if (key == NULL)
14493 return -1;
14494 if (ctx->args_owned) {
14495 ctx->args_owned = 0;
14496 Py_DECREF(ctx->args);
14497 }
14498 ctx->args = PyObject_GetItem(ctx->dict, key);
14499 Py_DECREF(key);
14500 if (ctx->args == NULL)
14501 return -1;
14502 ctx->args_owned = 1;
14503 ctx->arglen = -1;
14504 ctx->argidx = -2;
14505 }
14506
14507 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14508 while (--ctx->fmtcnt >= 0) {
14509 arg->ch = FORMAT_READ(ctx);
14510 ctx->fmtpos++;
14511 switch (arg->ch) {
14512 case '-': arg->flags |= F_LJUST; continue;
14513 case '+': arg->flags |= F_SIGN; continue;
14514 case ' ': arg->flags |= F_BLANK; continue;
14515 case '#': arg->flags |= F_ALT; continue;
14516 case '0': arg->flags |= F_ZERO; continue;
14517 }
14518 break;
14519 }
14520
14521 /* Parse width. Example: "%10s" => width=10 */
14522 if (arg->ch == '*') {
14523 v = unicode_format_getnextarg(ctx);
14524 if (v == NULL)
14525 return -1;
14526 if (!PyLong_Check(v)) {
14527 PyErr_SetString(PyExc_TypeError,
14528 "* wants int");
14529 return -1;
14530 }
14531 arg->width = PyLong_AsSsize_t(v);
14532 if (arg->width == -1 && PyErr_Occurred())
14533 return -1;
14534 if (arg->width < 0) {
14535 arg->flags |= F_LJUST;
14536 arg->width = -arg->width;
14537 }
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 }
14543 else if (arg->ch >= '0' && arg->ch <= '9') {
14544 arg->width = arg->ch - '0';
14545 while (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 if (arg->ch < '0' || arg->ch > '9')
14549 break;
14550 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14551 mixing signed and unsigned comparison. Since arg->ch is between
14552 '0' and '9', casting to int is safe. */
14553 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14554 PyErr_SetString(PyExc_ValueError,
14555 "width too big");
14556 return -1;
14557 }
14558 arg->width = arg->width*10 + (arg->ch - '0');
14559 }
14560 }
14561
14562 /* Parse precision. Example: "%.3f" => prec=3 */
14563 if (arg->ch == '.') {
14564 arg->prec = 0;
14565 if (--ctx->fmtcnt >= 0) {
14566 arg->ch = FORMAT_READ(ctx);
14567 ctx->fmtpos++;
14568 }
14569 if (arg->ch == '*') {
14570 v = unicode_format_getnextarg(ctx);
14571 if (v == NULL)
14572 return -1;
14573 if (!PyLong_Check(v)) {
14574 PyErr_SetString(PyExc_TypeError,
14575 "* wants int");
14576 return -1;
14577 }
14578 arg->prec = _PyLong_AsInt(v);
14579 if (arg->prec == -1 && PyErr_Occurred())
14580 return -1;
14581 if (arg->prec < 0)
14582 arg->prec = 0;
14583 if (--ctx->fmtcnt >= 0) {
14584 arg->ch = FORMAT_READ(ctx);
14585 ctx->fmtpos++;
14586 }
14587 }
14588 else if (arg->ch >= '0' && arg->ch <= '9') {
14589 arg->prec = arg->ch - '0';
14590 while (--ctx->fmtcnt >= 0) {
14591 arg->ch = FORMAT_READ(ctx);
14592 ctx->fmtpos++;
14593 if (arg->ch < '0' || arg->ch > '9')
14594 break;
14595 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14596 PyErr_SetString(PyExc_ValueError,
14597 "precision too big");
14598 return -1;
14599 }
14600 arg->prec = arg->prec*10 + (arg->ch - '0');
14601 }
14602 }
14603 }
14604
14605 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14606 if (ctx->fmtcnt >= 0) {
14607 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14608 if (--ctx->fmtcnt >= 0) {
14609 arg->ch = FORMAT_READ(ctx);
14610 ctx->fmtpos++;
14611 }
14612 }
14613 }
14614 if (ctx->fmtcnt < 0) {
14615 PyErr_SetString(PyExc_ValueError,
14616 "incomplete format");
14617 return -1;
14618 }
14619 return 0;
14620
14621 #undef FORMAT_READ
14622 }
14623
14624 /* Format one argument. Supported conversion specifiers:
14625
14626 - "s", "r", "a": any type
14627 - "i", "d", "u": int or float
14628 - "o", "x", "X": int
14629 - "e", "E", "f", "F", "g", "G": float
14630 - "c": int or str (1 character)
14631
14632 When possible, the output is written directly into the Unicode writer
14633 (ctx->writer). A string is created when padding is required.
14634
14635 Return 0 if the argument has been formatted into *p_str,
14636 1 if the argument has been written into ctx->writer,
14637 -1 on error. */
14638 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14639 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14640 struct unicode_format_arg_t *arg,
14641 PyObject **p_str)
14642 {
14643 PyObject *v;
14644 _PyUnicodeWriter *writer = &ctx->writer;
14645
14646 if (ctx->fmtcnt == 0)
14647 ctx->writer.overallocate = 0;
14648
14649 v = unicode_format_getnextarg(ctx);
14650 if (v == NULL)
14651 return -1;
14652
14653
14654 switch (arg->ch) {
14655 case 's':
14656 case 'r':
14657 case 'a':
14658 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14659 /* Fast path */
14660 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14661 return -1;
14662 return 1;
14663 }
14664
14665 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14666 *p_str = v;
14667 Py_INCREF(*p_str);
14668 }
14669 else {
14670 if (arg->ch == 's')
14671 *p_str = PyObject_Str(v);
14672 else if (arg->ch == 'r')
14673 *p_str = PyObject_Repr(v);
14674 else
14675 *p_str = PyObject_ASCII(v);
14676 }
14677 break;
14678
14679 case 'i':
14680 case 'd':
14681 case 'u':
14682 case 'o':
14683 case 'x':
14684 case 'X':
14685 {
14686 int ret = mainformatlong(v, arg, p_str, writer);
14687 if (ret != 0)
14688 return ret;
14689 arg->sign = 1;
14690 break;
14691 }
14692
14693 case 'e':
14694 case 'E':
14695 case 'f':
14696 case 'F':
14697 case 'g':
14698 case 'G':
14699 if (arg->width == -1 && arg->prec == -1
14700 && !(arg->flags & (F_SIGN | F_BLANK)))
14701 {
14702 /* Fast path */
14703 if (formatfloat(v, arg, NULL, writer) == -1)
14704 return -1;
14705 return 1;
14706 }
14707
14708 arg->sign = 1;
14709 if (formatfloat(v, arg, p_str, NULL) == -1)
14710 return -1;
14711 break;
14712
14713 case 'c':
14714 {
14715 Py_UCS4 ch = formatchar(v);
14716 if (ch == (Py_UCS4) -1)
14717 return -1;
14718 if (arg->width == -1 && arg->prec == -1) {
14719 /* Fast path */
14720 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14721 return -1;
14722 return 1;
14723 }
14724 *p_str = PyUnicode_FromOrdinal(ch);
14725 break;
14726 }
14727
14728 default:
14729 PyErr_Format(PyExc_ValueError,
14730 "unsupported format character '%c' (0x%x) "
14731 "at index %zd",
14732 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14733 (int)arg->ch,
14734 ctx->fmtpos - 1);
14735 return -1;
14736 }
14737 if (*p_str == NULL)
14738 return -1;
14739 assert (PyUnicode_Check(*p_str));
14740 return 0;
14741 }
14742
14743 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)14744 unicode_format_arg_output(struct unicode_formatter_t *ctx,
14745 struct unicode_format_arg_t *arg,
14746 PyObject *str)
14747 {
14748 Py_ssize_t len;
14749 enum PyUnicode_Kind kind;
14750 void *pbuf;
14751 Py_ssize_t pindex;
14752 Py_UCS4 signchar;
14753 Py_ssize_t buflen;
14754 Py_UCS4 maxchar;
14755 Py_ssize_t sublen;
14756 _PyUnicodeWriter *writer = &ctx->writer;
14757 Py_UCS4 fill;
14758
14759 fill = ' ';
14760 if (arg->sign && arg->flags & F_ZERO)
14761 fill = '0';
14762
14763 if (PyUnicode_READY(str) == -1)
14764 return -1;
14765
14766 len = PyUnicode_GET_LENGTH(str);
14767 if ((arg->width == -1 || arg->width <= len)
14768 && (arg->prec == -1 || arg->prec >= len)
14769 && !(arg->flags & (F_SIGN | F_BLANK)))
14770 {
14771 /* Fast path */
14772 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14773 return -1;
14774 return 0;
14775 }
14776
14777 /* Truncate the string for "s", "r" and "a" formats
14778 if the precision is set */
14779 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14780 if (arg->prec >= 0 && len > arg->prec)
14781 len = arg->prec;
14782 }
14783
14784 /* Adjust sign and width */
14785 kind = PyUnicode_KIND(str);
14786 pbuf = PyUnicode_DATA(str);
14787 pindex = 0;
14788 signchar = '\0';
14789 if (arg->sign) {
14790 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14791 if (ch == '-' || ch == '+') {
14792 signchar = ch;
14793 len--;
14794 pindex++;
14795 }
14796 else if (arg->flags & F_SIGN)
14797 signchar = '+';
14798 else if (arg->flags & F_BLANK)
14799 signchar = ' ';
14800 else
14801 arg->sign = 0;
14802 }
14803 if (arg->width < len)
14804 arg->width = len;
14805
14806 /* Prepare the writer */
14807 maxchar = writer->maxchar;
14808 if (!(arg->flags & F_LJUST)) {
14809 if (arg->sign) {
14810 if ((arg->width-1) > len)
14811 maxchar = Py_MAX(maxchar, fill);
14812 }
14813 else {
14814 if (arg->width > len)
14815 maxchar = Py_MAX(maxchar, fill);
14816 }
14817 }
14818 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14819 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14820 maxchar = Py_MAX(maxchar, strmaxchar);
14821 }
14822
14823 buflen = arg->width;
14824 if (arg->sign && len == arg->width)
14825 buflen++;
14826 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14827 return -1;
14828
14829 /* Write the sign if needed */
14830 if (arg->sign) {
14831 if (fill != ' ') {
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14833 writer->pos += 1;
14834 }
14835 if (arg->width > len)
14836 arg->width--;
14837 }
14838
14839 /* Write the numeric prefix for "x", "X" and "o" formats
14840 if the alternate form is used.
14841 For example, write "0x" for the "%#x" format. */
14842 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14843 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14844 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14845 if (fill != ' ') {
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14847 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14848 writer->pos += 2;
14849 pindex += 2;
14850 }
14851 arg->width -= 2;
14852 if (arg->width < 0)
14853 arg->width = 0;
14854 len -= 2;
14855 }
14856
14857 /* Pad left with the fill character if needed */
14858 if (arg->width > len && !(arg->flags & F_LJUST)) {
14859 sublen = arg->width - len;
14860 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
14861 writer->pos += sublen;
14862 arg->width = len;
14863 }
14864
14865 /* If padding with spaces: write sign if needed and/or numeric prefix if
14866 the alternate form is used */
14867 if (fill == ' ') {
14868 if (arg->sign) {
14869 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14870 writer->pos += 1;
14871 }
14872 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14873 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14874 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14875 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14876 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14877 writer->pos += 2;
14878 pindex += 2;
14879 }
14880 }
14881
14882 /* Write characters */
14883 if (len) {
14884 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14885 str, pindex, len);
14886 writer->pos += len;
14887 }
14888
14889 /* Pad right with the fill character if needed */
14890 if (arg->width > len) {
14891 sublen = arg->width - len;
14892 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
14893 writer->pos += sublen;
14894 }
14895 return 0;
14896 }
14897
14898 /* Helper of PyUnicode_Format(): format one arg.
14899 Return 0 on success, raise an exception and return -1 on error. */
14900 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14901 unicode_format_arg(struct unicode_formatter_t *ctx)
14902 {
14903 struct unicode_format_arg_t arg;
14904 PyObject *str;
14905 int ret;
14906
14907 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14908 if (arg.ch == '%') {
14909 ctx->fmtpos++;
14910 ctx->fmtcnt--;
14911 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14912 return -1;
14913 return 0;
14914 }
14915 arg.flags = 0;
14916 arg.width = -1;
14917 arg.prec = -1;
14918 arg.sign = 0;
14919 str = NULL;
14920
14921 ret = unicode_format_arg_parse(ctx, &arg);
14922 if (ret == -1)
14923 return -1;
14924
14925 ret = unicode_format_arg_format(ctx, &arg, &str);
14926 if (ret == -1)
14927 return -1;
14928
14929 if (ret != 1) {
14930 ret = unicode_format_arg_output(ctx, &arg, str);
14931 Py_DECREF(str);
14932 if (ret == -1)
14933 return -1;
14934 }
14935
14936 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14937 PyErr_SetString(PyExc_TypeError,
14938 "not all arguments converted during string formatting");
14939 return -1;
14940 }
14941 return 0;
14942 }
14943
14944 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14945 PyUnicode_Format(PyObject *format, PyObject *args)
14946 {
14947 struct unicode_formatter_t ctx;
14948
14949 if (format == NULL || args == NULL) {
14950 PyErr_BadInternalCall();
14951 return NULL;
14952 }
14953
14954 if (ensure_unicode(format) < 0)
14955 return NULL;
14956
14957 ctx.fmtstr = format;
14958 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14959 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14960 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14961 ctx.fmtpos = 0;
14962
14963 _PyUnicodeWriter_Init(&ctx.writer);
14964 ctx.writer.min_length = ctx.fmtcnt + 100;
14965 ctx.writer.overallocate = 1;
14966
14967 if (PyTuple_Check(args)) {
14968 ctx.arglen = PyTuple_Size(args);
14969 ctx.argidx = 0;
14970 }
14971 else {
14972 ctx.arglen = -1;
14973 ctx.argidx = -2;
14974 }
14975 ctx.args_owned = 0;
14976 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14977 ctx.dict = args;
14978 else
14979 ctx.dict = NULL;
14980 ctx.args = args;
14981
14982 while (--ctx.fmtcnt >= 0) {
14983 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14984 Py_ssize_t nonfmtpos;
14985
14986 nonfmtpos = ctx.fmtpos++;
14987 while (ctx.fmtcnt >= 0 &&
14988 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14989 ctx.fmtpos++;
14990 ctx.fmtcnt--;
14991 }
14992 if (ctx.fmtcnt < 0) {
14993 ctx.fmtpos--;
14994 ctx.writer.overallocate = 0;
14995 }
14996
14997 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14998 nonfmtpos, ctx.fmtpos) < 0)
14999 goto onError;
15000 }
15001 else {
15002 ctx.fmtpos++;
15003 if (unicode_format_arg(&ctx) == -1)
15004 goto onError;
15005 }
15006 }
15007
15008 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15009 PyErr_SetString(PyExc_TypeError,
15010 "not all arguments converted during string formatting");
15011 goto onError;
15012 }
15013
15014 if (ctx.args_owned) {
15015 Py_DECREF(ctx.args);
15016 }
15017 return _PyUnicodeWriter_Finish(&ctx.writer);
15018
15019 onError:
15020 _PyUnicodeWriter_Dealloc(&ctx.writer);
15021 if (ctx.args_owned) {
15022 Py_DECREF(ctx.args);
15023 }
15024 return NULL;
15025 }
15026
15027 static PyObject *
15028 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15029
15030 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15031 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15032 {
15033 PyObject *x = NULL;
15034 static char *kwlist[] = {"object", "encoding", "errors", 0};
15035 char *encoding = NULL;
15036 char *errors = NULL;
15037
15038 if (type != &PyUnicode_Type)
15039 return unicode_subtype_new(type, args, kwds);
15040 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15041 kwlist, &x, &encoding, &errors))
15042 return NULL;
15043 if (x == NULL)
15044 _Py_RETURN_UNICODE_EMPTY();
15045 if (encoding == NULL && errors == NULL)
15046 return PyObject_Str(x);
15047 else
15048 return PyUnicode_FromEncodedObject(x, encoding, errors);
15049 }
15050
15051 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15052 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15053 {
15054 PyObject *unicode, *self;
15055 Py_ssize_t length, char_size;
15056 int share_wstr, share_utf8;
15057 unsigned int kind;
15058 void *data;
15059
15060 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15061
15062 unicode = unicode_new(&PyUnicode_Type, args, kwds);
15063 if (unicode == NULL)
15064 return NULL;
15065 assert(_PyUnicode_CHECK(unicode));
15066 if (PyUnicode_READY(unicode) == -1) {
15067 Py_DECREF(unicode);
15068 return NULL;
15069 }
15070
15071 self = type->tp_alloc(type, 0);
15072 if (self == NULL) {
15073 Py_DECREF(unicode);
15074 return NULL;
15075 }
15076 kind = PyUnicode_KIND(unicode);
15077 length = PyUnicode_GET_LENGTH(unicode);
15078
15079 _PyUnicode_LENGTH(self) = length;
15080 #ifdef Py_DEBUG
15081 _PyUnicode_HASH(self) = -1;
15082 #else
15083 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15084 #endif
15085 _PyUnicode_STATE(self).interned = 0;
15086 _PyUnicode_STATE(self).kind = kind;
15087 _PyUnicode_STATE(self).compact = 0;
15088 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15089 _PyUnicode_STATE(self).ready = 1;
15090 _PyUnicode_WSTR(self) = NULL;
15091 _PyUnicode_UTF8_LENGTH(self) = 0;
15092 _PyUnicode_UTF8(self) = NULL;
15093 _PyUnicode_WSTR_LENGTH(self) = 0;
15094 _PyUnicode_DATA_ANY(self) = NULL;
15095
15096 share_utf8 = 0;
15097 share_wstr = 0;
15098 if (kind == PyUnicode_1BYTE_KIND) {
15099 char_size = 1;
15100 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15101 share_utf8 = 1;
15102 }
15103 else if (kind == PyUnicode_2BYTE_KIND) {
15104 char_size = 2;
15105 if (sizeof(wchar_t) == 2)
15106 share_wstr = 1;
15107 }
15108 else {
15109 assert(kind == PyUnicode_4BYTE_KIND);
15110 char_size = 4;
15111 if (sizeof(wchar_t) == 4)
15112 share_wstr = 1;
15113 }
15114
15115 /* Ensure we won't overflow the length. */
15116 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15117 PyErr_NoMemory();
15118 goto onError;
15119 }
15120 data = PyObject_MALLOC((length + 1) * char_size);
15121 if (data == NULL) {
15122 PyErr_NoMemory();
15123 goto onError;
15124 }
15125
15126 _PyUnicode_DATA_ANY(self) = data;
15127 if (share_utf8) {
15128 _PyUnicode_UTF8_LENGTH(self) = length;
15129 _PyUnicode_UTF8(self) = data;
15130 }
15131 if (share_wstr) {
15132 _PyUnicode_WSTR_LENGTH(self) = length;
15133 _PyUnicode_WSTR(self) = (wchar_t *)data;
15134 }
15135
15136 memcpy(data, PyUnicode_DATA(unicode),
15137 kind * (length + 1));
15138 assert(_PyUnicode_CheckConsistency(self, 1));
15139 #ifdef Py_DEBUG
15140 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15141 #endif
15142 Py_DECREF(unicode);
15143 return self;
15144
15145 onError:
15146 Py_DECREF(unicode);
15147 Py_DECREF(self);
15148 return NULL;
15149 }
15150
15151 PyDoc_STRVAR(unicode_doc,
15152 "str(object='') -> str\n\
15153 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15154 \n\
15155 Create a new string object from the given object. If encoding or\n\
15156 errors is specified, then the object must expose a data buffer\n\
15157 that will be decoded using the given encoding and error handler.\n\
15158 Otherwise, returns the result of object.__str__() (if defined)\n\
15159 or repr(object).\n\
15160 encoding defaults to sys.getdefaultencoding().\n\
15161 errors defaults to 'strict'.");
15162
15163 static PyObject *unicode_iter(PyObject *seq);
15164
15165 PyTypeObject PyUnicode_Type = {
15166 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15167 "str", /* tp_name */
15168 sizeof(PyUnicodeObject), /* tp_basicsize */
15169 0, /* tp_itemsize */
15170 /* Slots */
15171 (destructor)unicode_dealloc, /* tp_dealloc */
15172 0, /* tp_vectorcall_offset */
15173 0, /* tp_getattr */
15174 0, /* tp_setattr */
15175 0, /* tp_as_async */
15176 unicode_repr, /* tp_repr */
15177 &unicode_as_number, /* tp_as_number */
15178 &unicode_as_sequence, /* tp_as_sequence */
15179 &unicode_as_mapping, /* tp_as_mapping */
15180 (hashfunc) unicode_hash, /* tp_hash*/
15181 0, /* tp_call*/
15182 (reprfunc) unicode_str, /* tp_str */
15183 PyObject_GenericGetAttr, /* tp_getattro */
15184 0, /* tp_setattro */
15185 0, /* tp_as_buffer */
15186 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15187 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15188 unicode_doc, /* tp_doc */
15189 0, /* tp_traverse */
15190 0, /* tp_clear */
15191 PyUnicode_RichCompare, /* tp_richcompare */
15192 0, /* tp_weaklistoffset */
15193 unicode_iter, /* tp_iter */
15194 0, /* tp_iternext */
15195 unicode_methods, /* tp_methods */
15196 0, /* tp_members */
15197 0, /* tp_getset */
15198 &PyBaseObject_Type, /* tp_base */
15199 0, /* tp_dict */
15200 0, /* tp_descr_get */
15201 0, /* tp_descr_set */
15202 0, /* tp_dictoffset */
15203 0, /* tp_init */
15204 0, /* tp_alloc */
15205 unicode_new, /* tp_new */
15206 PyObject_Del, /* tp_free */
15207 };
15208
15209 /* Initialize the Unicode implementation */
15210
15211 PyStatus
_PyUnicode_Init(void)15212 _PyUnicode_Init(void)
15213 {
15214 /* XXX - move this array to unicodectype.c ? */
15215 Py_UCS2 linebreak[] = {
15216 0x000A, /* LINE FEED */
15217 0x000D, /* CARRIAGE RETURN */
15218 0x001C, /* FILE SEPARATOR */
15219 0x001D, /* GROUP SEPARATOR */
15220 0x001E, /* RECORD SEPARATOR */
15221 0x0085, /* NEXT LINE */
15222 0x2028, /* LINE SEPARATOR */
15223 0x2029, /* PARAGRAPH SEPARATOR */
15224 };
15225
15226 /* Init the implementation */
15227 _Py_INCREF_UNICODE_EMPTY();
15228 if (!unicode_empty) {
15229 return _PyStatus_ERR("Can't create empty string");
15230 }
15231 Py_DECREF(unicode_empty);
15232
15233 if (PyType_Ready(&PyUnicode_Type) < 0) {
15234 return _PyStatus_ERR("Can't initialize unicode type");
15235 }
15236
15237 /* initialize the linebreak bloom filter */
15238 bloom_linebreak = make_bloom_mask(
15239 PyUnicode_2BYTE_KIND, linebreak,
15240 Py_ARRAY_LENGTH(linebreak));
15241
15242 if (PyType_Ready(&EncodingMapType) < 0) {
15243 return _PyStatus_ERR("Can't initialize encoding map type");
15244 }
15245 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15246 return _PyStatus_ERR("Can't initialize field name iterator type");
15247 }
15248 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15249 return _PyStatus_ERR("Can't initialize formatter iter type");
15250 }
15251 return _PyStatus_OK();
15252 }
15253
/* Clear the Unicode free list.

   This implementation has nothing to release here: the function does no
   work and always returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15261
15262
/* Intern the string referenced by *p, in place.

   p points at a reference to a str object.  If the `interned` dict already
   holds an equal string, *p is replaced by a new reference to that stored
   string (Py_SETREF releases the reference previously held in *p).
   Otherwise the string itself becomes the stored entry and is flagged
   SSTATE_INTERNED_MORTAL.

   The function is best effort and never reports an error:
     - release builds return silently when *p is NULL or not a str, while
       Py_DEBUG builds assert instead;
     - str subclasses and already interned strings are left untouched;
     - exceptions raised while creating or updating the dict are cleared. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Nothing to do if the string is already interned. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* Look the string up and, if it is missing, insert it as both key and
       value.  t is a borrowed reference to the entry stored in the dict. */
    Py_ALLOW_RECURSION
    t = PyDict_SetDefault(interned, s, s);
    Py_END_ALLOW_RECURSION
    if (t == NULL) {
        PyErr_Clear();
        return;
    }
    if (t != s) {
        /* An equal string was interned earlier: hand that one out. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15305
15306 void
PyUnicode_InternImmortal(PyObject ** p)15307 PyUnicode_InternImmortal(PyObject **p)
15308 {
15309 PyUnicode_InternInPlace(p);
15310 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15311 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15312 Py_INCREF(*p);
15313 }
15314 }
15315
15316 PyObject *
PyUnicode_InternFromString(const char * cp)15317 PyUnicode_InternFromString(const char *cp)
15318 {
15319 PyObject *s = PyUnicode_FromString(cp);
15320 if (s == NULL)
15321 return NULL;
15322 PyUnicode_InternInPlace(&s);
15323 return s;
15324 }
15325
15326
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that a leak detector sees consistent reference
   counts at exit.

   Interned strings are not forcibly deallocated.  Each key of the
   `interned` dict gets back the references that interning left uncounted
   (two for a mortal string, one for an immortal one) and is flagged as not
   interned; afterwards the dict is cleared and released.  Does nothing if
   the dict does not exist. */
static void
unicode_release_interned(void)
{
    /* Totals are only reported when INTERNED_STATS is defined. */
    Py_ssize_t immortal_size = 0;
    Py_ssize_t mortal_size = 0;

    if (interned == NULL) {
        return;
    }
    if (!PyDict_Check(interned)) {
        return;
    }

    PyObject *key_list = PyDict_Keys(interned);
    if (key_list == NULL || !PyList_Check(key_list)) {
        PyErr_Clear();
        return;
    }

    const Py_ssize_t count = PyList_GET_SIZE(key_list);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(key_list, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            /* Give back both uncounted references held by the dict. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(key_list);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15386
15387
15388 /********************* Unicode Iterator **************************/
15389
/* Instance layout of the str iterator (PyUnicodeIter_Type): the string
   being walked plus the position of the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15395
15396 static void
unicodeiter_dealloc(unicodeiterobject * it)15397 unicodeiter_dealloc(unicodeiterobject *it)
15398 {
15399 _PyObject_GC_UNTRACK(it);
15400 Py_XDECREF(it->it_seq);
15401 PyObject_GC_Del(it);
15402 }
15403
15404 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15405 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406 {
15407 Py_VISIT(it->it_seq);
15408 return 0;
15409 }
15410
15411 static PyObject *
unicodeiter_next(unicodeiterobject * it)15412 unicodeiter_next(unicodeiterobject *it)
15413 {
15414 PyObject *seq, *item;
15415
15416 assert(it != NULL);
15417 seq = it->it_seq;
15418 if (seq == NULL)
15419 return NULL;
15420 assert(_PyUnicode_CHECK(seq));
15421
15422 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423 int kind = PyUnicode_KIND(seq);
15424 void *data = PyUnicode_DATA(seq);
15425 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426 item = PyUnicode_FromOrdinal(chr);
15427 if (item != NULL)
15428 ++it->it_index;
15429 return item;
15430 }
15431
15432 it->it_seq = NULL;
15433 Py_DECREF(seq);
15434 return NULL;
15435 }
15436
15437 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15438 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15439 {
15440 Py_ssize_t len = 0;
15441 if (it->it_seq)
15442 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15443 return PyLong_FromSsize_t(len);
15444 }
15445
15446 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447
15448 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15449 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15450 {
15451 _Py_IDENTIFIER(iter);
15452 if (it->it_seq != NULL) {
15453 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15454 it->it_seq, it->it_index);
15455 } else {
15456 PyObject *u = (PyObject *)_PyUnicode_New(0);
15457 if (u == NULL)
15458 return NULL;
15459 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15460 }
15461 }
15462
15463 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15464
15465 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15466 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15467 {
15468 Py_ssize_t index = PyLong_AsSsize_t(state);
15469 if (index == -1 && PyErr_Occurred())
15470 return NULL;
15471 if (it->it_seq != NULL) {
15472 if (index < 0)
15473 index = 0;
15474 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15475 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15476 it->it_index = index;
15477 }
15478 Py_RETURN_NONE;
15479 }
15480
15481 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15482
/* Method table of the str iterator type (installed through the tp_methods
   slot of PyUnicodeIter_Type): length hint plus pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    /* it.__length_hint__() -> number of characters left */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* it.__reduce__() -> state tuple for pickling */
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    /* it.__setstate__(index) -> restore the position */
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15492
/* Type object of the iterator returned by unicode_iter().

   Instances are unicodeiterobject structs and take part in garbage
   collection (Py_TPFLAGS_HAVE_GC together with a tp_traverse function).
   The initializer is positional, so the entries must stay in the field
   order of PyTypeObject; fields after the last entry are left
   zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_vectorcall_offset */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_as_async */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
15525
15526 static PyObject *
unicode_iter(PyObject * seq)15527 unicode_iter(PyObject *seq)
15528 {
15529 unicodeiterobject *it;
15530
15531 if (!PyUnicode_Check(seq)) {
15532 PyErr_BadInternalCall();
15533 return NULL;
15534 }
15535 if (PyUnicode_READY(seq) == -1)
15536 return NULL;
15537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15538 if (it == NULL)
15539 return NULL;
15540 it->it_index = 0;
15541 Py_INCREF(seq);
15542 it->it_seq = seq;
15543 _PyObject_GC_TRACK(it);
15544 return (PyObject *)it;
15545 }
15546
15547
/* Return the number of Py_UNICODE elements in u before the terminating
   null element.  Thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);

    return length;
}
15553
/* Copy the null-terminated string s2, terminator included, to s1.

   s1 must have room for the whole string; no bounds checking is done.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15561
/* Copy at most n elements of the null-terminated string s2 to s1.

   Copying stops after the terminating null element has been copied or
   after n elements have been written, whichever comes first.  As with
   strncpy(), s1 is therefore not null-terminated when s2 holds n or more
   characters.  Unlike wcsncpy(), the rest of s1 is not padded with null
   elements.  Nothing is written when n is 0.

   The loop is bounded by an index so that no more than n elements are ever
   stored: testing the counter only after each store would write one
   element past the limit.

   Returns s1. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (s2[i] == 0) {
            break;
        }
    }
    return s1;
}
15571
/* Append the null-terminated string s2, terminator included, to the end of
   the null-terminated string s1.

   s1 must have room for the combined string; no bounds checking is done.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing on top of the terminator of s1. */
    Py_UNICODE *tail = s1 + wcslen(s1);

    do {
        *tail = *s2++;
    } while (*tail++ != 0);
    return s1;
}
15580
/* Compare two null-terminated strings element by element.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and 1 if it sorts
   after.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s2 && *s1 == *s2; s1++, s2++) {
    }

    Py_UNICODE c1 = *s1;
    Py_UNICODE c2 = *s2;
    if (c1 == 0) {
        /* s1 ended: equal if s2 ended too, otherwise s1 is a prefix. */
        return (c2 == 0) ? 0 : -1;
    }
    if (c2 == 0) {
        return 1;
    }
    return (c1 < c2) ? -1 : +1;
}
15594
/* Compare at most the first n elements of two null-terminated strings.

   Returns 0 if the compared parts are equal (always the case for n == 0),
   -1 if s1 sorts before s2 and 1 if it sorts after.  The comparison stops
   at the first difference, at a shared terminating null element, or after
   n elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        Py_UNICODE c1 = s1[i];
        Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15611
/* Find the first occurrence of c in the null-terminated string s.

   Returns a pointer to the matching element, or NULL if there is none.
   The terminating null element is never matched, so searching for 0
   returns NULL (this differs from wcschr()). */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15621
/* Find the last occurrence of c in the null-terminated string s.

   Returns a pointer to the matching element, or NULL if there is none.
   The search starts just before the terminating null element, so
   searching for 0 returns NULL (this differs from wcsrchr()). */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
15634
15635 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15636 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15637 {
15638 Py_UNICODE *u, *copy;
15639 Py_ssize_t len, size;
15640
15641 if (!PyUnicode_Check(unicode)) {
15642 PyErr_BadArgument();
15643 return NULL;
15644 }
15645 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15646 if (u == NULL)
15647 return NULL;
15648 /* Ensure we won't overflow the size. */
15649 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15650 PyErr_NoMemory();
15651 return NULL;
15652 }
15653 size = len + 1; /* copy the null character */
15654 size *= sizeof(Py_UNICODE);
15655 copy = PyMem_Malloc(size);
15656 if (copy == NULL) {
15657 PyErr_NoMemory();
15658 return NULL;
15659 }
15660 memcpy(copy, u, size);
15661 return copy;
15662 }
15663
15664
15665 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15666 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15667 {
15668 int res;
15669 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15670 if (res == -2) {
15671 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15672 return -1;
15673 }
15674 if (res < 0) {
15675 PyErr_NoMemory();
15676 return -1;
15677 }
15678 return 0;
15679 }
15680
15681
15682 static int
config_get_codec_name(wchar_t ** config_encoding)15683 config_get_codec_name(wchar_t **config_encoding)
15684 {
15685 char *encoding;
15686 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15687 return -1;
15688 }
15689
15690 PyObject *name_obj = NULL;
15691 PyObject *codec = _PyCodec_Lookup(encoding);
15692 PyMem_RawFree(encoding);
15693
15694 if (!codec)
15695 goto error;
15696
15697 name_obj = PyObject_GetAttrString(codec, "name");
15698 Py_CLEAR(codec);
15699 if (!name_obj) {
15700 goto error;
15701 }
15702
15703 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15704 Py_DECREF(name_obj);
15705 if (wname == NULL) {
15706 goto error;
15707 }
15708
15709 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15710 if (raw_wname == NULL) {
15711 PyMem_Free(wname);
15712 PyErr_NoMemory();
15713 goto error;
15714 }
15715
15716 PyMem_RawFree(*config_encoding);
15717 *config_encoding = raw_wname;
15718
15719 PyMem_Free(wname);
15720 return 0;
15721
15722 error:
15723 Py_XDECREF(codec);
15724 Py_XDECREF(name_obj);
15725 return -1;
15726 }
15727
15728
15729 static PyStatus
init_stdio_encoding(PyThreadState * tstate)15730 init_stdio_encoding(PyThreadState *tstate)
15731 {
15732 /* Update the stdio encoding to the normalized Python codec name. */
15733 PyConfig *config = &tstate->interp->config;
15734 if (config_get_codec_name(&config->stdio_encoding) < 0) {
15735 return _PyStatus_ERR("failed to get the Python codec name "
15736 "of the stdio encoding");
15737 }
15738 return _PyStatus_OK();
15739 }
15740
15741
15742 static int
init_fs_codec(PyInterpreterState * interp)15743 init_fs_codec(PyInterpreterState *interp)
15744 {
15745 PyConfig *config = &interp->config;
15746
15747 _Py_error_handler error_handler;
15748 error_handler = get_error_handler_wide(config->filesystem_errors);
15749 if (error_handler == _Py_ERROR_UNKNOWN) {
15750 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15751 return -1;
15752 }
15753
15754 char *encoding, *errors;
15755 if (encode_wstr_utf8(config->filesystem_encoding,
15756 &encoding,
15757 "filesystem_encoding") < 0) {
15758 return -1;
15759 }
15760
15761 if (encode_wstr_utf8(config->filesystem_errors,
15762 &errors,
15763 "filesystem_errors") < 0) {
15764 PyMem_RawFree(encoding);
15765 return -1;
15766 }
15767
15768 PyMem_RawFree(interp->fs_codec.encoding);
15769 interp->fs_codec.encoding = encoding;
15770 PyMem_RawFree(interp->fs_codec.errors);
15771 interp->fs_codec.errors = errors;
15772 interp->fs_codec.error_handler = error_handler;
15773
15774 /* At this point, PyUnicode_EncodeFSDefault() and
15775 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15776 the C implementation of the filesystem encoding. */
15777
15778 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15779 global configuration variables. */
15780 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15781 interp->fs_codec.errors) < 0) {
15782 PyErr_NoMemory();
15783 return -1;
15784 }
15785 return 0;
15786 }
15787
15788
15789 static PyStatus
init_fs_encoding(PyThreadState * tstate)15790 init_fs_encoding(PyThreadState *tstate)
15791 {
15792 PyInterpreterState *interp = tstate->interp;
15793
15794 /* Update the filesystem encoding to the normalized Python codec name.
15795 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15796 (Python codec name). */
15797 PyConfig *config = &interp->config;
15798 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15799 _Py_DumpPathConfig(tstate);
15800 return _PyStatus_ERR("failed to get the Python codec "
15801 "of the filesystem encoding");
15802 }
15803
15804 if (init_fs_codec(interp) < 0) {
15805 return _PyStatus_ERR("cannot initialize filesystem codec");
15806 }
15807 return _PyStatus_OK();
15808 }
15809
15810
15811 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15812 _PyUnicode_InitEncodings(PyThreadState *tstate)
15813 {
15814 PyStatus status = init_fs_encoding(tstate);
15815 if (_PyStatus_EXCEPTION(status)) {
15816 return status;
15817 }
15818
15819 return init_stdio_encoding(tstate);
15820 }
15821
15822
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to "mbcs" with
   the "replace" error handler (PEP 529) and reinstall the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated; in that case the
   configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *mbcs = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *replace = _PyMem_RawWcsdup(L"replace");

    if (mbcs != NULL && replace != NULL) {
        /* The configuration takes ownership of both new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = mbcs;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = replace;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed; release whichever one succeeded. */
    PyMem_RawFree(mbcs);
    PyMem_RawFree(replace);
    PyErr_NoMemory();
    return -1;
}
#endif
15848
15849
15850 void
_PyUnicode_Fini(void)15851 _PyUnicode_Fini(void)
15852 {
15853 #if defined(WITH_VALGRIND) || defined(__INSURE__)
15854 /* Insure++ is a memory analysis tool that aids in discovering
15855 * memory leaks and other memory problems. On Python exit, the
15856 * interned string dictionaries are flagged as being in use at exit
15857 * (which it is). Under normal circumstances, this is fine because
15858 * the memory will be automatically reclaimed by the system. Under
15859 * memory debugging, it's a huge source of useless noise, so we
15860 * trade off slower shutdown for less distraction in the memory
15861 * reports. -baw
15862 */
15863 unicode_release_interned();
15864 #endif /* __INSURE__ */
15865
15866 Py_CLEAR(unicode_empty);
15867
15868 for (Py_ssize_t i = 0; i < 256; i++) {
15869 Py_CLEAR(unicode_latin1[i]);
15870 }
15871 _PyUnicode_ClearStaticStrings();
15872 (void)PyUnicode_ClearFreeList();
15873
15874 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15875 PyMem_RawFree(interp->fs_codec.encoding);
15876 interp->fs_codec.encoding = NULL;
15877 PyMem_RawFree(interp->fs_codec.errors);
15878 interp->fs_codec.errors = NULL;
15879 }
15880
15881
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15892
/* Module definition of "_string", passed to PyModule_Create() by
   PyInit__string().  The initializer is positional and follows the field
   order of struct PyModuleDef. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15904
15905 PyMODINIT_FUNC
PyInit__string(void)15906 PyInit__string(void)
15907 {
15908 return PyModule_Create(&_string_module);
15909 }
15910
15911
15912 #ifdef __cplusplus
15913 }
15914 #endif
15915