1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h" // _PyIndex_Check()
44 #include "pycore_bytes_methods.h"
45 #include "pycore_fileutils.h"
46 #include "pycore_initconfig.h"
47 #include "pycore_interp.h" // PyInterpreterState.fs_codec
48 #include "pycore_object.h"
49 #include "pycore_pathconfig.h"
50 #include "pycore_pylifecycle.h"
51 #include "pycore_pystate.h" // _PyInterpreterState_GET()
52 #include "ucnhash.h"
53 #include "stringlib/eq.h"
54
55 #ifdef MS_WINDOWS
56 #include <windows.h>
57 #endif
58
59 /* Uncomment to display statistics on interned strings at exit when
60 using Valgrind or Insure++. */
61 /* #define INTERNED_STATS 1 */
62
63
64 /*[clinic input]
65 class str "PyObject *" "&PyUnicode_Type"
66 [clinic start generated code]*/
67 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69 /*[python input]
70 class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80 [python start generated code]*/
81 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
82
83 /* --- Globals ------------------------------------------------------------
84
85 NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
88
89 */
90
91
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95
96 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97 #define MAX_UNICODE 0x10ffff
98
99 #ifdef Py_DEBUG
100 # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
101 #else
102 # define _PyUnicode_CHECK(op) PyUnicode_Check(op)
103 #endif
104
105 #define _PyUnicode_UTF8(op) \
106 (((PyCompactUnicodeObject*)(op))->utf8)
107 #define PyUnicode_UTF8(op) \
108 (assert(_PyUnicode_CHECK(op)), \
109 assert(PyUnicode_IS_READY(op)), \
110 PyUnicode_IS_COMPACT_ASCII(op) ? \
111 ((char*)((PyASCIIObject*)(op) + 1)) : \
112 _PyUnicode_UTF8(op))
113 #define _PyUnicode_UTF8_LENGTH(op) \
114 (((PyCompactUnicodeObject*)(op))->utf8_length)
115 #define PyUnicode_UTF8_LENGTH(op) \
116 (assert(_PyUnicode_CHECK(op)), \
117 assert(PyUnicode_IS_READY(op)), \
118 PyUnicode_IS_COMPACT_ASCII(op) ? \
119 ((PyASCIIObject*)(op))->length : \
120 _PyUnicode_UTF8_LENGTH(op))
121 #define _PyUnicode_WSTR(op) \
122 (((PyASCIIObject*)(op))->wstr)
123
124 /* Don't use deprecated macro of unicodeobject.h */
125 #undef PyUnicode_WSTR_LENGTH
126 #define PyUnicode_WSTR_LENGTH(op) \
127 (PyUnicode_IS_COMPACT_ASCII(op) ? \
128 ((PyASCIIObject*)op)->length : \
129 ((PyCompactUnicodeObject*)op)->wstr_length)
130 #define _PyUnicode_WSTR_LENGTH(op) \
131 (((PyCompactUnicodeObject*)(op))->wstr_length)
132 #define _PyUnicode_LENGTH(op) \
133 (((PyASCIIObject *)(op))->length)
134 #define _PyUnicode_STATE(op) \
135 (((PyASCIIObject *)(op))->state)
136 #define _PyUnicode_HASH(op) \
137 (((PyASCIIObject *)(op))->hash)
138 #define _PyUnicode_KIND(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 ((PyASCIIObject *)(op))->state.kind)
141 #define _PyUnicode_GET_LENGTH(op) \
142 (assert(_PyUnicode_CHECK(op)), \
143 ((PyASCIIObject *)(op))->length)
144 #define _PyUnicode_DATA_ANY(op) \
145 (((PyUnicodeObject*)(op))->data.any)
146
147 #undef PyUnicode_READY
148 #define PyUnicode_READY(op) \
149 (assert(_PyUnicode_CHECK(op)), \
150 (PyUnicode_IS_READY(op) ? \
151 0 : \
152 _PyUnicode_Ready(op)))
153
154 #define _PyUnicode_SHARE_UTF8(op) \
155 (assert(_PyUnicode_CHECK(op)), \
156 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
157 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is equal to PyUnicode_DATA(op), i.e. the
   wchar_t representation shares the character buffer.  Every use of the
   argument goes through `op`, so the macro does not rely on the caller
   having a variable that happens to be named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
162 /* true if the Unicode object has an allocated UTF-8 memory block
163 (not shared with other data) */
164 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
165 ((!PyUnicode_IS_COMPACT_ASCII(op) \
166 && _PyUnicode_UTF8(op) \
167 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
168
169 /* true if the Unicode object has an allocated wstr memory block
170 (not shared with other data) */
171 #define _PyUnicode_HAS_WSTR_MEMORY(op) \
172 ((_PyUnicode_WSTR(op) && \
173 (!PyUnicode_IS_READY(op) || \
174 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175
176 /* Generic helper macro to convert characters of different types.
177 from_type and to_type have to be valid type names, begin and end
178 are pointers to the source characters which should be of type
179 "from_type *". to is a pointer of type "to_type *" and points to the
180 buffer where the result characters are written to. */
181 #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
182 do { \
183 to_type *_to = (to_type *)(to); \
184 const from_type *_iter = (const from_type *)(begin);\
185 const from_type *_end = (const from_type *)(end);\
186 Py_ssize_t n = (_end) - (_iter); \
187 const from_type *_unrolled_end = \
188 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
189 while (_iter < (_unrolled_end)) { \
190 _to[0] = (to_type) _iter[0]; \
191 _to[1] = (to_type) _iter[1]; \
192 _to[2] = (to_type) _iter[2]; \
193 _to[3] = (to_type) _iter[3]; \
194 _iter += 4; _to += 4; \
195 } \
196 while (_iter < (_end)) \
197 *_to++ = (to_type) *_iter++; \
198 } while (0)
199
200 #ifdef MS_WINDOWS
201 /* On Windows, overallocate by 50% is the best factor */
202 # define OVERALLOCATE_FACTOR 2
203 #else
204 /* On Linux, overallocate by 25% is the best factor */
205 # define OVERALLOCATE_FACTOR 4
206 #endif
207
208 #define INTERNED_STRINGS
209
210 /* This dictionary holds all interned unicode strings. Note that references
211 to strings in this dictionary are *not* counted in the string's ob_refcnt.
212 When the interned string reaches a refcnt of 0 the string deallocation
213 function will delete the reference from this dictionary.
214
215 Another way to look at this is to say that the actual reference
216 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
217 */
218 #ifdef INTERNED_STRINGS
219 static PyObject *interned = NULL;
220 #endif
221
222 /* The empty Unicode object is shared to improve performance. */
223 static PyObject *unicode_empty = NULL;
224
225 #define _Py_INCREF_UNICODE_EMPTY() \
226 do { \
227 if (unicode_empty != NULL) \
228 Py_INCREF(unicode_empty); \
229 else { \
230 unicode_empty = PyUnicode_New(0, 0); \
231 if (unicode_empty != NULL) { \
232 Py_INCREF(unicode_empty); \
233 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
234 } \
235 } \
236 } while (0)
237
238 #define _Py_RETURN_UNICODE_EMPTY() \
239 do { \
240 _Py_INCREF_UNICODE_EMPTY(); \
241 return unicode_empty; \
242 } while (0)
243
244 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)245 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
246 Py_ssize_t start, Py_ssize_t length)
247 {
248 assert(0 <= start);
249 assert(kind != PyUnicode_WCHAR_KIND);
250 switch (kind) {
251 case PyUnicode_1BYTE_KIND: {
252 assert(value <= 0xff);
253 Py_UCS1 ch = (unsigned char)value;
254 Py_UCS1 *to = (Py_UCS1 *)data + start;
255 memset(to, ch, length);
256 break;
257 }
258 case PyUnicode_2BYTE_KIND: {
259 assert(value <= 0xffff);
260 Py_UCS2 ch = (Py_UCS2)value;
261 Py_UCS2 *to = (Py_UCS2 *)data + start;
262 const Py_UCS2 *end = to + length;
263 for (; to < end; ++to) *to = ch;
264 break;
265 }
266 case PyUnicode_4BYTE_KIND: {
267 assert(value <= MAX_UNICODE);
268 Py_UCS4 ch = value;
269 Py_UCS4 * to = (Py_UCS4 *)data + start;
270 const Py_UCS4 *end = to + length;
271 for (; to < end; ++to) *to = ch;
272 break;
273 }
274 default: Py_UNREACHABLE();
275 }
276 }
277
278
279 /* Forward declaration */
280 static inline int
281 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
282 static inline void
283 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
284 static PyObject *
285 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
286 const char *errors);
287 static PyObject *
288 unicode_decode_utf8(const char *s, Py_ssize_t size,
289 _Py_error_handler error_handler, const char *errors,
290 Py_ssize_t *consumed);
291
292 /* List of static strings. */
293 static _Py_Identifier *static_strings = NULL;
294
295 #define LATIN1_SINGLETONS
296
297 #ifdef LATIN1_SINGLETONS
298 /* Single character Unicode strings in the Latin-1 range are being
299 shared as well. */
300 static PyObject *unicode_latin1[256] = {NULL};
301 #endif
302
303 /* Fast detection of the most frequent whitespace characters */
304 const unsigned char _Py_ascii_whitespace[] = {
305 0, 0, 0, 0, 0, 0, 0, 0,
306 /* case 0x0009: * CHARACTER TABULATION */
307 /* case 0x000A: * LINE FEED */
308 /* case 0x000B: * LINE TABULATION */
309 /* case 0x000C: * FORM FEED */
310 /* case 0x000D: * CARRIAGE RETURN */
311 0, 1, 1, 1, 1, 1, 0, 0,
312 0, 0, 0, 0, 0, 0, 0, 0,
313 /* case 0x001C: * FILE SEPARATOR */
314 /* case 0x001D: * GROUP SEPARATOR */
315 /* case 0x001E: * RECORD SEPARATOR */
316 /* case 0x001F: * UNIT SEPARATOR */
317 0, 0, 0, 0, 1, 1, 1, 1,
318 /* case 0x0020: * SPACE */
319 1, 0, 0, 0, 0, 0, 0, 0,
320 0, 0, 0, 0, 0, 0, 0, 0,
321 0, 0, 0, 0, 0, 0, 0, 0,
322 0, 0, 0, 0, 0, 0, 0, 0,
323
324 0, 0, 0, 0, 0, 0, 0, 0,
325 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0,
327 0, 0, 0, 0, 0, 0, 0, 0,
328 0, 0, 0, 0, 0, 0, 0, 0,
329 0, 0, 0, 0, 0, 0, 0, 0,
330 0, 0, 0, 0, 0, 0, 0, 0,
331 0, 0, 0, 0, 0, 0, 0, 0
332 };
333
334 /* forward */
335 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
336 static PyObject* get_latin1_char(unsigned char ch);
337 static int unicode_modifiable(PyObject *unicode);
338
339
340 static PyObject *
341 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
342 static PyObject *
343 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
344 static PyObject *
345 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
346
347 static PyObject *
348 unicode_encode_call_errorhandler(const char *errors,
349 PyObject **errorHandler,const char *encoding, const char *reason,
350 PyObject *unicode, PyObject **exceptionObject,
351 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
352
353 static void
354 raise_encode_exception(PyObject **exceptionObject,
355 const char *encoding,
356 PyObject *unicode,
357 Py_ssize_t startpos, Py_ssize_t endpos,
358 const char *reason);
359
360 /* Same for linebreaks */
361 static const unsigned char ascii_linebreak[] = {
362 0, 0, 0, 0, 0, 0, 0, 0,
363 /* 0x000A, * LINE FEED */
364 /* 0x000B, * LINE TABULATION */
365 /* 0x000C, * FORM FEED */
366 /* 0x000D, * CARRIAGE RETURN */
367 0, 0, 1, 1, 1, 1, 0, 0,
368 0, 0, 0, 0, 0, 0, 0, 0,
369 /* 0x001C, * FILE SEPARATOR */
370 /* 0x001D, * GROUP SEPARATOR */
371 /* 0x001E, * RECORD SEPARATOR */
372 0, 0, 0, 0, 1, 1, 1, 0,
373 0, 0, 0, 0, 0, 0, 0, 0,
374 0, 0, 0, 0, 0, 0, 0, 0,
375 0, 0, 0, 0, 0, 0, 0, 0,
376 0, 0, 0, 0, 0, 0, 0, 0,
377
378 0, 0, 0, 0, 0, 0, 0, 0,
379 0, 0, 0, 0, 0, 0, 0, 0,
380 0, 0, 0, 0, 0, 0, 0, 0,
381 0, 0, 0, 0, 0, 0, 0, 0,
382 0, 0, 0, 0, 0, 0, 0, 0,
383 0, 0, 0, 0, 0, 0, 0, 0,
384 0, 0, 0, 0, 0, 0, 0, 0,
385 0, 0, 0, 0, 0, 0, 0, 0
386 };
387
388 static int convert_uc(PyObject *obj, void *addr);
389
390 #include "clinic/unicodeobject.c.h"
391
392 _Py_error_handler
_Py_GetErrorHandler(const char * errors)393 _Py_GetErrorHandler(const char *errors)
394 {
395 if (errors == NULL || strcmp(errors, "strict") == 0) {
396 return _Py_ERROR_STRICT;
397 }
398 if (strcmp(errors, "surrogateescape") == 0) {
399 return _Py_ERROR_SURROGATEESCAPE;
400 }
401 if (strcmp(errors, "replace") == 0) {
402 return _Py_ERROR_REPLACE;
403 }
404 if (strcmp(errors, "ignore") == 0) {
405 return _Py_ERROR_IGNORE;
406 }
407 if (strcmp(errors, "backslashreplace") == 0) {
408 return _Py_ERROR_BACKSLASHREPLACE;
409 }
410 if (strcmp(errors, "surrogatepass") == 0) {
411 return _Py_ERROR_SURROGATEPASS;
412 }
413 if (strcmp(errors, "xmlcharrefreplace") == 0) {
414 return _Py_ERROR_XMLCHARREFREPLACE;
415 }
416 return _Py_ERROR_OTHER;
417 }
418
419
420 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)421 get_error_handler_wide(const wchar_t *errors)
422 {
423 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
424 return _Py_ERROR_STRICT;
425 }
426 if (wcscmp(errors, L"surrogateescape") == 0) {
427 return _Py_ERROR_SURROGATEESCAPE;
428 }
429 if (wcscmp(errors, L"replace") == 0) {
430 return _Py_ERROR_REPLACE;
431 }
432 if (wcscmp(errors, L"ignore") == 0) {
433 return _Py_ERROR_IGNORE;
434 }
435 if (wcscmp(errors, L"backslashreplace") == 0) {
436 return _Py_ERROR_BACKSLASHREPLACE;
437 }
438 if (wcscmp(errors, L"surrogatepass") == 0) {
439 return _Py_ERROR_SURROGATEPASS;
440 }
441 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
442 return _Py_ERROR_XMLCHARREFREPLACE;
443 }
444 return _Py_ERROR_OTHER;
445 }
446
447
448 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)449 unicode_check_encoding_errors(const char *encoding, const char *errors)
450 {
451 if (encoding == NULL && errors == NULL) {
452 return 0;
453 }
454
455 PyInterpreterState *interp = _PyInterpreterState_GET();
456 #ifndef Py_DEBUG
457 /* In release mode, only check in development mode (-X dev) */
458 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
459 return 0;
460 }
461 #else
462 /* Always check in debug mode */
463 #endif
464
465 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
466 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
467 if (!interp->unicode.fs_codec.encoding) {
468 return 0;
469 }
470
471 /* Disable checks during Python finalization. For example, it allows to
472 call _PyObject_Dump() during finalization for debugging purpose. */
473 if (interp->finalizing) {
474 return 0;
475 }
476
477 if (encoding != NULL) {
478 PyObject *handler = _PyCodec_Lookup(encoding);
479 if (handler == NULL) {
480 return -1;
481 }
482 Py_DECREF(handler);
483 }
484
485 if (errors != NULL) {
486 PyObject *handler = PyCodec_LookupError(errors);
487 if (handler == NULL) {
488 return -1;
489 }
490 Py_DECREF(handler);
491 }
492 return 0;
493 }
494
495
496 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
497 This function is kept for backward compatibility with the old API. */
498 Py_UNICODE
PyUnicode_GetMax(void)499 PyUnicode_GetMax(void)
500 {
501 #ifdef Py_UNICODE_WIDE
502 return 0x10FFFF;
503 #else
504 /* This is actually an illegal character, so it should
505 not be passed to unichr. */
506 return 0xFFFF;
507 #endif
508 }
509
510 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)511 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
512 {
513 #define CHECK(expr) \
514 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
515
516 PyASCIIObject *ascii;
517 unsigned int kind;
518
519 assert(op != NULL);
520 CHECK(PyUnicode_Check(op));
521
522 ascii = (PyASCIIObject *)op;
523 kind = ascii->state.kind;
524
525 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
526 CHECK(kind == PyUnicode_1BYTE_KIND);
527 CHECK(ascii->state.ready == 1);
528 }
529 else {
530 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
531 void *data;
532
533 if (ascii->state.compact == 1) {
534 data = compact + 1;
535 CHECK(kind == PyUnicode_1BYTE_KIND
536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
538 CHECK(ascii->state.ascii == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(compact->utf8 != data);
541 }
542 else {
543 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
544
545 data = unicode->data.any;
546 if (kind == PyUnicode_WCHAR_KIND) {
547 CHECK(ascii->length == 0);
548 CHECK(ascii->hash == -1);
549 CHECK(ascii->state.compact == 0);
550 CHECK(ascii->state.ascii == 0);
551 CHECK(ascii->state.ready == 0);
552 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
553 CHECK(ascii->wstr != NULL);
554 CHECK(data == NULL);
555 CHECK(compact->utf8 == NULL);
556 }
557 else {
558 CHECK(kind == PyUnicode_1BYTE_KIND
559 || kind == PyUnicode_2BYTE_KIND
560 || kind == PyUnicode_4BYTE_KIND);
561 CHECK(ascii->state.compact == 0);
562 CHECK(ascii->state.ready == 1);
563 CHECK(data != NULL);
564 if (ascii->state.ascii) {
565 CHECK(compact->utf8 == data);
566 CHECK(compact->utf8_length == ascii->length);
567 }
568 else
569 CHECK(compact->utf8 != data);
570 }
571 }
572 if (kind != PyUnicode_WCHAR_KIND) {
573 if (
574 #if SIZEOF_WCHAR_T == 2
575 kind == PyUnicode_2BYTE_KIND
576 #else
577 kind == PyUnicode_4BYTE_KIND
578 #endif
579 )
580 {
581 CHECK(ascii->wstr == data);
582 CHECK(compact->wstr_length == ascii->length);
583 } else
584 CHECK(ascii->wstr != data);
585 }
586
587 if (compact->utf8 == NULL)
588 CHECK(compact->utf8_length == 0);
589 if (ascii->wstr == NULL)
590 CHECK(compact->wstr_length == 0);
591 }
592
593 /* check that the best kind is used: O(n) operation */
594 if (check_content && kind != PyUnicode_WCHAR_KIND) {
595 Py_ssize_t i;
596 Py_UCS4 maxchar = 0;
597 const void *data;
598 Py_UCS4 ch;
599
600 data = PyUnicode_DATA(ascii);
601 for (i=0; i < ascii->length; i++)
602 {
603 ch = PyUnicode_READ(kind, data, i);
604 if (ch > maxchar)
605 maxchar = ch;
606 }
607 if (kind == PyUnicode_1BYTE_KIND) {
608 if (ascii->state.ascii == 0) {
609 CHECK(maxchar >= 128);
610 CHECK(maxchar <= 255);
611 }
612 else
613 CHECK(maxchar < 128);
614 }
615 else if (kind == PyUnicode_2BYTE_KIND) {
616 CHECK(maxchar >= 0x100);
617 CHECK(maxchar <= 0xFFFF);
618 }
619 else {
620 CHECK(maxchar >= 0x10000);
621 CHECK(maxchar <= MAX_UNICODE);
622 }
623 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
624 }
625 return 1;
626
627 #undef CHECK
628 }
629
630
631 static PyObject*
unicode_result_wchar(PyObject * unicode)632 unicode_result_wchar(PyObject *unicode)
633 {
634 #ifndef Py_DEBUG
635 Py_ssize_t len;
636
637 len = _PyUnicode_WSTR_LENGTH(unicode);
638 if (len == 0) {
639 Py_DECREF(unicode);
640 _Py_RETURN_UNICODE_EMPTY();
641 }
642
643 if (len == 1) {
644 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
645 if ((Py_UCS4)ch < 256) {
646 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
647 Py_DECREF(unicode);
648 return latin1_char;
649 }
650 }
651
652 if (_PyUnicode_Ready(unicode) < 0) {
653 Py_DECREF(unicode);
654 return NULL;
655 }
656 #else
657 assert(Py_REFCNT(unicode) == 1);
658
659 /* don't make the result ready in debug mode to ensure that the caller
660 makes the string ready before using it */
661 assert(_PyUnicode_CheckConsistency(unicode, 1));
662 #endif
663 return unicode;
664 }
665
666 static PyObject*
unicode_result_ready(PyObject * unicode)667 unicode_result_ready(PyObject *unicode)
668 {
669 Py_ssize_t length;
670
671 length = PyUnicode_GET_LENGTH(unicode);
672 if (length == 0) {
673 if (unicode != unicode_empty) {
674 Py_DECREF(unicode);
675 _Py_RETURN_UNICODE_EMPTY();
676 }
677 return unicode_empty;
678 }
679
680 #ifdef LATIN1_SINGLETONS
681 if (length == 1) {
682 const void *data = PyUnicode_DATA(unicode);
683 int kind = PyUnicode_KIND(unicode);
684 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
685 if (ch < 256) {
686 PyObject *latin1_char = unicode_latin1[ch];
687 if (latin1_char != NULL) {
688 if (unicode != latin1_char) {
689 Py_INCREF(latin1_char);
690 Py_DECREF(unicode);
691 }
692 return latin1_char;
693 }
694 else {
695 assert(_PyUnicode_CheckConsistency(unicode, 1));
696 Py_INCREF(unicode);
697 unicode_latin1[ch] = unicode;
698 return unicode;
699 }
700 }
701 }
702 #endif
703
704 assert(_PyUnicode_CheckConsistency(unicode, 1));
705 return unicode;
706 }
707
708 static PyObject*
unicode_result(PyObject * unicode)709 unicode_result(PyObject *unicode)
710 {
711 assert(_PyUnicode_CHECK(unicode));
712 if (PyUnicode_IS_READY(unicode))
713 return unicode_result_ready(unicode);
714 else
715 return unicode_result_wchar(unicode);
716 }
717
718 static PyObject*
unicode_result_unchanged(PyObject * unicode)719 unicode_result_unchanged(PyObject *unicode)
720 {
721 if (PyUnicode_CheckExact(unicode)) {
722 if (PyUnicode_READY(unicode) == -1)
723 return NULL;
724 Py_INCREF(unicode);
725 return unicode;
726 }
727 else
728 /* Subtype -- return genuine unicode string with the same value. */
729 return _PyUnicode_Copy(unicode);
730 }
731
732 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
733 ASCII, Latin1, UTF-8, etc. */
734 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)735 backslashreplace(_PyBytesWriter *writer, char *str,
736 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
737 {
738 Py_ssize_t size, i;
739 Py_UCS4 ch;
740 enum PyUnicode_Kind kind;
741 const void *data;
742
743 assert(PyUnicode_IS_READY(unicode));
744 kind = PyUnicode_KIND(unicode);
745 data = PyUnicode_DATA(unicode);
746
747 size = 0;
748 /* determine replacement size */
749 for (i = collstart; i < collend; ++i) {
750 Py_ssize_t incr;
751
752 ch = PyUnicode_READ(kind, data, i);
753 if (ch < 0x100)
754 incr = 2+2;
755 else if (ch < 0x10000)
756 incr = 2+4;
757 else {
758 assert(ch <= MAX_UNICODE);
759 incr = 2+8;
760 }
761 if (size > PY_SSIZE_T_MAX - incr) {
762 PyErr_SetString(PyExc_OverflowError,
763 "encoded result is too long for a Python string");
764 return NULL;
765 }
766 size += incr;
767 }
768
769 str = _PyBytesWriter_Prepare(writer, str, size);
770 if (str == NULL)
771 return NULL;
772
773 /* generate replacement */
774 for (i = collstart; i < collend; ++i) {
775 ch = PyUnicode_READ(kind, data, i);
776 *str++ = '\\';
777 if (ch >= 0x00010000) {
778 *str++ = 'U';
779 *str++ = Py_hexdigits[(ch>>28)&0xf];
780 *str++ = Py_hexdigits[(ch>>24)&0xf];
781 *str++ = Py_hexdigits[(ch>>20)&0xf];
782 *str++ = Py_hexdigits[(ch>>16)&0xf];
783 *str++ = Py_hexdigits[(ch>>12)&0xf];
784 *str++ = Py_hexdigits[(ch>>8)&0xf];
785 }
786 else if (ch >= 0x100) {
787 *str++ = 'u';
788 *str++ = Py_hexdigits[(ch>>12)&0xf];
789 *str++ = Py_hexdigits[(ch>>8)&0xf];
790 }
791 else
792 *str++ = 'x';
793 *str++ = Py_hexdigits[(ch>>4)&0xf];
794 *str++ = Py_hexdigits[ch&0xf];
795 }
796 return str;
797 }
798
799 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
800 ASCII, Latin1, UTF-8, etc. */
801 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)802 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
803 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
804 {
805 Py_ssize_t size, i;
806 Py_UCS4 ch;
807 enum PyUnicode_Kind kind;
808 const void *data;
809
810 assert(PyUnicode_IS_READY(unicode));
811 kind = PyUnicode_KIND(unicode);
812 data = PyUnicode_DATA(unicode);
813
814 size = 0;
815 /* determine replacement size */
816 for (i = collstart; i < collend; ++i) {
817 Py_ssize_t incr;
818
819 ch = PyUnicode_READ(kind, data, i);
820 if (ch < 10)
821 incr = 2+1+1;
822 else if (ch < 100)
823 incr = 2+2+1;
824 else if (ch < 1000)
825 incr = 2+3+1;
826 else if (ch < 10000)
827 incr = 2+4+1;
828 else if (ch < 100000)
829 incr = 2+5+1;
830 else if (ch < 1000000)
831 incr = 2+6+1;
832 else {
833 assert(ch <= MAX_UNICODE);
834 incr = 2+7+1;
835 }
836 if (size > PY_SSIZE_T_MAX - incr) {
837 PyErr_SetString(PyExc_OverflowError,
838 "encoded result is too long for a Python string");
839 return NULL;
840 }
841 size += incr;
842 }
843
844 str = _PyBytesWriter_Prepare(writer, str, size);
845 if (str == NULL)
846 return NULL;
847
848 /* generate replacement */
849 for (i = collstart; i < collend; ++i) {
850 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
851 if (size < 0) {
852 return NULL;
853 }
854 str += size;
855 }
856 return str;
857 }
858
859 /* --- Bloom Filters ----------------------------------------------------- */
860
861 /* Stuff to implement simple "bloom filters" for Unicode characters.
862 To keep things simple, we use a single bitmask, using the low-order
863 bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
864
865 /* the linebreak mask is set up by Unicode_Init below */
866
867 #if LONG_BIT >= 128
868 #define BLOOM_WIDTH 128
869 #elif LONG_BIT >= 64
870 #define BLOOM_WIDTH 64
871 #elif LONG_BIT >= 32
872 #define BLOOM_WIDTH 32
873 #else
874 #error "LONG_BIT is smaller than 32"
875 #endif
876
877 #define BLOOM_MASK unsigned long
878
879 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
880
/* Non-zero if the bit selected by the low-order bits of ch is set in mask.
   Both arguments are parenthesized so that an expression such as
   BLOOM(a | b, ch) tests the whole mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
882
883 #define BLOOM_LINEBREAK(ch) \
884 ((ch) < 128U ? ascii_linebreak[(ch)] : \
885 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
886
887 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)888 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
889 {
890 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
891 do { \
892 TYPE *data = (TYPE *)PTR; \
893 TYPE *end = data + LEN; \
894 Py_UCS4 ch; \
895 for (; data != end; data++) { \
896 ch = *data; \
897 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
898 } \
899 break; \
900 } while (0)
901
902 /* calculate simple bloom-style bitmask for a given unicode string */
903
904 BLOOM_MASK mask;
905
906 mask = 0;
907 switch (kind) {
908 case PyUnicode_1BYTE_KIND:
909 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
910 break;
911 case PyUnicode_2BYTE_KIND:
912 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
913 break;
914 case PyUnicode_4BYTE_KIND:
915 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
916 break;
917 default:
918 Py_UNREACHABLE();
919 }
920 return mask;
921
922 #undef BLOOM_UPDATE
923 }
924
925 static int
ensure_unicode(PyObject * obj)926 ensure_unicode(PyObject *obj)
927 {
928 if (!PyUnicode_Check(obj)) {
929 PyErr_Format(PyExc_TypeError,
930 "must be str, not %.100s",
931 Py_TYPE(obj)->tp_name);
932 return -1;
933 }
934 return PyUnicode_READY(obj);
935 }
936
937 /* Compilation of templated routines */
938
939 #include "stringlib/asciilib.h"
940 #include "stringlib/fastsearch.h"
941 #include "stringlib/partition.h"
942 #include "stringlib/split.h"
943 #include "stringlib/count.h"
944 #include "stringlib/find.h"
945 #include "stringlib/find_max_char.h"
946 #include "stringlib/undef.h"
947
948 #include "stringlib/ucs1lib.h"
949 #include "stringlib/fastsearch.h"
950 #include "stringlib/partition.h"
951 #include "stringlib/split.h"
952 #include "stringlib/count.h"
953 #include "stringlib/find.h"
954 #include "stringlib/replace.h"
955 #include "stringlib/find_max_char.h"
956 #include "stringlib/undef.h"
957
958 #include "stringlib/ucs2lib.h"
959 #include "stringlib/fastsearch.h"
960 #include "stringlib/partition.h"
961 #include "stringlib/split.h"
962 #include "stringlib/count.h"
963 #include "stringlib/find.h"
964 #include "stringlib/replace.h"
965 #include "stringlib/find_max_char.h"
966 #include "stringlib/undef.h"
967
968 #include "stringlib/ucs4lib.h"
969 #include "stringlib/fastsearch.h"
970 #include "stringlib/partition.h"
971 #include "stringlib/split.h"
972 #include "stringlib/count.h"
973 #include "stringlib/find.h"
974 #include "stringlib/replace.h"
975 #include "stringlib/find_max_char.h"
976 #include "stringlib/undef.h"
977
978 _Py_COMP_DIAG_PUSH
979 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
980 #include "stringlib/unicodedefs.h"
981 #include "stringlib/fastsearch.h"
982 #include "stringlib/count.h"
983 #include "stringlib/find.h"
984 #include "stringlib/undef.h"
985 _Py_COMP_DIAG_POP
986
987 /* --- Unicode Object ----------------------------------------------------- */
988
989 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)990 findchar(const void *s, int kind,
991 Py_ssize_t size, Py_UCS4 ch,
992 int direction)
993 {
994 switch (kind) {
995 case PyUnicode_1BYTE_KIND:
996 if ((Py_UCS1) ch != ch)
997 return -1;
998 if (direction > 0)
999 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000 else
1001 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002 case PyUnicode_2BYTE_KIND:
1003 if ((Py_UCS2) ch != ch)
1004 return -1;
1005 if (direction > 0)
1006 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007 else
1008 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009 case PyUnicode_4BYTE_KIND:
1010 if (direction > 0)
1011 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012 else
1013 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014 default:
1015 Py_UNREACHABLE();
1016 }
1017 }
1018
1019 #ifdef Py_DEBUG
1020 /* Fill the data of a Unicode string with invalid characters to detect bugs
1021 earlier.
1022
1023 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1024 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1025 invalid character in Unicode 6.0. */
1026 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)1027 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1028 {
1029 int kind = PyUnicode_KIND(unicode);
1030 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1031 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1032 if (length <= old_length)
1033 return;
1034 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1035 }
1036 #endif
1037
1038 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1039 resize_compact(PyObject *unicode, Py_ssize_t length)
1040 {
1041 Py_ssize_t char_size;
1042 Py_ssize_t struct_size;
1043 Py_ssize_t new_size;
1044 int share_wstr;
1045 PyObject *new_unicode;
1046 #ifdef Py_DEBUG
1047 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1048 #endif
1049
1050 assert(unicode_modifiable(unicode));
1051 assert(PyUnicode_IS_READY(unicode));
1052 assert(PyUnicode_IS_COMPACT(unicode));
1053
1054 char_size = PyUnicode_KIND(unicode);
1055 if (PyUnicode_IS_ASCII(unicode))
1056 struct_size = sizeof(PyASCIIObject);
1057 else
1058 struct_size = sizeof(PyCompactUnicodeObject);
1059 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1060
1061 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1062 PyErr_NoMemory();
1063 return NULL;
1064 }
1065 new_size = (struct_size + (length + 1) * char_size);
1066
1067 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1068 PyObject_DEL(_PyUnicode_UTF8(unicode));
1069 _PyUnicode_UTF8(unicode) = NULL;
1070 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1071 }
1072 #ifdef Py_REF_DEBUG
1073 _Py_RefTotal--;
1074 #endif
1075 #ifdef Py_TRACE_REFS
1076 _Py_ForgetReference(unicode);
1077 #endif
1078
1079 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
1080 if (new_unicode == NULL) {
1081 _Py_NewReference(unicode);
1082 PyErr_NoMemory();
1083 return NULL;
1084 }
1085 unicode = new_unicode;
1086 _Py_NewReference(unicode);
1087
1088 _PyUnicode_LENGTH(unicode) = length;
1089 if (share_wstr) {
1090 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1091 if (!PyUnicode_IS_ASCII(unicode))
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 }
1094 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1095 PyObject_DEL(_PyUnicode_WSTR(unicode));
1096 _PyUnicode_WSTR(unicode) = NULL;
1097 if (!PyUnicode_IS_ASCII(unicode))
1098 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1099 }
1100 #ifdef Py_DEBUG
1101 unicode_fill_invalid(unicode, old_length);
1102 #endif
1103 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1104 length, 0);
1105 assert(_PyUnicode_CheckConsistency(unicode, 0));
1106 return unicode;
1107 }
1108
1109 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1110 resize_inplace(PyObject *unicode, Py_ssize_t length)
1111 {
1112 wchar_t *wstr;
1113 Py_ssize_t new_size;
1114 assert(!PyUnicode_IS_COMPACT(unicode));
1115 assert(Py_REFCNT(unicode) == 1);
1116
1117 if (PyUnicode_IS_READY(unicode)) {
1118 Py_ssize_t char_size;
1119 int share_wstr, share_utf8;
1120 void *data;
1121 #ifdef Py_DEBUG
1122 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1123 #endif
1124
1125 data = _PyUnicode_DATA_ANY(unicode);
1126 char_size = PyUnicode_KIND(unicode);
1127 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1128 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1129
1130 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1131 PyErr_NoMemory();
1132 return -1;
1133 }
1134 new_size = (length + 1) * char_size;
1135
1136 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1137 {
1138 PyObject_DEL(_PyUnicode_UTF8(unicode));
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141 }
1142
1143 data = (PyObject *)PyObject_REALLOC(data, new_size);
1144 if (data == NULL) {
1145 PyErr_NoMemory();
1146 return -1;
1147 }
1148 _PyUnicode_DATA_ANY(unicode) = data;
1149 if (share_wstr) {
1150 _PyUnicode_WSTR(unicode) = data;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
1152 }
1153 if (share_utf8) {
1154 _PyUnicode_UTF8(unicode) = data;
1155 _PyUnicode_UTF8_LENGTH(unicode) = length;
1156 }
1157 _PyUnicode_LENGTH(unicode) = length;
1158 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1159 #ifdef Py_DEBUG
1160 unicode_fill_invalid(unicode, old_length);
1161 #endif
1162 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1163 assert(_PyUnicode_CheckConsistency(unicode, 0));
1164 return 0;
1165 }
1166 }
1167 assert(_PyUnicode_WSTR(unicode) != NULL);
1168
1169 /* check for integer overflow */
1170 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171 PyErr_NoMemory();
1172 return -1;
1173 }
1174 new_size = sizeof(wchar_t) * (length + 1);
1175 wstr = _PyUnicode_WSTR(unicode);
1176 wstr = PyObject_REALLOC(wstr, new_size);
1177 if (!wstr) {
1178 PyErr_NoMemory();
1179 return -1;
1180 }
1181 _PyUnicode_WSTR(unicode) = wstr;
1182 _PyUnicode_WSTR(unicode)[length] = 0;
1183 _PyUnicode_WSTR_LENGTH(unicode) = length;
1184 assert(_PyUnicode_CheckConsistency(unicode, 0));
1185 return 0;
1186 }
1187
1188 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1189 resize_copy(PyObject *unicode, Py_ssize_t length)
1190 {
1191 Py_ssize_t copy_length;
1192 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193 PyObject *copy;
1194
1195 assert(PyUnicode_IS_READY(unicode));
1196
1197 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198 if (copy == NULL)
1199 return NULL;
1200
1201 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203 return copy;
1204 }
1205 else {
1206 PyObject *w;
1207
1208 w = (PyObject*)_PyUnicode_New(length);
1209 if (w == NULL)
1210 return NULL;
1211 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212 copy_length = Py_MIN(copy_length, length);
1213 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214 copy_length * sizeof(wchar_t));
1215 return w;
1216 }
1217 }
1218
1219 /* We allocate one more byte to make sure the string is
1220 Ux0000 terminated; some code (e.g. new_identifier)
1221 relies on that.
1222
1223 XXX This allocator could further be enhanced by assuring that the
1224 free list never reduces its size below 1.
1225
1226 */
1227
1228 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1229 _PyUnicode_New(Py_ssize_t length)
1230 {
1231 PyUnicodeObject *unicode;
1232 size_t new_size;
1233
1234 /* Optimization for empty strings */
1235 if (length == 0 && unicode_empty != NULL) {
1236 Py_INCREF(unicode_empty);
1237 return (PyUnicodeObject*)unicode_empty;
1238 }
1239
1240 /* Ensure we won't overflow the size. */
1241 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1242 return (PyUnicodeObject *)PyErr_NoMemory();
1243 }
1244 if (length < 0) {
1245 PyErr_SetString(PyExc_SystemError,
1246 "Negative size passed to _PyUnicode_New");
1247 return NULL;
1248 }
1249
1250 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1251 if (unicode == NULL)
1252 return NULL;
1253 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1254
1255 _PyUnicode_WSTR_LENGTH(unicode) = length;
1256 _PyUnicode_HASH(unicode) = -1;
1257 _PyUnicode_STATE(unicode).interned = 0;
1258 _PyUnicode_STATE(unicode).kind = 0;
1259 _PyUnicode_STATE(unicode).compact = 0;
1260 _PyUnicode_STATE(unicode).ready = 0;
1261 _PyUnicode_STATE(unicode).ascii = 0;
1262 _PyUnicode_DATA_ANY(unicode) = NULL;
1263 _PyUnicode_LENGTH(unicode) = 0;
1264 _PyUnicode_UTF8(unicode) = NULL;
1265 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1266
1267 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1268 if (!_PyUnicode_WSTR(unicode)) {
1269 Py_DECREF(unicode);
1270 PyErr_NoMemory();
1271 return NULL;
1272 }
1273
1274 /* Initialize the first element to guard against cases where
1275 * the caller fails before initializing str -- unicode_resize()
1276 * reads str[0], and the Keep-Alive optimization can keep memory
1277 * allocated for str alive across a call to unicode_dealloc(unicode).
1278 * We don't want unicode_resize to read uninitialized memory in
1279 * that case.
1280 */
1281 _PyUnicode_WSTR(unicode)[0] = 0;
1282 _PyUnicode_WSTR(unicode)[length] = 0;
1283
1284 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1285 return unicode;
1286 }
1287
1288 static const char*
unicode_kind_name(PyObject * unicode)1289 unicode_kind_name(PyObject *unicode)
1290 {
1291 /* don't check consistency: unicode_kind_name() is called from
1292 _PyUnicode_Dump() */
1293 if (!PyUnicode_IS_COMPACT(unicode))
1294 {
1295 if (!PyUnicode_IS_READY(unicode))
1296 return "wstr";
1297 switch (PyUnicode_KIND(unicode))
1298 {
1299 case PyUnicode_1BYTE_KIND:
1300 if (PyUnicode_IS_ASCII(unicode))
1301 return "legacy ascii";
1302 else
1303 return "legacy latin1";
1304 case PyUnicode_2BYTE_KIND:
1305 return "legacy UCS2";
1306 case PyUnicode_4BYTE_KIND:
1307 return "legacy UCS4";
1308 default:
1309 return "<legacy invalid kind>";
1310 }
1311 }
1312 assert(PyUnicode_IS_READY(unicode));
1313 switch (PyUnicode_KIND(unicode)) {
1314 case PyUnicode_1BYTE_KIND:
1315 if (PyUnicode_IS_ASCII(unicode))
1316 return "ascii";
1317 else
1318 return "latin1";
1319 case PyUnicode_2BYTE_KIND:
1320 return "UCS2";
1321 case PyUnicode_4BYTE_KIND:
1322 return "UCS4";
1323 default:
1324 return "<invalid compact kind>";
1325 }
1326 }
1327
1328 #ifdef Py_DEBUG
1329 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1330 const char *_PyUnicode_utf8(void *unicode_raw){
1331 PyObject *unicode = _PyObject_CAST(unicode_raw);
1332 return PyUnicode_UTF8(unicode);
1333 }
1334
_PyUnicode_compact_data(void * unicode_raw)1335 const void *_PyUnicode_compact_data(void *unicode_raw) {
1336 PyObject *unicode = _PyObject_CAST(unicode_raw);
1337 return _PyUnicode_COMPACT_DATA(unicode);
1338 }
_PyUnicode_data(void * unicode_raw)1339 const void *_PyUnicode_data(void *unicode_raw) {
1340 PyObject *unicode = _PyObject_CAST(unicode_raw);
1341 printf("obj %p\n", (void*)unicode);
1342 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1343 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1344 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1345 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1346 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1347 return PyUnicode_DATA(unicode);
1348 }
1349
1350 void
_PyUnicode_Dump(PyObject * op)1351 _PyUnicode_Dump(PyObject *op)
1352 {
1353 PyASCIIObject *ascii = (PyASCIIObject *)op;
1354 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1355 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1356 const void *data;
1357
1358 if (ascii->state.compact)
1359 {
1360 if (ascii->state.ascii)
1361 data = (ascii + 1);
1362 else
1363 data = (compact + 1);
1364 }
1365 else
1366 data = unicode->data.any;
1367 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1368 unicode_kind_name(op), ascii->length);
1369
1370 if (ascii->wstr == data)
1371 printf("shared ");
1372 printf("wstr=%p", (void *)ascii->wstr);
1373
1374 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1375 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1376 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1377 printf("shared ");
1378 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1379 (void *)compact->utf8, compact->utf8_length);
1380 }
1381 printf(", data=%p\n", data);
1382 }
1383 #endif
1384
1385 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1386 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1387 {
1388 PyObject *obj;
1389 PyCompactUnicodeObject *unicode;
1390 void *data;
1391 enum PyUnicode_Kind kind;
1392 int is_sharing, is_ascii;
1393 Py_ssize_t char_size;
1394 Py_ssize_t struct_size;
1395
1396 /* Optimization for empty strings */
1397 if (size == 0 && unicode_empty != NULL) {
1398 Py_INCREF(unicode_empty);
1399 return unicode_empty;
1400 }
1401
1402 is_ascii = 0;
1403 is_sharing = 0;
1404 struct_size = sizeof(PyCompactUnicodeObject);
1405 if (maxchar < 128) {
1406 kind = PyUnicode_1BYTE_KIND;
1407 char_size = 1;
1408 is_ascii = 1;
1409 struct_size = sizeof(PyASCIIObject);
1410 }
1411 else if (maxchar < 256) {
1412 kind = PyUnicode_1BYTE_KIND;
1413 char_size = 1;
1414 }
1415 else if (maxchar < 65536) {
1416 kind = PyUnicode_2BYTE_KIND;
1417 char_size = 2;
1418 if (sizeof(wchar_t) == 2)
1419 is_sharing = 1;
1420 }
1421 else {
1422 if (maxchar > MAX_UNICODE) {
1423 PyErr_SetString(PyExc_SystemError,
1424 "invalid maximum character passed to PyUnicode_New");
1425 return NULL;
1426 }
1427 kind = PyUnicode_4BYTE_KIND;
1428 char_size = 4;
1429 if (sizeof(wchar_t) == 4)
1430 is_sharing = 1;
1431 }
1432
1433 /* Ensure we won't overflow the size. */
1434 if (size < 0) {
1435 PyErr_SetString(PyExc_SystemError,
1436 "Negative size passed to PyUnicode_New");
1437 return NULL;
1438 }
1439 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1440 return PyErr_NoMemory();
1441
1442 /* Duplicated allocation code from _PyObject_New() instead of a call to
1443 * PyObject_New() so we are able to allocate space for the object and
1444 * it's data buffer.
1445 */
1446 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1447 if (obj == NULL)
1448 return PyErr_NoMemory();
1449 obj = PyObject_INIT(obj, &PyUnicode_Type);
1450 if (obj == NULL)
1451 return NULL;
1452
1453 unicode = (PyCompactUnicodeObject *)obj;
1454 if (is_ascii)
1455 data = ((PyASCIIObject*)obj) + 1;
1456 else
1457 data = unicode + 1;
1458 _PyUnicode_LENGTH(unicode) = size;
1459 _PyUnicode_HASH(unicode) = -1;
1460 _PyUnicode_STATE(unicode).interned = 0;
1461 _PyUnicode_STATE(unicode).kind = kind;
1462 _PyUnicode_STATE(unicode).compact = 1;
1463 _PyUnicode_STATE(unicode).ready = 1;
1464 _PyUnicode_STATE(unicode).ascii = is_ascii;
1465 if (is_ascii) {
1466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
1469 else if (kind == PyUnicode_1BYTE_KIND) {
1470 ((char*)data)[size] = 0;
1471 _PyUnicode_WSTR(unicode) = NULL;
1472 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1473 unicode->utf8 = NULL;
1474 unicode->utf8_length = 0;
1475 }
1476 else {
1477 unicode->utf8 = NULL;
1478 unicode->utf8_length = 0;
1479 if (kind == PyUnicode_2BYTE_KIND)
1480 ((Py_UCS2*)data)[size] = 0;
1481 else /* kind == PyUnicode_4BYTE_KIND */
1482 ((Py_UCS4*)data)[size] = 0;
1483 if (is_sharing) {
1484 _PyUnicode_WSTR_LENGTH(unicode) = size;
1485 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486 }
1487 else {
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
1491 }
1492 #ifdef Py_DEBUG
1493 unicode_fill_invalid((PyObject*)unicode, 0);
1494 #endif
1495 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1496 return obj;
1497 }
1498
1499 #if SIZEOF_WCHAR_T == 2
1500 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1501 will decode surrogate pairs, the other conversions are implemented as macros
1502 for efficiency.
1503
1504 This function assumes that unicode can hold one more code point than wstr
1505 characters for a terminating null character. */
1506 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1507 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1508 PyObject *unicode)
1509 {
1510 const wchar_t *iter;
1511 Py_UCS4 *ucs4_out;
1512
1513 assert(unicode != NULL);
1514 assert(_PyUnicode_CHECK(unicode));
1515 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1516 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1517
1518 for (iter = begin; iter < end; ) {
1519 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1520 _PyUnicode_GET_LENGTH(unicode)));
1521 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1522 && (iter+1) < end
1523 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1524 {
1525 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1526 iter += 2;
1527 }
1528 else {
1529 *ucs4_out++ = *iter;
1530 iter++;
1531 }
1532 }
1533 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1534 _PyUnicode_GET_LENGTH(unicode)));
1535
1536 }
1537 #endif
1538
1539 static int
unicode_check_modifiable(PyObject * unicode)1540 unicode_check_modifiable(PyObject *unicode)
1541 {
1542 if (!unicode_modifiable(unicode)) {
1543 PyErr_SetString(PyExc_SystemError,
1544 "Cannot modify a string currently used");
1545 return -1;
1546 }
1547 return 0;
1548 }
1549
1550 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1551 _copy_characters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many, int check_maxchar)
1554 {
1555 unsigned int from_kind, to_kind;
1556 const void *from_data;
1557 void *to_data;
1558
1559 assert(0 <= how_many);
1560 assert(0 <= from_start);
1561 assert(0 <= to_start);
1562 assert(PyUnicode_Check(from));
1563 assert(PyUnicode_IS_READY(from));
1564 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1565
1566 assert(PyUnicode_Check(to));
1567 assert(PyUnicode_IS_READY(to));
1568 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569
1570 if (how_many == 0)
1571 return 0;
1572
1573 from_kind = PyUnicode_KIND(from);
1574 from_data = PyUnicode_DATA(from);
1575 to_kind = PyUnicode_KIND(to);
1576 to_data = PyUnicode_DATA(to);
1577
1578 #ifdef Py_DEBUG
1579 if (!check_maxchar
1580 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581 {
1582 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1583 Py_UCS4 ch;
1584 Py_ssize_t i;
1585 for (i=0; i < how_many; i++) {
1586 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587 assert(ch <= to_maxchar);
1588 }
1589 }
1590 #endif
1591
1592 if (from_kind == to_kind) {
1593 if (check_maxchar
1594 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595 {
1596 /* Writing Latin-1 characters into an ASCII string requires to
1597 check that all written characters are pure ASCII */
1598 Py_UCS4 max_char;
1599 max_char = ucs1lib_find_max_char(from_data,
1600 (const Py_UCS1*)from_data + how_many);
1601 if (max_char >= 128)
1602 return -1;
1603 }
1604 memcpy((char*)to_data + to_kind * to_start,
1605 (const char*)from_data + from_kind * from_start,
1606 to_kind * how_many);
1607 }
1608 else if (from_kind == PyUnicode_1BYTE_KIND
1609 && to_kind == PyUnicode_2BYTE_KIND)
1610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS2,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_2BYTE_DATA(to) + to_start
1616 );
1617 }
1618 else if (from_kind == PyUnicode_1BYTE_KIND
1619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS1, Py_UCS4,
1623 PyUnicode_1BYTE_DATA(from) + from_start,
1624 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
1627 }
1628 else if (from_kind == PyUnicode_2BYTE_KIND
1629 && to_kind == PyUnicode_4BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS2, Py_UCS4,
1633 PyUnicode_2BYTE_DATA(from) + from_start,
1634 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_4BYTE_DATA(to) + to_start
1636 );
1637 }
1638 else {
1639 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640
1641 if (!check_maxchar) {
1642 if (from_kind == PyUnicode_2BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS2, Py_UCS1,
1647 PyUnicode_2BYTE_DATA(from) + from_start,
1648 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_1BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS1,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_1BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else if (from_kind == PyUnicode_4BYTE_KIND
1663 && to_kind == PyUnicode_2BYTE_KIND)
1664 {
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS4, Py_UCS2,
1667 PyUnicode_4BYTE_DATA(from) + from_start,
1668 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669 PyUnicode_2BYTE_DATA(to) + to_start
1670 );
1671 }
1672 else {
1673 Py_UNREACHABLE();
1674 }
1675 }
1676 else {
1677 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1678 Py_UCS4 ch;
1679 Py_ssize_t i;
1680
1681 for (i=0; i < how_many; i++) {
1682 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1683 if (ch > to_maxchar)
1684 return -1;
1685 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686 }
1687 }
1688 }
1689 return 0;
1690 }
1691
1692 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1693 _PyUnicode_FastCopyCharacters(
1694 PyObject *to, Py_ssize_t to_start,
1695 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1696 {
1697 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698 }
1699
1700 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1701 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702 PyObject *from, Py_ssize_t from_start,
1703 Py_ssize_t how_many)
1704 {
1705 int err;
1706
1707 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708 PyErr_BadInternalCall();
1709 return -1;
1710 }
1711
1712 if (PyUnicode_READY(from) == -1)
1713 return -1;
1714 if (PyUnicode_READY(to) == -1)
1715 return -1;
1716
1717 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
1721 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1722 PyErr_SetString(PyExc_IndexError, "string index out of range");
1723 return -1;
1724 }
1725 if (how_many < 0) {
1726 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727 return -1;
1728 }
1729 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1730 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731 PyErr_Format(PyExc_SystemError,
1732 "Cannot write %zi characters at %zi "
1733 "in a string of %zi characters",
1734 how_many, to_start, PyUnicode_GET_LENGTH(to));
1735 return -1;
1736 }
1737
1738 if (how_many == 0)
1739 return 0;
1740
1741 if (unicode_check_modifiable(to))
1742 return -1;
1743
1744 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745 if (err) {
1746 PyErr_Format(PyExc_SystemError,
1747 "Cannot copy %s characters "
1748 "into a string of %s characters",
1749 unicode_kind_name(from),
1750 unicode_kind_name(to));
1751 return -1;
1752 }
1753 return how_many;
1754 }
1755
1756 /* Find the maximum code point and count the number of surrogate pairs so a
1757 correct string length can be computed before converting a string to UCS4.
1758 This function counts single surrogates as a character and not as a pair.
1759
1760 Return 0 on success, or -1 on error. */
1761 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1762 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1764 {
1765 const wchar_t *iter;
1766 Py_UCS4 ch;
1767
1768 assert(num_surrogates != NULL && maxchar != NULL);
1769 *num_surrogates = 0;
1770 *maxchar = 0;
1771
1772 for (iter = begin; iter < end; ) {
1773 #if SIZEOF_WCHAR_T == 2
1774 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775 && (iter+1) < end
1776 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777 {
1778 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779 ++(*num_surrogates);
1780 iter += 2;
1781 }
1782 else
1783 #endif
1784 {
1785 ch = *iter;
1786 iter++;
1787 }
1788 if (ch > *maxchar) {
1789 *maxchar = ch;
1790 if (*maxchar > MAX_UNICODE) {
1791 PyErr_Format(PyExc_ValueError,
1792 "character U+%x is not in range [U+0000; U+10ffff]",
1793 ch);
1794 return -1;
1795 }
1796 }
1797 }
1798 return 0;
1799 }
1800
1801 int
_PyUnicode_Ready(PyObject * unicode)1802 _PyUnicode_Ready(PyObject *unicode)
1803 {
1804 wchar_t *end;
1805 Py_UCS4 maxchar = 0;
1806 Py_ssize_t num_surrogates;
1807 #if SIZEOF_WCHAR_T == 2
1808 Py_ssize_t length_wo_surrogates;
1809 #endif
1810
1811 /* _PyUnicode_Ready() is only intended for old-style API usage where
1812 strings were created using _PyObject_New() and where no canonical
1813 representation (the str field) has been set yet aka strings
1814 which are not yet ready. */
1815 assert(_PyUnicode_CHECK(unicode));
1816 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1817 assert(_PyUnicode_WSTR(unicode) != NULL);
1818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1819 assert(_PyUnicode_UTF8(unicode) == NULL);
1820 /* Actually, it should neither be interned nor be anything else: */
1821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1822
1823 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1824 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1825 &maxchar, &num_surrogates) == -1)
1826 return -1;
1827
1828 if (maxchar < 256) {
1829 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1830 if (!_PyUnicode_DATA_ANY(unicode)) {
1831 PyErr_NoMemory();
1832 return -1;
1833 }
1834 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1835 _PyUnicode_WSTR(unicode), end,
1836 PyUnicode_1BYTE_DATA(unicode));
1837 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840 if (maxchar < 128) {
1841 _PyUnicode_STATE(unicode).ascii = 1;
1842 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1843 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1844 }
1845 else {
1846 _PyUnicode_STATE(unicode).ascii = 0;
1847 _PyUnicode_UTF8(unicode) = NULL;
1848 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1849 }
1850 PyObject_FREE(_PyUnicode_WSTR(unicode));
1851 _PyUnicode_WSTR(unicode) = NULL;
1852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853 }
1854 /* In this case we might have to convert down from 4-byte native
1855 wchar_t to 2-byte unicode. */
1856 else if (maxchar < 65536) {
1857 assert(num_surrogates == 0 &&
1858 "FindMaxCharAndNumSurrogatePairs() messed up");
1859
1860 #if SIZEOF_WCHAR_T == 2
1861 /* We can share representations and are done. */
1862 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1863 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1866 _PyUnicode_UTF8(unicode) = NULL;
1867 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1868 #else
1869 /* sizeof(wchar_t) == 4 */
1870 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1871 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1872 if (!_PyUnicode_DATA_ANY(unicode)) {
1873 PyErr_NoMemory();
1874 return -1;
1875 }
1876 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877 _PyUnicode_WSTR(unicode), end,
1878 PyUnicode_2BYTE_DATA(unicode));
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1884 PyObject_FREE(_PyUnicode_WSTR(unicode));
1885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887 #endif
1888 }
1889 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1890 else {
1891 #if SIZEOF_WCHAR_T == 2
1892 /* in case the native representation is 2-bytes, we need to allocate a
1893 new normalized 4-byte version. */
1894 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1895 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896 PyErr_NoMemory();
1897 return -1;
1898 }
1899 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1900 if (!_PyUnicode_DATA_ANY(unicode)) {
1901 PyErr_NoMemory();
1902 return -1;
1903 }
1904 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1906 _PyUnicode_UTF8(unicode) = NULL;
1907 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1908 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909 _PyUnicode_STATE(unicode).ready = 1;
1910 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1911 PyObject_FREE(_PyUnicode_WSTR(unicode));
1912 _PyUnicode_WSTR(unicode) = NULL;
1913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914 #else
1915 assert(num_surrogates == 0);
1916
1917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922 #endif
1923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924 }
1925 _PyUnicode_STATE(unicode).ready = 1;
1926 assert(_PyUnicode_CheckConsistency(unicode, 1));
1927 return 0;
1928 }
1929
1930 static void
unicode_dealloc(PyObject * unicode)1931 unicode_dealloc(PyObject *unicode)
1932 {
1933 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1934 case SSTATE_NOT_INTERNED:
1935 break;
1936
1937 case SSTATE_INTERNED_MORTAL:
1938 /* revive dead object temporarily for DelItem */
1939 Py_SET_REFCNT(unicode, 3);
1940 #ifdef INTERNED_STRINGS
1941 if (PyDict_DelItem(interned, unicode) != 0) {
1942 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1943 NULL);
1944 }
1945 #endif
1946 break;
1947
1948 case SSTATE_INTERNED_IMMORTAL:
1949 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1950 break;
1951
1952 default:
1953 Py_UNREACHABLE();
1954 }
1955
1956 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1957 PyObject_DEL(_PyUnicode_WSTR(unicode));
1958 }
1959 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1960 PyObject_DEL(_PyUnicode_UTF8(unicode));
1961 }
1962 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1963 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1964 }
1965
1966 Py_TYPE(unicode)->tp_free(unicode);
1967 }
1968
1969 #ifdef Py_DEBUG
1970 static int
unicode_is_singleton(PyObject * unicode)1971 unicode_is_singleton(PyObject *unicode)
1972 {
1973 if (unicode == unicode_empty) {
1974 return 1;
1975 }
1976 #ifdef LATIN1_SINGLETONS
1977 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1978 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1979 {
1980 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1981 if (ch < 256 && unicode_latin1[ch] == unicode)
1982 return 1;
1983 }
1984 #endif
1985 return 0;
1986 }
1987 #endif
1988
1989 static int
unicode_modifiable(PyObject * unicode)1990 unicode_modifiable(PyObject *unicode)
1991 {
1992 assert(_PyUnicode_CHECK(unicode));
1993 if (Py_REFCNT(unicode) != 1)
1994 return 0;
1995 if (_PyUnicode_HASH(unicode) != -1)
1996 return 0;
1997 if (PyUnicode_CHECK_INTERNED(unicode))
1998 return 0;
1999 if (!PyUnicode_CheckExact(unicode))
2000 return 0;
2001 #ifdef Py_DEBUG
2002 /* singleton refcount is greater than 1 */
2003 assert(!unicode_is_singleton(unicode));
2004 #endif
2005 return 1;
2006 }
2007
2008 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)2009 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2010 {
2011 PyObject *unicode;
2012 Py_ssize_t old_length;
2013
2014 assert(p_unicode != NULL);
2015 unicode = *p_unicode;
2016
2017 assert(unicode != NULL);
2018 assert(PyUnicode_Check(unicode));
2019 assert(0 <= length);
2020
2021 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2022 old_length = PyUnicode_WSTR_LENGTH(unicode);
2023 else
2024 old_length = PyUnicode_GET_LENGTH(unicode);
2025 if (old_length == length)
2026 return 0;
2027
2028 if (length == 0) {
2029 _Py_INCREF_UNICODE_EMPTY();
2030 if (!unicode_empty)
2031 return -1;
2032 Py_SETREF(*p_unicode, unicode_empty);
2033 return 0;
2034 }
2035
2036 if (!unicode_modifiable(unicode)) {
2037 PyObject *copy = resize_copy(unicode, length);
2038 if (copy == NULL)
2039 return -1;
2040 Py_SETREF(*p_unicode, copy);
2041 return 0;
2042 }
2043
2044 if (PyUnicode_IS_COMPACT(unicode)) {
2045 PyObject *new_unicode = resize_compact(unicode, length);
2046 if (new_unicode == NULL)
2047 return -1;
2048 *p_unicode = new_unicode;
2049 return 0;
2050 }
2051 return resize_inplace(unicode, length);
2052 }
2053
2054 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2055 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2056 {
2057 PyObject *unicode;
2058 if (p_unicode == NULL) {
2059 PyErr_BadInternalCall();
2060 return -1;
2061 }
2062 unicode = *p_unicode;
2063 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2064 {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 return unicode_resize(p_unicode, length);
2069 }
2070
2071 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2072
2073 WARNING: The function doesn't copy the terminating null character and
2074 doesn't check the maximum character (may write a latin1 character in an
2075 ASCII string). */
2076 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2077 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2078 const char *str, Py_ssize_t len)
2079 {
2080 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2081 const void *data = PyUnicode_DATA(unicode);
2082 const char *end = str + len;
2083
2084 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2085 switch (kind) {
2086 case PyUnicode_1BYTE_KIND: {
2087 #ifdef Py_DEBUG
2088 if (PyUnicode_IS_ASCII(unicode)) {
2089 Py_UCS4 maxchar = ucs1lib_find_max_char(
2090 (const Py_UCS1*)str,
2091 (const Py_UCS1*)str + len);
2092 assert(maxchar < 128);
2093 }
2094 #endif
2095 memcpy((char *) data + index, str, len);
2096 break;
2097 }
2098 case PyUnicode_2BYTE_KIND: {
2099 Py_UCS2 *start = (Py_UCS2 *)data + index;
2100 Py_UCS2 *ucs2 = start;
2101
2102 for (; str < end; ++ucs2, ++str)
2103 *ucs2 = (Py_UCS2)*str;
2104
2105 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2106 break;
2107 }
2108 case PyUnicode_4BYTE_KIND: {
2109 Py_UCS4 *start = (Py_UCS4 *)data + index;
2110 Py_UCS4 *ucs4 = start;
2111
2112 for (; str < end; ++ucs4, ++str)
2113 *ucs4 = (Py_UCS4)*str;
2114
2115 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2116 break;
2117 }
2118 default:
2119 Py_UNREACHABLE();
2120 }
2121 }
2122
2123 static PyObject*
get_latin1_char(unsigned char ch)2124 get_latin1_char(unsigned char ch)
2125 {
2126 PyObject *unicode;
2127
2128 #ifdef LATIN1_SINGLETONS
2129 unicode = unicode_latin1[ch];
2130 if (unicode) {
2131 Py_INCREF(unicode);
2132 return unicode;
2133 }
2134 #endif
2135
2136 unicode = PyUnicode_New(1, ch);
2137 if (!unicode) {
2138 return NULL;
2139 }
2140
2141 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2142 assert(_PyUnicode_CheckConsistency(unicode, 1));
2143
2144 #ifdef LATIN1_SINGLETONS
2145 Py_INCREF(unicode);
2146 unicode_latin1[ch] = unicode;
2147 #endif
2148 return unicode;
2149 }
2150
2151 static PyObject*
unicode_char(Py_UCS4 ch)2152 unicode_char(Py_UCS4 ch)
2153 {
2154 PyObject *unicode;
2155
2156 assert(ch <= MAX_UNICODE);
2157
2158 if (ch < 256)
2159 return get_latin1_char(ch);
2160
2161 unicode = PyUnicode_New(1, ch);
2162 if (unicode == NULL)
2163 return NULL;
2164
2165 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2166 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2167 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2168 } else {
2169 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2170 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2171 }
2172 assert(_PyUnicode_CheckConsistency(unicode, 1));
2173 return unicode;
2174 }
2175
2176 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2177 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2178 {
2179 if (u == NULL)
2180 return (PyObject*)_PyUnicode_New(size);
2181
2182 if (size < 0) {
2183 PyErr_BadInternalCall();
2184 return NULL;
2185 }
2186
2187 return PyUnicode_FromWideChar(u, size);
2188 }
2189
2190 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2191 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2192 {
2193 PyObject *unicode;
2194 Py_UCS4 maxchar = 0;
2195 Py_ssize_t num_surrogates;
2196
2197 if (u == NULL && size != 0) {
2198 PyErr_BadInternalCall();
2199 return NULL;
2200 }
2201
2202 if (size == -1) {
2203 size = wcslen(u);
2204 }
2205
2206 /* If the Unicode data is known at construction time, we can apply
2207 some optimizations which share commonly used objects. */
2208
2209 /* Optimization for empty strings */
2210 if (size == 0)
2211 _Py_RETURN_UNICODE_EMPTY();
2212
2213 /* Single character Unicode objects in the Latin-1 range are
2214 shared when using this constructor */
2215 if (size == 1 && (Py_UCS4)*u < 256)
2216 return get_latin1_char((unsigned char)*u);
2217
2218 /* If not empty and not single character, copy the Unicode data
2219 into the new object */
2220 if (find_maxchar_surrogates(u, u + size,
2221 &maxchar, &num_surrogates) == -1)
2222 return NULL;
2223
2224 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2225 if (!unicode)
2226 return NULL;
2227
2228 switch (PyUnicode_KIND(unicode)) {
2229 case PyUnicode_1BYTE_KIND:
2230 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2231 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2232 break;
2233 case PyUnicode_2BYTE_KIND:
2234 #if Py_UNICODE_SIZE == 2
2235 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2236 #else
2237 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2238 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2239 #endif
2240 break;
2241 case PyUnicode_4BYTE_KIND:
2242 #if SIZEOF_WCHAR_T == 2
2243 /* This is the only case which has to process surrogates, thus
2244 a simple copy loop is not enough and we need a function. */
2245 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2246 #else
2247 assert(num_surrogates == 0);
2248 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2249 #endif
2250 break;
2251 default:
2252 Py_UNREACHABLE();
2253 }
2254
2255 return unicode_result(unicode);
2256 }
2257
2258 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2259 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2260 {
2261 if (size < 0) {
2262 PyErr_SetString(PyExc_SystemError,
2263 "Negative size passed to PyUnicode_FromStringAndSize");
2264 return NULL;
2265 }
2266 if (u != NULL)
2267 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2268 else
2269 return (PyObject *)_PyUnicode_New(size);
2270 }
2271
2272 PyObject *
PyUnicode_FromString(const char * u)2273 PyUnicode_FromString(const char *u)
2274 {
2275 size_t size = strlen(u);
2276 if (size > PY_SSIZE_T_MAX) {
2277 PyErr_SetString(PyExc_OverflowError, "input too long");
2278 return NULL;
2279 }
2280 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2281 }
2282
2283 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2284 _PyUnicode_FromId(_Py_Identifier *id)
2285 {
2286 if (!id->object) {
2287 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2288 strlen(id->string),
2289 NULL, NULL);
2290 if (!id->object)
2291 return NULL;
2292 PyUnicode_InternInPlace(&id->object);
2293 assert(!id->next);
2294 id->next = static_strings;
2295 static_strings = id;
2296 }
2297 return id->object;
2298 }
2299
2300 static void
unicode_clear_static_strings(void)2301 unicode_clear_static_strings(void)
2302 {
2303 _Py_Identifier *tmp, *s = static_strings;
2304 while (s) {
2305 Py_CLEAR(s->object);
2306 tmp = s->next;
2307 s->next = NULL;
2308 s = tmp;
2309 }
2310 static_strings = NULL;
2311 }
2312
2313 /* Internal function, doesn't check maximum character */
2314
2315 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2316 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2317 {
2318 const unsigned char *s = (const unsigned char *)buffer;
2319 PyObject *unicode;
2320 if (size == 1) {
2321 #ifdef Py_DEBUG
2322 assert((unsigned char)s[0] < 128);
2323 #endif
2324 return get_latin1_char(s[0]);
2325 }
2326 unicode = PyUnicode_New(size, 127);
2327 if (!unicode)
2328 return NULL;
2329 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2330 assert(_PyUnicode_CheckConsistency(unicode, 1));
2331 return unicode;
2332 }
2333
2334 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2335 kind_maxchar_limit(unsigned int kind)
2336 {
2337 switch (kind) {
2338 case PyUnicode_1BYTE_KIND:
2339 return 0x80;
2340 case PyUnicode_2BYTE_KIND:
2341 return 0x100;
2342 case PyUnicode_4BYTE_KIND:
2343 return 0x10000;
2344 default:
2345 Py_UNREACHABLE();
2346 }
2347 }
2348
2349 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2350 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2351 {
2352 PyObject *res;
2353 unsigned char max_char;
2354
2355 if (size == 0)
2356 _Py_RETURN_UNICODE_EMPTY();
2357 assert(size > 0);
2358 if (size == 1)
2359 return get_latin1_char(u[0]);
2360
2361 max_char = ucs1lib_find_max_char(u, u + size);
2362 res = PyUnicode_New(size, max_char);
2363 if (!res)
2364 return NULL;
2365 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2366 assert(_PyUnicode_CheckConsistency(res, 1));
2367 return res;
2368 }
2369
2370 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2371 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2372 {
2373 PyObject *res;
2374 Py_UCS2 max_char;
2375
2376 if (size == 0)
2377 _Py_RETURN_UNICODE_EMPTY();
2378 assert(size > 0);
2379 if (size == 1)
2380 return unicode_char(u[0]);
2381
2382 max_char = ucs2lib_find_max_char(u, u + size);
2383 res = PyUnicode_New(size, max_char);
2384 if (!res)
2385 return NULL;
2386 if (max_char >= 256)
2387 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2388 else {
2389 _PyUnicode_CONVERT_BYTES(
2390 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2391 }
2392 assert(_PyUnicode_CheckConsistency(res, 1));
2393 return res;
2394 }
2395
2396 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2397 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2398 {
2399 PyObject *res;
2400 Py_UCS4 max_char;
2401
2402 if (size == 0)
2403 _Py_RETURN_UNICODE_EMPTY();
2404 assert(size > 0);
2405 if (size == 1)
2406 return unicode_char(u[0]);
2407
2408 max_char = ucs4lib_find_max_char(u, u + size);
2409 res = PyUnicode_New(size, max_char);
2410 if (!res)
2411 return NULL;
2412 if (max_char < 256)
2413 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2414 PyUnicode_1BYTE_DATA(res));
2415 else if (max_char < 0x10000)
2416 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2417 PyUnicode_2BYTE_DATA(res));
2418 else
2419 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2420 assert(_PyUnicode_CheckConsistency(res, 1));
2421 return res;
2422 }
2423
2424 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2425 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2426 {
2427 if (size < 0) {
2428 PyErr_SetString(PyExc_ValueError, "size must be positive");
2429 return NULL;
2430 }
2431 switch (kind) {
2432 case PyUnicode_1BYTE_KIND:
2433 return _PyUnicode_FromUCS1(buffer, size);
2434 case PyUnicode_2BYTE_KIND:
2435 return _PyUnicode_FromUCS2(buffer, size);
2436 case PyUnicode_4BYTE_KIND:
2437 return _PyUnicode_FromUCS4(buffer, size);
2438 default:
2439 PyErr_SetString(PyExc_SystemError, "invalid kind");
2440 return NULL;
2441 }
2442 }
2443
2444 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2445 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2446 {
2447 enum PyUnicode_Kind kind;
2448 const void *startptr, *endptr;
2449
2450 assert(PyUnicode_IS_READY(unicode));
2451 assert(0 <= start);
2452 assert(end <= PyUnicode_GET_LENGTH(unicode));
2453 assert(start <= end);
2454
2455 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2456 return PyUnicode_MAX_CHAR_VALUE(unicode);
2457
2458 if (start == end)
2459 return 127;
2460
2461 if (PyUnicode_IS_ASCII(unicode))
2462 return 127;
2463
2464 kind = PyUnicode_KIND(unicode);
2465 startptr = PyUnicode_DATA(unicode);
2466 endptr = (char *)startptr + end * kind;
2467 startptr = (char *)startptr + start * kind;
2468 switch(kind) {
2469 case PyUnicode_1BYTE_KIND:
2470 return ucs1lib_find_max_char(startptr, endptr);
2471 case PyUnicode_2BYTE_KIND:
2472 return ucs2lib_find_max_char(startptr, endptr);
2473 case PyUnicode_4BYTE_KIND:
2474 return ucs4lib_find_max_char(startptr, endptr);
2475 default:
2476 Py_UNREACHABLE();
2477 }
2478 }
2479
2480 /* Ensure that a string uses the most efficient storage, if it is not the
2481 case: create a new string with of the right kind. Write NULL into *p_unicode
2482 on error. */
2483 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2484 unicode_adjust_maxchar(PyObject **p_unicode)
2485 {
2486 PyObject *unicode, *copy;
2487 Py_UCS4 max_char;
2488 Py_ssize_t len;
2489 unsigned int kind;
2490
2491 assert(p_unicode != NULL);
2492 unicode = *p_unicode;
2493 assert(PyUnicode_IS_READY(unicode));
2494 if (PyUnicode_IS_ASCII(unicode))
2495 return;
2496
2497 len = PyUnicode_GET_LENGTH(unicode);
2498 kind = PyUnicode_KIND(unicode);
2499 if (kind == PyUnicode_1BYTE_KIND) {
2500 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2501 max_char = ucs1lib_find_max_char(u, u + len);
2502 if (max_char >= 128)
2503 return;
2504 }
2505 else if (kind == PyUnicode_2BYTE_KIND) {
2506 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2507 max_char = ucs2lib_find_max_char(u, u + len);
2508 if (max_char >= 256)
2509 return;
2510 }
2511 else if (kind == PyUnicode_4BYTE_KIND) {
2512 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2513 max_char = ucs4lib_find_max_char(u, u + len);
2514 if (max_char >= 0x10000)
2515 return;
2516 }
2517 else
2518 Py_UNREACHABLE();
2519
2520 copy = PyUnicode_New(len, max_char);
2521 if (copy != NULL)
2522 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2523 Py_DECREF(unicode);
2524 *p_unicode = copy;
2525 }
2526
2527 PyObject*
_PyUnicode_Copy(PyObject * unicode)2528 _PyUnicode_Copy(PyObject *unicode)
2529 {
2530 Py_ssize_t length;
2531 PyObject *copy;
2532
2533 if (!PyUnicode_Check(unicode)) {
2534 PyErr_BadInternalCall();
2535 return NULL;
2536 }
2537 if (PyUnicode_READY(unicode) == -1)
2538 return NULL;
2539
2540 length = PyUnicode_GET_LENGTH(unicode);
2541 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2542 if (!copy)
2543 return NULL;
2544 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2545
2546 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2547 length * PyUnicode_KIND(unicode));
2548 assert(_PyUnicode_CheckConsistency(copy, 1));
2549 return copy;
2550 }
2551
2552
2553 /* Widen Unicode objects to larger buffers. Don't write terminating null
2554 character. Return NULL on error. */
2555
2556 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2557 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2558 {
2559 void *result;
2560
2561 assert(skind < kind);
2562 switch (kind) {
2563 case PyUnicode_2BYTE_KIND:
2564 result = PyMem_New(Py_UCS2, len);
2565 if (!result)
2566 return PyErr_NoMemory();
2567 assert(skind == PyUnicode_1BYTE_KIND);
2568 _PyUnicode_CONVERT_BYTES(
2569 Py_UCS1, Py_UCS2,
2570 (const Py_UCS1 *)data,
2571 ((const Py_UCS1 *)data) + len,
2572 result);
2573 return result;
2574 case PyUnicode_4BYTE_KIND:
2575 result = PyMem_New(Py_UCS4, len);
2576 if (!result)
2577 return PyErr_NoMemory();
2578 if (skind == PyUnicode_2BYTE_KIND) {
2579 _PyUnicode_CONVERT_BYTES(
2580 Py_UCS2, Py_UCS4,
2581 (const Py_UCS2 *)data,
2582 ((const Py_UCS2 *)data) + len,
2583 result);
2584 }
2585 else {
2586 assert(skind == PyUnicode_1BYTE_KIND);
2587 _PyUnicode_CONVERT_BYTES(
2588 Py_UCS1, Py_UCS4,
2589 (const Py_UCS1 *)data,
2590 ((const Py_UCS1 *)data) + len,
2591 result);
2592 }
2593 return result;
2594 default:
2595 Py_UNREACHABLE();
2596 return NULL;
2597 }
2598 }
2599
2600 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2601 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2602 int copy_null)
2603 {
2604 int kind;
2605 const void *data;
2606 Py_ssize_t len, targetlen;
2607 if (PyUnicode_READY(string) == -1)
2608 return NULL;
2609 kind = PyUnicode_KIND(string);
2610 data = PyUnicode_DATA(string);
2611 len = PyUnicode_GET_LENGTH(string);
2612 targetlen = len;
2613 if (copy_null)
2614 targetlen++;
2615 if (!target) {
2616 target = PyMem_New(Py_UCS4, targetlen);
2617 if (!target) {
2618 PyErr_NoMemory();
2619 return NULL;
2620 }
2621 }
2622 else {
2623 if (targetsize < targetlen) {
2624 PyErr_Format(PyExc_SystemError,
2625 "string is longer than the buffer");
2626 if (copy_null && 0 < targetsize)
2627 target[0] = 0;
2628 return NULL;
2629 }
2630 }
2631 if (kind == PyUnicode_1BYTE_KIND) {
2632 const Py_UCS1 *start = (const Py_UCS1 *) data;
2633 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2634 }
2635 else if (kind == PyUnicode_2BYTE_KIND) {
2636 const Py_UCS2 *start = (const Py_UCS2 *) data;
2637 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2638 }
2639 else if (kind == PyUnicode_4BYTE_KIND) {
2640 memcpy(target, data, len * sizeof(Py_UCS4));
2641 }
2642 else {
2643 Py_UNREACHABLE();
2644 }
2645 if (copy_null)
2646 target[len] = 0;
2647 return target;
2648 }
2649
2650 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2651 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2652 int copy_null)
2653 {
2654 if (target == NULL || targetsize < 0) {
2655 PyErr_BadInternalCall();
2656 return NULL;
2657 }
2658 return as_ucs4(string, target, targetsize, copy_null);
2659 }
2660
2661 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2662 PyUnicode_AsUCS4Copy(PyObject *string)
2663 {
2664 return as_ucs4(string, NULL, 0, 1);
2665 }
2666
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).

   In integer arithmetic, 1 + (SIZEOF_LONG_LONG*53-1) / 22 equals
   ceil(SIZEOF_LONG_LONG*53/22), i.e. the digit bound; the remaining 1 of
   the leading 2 is the sign slot mentioned above.  The value is used to size
   the stack buffers that unicode_fromformat_arg() passes to sprintf(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2671
2672 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2673 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2674 Py_ssize_t width, Py_ssize_t precision)
2675 {
2676 Py_ssize_t length, fill, arglen;
2677 Py_UCS4 maxchar;
2678
2679 if (PyUnicode_READY(str) == -1)
2680 return -1;
2681
2682 length = PyUnicode_GET_LENGTH(str);
2683 if ((precision == -1 || precision >= length)
2684 && width <= length)
2685 return _PyUnicodeWriter_WriteStr(writer, str);
2686
2687 if (precision != -1)
2688 length = Py_MIN(precision, length);
2689
2690 arglen = Py_MAX(length, width);
2691 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2692 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2693 else
2694 maxchar = writer->maxchar;
2695
2696 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2697 return -1;
2698
2699 if (width > length) {
2700 fill = width - length;
2701 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2702 return -1;
2703 writer->pos += fill;
2704 }
2705
2706 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2707 str, 0, length);
2708 writer->pos += length;
2709 return 0;
2710 }
2711
2712 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2713 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2714 Py_ssize_t width, Py_ssize_t precision)
2715 {
2716 /* UTF-8 */
2717 Py_ssize_t length;
2718 PyObject *unicode;
2719 int res;
2720
2721 if (precision == -1) {
2722 length = strlen(str);
2723 }
2724 else {
2725 length = 0;
2726 while (length < precision && str[length]) {
2727 length++;
2728 }
2729 }
2730 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2731 if (unicode == NULL)
2732 return -1;
2733
2734 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2735 Py_DECREF(unicode);
2736 return res;
2737 }
2738
/* Process one conversion directive of a PyUnicode_FromFormatV() format.

   `f` points at the '%' that introduces the directive.  The optional '0'
   flag, width, ".precision" and the l / ll / z length modifiers are parsed,
   the matching argument(s) are fetched from *vargs, and the formatted text
   is appended to `writer`.

   Returns a pointer to the first character after the directive, or NULL
   with an exception set on error.  For an unknown conversion character the
   remainder of the format string, starting at the '%', is copied to the
   output and a pointer to its terminating null character is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts: the default case below copies
       the format verbatim from here. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A length modifier is only consumed when it is directly followed by
       'd', 'u' or 'i'; otherwise f is left on the modifier character. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format: turn
       off the writer's overallocation for this final write. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* Single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Let sprintf() produce the bare digits (no width or precision);
           padding is applied below.  The length flags select the C type
           read from the argument list. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* The length modifiers above are only recognised before d, u
               and i, so %x always reads a plain int. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is at least the number of characters
           sprintf() produced (this also covers precision == -1). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Left-pad up to the field width, with '0' if the '0' flag was
           given and with spaces otherwise. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Then pad with zeros up to the requested precision. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (including its null
               terminator) right by two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL)
           and a C string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3036
3037 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3038 PyUnicode_FromFormatV(const char *format, va_list vargs)
3039 {
3040 va_list vargs2;
3041 const char *f;
3042 _PyUnicodeWriter writer;
3043
3044 _PyUnicodeWriter_Init(&writer);
3045 writer.min_length = strlen(format) + 100;
3046 writer.overallocate = 1;
3047
3048 // Copy varags to be able to pass a reference to a subfunction.
3049 va_copy(vargs2, vargs);
3050
3051 for (f = format; *f; ) {
3052 if (*f == '%') {
3053 f = unicode_fromformat_arg(&writer, f, &vargs2);
3054 if (f == NULL)
3055 goto fail;
3056 }
3057 else {
3058 const char *p;
3059 Py_ssize_t len;
3060
3061 p = f;
3062 do
3063 {
3064 if ((unsigned char)*p > 127) {
3065 PyErr_Format(PyExc_ValueError,
3066 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3067 "string, got a non-ASCII byte: 0x%02x",
3068 (unsigned char)*p);
3069 goto fail;
3070 }
3071 p++;
3072 }
3073 while (*p != '\0' && *p != '%');
3074 len = p - f;
3075
3076 if (*p == '\0')
3077 writer.overallocate = 0;
3078
3079 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3080 goto fail;
3081
3082 f = p;
3083 }
3084 }
3085 va_end(vargs2);
3086 return _PyUnicodeWriter_Finish(&writer);
3087
3088 fail:
3089 va_end(vargs2);
3090 _PyUnicodeWriter_Dealloc(&writer);
3091 return NULL;
3092 }
3093
3094 PyObject *
PyUnicode_FromFormat(const char * format,...)3095 PyUnicode_FromFormat(const char *format, ...)
3096 {
3097 PyObject* ret;
3098 va_list vargs;
3099
3100 #ifdef HAVE_STDARG_PROTOTYPES
3101 va_start(vargs, format);
3102 #else
3103 va_start(vargs);
3104 #endif
3105 ret = PyUnicode_FromFormatV(format, vargs);
3106 va_end(vargs);
3107 return ret;
3108 }
3109
3110 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3111 unicode_get_widechar_size(PyObject *unicode)
3112 {
3113 Py_ssize_t res;
3114
3115 assert(unicode != NULL);
3116 assert(_PyUnicode_CHECK(unicode));
3117
3118 if (_PyUnicode_WSTR(unicode) != NULL) {
3119 return PyUnicode_WSTR_LENGTH(unicode);
3120 }
3121 assert(PyUnicode_IS_READY(unicode));
3122
3123 res = _PyUnicode_LENGTH(unicode);
3124 #if SIZEOF_WCHAR_T == 2
3125 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3126 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3127 const Py_UCS4 *end = s + res;
3128 for (; s < end; ++s) {
3129 if (*s > 0xFFFF) {
3130 ++res;
3131 }
3132 }
3133 }
3134 #endif
3135 return res;
3136 }
3137
3138 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3139 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3140 {
3141 const wchar_t *wstr;
3142
3143 assert(unicode != NULL);
3144 assert(_PyUnicode_CHECK(unicode));
3145
3146 wstr = _PyUnicode_WSTR(unicode);
3147 if (wstr != NULL) {
3148 memcpy(w, wstr, size * sizeof(wchar_t));
3149 return;
3150 }
3151 assert(PyUnicode_IS_READY(unicode));
3152
3153 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3154 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3155 for (; size--; ++s, ++w) {
3156 *w = *s;
3157 }
3158 }
3159 else {
3160 #if SIZEOF_WCHAR_T == 4
3161 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3162 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3163 for (; size--; ++s, ++w) {
3164 *w = *s;
3165 }
3166 #else
3167 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3168 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3169 for (; size--; ++s, ++w) {
3170 Py_UCS4 ch = *s;
3171 if (ch > 0xFFFF) {
3172 assert(ch <= MAX_UNICODE);
3173 /* encode surrogate pair in this case */
3174 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3175 if (!size--)
3176 break;
3177 *w = Py_UNICODE_LOW_SURROGATE(ch);
3178 }
3179 else {
3180 *w = ch;
3181 }
3182 }
3183 #endif
3184 }
3185 }
3186
3187 #ifdef HAVE_WCHAR_H
3188
3189 /* Convert a Unicode object to a wide character string.
3190
3191 - If w is NULL: return the number of wide characters (including the null
3192 character) required to convert the unicode object. Ignore size argument.
3193
3194 - Otherwise: return the number of wide characters (excluding the null
3195 character) written into w. Write at most size wide characters (including
3196 the null character). */
3197 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3198 PyUnicode_AsWideChar(PyObject *unicode,
3199 wchar_t *w,
3200 Py_ssize_t size)
3201 {
3202 Py_ssize_t res;
3203
3204 if (unicode == NULL) {
3205 PyErr_BadInternalCall();
3206 return -1;
3207 }
3208 if (!PyUnicode_Check(unicode)) {
3209 PyErr_BadArgument();
3210 return -1;
3211 }
3212
3213 res = unicode_get_widechar_size(unicode);
3214 if (w == NULL) {
3215 return res + 1;
3216 }
3217
3218 if (size > res) {
3219 size = res + 1;
3220 }
3221 else {
3222 res = size;
3223 }
3224 unicode_copy_as_widechar(unicode, w, size);
3225 return res;
3226 }
3227
3228 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3229 PyUnicode_AsWideCharString(PyObject *unicode,
3230 Py_ssize_t *size)
3231 {
3232 wchar_t *buffer;
3233 Py_ssize_t buflen;
3234
3235 if (unicode == NULL) {
3236 PyErr_BadInternalCall();
3237 return NULL;
3238 }
3239 if (!PyUnicode_Check(unicode)) {
3240 PyErr_BadArgument();
3241 return NULL;
3242 }
3243
3244 buflen = unicode_get_widechar_size(unicode);
3245 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3246 if (buffer == NULL) {
3247 PyErr_NoMemory();
3248 return NULL;
3249 }
3250 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3251 if (size != NULL) {
3252 *size = buflen;
3253 }
3254 else if (wcslen(buffer) != (size_t)buflen) {
3255 PyMem_FREE(buffer);
3256 PyErr_SetString(PyExc_ValueError,
3257 "embedded null character");
3258 return NULL;
3259 }
3260 return buffer;
3261 }
3262
3263 #endif /* HAVE_WCHAR_H */
3264
3265 PyObject *
PyUnicode_FromOrdinal(int ordinal)3266 PyUnicode_FromOrdinal(int ordinal)
3267 {
3268 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3269 PyErr_SetString(PyExc_ValueError,
3270 "chr() arg not in range(0x110000)");
3271 return NULL;
3272 }
3273
3274 return unicode_char((Py_UCS4)ordinal);
3275 }
3276
3277 PyObject *
PyUnicode_FromObject(PyObject * obj)3278 PyUnicode_FromObject(PyObject *obj)
3279 {
3280 /* XXX Perhaps we should make this API an alias of
3281 PyObject_Str() instead ?! */
3282 if (PyUnicode_CheckExact(obj)) {
3283 if (PyUnicode_READY(obj) == -1)
3284 return NULL;
3285 Py_INCREF(obj);
3286 return obj;
3287 }
3288 if (PyUnicode_Check(obj)) {
3289 /* For a Unicode subtype that's not a Unicode object,
3290 return a true Unicode object with the same data. */
3291 return _PyUnicode_Copy(obj);
3292 }
3293 PyErr_Format(PyExc_TypeError,
3294 "Can't convert '%.100s' object to str implicitly",
3295 Py_TYPE(obj)->tp_name);
3296 return NULL;
3297 }
3298
3299 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3300 PyUnicode_FromEncodedObject(PyObject *obj,
3301 const char *encoding,
3302 const char *errors)
3303 {
3304 Py_buffer buffer;
3305 PyObject *v;
3306
3307 if (obj == NULL) {
3308 PyErr_BadInternalCall();
3309 return NULL;
3310 }
3311
3312 /* Decoding bytes objects is the most common case and should be fast */
3313 if (PyBytes_Check(obj)) {
3314 if (PyBytes_GET_SIZE(obj) == 0) {
3315 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3316 return NULL;
3317 }
3318 _Py_RETURN_UNICODE_EMPTY();
3319 }
3320 return PyUnicode_Decode(
3321 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3322 encoding, errors);
3323 }
3324
3325 if (PyUnicode_Check(obj)) {
3326 PyErr_SetString(PyExc_TypeError,
3327 "decoding str is not supported");
3328 return NULL;
3329 }
3330
3331 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3332 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3333 PyErr_Format(PyExc_TypeError,
3334 "decoding to str: need a bytes-like object, %.80s found",
3335 Py_TYPE(obj)->tp_name);
3336 return NULL;
3337 }
3338
3339 if (buffer.len == 0) {
3340 PyBuffer_Release(&buffer);
3341 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3342 return NULL;
3343 }
3344 _Py_RETURN_UNICODE_EMPTY();
3345 }
3346
3347 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3348 PyBuffer_Release(&buffer);
3349 return v;
3350 }
3351
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_' when it sits between two copied
   characters, and is dropped when it leads or trails the name.  This is
   similar to encodings.normalize_encoding(), plus the lowercase conversion.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    /* Set while a run of separator characters is waiting to be emitted. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the deferred separator, except at the very start. */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3400
3401 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3402 PyUnicode_Decode(const char *s,
3403 Py_ssize_t size,
3404 const char *encoding,
3405 const char *errors)
3406 {
3407 PyObject *buffer = NULL, *unicode;
3408 Py_buffer info;
3409 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3410
3411 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3412 return NULL;
3413 }
3414
3415 if (size == 0) {
3416 _Py_RETURN_UNICODE_EMPTY();
3417 }
3418
3419 if (encoding == NULL) {
3420 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3421 }
3422
3423 /* Shortcuts for common default encodings */
3424 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3425 char *lower = buflower;
3426
3427 /* Fast paths */
3428 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3429 lower += 3;
3430 if (*lower == '_') {
3431 /* Match "utf8" and "utf_8" */
3432 lower++;
3433 }
3434
3435 if (lower[0] == '8' && lower[1] == 0) {
3436 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3437 }
3438 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3439 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3440 }
3441 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3442 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3443 }
3444 }
3445 else {
3446 if (strcmp(lower, "ascii") == 0
3447 || strcmp(lower, "us_ascii") == 0) {
3448 return PyUnicode_DecodeASCII(s, size, errors);
3449 }
3450 #ifdef MS_WINDOWS
3451 else if (strcmp(lower, "mbcs") == 0) {
3452 return PyUnicode_DecodeMBCS(s, size, errors);
3453 }
3454 #endif
3455 else if (strcmp(lower, "latin1") == 0
3456 || strcmp(lower, "latin_1") == 0
3457 || strcmp(lower, "iso_8859_1") == 0
3458 || strcmp(lower, "iso8859_1") == 0) {
3459 return PyUnicode_DecodeLatin1(s, size, errors);
3460 }
3461 }
3462 }
3463
3464 /* Decode via the codec registry */
3465 buffer = NULL;
3466 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3467 goto onError;
3468 buffer = PyMemoryView_FromBuffer(&info);
3469 if (buffer == NULL)
3470 goto onError;
3471 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3472 if (unicode == NULL)
3473 goto onError;
3474 if (!PyUnicode_Check(unicode)) {
3475 PyErr_Format(PyExc_TypeError,
3476 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3477 "use codecs.decode() to decode to arbitrary types",
3478 encoding,
3479 Py_TYPE(unicode)->tp_name);
3480 Py_DECREF(unicode);
3481 goto onError;
3482 }
3483 Py_DECREF(buffer);
3484 return unicode_result(unicode);
3485
3486 onError:
3487 Py_XDECREF(buffer);
3488 return NULL;
3489 }
3490
3491 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3492 PyUnicode_AsDecodedObject(PyObject *unicode,
3493 const char *encoding,
3494 const char *errors)
3495 {
3496 if (!PyUnicode_Check(unicode)) {
3497 PyErr_BadArgument();
3498 return NULL;
3499 }
3500
3501 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3502 "PyUnicode_AsDecodedObject() is deprecated; "
3503 "use PyCodec_Decode() to decode from str", 1) < 0)
3504 return NULL;
3505
3506 if (encoding == NULL)
3507 encoding = PyUnicode_GetDefaultEncoding();
3508
3509 /* Decode via the codec registry */
3510 return PyCodec_Decode(unicode, encoding, errors);
3511 }
3512
3513 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3514 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3515 const char *encoding,
3516 const char *errors)
3517 {
3518 PyObject *v;
3519
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 goto onError;
3523 }
3524
3525 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3526 "PyUnicode_AsDecodedUnicode() is deprecated; "
3527 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3528 return NULL;
3529
3530 if (encoding == NULL)
3531 encoding = PyUnicode_GetDefaultEncoding();
3532
3533 /* Decode via the codec registry */
3534 v = PyCodec_Decode(unicode, encoding, errors);
3535 if (v == NULL)
3536 goto onError;
3537 if (!PyUnicode_Check(v)) {
3538 PyErr_Format(PyExc_TypeError,
3539 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3540 "use codecs.decode() to decode to arbitrary types",
3541 encoding,
3542 Py_TYPE(unicode)->tp_name);
3543 Py_DECREF(v);
3544 goto onError;
3545 }
3546 return unicode_result(v);
3547
3548 onError:
3549 return NULL;
3550 }
3551
3552 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3553 PyUnicode_Encode(const Py_UNICODE *s,
3554 Py_ssize_t size,
3555 const char *encoding,
3556 const char *errors)
3557 {
3558 PyObject *v, *unicode;
3559
3560 unicode = PyUnicode_FromWideChar(s, size);
3561 if (unicode == NULL)
3562 return NULL;
3563 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3564 Py_DECREF(unicode);
3565 return v;
3566 }
3567
3568 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3569 PyUnicode_AsEncodedObject(PyObject *unicode,
3570 const char *encoding,
3571 const char *errors)
3572 {
3573 PyObject *v;
3574
3575 if (!PyUnicode_Check(unicode)) {
3576 PyErr_BadArgument();
3577 goto onError;
3578 }
3579
3580 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3581 "PyUnicode_AsEncodedObject() is deprecated; "
3582 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3583 "or PyCodec_Encode() for generic encoding", 1) < 0)
3584 return NULL;
3585
3586 if (encoding == NULL)
3587 encoding = PyUnicode_GetDefaultEncoding();
3588
3589 /* Encode via the codec registry */
3590 v = PyCodec_Encode(unicode, encoding, errors);
3591 if (v == NULL)
3592 goto onError;
3593 return v;
3594
3595 onError:
3596 return NULL;
3597 }
3598
3599
3600 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3601 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3602 int current_locale)
3603 {
3604 Py_ssize_t wlen;
3605 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3606 if (wstr == NULL) {
3607 return NULL;
3608 }
3609
3610 if ((size_t)wlen != wcslen(wstr)) {
3611 PyErr_SetString(PyExc_ValueError, "embedded null character");
3612 PyMem_Free(wstr);
3613 return NULL;
3614 }
3615
3616 char *str;
3617 size_t error_pos;
3618 const char *reason;
3619 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3620 current_locale, error_handler);
3621 PyMem_Free(wstr);
3622
3623 if (res != 0) {
3624 if (res == -2) {
3625 PyObject *exc;
3626 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3627 "locale", unicode,
3628 (Py_ssize_t)error_pos,
3629 (Py_ssize_t)(error_pos+1),
3630 reason);
3631 if (exc != NULL) {
3632 PyCodec_StrictErrors(exc);
3633 Py_DECREF(exc);
3634 }
3635 }
3636 else if (res == -3) {
3637 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3638 }
3639 else {
3640 PyErr_NoMemory();
3641 }
3642 return NULL;
3643 }
3644
3645 PyObject *bytes = PyBytes_FromString(str);
3646 PyMem_RawFree(str);
3647 return bytes;
3648 }
3649
3650 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3651 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3652 {
3653 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3654 return unicode_encode_locale(unicode, error_handler, 1);
3655 }
3656
3657 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3658 PyUnicode_EncodeFSDefault(PyObject *unicode)
3659 {
3660 PyInterpreterState *interp = _PyInterpreterState_GET();
3661 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3662 if (fs_codec->utf8) {
3663 return unicode_encode_utf8(unicode,
3664 fs_codec->error_handler,
3665 fs_codec->errors);
3666 }
3667 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3668 else if (fs_codec->encoding) {
3669 return PyUnicode_AsEncodedString(unicode,
3670 fs_codec->encoding,
3671 fs_codec->errors);
3672 }
3673 #endif
3674 else {
3675 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3676 machinery is not ready and so cannot be used:
3677 use wcstombs() in this case. */
3678 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3679 const wchar_t *filesystem_errors = config->filesystem_errors;
3680 assert(filesystem_errors != NULL);
3681 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3682 assert(errors != _Py_ERROR_UNKNOWN);
3683 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3684 return unicode_encode_utf8(unicode, errors, NULL);
3685 #else
3686 return unicode_encode_locale(unicode, errors, 0);
3687 #endif
3688 }
3689 }
3690
3691 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3692 PyUnicode_AsEncodedString(PyObject *unicode,
3693 const char *encoding,
3694 const char *errors)
3695 {
3696 PyObject *v;
3697 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3698
3699 if (!PyUnicode_Check(unicode)) {
3700 PyErr_BadArgument();
3701 return NULL;
3702 }
3703
3704 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3705 return NULL;
3706 }
3707
3708 if (encoding == NULL) {
3709 return _PyUnicode_AsUTF8String(unicode, errors);
3710 }
3711
3712 /* Shortcuts for common default encodings */
3713 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3714 char *lower = buflower;
3715
3716 /* Fast paths */
3717 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3718 lower += 3;
3719 if (*lower == '_') {
3720 /* Match "utf8" and "utf_8" */
3721 lower++;
3722 }
3723
3724 if (lower[0] == '8' && lower[1] == 0) {
3725 return _PyUnicode_AsUTF8String(unicode, errors);
3726 }
3727 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3728 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3729 }
3730 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3731 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3732 }
3733 }
3734 else {
3735 if (strcmp(lower, "ascii") == 0
3736 || strcmp(lower, "us_ascii") == 0) {
3737 return _PyUnicode_AsASCIIString(unicode, errors);
3738 }
3739 #ifdef MS_WINDOWS
3740 else if (strcmp(lower, "mbcs") == 0) {
3741 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3742 }
3743 #endif
3744 else if (strcmp(lower, "latin1") == 0 ||
3745 strcmp(lower, "latin_1") == 0 ||
3746 strcmp(lower, "iso_8859_1") == 0 ||
3747 strcmp(lower, "iso8859_1") == 0) {
3748 return _PyUnicode_AsLatin1String(unicode, errors);
3749 }
3750 }
3751 }
3752
3753 /* Encode via the codec registry */
3754 v = _PyCodec_EncodeText(unicode, encoding, errors);
3755 if (v == NULL)
3756 return NULL;
3757
3758 /* The normal path */
3759 if (PyBytes_Check(v))
3760 return v;
3761
3762 /* If the codec returns a buffer, raise a warning and convert to bytes */
3763 if (PyByteArray_Check(v)) {
3764 int error;
3765 PyObject *b;
3766
3767 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3768 "encoder %s returned bytearray instead of bytes; "
3769 "use codecs.encode() to encode to arbitrary types",
3770 encoding);
3771 if (error) {
3772 Py_DECREF(v);
3773 return NULL;
3774 }
3775
3776 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3777 PyByteArray_GET_SIZE(v));
3778 Py_DECREF(v);
3779 return b;
3780 }
3781
3782 PyErr_Format(PyExc_TypeError,
3783 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3784 "use codecs.encode() to encode to arbitrary types",
3785 encoding,
3786 Py_TYPE(v)->tp_name);
3787 Py_DECREF(v);
3788 return NULL;
3789 }
3790
3791 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3792 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3793 const char *encoding,
3794 const char *errors)
3795 {
3796 PyObject *v;
3797
3798 if (!PyUnicode_Check(unicode)) {
3799 PyErr_BadArgument();
3800 goto onError;
3801 }
3802
3803 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3804 "PyUnicode_AsEncodedUnicode() is deprecated; "
3805 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3806 return NULL;
3807
3808 if (encoding == NULL)
3809 encoding = PyUnicode_GetDefaultEncoding();
3810
3811 /* Encode via the codec registry */
3812 v = PyCodec_Encode(unicode, encoding, errors);
3813 if (v == NULL)
3814 goto onError;
3815 if (!PyUnicode_Check(v)) {
3816 PyErr_Format(PyExc_TypeError,
3817 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3818 "use codecs.encode() to encode to arbitrary types",
3819 encoding,
3820 Py_TYPE(v)->tp_name);
3821 Py_DECREF(v);
3822 goto onError;
3823 }
3824 return v;
3825
3826 onError:
3827 return NULL;
3828 }
3829
3830 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3831 unicode_decode_locale(const char *str, Py_ssize_t len,
3832 _Py_error_handler errors, int current_locale)
3833 {
3834 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3835 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3836 return NULL;
3837 }
3838
3839 wchar_t *wstr;
3840 size_t wlen;
3841 const char *reason;
3842 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3843 current_locale, errors);
3844 if (res != 0) {
3845 if (res == -2) {
3846 PyObject *exc;
3847 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3848 "locale", str, len,
3849 (Py_ssize_t)wlen,
3850 (Py_ssize_t)(wlen + 1),
3851 reason);
3852 if (exc != NULL) {
3853 PyCodec_StrictErrors(exc);
3854 Py_DECREF(exc);
3855 }
3856 }
3857 else if (res == -3) {
3858 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3859 }
3860 else {
3861 PyErr_NoMemory();
3862 }
3863 return NULL;
3864 }
3865
3866 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3867 PyMem_RawFree(wstr);
3868 return unicode;
3869 }
3870
3871 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3872 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3873 const char *errors)
3874 {
3875 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3876 return unicode_decode_locale(str, len, error_handler, 1);
3877 }
3878
3879 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3880 PyUnicode_DecodeLocale(const char *str, const char *errors)
3881 {
3882 Py_ssize_t size = (Py_ssize_t)strlen(str);
3883 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3884 return unicode_decode_locale(str, size, error_handler, 1);
3885 }
3886
3887
3888 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3889 PyUnicode_DecodeFSDefault(const char *s) {
3890 Py_ssize_t size = (Py_ssize_t)strlen(s);
3891 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3892 }
3893
3894 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3895 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3896 {
3897 PyInterpreterState *interp = _PyInterpreterState_GET();
3898 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3899 if (fs_codec->utf8) {
3900 return unicode_decode_utf8(s, size,
3901 fs_codec->error_handler,
3902 fs_codec->errors,
3903 NULL);
3904 }
3905 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3906 else if (fs_codec->encoding) {
3907 return PyUnicode_Decode(s, size,
3908 fs_codec->encoding,
3909 fs_codec->errors);
3910 }
3911 #endif
3912 else {
3913 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3914 machinery is not ready and so cannot be used:
3915 use mbstowcs() in this case. */
3916 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3917 const wchar_t *filesystem_errors = config->filesystem_errors;
3918 assert(filesystem_errors != NULL);
3919 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3920 assert(errors != _Py_ERROR_UNKNOWN);
3921 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3922 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3923 #else
3924 return unicode_decode_locale(s, size, errors, 0);
3925 #endif
3926 }
3927 }
3928
3929
3930 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3931 PyUnicode_FSConverter(PyObject* arg, void* addr)
3932 {
3933 PyObject *path = NULL;
3934 PyObject *output = NULL;
3935 Py_ssize_t size;
3936 const char *data;
3937 if (arg == NULL) {
3938 Py_DECREF(*(PyObject**)addr);
3939 *(PyObject**)addr = NULL;
3940 return 1;
3941 }
3942 path = PyOS_FSPath(arg);
3943 if (path == NULL) {
3944 return 0;
3945 }
3946 if (PyBytes_Check(path)) {
3947 output = path;
3948 }
3949 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3950 output = PyUnicode_EncodeFSDefault(path);
3951 Py_DECREF(path);
3952 if (!output) {
3953 return 0;
3954 }
3955 assert(PyBytes_Check(output));
3956 }
3957
3958 size = PyBytes_GET_SIZE(output);
3959 data = PyBytes_AS_STRING(output);
3960 if ((size_t)size != strlen(data)) {
3961 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3962 Py_DECREF(output);
3963 return 0;
3964 }
3965 *(PyObject**)addr = output;
3966 return Py_CLEANUP_SUPPORTED;
3967 }
3968
3969
3970 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3971 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3972 {
3973 int is_buffer = 0;
3974 PyObject *path = NULL;
3975 PyObject *output = NULL;
3976 if (arg == NULL) {
3977 Py_DECREF(*(PyObject**)addr);
3978 *(PyObject**)addr = NULL;
3979 return 1;
3980 }
3981
3982 is_buffer = PyObject_CheckBuffer(arg);
3983 if (!is_buffer) {
3984 path = PyOS_FSPath(arg);
3985 if (path == NULL) {
3986 return 0;
3987 }
3988 }
3989 else {
3990 path = arg;
3991 Py_INCREF(arg);
3992 }
3993
3994 if (PyUnicode_Check(path)) {
3995 output = path;
3996 }
3997 else if (PyBytes_Check(path) || is_buffer) {
3998 PyObject *path_bytes = NULL;
3999
4000 if (!PyBytes_Check(path) &&
4001 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4002 "path should be string, bytes, or os.PathLike, not %.200s",
4003 Py_TYPE(arg)->tp_name)) {
4004 Py_DECREF(path);
4005 return 0;
4006 }
4007 path_bytes = PyBytes_FromObject(path);
4008 Py_DECREF(path);
4009 if (!path_bytes) {
4010 return 0;
4011 }
4012 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4013 PyBytes_GET_SIZE(path_bytes));
4014 Py_DECREF(path_bytes);
4015 if (!output) {
4016 return 0;
4017 }
4018 }
4019 else {
4020 PyErr_Format(PyExc_TypeError,
4021 "path should be string, bytes, or os.PathLike, not %.200s",
4022 Py_TYPE(arg)->tp_name);
4023 Py_DECREF(path);
4024 return 0;
4025 }
4026 if (PyUnicode_READY(output) == -1) {
4027 Py_DECREF(output);
4028 return 0;
4029 }
4030 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4031 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4032 PyErr_SetString(PyExc_ValueError, "embedded null character");
4033 Py_DECREF(output);
4034 return 0;
4035 }
4036 *(PyObject**)addr = output;
4037 return Py_CLEANUP_SUPPORTED;
4038 }
4039
4040
4041 static int unicode_fill_utf8(PyObject *unicode);
4042
4043 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4044 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4045 {
4046 if (!PyUnicode_Check(unicode)) {
4047 PyErr_BadArgument();
4048 return NULL;
4049 }
4050 if (PyUnicode_READY(unicode) == -1)
4051 return NULL;
4052
4053 if (PyUnicode_UTF8(unicode) == NULL) {
4054 if (unicode_fill_utf8(unicode) == -1) {
4055 return NULL;
4056 }
4057 }
4058
4059 if (psize)
4060 *psize = PyUnicode_UTF8_LENGTH(unicode);
4061 return PyUnicode_UTF8(unicode);
4062 }
4063
4064 const char *
PyUnicode_AsUTF8(PyObject * unicode)4065 PyUnicode_AsUTF8(PyObject *unicode)
4066 {
4067 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4068 }
4069
4070 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4071 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4072 {
4073 if (!PyUnicode_Check(unicode)) {
4074 PyErr_BadArgument();
4075 return NULL;
4076 }
4077 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4078 if (w == NULL) {
4079 /* Non-ASCII compact unicode object */
4080 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4081 assert(PyUnicode_IS_READY(unicode));
4082
4083 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4084 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4085 PyErr_NoMemory();
4086 return NULL;
4087 }
4088 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4089 if (w == NULL) {
4090 PyErr_NoMemory();
4091 return NULL;
4092 }
4093 unicode_copy_as_widechar(unicode, w, wlen + 1);
4094 _PyUnicode_WSTR(unicode) = w;
4095 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4096 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4097 }
4098 }
4099 if (size != NULL)
4100 *size = PyUnicode_WSTR_LENGTH(unicode);
4101 return w;
4102 }
4103
4104 /* Deprecated APIs */
4105
4106 _Py_COMP_DIAG_PUSH
4107 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4108
4109 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4110 PyUnicode_AsUnicode(PyObject *unicode)
4111 {
4112 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4113 }
4114
4115 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4116 _PyUnicode_AsUnicode(PyObject *unicode)
4117 {
4118 Py_ssize_t size;
4119 const Py_UNICODE *wstr;
4120
4121 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4122 if (wstr && wcslen(wstr) != (size_t)size) {
4123 PyErr_SetString(PyExc_ValueError, "embedded null character");
4124 return NULL;
4125 }
4126 return wstr;
4127 }
4128
4129
4130 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4131 PyUnicode_GetSize(PyObject *unicode)
4132 {
4133 if (!PyUnicode_Check(unicode)) {
4134 PyErr_BadArgument();
4135 goto onError;
4136 }
4137 if (_PyUnicode_WSTR(unicode) == NULL) {
4138 if (PyUnicode_AsUnicode(unicode) == NULL)
4139 goto onError;
4140 }
4141 return PyUnicode_WSTR_LENGTH(unicode);
4142
4143 onError:
4144 return -1;
4145 }
4146
4147 _Py_COMP_DIAG_POP
4148
4149 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4150 PyUnicode_GetLength(PyObject *unicode)
4151 {
4152 if (!PyUnicode_Check(unicode)) {
4153 PyErr_BadArgument();
4154 return -1;
4155 }
4156 if (PyUnicode_READY(unicode) == -1)
4157 return -1;
4158 return PyUnicode_GET_LENGTH(unicode);
4159 }
4160
4161 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4162 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4163 {
4164 const void *data;
4165 int kind;
4166
4167 if (!PyUnicode_Check(unicode)) {
4168 PyErr_BadArgument();
4169 return (Py_UCS4)-1;
4170 }
4171 if (PyUnicode_READY(unicode) == -1) {
4172 return (Py_UCS4)-1;
4173 }
4174 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4175 PyErr_SetString(PyExc_IndexError, "string index out of range");
4176 return (Py_UCS4)-1;
4177 }
4178 data = PyUnicode_DATA(unicode);
4179 kind = PyUnicode_KIND(unicode);
4180 return PyUnicode_READ(kind, data, index);
4181 }
4182
4183 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4184 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4185 {
4186 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4187 PyErr_BadArgument();
4188 return -1;
4189 }
4190 assert(PyUnicode_IS_READY(unicode));
4191 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4192 PyErr_SetString(PyExc_IndexError, "string index out of range");
4193 return -1;
4194 }
4195 if (unicode_check_modifiable(unicode))
4196 return -1;
4197 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4198 PyErr_SetString(PyExc_ValueError, "character out of range");
4199 return -1;
4200 }
4201 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4202 index, ch);
4203 return 0;
4204 }
4205
/* Return the name of the default encoding.  The returned string has static
   storage and must not be modified or freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4211
4212 /* create or adjust a UnicodeDecodeError */
4213 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4214 make_decode_exception(PyObject **exceptionObject,
4215 const char *encoding,
4216 const char *input, Py_ssize_t length,
4217 Py_ssize_t startpos, Py_ssize_t endpos,
4218 const char *reason)
4219 {
4220 if (*exceptionObject == NULL) {
4221 *exceptionObject = PyUnicodeDecodeError_Create(
4222 encoding, input, length, startpos, endpos, reason);
4223 }
4224 else {
4225 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4226 goto onError;
4227 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4228 goto onError;
4229 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4230 goto onError;
4231 }
4232 return;
4233
4234 onError:
4235 Py_CLEAR(*exceptionObject);
4236 }
4237
4238 #ifdef MS_WINDOWS
4239 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4240 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4241 {
4242 if (newsize > *size) {
4243 wchar_t *newbuf = *buf;
4244 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4245 PyErr_NoMemory();
4246 return -1;
4247 }
4248 *buf = newbuf;
4249 }
4250 *size = newsize;
4251 return 0;
4252 }
4253
4254 /* error handling callback helper:
4255 build arguments, call the callback and check the arguments,
4256 if no exception occurred, copy the replacement to the output
4257 and adjust various state variables.
4258 return 0 on success, -1 on error
4259 */
4260
4261 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4262 unicode_decode_call_errorhandler_wchar(
4263 const char *errors, PyObject **errorHandler,
4264 const char *encoding, const char *reason,
4265 const char **input, const char **inend, Py_ssize_t *startinpos,
4266 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4267 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4268 {
4269 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4270
4271 PyObject *restuple = NULL;
4272 PyObject *repunicode = NULL;
4273 Py_ssize_t outsize;
4274 Py_ssize_t insize;
4275 Py_ssize_t requiredsize;
4276 Py_ssize_t newpos;
4277 PyObject *inputobj = NULL;
4278 wchar_t *repwstr;
4279 Py_ssize_t repwlen;
4280
4281 if (*errorHandler == NULL) {
4282 *errorHandler = PyCodec_LookupError(errors);
4283 if (*errorHandler == NULL)
4284 goto onError;
4285 }
4286
4287 make_decode_exception(exceptionObject,
4288 encoding,
4289 *input, *inend - *input,
4290 *startinpos, *endinpos,
4291 reason);
4292 if (*exceptionObject == NULL)
4293 goto onError;
4294
4295 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4296 if (restuple == NULL)
4297 goto onError;
4298 if (!PyTuple_Check(restuple)) {
4299 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4300 goto onError;
4301 }
4302 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4303 goto onError;
4304
4305 /* Copy back the bytes variables, which might have been modified by the
4306 callback */
4307 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4308 if (!inputobj)
4309 goto onError;
4310 *input = PyBytes_AS_STRING(inputobj);
4311 insize = PyBytes_GET_SIZE(inputobj);
4312 *inend = *input + insize;
4313 /* we can DECREF safely, as the exception has another reference,
4314 so the object won't go away. */
4315 Py_DECREF(inputobj);
4316
4317 if (newpos<0)
4318 newpos = insize+newpos;
4319 if (newpos<0 || newpos>insize) {
4320 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4321 goto onError;
4322 }
4323
4324 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4325 if (repwstr == NULL)
4326 goto onError;
4327 /* need more space? (at least enough for what we
4328 have+the replacement+the rest of the string (starting
4329 at the new input position), so we won't have to check space
4330 when there are no errors in the rest of the string) */
4331 requiredsize = *outpos;
4332 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4333 goto overflow;
4334 requiredsize += repwlen;
4335 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4336 goto overflow;
4337 requiredsize += insize - newpos;
4338 outsize = *bufsize;
4339 if (requiredsize > outsize) {
4340 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4341 requiredsize = 2*outsize;
4342 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4343 goto onError;
4344 }
4345 }
4346 wcsncpy(*buf + *outpos, repwstr, repwlen);
4347 *outpos += repwlen;
4348 *endinpos = newpos;
4349 *inptr = *input + newpos;
4350
4351 /* we made it! */
4352 Py_DECREF(restuple);
4353 return 0;
4354
4355 overflow:
4356 PyErr_SetString(PyExc_OverflowError,
4357 "decoded result is too long for a Python string");
4358
4359 onError:
4360 Py_XDECREF(restuple);
4361 return -1;
4362 }
4363 #endif /* MS_WINDOWS */
4364
/* Call the decoding error handler and append its replacement text to
   `writer`.

   errors / errorHandler: name of the error handler and a cache slot for the
       callable; the callable is looked up with PyCodec_LookupError() the
       first time *errorHandler is NULL.
   encoding, reason: passed to make_decode_exception() for the exception.
   input, inend: start and end of the bytes being decoded.  Both are
       reloaded from the exception object after the callback ran, because
       the callback may have replaced that object.
   startinpos, endinpos: offsets of the offending bytes within *input.
   exceptionObject: slot handed to make_decode_exception(); the resulting
       exception is the single argument passed to the handler.
   inptr: the decoder's read pointer.
   writer: output writer that receives the replacement string.

   The handler must return a (str, int) tuple: the replacement and the
   input position at which decoding resumes.  A negative position is taken
   relative to the end of the input.

   On success, *endinpos and *inptr are set to the resume position and 0 is
   returned.  On any failure -1 is returned. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" doubles as the
       TypeError message used below (hence &argparse[3]). */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once and cache it in the caller's slot. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is parsed out of restuple and is only used while restuple
       is still referenced by this function. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Bytes that were still undecoded after the error, measured against the
       input as it was before the callback ran. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* Negative positions count back from the end of the (new) input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        /* Only the characters beyond the first enlarge min_length. */
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Tell the caller where to resume decoding. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    /* restuple may still be NULL here, hence XDECREF. */
    Py_XDECREF(restuple);
    return -1;
}
4464
4465 /* --- UTF-7 Codec -------------------------------------------------------- */
4466
4467 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4468
4469 /* Three simple macros defining base-64. */
4470
/* Non-zero when c belongs to the base-64 alphabet: ASCII letters, digits,
   '+' and '/'.  The argument is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
4478
/* Map a base-64 character c to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62).  Every other input, including '/', yields 63, so
   callers are expected to check IS_BASE64(c) first. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4486
/* Base-64 character for the low six bits of n; higher bits are masked
   off before the alphabet lookup. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])
4491
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
4499
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4533
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 for a direct encoding; NUL and non-ASCII values are
 * rejected before utf7_category is indexed.  Set D characters (category 0)
 * are always direct, whitespace (category 2) only when directWS is true,
 * and Set O (category 1) only when directO is true.  The flag arguments
 * are parenthesized so that compound expressions such as `a || b` group
 * as intended.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
4545
/* Decode `size` bytes of UTF-7 data starting at `s`, using the error
   handler named by `errors`.  This simply forwards to
   PyUnicode_DecodeUTF7Stateful() with consumed == NULL and returns its
   result unchanged. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
4553
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 bytes to decode.
 * errors:   error handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: when NULL, input that ends inside a shift sequence in an
 *           inconsistent state is reported as "unterminated shift
 *           sequence".  When non-NULL, no such error is raised;
 *           *consumed receives the number of bytes decoded, which is the
 *           offset of the opening '+' if the input ends inside a shift
 *           sequence.
 *
 * Returns the decoded str object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos when the current section began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0; /* pending base-64 bits, newest at the bottom */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by goto from the end-of-input error path below. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh
                               normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written only when the
                   terminating byte is one that decodes directly. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Shared error path: only reached by goto, never by fall-through
           (the `continue` above skips it). */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have moved s back before e; if so, resume
               decoding inside the loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Report only the bytes before the unfinished shift sequence;
               startinpos was recorded at its opening '+'. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Output was produced inside the unfinished section and the
                   writer holds non-ASCII data: build the result from the
                   first shiftOutStart characters only. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4757
4758
/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL on failure.

   base64SetO, base64WhiteSpace: when non-zero, "Set O" characters and
       whitespace respectively are base-64 encoded instead of being written
       as themselves (each flag is negated before being handed to
       ENCODE_DIRECT).
   errors: not referenced anywhere in this function body.

   The output buffer is allocated up front with 8 bytes per input character
   and shrunk to the bytes actually written at the end. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                /* non-zero while inside a "+...-" section */
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0; /* bits not yet written as base-64 */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* Open a base-64 section and encode ch inside it. */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
        /* Shared tail: append ch to the bit buffer as one UTF-16 unit (two
           for ch >= 0x10000) and flush every complete 6-bit group. */
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush leftover bits, left-aligned within a final base-64 character. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* Close a section that is still open at the end of the input. */
    if (inShift)
        *out++ = '-';
    /* Trim the over-allocated buffer to the bytes actually produced. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
4860 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4861 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4862 Py_ssize_t size,
4863 int base64SetO,
4864 int base64WhiteSpace,
4865 const char *errors)
4866 {
4867 PyObject *result;
4868 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4869 if (tmp == NULL)
4870 return NULL;
4871 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4872 base64WhiteSpace, errors);
4873 Py_DECREF(tmp);
4874 return result;
4875 }
4876
4877 #undef IS_BASE64
4878 #undef FROM_BASE64
4879 #undef TO_BASE64
4880 #undef DECODE_DIRECT
4881 #undef ENCODE_DIRECT
4882
4883 /* --- UTF-8 Codec -------------------------------------------------------- */
4884
/* Decode `size` bytes of UTF-8 data starting at `s`, using the error
   handler named by `errors`.  This simply forwards to
   PyUnicode_DecodeUTF8Stateful() with consumed == NULL and returns its
   result unchanged. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4892
4893 #include "stringlib/asciilib.h"
4894 #include "stringlib/codecs.h"
4895 #include "stringlib/undef.h"
4896
4897 #include "stringlib/ucs1lib.h"
4898 #include "stringlib/codecs.h"
4899 #include "stringlib/undef.h"
4900
4901 #include "stringlib/ucs2lib.h"
4902 #include "stringlib/codecs.h"
4903 #include "stringlib/undef.h"
4904
4905 #include "stringlib/ucs4lib.h"
4906 #include "stringlib/codecs.h"
4907 #include "stringlib/undef.h"
4908
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of `value` is >= 0x80.  Only 4- and 8-byte longs are
   supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4918
/* Copy the leading run of ASCII bytes (values < 0x80) from [start, end)
   into dest and return the length of that run.  Scanning stops at the
   first byte with the high bit set or at `end`.

   Two strategies are compiled in:
   - where SIZEOF_LONG <= SIZEOF_VOID_P (and not on m68k), if `start` is
     long-aligned, whole longs are tested against ASCII_CHAR_MASK and
     copied directly, then the tail is copied byte by byte;
   - otherwise the input is only scanned (a long at a time whenever the
     read pointer is aligned) and the prefix is copied with one memcpy(). */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* Last long-aligned address not beyond `end`; word reads stop here. */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* Copy a long at a time while every byte in it is ASCII. */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Finish byte by byte: the unaligned tail, or the bytes of the
           long in which a non-ASCII byte was detected. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: locate the end of the ASCII prefix first, copy after. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(const unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4982
4983 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)4984 unicode_decode_utf8(const char *s, Py_ssize_t size,
4985 _Py_error_handler error_handler, const char *errors,
4986 Py_ssize_t *consumed)
4987 {
4988 if (size == 0) {
4989 if (consumed)
4990 *consumed = 0;
4991 _Py_RETURN_UNICODE_EMPTY();
4992 }
4993
4994 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4995 if (size == 1 && (unsigned char)s[0] < 128) {
4996 if (consumed)
4997 *consumed = 1;
4998 return get_latin1_char((unsigned char)s[0]);
4999 }
5000
5001 const char *starts = s;
5002 const char *end = s + size;
5003
5004 // fast path: try ASCII string.
5005 PyObject *u = PyUnicode_New(size, 127);
5006 if (u == NULL) {
5007 return NULL;
5008 }
5009 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5010 if (s == end) {
5011 return u;
5012 }
5013
5014 // Use _PyUnicodeWriter after fast path is failed.
5015 _PyUnicodeWriter writer;
5016 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5017 writer.pos = s - starts;
5018
5019 Py_ssize_t startinpos, endinpos;
5020 const char *errmsg = "";
5021 PyObject *error_handler_obj = NULL;
5022 PyObject *exc = NULL;
5023
5024 while (s < end) {
5025 Py_UCS4 ch;
5026 int kind = writer.kind;
5027
5028 if (kind == PyUnicode_1BYTE_KIND) {
5029 if (PyUnicode_IS_ASCII(writer.buffer))
5030 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5031 else
5032 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5033 } else if (kind == PyUnicode_2BYTE_KIND) {
5034 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5035 } else {
5036 assert(kind == PyUnicode_4BYTE_KIND);
5037 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5038 }
5039
5040 switch (ch) {
5041 case 0:
5042 if (s == end || consumed)
5043 goto End;
5044 errmsg = "unexpected end of data";
5045 startinpos = s - starts;
5046 endinpos = end - starts;
5047 break;
5048 case 1:
5049 errmsg = "invalid start byte";
5050 startinpos = s - starts;
5051 endinpos = startinpos + 1;
5052 break;
5053 case 2:
5054 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5055 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5056 {
5057 /* Truncated surrogate code in range D800-DFFF */
5058 goto End;
5059 }
5060 /* fall through */
5061 case 3:
5062 case 4:
5063 errmsg = "invalid continuation byte";
5064 startinpos = s - starts;
5065 endinpos = startinpos + ch - 1;
5066 break;
5067 default:
5068 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5069 goto onError;
5070 continue;
5071 }
5072
5073 if (error_handler == _Py_ERROR_UNKNOWN)
5074 error_handler = _Py_GetErrorHandler(errors);
5075
5076 switch (error_handler) {
5077 case _Py_ERROR_IGNORE:
5078 s += (endinpos - startinpos);
5079 break;
5080
5081 case _Py_ERROR_REPLACE:
5082 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5083 goto onError;
5084 s += (endinpos - startinpos);
5085 break;
5086
5087 case _Py_ERROR_SURROGATEESCAPE:
5088 {
5089 Py_ssize_t i;
5090
5091 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5092 goto onError;
5093 for (i=startinpos; i<endinpos; i++) {
5094 ch = (Py_UCS4)(unsigned char)(starts[i]);
5095 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5096 ch + 0xdc00);
5097 writer.pos++;
5098 }
5099 s += (endinpos - startinpos);
5100 break;
5101 }
5102
5103 default:
5104 if (unicode_decode_call_errorhandler_writer(
5105 errors, &error_handler_obj,
5106 "utf-8", errmsg,
5107 &starts, &end, &startinpos, &endinpos, &exc, &s,
5108 &writer))
5109 goto onError;
5110 }
5111 }
5112
5113 End:
5114 if (consumed)
5115 *consumed = s - starts;
5116
5117 Py_XDECREF(error_handler_obj);
5118 Py_XDECREF(exc);
5119 return _PyUnicodeWriter_Finish(&writer);
5120
5121 onError:
5122 Py_XDECREF(error_handler_obj);
5123 Py_XDECREF(exc);
5124 _PyUnicodeWriter_Dealloc(&writer);
5125 return NULL;
5126 }
5127
5128
/* Decode `size` bytes of UTF-8 data starting at `s`.  This forwards to
   unicode_decode_utf8() with _Py_ERROR_UNKNOWN as the pre-resolved handler,
   so the handler is derived from the `errors` name; `consumed` is passed
   through unchanged. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
}
5137
5138
/* UTF-8 decoder producing a wchar_t string.

   `errors` selects the error handling and must be one of _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS; for any other value
   the function returns -3 without decoding anything.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, or if the buffer size would overflow,
   return -1.

   On decoding error (one that the selected handler does not absorb), return
   -2. If wlen is non-NULL, write the start of the illegal byte sequence into
   *wlen. If reason is not NULL, write the decoding error message into
   *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Translate the handler enum into two flags; unsupported handlers are
       rejected up front. */
    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
        return -1;
    }

    /* One extra slot for the terminating L'\0' written at the end. */
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        /* Values above 0xFF are characters the decoder could not store
           itself; smaller values are treated as status codes below. */
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status 0 with all input consumed: decoding is complete. */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* Map the offending byte b to 0xDC00 + b and move on. */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    /* Decode the three-byte sequence by hand and store the
                       resulting code unit as is. */
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* Unrecoverable error: release the buffer and report
                       the reason and position to the caller. */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
5258
5259
5260 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5261 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5262 size_t *wlen)
5263 {
5264 wchar_t *wstr;
5265 int res = _Py_DecodeUTF8Ex(arg, arglen,
5266 &wstr, wlen,
5267 NULL, _Py_ERROR_SURROGATEESCAPE);
5268 if (res != 0) {
5269 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5270 assert(res != -3);
5271 if (wlen) {
5272 *wlen = (size_t)res;
5273 }
5274 return NULL;
5275 }
5276 return wstr;
5277 }
5278
5279
5280 /* UTF-8 encoder using the surrogateescape error handler .
5281
5282 On success, return 0 and write the newly allocated character string (use
5283 PyMem_Free() to free the memory) into *str.
5284
5285 On encoding failure, return -2 and write the position of the invalid
5286 surrogate character into *error_pos (if error_pos is set) and the decoding
5287 error message into *reason (if reason is set).
5288
5289 On memory allocation failure, return -1. */
5290 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5291 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5292 const char **reason, int raw_malloc, _Py_error_handler errors)
5293 {
5294 const Py_ssize_t max_char_size = 4;
5295 Py_ssize_t len = wcslen(text);
5296
5297 assert(len >= 0);
5298
5299 int surrogateescape = 0;
5300 int surrogatepass = 0;
5301 switch (errors)
5302 {
5303 case _Py_ERROR_STRICT:
5304 break;
5305 case _Py_ERROR_SURROGATEESCAPE:
5306 surrogateescape = 1;
5307 break;
5308 case _Py_ERROR_SURROGATEPASS:
5309 surrogatepass = 1;
5310 break;
5311 default:
5312 return -3;
5313 }
5314
5315 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5316 return -1;
5317 }
5318 char *bytes;
5319 if (raw_malloc) {
5320 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5321 }
5322 else {
5323 bytes = PyMem_Malloc((len + 1) * max_char_size);
5324 }
5325 if (bytes == NULL) {
5326 return -1;
5327 }
5328
5329 char *p = bytes;
5330 Py_ssize_t i;
5331 for (i = 0; i < len; ) {
5332 Py_ssize_t ch_pos = i;
5333 Py_UCS4 ch = text[i];
5334 i++;
5335 #if Py_UNICODE_SIZE == 2
5336 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5337 && i < len
5338 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5339 {
5340 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5341 i++;
5342 }
5343 #endif
5344
5345 if (ch < 0x80) {
5346 /* Encode ASCII */
5347 *p++ = (char) ch;
5348
5349 }
5350 else if (ch < 0x0800) {
5351 /* Encode Latin-1 */
5352 *p++ = (char)(0xc0 | (ch >> 6));
5353 *p++ = (char)(0x80 | (ch & 0x3f));
5354 }
5355 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5356 /* surrogateescape error handler */
5357 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5358 if (error_pos != NULL) {
5359 *error_pos = (size_t)ch_pos;
5360 }
5361 if (reason != NULL) {
5362 *reason = "encoding error";
5363 }
5364 if (raw_malloc) {
5365 PyMem_RawFree(bytes);
5366 }
5367 else {
5368 PyMem_Free(bytes);
5369 }
5370 return -2;
5371 }
5372 *p++ = (char)(ch & 0xff);
5373 }
5374 else if (ch < 0x10000) {
5375 *p++ = (char)(0xe0 | (ch >> 12));
5376 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5377 *p++ = (char)(0x80 | (ch & 0x3f));
5378 }
5379 else { /* ch >= 0x10000 */
5380 assert(ch <= MAX_UNICODE);
5381 /* Encode UCS4 Unicode ordinals */
5382 *p++ = (char)(0xf0 | (ch >> 18));
5383 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5384 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5385 *p++ = (char)(0x80 | (ch & 0x3f));
5386 }
5387 }
5388 *p++ = '\0';
5389
5390 size_t final_size = (p - bytes);
5391 char *bytes2;
5392 if (raw_malloc) {
5393 bytes2 = PyMem_RawRealloc(bytes, final_size);
5394 }
5395 else {
5396 bytes2 = PyMem_Realloc(bytes, final_size);
5397 }
5398 if (bytes2 == NULL) {
5399 if (error_pos != NULL) {
5400 *error_pos = (size_t)-1;
5401 }
5402 if (raw_malloc) {
5403 PyMem_RawFree(bytes);
5404 }
5405 else {
5406 PyMem_Free(bytes);
5407 }
5408 return -1;
5409 }
5410 *str = bytes2;
5411 return 0;
5412 }
5413
5414
5415 /* Primary internal function which creates utf8 encoded bytes objects.
5416
5417 Allocation strategy: if the string is short, convert into a stack buffer
5418 and allocate exactly as much space needed at the end. Else allocate the
5419 maximum possible needed (4 result bytes per Unicode character), and return
5420 the excess memory at the end.
5421 */
5422 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5423 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5424 const char *errors)
5425 {
5426 if (!PyUnicode_Check(unicode)) {
5427 PyErr_BadArgument();
5428 return NULL;
5429 }
5430
5431 if (PyUnicode_READY(unicode) == -1)
5432 return NULL;
5433
5434 if (PyUnicode_UTF8(unicode))
5435 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5436 PyUnicode_UTF8_LENGTH(unicode));
5437
5438 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5439 const void *data = PyUnicode_DATA(unicode);
5440 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5441
5442 _PyBytesWriter writer;
5443 char *end;
5444
5445 switch (kind) {
5446 default:
5447 Py_UNREACHABLE();
5448 case PyUnicode_1BYTE_KIND:
5449 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5450 assert(!PyUnicode_IS_ASCII(unicode));
5451 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5452 break;
5453 case PyUnicode_2BYTE_KIND:
5454 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5455 break;
5456 case PyUnicode_4BYTE_KIND:
5457 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5458 break;
5459 }
5460
5461 if (end == NULL) {
5462 _PyBytesWriter_Dealloc(&writer);
5463 return NULL;
5464 }
5465 return _PyBytesWriter_Finish(&writer, end);
5466 }
5467
5468 static int
unicode_fill_utf8(PyObject * unicode)5469 unicode_fill_utf8(PyObject *unicode)
5470 {
5471 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5472 assert(!PyUnicode_IS_ASCII(unicode));
5473
5474 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5475 const void *data = PyUnicode_DATA(unicode);
5476 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5477
5478 _PyBytesWriter writer;
5479 char *end;
5480
5481 switch (kind) {
5482 default:
5483 Py_UNREACHABLE();
5484 case PyUnicode_1BYTE_KIND:
5485 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5486 _Py_ERROR_STRICT, NULL);
5487 break;
5488 case PyUnicode_2BYTE_KIND:
5489 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5490 _Py_ERROR_STRICT, NULL);
5491 break;
5492 case PyUnicode_4BYTE_KIND:
5493 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5494 _Py_ERROR_STRICT, NULL);
5495 break;
5496 }
5497 if (end == NULL) {
5498 _PyBytesWriter_Dealloc(&writer);
5499 return -1;
5500 }
5501
5502 const char *start = writer.use_small_buffer ? writer.small_buffer :
5503 PyBytes_AS_STRING(writer.buffer);
5504 Py_ssize_t len = end - start;
5505
5506 char *cache = PyObject_MALLOC(len + 1);
5507 if (cache == NULL) {
5508 _PyBytesWriter_Dealloc(&writer);
5509 PyErr_NoMemory();
5510 return -1;
5511 }
5512 _PyUnicode_UTF8(unicode) = cache;
5513 _PyUnicode_UTF8_LENGTH(unicode) = len;
5514 memcpy(cache, start, len);
5515 cache[len] = '\0';
5516 _PyBytesWriter_Dealloc(&writer);
5517 return 0;
5518 }
5519
5520 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5521 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5522 {
5523 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5524 }
5525
5526
5527 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5528 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5529 Py_ssize_t size,
5530 const char *errors)
5531 {
5532 PyObject *v, *unicode;
5533
5534 unicode = PyUnicode_FromWideChar(s, size);
5535 if (unicode == NULL)
5536 return NULL;
5537 v = _PyUnicode_AsUTF8String(unicode, errors);
5538 Py_DECREF(unicode);
5539 return v;
5540 }
5541
5542 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5543 PyUnicode_AsUTF8String(PyObject *unicode)
5544 {
5545 return _PyUnicode_AsUTF8String(unicode, NULL);
5546 }
5547
5548 /* --- UTF-32 Codec ------------------------------------------------------- */
5549
5550 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5551 PyUnicode_DecodeUTF32(const char *s,
5552 Py_ssize_t size,
5553 const char *errors,
5554 int *byteorder)
5555 {
5556 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5557 }
5558
/* Decode `size` bytes of UTF-32 data starting at `s` into a new str object.

   s, size:   input buffer.
   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out parameter.  On input, 0 requests BOM detection,
              and the sign of a non-zero value selects the byte order (see
              the computation of `le` below).  When BOM detection ran
              (input value 0 and at least 4 bytes of input), the resulting
              value (-1, 0 or 1) is written back.
   consumed:  optional out parameter.  When non-NULL, trailing bytes that do
              not form a complete 4-byte unit are not an error: decoding
              stops before them and the number of bytes decoded is stored
              in *consumed.

   Returns the decoded string, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e the end of the input. */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are assembled as a little-endian value, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: the whole input counts as consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 and no BOM found, the platform's native order is used. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* Reserve one output character per (possibly partial) 4-byte unit. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Largest code point the writer's current buffer can store; it is
           re-read on every pass of the outer loop. */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: store code points directly into the buffer for as
               long as they fit (ch <= maxch) and are not surrogates.  On
               leaving the loop early, q still points at the unit that
               produced `ch`. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast path stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* Every complete unit has been stored.  Finish if the input is
               exhausted, or if the caller accepts unconsumed trailing
               bytes. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the current buffer: hand valid code points to
               the writer's own write routine and go round again. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5703
5704 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5705 _PyUnicode_EncodeUTF32(PyObject *str,
5706 const char *errors,
5707 int byteorder)
5708 {
5709 enum PyUnicode_Kind kind;
5710 const void *data;
5711 Py_ssize_t len;
5712 PyObject *v;
5713 uint32_t *out;
5714 #if PY_LITTLE_ENDIAN
5715 int native_ordering = byteorder <= 0;
5716 #else
5717 int native_ordering = byteorder >= 0;
5718 #endif
5719 const char *encoding;
5720 Py_ssize_t nsize, pos;
5721 PyObject *errorHandler = NULL;
5722 PyObject *exc = NULL;
5723 PyObject *rep = NULL;
5724
5725 if (!PyUnicode_Check(str)) {
5726 PyErr_BadArgument();
5727 return NULL;
5728 }
5729 if (PyUnicode_READY(str) == -1)
5730 return NULL;
5731 kind = PyUnicode_KIND(str);
5732 data = PyUnicode_DATA(str);
5733 len = PyUnicode_GET_LENGTH(str);
5734
5735 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5736 return PyErr_NoMemory();
5737 nsize = len + (byteorder == 0);
5738 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5739 if (v == NULL)
5740 return NULL;
5741
5742 /* output buffer is 4-bytes aligned */
5743 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5744 out = (uint32_t *)PyBytes_AS_STRING(v);
5745 if (byteorder == 0)
5746 *out++ = 0xFEFF;
5747 if (len == 0)
5748 goto done;
5749
5750 if (byteorder == -1)
5751 encoding = "utf-32-le";
5752 else if (byteorder == 1)
5753 encoding = "utf-32-be";
5754 else
5755 encoding = "utf-32";
5756
5757 if (kind == PyUnicode_1BYTE_KIND) {
5758 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5759 goto done;
5760 }
5761
5762 pos = 0;
5763 while (pos < len) {
5764 Py_ssize_t repsize, moreunits;
5765
5766 if (kind == PyUnicode_2BYTE_KIND) {
5767 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5768 &out, native_ordering);
5769 }
5770 else {
5771 assert(kind == PyUnicode_4BYTE_KIND);
5772 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5773 &out, native_ordering);
5774 }
5775 if (pos == len)
5776 break;
5777
5778 rep = unicode_encode_call_errorhandler(
5779 errors, &errorHandler,
5780 encoding, "surrogates not allowed",
5781 str, &exc, pos, pos + 1, &pos);
5782 if (!rep)
5783 goto error;
5784
5785 if (PyBytes_Check(rep)) {
5786 repsize = PyBytes_GET_SIZE(rep);
5787 if (repsize & 3) {
5788 raise_encode_exception(&exc, encoding,
5789 str, pos - 1, pos,
5790 "surrogates not allowed");
5791 goto error;
5792 }
5793 moreunits = repsize / 4;
5794 }
5795 else {
5796 assert(PyUnicode_Check(rep));
5797 if (PyUnicode_READY(rep) < 0)
5798 goto error;
5799 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5800 if (!PyUnicode_IS_ASCII(rep)) {
5801 raise_encode_exception(&exc, encoding,
5802 str, pos - 1, pos,
5803 "surrogates not allowed");
5804 goto error;
5805 }
5806 }
5807
5808 /* four bytes are reserved for each surrogate */
5809 if (moreunits > 1) {
5810 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5811 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5812 /* integer overflow */
5813 PyErr_NoMemory();
5814 goto error;
5815 }
5816 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5817 goto error;
5818 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5819 }
5820
5821 if (PyBytes_Check(rep)) {
5822 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5823 out += moreunits;
5824 } else /* rep is unicode */ {
5825 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5826 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5827 &out, native_ordering);
5828 }
5829
5830 Py_CLEAR(rep);
5831 }
5832
5833 /* Cut back to size actually needed. This is necessary for, for example,
5834 encoding of a string containing isolated surrogates and the 'ignore'
5835 handler is used. */
5836 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5837 if (nsize != PyBytes_GET_SIZE(v))
5838 _PyBytes_Resize(&v, nsize);
5839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
5841 done:
5842 return v;
5843 error:
5844 Py_XDECREF(rep);
5845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
5847 Py_XDECREF(v);
5848 return NULL;
5849 }
5850
5851 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5852 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5853 Py_ssize_t size,
5854 const char *errors,
5855 int byteorder)
5856 {
5857 PyObject *result;
5858 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5859 if (tmp == NULL)
5860 return NULL;
5861 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5862 Py_DECREF(tmp);
5863 return result;
5864 }
5865
5866 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5867 PyUnicode_AsUTF32String(PyObject *unicode)
5868 {
5869 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5870 }
5871
5872 /* --- UTF-16 Codec ------------------------------------------------------- */
5873
5874 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5875 PyUnicode_DecodeUTF16(const char *s,
5876 Py_ssize_t size,
5877 const char *errors,
5878 int *byteorder)
5879 {
5880 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5881 }
5882
/* Decode `size` bytes of UTF-16 data starting at `s` into a new str object.

   s, size:   input buffer.
   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out parameter.  On input, 0 requests BOM detection,
              and the sign of a non-zero value selects the byte order.  When
              BOM detection ran (input value 0 and at least 2 bytes of
              input), the resulting value (-1, 0 or 1) is written back.
   consumed:  optional out parameter.  When non-NULL, incomplete data at the
              end of the input is not an error: decoding stops before it and
              the number of bytes decoded is stored in *consumed.

   Returns the decoded string, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    /* q is the read cursor, e the end of the input. */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are assembled as a little-endian value, so a
           big-endian BOM shows up byte-swapped as 0xFFFE. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: the whole input counts as consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 and no BOM found, the platform's native order is used. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Run the decoder matching the writer's current storage kind.
               It advances q and writer.pos; its return value is interpreted
               by the switch below: 0-3 are status codes, anything else is a
               code point that is written through the writer. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step back over the last 2-byte unit so that it is left
               unconsumed (stateful mode) or included in the reported
               error range. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The offending unit is the 2 bytes just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The offending unit starts 4 bytes before q; only that one
               unit is reported. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6037
6038 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6039 _PyUnicode_EncodeUTF16(PyObject *str,
6040 const char *errors,
6041 int byteorder)
6042 {
6043 enum PyUnicode_Kind kind;
6044 const void *data;
6045 Py_ssize_t len;
6046 PyObject *v;
6047 unsigned short *out;
6048 Py_ssize_t pairs;
6049 #if PY_BIG_ENDIAN
6050 int native_ordering = byteorder >= 0;
6051 #else
6052 int native_ordering = byteorder <= 0;
6053 #endif
6054 const char *encoding;
6055 Py_ssize_t nsize, pos;
6056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
6058 PyObject *rep = NULL;
6059
6060 if (!PyUnicode_Check(str)) {
6061 PyErr_BadArgument();
6062 return NULL;
6063 }
6064 if (PyUnicode_READY(str) == -1)
6065 return NULL;
6066 kind = PyUnicode_KIND(str);
6067 data = PyUnicode_DATA(str);
6068 len = PyUnicode_GET_LENGTH(str);
6069
6070 pairs = 0;
6071 if (kind == PyUnicode_4BYTE_KIND) {
6072 const Py_UCS4 *in = (const Py_UCS4 *)data;
6073 const Py_UCS4 *end = in + len;
6074 while (in < end) {
6075 if (*in++ >= 0x10000) {
6076 pairs++;
6077 }
6078 }
6079 }
6080 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6081 return PyErr_NoMemory();
6082 }
6083 nsize = len + pairs + (byteorder == 0);
6084 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6085 if (v == NULL) {
6086 return NULL;
6087 }
6088
6089 /* output buffer is 2-bytes aligned */
6090 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6091 out = (unsigned short *)PyBytes_AS_STRING(v);
6092 if (byteorder == 0) {
6093 *out++ = 0xFEFF;
6094 }
6095 if (len == 0) {
6096 goto done;
6097 }
6098
6099 if (kind == PyUnicode_1BYTE_KIND) {
6100 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6101 goto done;
6102 }
6103
6104 if (byteorder < 0) {
6105 encoding = "utf-16-le";
6106 }
6107 else if (byteorder > 0) {
6108 encoding = "utf-16-be";
6109 }
6110 else {
6111 encoding = "utf-16";
6112 }
6113
6114 pos = 0;
6115 while (pos < len) {
6116 Py_ssize_t repsize, moreunits;
6117
6118 if (kind == PyUnicode_2BYTE_KIND) {
6119 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6120 &out, native_ordering);
6121 }
6122 else {
6123 assert(kind == PyUnicode_4BYTE_KIND);
6124 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6125 &out, native_ordering);
6126 }
6127 if (pos == len)
6128 break;
6129
6130 rep = unicode_encode_call_errorhandler(
6131 errors, &errorHandler,
6132 encoding, "surrogates not allowed",
6133 str, &exc, pos, pos + 1, &pos);
6134 if (!rep)
6135 goto error;
6136
6137 if (PyBytes_Check(rep)) {
6138 repsize = PyBytes_GET_SIZE(rep);
6139 if (repsize & 1) {
6140 raise_encode_exception(&exc, encoding,
6141 str, pos - 1, pos,
6142 "surrogates not allowed");
6143 goto error;
6144 }
6145 moreunits = repsize / 2;
6146 }
6147 else {
6148 assert(PyUnicode_Check(rep));
6149 if (PyUnicode_READY(rep) < 0)
6150 goto error;
6151 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6152 if (!PyUnicode_IS_ASCII(rep)) {
6153 raise_encode_exception(&exc, encoding,
6154 str, pos - 1, pos,
6155 "surrogates not allowed");
6156 goto error;
6157 }
6158 }
6159
6160 /* two bytes are reserved for each surrogate */
6161 if (moreunits > 1) {
6162 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6163 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6164 /* integer overflow */
6165 PyErr_NoMemory();
6166 goto error;
6167 }
6168 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6169 goto error;
6170 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6171 }
6172
6173 if (PyBytes_Check(rep)) {
6174 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6175 out += moreunits;
6176 } else /* rep is unicode */ {
6177 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6178 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6179 &out, native_ordering);
6180 }
6181
6182 Py_CLEAR(rep);
6183 }
6184
6185 /* Cut back to size actually needed. This is necessary for, for example,
6186 encoding of a string containing isolated surrogates and the 'ignore' handler
6187 is used. */
6188 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6189 if (nsize != PyBytes_GET_SIZE(v))
6190 _PyBytes_Resize(&v, nsize);
6191 Py_XDECREF(errorHandler);
6192 Py_XDECREF(exc);
6193 done:
6194 return v;
6195 error:
6196 Py_XDECREF(rep);
6197 Py_XDECREF(errorHandler);
6198 Py_XDECREF(exc);
6199 Py_XDECREF(v);
6200 return NULL;
6201 #undef STORECHAR
6202 }
6203
6204 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6205 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6206 Py_ssize_t size,
6207 const char *errors,
6208 int byteorder)
6209 {
6210 PyObject *result;
6211 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6212 if (tmp == NULL)
6213 return NULL;
6214 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6215 Py_DECREF(tmp);
6216 return result;
6217 }
6218
6219 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6220 PyUnicode_AsUTF16String(PyObject *unicode)
6221 {
6222 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6223 }
6224
6225 /* --- Unicode Escape Codec ----------------------------------------------- */
6226
/* Name-lookup API of the unicodedata module.  It starts out NULL and is
   filled in the first time _PyUnicode_DecodeUnicodeEscape() meets a \N
   escape. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;

/* Decode `size` bytes of "unicode-escape" encoded data at `s`.

   s, size:              input buffer.
   errors:               error handler name, forwarded to
                         unicode_decode_call_errorhandler_writer().
   first_invalid_escape: required out parameter (it is written
                         unconditionally).  Set to NULL, or to the character
                         following the backslash of the first unrecognized
                         escape sequence; such a sequence is copied to the
                         output unchanged.

   Returns the decoded string, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly into the writer's buffer; the asserts
   document that the space is already there. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it fits the writer's current maxchar,
   otherwise through _PyUnicodeWriter_WriteCharInline(). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Position of the backslash, used as the start of an error range. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* \x escapes */
            /* backslash followed by a newline produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
            /* one to three octal digits; the first one is already in c */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits; stopping early (end of
               input or a non-hex character) leaves count non-zero. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* An empty name or a missing closing brace keeps the
                   "malformed" message. */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognized escape: remember the first one and copy the
               backslash and the character through unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Report the range from the backslash up to the current position
           and let the error handler decide how to proceed. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6453
6454 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6455 PyUnicode_DecodeUnicodeEscape(const char *s,
6456 Py_ssize_t size,
6457 const char *errors)
6458 {
6459 const char *first_invalid_escape;
6460 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6461 &first_invalid_escape);
6462 if (result == NULL)
6463 return NULL;
6464 if (first_invalid_escape != NULL) {
6465 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6466 "invalid escape sequence '\\%c'",
6467 (unsigned char)*first_invalid_escape) < 0) {
6468 Py_DECREF(result);
6469 return NULL;
6470 }
6471 }
6472 return result;
6473 }
6474
6475 /* Return a Unicode-Escape string version of the Unicode object. */
6476
6477 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6478 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6479 {
6480 Py_ssize_t i, len;
6481 PyObject *repr;
6482 char *p;
6483 enum PyUnicode_Kind kind;
6484 const void *data;
6485 Py_ssize_t expandsize;
6486
6487 /* Initial allocation is based on the longest-possible character
6488 escape.
6489
6490 For UCS1 strings it's '\xxx', 4 bytes per source character.
6491 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6492 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6493 */
6494
6495 if (!PyUnicode_Check(unicode)) {
6496 PyErr_BadArgument();
6497 return NULL;
6498 }
6499 if (PyUnicode_READY(unicode) == -1) {
6500 return NULL;
6501 }
6502
6503 len = PyUnicode_GET_LENGTH(unicode);
6504 if (len == 0) {
6505 return PyBytes_FromStringAndSize(NULL, 0);
6506 }
6507
6508 kind = PyUnicode_KIND(unicode);
6509 data = PyUnicode_DATA(unicode);
6510 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6511 bytes, and 1 byte characters 4. */
6512 expandsize = kind * 2 + 2;
6513 if (len > PY_SSIZE_T_MAX / expandsize) {
6514 return PyErr_NoMemory();
6515 }
6516 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6517 if (repr == NULL) {
6518 return NULL;
6519 }
6520
6521 p = PyBytes_AS_STRING(repr);
6522 for (i = 0; i < len; i++) {
6523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6524
6525 /* U+0000-U+00ff range */
6526 if (ch < 0x100) {
6527 if (ch >= ' ' && ch < 127) {
6528 if (ch != '\\') {
6529 /* Copy printable US ASCII as-is */
6530 *p++ = (char) ch;
6531 }
6532 /* Escape backslashes */
6533 else {
6534 *p++ = '\\';
6535 *p++ = '\\';
6536 }
6537 }
6538
6539 /* Map special whitespace to '\t', \n', '\r' */
6540 else if (ch == '\t') {
6541 *p++ = '\\';
6542 *p++ = 't';
6543 }
6544 else if (ch == '\n') {
6545 *p++ = '\\';
6546 *p++ = 'n';
6547 }
6548 else if (ch == '\r') {
6549 *p++ = '\\';
6550 *p++ = 'r';
6551 }
6552
6553 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6554 else {
6555 *p++ = '\\';
6556 *p++ = 'x';
6557 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6558 *p++ = Py_hexdigits[ch & 0x000F];
6559 }
6560 }
6561 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6562 else if (ch < 0x10000) {
6563 *p++ = '\\';
6564 *p++ = 'u';
6565 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6566 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6567 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6568 *p++ = Py_hexdigits[ch & 0x000F];
6569 }
6570 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6571 else {
6572
6573 /* Make sure that the first two digits are zero */
6574 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6575 *p++ = '\\';
6576 *p++ = 'U';
6577 *p++ = '0';
6578 *p++ = '0';
6579 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6580 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6581 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6582 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6583 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6584 *p++ = Py_hexdigits[ch & 0x0000000F];
6585 }
6586 }
6587
6588 assert(p - PyBytes_AS_STRING(repr) > 0);
6589 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6590 return NULL;
6591 }
6592 return repr;
6593 }
6594
6595 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6596 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6597 Py_ssize_t size)
6598 {
6599 PyObject *result;
6600 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6601 if (tmp == NULL) {
6602 return NULL;
6603 }
6604
6605 result = PyUnicode_AsUnicodeEscapeString(tmp);
6606 Py_DECREF(tmp);
6607 return result;
6608 }
6609
6610 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6611
6612 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6613 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6614 Py_ssize_t size,
6615 const char *errors)
6616 {
6617 const char *starts = s;
6618 _PyUnicodeWriter writer;
6619 const char *end;
6620 PyObject *errorHandler = NULL;
6621 PyObject *exc = NULL;
6622
6623 if (size == 0) {
6624 _Py_RETURN_UNICODE_EMPTY();
6625 }
6626
6627 /* Escaped strings will always be longer than the resulting
6628 Unicode string, so we start with size here and then reduce the
6629 length after conversion to the true value. (But decoding error
6630 handler might have to resize the string) */
6631 _PyUnicodeWriter_Init(&writer);
6632 writer.min_length = size;
6633 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6634 goto onError;
6635 }
6636
6637 end = s + size;
6638 while (s < end) {
6639 unsigned char c = (unsigned char) *s++;
6640 Py_UCS4 ch;
6641 int count;
6642 Py_ssize_t startinpos;
6643 Py_ssize_t endinpos;
6644 const char *message;
6645
6646 #define WRITE_CHAR(ch) \
6647 do { \
6648 if (ch <= writer.maxchar) { \
6649 assert(writer.pos < writer.size); \
6650 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6651 } \
6652 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6653 goto onError; \
6654 } \
6655 } while(0)
6656
6657 /* Non-escape characters are interpreted as Unicode ordinals */
6658 if (c != '\\' || s >= end) {
6659 WRITE_CHAR(c);
6660 continue;
6661 }
6662
6663 c = (unsigned char) *s++;
6664 if (c == 'u') {
6665 count = 4;
6666 message = "truncated \\uXXXX escape";
6667 }
6668 else if (c == 'U') {
6669 count = 8;
6670 message = "truncated \\UXXXXXXXX escape";
6671 }
6672 else {
6673 assert(writer.pos < writer.size);
6674 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6675 WRITE_CHAR(c);
6676 continue;
6677 }
6678 startinpos = s - starts - 2;
6679
6680 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6681 for (ch = 0; count && s < end; ++s, --count) {
6682 c = (unsigned char)*s;
6683 ch <<= 4;
6684 if (c >= '0' && c <= '9') {
6685 ch += c - '0';
6686 }
6687 else if (c >= 'a' && c <= 'f') {
6688 ch += c - ('a' - 10);
6689 }
6690 else if (c >= 'A' && c <= 'F') {
6691 ch += c - ('A' - 10);
6692 }
6693 else {
6694 break;
6695 }
6696 }
6697 if (!count) {
6698 if (ch <= MAX_UNICODE) {
6699 WRITE_CHAR(ch);
6700 continue;
6701 }
6702 message = "\\Uxxxxxxxx out of range";
6703 }
6704
6705 endinpos = s-starts;
6706 writer.min_length = end - s + writer.pos;
6707 if (unicode_decode_call_errorhandler_writer(
6708 errors, &errorHandler,
6709 "rawunicodeescape", message,
6710 &starts, &end, &startinpos, &endinpos, &exc, &s,
6711 &writer)) {
6712 goto onError;
6713 }
6714 assert(end - s <= writer.size - writer.pos);
6715
6716 #undef WRITE_CHAR
6717 }
6718 Py_XDECREF(errorHandler);
6719 Py_XDECREF(exc);
6720 return _PyUnicodeWriter_Finish(&writer);
6721
6722 onError:
6723 _PyUnicodeWriter_Dealloc(&writer);
6724 Py_XDECREF(errorHandler);
6725 Py_XDECREF(exc);
6726 return NULL;
6727
6728 }
6729
6730
6731 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6732 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6733 {
6734 PyObject *repr;
6735 char *p;
6736 Py_ssize_t expandsize, pos;
6737 int kind;
6738 const void *data;
6739 Py_ssize_t len;
6740
6741 if (!PyUnicode_Check(unicode)) {
6742 PyErr_BadArgument();
6743 return NULL;
6744 }
6745 if (PyUnicode_READY(unicode) == -1) {
6746 return NULL;
6747 }
6748 kind = PyUnicode_KIND(unicode);
6749 data = PyUnicode_DATA(unicode);
6750 len = PyUnicode_GET_LENGTH(unicode);
6751 if (kind == PyUnicode_1BYTE_KIND) {
6752 return PyBytes_FromStringAndSize(data, len);
6753 }
6754
6755 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6756 bytes, and 1 byte characters 4. */
6757 expandsize = kind * 2 + 2;
6758
6759 if (len > PY_SSIZE_T_MAX / expandsize) {
6760 return PyErr_NoMemory();
6761 }
6762 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6763 if (repr == NULL) {
6764 return NULL;
6765 }
6766 if (len == 0) {
6767 return repr;
6768 }
6769
6770 p = PyBytes_AS_STRING(repr);
6771 for (pos = 0; pos < len; pos++) {
6772 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6773
6774 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6775 if (ch < 0x100) {
6776 *p++ = (char) ch;
6777 }
6778 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6779 else if (ch < 0x10000) {
6780 *p++ = '\\';
6781 *p++ = 'u';
6782 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6783 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6784 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6785 *p++ = Py_hexdigits[ch & 15];
6786 }
6787 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6788 else {
6789 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6790 *p++ = '\\';
6791 *p++ = 'U';
6792 *p++ = '0';
6793 *p++ = '0';
6794 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6795 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6796 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6797 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6798 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6799 *p++ = Py_hexdigits[ch & 15];
6800 }
6801 }
6802
6803 assert(p > PyBytes_AS_STRING(repr));
6804 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6805 return NULL;
6806 }
6807 return repr;
6808 }
6809
6810 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6811 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6812 Py_ssize_t size)
6813 {
6814 PyObject *result;
6815 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6816 if (tmp == NULL)
6817 return NULL;
6818 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6819 Py_DECREF(tmp);
6820 return result;
6821 }
6822
6823 /* --- Latin-1 Codec ------------------------------------------------------ */
6824
6825 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6826 PyUnicode_DecodeLatin1(const char *s,
6827 Py_ssize_t size,
6828 const char *errors)
6829 {
6830 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6831 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6832 }
6833
6834 /* create or adjust a UnicodeEncodeError */
6835 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6836 make_encode_exception(PyObject **exceptionObject,
6837 const char *encoding,
6838 PyObject *unicode,
6839 Py_ssize_t startpos, Py_ssize_t endpos,
6840 const char *reason)
6841 {
6842 if (*exceptionObject == NULL) {
6843 *exceptionObject = PyObject_CallFunction(
6844 PyExc_UnicodeEncodeError, "sOnns",
6845 encoding, unicode, startpos, endpos, reason);
6846 }
6847 else {
6848 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6849 goto onError;
6850 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6851 goto onError;
6852 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6853 goto onError;
6854 return;
6855 onError:
6856 Py_CLEAR(*exceptionObject);
6857 }
6858 }
6859
6860 /* raises a UnicodeEncodeError */
6861 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6862 raise_encode_exception(PyObject **exceptionObject,
6863 const char *encoding,
6864 PyObject *unicode,
6865 Py_ssize_t startpos, Py_ssize_t endpos,
6866 const char *reason)
6867 {
6868 make_encode_exception(exceptionObject,
6869 encoding, unicode, startpos, endpos, reason);
6870 if (*exceptionObject != NULL)
6871 PyCodec_StrictErrors(*exceptionObject);
6872 }
6873
6874 /* error handling callback helper:
6875 build arguments, call the callback and check the arguments,
6876 put the result into newpos and return the replacement string, which
6877 has to be freed by the caller */
6878 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6879 unicode_encode_call_errorhandler(const char *errors,
6880 PyObject **errorHandler,
6881 const char *encoding, const char *reason,
6882 PyObject *unicode, PyObject **exceptionObject,
6883 Py_ssize_t startpos, Py_ssize_t endpos,
6884 Py_ssize_t *newpos)
6885 {
6886 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6887 Py_ssize_t len;
6888 PyObject *restuple;
6889 PyObject *resunicode;
6890
6891 if (*errorHandler == NULL) {
6892 *errorHandler = PyCodec_LookupError(errors);
6893 if (*errorHandler == NULL)
6894 return NULL;
6895 }
6896
6897 if (PyUnicode_READY(unicode) == -1)
6898 return NULL;
6899 len = PyUnicode_GET_LENGTH(unicode);
6900
6901 make_encode_exception(exceptionObject,
6902 encoding, unicode, startpos, endpos, reason);
6903 if (*exceptionObject == NULL)
6904 return NULL;
6905
6906 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
6907 if (restuple == NULL)
6908 return NULL;
6909 if (!PyTuple_Check(restuple)) {
6910 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6911 Py_DECREF(restuple);
6912 return NULL;
6913 }
6914 if (!PyArg_ParseTuple(restuple, argparse,
6915 &resunicode, newpos)) {
6916 Py_DECREF(restuple);
6917 return NULL;
6918 }
6919 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6920 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6921 Py_DECREF(restuple);
6922 return NULL;
6923 }
6924 if (*newpos<0)
6925 *newpos = len + *newpos;
6926 if (*newpos<0 || *newpos>len) {
6927 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6928 Py_DECREF(restuple);
6929 return NULL;
6930 }
6931 Py_INCREF(resunicode);
6932 Py_DECREF(restuple);
6933 return resunicode;
6934 }
6935
6936 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6937 unicode_encode_ucs1(PyObject *unicode,
6938 const char *errors,
6939 const Py_UCS4 limit)
6940 {
6941 /* input state */
6942 Py_ssize_t pos=0, size;
6943 int kind;
6944 const void *data;
6945 /* pointer into the output */
6946 char *str;
6947 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6948 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6949 PyObject *error_handler_obj = NULL;
6950 PyObject *exc = NULL;
6951 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6952 PyObject *rep = NULL;
6953 /* output object */
6954 _PyBytesWriter writer;
6955
6956 if (PyUnicode_READY(unicode) == -1)
6957 return NULL;
6958 size = PyUnicode_GET_LENGTH(unicode);
6959 kind = PyUnicode_KIND(unicode);
6960 data = PyUnicode_DATA(unicode);
6961 /* allocate enough for a simple encoding without
6962 replacements, if we need more, we'll resize */
6963 if (size == 0)
6964 return PyBytes_FromStringAndSize(NULL, 0);
6965
6966 _PyBytesWriter_Init(&writer);
6967 str = _PyBytesWriter_Alloc(&writer, size);
6968 if (str == NULL)
6969 return NULL;
6970
6971 while (pos < size) {
6972 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6973
6974 /* can we encode this? */
6975 if (ch < limit) {
6976 /* no overflow check, because we know that the space is enough */
6977 *str++ = (char)ch;
6978 ++pos;
6979 }
6980 else {
6981 Py_ssize_t newpos, i;
6982 /* startpos for collecting unencodable chars */
6983 Py_ssize_t collstart = pos;
6984 Py_ssize_t collend = collstart + 1;
6985 /* find all unecodable characters */
6986
6987 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6988 ++collend;
6989
6990 /* Only overallocate the buffer if it's not the last write */
6991 writer.overallocate = (collend < size);
6992
6993 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6994 if (error_handler == _Py_ERROR_UNKNOWN)
6995 error_handler = _Py_GetErrorHandler(errors);
6996
6997 switch (error_handler) {
6998 case _Py_ERROR_STRICT:
6999 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7000 goto onError;
7001
7002 case _Py_ERROR_REPLACE:
7003 memset(str, '?', collend - collstart);
7004 str += (collend - collstart);
7005 /* fall through */
7006 case _Py_ERROR_IGNORE:
7007 pos = collend;
7008 break;
7009
7010 case _Py_ERROR_BACKSLASHREPLACE:
7011 /* subtract preallocated bytes */
7012 writer.min_size -= (collend - collstart);
7013 str = backslashreplace(&writer, str,
7014 unicode, collstart, collend);
7015 if (str == NULL)
7016 goto onError;
7017 pos = collend;
7018 break;
7019
7020 case _Py_ERROR_XMLCHARREFREPLACE:
7021 /* subtract preallocated bytes */
7022 writer.min_size -= (collend - collstart);
7023 str = xmlcharrefreplace(&writer, str,
7024 unicode, collstart, collend);
7025 if (str == NULL)
7026 goto onError;
7027 pos = collend;
7028 break;
7029
7030 case _Py_ERROR_SURROGATEESCAPE:
7031 for (i = collstart; i < collend; ++i) {
7032 ch = PyUnicode_READ(kind, data, i);
7033 if (ch < 0xdc80 || 0xdcff < ch) {
7034 /* Not a UTF-8b surrogate */
7035 break;
7036 }
7037 *str++ = (char)(ch - 0xdc00);
7038 ++pos;
7039 }
7040 if (i >= collend)
7041 break;
7042 collstart = pos;
7043 assert(collstart != collend);
7044 /* fall through */
7045
7046 default:
7047 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7048 encoding, reason, unicode, &exc,
7049 collstart, collend, &newpos);
7050 if (rep == NULL)
7051 goto onError;
7052
7053 /* subtract preallocated bytes */
7054 writer.min_size -= newpos - collstart;
7055
7056 if (PyBytes_Check(rep)) {
7057 /* Directly copy bytes result to output. */
7058 str = _PyBytesWriter_WriteBytes(&writer, str,
7059 PyBytes_AS_STRING(rep),
7060 PyBytes_GET_SIZE(rep));
7061 }
7062 else {
7063 assert(PyUnicode_Check(rep));
7064
7065 if (PyUnicode_READY(rep) < 0)
7066 goto onError;
7067
7068 if (limit == 256 ?
7069 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7070 !PyUnicode_IS_ASCII(rep))
7071 {
7072 /* Not all characters are smaller than limit */
7073 raise_encode_exception(&exc, encoding, unicode,
7074 collstart, collend, reason);
7075 goto onError;
7076 }
7077 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7078 str = _PyBytesWriter_WriteBytes(&writer, str,
7079 PyUnicode_DATA(rep),
7080 PyUnicode_GET_LENGTH(rep));
7081 }
7082 if (str == NULL)
7083 goto onError;
7084
7085 pos = newpos;
7086 Py_CLEAR(rep);
7087 }
7088
7089 /* If overallocation was disabled, ensure that it was the last
7090 write. Otherwise, we missed an optimization */
7091 assert(writer.overallocate || pos == size);
7092 }
7093 }
7094
7095 Py_XDECREF(error_handler_obj);
7096 Py_XDECREF(exc);
7097 return _PyBytesWriter_Finish(&writer, str);
7098
7099 onError:
7100 Py_XDECREF(rep);
7101 _PyBytesWriter_Dealloc(&writer);
7102 Py_XDECREF(error_handler_obj);
7103 Py_XDECREF(exc);
7104 return NULL;
7105 }
7106
7107 /* Deprecated */
7108 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7109 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7110 Py_ssize_t size,
7111 const char *errors)
7112 {
7113 PyObject *result;
7114 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7115 if (unicode == NULL)
7116 return NULL;
7117 result = unicode_encode_ucs1(unicode, errors, 256);
7118 Py_DECREF(unicode);
7119 return result;
7120 }
7121
7122 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7123 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7124 {
7125 if (!PyUnicode_Check(unicode)) {
7126 PyErr_BadArgument();
7127 return NULL;
7128 }
7129 if (PyUnicode_READY(unicode) == -1)
7130 return NULL;
7131 /* Fast path: if it is a one-byte string, construct
7132 bytes object directly. */
7133 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7134 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7135 PyUnicode_GET_LENGTH(unicode));
7136 /* Non-Latin-1 characters present. Defer to above function to
7137 raise the exception. */
7138 return unicode_encode_ucs1(unicode, errors, 256);
7139 }
7140
7141 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7142 PyUnicode_AsLatin1String(PyObject *unicode)
7143 {
7144 return _PyUnicode_AsLatin1String(unicode, NULL);
7145 }
7146
7147 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7148
7149 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7150 PyUnicode_DecodeASCII(const char *s,
7151 Py_ssize_t size,
7152 const char *errors)
7153 {
7154 const char *starts = s;
7155 const char *e = s + size;
7156 PyObject *error_handler_obj = NULL;
7157 PyObject *exc = NULL;
7158 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7159
7160 if (size == 0)
7161 _Py_RETURN_UNICODE_EMPTY();
7162
7163 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7164 if (size == 1 && (unsigned char)s[0] < 128)
7165 return get_latin1_char((unsigned char)s[0]);
7166
7167 // Shortcut for simple case
7168 PyObject *u = PyUnicode_New(size, 127);
7169 if (u == NULL) {
7170 return NULL;
7171 }
7172 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7173 if (outpos == size) {
7174 return u;
7175 }
7176
7177 _PyUnicodeWriter writer;
7178 _PyUnicodeWriter_InitWithBuffer(&writer, u);
7179 writer.pos = outpos;
7180
7181 s += outpos;
7182 int kind = writer.kind;
7183 void *data = writer.data;
7184 Py_ssize_t startinpos, endinpos;
7185
7186 while (s < e) {
7187 unsigned char c = (unsigned char)*s;
7188 if (c < 128) {
7189 PyUnicode_WRITE(kind, data, writer.pos, c);
7190 writer.pos++;
7191 ++s;
7192 continue;
7193 }
7194
7195 /* byte outsize range 0x00..0x7f: call the error handler */
7196
7197 if (error_handler == _Py_ERROR_UNKNOWN)
7198 error_handler = _Py_GetErrorHandler(errors);
7199
7200 switch (error_handler)
7201 {
7202 case _Py_ERROR_REPLACE:
7203 case _Py_ERROR_SURROGATEESCAPE:
7204 /* Fast-path: the error handler only writes one character,
7205 but we may switch to UCS2 at the first write */
7206 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7207 goto onError;
7208 kind = writer.kind;
7209 data = writer.data;
7210
7211 if (error_handler == _Py_ERROR_REPLACE)
7212 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7213 else
7214 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7215 writer.pos++;
7216 ++s;
7217 break;
7218
7219 case _Py_ERROR_IGNORE:
7220 ++s;
7221 break;
7222
7223 default:
7224 startinpos = s-starts;
7225 endinpos = startinpos + 1;
7226 if (unicode_decode_call_errorhandler_writer(
7227 errors, &error_handler_obj,
7228 "ascii", "ordinal not in range(128)",
7229 &starts, &e, &startinpos, &endinpos, &exc, &s,
7230 &writer))
7231 goto onError;
7232 kind = writer.kind;
7233 data = writer.data;
7234 }
7235 }
7236 Py_XDECREF(error_handler_obj);
7237 Py_XDECREF(exc);
7238 return _PyUnicodeWriter_Finish(&writer);
7239
7240 onError:
7241 _PyUnicodeWriter_Dealloc(&writer);
7242 Py_XDECREF(error_handler_obj);
7243 Py_XDECREF(exc);
7244 return NULL;
7245 }
7246
7247 /* Deprecated */
7248 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7249 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7250 Py_ssize_t size,
7251 const char *errors)
7252 {
7253 PyObject *result;
7254 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7255 if (unicode == NULL)
7256 return NULL;
7257 result = unicode_encode_ucs1(unicode, errors, 128);
7258 Py_DECREF(unicode);
7259 return result;
7260 }
7261
7262 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7263 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7264 {
7265 if (!PyUnicode_Check(unicode)) {
7266 PyErr_BadArgument();
7267 return NULL;
7268 }
7269 if (PyUnicode_READY(unicode) == -1)
7270 return NULL;
7271 /* Fast path: if it is an ASCII-only string, construct bytes object
7272 directly. Else defer to above function to raise the exception. */
7273 if (PyUnicode_IS_ASCII(unicode))
7274 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7275 PyUnicode_GET_LENGTH(unicode));
7276 return unicode_encode_ucs1(unicode, errors, 128);
7277 }
7278
7279 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7280 PyUnicode_AsASCIIString(PyObject *unicode)
7281 {
7282 return _PyUnicode_AsASCIIString(unicode, NULL);
7283 }
7284
7285 #ifdef MS_WINDOWS
7286
7287 /* --- MBCS codecs for Windows -------------------------------------------- */
7288
7289 #if SIZEOF_INT < SIZEOF_SIZE_T
7290 #define NEED_RETRY
7291 #endif
7292
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7297 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7298
7299 #ifndef WC_ERR_INVALID_CHARS
7300 # define WC_ERR_INVALID_CHARS 0x0080
7301 #endif
7302
7303 static const char*
code_page_name(UINT code_page,PyObject ** obj)7304 code_page_name(UINT code_page, PyObject **obj)
7305 {
7306 *obj = NULL;
7307 if (code_page == CP_ACP)
7308 return "mbcs";
7309 if (code_page == CP_UTF7)
7310 return "CP_UTF7";
7311 if (code_page == CP_UTF8)
7312 return "CP_UTF8";
7313
7314 *obj = PyBytes_FromFormat("cp%u", code_page);
7315 if (*obj == NULL)
7316 return NULL;
7317 return PyBytes_AS_STRING(*obj);
7318 }
7319
7320 static DWORD
decode_code_page_flags(UINT code_page)7321 decode_code_page_flags(UINT code_page)
7322 {
7323 if (code_page == CP_UTF7) {
7324 /* The CP_UTF7 decoder only supports flags=0 */
7325 return 0;
7326 }
7327 else
7328 return MB_ERR_INVALID_CHARS;
7329 }
7330
7331 /*
7332 * Decode a byte string from a Windows code page into unicode object in strict
7333 * mode.
7334 *
7335 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7336 * OSError and returns -1 on other error.
7337 */
7338 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7339 decode_code_page_strict(UINT code_page,
7340 wchar_t **buf,
7341 Py_ssize_t *bufsize,
7342 const char *in,
7343 int insize)
7344 {
7345 DWORD flags = MB_ERR_INVALID_CHARS;
7346 wchar_t *out;
7347 DWORD outsize;
7348
7349 /* First get the size of the result */
7350 assert(insize > 0);
7351 while ((outsize = MultiByteToWideChar(code_page, flags,
7352 in, insize, NULL, 0)) <= 0)
7353 {
7354 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7355 goto error;
7356 }
7357 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7358 flags = 0;
7359 }
7360
7361 /* Extend a wchar_t* buffer */
7362 Py_ssize_t n = *bufsize; /* Get the current length */
7363 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7364 return -1;
7365 }
7366 out = *buf + n;
7367
7368 /* Do the conversion */
7369 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7370 if (outsize <= 0)
7371 goto error;
7372 return insize;
7373
7374 error:
7375 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7376 return -2;
7377 PyErr_SetFromWindowsErr(0);
7378 return -1;
7379 }
7380
7381 /*
7382 * Decode a byte string from a code page into unicode object with an error
7383 * handler.
7384 *
7385 * Returns consumed size if succeed, or raise an OSError or
7386 * UnicodeDecodeError exception and returns -1 on error.
7387 */
7388 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7389 decode_code_page_errors(UINT code_page,
7390 wchar_t **buf,
7391 Py_ssize_t *bufsize,
7392 const char *in, const int size,
7393 const char *errors, int final)
7394 {
7395 const char *startin = in;
7396 const char *endin = in + size;
7397 DWORD flags = MB_ERR_INVALID_CHARS;
7398 /* Ideally, we should get reason from FormatMessage. This is the Windows
7399 2000 English version of the message. */
7400 const char *reason = "No mapping for the Unicode character exists "
7401 "in the target code page.";
7402 /* each step cannot decode more than 1 character, but a character can be
7403 represented as a surrogate pair */
7404 wchar_t buffer[2], *out;
7405 int insize;
7406 Py_ssize_t outsize;
7407 PyObject *errorHandler = NULL;
7408 PyObject *exc = NULL;
7409 PyObject *encoding_obj = NULL;
7410 const char *encoding;
7411 DWORD err;
7412 int ret = -1;
7413
7414 assert(size > 0);
7415
7416 encoding = code_page_name(code_page, &encoding_obj);
7417 if (encoding == NULL)
7418 return -1;
7419
7420 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7421 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7422 UnicodeDecodeError. */
7423 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7424 if (exc != NULL) {
7425 PyCodec_StrictErrors(exc);
7426 Py_CLEAR(exc);
7427 }
7428 goto error;
7429 }
7430
7431 /* Extend a wchar_t* buffer */
7432 Py_ssize_t n = *bufsize; /* Get the current length */
7433 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7434 PyErr_NoMemory();
7435 goto error;
7436 }
7437 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7438 goto error;
7439 }
7440 out = *buf + n;
7441
7442 /* Decode the byte string character per character */
7443 while (in < endin)
7444 {
7445 /* Decode a character */
7446 insize = 1;
7447 do
7448 {
7449 outsize = MultiByteToWideChar(code_page, flags,
7450 in, insize,
7451 buffer, Py_ARRAY_LENGTH(buffer));
7452 if (outsize > 0)
7453 break;
7454 err = GetLastError();
7455 if (err == ERROR_INVALID_FLAGS && flags) {
7456 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7457 flags = 0;
7458 continue;
7459 }
7460 if (err != ERROR_NO_UNICODE_TRANSLATION
7461 && err != ERROR_INSUFFICIENT_BUFFER)
7462 {
7463 PyErr_SetFromWindowsErr(0);
7464 goto error;
7465 }
7466 insize++;
7467 }
7468 /* 4=maximum length of a UTF-8 sequence */
7469 while (insize <= 4 && (in + insize) <= endin);
7470
7471 if (outsize <= 0) {
7472 Py_ssize_t startinpos, endinpos, outpos;
7473
7474 /* last character in partial decode? */
7475 if (in + insize >= endin && !final)
7476 break;
7477
7478 startinpos = in - startin;
7479 endinpos = startinpos + 1;
7480 outpos = out - *buf;
7481 if (unicode_decode_call_errorhandler_wchar(
7482 errors, &errorHandler,
7483 encoding, reason,
7484 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7485 buf, bufsize, &outpos))
7486 {
7487 goto error;
7488 }
7489 out = *buf + outpos;
7490 }
7491 else {
7492 in += insize;
7493 memcpy(out, buffer, outsize * sizeof(wchar_t));
7494 out += outsize;
7495 }
7496 }
7497
7498 /* Shrink the buffer */
7499 assert(out - *buf <= *bufsize);
7500 *bufsize = out - *buf;
7501 /* (in - startin) <= size and size is an int */
7502 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7503
7504 error:
7505 Py_XDECREF(encoding_obj);
7506 Py_XDECREF(errorHandler);
7507 Py_XDECREF(exc);
7508 return ret;
7509 }
7510
7511 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7512 decode_code_page_stateful(int code_page,
7513 const char *s, Py_ssize_t size,
7514 const char *errors, Py_ssize_t *consumed)
7515 {
7516 wchar_t *buf = NULL;
7517 Py_ssize_t bufsize = 0;
7518 int chunk_size, final, converted, done;
7519
7520 if (code_page < 0) {
7521 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7522 return NULL;
7523 }
7524 if (size < 0) {
7525 PyErr_BadInternalCall();
7526 return NULL;
7527 }
7528
7529 if (consumed)
7530 *consumed = 0;
7531
7532 do
7533 {
7534 #ifdef NEED_RETRY
7535 if (size > DECODING_CHUNK_SIZE) {
7536 chunk_size = DECODING_CHUNK_SIZE;
7537 final = 0;
7538 done = 0;
7539 }
7540 else
7541 #endif
7542 {
7543 chunk_size = (int)size;
7544 final = (consumed == NULL);
7545 done = 1;
7546 }
7547
7548 if (chunk_size == 0 && done) {
7549 if (buf != NULL)
7550 break;
7551 _Py_RETURN_UNICODE_EMPTY();
7552 }
7553
7554 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7555 s, chunk_size);
7556 if (converted == -2)
7557 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7558 s, chunk_size,
7559 errors, final);
7560 assert(converted != 0 || done);
7561
7562 if (converted < 0) {
7563 PyMem_Free(buf);
7564 return NULL;
7565 }
7566
7567 if (consumed)
7568 *consumed += converted;
7569
7570 s += converted;
7571 size -= converted;
7572 } while (!done);
7573
7574 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7575 PyMem_Free(buf);
7576 return v;
7577 }
7578
7579 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7580 PyUnicode_DecodeCodePageStateful(int code_page,
7581 const char *s,
7582 Py_ssize_t size,
7583 const char *errors,
7584 Py_ssize_t *consumed)
7585 {
7586 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7587 }
7588
7589 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7590 PyUnicode_DecodeMBCSStateful(const char *s,
7591 Py_ssize_t size,
7592 const char *errors,
7593 Py_ssize_t *consumed)
7594 {
7595 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7596 }
7597
7598 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7599 PyUnicode_DecodeMBCS(const char *s,
7600 Py_ssize_t size,
7601 const char *errors)
7602 {
7603 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7604 }
7605
7606 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7607 encode_code_page_flags(UINT code_page, const char *errors)
7608 {
7609 if (code_page == CP_UTF8) {
7610 return WC_ERR_INVALID_CHARS;
7611 }
7612 else if (code_page == CP_UTF7) {
7613 /* CP_UTF7 only supports flags=0 */
7614 return 0;
7615 }
7616 else {
7617 if (errors != NULL && strcmp(errors, "replace") == 0)
7618 return 0;
7619 else
7620 return WC_NO_BEST_FIT_CHARS;
7621 }
7622 }
7623
7624 /*
7625 * Encode a Unicode string to a Windows code page into a byte string in strict
7626 * mode.
7627 *
7628 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7629 * an OSError and returns -1 on other error.
7630 */
7631 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7632 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7633 PyObject *unicode, Py_ssize_t offset, int len,
7634 const char* errors)
7635 {
7636 BOOL usedDefaultChar = FALSE;
7637 BOOL *pusedDefaultChar = &usedDefaultChar;
7638 int outsize;
7639 wchar_t *p;
7640 Py_ssize_t size;
7641 const DWORD flags = encode_code_page_flags(code_page, NULL);
7642 char *out;
7643 /* Create a substring so that we can get the UTF-16 representation
7644 of just the slice under consideration. */
7645 PyObject *substring;
7646
7647 assert(len > 0);
7648
7649 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7650 pusedDefaultChar = &usedDefaultChar;
7651 else
7652 pusedDefaultChar = NULL;
7653
7654 substring = PyUnicode_Substring(unicode, offset, offset+len);
7655 if (substring == NULL)
7656 return -1;
7657 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7658 if (p == NULL) {
7659 Py_DECREF(substring);
7660 return -1;
7661 }
7662 assert(size <= INT_MAX);
7663
7664 /* First get the size of the result */
7665 outsize = WideCharToMultiByte(code_page, flags,
7666 p, (int)size,
7667 NULL, 0,
7668 NULL, pusedDefaultChar);
7669 if (outsize <= 0)
7670 goto error;
7671 /* If we used a default char, then we failed! */
7672 if (pusedDefaultChar && *pusedDefaultChar) {
7673 Py_DECREF(substring);
7674 return -2;
7675 }
7676
7677 if (*outbytes == NULL) {
7678 /* Create string object */
7679 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7680 if (*outbytes == NULL) {
7681 Py_DECREF(substring);
7682 return -1;
7683 }
7684 out = PyBytes_AS_STRING(*outbytes);
7685 }
7686 else {
7687 /* Extend string object */
7688 const Py_ssize_t n = PyBytes_Size(*outbytes);
7689 if (outsize > PY_SSIZE_T_MAX - n) {
7690 PyErr_NoMemory();
7691 Py_DECREF(substring);
7692 return -1;
7693 }
7694 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7695 Py_DECREF(substring);
7696 return -1;
7697 }
7698 out = PyBytes_AS_STRING(*outbytes) + n;
7699 }
7700
7701 /* Do the conversion */
7702 outsize = WideCharToMultiByte(code_page, flags,
7703 p, (int)size,
7704 out, outsize,
7705 NULL, pusedDefaultChar);
7706 Py_CLEAR(substring);
7707 if (outsize <= 0)
7708 goto error;
7709 if (pusedDefaultChar && *pusedDefaultChar)
7710 return -2;
7711 return 0;
7712
7713 error:
7714 Py_XDECREF(substring);
7715 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7716 return -2;
7717 PyErr_SetFromWindowsErr(0);
7718 return -1;
7719 }
7720
7721 /*
7722 * Encode a Unicode string to a Windows code page into a byte string using an
7723 * error handler.
7724 *
7725 * Returns consumed characters if succeed, or raise an OSError and returns
7726 * -1 on other error.
7727 */
7728 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7729 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7730 PyObject *unicode, Py_ssize_t unicode_offset,
7731 Py_ssize_t insize, const char* errors)
7732 {
7733 const DWORD flags = encode_code_page_flags(code_page, errors);
7734 Py_ssize_t pos = unicode_offset;
7735 Py_ssize_t endin = unicode_offset + insize;
7736 /* Ideally, we should get reason from FormatMessage. This is the Windows
7737 2000 English version of the message. */
7738 const char *reason = "invalid character";
7739 /* 4=maximum length of a UTF-8 sequence */
7740 char buffer[4];
7741 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7742 Py_ssize_t outsize;
7743 char *out;
7744 PyObject *errorHandler = NULL;
7745 PyObject *exc = NULL;
7746 PyObject *encoding_obj = NULL;
7747 const char *encoding;
7748 Py_ssize_t newpos, newoutsize;
7749 PyObject *rep;
7750 int ret = -1;
7751
7752 assert(insize > 0);
7753
7754 encoding = code_page_name(code_page, &encoding_obj);
7755 if (encoding == NULL)
7756 return -1;
7757
7758 if (errors == NULL || strcmp(errors, "strict") == 0) {
7759 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7760 then we raise a UnicodeEncodeError. */
7761 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7762 if (exc != NULL) {
7763 PyCodec_StrictErrors(exc);
7764 Py_DECREF(exc);
7765 }
7766 Py_XDECREF(encoding_obj);
7767 return -1;
7768 }
7769
7770 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7771 pusedDefaultChar = &usedDefaultChar;
7772 else
7773 pusedDefaultChar = NULL;
7774
7775 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7776 PyErr_NoMemory();
7777 goto error;
7778 }
7779 outsize = insize * Py_ARRAY_LENGTH(buffer);
7780
7781 if (*outbytes == NULL) {
7782 /* Create string object */
7783 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7784 if (*outbytes == NULL)
7785 goto error;
7786 out = PyBytes_AS_STRING(*outbytes);
7787 }
7788 else {
7789 /* Extend string object */
7790 Py_ssize_t n = PyBytes_Size(*outbytes);
7791 if (n > PY_SSIZE_T_MAX - outsize) {
7792 PyErr_NoMemory();
7793 goto error;
7794 }
7795 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7796 goto error;
7797 out = PyBytes_AS_STRING(*outbytes) + n;
7798 }
7799
7800 /* Encode the string character per character */
7801 while (pos < endin)
7802 {
7803 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7804 wchar_t chars[2];
7805 int charsize;
7806 if (ch < 0x10000) {
7807 chars[0] = (wchar_t)ch;
7808 charsize = 1;
7809 }
7810 else {
7811 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7812 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7813 charsize = 2;
7814 }
7815
7816 outsize = WideCharToMultiByte(code_page, flags,
7817 chars, charsize,
7818 buffer, Py_ARRAY_LENGTH(buffer),
7819 NULL, pusedDefaultChar);
7820 if (outsize > 0) {
7821 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7822 {
7823 pos++;
7824 memcpy(out, buffer, outsize);
7825 out += outsize;
7826 continue;
7827 }
7828 }
7829 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7830 PyErr_SetFromWindowsErr(0);
7831 goto error;
7832 }
7833
7834 rep = unicode_encode_call_errorhandler(
7835 errors, &errorHandler, encoding, reason,
7836 unicode, &exc,
7837 pos, pos + 1, &newpos);
7838 if (rep == NULL)
7839 goto error;
7840 pos = newpos;
7841
7842 if (PyBytes_Check(rep)) {
7843 outsize = PyBytes_GET_SIZE(rep);
7844 if (outsize != 1) {
7845 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7846 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7847 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7848 Py_DECREF(rep);
7849 goto error;
7850 }
7851 out = PyBytes_AS_STRING(*outbytes) + offset;
7852 }
7853 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7854 out += outsize;
7855 }
7856 else {
7857 Py_ssize_t i;
7858 enum PyUnicode_Kind kind;
7859 const void *data;
7860
7861 if (PyUnicode_READY(rep) == -1) {
7862 Py_DECREF(rep);
7863 goto error;
7864 }
7865
7866 outsize = PyUnicode_GET_LENGTH(rep);
7867 if (outsize != 1) {
7868 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7869 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7870 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7871 Py_DECREF(rep);
7872 goto error;
7873 }
7874 out = PyBytes_AS_STRING(*outbytes) + offset;
7875 }
7876 kind = PyUnicode_KIND(rep);
7877 data = PyUnicode_DATA(rep);
7878 for (i=0; i < outsize; i++) {
7879 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7880 if (ch > 127) {
7881 raise_encode_exception(&exc,
7882 encoding, unicode,
7883 pos, pos + 1,
7884 "unable to encode error handler result to ASCII");
7885 Py_DECREF(rep);
7886 goto error;
7887 }
7888 *out = (unsigned char)ch;
7889 out++;
7890 }
7891 }
7892 Py_DECREF(rep);
7893 }
7894 /* write a NUL byte */
7895 *out = 0;
7896 outsize = out - PyBytes_AS_STRING(*outbytes);
7897 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7898 if (_PyBytes_Resize(outbytes, outsize) < 0)
7899 goto error;
7900 ret = 0;
7901
7902 error:
7903 Py_XDECREF(encoding_obj);
7904 Py_XDECREF(errorHandler);
7905 Py_XDECREF(exc);
7906 return ret;
7907 }
7908
7909 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7910 encode_code_page(int code_page,
7911 PyObject *unicode,
7912 const char *errors)
7913 {
7914 Py_ssize_t len;
7915 PyObject *outbytes = NULL;
7916 Py_ssize_t offset;
7917 int chunk_len, ret, done;
7918
7919 if (!PyUnicode_Check(unicode)) {
7920 PyErr_BadArgument();
7921 return NULL;
7922 }
7923
7924 if (PyUnicode_READY(unicode) == -1)
7925 return NULL;
7926 len = PyUnicode_GET_LENGTH(unicode);
7927
7928 if (code_page < 0) {
7929 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7930 return NULL;
7931 }
7932
7933 if (len == 0)
7934 return PyBytes_FromStringAndSize(NULL, 0);
7935
7936 offset = 0;
7937 do
7938 {
7939 #ifdef NEED_RETRY
7940 if (len > DECODING_CHUNK_SIZE) {
7941 chunk_len = DECODING_CHUNK_SIZE;
7942 done = 0;
7943 }
7944 else
7945 #endif
7946 {
7947 chunk_len = (int)len;
7948 done = 1;
7949 }
7950
7951 ret = encode_code_page_strict(code_page, &outbytes,
7952 unicode, offset, chunk_len,
7953 errors);
7954 if (ret == -2)
7955 ret = encode_code_page_errors(code_page, &outbytes,
7956 unicode, offset,
7957 chunk_len, errors);
7958 if (ret < 0) {
7959 Py_XDECREF(outbytes);
7960 return NULL;
7961 }
7962
7963 offset += chunk_len;
7964 len -= chunk_len;
7965 } while (!done);
7966
7967 return outbytes;
7968 }
7969
7970 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7971 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7972 Py_ssize_t size,
7973 const char *errors)
7974 {
7975 PyObject *unicode, *res;
7976 unicode = PyUnicode_FromWideChar(p, size);
7977 if (unicode == NULL)
7978 return NULL;
7979 res = encode_code_page(CP_ACP, unicode, errors);
7980 Py_DECREF(unicode);
7981 return res;
7982 }
7983
7984 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7985 PyUnicode_EncodeCodePage(int code_page,
7986 PyObject *unicode,
7987 const char *errors)
7988 {
7989 return encode_code_page(code_page, unicode, errors);
7990 }
7991
7992 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7993 PyUnicode_AsMBCSString(PyObject *unicode)
7994 {
7995 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7996 }
7997
7998 #undef NEED_RETRY
7999
8000 #endif /* MS_WINDOWS */
8001
8002 /* --- Character Mapping Codec -------------------------------------------- */
8003
/* Decode `size` bytes at `s` using a str object as the decoding table: the
   character at index b of `mapping` is the decoding of byte b.  Bytes whose
   index is beyond the end of the mapping, or which map to U+FFFE, are
   treated as undefined and passed to the error handler named by `errors`.
   Output goes to `writer`.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        /* Characters are stored directly into the writer buffer without a
           per-character capacity check.
           NOTE(review): relies on the caller having prepared the writer
           for at least `size` characters -- confirm for any new caller. */
        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to the full 1-byte range and
                   reload the cached fields it may have changed. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte mapping covering all 256 byte
               values.  They leave via "goto Error" with s, ch and x set as
               soon as a character needs the generic handling below. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Does not fit the current buffer (this also catches
                       0xFFFE): let the generic code deal with it. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* 0xFFFE marks an undefined mapping */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per iteration, bounds-checked lookup. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached from the fast loops above; x holds the looked-up
           character and s points at the byte it came from. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            /* s is passed by address so the handler call can update it
               before the loop continues. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8122
8123 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8124 charmap_decode_mapping(const char *s,
8125 Py_ssize_t size,
8126 PyObject *mapping,
8127 const char *errors,
8128 _PyUnicodeWriter *writer)
8129 {
8130 const char *starts = s;
8131 const char *e;
8132 Py_ssize_t startinpos, endinpos;
8133 PyObject *errorHandler = NULL, *exc = NULL;
8134 unsigned char ch;
8135 PyObject *key, *item = NULL;
8136
8137 e = s + size;
8138
8139 while (s < e) {
8140 ch = *s;
8141
8142 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8143 key = PyLong_FromLong((long)ch);
8144 if (key == NULL)
8145 goto onError;
8146
8147 item = PyObject_GetItem(mapping, key);
8148 Py_DECREF(key);
8149 if (item == NULL) {
8150 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8151 /* No mapping found means: mapping is undefined. */
8152 PyErr_Clear();
8153 goto Undefined;
8154 } else
8155 goto onError;
8156 }
8157
8158 /* Apply mapping */
8159 if (item == Py_None)
8160 goto Undefined;
8161 if (PyLong_Check(item)) {
8162 long value = PyLong_AS_LONG(item);
8163 if (value == 0xFFFE)
8164 goto Undefined;
8165 if (value < 0 || value > MAX_UNICODE) {
8166 PyErr_Format(PyExc_TypeError,
8167 "character mapping must be in range(0x%x)",
8168 (unsigned long)MAX_UNICODE + 1);
8169 goto onError;
8170 }
8171
8172 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8173 goto onError;
8174 }
8175 else if (PyUnicode_Check(item)) {
8176 if (PyUnicode_READY(item) == -1)
8177 goto onError;
8178 if (PyUnicode_GET_LENGTH(item) == 1) {
8179 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8180 if (value == 0xFFFE)
8181 goto Undefined;
8182 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8183 goto onError;
8184 }
8185 else {
8186 writer->overallocate = 1;
8187 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8188 goto onError;
8189 }
8190 }
8191 else {
8192 /* wrong return value */
8193 PyErr_SetString(PyExc_TypeError,
8194 "character mapping must return integer, None or str");
8195 goto onError;
8196 }
8197 Py_CLEAR(item);
8198 ++s;
8199 continue;
8200
8201 Undefined:
8202 /* undefined mapping */
8203 Py_CLEAR(item);
8204 startinpos = s-starts;
8205 endinpos = startinpos+1;
8206 if (unicode_decode_call_errorhandler_writer(
8207 errors, &errorHandler,
8208 "charmap", "character maps to <undefined>",
8209 &starts, &e, &startinpos, &endinpos, &exc, &s,
8210 writer)) {
8211 goto onError;
8212 }
8213 }
8214 Py_XDECREF(errorHandler);
8215 Py_XDECREF(exc);
8216 return 0;
8217
8218 onError:
8219 Py_XDECREF(item);
8220 Py_XDECREF(errorHandler);
8221 Py_XDECREF(exc);
8222 return -1;
8223 }
8224
8225 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8226 PyUnicode_DecodeCharmap(const char *s,
8227 Py_ssize_t size,
8228 PyObject *mapping,
8229 const char *errors)
8230 {
8231 _PyUnicodeWriter writer;
8232
8233 /* Default to Latin-1 */
8234 if (mapping == NULL)
8235 return PyUnicode_DecodeLatin1(s, size, errors);
8236
8237 if (size == 0)
8238 _Py_RETURN_UNICODE_EMPTY();
8239 _PyUnicodeWriter_Init(&writer);
8240 writer.min_length = size;
8241 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8242 goto onError;
8243
8244 if (PyUnicode_CheckExact(mapping)) {
8245 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8246 goto onError;
8247 }
8248 else {
8249 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8250 goto onError;
8251 }
8252 return _PyUnicodeWriter_Finish(&writer);
8253
8254 onError:
8255 _PyUnicodeWriter_Dealloc(&writer);
8256 return NULL;
8257 }
8258
8259 /* Charmap encoding: the lookup table */
8260
/* Three-level trie mapping BMP code points to byte values, as built by
   PyUnicode_BuildEncodingMap() and read by encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (ch >> 11); holds a level-2 block number, or 0xFF if no
       character with those top bits is mapped. */
    unsigned char level1[32];
    /* Number of 16-entry level-2 blocks and 128-entry level-3 blocks. */
    int count2, count3;
    /* Variable-length tail: 16*count2 level-2 bytes followed by 128*count3
       level-3 bytes.  Declared with one element; the allocation accounts
       for it by subtracting 1 from the size. */
    unsigned char level23[1];
};
8267
8268 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8269 encoding_map_size(PyObject *obj, PyObject* args)
8270 {
8271 struct encoding_map *map = (struct encoding_map*)obj;
8272 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8273 128*map->count3);
8274 }
8275
/* Method table of the EncodingMap type: a single no-argument method
   "size", terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8281
/* Type object for the trie returned by PyUnicode_BuildEncodingMap().
   Only tp_name, tp_basicsize, tp_flags and tp_methods are filled in; every
   other slot is left as 0.  Instances are created directly in
   PyUnicode_BuildEncodingMap() rather than through tp_new. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    0,                      /*tp_dealloc*/
    0,                      /*tp_vectorcall_offset*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_as_async*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8325
/* Build an encoding table from a decoding table.

   `string` is a non-empty str whose character at index b is the decoding of
   byte b; only the first 256 characters are considered.

   Returns either an EncodingMap trie object or, when the table cannot be
   represented as a trie, a dict mapping code point -> byte value.
   Returns NULL with an exception set on error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables for the sizing pass; 0xFF means "not allocated yet". */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    const void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* Sizing pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is reserved as the "empty" marker, so block numbers must stay
       below it. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict {code point: byte value}.  Unlike the trie
           pass below, this loop does not skip 0xFFFE entries. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The "- 1" accounts for the one-byte level23 placeholder already
       included in sizeof(struct encoding_map). */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Fill pass: level-3 block numbers are handed out again from 0, in the
       same encounter order as in the sizing pass. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        /* Store the byte value; 0 is left to mean "unmapped". */
        mlevel3[i3] = i;
    }
    return result;
}
8435
8436 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8437 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8438 {
8439 struct encoding_map *map = (struct encoding_map*)mapping;
8440 int l1 = c>>11;
8441 int l2 = (c>>7) & 0xF;
8442 int l3 = c & 0x7F;
8443 int i;
8444
8445 if (c > 0xFFFF)
8446 return -1;
8447 if (c == 0)
8448 return 0;
8449 /* level 1*/
8450 i = map->level1[l1];
8451 if (i == 0xFF) {
8452 return -1;
8453 }
8454 /* level 2*/
8455 i = map->level23[16*i+l2];
8456 if (i == 0xFF) {
8457 return -1;
8458 }
8459 /* level 3 */
8460 i = map->level23[16*map->count2 + 128*i + l3];
8461 if (i == 0) {
8462 return -1;
8463 }
8464 return i;
8465 }
8466
8467 /* Lookup the character ch in the mapping. If the character
8468 can't be found, Py_None is returned (or NULL, if another
8469 error occurred). */
8470 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8471 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8472 {
8473 PyObject *w = PyLong_FromLong((long)c);
8474 PyObject *x;
8475
8476 if (w == NULL)
8477 return NULL;
8478 x = PyObject_GetItem(mapping, w);
8479 Py_DECREF(w);
8480 if (x == NULL) {
8481 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8482 /* No mapping found means: mapping is undefined. */
8483 PyErr_Clear();
8484 Py_RETURN_NONE;
8485 } else
8486 return NULL;
8487 }
8488 else if (x == Py_None)
8489 return x;
8490 else if (PyLong_Check(x)) {
8491 long value = PyLong_AS_LONG(x);
8492 if (value < 0 || value > 255) {
8493 PyErr_SetString(PyExc_TypeError,
8494 "character mapping must be in range(256)");
8495 Py_DECREF(x);
8496 return NULL;
8497 }
8498 return x;
8499 }
8500 else if (PyBytes_Check(x))
8501 return x;
8502 else {
8503 /* wrong return value */
8504 PyErr_Format(PyExc_TypeError,
8505 "character mapping must return integer, bytes or None, not %.400s",
8506 Py_TYPE(x)->tp_name);
8507 Py_DECREF(x);
8508 return NULL;
8509 }
8510 }
8511
8512 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8513 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8514 {
8515 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8516 /* exponentially overallocate to minimize reallocations */
8517 if (requiredsize < 2*outsize)
8518 requiredsize = 2*outsize;
8519 if (_PyBytes_Resize(outobj, requiredsize))
8520 return -1;
8521 return 0;
8522 }
8523
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output;
     enc_FAILED    - the character has no mapping, nothing was written;
     enc_EXCEPTION - an error occurred (lookup or resize failure). */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8527 /* lookup the character, put the result in the output string and adjust
8528 various state variables. Resize the output bytes object if not enough
8529 space is available. Return a new reference to the object that
8530 was put in the output buffer, or Py_None, if the mapping was undefined
8531 (in which case no character was written) or NULL, if a
8532 reallocation error occurred. The caller must decref the result */
8533 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8534 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8535 PyObject **outobj, Py_ssize_t *outpos)
8536 {
8537 PyObject *rep;
8538 char *outstart;
8539 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8540
8541 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8542 int res = encoding_map_lookup(c, mapping);
8543 Py_ssize_t requiredsize = *outpos+1;
8544 if (res == -1)
8545 return enc_FAILED;
8546 if (outsize<requiredsize)
8547 if (charmapencode_resize(outobj, outpos, requiredsize))
8548 return enc_EXCEPTION;
8549 outstart = PyBytes_AS_STRING(*outobj);
8550 outstart[(*outpos)++] = (char)res;
8551 return enc_SUCCESS;
8552 }
8553
8554 rep = charmapencode_lookup(c, mapping);
8555 if (rep==NULL)
8556 return enc_EXCEPTION;
8557 else if (rep==Py_None) {
8558 Py_DECREF(rep);
8559 return enc_FAILED;
8560 } else {
8561 if (PyLong_Check(rep)) {
8562 Py_ssize_t requiredsize = *outpos+1;
8563 if (outsize<requiredsize)
8564 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8565 Py_DECREF(rep);
8566 return enc_EXCEPTION;
8567 }
8568 outstart = PyBytes_AS_STRING(*outobj);
8569 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8570 }
8571 else {
8572 const char *repchars = PyBytes_AS_STRING(rep);
8573 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8574 Py_ssize_t requiredsize = *outpos+repsize;
8575 if (outsize<requiredsize)
8576 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8577 Py_DECREF(rep);
8578 return enc_EXCEPTION;
8579 }
8580 outstart = PyBytes_AS_STRING(*outobj);
8581 memcpy(outstart + *outpos, repchars, repsize);
8582 *outpos += repsize;
8583 }
8584 }
8585 Py_DECREF(rep);
8586 return enc_SUCCESS;
8587 }
8588
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   unicode:           the string being encoded.
   inpos:             in/out; on entry the index of the first unencodable
                      character, on success the index to resume at.
   mapping:           the encoding map in use.
   exceptionObject:   in/out cached exception instance.
   error_handler:     in/out cached handler kind; resolved from `errors`
                      on the first call (while it is _Py_ERROR_UNKNOWN).
   error_handler_obj: in/out cached handler callable for custom handlers.
   errors:            error handler name.
   res, respos:       output bytes object and current write position. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    /* Extend [collstartpos, collendpos) over the whole run of consecutive
       characters that have no mapping, so they are handled together. */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* Emit one '?' per unencodable character; fail if '?' itself has
           no mapping. */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: the "&#N;" reference is formatted into a
           local buffer and each of its characters is itself encoded
           through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* Any other handler: call it and process what it returns. */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement */
        /* str result: every character must itself be encodable through
           the mapping. */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* resume where the handler asked to */
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8736
8737 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8738 _PyUnicode_EncodeCharmap(PyObject *unicode,
8739 PyObject *mapping,
8740 const char *errors)
8741 {
8742 /* output object */
8743 PyObject *res = NULL;
8744 /* current input position */
8745 Py_ssize_t inpos = 0;
8746 Py_ssize_t size;
8747 /* current output position */
8748 Py_ssize_t respos = 0;
8749 PyObject *error_handler_obj = NULL;
8750 PyObject *exc = NULL;
8751 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8752 const void *data;
8753 int kind;
8754
8755 if (PyUnicode_READY(unicode) == -1)
8756 return NULL;
8757 size = PyUnicode_GET_LENGTH(unicode);
8758 data = PyUnicode_DATA(unicode);
8759 kind = PyUnicode_KIND(unicode);
8760
8761 /* Default to Latin-1 */
8762 if (mapping == NULL)
8763 return unicode_encode_ucs1(unicode, errors, 256);
8764
8765 /* allocate enough for a simple encoding without
8766 replacements, if we need more, we'll resize */
8767 res = PyBytes_FromStringAndSize(NULL, size);
8768 if (res == NULL)
8769 goto onError;
8770 if (size == 0)
8771 return res;
8772
8773 while (inpos<size) {
8774 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8775 /* try to encode it */
8776 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8777 if (x==enc_EXCEPTION) /* error */
8778 goto onError;
8779 if (x==enc_FAILED) { /* unencodable character */
8780 if (charmap_encoding_error(unicode, &inpos, mapping,
8781 &exc,
8782 &error_handler, &error_handler_obj, errors,
8783 &res, &respos)) {
8784 goto onError;
8785 }
8786 }
8787 else
8788 /* done with this character => adjust input position */
8789 ++inpos;
8790 }
8791
8792 /* Resize if we allocated to much */
8793 if (respos<PyBytes_GET_SIZE(res))
8794 if (_PyBytes_Resize(&res, respos) < 0)
8795 goto onError;
8796
8797 Py_XDECREF(exc);
8798 Py_XDECREF(error_handler_obj);
8799 return res;
8800
8801 onError:
8802 Py_XDECREF(res);
8803 Py_XDECREF(exc);
8804 Py_XDECREF(error_handler_obj);
8805 return NULL;
8806 }
8807
8808 /* Deprecated */
8809 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8810 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8811 Py_ssize_t size,
8812 PyObject *mapping,
8813 const char *errors)
8814 {
8815 PyObject *result;
8816 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8817 if (unicode == NULL)
8818 return NULL;
8819 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8820 Py_DECREF(unicode);
8821 return result;
8822 }
8823
8824 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8825 PyUnicode_AsCharmapString(PyObject *unicode,
8826 PyObject *mapping)
8827 {
8828 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8829 PyErr_BadArgument();
8830 return NULL;
8831 }
8832 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8833 }
8834
8835 /* create or adjust a UnicodeTranslateError */
8836 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8837 make_translate_exception(PyObject **exceptionObject,
8838 PyObject *unicode,
8839 Py_ssize_t startpos, Py_ssize_t endpos,
8840 const char *reason)
8841 {
8842 if (*exceptionObject == NULL) {
8843 *exceptionObject = _PyUnicodeTranslateError_Create(
8844 unicode, startpos, endpos, reason);
8845 }
8846 else {
8847 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8848 goto onError;
8849 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8850 goto onError;
8851 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8852 goto onError;
8853 return;
8854 onError:
8855 Py_CLEAR(*exceptionObject);
8856 }
8857 }
8858
8859 /* error handling callback helper:
8860 build arguments, call the callback and check the arguments,
8861 put the result into newpos and return the replacement string, which
8862 has to be freed by the caller */
8863 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8864 unicode_translate_call_errorhandler(const char *errors,
8865 PyObject **errorHandler,
8866 const char *reason,
8867 PyObject *unicode, PyObject **exceptionObject,
8868 Py_ssize_t startpos, Py_ssize_t endpos,
8869 Py_ssize_t *newpos)
8870 {
8871 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8872
8873 Py_ssize_t i_newpos;
8874 PyObject *restuple;
8875 PyObject *resunicode;
8876
8877 if (*errorHandler == NULL) {
8878 *errorHandler = PyCodec_LookupError(errors);
8879 if (*errorHandler == NULL)
8880 return NULL;
8881 }
8882
8883 make_translate_exception(exceptionObject,
8884 unicode, startpos, endpos, reason);
8885 if (*exceptionObject == NULL)
8886 return NULL;
8887
8888 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8889 if (restuple == NULL)
8890 return NULL;
8891 if (!PyTuple_Check(restuple)) {
8892 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8893 Py_DECREF(restuple);
8894 return NULL;
8895 }
8896 if (!PyArg_ParseTuple(restuple, argparse,
8897 &resunicode, &i_newpos)) {
8898 Py_DECREF(restuple);
8899 return NULL;
8900 }
8901 if (i_newpos<0)
8902 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8903 else
8904 *newpos = i_newpos;
8905 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8906 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8907 Py_DECREF(restuple);
8908 return NULL;
8909 }
8910 Py_INCREF(resunicode);
8911 Py_DECREF(restuple);
8912 return resunicode;
8913 }
8914
8915 /* Lookup the character ch in the mapping and put the result in result,
8916 which must be decrefed by the caller.
8917 Return 0 on success, -1 on error */
8918 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8919 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8920 {
8921 PyObject *w = PyLong_FromLong((long)c);
8922 PyObject *x;
8923
8924 if (w == NULL)
8925 return -1;
8926 x = PyObject_GetItem(mapping, w);
8927 Py_DECREF(w);
8928 if (x == NULL) {
8929 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8930 /* No mapping found means: use 1:1 mapping. */
8931 PyErr_Clear();
8932 *result = NULL;
8933 return 0;
8934 } else
8935 return -1;
8936 }
8937 else if (x == Py_None) {
8938 *result = x;
8939 return 0;
8940 }
8941 else if (PyLong_Check(x)) {
8942 long value = PyLong_AS_LONG(x);
8943 if (value < 0 || value > MAX_UNICODE) {
8944 PyErr_Format(PyExc_ValueError,
8945 "character mapping must be in range(0x%x)",
8946 MAX_UNICODE+1);
8947 Py_DECREF(x);
8948 return -1;
8949 }
8950 *result = x;
8951 return 0;
8952 }
8953 else if (PyUnicode_Check(x)) {
8954 *result = x;
8955 return 0;
8956 }
8957 else {
8958 /* wrong return value */
8959 PyErr_SetString(PyExc_TypeError,
8960 "character mapping must return integer, None or str");
8961 Py_DECREF(x);
8962 return -1;
8963 }
8964 }
8965
8966 /* lookup the character, write the result into the writer.
8967 Return 1 if the result was written into the writer, return 0 if the mapping
8968 was undefined, raise an exception return -1 on error. */
8969 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8970 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8971 _PyUnicodeWriter *writer)
8972 {
8973 PyObject *item;
8974
8975 if (charmaptranslate_lookup(ch, mapping, &item))
8976 return -1;
8977
8978 if (item == NULL) {
8979 /* not found => default to 1:1 mapping */
8980 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8981 return -1;
8982 }
8983 return 1;
8984 }
8985
8986 if (item == Py_None) {
8987 Py_DECREF(item);
8988 return 0;
8989 }
8990
8991 if (PyLong_Check(item)) {
8992 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8993 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8994 used it */
8995 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8996 Py_DECREF(item);
8997 return -1;
8998 }
8999 Py_DECREF(item);
9000 return 1;
9001 }
9002
9003 if (!PyUnicode_Check(item)) {
9004 Py_DECREF(item);
9005 return -1;
9006 }
9007
9008 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9009 Py_DECREF(item);
9010 return -1;
9011 }
9012
9013 Py_DECREF(item);
9014 return 1;
9015 }
9016
9017 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9018 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9019 Py_UCS1 *translate)
9020 {
9021 PyObject *item = NULL;
9022 int ret = 0;
9023
9024 if (charmaptranslate_lookup(ch, mapping, &item)) {
9025 return -1;
9026 }
9027
9028 if (item == Py_None) {
9029 /* deletion */
9030 translate[ch] = 0xfe;
9031 }
9032 else if (item == NULL) {
9033 /* not found => default to 1:1 mapping */
9034 translate[ch] = ch;
9035 return 1;
9036 }
9037 else if (PyLong_Check(item)) {
9038 long replace = PyLong_AS_LONG(item);
9039 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9040 used it */
9041 if (127 < replace) {
9042 /* invalid character or character outside ASCII:
9043 skip the fast translate */
9044 goto exit;
9045 }
9046 translate[ch] = (Py_UCS1)replace;
9047 }
9048 else if (PyUnicode_Check(item)) {
9049 Py_UCS4 replace;
9050
9051 if (PyUnicode_READY(item) == -1) {
9052 Py_DECREF(item);
9053 return -1;
9054 }
9055 if (PyUnicode_GET_LENGTH(item) != 1)
9056 goto exit;
9057
9058 replace = PyUnicode_READ_CHAR(item, 0);
9059 if (replace > 127)
9060 goto exit;
9061 translate[ch] = (Py_UCS1)replace;
9062 }
9063 else {
9064 /* not None, NULL, long or unicode */
9065 goto exit;
9066 }
9067 ret = 1;
9068
9069 exit:
9070 Py_DECREF(item);
9071 return ret;
9072 }
9073
9074 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9075 was translated into writer, return 0 if the input string was partially
9076 translated into writer, raise an exception and return -1 on error. */
9077 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9078 unicode_fast_translate(PyObject *input, PyObject *mapping,
9079 _PyUnicodeWriter *writer, int ignore,
9080 Py_ssize_t *input_pos)
9081 {
9082 Py_UCS1 ascii_table[128], ch, ch2;
9083 Py_ssize_t len;
9084 const Py_UCS1 *in, *end;
9085 Py_UCS1 *out;
9086 int res = 0;
9087
9088 len = PyUnicode_GET_LENGTH(input);
9089
9090 memset(ascii_table, 0xff, 128);
9091
9092 in = PyUnicode_1BYTE_DATA(input);
9093 end = in + len;
9094
9095 assert(PyUnicode_IS_ASCII(writer->buffer));
9096 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9097 out = PyUnicode_1BYTE_DATA(writer->buffer);
9098
9099 for (; in < end; in++) {
9100 ch = *in;
9101 ch2 = ascii_table[ch];
9102 if (ch2 == 0xff) {
9103 int translate = unicode_fast_translate_lookup(mapping, ch,
9104 ascii_table);
9105 if (translate < 0)
9106 return -1;
9107 if (translate == 0)
9108 goto exit;
9109 ch2 = ascii_table[ch];
9110 }
9111 if (ch2 == 0xfe) {
9112 if (ignore)
9113 continue;
9114 goto exit;
9115 }
9116 assert(ch2 < 128);
9117 *out = ch2;
9118 out++;
9119 }
9120 res = 1;
9121
9122 exit:
9123 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9124 *input_pos = in - PyUnicode_1BYTE_DATA(input);
9125 return res;
9126 }
9127
9128 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9129 _PyUnicode_TranslateCharmap(PyObject *input,
9130 PyObject *mapping,
9131 const char *errors)
9132 {
9133 /* input object */
9134 const void *data;
9135 Py_ssize_t size, i;
9136 int kind;
9137 /* output buffer */
9138 _PyUnicodeWriter writer;
9139 /* error handler */
9140 const char *reason = "character maps to <undefined>";
9141 PyObject *errorHandler = NULL;
9142 PyObject *exc = NULL;
9143 int ignore;
9144 int res;
9145
9146 if (mapping == NULL) {
9147 PyErr_BadArgument();
9148 return NULL;
9149 }
9150
9151 if (PyUnicode_READY(input) == -1)
9152 return NULL;
9153 data = PyUnicode_DATA(input);
9154 kind = PyUnicode_KIND(input);
9155 size = PyUnicode_GET_LENGTH(input);
9156
9157 if (size == 0)
9158 return PyUnicode_FromObject(input);
9159
9160 /* allocate enough for a simple 1:1 translation without
9161 replacements, if we need more, we'll resize */
9162 _PyUnicodeWriter_Init(&writer);
9163 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9164 goto onError;
9165
9166 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9167
9168 if (PyUnicode_READY(input) == -1)
9169 return NULL;
9170 if (PyUnicode_IS_ASCII(input)) {
9171 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9172 if (res < 0) {
9173 _PyUnicodeWriter_Dealloc(&writer);
9174 return NULL;
9175 }
9176 if (res == 1)
9177 return _PyUnicodeWriter_Finish(&writer);
9178 }
9179 else {
9180 i = 0;
9181 }
9182
9183 while (i<size) {
9184 /* try to encode it */
9185 int translate;
9186 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9187 Py_ssize_t newpos;
9188 /* startpos for collecting untranslatable chars */
9189 Py_ssize_t collstart;
9190 Py_ssize_t collend;
9191 Py_UCS4 ch;
9192
9193 ch = PyUnicode_READ(kind, data, i);
9194 translate = charmaptranslate_output(ch, mapping, &writer);
9195 if (translate < 0)
9196 goto onError;
9197
9198 if (translate != 0) {
9199 /* it worked => adjust input pointer */
9200 ++i;
9201 continue;
9202 }
9203
9204 /* untranslatable character */
9205 collstart = i;
9206 collend = i+1;
9207
9208 /* find all untranslatable characters */
9209 while (collend < size) {
9210 PyObject *x;
9211 ch = PyUnicode_READ(kind, data, collend);
9212 if (charmaptranslate_lookup(ch, mapping, &x))
9213 goto onError;
9214 Py_XDECREF(x);
9215 if (x != Py_None)
9216 break;
9217 ++collend;
9218 }
9219
9220 if (ignore) {
9221 i = collend;
9222 }
9223 else {
9224 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9225 reason, input, &exc,
9226 collstart, collend, &newpos);
9227 if (repunicode == NULL)
9228 goto onError;
9229 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9230 Py_DECREF(repunicode);
9231 goto onError;
9232 }
9233 Py_DECREF(repunicode);
9234 i = newpos;
9235 }
9236 }
9237 Py_XDECREF(exc);
9238 Py_XDECREF(errorHandler);
9239 return _PyUnicodeWriter_Finish(&writer);
9240
9241 onError:
9242 _PyUnicodeWriter_Dealloc(&writer);
9243 Py_XDECREF(exc);
9244 Py_XDECREF(errorHandler);
9245 return NULL;
9246 }
9247
9248 /* Deprecated. Use PyUnicode_Translate instead. */
9249 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9250 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9251 Py_ssize_t size,
9252 PyObject *mapping,
9253 const char *errors)
9254 {
9255 PyObject *result;
9256 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9257 if (!unicode)
9258 return NULL;
9259 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9260 Py_DECREF(unicode);
9261 return result;
9262 }
9263
9264 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9265 PyUnicode_Translate(PyObject *str,
9266 PyObject *mapping,
9267 const char *errors)
9268 {
9269 if (ensure_unicode(str) < 0)
9270 return NULL;
9271 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9272 }
9273
9274 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9275 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9276 {
9277 if (!PyUnicode_Check(unicode)) {
9278 PyErr_BadInternalCall();
9279 return NULL;
9280 }
9281 if (PyUnicode_READY(unicode) == -1)
9282 return NULL;
9283 if (PyUnicode_IS_ASCII(unicode)) {
9284 /* If the string is already ASCII, just return the same string */
9285 Py_INCREF(unicode);
9286 return unicode;
9287 }
9288
9289 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9290 PyObject *result = PyUnicode_New(len, 127);
9291 if (result == NULL) {
9292 return NULL;
9293 }
9294
9295 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9296 int kind = PyUnicode_KIND(unicode);
9297 const void *data = PyUnicode_DATA(unicode);
9298 Py_ssize_t i;
9299 for (i = 0; i < len; ++i) {
9300 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9301 if (ch < 127) {
9302 out[i] = ch;
9303 }
9304 else if (Py_UNICODE_ISSPACE(ch)) {
9305 out[i] = ' ';
9306 }
9307 else {
9308 int decimal = Py_UNICODE_TODECIMAL(ch);
9309 if (decimal < 0) {
9310 out[i] = '?';
9311 out[i+1] = '\0';
9312 _PyUnicode_LENGTH(result) = i + 1;
9313 break;
9314 }
9315 out[i] = '0' + decimal;
9316 }
9317 }
9318
9319 assert(_PyUnicode_CheckConsistency(result, 1));
9320 return result;
9321 }
9322
9323 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9324 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9325 Py_ssize_t length)
9326 {
9327 PyObject *decimal;
9328 Py_ssize_t i;
9329 Py_UCS4 maxchar;
9330 enum PyUnicode_Kind kind;
9331 const void *data;
9332
9333 maxchar = 127;
9334 for (i = 0; i < length; i++) {
9335 Py_UCS4 ch = s[i];
9336 if (ch > 127) {
9337 int decimal = Py_UNICODE_TODECIMAL(ch);
9338 if (decimal >= 0)
9339 ch = '0' + decimal;
9340 maxchar = Py_MAX(maxchar, ch);
9341 }
9342 }
9343
9344 /* Copy to a new string */
9345 decimal = PyUnicode_New(length, maxchar);
9346 if (decimal == NULL)
9347 return decimal;
9348 kind = PyUnicode_KIND(decimal);
9349 data = PyUnicode_DATA(decimal);
9350 /* Iterate over code points */
9351 for (i = 0; i < length; i++) {
9352 Py_UCS4 ch = s[i];
9353 if (ch > 127) {
9354 int decimal = Py_UNICODE_TODECIMAL(ch);
9355 if (decimal >= 0)
9356 ch = '0' + decimal;
9357 }
9358 PyUnicode_WRITE(kind, data, i, ch);
9359 }
9360 return unicode_result(decimal);
9361 }
9362 /* --- Decimal Encoder ---------------------------------------------------- */
9363
9364 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9365 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9366 Py_ssize_t length,
9367 char *output,
9368 const char *errors)
9369 {
9370 PyObject *unicode;
9371 Py_ssize_t i;
9372 enum PyUnicode_Kind kind;
9373 const void *data;
9374
9375 if (output == NULL) {
9376 PyErr_BadArgument();
9377 return -1;
9378 }
9379
9380 unicode = PyUnicode_FromWideChar(s, length);
9381 if (unicode == NULL)
9382 return -1;
9383
9384 kind = PyUnicode_KIND(unicode);
9385 data = PyUnicode_DATA(unicode);
9386
9387 for (i=0; i < length; ) {
9388 PyObject *exc;
9389 Py_UCS4 ch;
9390 int decimal;
9391 Py_ssize_t startpos;
9392
9393 ch = PyUnicode_READ(kind, data, i);
9394
9395 if (Py_UNICODE_ISSPACE(ch)) {
9396 *output++ = ' ';
9397 i++;
9398 continue;
9399 }
9400 decimal = Py_UNICODE_TODECIMAL(ch);
9401 if (decimal >= 0) {
9402 *output++ = '0' + decimal;
9403 i++;
9404 continue;
9405 }
9406 if (0 < ch && ch < 256) {
9407 *output++ = (char)ch;
9408 i++;
9409 continue;
9410 }
9411
9412 startpos = i;
9413 exc = NULL;
9414 raise_encode_exception(&exc, "decimal", unicode,
9415 startpos, startpos+1,
9416 "invalid decimal Unicode string");
9417 Py_XDECREF(exc);
9418 Py_DECREF(unicode);
9419 return -1;
9420 }
9421 /* 0-terminate the output string */
9422 *output++ = '\0';
9423 Py_DECREF(unicode);
9424 return 0;
9425 }
9426
9427 /* --- Helpers ------------------------------------------------------------ */
9428
/* Helper macro to fix up start/end slice values against a length:
   `end` is capped at `len`; a negative `end` or `start` has `len` added
   and is then floored at 0.  `start` is not capped at `len`.

   `start` and `end` must be assignable lvalues.  All three arguments
   may be evaluated more than once, so they must be free of side
   effects.  The body is wrapped in do { } while (0) so that the
   expansion is a single statement and is safe inside an unbraced
   if/else; every use must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9443
9444 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9445 any_find_slice(PyObject* s1, PyObject* s2,
9446 Py_ssize_t start,
9447 Py_ssize_t end,
9448 int direction)
9449 {
9450 int kind1, kind2;
9451 const void *buf1, *buf2;
9452 Py_ssize_t len1, len2, result;
9453
9454 kind1 = PyUnicode_KIND(s1);
9455 kind2 = PyUnicode_KIND(s2);
9456 if (kind1 < kind2)
9457 return -1;
9458
9459 len1 = PyUnicode_GET_LENGTH(s1);
9460 len2 = PyUnicode_GET_LENGTH(s2);
9461 ADJUST_INDICES(start, end, len1);
9462 if (end - start < len2)
9463 return -1;
9464
9465 buf1 = PyUnicode_DATA(s1);
9466 buf2 = PyUnicode_DATA(s2);
9467 if (len2 == 1) {
9468 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9469 result = findchar((const char *)buf1 + kind1*start,
9470 kind1, end - start, ch, direction);
9471 if (result == -1)
9472 return -1;
9473 else
9474 return start + result;
9475 }
9476
9477 if (kind2 != kind1) {
9478 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9479 if (!buf2)
9480 return -2;
9481 }
9482
9483 if (direction > 0) {
9484 switch (kind1) {
9485 case PyUnicode_1BYTE_KIND:
9486 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9487 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9488 else
9489 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9490 break;
9491 case PyUnicode_2BYTE_KIND:
9492 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9493 break;
9494 case PyUnicode_4BYTE_KIND:
9495 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9496 break;
9497 default:
9498 Py_UNREACHABLE();
9499 }
9500 }
9501 else {
9502 switch (kind1) {
9503 case PyUnicode_1BYTE_KIND:
9504 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9505 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9506 else
9507 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9508 break;
9509 case PyUnicode_2BYTE_KIND:
9510 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9511 break;
9512 case PyUnicode_4BYTE_KIND:
9513 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9514 break;
9515 default:
9516 Py_UNREACHABLE();
9517 }
9518 }
9519
9520 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9521 if (kind2 != kind1)
9522 PyMem_Free((void *)buf2);
9523
9524 return result;
9525 }
9526
9527 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9528 #include "stringlib/localeutil.h"
9529
9530 /**
9531 * InsertThousandsGrouping:
9532 * @writer: Unicode writer.
9533 * @n_buffer: Number of characters in @buffer.
9534 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9535 * @d_pos: Start of digits string.
9536 * @n_digits: The number of digits in the string, in which we want
9537 * to put the grouping chars.
9538 * @min_width: The minimum width of the digits in the output string.
9539 * Output will be zero-padded on the left to fill.
9540 * @grouping: see definition in localeconv().
9541 * @thousands_sep: see definition in localeconv().
9542 *
9543 * There are 2 modes: counting and filling. If @writer is NULL,
9544 * we are in counting mode, else filling mode.
9545 * If counting, the required buffer size is returned.
9546 * If filling, we know the buffer will be large enough, so we don't
9547 * need to pass in the buffer size.
9548 * Inserts thousand grouping characters (as defined by grouping and
9549 * thousands_sep) into @writer.
9550 *
9551 * Return value: -1 on error, number of characters otherwise.
9552 **/
9553 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9554 _PyUnicode_InsertThousandsGrouping(
9555 _PyUnicodeWriter *writer,
9556 Py_ssize_t n_buffer,
9557 PyObject *digits,
9558 Py_ssize_t d_pos,
9559 Py_ssize_t n_digits,
9560 Py_ssize_t min_width,
9561 const char *grouping,
9562 PyObject *thousands_sep,
9563 Py_UCS4 *maxchar)
9564 {
9565 min_width = Py_MAX(0, min_width);
9566 if (writer) {
9567 assert(digits != NULL);
9568 assert(maxchar == NULL);
9569 }
9570 else {
9571 assert(digits == NULL);
9572 assert(maxchar != NULL);
9573 }
9574 assert(0 <= d_pos);
9575 assert(0 <= n_digits);
9576 assert(grouping != NULL);
9577
9578 if (digits != NULL) {
9579 if (PyUnicode_READY(digits) == -1) {
9580 return -1;
9581 }
9582 }
9583 if (PyUnicode_READY(thousands_sep) == -1) {
9584 return -1;
9585 }
9586
9587 Py_ssize_t count = 0;
9588 Py_ssize_t n_zeros;
9589 int loop_broken = 0;
9590 int use_separator = 0; /* First time through, don't append the
9591 separator. They only go between
9592 groups. */
9593 Py_ssize_t buffer_pos;
9594 Py_ssize_t digits_pos;
9595 Py_ssize_t len;
9596 Py_ssize_t n_chars;
9597 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9598 be looked at */
9599 /* A generator that returns all of the grouping widths, until it
9600 returns 0. */
9601 GroupGenerator groupgen;
9602 GroupGenerator_init(&groupgen, grouping);
9603 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9604
9605 /* if digits are not grouped, thousands separator
9606 should be an empty string */
9607 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9608
9609 digits_pos = d_pos + n_digits;
9610 if (writer) {
9611 buffer_pos = writer->pos + n_buffer;
9612 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9613 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9614 }
9615 else {
9616 buffer_pos = n_buffer;
9617 }
9618
9619 if (!writer) {
9620 *maxchar = 127;
9621 }
9622
9623 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9624 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9625 n_zeros = Py_MAX(0, len - remaining);
9626 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9627
9628 /* Use n_zero zero's and n_chars chars */
9629
9630 /* Count only, don't do anything. */
9631 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9632
9633 /* Copy into the writer. */
9634 InsertThousandsGrouping_fill(writer, &buffer_pos,
9635 digits, &digits_pos,
9636 n_chars, n_zeros,
9637 use_separator ? thousands_sep : NULL,
9638 thousands_sep_len, maxchar);
9639
9640 /* Use a separator next time. */
9641 use_separator = 1;
9642
9643 remaining -= n_chars;
9644 min_width -= len;
9645
9646 if (remaining <= 0 && min_width <= 0) {
9647 loop_broken = 1;
9648 break;
9649 }
9650 min_width -= thousands_sep_len;
9651 }
9652 if (!loop_broken) {
9653 /* We left the loop without using a break statement. */
9654
9655 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9656 n_zeros = Py_MAX(0, len - remaining);
9657 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9658
9659 /* Use n_zero zero's and n_chars chars */
9660 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9661
9662 /* Copy into the writer. */
9663 InsertThousandsGrouping_fill(writer, &buffer_pos,
9664 digits, &digits_pos,
9665 n_chars, n_zeros,
9666 use_separator ? thousands_sep : NULL,
9667 thousands_sep_len, maxchar);
9668 }
9669 return count;
9670 }
9671
9672
9673 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9674 PyUnicode_Count(PyObject *str,
9675 PyObject *substr,
9676 Py_ssize_t start,
9677 Py_ssize_t end)
9678 {
9679 Py_ssize_t result;
9680 int kind1, kind2;
9681 const void *buf1 = NULL, *buf2 = NULL;
9682 Py_ssize_t len1, len2;
9683
9684 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9685 return -1;
9686
9687 kind1 = PyUnicode_KIND(str);
9688 kind2 = PyUnicode_KIND(substr);
9689 if (kind1 < kind2)
9690 return 0;
9691
9692 len1 = PyUnicode_GET_LENGTH(str);
9693 len2 = PyUnicode_GET_LENGTH(substr);
9694 ADJUST_INDICES(start, end, len1);
9695 if (end - start < len2)
9696 return 0;
9697
9698 buf1 = PyUnicode_DATA(str);
9699 buf2 = PyUnicode_DATA(substr);
9700 if (kind2 != kind1) {
9701 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9702 if (!buf2)
9703 goto onError;
9704 }
9705
9706 switch (kind1) {
9707 case PyUnicode_1BYTE_KIND:
9708 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9709 result = asciilib_count(
9710 ((const Py_UCS1*)buf1) + start, end - start,
9711 buf2, len2, PY_SSIZE_T_MAX
9712 );
9713 else
9714 result = ucs1lib_count(
9715 ((const Py_UCS1*)buf1) + start, end - start,
9716 buf2, len2, PY_SSIZE_T_MAX
9717 );
9718 break;
9719 case PyUnicode_2BYTE_KIND:
9720 result = ucs2lib_count(
9721 ((const Py_UCS2*)buf1) + start, end - start,
9722 buf2, len2, PY_SSIZE_T_MAX
9723 );
9724 break;
9725 case PyUnicode_4BYTE_KIND:
9726 result = ucs4lib_count(
9727 ((const Py_UCS4*)buf1) + start, end - start,
9728 buf2, len2, PY_SSIZE_T_MAX
9729 );
9730 break;
9731 default:
9732 Py_UNREACHABLE();
9733 }
9734
9735 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9736 if (kind2 != kind1)
9737 PyMem_Free((void *)buf2);
9738
9739 return result;
9740 onError:
9741 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9742 if (kind2 != kind1)
9743 PyMem_Free((void *)buf2);
9744 return -1;
9745 }
9746
9747 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9748 PyUnicode_Find(PyObject *str,
9749 PyObject *substr,
9750 Py_ssize_t start,
9751 Py_ssize_t end,
9752 int direction)
9753 {
9754 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9755 return -2;
9756
9757 return any_find_slice(str, substr, start, end, direction);
9758 }
9759
9760 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9761 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9762 Py_ssize_t start, Py_ssize_t end,
9763 int direction)
9764 {
9765 int kind;
9766 Py_ssize_t len, result;
9767 if (PyUnicode_READY(str) == -1)
9768 return -2;
9769 len = PyUnicode_GET_LENGTH(str);
9770 ADJUST_INDICES(start, end, len);
9771 if (end - start < 1)
9772 return -1;
9773 kind = PyUnicode_KIND(str);
9774 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9775 kind, end-start, ch, direction);
9776 if (result == -1)
9777 return -1;
9778 else
9779 return start + result;
9780 }
9781
9782 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9783 tailmatch(PyObject *self,
9784 PyObject *substring,
9785 Py_ssize_t start,
9786 Py_ssize_t end,
9787 int direction)
9788 {
9789 int kind_self;
9790 int kind_sub;
9791 const void *data_self;
9792 const void *data_sub;
9793 Py_ssize_t offset;
9794 Py_ssize_t i;
9795 Py_ssize_t end_sub;
9796
9797 if (PyUnicode_READY(self) == -1 ||
9798 PyUnicode_READY(substring) == -1)
9799 return -1;
9800
9801 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9802 end -= PyUnicode_GET_LENGTH(substring);
9803 if (end < start)
9804 return 0;
9805
9806 if (PyUnicode_GET_LENGTH(substring) == 0)
9807 return 1;
9808
9809 kind_self = PyUnicode_KIND(self);
9810 data_self = PyUnicode_DATA(self);
9811 kind_sub = PyUnicode_KIND(substring);
9812 data_sub = PyUnicode_DATA(substring);
9813 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9814
9815 if (direction > 0)
9816 offset = end;
9817 else
9818 offset = start;
9819
9820 if (PyUnicode_READ(kind_self, data_self, offset) ==
9821 PyUnicode_READ(kind_sub, data_sub, 0) &&
9822 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9823 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9824 /* If both are of the same kind, memcmp is sufficient */
9825 if (kind_self == kind_sub) {
9826 return ! memcmp((char *)data_self +
9827 (offset * PyUnicode_KIND(substring)),
9828 data_sub,
9829 PyUnicode_GET_LENGTH(substring) *
9830 PyUnicode_KIND(substring));
9831 }
9832 /* otherwise we have to compare each character by first accessing it */
9833 else {
9834 /* We do not need to compare 0 and len(substring)-1 because
9835 the if statement above ensured already that they are equal
9836 when we end up here. */
9837 for (i = 1; i < end_sub; ++i) {
9838 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9839 PyUnicode_READ(kind_sub, data_sub, i))
9840 return 0;
9841 }
9842 return 1;
9843 }
9844 }
9845
9846 return 0;
9847 }
9848
9849 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9850 PyUnicode_Tailmatch(PyObject *str,
9851 PyObject *substr,
9852 Py_ssize_t start,
9853 Py_ssize_t end,
9854 int direction)
9855 {
9856 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9857 return -1;
9858
9859 return tailmatch(str, substr, start, end, direction);
9860 }
9861
9862 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9863 ascii_upper_or_lower(PyObject *self, int lower)
9864 {
9865 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9866 const char *data = PyUnicode_DATA(self);
9867 char *resdata;
9868 PyObject *res;
9869
9870 res = PyUnicode_New(len, 127);
9871 if (res == NULL)
9872 return NULL;
9873 resdata = PyUnicode_DATA(res);
9874 if (lower)
9875 _Py_bytes_lower(resdata, data, len);
9876 else
9877 _Py_bytes_upper(resdata, data, len);
9878 return res;
9879 }
9880
9881 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9882 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9883 {
9884 Py_ssize_t j;
9885 int final_sigma;
9886 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9887 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9888
9889 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9890
9891 where ! is a negation and \p{xxx} is a character with property xxx.
9892 */
9893 for (j = i - 1; j >= 0; j--) {
9894 c = PyUnicode_READ(kind, data, j);
9895 if (!_PyUnicode_IsCaseIgnorable(c))
9896 break;
9897 }
9898 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9899 if (final_sigma) {
9900 for (j = i + 1; j < length; j++) {
9901 c = PyUnicode_READ(kind, data, j);
9902 if (!_PyUnicode_IsCaseIgnorable(c))
9903 break;
9904 }
9905 final_sigma = j == length || !_PyUnicode_IsCased(c);
9906 }
9907 return (final_sigma) ? 0x3C2 : 0x3C3;
9908 }
9909
9910 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9911 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9912 Py_UCS4 c, Py_UCS4 *mapped)
9913 {
9914 /* Obscure special case. */
9915 if (c == 0x3A3) {
9916 mapped[0] = handle_capital_sigma(kind, data, length, i);
9917 return 1;
9918 }
9919 return _PyUnicode_ToLowerFull(c, mapped);
9920 }
9921
9922 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9923 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9924 {
9925 Py_ssize_t i, k = 0;
9926 int n_res, j;
9927 Py_UCS4 c, mapped[3];
9928
9929 c = PyUnicode_READ(kind, data, 0);
9930 n_res = _PyUnicode_ToTitleFull(c, mapped);
9931 for (j = 0; j < n_res; j++) {
9932 *maxchar = Py_MAX(*maxchar, mapped[j]);
9933 res[k++] = mapped[j];
9934 }
9935 for (i = 1; i < length; i++) {
9936 c = PyUnicode_READ(kind, data, i);
9937 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9938 for (j = 0; j < n_res; j++) {
9939 *maxchar = Py_MAX(*maxchar, mapped[j]);
9940 res[k++] = mapped[j];
9941 }
9942 }
9943 return k;
9944 }
9945
9946 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9948 Py_ssize_t i, k = 0;
9949
9950 for (i = 0; i < length; i++) {
9951 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9952 int n_res, j;
9953 if (Py_UNICODE_ISUPPER(c)) {
9954 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9955 }
9956 else if (Py_UNICODE_ISLOWER(c)) {
9957 n_res = _PyUnicode_ToUpperFull(c, mapped);
9958 }
9959 else {
9960 n_res = 1;
9961 mapped[0] = c;
9962 }
9963 for (j = 0; j < n_res; j++) {
9964 *maxchar = Py_MAX(*maxchar, mapped[j]);
9965 res[k++] = mapped[j];
9966 }
9967 }
9968 return k;
9969 }
9970
9971 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9972 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9973 Py_UCS4 *maxchar, int lower)
9974 {
9975 Py_ssize_t i, k = 0;
9976
9977 for (i = 0; i < length; i++) {
9978 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9979 int n_res, j;
9980 if (lower)
9981 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9982 else
9983 n_res = _PyUnicode_ToUpperFull(c, mapped);
9984 for (j = 0; j < n_res; j++) {
9985 *maxchar = Py_MAX(*maxchar, mapped[j]);
9986 res[k++] = mapped[j];
9987 }
9988 }
9989 return k;
9990 }
9991
9992 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9993 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9994 {
9995 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9996 }
9997
9998 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9999 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10000 {
10001 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10002 }
10003
10004 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10005 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10006 {
10007 Py_ssize_t i, k = 0;
10008
10009 for (i = 0; i < length; i++) {
10010 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10011 Py_UCS4 mapped[3];
10012 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10013 for (j = 0; j < n_res; j++) {
10014 *maxchar = Py_MAX(*maxchar, mapped[j]);
10015 res[k++] = mapped[j];
10016 }
10017 }
10018 return k;
10019 }
10020
10021 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10022 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10023 {
10024 Py_ssize_t i, k = 0;
10025 int previous_is_cased;
10026
10027 previous_is_cased = 0;
10028 for (i = 0; i < length; i++) {
10029 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10030 Py_UCS4 mapped[3];
10031 int n_res, j;
10032
10033 if (previous_is_cased)
10034 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10035 else
10036 n_res = _PyUnicode_ToTitleFull(c, mapped);
10037
10038 for (j = 0; j < n_res; j++) {
10039 *maxchar = Py_MAX(*maxchar, mapped[j]);
10040 res[k++] = mapped[j];
10041 }
10042
10043 previous_is_cased = _PyUnicode_IsCased(c);
10044 }
10045 return k;
10046 }
10047
10048 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10049 case_operation(PyObject *self,
10050 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10051 {
10052 PyObject *res = NULL;
10053 Py_ssize_t length, newlength = 0;
10054 int kind, outkind;
10055 const void *data;
10056 void *outdata;
10057 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10058
10059 assert(PyUnicode_IS_READY(self));
10060
10061 kind = PyUnicode_KIND(self);
10062 data = PyUnicode_DATA(self);
10063 length = PyUnicode_GET_LENGTH(self);
10064 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10065 PyErr_SetString(PyExc_OverflowError, "string is too long");
10066 return NULL;
10067 }
10068 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
10069 if (tmp == NULL)
10070 return PyErr_NoMemory();
10071 newlength = perform(kind, data, length, tmp, &maxchar);
10072 res = PyUnicode_New(newlength, maxchar);
10073 if (res == NULL)
10074 goto leave;
10075 tmpend = tmp + newlength;
10076 outdata = PyUnicode_DATA(res);
10077 outkind = PyUnicode_KIND(res);
10078 switch (outkind) {
10079 case PyUnicode_1BYTE_KIND:
10080 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10081 break;
10082 case PyUnicode_2BYTE_KIND:
10083 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10084 break;
10085 case PyUnicode_4BYTE_KIND:
10086 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10087 break;
10088 default:
10089 Py_UNREACHABLE();
10090 }
10091 leave:
10092 PyMem_FREE(tmp);
10093 return res;
10094 }
10095
10096 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10097 PyUnicode_Join(PyObject *separator, PyObject *seq)
10098 {
10099 PyObject *res;
10100 PyObject *fseq;
10101 Py_ssize_t seqlen;
10102 PyObject **items;
10103
10104 fseq = PySequence_Fast(seq, "can only join an iterable");
10105 if (fseq == NULL) {
10106 return NULL;
10107 }
10108
10109 /* NOTE: the following code can't call back into Python code,
10110 * so we are sure that fseq won't be mutated.
10111 */
10112
10113 items = PySequence_Fast_ITEMS(fseq);
10114 seqlen = PySequence_Fast_GET_SIZE(fseq);
10115 res = _PyUnicode_JoinArray(separator, items, seqlen);
10116 Py_DECREF(fseq);
10117 return res;
10118 }
10119
10120 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10121 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10122 {
10123 PyObject *res = NULL; /* the result */
10124 PyObject *sep = NULL;
10125 Py_ssize_t seplen;
10126 PyObject *item;
10127 Py_ssize_t sz, i, res_offset;
10128 Py_UCS4 maxchar;
10129 Py_UCS4 item_maxchar;
10130 int use_memcpy;
10131 unsigned char *res_data = NULL, *sep_data = NULL;
10132 PyObject *last_obj;
10133 unsigned int kind = 0;
10134
10135 /* If empty sequence, return u"". */
10136 if (seqlen == 0) {
10137 _Py_RETURN_UNICODE_EMPTY();
10138 }
10139
10140 /* If singleton sequence with an exact Unicode, return that. */
10141 last_obj = NULL;
10142 if (seqlen == 1) {
10143 if (PyUnicode_CheckExact(items[0])) {
10144 res = items[0];
10145 Py_INCREF(res);
10146 return res;
10147 }
10148 seplen = 0;
10149 maxchar = 0;
10150 }
10151 else {
10152 /* Set up sep and seplen */
10153 if (separator == NULL) {
10154 /* fall back to a blank space separator */
10155 sep = PyUnicode_FromOrdinal(' ');
10156 if (!sep)
10157 goto onError;
10158 seplen = 1;
10159 maxchar = 32;
10160 }
10161 else {
10162 if (!PyUnicode_Check(separator)) {
10163 PyErr_Format(PyExc_TypeError,
10164 "separator: expected str instance,"
10165 " %.80s found",
10166 Py_TYPE(separator)->tp_name);
10167 goto onError;
10168 }
10169 if (PyUnicode_READY(separator))
10170 goto onError;
10171 sep = separator;
10172 seplen = PyUnicode_GET_LENGTH(separator);
10173 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10174 /* inc refcount to keep this code path symmetric with the
10175 above case of a blank separator */
10176 Py_INCREF(sep);
10177 }
10178 last_obj = sep;
10179 }
10180
10181 /* There are at least two things to join, or else we have a subclass
10182 * of str in the sequence.
10183 * Do a pre-pass to figure out the total amount of space we'll
10184 * need (sz), and see whether all argument are strings.
10185 */
10186 sz = 0;
10187 #ifdef Py_DEBUG
10188 use_memcpy = 0;
10189 #else
10190 use_memcpy = 1;
10191 #endif
10192 for (i = 0; i < seqlen; i++) {
10193 size_t add_sz;
10194 item = items[i];
10195 if (!PyUnicode_Check(item)) {
10196 PyErr_Format(PyExc_TypeError,
10197 "sequence item %zd: expected str instance,"
10198 " %.80s found",
10199 i, Py_TYPE(item)->tp_name);
10200 goto onError;
10201 }
10202 if (PyUnicode_READY(item) == -1)
10203 goto onError;
10204 add_sz = PyUnicode_GET_LENGTH(item);
10205 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10206 maxchar = Py_MAX(maxchar, item_maxchar);
10207 if (i != 0) {
10208 add_sz += seplen;
10209 }
10210 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10211 PyErr_SetString(PyExc_OverflowError,
10212 "join() result is too long for a Python string");
10213 goto onError;
10214 }
10215 sz += add_sz;
10216 if (use_memcpy && last_obj != NULL) {
10217 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10218 use_memcpy = 0;
10219 }
10220 last_obj = item;
10221 }
10222
10223 res = PyUnicode_New(sz, maxchar);
10224 if (res == NULL)
10225 goto onError;
10226
10227 /* Catenate everything. */
10228 #ifdef Py_DEBUG
10229 use_memcpy = 0;
10230 #else
10231 if (use_memcpy) {
10232 res_data = PyUnicode_1BYTE_DATA(res);
10233 kind = PyUnicode_KIND(res);
10234 if (seplen != 0)
10235 sep_data = PyUnicode_1BYTE_DATA(sep);
10236 }
10237 #endif
10238 if (use_memcpy) {
10239 for (i = 0; i < seqlen; ++i) {
10240 Py_ssize_t itemlen;
10241 item = items[i];
10242
10243 /* Copy item, and maybe the separator. */
10244 if (i && seplen != 0) {
10245 memcpy(res_data,
10246 sep_data,
10247 kind * seplen);
10248 res_data += kind * seplen;
10249 }
10250
10251 itemlen = PyUnicode_GET_LENGTH(item);
10252 if (itemlen != 0) {
10253 memcpy(res_data,
10254 PyUnicode_DATA(item),
10255 kind * itemlen);
10256 res_data += kind * itemlen;
10257 }
10258 }
10259 assert(res_data == PyUnicode_1BYTE_DATA(res)
10260 + kind * PyUnicode_GET_LENGTH(res));
10261 }
10262 else {
10263 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10264 Py_ssize_t itemlen;
10265 item = items[i];
10266
10267 /* Copy item, and maybe the separator. */
10268 if (i && seplen != 0) {
10269 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10270 res_offset += seplen;
10271 }
10272
10273 itemlen = PyUnicode_GET_LENGTH(item);
10274 if (itemlen != 0) {
10275 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10276 res_offset += itemlen;
10277 }
10278 }
10279 assert(res_offset == PyUnicode_GET_LENGTH(res));
10280 }
10281
10282 Py_XDECREF(sep);
10283 assert(_PyUnicode_CheckConsistency(res, 1));
10284 return res;
10285
10286 onError:
10287 Py_XDECREF(sep);
10288 Py_XDECREF(res);
10289 return NULL;
10290 }
10291
10292 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10293 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10294 Py_UCS4 fill_char)
10295 {
10296 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10297 void *data = PyUnicode_DATA(unicode);
10298 assert(PyUnicode_IS_READY(unicode));
10299 assert(unicode_modifiable(unicode));
10300 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10301 assert(start >= 0);
10302 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10303 unicode_fill(kind, data, fill_char, start, length);
10304 }
10305
10306 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10307 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10308 Py_UCS4 fill_char)
10309 {
10310 Py_ssize_t maxlen;
10311
10312 if (!PyUnicode_Check(unicode)) {
10313 PyErr_BadInternalCall();
10314 return -1;
10315 }
10316 if (PyUnicode_READY(unicode) == -1)
10317 return -1;
10318 if (unicode_check_modifiable(unicode))
10319 return -1;
10320
10321 if (start < 0) {
10322 PyErr_SetString(PyExc_IndexError, "string index out of range");
10323 return -1;
10324 }
10325 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10326 PyErr_SetString(PyExc_ValueError,
10327 "fill character is bigger than "
10328 "the string maximum character");
10329 return -1;
10330 }
10331
10332 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10333 length = Py_MIN(maxlen, length);
10334 if (length <= 0)
10335 return 0;
10336
10337 _PyUnicode_FastFill(unicode, start, length, fill_char);
10338 return length;
10339 }
10340
10341 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10342 pad(PyObject *self,
10343 Py_ssize_t left,
10344 Py_ssize_t right,
10345 Py_UCS4 fill)
10346 {
10347 PyObject *u;
10348 Py_UCS4 maxchar;
10349 int kind;
10350 void *data;
10351
10352 if (left < 0)
10353 left = 0;
10354 if (right < 0)
10355 right = 0;
10356
10357 if (left == 0 && right == 0)
10358 return unicode_result_unchanged(self);
10359
10360 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10361 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10362 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10363 return NULL;
10364 }
10365 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10366 maxchar = Py_MAX(maxchar, fill);
10367 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10368 if (!u)
10369 return NULL;
10370
10371 kind = PyUnicode_KIND(u);
10372 data = PyUnicode_DATA(u);
10373 if (left)
10374 unicode_fill(kind, data, fill, 0, left);
10375 if (right)
10376 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10377 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10378 assert(_PyUnicode_CheckConsistency(u, 1));
10379 return u;
10380 }
10381
10382 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10383 PyUnicode_Splitlines(PyObject *string, int keepends)
10384 {
10385 PyObject *list;
10386
10387 if (ensure_unicode(string) < 0)
10388 return NULL;
10389
10390 switch (PyUnicode_KIND(string)) {
10391 case PyUnicode_1BYTE_KIND:
10392 if (PyUnicode_IS_ASCII(string))
10393 list = asciilib_splitlines(
10394 string, PyUnicode_1BYTE_DATA(string),
10395 PyUnicode_GET_LENGTH(string), keepends);
10396 else
10397 list = ucs1lib_splitlines(
10398 string, PyUnicode_1BYTE_DATA(string),
10399 PyUnicode_GET_LENGTH(string), keepends);
10400 break;
10401 case PyUnicode_2BYTE_KIND:
10402 list = ucs2lib_splitlines(
10403 string, PyUnicode_2BYTE_DATA(string),
10404 PyUnicode_GET_LENGTH(string), keepends);
10405 break;
10406 case PyUnicode_4BYTE_KIND:
10407 list = ucs4lib_splitlines(
10408 string, PyUnicode_4BYTE_DATA(string),
10409 PyUnicode_GET_LENGTH(string), keepends);
10410 break;
10411 default:
10412 Py_UNREACHABLE();
10413 }
10414 return list;
10415 }
10416
10417 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10418 split(PyObject *self,
10419 PyObject *substring,
10420 Py_ssize_t maxcount)
10421 {
10422 int kind1, kind2;
10423 const void *buf1, *buf2;
10424 Py_ssize_t len1, len2;
10425 PyObject* out;
10426
10427 if (maxcount < 0)
10428 maxcount = PY_SSIZE_T_MAX;
10429
10430 if (PyUnicode_READY(self) == -1)
10431 return NULL;
10432
10433 if (substring == NULL)
10434 switch (PyUnicode_KIND(self)) {
10435 case PyUnicode_1BYTE_KIND:
10436 if (PyUnicode_IS_ASCII(self))
10437 return asciilib_split_whitespace(
10438 self, PyUnicode_1BYTE_DATA(self),
10439 PyUnicode_GET_LENGTH(self), maxcount
10440 );
10441 else
10442 return ucs1lib_split_whitespace(
10443 self, PyUnicode_1BYTE_DATA(self),
10444 PyUnicode_GET_LENGTH(self), maxcount
10445 );
10446 case PyUnicode_2BYTE_KIND:
10447 return ucs2lib_split_whitespace(
10448 self, PyUnicode_2BYTE_DATA(self),
10449 PyUnicode_GET_LENGTH(self), maxcount
10450 );
10451 case PyUnicode_4BYTE_KIND:
10452 return ucs4lib_split_whitespace(
10453 self, PyUnicode_4BYTE_DATA(self),
10454 PyUnicode_GET_LENGTH(self), maxcount
10455 );
10456 default:
10457 Py_UNREACHABLE();
10458 }
10459
10460 if (PyUnicode_READY(substring) == -1)
10461 return NULL;
10462
10463 kind1 = PyUnicode_KIND(self);
10464 kind2 = PyUnicode_KIND(substring);
10465 len1 = PyUnicode_GET_LENGTH(self);
10466 len2 = PyUnicode_GET_LENGTH(substring);
10467 if (kind1 < kind2 || len1 < len2) {
10468 out = PyList_New(1);
10469 if (out == NULL)
10470 return NULL;
10471 Py_INCREF(self);
10472 PyList_SET_ITEM(out, 0, self);
10473 return out;
10474 }
10475 buf1 = PyUnicode_DATA(self);
10476 buf2 = PyUnicode_DATA(substring);
10477 if (kind2 != kind1) {
10478 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10479 if (!buf2)
10480 return NULL;
10481 }
10482
10483 switch (kind1) {
10484 case PyUnicode_1BYTE_KIND:
10485 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10486 out = asciilib_split(
10487 self, buf1, len1, buf2, len2, maxcount);
10488 else
10489 out = ucs1lib_split(
10490 self, buf1, len1, buf2, len2, maxcount);
10491 break;
10492 case PyUnicode_2BYTE_KIND:
10493 out = ucs2lib_split(
10494 self, buf1, len1, buf2, len2, maxcount);
10495 break;
10496 case PyUnicode_4BYTE_KIND:
10497 out = ucs4lib_split(
10498 self, buf1, len1, buf2, len2, maxcount);
10499 break;
10500 default:
10501 out = NULL;
10502 }
10503 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10504 if (kind2 != kind1)
10505 PyMem_Free((void *)buf2);
10506 return out;
10507 }
10508
10509 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10510 rsplit(PyObject *self,
10511 PyObject *substring,
10512 Py_ssize_t maxcount)
10513 {
10514 int kind1, kind2;
10515 const void *buf1, *buf2;
10516 Py_ssize_t len1, len2;
10517 PyObject* out;
10518
10519 if (maxcount < 0)
10520 maxcount = PY_SSIZE_T_MAX;
10521
10522 if (PyUnicode_READY(self) == -1)
10523 return NULL;
10524
10525 if (substring == NULL)
10526 switch (PyUnicode_KIND(self)) {
10527 case PyUnicode_1BYTE_KIND:
10528 if (PyUnicode_IS_ASCII(self))
10529 return asciilib_rsplit_whitespace(
10530 self, PyUnicode_1BYTE_DATA(self),
10531 PyUnicode_GET_LENGTH(self), maxcount
10532 );
10533 else
10534 return ucs1lib_rsplit_whitespace(
10535 self, PyUnicode_1BYTE_DATA(self),
10536 PyUnicode_GET_LENGTH(self), maxcount
10537 );
10538 case PyUnicode_2BYTE_KIND:
10539 return ucs2lib_rsplit_whitespace(
10540 self, PyUnicode_2BYTE_DATA(self),
10541 PyUnicode_GET_LENGTH(self), maxcount
10542 );
10543 case PyUnicode_4BYTE_KIND:
10544 return ucs4lib_rsplit_whitespace(
10545 self, PyUnicode_4BYTE_DATA(self),
10546 PyUnicode_GET_LENGTH(self), maxcount
10547 );
10548 default:
10549 Py_UNREACHABLE();
10550 }
10551
10552 if (PyUnicode_READY(substring) == -1)
10553 return NULL;
10554
10555 kind1 = PyUnicode_KIND(self);
10556 kind2 = PyUnicode_KIND(substring);
10557 len1 = PyUnicode_GET_LENGTH(self);
10558 len2 = PyUnicode_GET_LENGTH(substring);
10559 if (kind1 < kind2 || len1 < len2) {
10560 out = PyList_New(1);
10561 if (out == NULL)
10562 return NULL;
10563 Py_INCREF(self);
10564 PyList_SET_ITEM(out, 0, self);
10565 return out;
10566 }
10567 buf1 = PyUnicode_DATA(self);
10568 buf2 = PyUnicode_DATA(substring);
10569 if (kind2 != kind1) {
10570 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10571 if (!buf2)
10572 return NULL;
10573 }
10574
10575 switch (kind1) {
10576 case PyUnicode_1BYTE_KIND:
10577 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10578 out = asciilib_rsplit(
10579 self, buf1, len1, buf2, len2, maxcount);
10580 else
10581 out = ucs1lib_rsplit(
10582 self, buf1, len1, buf2, len2, maxcount);
10583 break;
10584 case PyUnicode_2BYTE_KIND:
10585 out = ucs2lib_rsplit(
10586 self, buf1, len1, buf2, len2, maxcount);
10587 break;
10588 case PyUnicode_4BYTE_KIND:
10589 out = ucs4lib_rsplit(
10590 self, buf1, len1, buf2, len2, maxcount);
10591 break;
10592 default:
10593 out = NULL;
10594 }
10595 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10596 if (kind2 != kind1)
10597 PyMem_Free((void *)buf2);
10598 return out;
10599 }
10600
10601 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10602 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10603 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10604 {
10605 switch (kind) {
10606 case PyUnicode_1BYTE_KIND:
10607 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10608 return asciilib_find(buf1, len1, buf2, len2, offset);
10609 else
10610 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10611 case PyUnicode_2BYTE_KIND:
10612 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10613 case PyUnicode_4BYTE_KIND:
10614 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10615 }
10616 Py_UNREACHABLE();
10617 }
10618
10619 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10620 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10621 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10622 {
10623 switch (kind) {
10624 case PyUnicode_1BYTE_KIND:
10625 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10626 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10627 else
10628 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10629 case PyUnicode_2BYTE_KIND:
10630 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10631 case PyUnicode_4BYTE_KIND:
10632 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10633 }
10634 Py_UNREACHABLE();
10635 }
10636
10637 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10638 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10639 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10640 {
10641 int kind = PyUnicode_KIND(u);
10642 void *data = PyUnicode_DATA(u);
10643 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10644 if (kind == PyUnicode_1BYTE_KIND) {
10645 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10646 (Py_UCS1 *)data + len,
10647 u1, u2, maxcount);
10648 }
10649 else if (kind == PyUnicode_2BYTE_KIND) {
10650 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10651 (Py_UCS2 *)data + len,
10652 u1, u2, maxcount);
10653 }
10654 else {
10655 assert(kind == PyUnicode_4BYTE_KIND);
10656 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10657 (Py_UCS4 *)data + len,
10658 u1, u2, maxcount);
10659 }
10660 }
10661
10662 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10663 replace(PyObject *self, PyObject *str1,
10664 PyObject *str2, Py_ssize_t maxcount)
10665 {
10666 PyObject *u;
10667 const char *sbuf = PyUnicode_DATA(self);
10668 const void *buf1 = PyUnicode_DATA(str1);
10669 const void *buf2 = PyUnicode_DATA(str2);
10670 int srelease = 0, release1 = 0, release2 = 0;
10671 int skind = PyUnicode_KIND(self);
10672 int kind1 = PyUnicode_KIND(str1);
10673 int kind2 = PyUnicode_KIND(str2);
10674 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10675 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10676 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10677 int mayshrink;
10678 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10679
10680 if (slen < len1)
10681 goto nothing;
10682
10683 if (maxcount < 0)
10684 maxcount = PY_SSIZE_T_MAX;
10685 else if (maxcount == 0)
10686 goto nothing;
10687
10688 if (str1 == str2)
10689 goto nothing;
10690
10691 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10692 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10693 if (maxchar < maxchar_str1)
10694 /* substring too wide to be present */
10695 goto nothing;
10696 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10697 /* Replacing str1 with str2 may cause a maxchar reduction in the
10698 result string. */
10699 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10700 maxchar = Py_MAX(maxchar, maxchar_str2);
10701
10702 if (len1 == len2) {
10703 /* same length */
10704 if (len1 == 0)
10705 goto nothing;
10706 if (len1 == 1) {
10707 /* replace characters */
10708 Py_UCS4 u1, u2;
10709 Py_ssize_t pos;
10710
10711 u1 = PyUnicode_READ(kind1, buf1, 0);
10712 pos = findchar(sbuf, skind, slen, u1, 1);
10713 if (pos < 0)
10714 goto nothing;
10715 u2 = PyUnicode_READ(kind2, buf2, 0);
10716 u = PyUnicode_New(slen, maxchar);
10717 if (!u)
10718 goto error;
10719
10720 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10721 replace_1char_inplace(u, pos, u1, u2, maxcount);
10722 }
10723 else {
10724 int rkind = skind;
10725 char *res;
10726 Py_ssize_t i;
10727
10728 if (kind1 < rkind) {
10729 /* widen substring */
10730 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10731 if (!buf1) goto error;
10732 release1 = 1;
10733 }
10734 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10735 if (i < 0)
10736 goto nothing;
10737 if (rkind > kind2) {
10738 /* widen replacement */
10739 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10740 if (!buf2) goto error;
10741 release2 = 1;
10742 }
10743 else if (rkind < kind2) {
10744 /* widen self and buf1 */
10745 rkind = kind2;
10746 if (release1) {
10747 assert(buf1 != PyUnicode_DATA(str1));
10748 PyMem_Free((void *)buf1);
10749 buf1 = PyUnicode_DATA(str1);
10750 release1 = 0;
10751 }
10752 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10753 if (!sbuf) goto error;
10754 srelease = 1;
10755 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10756 if (!buf1) goto error;
10757 release1 = 1;
10758 }
10759 u = PyUnicode_New(slen, maxchar);
10760 if (!u)
10761 goto error;
10762 assert(PyUnicode_KIND(u) == rkind);
10763 res = PyUnicode_DATA(u);
10764
10765 memcpy(res, sbuf, rkind * slen);
10766 /* change everything in-place, starting with this one */
10767 memcpy(res + rkind * i,
10768 buf2,
10769 rkind * len2);
10770 i += len1;
10771
10772 while ( --maxcount > 0) {
10773 i = anylib_find(rkind, self,
10774 sbuf+rkind*i, slen-i,
10775 str1, buf1, len1, i);
10776 if (i == -1)
10777 break;
10778 memcpy(res + rkind * i,
10779 buf2,
10780 rkind * len2);
10781 i += len1;
10782 }
10783 }
10784 }
10785 else {
10786 Py_ssize_t n, i, j, ires;
10787 Py_ssize_t new_size;
10788 int rkind = skind;
10789 char *res;
10790
10791 if (kind1 < rkind) {
10792 /* widen substring */
10793 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10794 if (!buf1) goto error;
10795 release1 = 1;
10796 }
10797 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10798 if (n == 0)
10799 goto nothing;
10800 if (kind2 < rkind) {
10801 /* widen replacement */
10802 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10803 if (!buf2) goto error;
10804 release2 = 1;
10805 }
10806 else if (kind2 > rkind) {
10807 /* widen self and buf1 */
10808 rkind = kind2;
10809 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10810 if (!sbuf) goto error;
10811 srelease = 1;
10812 if (release1) {
10813 assert(buf1 != PyUnicode_DATA(str1));
10814 PyMem_Free((void *)buf1);
10815 buf1 = PyUnicode_DATA(str1);
10816 release1 = 0;
10817 }
10818 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10819 if (!buf1) goto error;
10820 release1 = 1;
10821 }
10822 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10823 PyUnicode_GET_LENGTH(str1))); */
10824 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10825 PyErr_SetString(PyExc_OverflowError,
10826 "replace string is too long");
10827 goto error;
10828 }
10829 new_size = slen + n * (len2 - len1);
10830 if (new_size == 0) {
10831 _Py_INCREF_UNICODE_EMPTY();
10832 if (!unicode_empty)
10833 goto error;
10834 u = unicode_empty;
10835 goto done;
10836 }
10837 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10838 PyErr_SetString(PyExc_OverflowError,
10839 "replace string is too long");
10840 goto error;
10841 }
10842 u = PyUnicode_New(new_size, maxchar);
10843 if (!u)
10844 goto error;
10845 assert(PyUnicode_KIND(u) == rkind);
10846 res = PyUnicode_DATA(u);
10847 ires = i = 0;
10848 if (len1 > 0) {
10849 while (n-- > 0) {
10850 /* look for next match */
10851 j = anylib_find(rkind, self,
10852 sbuf + rkind * i, slen-i,
10853 str1, buf1, len1, i);
10854 if (j == -1)
10855 break;
10856 else if (j > i) {
10857 /* copy unchanged part [i:j] */
10858 memcpy(res + rkind * ires,
10859 sbuf + rkind * i,
10860 rkind * (j-i));
10861 ires += j - i;
10862 }
10863 /* copy substitution string */
10864 if (len2 > 0) {
10865 memcpy(res + rkind * ires,
10866 buf2,
10867 rkind * len2);
10868 ires += len2;
10869 }
10870 i = j + len1;
10871 }
10872 if (i < slen)
10873 /* copy tail [i:] */
10874 memcpy(res + rkind * ires,
10875 sbuf + rkind * i,
10876 rkind * (slen-i));
10877 }
10878 else {
10879 /* interleave */
10880 while (n > 0) {
10881 memcpy(res + rkind * ires,
10882 buf2,
10883 rkind * len2);
10884 ires += len2;
10885 if (--n <= 0)
10886 break;
10887 memcpy(res + rkind * ires,
10888 sbuf + rkind * i,
10889 rkind);
10890 ires++;
10891 i++;
10892 }
10893 memcpy(res + rkind * ires,
10894 sbuf + rkind * i,
10895 rkind * (slen-i));
10896 }
10897 }
10898
10899 if (mayshrink) {
10900 unicode_adjust_maxchar(&u);
10901 if (u == NULL)
10902 goto error;
10903 }
10904
10905 done:
10906 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10907 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10908 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10909 if (srelease)
10910 PyMem_FREE((void *)sbuf);
10911 if (release1)
10912 PyMem_FREE((void *)buf1);
10913 if (release2)
10914 PyMem_FREE((void *)buf2);
10915 assert(_PyUnicode_CheckConsistency(u, 1));
10916 return u;
10917
10918 nothing:
10919 /* nothing to replace; return original string (when possible) */
10920 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10921 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10922 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10923 if (srelease)
10924 PyMem_FREE((void *)sbuf);
10925 if (release1)
10926 PyMem_FREE((void *)buf1);
10927 if (release2)
10928 PyMem_FREE((void *)buf2);
10929 return unicode_result_unchanged(self);
10930
10931 error:
10932 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10933 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10934 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10935 if (srelease)
10936 PyMem_FREE((void *)sbuf);
10937 if (release1)
10938 PyMem_FREE((void *)buf1);
10939 if (release2)
10940 PyMem_FREE((void *)buf2);
10941 return NULL;
10942 }
10943
10944 /* --- Unicode Object Methods --------------------------------------------- */
10945
10946 /*[clinic input]
10947 str.title as unicode_title
10948
10949 Return a version of the string where each word is titlecased.
10950
10951 More specifically, words start with uppercased characters and all remaining
10952 cased characters have lower case.
10953 [clinic start generated code]*/
10954
10955 static PyObject *
unicode_title_impl(PyObject * self)10956 unicode_title_impl(PyObject *self)
10957 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10958 {
10959 if (PyUnicode_READY(self) == -1)
10960 return NULL;
10961 return case_operation(self, do_title);
10962 }
10963
10964 /*[clinic input]
10965 str.capitalize as unicode_capitalize
10966
10967 Return a capitalized version of the string.
10968
10969 More specifically, make the first character have upper case and the rest lower
10970 case.
10971 [clinic start generated code]*/
10972
10973 static PyObject *
unicode_capitalize_impl(PyObject * self)10974 unicode_capitalize_impl(PyObject *self)
10975 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10976 {
10977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979 if (PyUnicode_GET_LENGTH(self) == 0)
10980 return unicode_result_unchanged(self);
10981 return case_operation(self, do_capitalize);
10982 }
10983
10984 /*[clinic input]
10985 str.casefold as unicode_casefold
10986
10987 Return a version of the string suitable for caseless comparisons.
10988 [clinic start generated code]*/
10989
10990 static PyObject *
unicode_casefold_impl(PyObject * self)10991 unicode_casefold_impl(PyObject *self)
10992 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10993 {
10994 if (PyUnicode_READY(self) == -1)
10995 return NULL;
10996 if (PyUnicode_IS_ASCII(self))
10997 return ascii_upper_or_lower(self, 1);
10998 return case_operation(self, do_casefold);
10999 }
11000
11001
11002 /* Argument converter. Accepts a single Unicode character. */
11003
11004 static int
convert_uc(PyObject * obj,void * addr)11005 convert_uc(PyObject *obj, void *addr)
11006 {
11007 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11008
11009 if (!PyUnicode_Check(obj)) {
11010 PyErr_Format(PyExc_TypeError,
11011 "The fill character must be a unicode character, "
11012 "not %.100s", Py_TYPE(obj)->tp_name);
11013 return 0;
11014 }
11015 if (PyUnicode_READY(obj) < 0)
11016 return 0;
11017 if (PyUnicode_GET_LENGTH(obj) != 1) {
11018 PyErr_SetString(PyExc_TypeError,
11019 "The fill character must be exactly one character long");
11020 return 0;
11021 }
11022 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11023 return 1;
11024 }
11025
11026 /*[clinic input]
11027 str.center as unicode_center
11028
11029 width: Py_ssize_t
11030 fillchar: Py_UCS4 = ' '
11031 /
11032
11033 Return a centered string of length width.
11034
11035 Padding is done using the specified fill character (default is a space).
11036 [clinic start generated code]*/
11037
11038 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11039 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11040 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11041 {
11042 Py_ssize_t marg, left;
11043
11044 if (PyUnicode_READY(self) == -1)
11045 return NULL;
11046
11047 if (PyUnicode_GET_LENGTH(self) >= width)
11048 return unicode_result_unchanged(self);
11049
11050 marg = width - PyUnicode_GET_LENGTH(self);
11051 left = marg / 2 + (marg & width & 1);
11052
11053 return pad(self, left, marg - left, fillchar);
11054 }
11055
11056 /* This function assumes that str1 and str2 are readied by the caller. */
11057
11058 static int
unicode_compare(PyObject * str1,PyObject * str2)11059 unicode_compare(PyObject *str1, PyObject *str2)
11060 {
11061 #define COMPARE(TYPE1, TYPE2) \
11062 do { \
11063 TYPE1* p1 = (TYPE1 *)data1; \
11064 TYPE2* p2 = (TYPE2 *)data2; \
11065 TYPE1* end = p1 + len; \
11066 Py_UCS4 c1, c2; \
11067 for (; p1 != end; p1++, p2++) { \
11068 c1 = *p1; \
11069 c2 = *p2; \
11070 if (c1 != c2) \
11071 return (c1 < c2) ? -1 : 1; \
11072 } \
11073 } \
11074 while (0)
11075
11076 int kind1, kind2;
11077 const void *data1, *data2;
11078 Py_ssize_t len1, len2, len;
11079
11080 kind1 = PyUnicode_KIND(str1);
11081 kind2 = PyUnicode_KIND(str2);
11082 data1 = PyUnicode_DATA(str1);
11083 data2 = PyUnicode_DATA(str2);
11084 len1 = PyUnicode_GET_LENGTH(str1);
11085 len2 = PyUnicode_GET_LENGTH(str2);
11086 len = Py_MIN(len1, len2);
11087
11088 switch(kind1) {
11089 case PyUnicode_1BYTE_KIND:
11090 {
11091 switch(kind2) {
11092 case PyUnicode_1BYTE_KIND:
11093 {
11094 int cmp = memcmp(data1, data2, len);
11095 /* normalize result of memcmp() into the range [-1; 1] */
11096 if (cmp < 0)
11097 return -1;
11098 if (cmp > 0)
11099 return 1;
11100 break;
11101 }
11102 case PyUnicode_2BYTE_KIND:
11103 COMPARE(Py_UCS1, Py_UCS2);
11104 break;
11105 case PyUnicode_4BYTE_KIND:
11106 COMPARE(Py_UCS1, Py_UCS4);
11107 break;
11108 default:
11109 Py_UNREACHABLE();
11110 }
11111 break;
11112 }
11113 case PyUnicode_2BYTE_KIND:
11114 {
11115 switch(kind2) {
11116 case PyUnicode_1BYTE_KIND:
11117 COMPARE(Py_UCS2, Py_UCS1);
11118 break;
11119 case PyUnicode_2BYTE_KIND:
11120 {
11121 COMPARE(Py_UCS2, Py_UCS2);
11122 break;
11123 }
11124 case PyUnicode_4BYTE_KIND:
11125 COMPARE(Py_UCS2, Py_UCS4);
11126 break;
11127 default:
11128 Py_UNREACHABLE();
11129 }
11130 break;
11131 }
11132 case PyUnicode_4BYTE_KIND:
11133 {
11134 switch(kind2) {
11135 case PyUnicode_1BYTE_KIND:
11136 COMPARE(Py_UCS4, Py_UCS1);
11137 break;
11138 case PyUnicode_2BYTE_KIND:
11139 COMPARE(Py_UCS4, Py_UCS2);
11140 break;
11141 case PyUnicode_4BYTE_KIND:
11142 {
11143 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11144 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11145 /* normalize result of wmemcmp() into the range [-1; 1] */
11146 if (cmp < 0)
11147 return -1;
11148 if (cmp > 0)
11149 return 1;
11150 #else
11151 COMPARE(Py_UCS4, Py_UCS4);
11152 #endif
11153 break;
11154 }
11155 default:
11156 Py_UNREACHABLE();
11157 }
11158 break;
11159 }
11160 default:
11161 Py_UNREACHABLE();
11162 }
11163
11164 if (len1 == len2)
11165 return 0;
11166 if (len1 < len2)
11167 return -1;
11168 else
11169 return 1;
11170
11171 #undef COMPARE
11172 }
11173
11174 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11175 unicode_compare_eq(PyObject *str1, PyObject *str2)
11176 {
11177 int kind;
11178 const void *data1, *data2;
11179 Py_ssize_t len;
11180 int cmp;
11181
11182 len = PyUnicode_GET_LENGTH(str1);
11183 if (PyUnicode_GET_LENGTH(str2) != len)
11184 return 0;
11185 kind = PyUnicode_KIND(str1);
11186 if (PyUnicode_KIND(str2) != kind)
11187 return 0;
11188 data1 = PyUnicode_DATA(str1);
11189 data2 = PyUnicode_DATA(str2);
11190
11191 cmp = memcmp(data1, data2, len * kind);
11192 return (cmp == 0);
11193 }
11194
11195
11196 int
PyUnicode_Compare(PyObject * left,PyObject * right)11197 PyUnicode_Compare(PyObject *left, PyObject *right)
11198 {
11199 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11200 if (PyUnicode_READY(left) == -1 ||
11201 PyUnicode_READY(right) == -1)
11202 return -1;
11203
11204 /* a string is equal to itself */
11205 if (left == right)
11206 return 0;
11207
11208 return unicode_compare(left, right);
11209 }
11210 PyErr_Format(PyExc_TypeError,
11211 "Can't compare %.100s and %.100s",
11212 Py_TYPE(left)->tp_name,
11213 Py_TYPE(right)->tp_name);
11214 return -1;
11215 }
11216
11217 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11218 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11219 {
11220 Py_ssize_t i;
11221 int kind;
11222 Py_UCS4 chr;
11223 const unsigned char *ustr = (const unsigned char *)str;
11224
11225 assert(_PyUnicode_CHECK(uni));
11226 if (!PyUnicode_IS_READY(uni)) {
11227 const wchar_t *ws = _PyUnicode_WSTR(uni);
11228 /* Compare Unicode string and source character set string */
11229 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11230 if (chr != ustr[i])
11231 return (chr < ustr[i]) ? -1 : 1;
11232 }
11233 /* This check keeps Python strings that end in '\0' from comparing equal
11234 to C strings identical up to that point. */
11235 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11236 return 1; /* uni is longer */
11237 if (ustr[i])
11238 return -1; /* str is longer */
11239 return 0;
11240 }
11241 kind = PyUnicode_KIND(uni);
11242 if (kind == PyUnicode_1BYTE_KIND) {
11243 const void *data = PyUnicode_1BYTE_DATA(uni);
11244 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11245 size_t len, len2 = strlen(str);
11246 int cmp;
11247
11248 len = Py_MIN(len1, len2);
11249 cmp = memcmp(data, str, len);
11250 if (cmp != 0) {
11251 if (cmp < 0)
11252 return -1;
11253 else
11254 return 1;
11255 }
11256 if (len1 > len2)
11257 return 1; /* uni is longer */
11258 if (len1 < len2)
11259 return -1; /* str is longer */
11260 return 0;
11261 }
11262 else {
11263 const void *data = PyUnicode_DATA(uni);
11264 /* Compare Unicode string and source character set string */
11265 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11266 if (chr != (unsigned char)str[i])
11267 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11268 /* This check keeps Python strings that end in '\0' from comparing equal
11269 to C strings identical up to that point. */
11270 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11271 return 1; /* uni is longer */
11272 if (str[i])
11273 return -1; /* str is longer */
11274 return 0;
11275 }
11276 }
11277
11278 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11279 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11280 {
11281 size_t i, len;
11282 const wchar_t *p;
11283 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11284 if (strlen(str) != len)
11285 return 0;
11286 p = _PyUnicode_WSTR(unicode);
11287 assert(p);
11288 for (i = 0; i < len; i++) {
11289 unsigned char c = (unsigned char)str[i];
11290 if (c >= 128 || p[i] != (wchar_t)c)
11291 return 0;
11292 }
11293 return 1;
11294 }
11295
11296 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11297 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11298 {
11299 size_t len;
11300 assert(_PyUnicode_CHECK(unicode));
11301 assert(str);
11302 #ifndef NDEBUG
11303 for (const char *p = str; *p; p++) {
11304 assert((unsigned char)*p < 128);
11305 }
11306 #endif
11307 if (PyUnicode_READY(unicode) == -1) {
11308 /* Memory error or bad data */
11309 PyErr_Clear();
11310 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11311 }
11312 if (!PyUnicode_IS_ASCII(unicode))
11313 return 0;
11314 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11315 return strlen(str) == len &&
11316 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11317 }
11318
11319 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11320 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11321 {
11322 PyObject *right_uni;
11323
11324 assert(_PyUnicode_CHECK(left));
11325 assert(right->string);
11326 #ifndef NDEBUG
11327 for (const char *p = right->string; *p; p++) {
11328 assert((unsigned char)*p < 128);
11329 }
11330 #endif
11331
11332 if (PyUnicode_READY(left) == -1) {
11333 /* memory error or bad data */
11334 PyErr_Clear();
11335 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11336 }
11337
11338 if (!PyUnicode_IS_ASCII(left))
11339 return 0;
11340
11341 right_uni = _PyUnicode_FromId(right); /* borrowed */
11342 if (right_uni == NULL) {
11343 /* memory error or bad data */
11344 PyErr_Clear();
11345 return _PyUnicode_EqualToASCIIString(left, right->string);
11346 }
11347
11348 if (left == right_uni)
11349 return 1;
11350
11351 if (PyUnicode_CHECK_INTERNED(left))
11352 return 0;
11353
11354 #ifdef INTERNED_STRINGS
11355 assert(_PyUnicode_HASH(right_uni) != -1);
11356 Py_hash_t hash = _PyUnicode_HASH(left);
11357 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11358 return 0;
11359 #endif
11360
11361 return unicode_compare_eq(left, right_uni);
11362 }
11363
11364 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11365 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11366 {
11367 int result;
11368
11369 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11370 Py_RETURN_NOTIMPLEMENTED;
11371
11372 if (PyUnicode_READY(left) == -1 ||
11373 PyUnicode_READY(right) == -1)
11374 return NULL;
11375
11376 if (left == right) {
11377 switch (op) {
11378 case Py_EQ:
11379 case Py_LE:
11380 case Py_GE:
11381 /* a string is equal to itself */
11382 Py_RETURN_TRUE;
11383 case Py_NE:
11384 case Py_LT:
11385 case Py_GT:
11386 Py_RETURN_FALSE;
11387 default:
11388 PyErr_BadArgument();
11389 return NULL;
11390 }
11391 }
11392 else if (op == Py_EQ || op == Py_NE) {
11393 result = unicode_compare_eq(left, right);
11394 result ^= (op == Py_NE);
11395 return PyBool_FromLong(result);
11396 }
11397 else {
11398 result = unicode_compare(left, right);
11399 Py_RETURN_RICHCOMPARE(result, 0, op);
11400 }
11401 }
11402
11403 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11404 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11405 {
11406 return unicode_eq(aa, bb);
11407 }
11408
11409 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11410 PyUnicode_Contains(PyObject *str, PyObject *substr)
11411 {
11412 int kind1, kind2;
11413 const void *buf1, *buf2;
11414 Py_ssize_t len1, len2;
11415 int result;
11416
11417 if (!PyUnicode_Check(substr)) {
11418 PyErr_Format(PyExc_TypeError,
11419 "'in <string>' requires string as left operand, not %.100s",
11420 Py_TYPE(substr)->tp_name);
11421 return -1;
11422 }
11423 if (PyUnicode_READY(substr) == -1)
11424 return -1;
11425 if (ensure_unicode(str) < 0)
11426 return -1;
11427
11428 kind1 = PyUnicode_KIND(str);
11429 kind2 = PyUnicode_KIND(substr);
11430 if (kind1 < kind2)
11431 return 0;
11432 len1 = PyUnicode_GET_LENGTH(str);
11433 len2 = PyUnicode_GET_LENGTH(substr);
11434 if (len1 < len2)
11435 return 0;
11436 buf1 = PyUnicode_DATA(str);
11437 buf2 = PyUnicode_DATA(substr);
11438 if (len2 == 1) {
11439 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11440 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11441 return result;
11442 }
11443 if (kind2 != kind1) {
11444 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11445 if (!buf2)
11446 return -1;
11447 }
11448
11449 switch (kind1) {
11450 case PyUnicode_1BYTE_KIND:
11451 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11452 break;
11453 case PyUnicode_2BYTE_KIND:
11454 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11455 break;
11456 case PyUnicode_4BYTE_KIND:
11457 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11458 break;
11459 default:
11460 Py_UNREACHABLE();
11461 }
11462
11463 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11464 if (kind2 != kind1)
11465 PyMem_Free((void *)buf2);
11466
11467 return result;
11468 }
11469
11470 /* Concat to string or Unicode object giving a new Unicode object. */
11471
11472 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11473 PyUnicode_Concat(PyObject *left, PyObject *right)
11474 {
11475 PyObject *result;
11476 Py_UCS4 maxchar, maxchar2;
11477 Py_ssize_t left_len, right_len, new_len;
11478
11479 if (ensure_unicode(left) < 0)
11480 return NULL;
11481
11482 if (!PyUnicode_Check(right)) {
11483 PyErr_Format(PyExc_TypeError,
11484 "can only concatenate str (not \"%.200s\") to str",
11485 Py_TYPE(right)->tp_name);
11486 return NULL;
11487 }
11488 if (PyUnicode_READY(right) < 0)
11489 return NULL;
11490
11491 /* Shortcuts */
11492 if (left == unicode_empty)
11493 return PyUnicode_FromObject(right);
11494 if (right == unicode_empty)
11495 return PyUnicode_FromObject(left);
11496
11497 left_len = PyUnicode_GET_LENGTH(left);
11498 right_len = PyUnicode_GET_LENGTH(right);
11499 if (left_len > PY_SSIZE_T_MAX - right_len) {
11500 PyErr_SetString(PyExc_OverflowError,
11501 "strings are too large to concat");
11502 return NULL;
11503 }
11504 new_len = left_len + right_len;
11505
11506 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11507 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11508 maxchar = Py_MAX(maxchar, maxchar2);
11509
11510 /* Concat the two Unicode strings */
11511 result = PyUnicode_New(new_len, maxchar);
11512 if (result == NULL)
11513 return NULL;
11514 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11515 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11516 assert(_PyUnicode_CheckConsistency(result, 1));
11517 return result;
11518 }
11519
11520 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11521 PyUnicode_Append(PyObject **p_left, PyObject *right)
11522 {
11523 PyObject *left, *res;
11524 Py_UCS4 maxchar, maxchar2;
11525 Py_ssize_t left_len, right_len, new_len;
11526
11527 if (p_left == NULL) {
11528 if (!PyErr_Occurred())
11529 PyErr_BadInternalCall();
11530 return;
11531 }
11532 left = *p_left;
11533 if (right == NULL || left == NULL
11534 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11535 if (!PyErr_Occurred())
11536 PyErr_BadInternalCall();
11537 goto error;
11538 }
11539
11540 if (PyUnicode_READY(left) == -1)
11541 goto error;
11542 if (PyUnicode_READY(right) == -1)
11543 goto error;
11544
11545 /* Shortcuts */
11546 if (left == unicode_empty) {
11547 Py_DECREF(left);
11548 Py_INCREF(right);
11549 *p_left = right;
11550 return;
11551 }
11552 if (right == unicode_empty)
11553 return;
11554
11555 left_len = PyUnicode_GET_LENGTH(left);
11556 right_len = PyUnicode_GET_LENGTH(right);
11557 if (left_len > PY_SSIZE_T_MAX - right_len) {
11558 PyErr_SetString(PyExc_OverflowError,
11559 "strings are too large to concat");
11560 goto error;
11561 }
11562 new_len = left_len + right_len;
11563
11564 if (unicode_modifiable(left)
11565 && PyUnicode_CheckExact(right)
11566 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11567 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11568 to change the structure size, but characters are stored just after
11569 the structure, and so it requires to move all characters which is
11570 not so different than duplicating the string. */
11571 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11572 {
11573 /* append inplace */
11574 if (unicode_resize(p_left, new_len) != 0)
11575 goto error;
11576
11577 /* copy 'right' into the newly allocated area of 'left' */
11578 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11579 }
11580 else {
11581 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11582 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11583 maxchar = Py_MAX(maxchar, maxchar2);
11584
11585 /* Concat the two Unicode strings */
11586 res = PyUnicode_New(new_len, maxchar);
11587 if (res == NULL)
11588 goto error;
11589 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11590 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11591 Py_DECREF(left);
11592 *p_left = res;
11593 }
11594 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11595 return;
11596
11597 error:
11598 Py_CLEAR(*p_left);
11599 }
11600
11601 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11602 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11603 {
11604 PyUnicode_Append(pleft, right);
11605 Py_XDECREF(right);
11606 }
11607
11608 /*
11609 Wraps stringlib_parse_args_finds() and additionally ensures that the
11610 first argument is a unicode object.
11611 */
11612
11613 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11614 parse_args_finds_unicode(const char * function_name, PyObject *args,
11615 PyObject **substring,
11616 Py_ssize_t *start, Py_ssize_t *end)
11617 {
11618 if(stringlib_parse_args_finds(function_name, args, substring,
11619 start, end)) {
11620 if (ensure_unicode(*substring) < 0)
11621 return 0;
11622 return 1;
11623 }
11624 return 0;
11625 }
11626
11627 PyDoc_STRVAR(count__doc__,
11628 "S.count(sub[, start[, end]]) -> int\n\
11629 \n\
11630 Return the number of non-overlapping occurrences of substring sub in\n\
11631 string S[start:end]. Optional arguments start and end are\n\
11632 interpreted as in slice notation.");
11633
11634 static PyObject *
unicode_count(PyObject * self,PyObject * args)11635 unicode_count(PyObject *self, PyObject *args)
11636 {
11637 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11638 Py_ssize_t start = 0;
11639 Py_ssize_t end = PY_SSIZE_T_MAX;
11640 PyObject *result;
11641 int kind1, kind2;
11642 const void *buf1, *buf2;
11643 Py_ssize_t len1, len2, iresult;
11644
11645 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11646 return NULL;
11647
11648 kind1 = PyUnicode_KIND(self);
11649 kind2 = PyUnicode_KIND(substring);
11650 if (kind1 < kind2)
11651 return PyLong_FromLong(0);
11652
11653 len1 = PyUnicode_GET_LENGTH(self);
11654 len2 = PyUnicode_GET_LENGTH(substring);
11655 ADJUST_INDICES(start, end, len1);
11656 if (end - start < len2)
11657 return PyLong_FromLong(0);
11658
11659 buf1 = PyUnicode_DATA(self);
11660 buf2 = PyUnicode_DATA(substring);
11661 if (kind2 != kind1) {
11662 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11663 if (!buf2)
11664 return NULL;
11665 }
11666 switch (kind1) {
11667 case PyUnicode_1BYTE_KIND:
11668 iresult = ucs1lib_count(
11669 ((const Py_UCS1*)buf1) + start, end - start,
11670 buf2, len2, PY_SSIZE_T_MAX
11671 );
11672 break;
11673 case PyUnicode_2BYTE_KIND:
11674 iresult = ucs2lib_count(
11675 ((const Py_UCS2*)buf1) + start, end - start,
11676 buf2, len2, PY_SSIZE_T_MAX
11677 );
11678 break;
11679 case PyUnicode_4BYTE_KIND:
11680 iresult = ucs4lib_count(
11681 ((const Py_UCS4*)buf1) + start, end - start,
11682 buf2, len2, PY_SSIZE_T_MAX
11683 );
11684 break;
11685 default:
11686 Py_UNREACHABLE();
11687 }
11688
11689 result = PyLong_FromSsize_t(iresult);
11690
11691 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11692 if (kind2 != kind1)
11693 PyMem_Free((void *)buf2);
11694
11695 return result;
11696 }
11697
11698 /*[clinic input]
11699 str.encode as unicode_encode
11700
11701 encoding: str(c_default="NULL") = 'utf-8'
11702 The encoding in which to encode the string.
11703 errors: str(c_default="NULL") = 'strict'
11704 The error handling scheme to use for encoding errors.
11705 The default is 'strict' meaning that encoding errors raise a
11706 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11707 'xmlcharrefreplace' as well as any other name registered with
11708 codecs.register_error that can handle UnicodeEncodeErrors.
11709
11710 Encode the string using the codec registered for encoding.
11711 [clinic start generated code]*/
11712
11713 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11714 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11715 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11716 {
11717 return PyUnicode_AsEncodedString(self, encoding, errors);
11718 }
11719
11720 /*[clinic input]
11721 str.expandtabs as unicode_expandtabs
11722
11723 tabsize: int = 8
11724
11725 Return a copy where all tab characters are expanded using spaces.
11726
11727 If tabsize is not given, a tab size of 8 characters is assumed.
11728 [clinic start generated code]*/
11729
11730 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11731 unicode_expandtabs_impl(PyObject *self, int tabsize)
11732 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11733 {
11734 Py_ssize_t i, j, line_pos, src_len, incr;
11735 Py_UCS4 ch;
11736 PyObject *u;
11737 const void *src_data;
11738 void *dest_data;
11739 int kind;
11740 int found;
11741
11742 if (PyUnicode_READY(self) == -1)
11743 return NULL;
11744
11745 /* First pass: determine size of output string */
11746 src_len = PyUnicode_GET_LENGTH(self);
11747 i = j = line_pos = 0;
11748 kind = PyUnicode_KIND(self);
11749 src_data = PyUnicode_DATA(self);
11750 found = 0;
11751 for (; i < src_len; i++) {
11752 ch = PyUnicode_READ(kind, src_data, i);
11753 if (ch == '\t') {
11754 found = 1;
11755 if (tabsize > 0) {
11756 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11757 if (j > PY_SSIZE_T_MAX - incr)
11758 goto overflow;
11759 line_pos += incr;
11760 j += incr;
11761 }
11762 }
11763 else {
11764 if (j > PY_SSIZE_T_MAX - 1)
11765 goto overflow;
11766 line_pos++;
11767 j++;
11768 if (ch == '\n' || ch == '\r')
11769 line_pos = 0;
11770 }
11771 }
11772 if (!found)
11773 return unicode_result_unchanged(self);
11774
11775 /* Second pass: create output string and fill it */
11776 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11777 if (!u)
11778 return NULL;
11779 dest_data = PyUnicode_DATA(u);
11780
11781 i = j = line_pos = 0;
11782
11783 for (; i < src_len; i++) {
11784 ch = PyUnicode_READ(kind, src_data, i);
11785 if (ch == '\t') {
11786 if (tabsize > 0) {
11787 incr = tabsize - (line_pos % tabsize);
11788 line_pos += incr;
11789 unicode_fill(kind, dest_data, ' ', j, incr);
11790 j += incr;
11791 }
11792 }
11793 else {
11794 line_pos++;
11795 PyUnicode_WRITE(kind, dest_data, j, ch);
11796 j++;
11797 if (ch == '\n' || ch == '\r')
11798 line_pos = 0;
11799 }
11800 }
11801 assert (j == PyUnicode_GET_LENGTH(u));
11802 return unicode_result(u);
11803
11804 overflow:
11805 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11806 return NULL;
11807 }
11808
11809 PyDoc_STRVAR(find__doc__,
11810 "S.find(sub[, start[, end]]) -> int\n\
11811 \n\
11812 Return the lowest index in S where substring sub is found,\n\
11813 such that sub is contained within S[start:end]. Optional\n\
11814 arguments start and end are interpreted as in slice notation.\n\
11815 \n\
11816 Return -1 on failure.");
11817
11818 static PyObject *
unicode_find(PyObject * self,PyObject * args)11819 unicode_find(PyObject *self, PyObject *args)
11820 {
11821 /* initialize variables to prevent gcc warning */
11822 PyObject *substring = NULL;
11823 Py_ssize_t start = 0;
11824 Py_ssize_t end = 0;
11825 Py_ssize_t result;
11826
11827 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11828 return NULL;
11829
11830 if (PyUnicode_READY(self) == -1)
11831 return NULL;
11832
11833 result = any_find_slice(self, substring, start, end, 1);
11834
11835 if (result == -2)
11836 return NULL;
11837
11838 return PyLong_FromSsize_t(result);
11839 }
11840
11841 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11842 unicode_getitem(PyObject *self, Py_ssize_t index)
11843 {
11844 const void *data;
11845 enum PyUnicode_Kind kind;
11846 Py_UCS4 ch;
11847
11848 if (!PyUnicode_Check(self)) {
11849 PyErr_BadArgument();
11850 return NULL;
11851 }
11852 if (PyUnicode_READY(self) == -1) {
11853 return NULL;
11854 }
11855 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11856 PyErr_SetString(PyExc_IndexError, "string index out of range");
11857 return NULL;
11858 }
11859 kind = PyUnicode_KIND(self);
11860 data = PyUnicode_DATA(self);
11861 ch = PyUnicode_READ(kind, data, index);
11862 return unicode_char(ch);
11863 }
11864
11865 /* Believe it or not, this produces the same value for ASCII strings
11866 as bytes_hash(). */
11867 static Py_hash_t
unicode_hash(PyObject * self)11868 unicode_hash(PyObject *self)
11869 {
11870 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11871
11872 #ifdef Py_DEBUG
11873 assert(_Py_HashSecret_Initialized);
11874 #endif
11875 if (_PyUnicode_HASH(self) != -1)
11876 return _PyUnicode_HASH(self);
11877 if (PyUnicode_READY(self) == -1)
11878 return -1;
11879
11880 x = _Py_HashBytes(PyUnicode_DATA(self),
11881 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11882 _PyUnicode_HASH(self) = x;
11883 return x;
11884 }
11885
11886 PyDoc_STRVAR(index__doc__,
11887 "S.index(sub[, start[, end]]) -> int\n\
11888 \n\
11889 Return the lowest index in S where substring sub is found,\n\
11890 such that sub is contained within S[start:end]. Optional\n\
11891 arguments start and end are interpreted as in slice notation.\n\
11892 \n\
11893 Raises ValueError when the substring is not found.");
11894
11895 static PyObject *
unicode_index(PyObject * self,PyObject * args)11896 unicode_index(PyObject *self, PyObject *args)
11897 {
11898 /* initialize variables to prevent gcc warning */
11899 Py_ssize_t result;
11900 PyObject *substring = NULL;
11901 Py_ssize_t start = 0;
11902 Py_ssize_t end = 0;
11903
11904 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11905 return NULL;
11906
11907 if (PyUnicode_READY(self) == -1)
11908 return NULL;
11909
11910 result = any_find_slice(self, substring, start, end, 1);
11911
11912 if (result == -2)
11913 return NULL;
11914
11915 if (result < 0) {
11916 PyErr_SetString(PyExc_ValueError, "substring not found");
11917 return NULL;
11918 }
11919
11920 return PyLong_FromSsize_t(result);
11921 }
11922
11923 /*[clinic input]
11924 str.isascii as unicode_isascii
11925
11926 Return True if all characters in the string are ASCII, False otherwise.
11927
11928 ASCII characters have code points in the range U+0000-U+007F.
11929 Empty string is ASCII too.
11930 [clinic start generated code]*/
11931
11932 static PyObject *
unicode_isascii_impl(PyObject * self)11933 unicode_isascii_impl(PyObject *self)
11934 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11935 {
11936 if (PyUnicode_READY(self) == -1) {
11937 return NULL;
11938 }
11939 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11940 }
11941
11942 /*[clinic input]
11943 str.islower as unicode_islower
11944
11945 Return True if the string is a lowercase string, False otherwise.
11946
11947 A string is lowercase if all cased characters in the string are lowercase and
11948 there is at least one cased character in the string.
11949 [clinic start generated code]*/
11950
11951 static PyObject *
unicode_islower_impl(PyObject * self)11952 unicode_islower_impl(PyObject *self)
11953 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11954 {
11955 Py_ssize_t i, length;
11956 int kind;
11957 const void *data;
11958 int cased;
11959
11960 if (PyUnicode_READY(self) == -1)
11961 return NULL;
11962 length = PyUnicode_GET_LENGTH(self);
11963 kind = PyUnicode_KIND(self);
11964 data = PyUnicode_DATA(self);
11965
11966 /* Shortcut for single character strings */
11967 if (length == 1)
11968 return PyBool_FromLong(
11969 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11970
11971 /* Special case for empty strings */
11972 if (length == 0)
11973 Py_RETURN_FALSE;
11974
11975 cased = 0;
11976 for (i = 0; i < length; i++) {
11977 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11978
11979 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11980 Py_RETURN_FALSE;
11981 else if (!cased && Py_UNICODE_ISLOWER(ch))
11982 cased = 1;
11983 }
11984 return PyBool_FromLong(cased);
11985 }
11986
11987 /*[clinic input]
11988 str.isupper as unicode_isupper
11989
11990 Return True if the string is an uppercase string, False otherwise.
11991
11992 A string is uppercase if all cased characters in the string are uppercase and
11993 there is at least one cased character in the string.
11994 [clinic start generated code]*/
11995
11996 static PyObject *
unicode_isupper_impl(PyObject * self)11997 unicode_isupper_impl(PyObject *self)
11998 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11999 {
12000 Py_ssize_t i, length;
12001 int kind;
12002 const void *data;
12003 int cased;
12004
12005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007 length = PyUnicode_GET_LENGTH(self);
12008 kind = PyUnicode_KIND(self);
12009 data = PyUnicode_DATA(self);
12010
12011 /* Shortcut for single character strings */
12012 if (length == 1)
12013 return PyBool_FromLong(
12014 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12015
12016 /* Special case for empty strings */
12017 if (length == 0)
12018 Py_RETURN_FALSE;
12019
12020 cased = 0;
12021 for (i = 0; i < length; i++) {
12022 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12023
12024 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12025 Py_RETURN_FALSE;
12026 else if (!cased && Py_UNICODE_ISUPPER(ch))
12027 cased = 1;
12028 }
12029 return PyBool_FromLong(cased);
12030 }
12031
12032 /*[clinic input]
12033 str.istitle as unicode_istitle
12034
12035 Return True if the string is a title-cased string, False otherwise.
12036
12037 In a title-cased string, upper- and title-case characters may only
12038 follow uncased characters and lowercase characters only cased ones.
12039 [clinic start generated code]*/
12040
12041 static PyObject *
unicode_istitle_impl(PyObject * self)12042 unicode_istitle_impl(PyObject *self)
12043 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12044 {
12045 Py_ssize_t i, length;
12046 int kind;
12047 const void *data;
12048 int cased, previous_is_cased;
12049
12050 if (PyUnicode_READY(self) == -1)
12051 return NULL;
12052 length = PyUnicode_GET_LENGTH(self);
12053 kind = PyUnicode_KIND(self);
12054 data = PyUnicode_DATA(self);
12055
12056 /* Shortcut for single character strings */
12057 if (length == 1) {
12058 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12059 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12060 (Py_UNICODE_ISUPPER(ch) != 0));
12061 }
12062
12063 /* Special case for empty strings */
12064 if (length == 0)
12065 Py_RETURN_FALSE;
12066
12067 cased = 0;
12068 previous_is_cased = 0;
12069 for (i = 0; i < length; i++) {
12070 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12071
12072 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12073 if (previous_is_cased)
12074 Py_RETURN_FALSE;
12075 previous_is_cased = 1;
12076 cased = 1;
12077 }
12078 else if (Py_UNICODE_ISLOWER(ch)) {
12079 if (!previous_is_cased)
12080 Py_RETURN_FALSE;
12081 previous_is_cased = 1;
12082 cased = 1;
12083 }
12084 else
12085 previous_is_cased = 0;
12086 }
12087 return PyBool_FromLong(cased);
12088 }
12089
12090 /*[clinic input]
12091 str.isspace as unicode_isspace
12092
12093 Return True if the string is a whitespace string, False otherwise.
12094
12095 A string is whitespace if all characters in the string are whitespace and there
12096 is at least one character in the string.
12097 [clinic start generated code]*/
12098
12099 static PyObject *
unicode_isspace_impl(PyObject * self)12100 unicode_isspace_impl(PyObject *self)
12101 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12102 {
12103 Py_ssize_t i, length;
12104 int kind;
12105 const void *data;
12106
12107 if (PyUnicode_READY(self) == -1)
12108 return NULL;
12109 length = PyUnicode_GET_LENGTH(self);
12110 kind = PyUnicode_KIND(self);
12111 data = PyUnicode_DATA(self);
12112
12113 /* Shortcut for single character strings */
12114 if (length == 1)
12115 return PyBool_FromLong(
12116 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12117
12118 /* Special case for empty strings */
12119 if (length == 0)
12120 Py_RETURN_FALSE;
12121
12122 for (i = 0; i < length; i++) {
12123 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12124 if (!Py_UNICODE_ISSPACE(ch))
12125 Py_RETURN_FALSE;
12126 }
12127 Py_RETURN_TRUE;
12128 }
12129
12130 /*[clinic input]
12131 str.isalpha as unicode_isalpha
12132
12133 Return True if the string is an alphabetic string, False otherwise.
12134
12135 A string is alphabetic if all characters in the string are alphabetic and there
12136 is at least one character in the string.
12137 [clinic start generated code]*/
12138
12139 static PyObject *
unicode_isalpha_impl(PyObject * self)12140 unicode_isalpha_impl(PyObject *self)
12141 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12142 {
12143 Py_ssize_t i, length;
12144 int kind;
12145 const void *data;
12146
12147 if (PyUnicode_READY(self) == -1)
12148 return NULL;
12149 length = PyUnicode_GET_LENGTH(self);
12150 kind = PyUnicode_KIND(self);
12151 data = PyUnicode_DATA(self);
12152
12153 /* Shortcut for single character strings */
12154 if (length == 1)
12155 return PyBool_FromLong(
12156 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12157
12158 /* Special case for empty strings */
12159 if (length == 0)
12160 Py_RETURN_FALSE;
12161
12162 for (i = 0; i < length; i++) {
12163 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12164 Py_RETURN_FALSE;
12165 }
12166 Py_RETURN_TRUE;
12167 }
12168
12169 /*[clinic input]
12170 str.isalnum as unicode_isalnum
12171
12172 Return True if the string is an alpha-numeric string, False otherwise.
12173
12174 A string is alpha-numeric if all characters in the string are alpha-numeric and
12175 there is at least one character in the string.
12176 [clinic start generated code]*/
12177
12178 static PyObject *
unicode_isalnum_impl(PyObject * self)12179 unicode_isalnum_impl(PyObject *self)
12180 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12181 {
12182 int kind;
12183 const void *data;
12184 Py_ssize_t len, i;
12185
12186 if (PyUnicode_READY(self) == -1)
12187 return NULL;
12188
12189 kind = PyUnicode_KIND(self);
12190 data = PyUnicode_DATA(self);
12191 len = PyUnicode_GET_LENGTH(self);
12192
12193 /* Shortcut for single character strings */
12194 if (len == 1) {
12195 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12196 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12197 }
12198
12199 /* Special case for empty strings */
12200 if (len == 0)
12201 Py_RETURN_FALSE;
12202
12203 for (i = 0; i < len; i++) {
12204 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12205 if (!Py_UNICODE_ISALNUM(ch))
12206 Py_RETURN_FALSE;
12207 }
12208 Py_RETURN_TRUE;
12209 }
12210
12211 /*[clinic input]
12212 str.isdecimal as unicode_isdecimal
12213
12214 Return True if the string is a decimal string, False otherwise.
12215
12216 A string is a decimal string if all characters in the string are decimal and
12217 there is at least one character in the string.
12218 [clinic start generated code]*/
12219
12220 static PyObject *
unicode_isdecimal_impl(PyObject * self)12221 unicode_isdecimal_impl(PyObject *self)
12222 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12223 {
12224 Py_ssize_t i, length;
12225 int kind;
12226 const void *data;
12227
12228 if (PyUnicode_READY(self) == -1)
12229 return NULL;
12230 length = PyUnicode_GET_LENGTH(self);
12231 kind = PyUnicode_KIND(self);
12232 data = PyUnicode_DATA(self);
12233
12234 /* Shortcut for single character strings */
12235 if (length == 1)
12236 return PyBool_FromLong(
12237 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12238
12239 /* Special case for empty strings */
12240 if (length == 0)
12241 Py_RETURN_FALSE;
12242
12243 for (i = 0; i < length; i++) {
12244 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12245 Py_RETURN_FALSE;
12246 }
12247 Py_RETURN_TRUE;
12248 }
12249
12250 /*[clinic input]
12251 str.isdigit as unicode_isdigit
12252
12253 Return True if the string is a digit string, False otherwise.
12254
12255 A string is a digit string if all characters in the string are digits and there
12256 is at least one character in the string.
12257 [clinic start generated code]*/
12258
12259 static PyObject *
unicode_isdigit_impl(PyObject * self)12260 unicode_isdigit_impl(PyObject *self)
12261 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12262 {
12263 Py_ssize_t i, length;
12264 int kind;
12265 const void *data;
12266
12267 if (PyUnicode_READY(self) == -1)
12268 return NULL;
12269 length = PyUnicode_GET_LENGTH(self);
12270 kind = PyUnicode_KIND(self);
12271 data = PyUnicode_DATA(self);
12272
12273 /* Shortcut for single character strings */
12274 if (length == 1) {
12275 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12276 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12277 }
12278
12279 /* Special case for empty strings */
12280 if (length == 0)
12281 Py_RETURN_FALSE;
12282
12283 for (i = 0; i < length; i++) {
12284 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12285 Py_RETURN_FALSE;
12286 }
12287 Py_RETURN_TRUE;
12288 }
12289
12290 /*[clinic input]
12291 str.isnumeric as unicode_isnumeric
12292
12293 Return True if the string is a numeric string, False otherwise.
12294
12295 A string is numeric if all characters in the string are numeric and there is at
12296 least one character in the string.
12297 [clinic start generated code]*/
12298
12299 static PyObject *
unicode_isnumeric_impl(PyObject * self)12300 unicode_isnumeric_impl(PyObject *self)
12301 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12302 {
12303 Py_ssize_t i, length;
12304 int kind;
12305 const void *data;
12306
12307 if (PyUnicode_READY(self) == -1)
12308 return NULL;
12309 length = PyUnicode_GET_LENGTH(self);
12310 kind = PyUnicode_KIND(self);
12311 data = PyUnicode_DATA(self);
12312
12313 /* Shortcut for single character strings */
12314 if (length == 1)
12315 return PyBool_FromLong(
12316 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12317
12318 /* Special case for empty strings */
12319 if (length == 0)
12320 Py_RETURN_FALSE;
12321
12322 for (i = 0; i < length; i++) {
12323 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12324 Py_RETURN_FALSE;
12325 }
12326 Py_RETURN_TRUE;
12327 }
12328
12329 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12330 _PyUnicode_ScanIdentifier(PyObject *self)
12331 {
12332 Py_ssize_t i;
12333 if (PyUnicode_READY(self) == -1)
12334 return -1;
12335
12336 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12337 if (len == 0) {
12338 /* an empty string is not a valid identifier */
12339 return 0;
12340 }
12341
12342 int kind = PyUnicode_KIND(self);
12343 const void *data = PyUnicode_DATA(self);
12344 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12345 /* PEP 3131 says that the first character must be in
12346 XID_Start and subsequent characters in XID_Continue,
12347 and for the ASCII range, the 2.x rules apply (i.e
12348 start with letters and underscore, continue with
12349 letters, digits, underscore). However, given the current
12350 definition of XID_Start and XID_Continue, it is sufficient
12351 to check just for these, except that _ must be allowed
12352 as starting an identifier. */
12353 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12354 return 0;
12355 }
12356
12357 for (i = 1; i < len; i++) {
12358 ch = PyUnicode_READ(kind, data, i);
12359 if (!_PyUnicode_IsXidContinue(ch)) {
12360 return i;
12361 }
12362 }
12363 return i;
12364 }
12365
/* Return 1 if self is a valid identifier, 0 otherwise.

   A string qualifies when it is non-empty, its first character is '_' or
   satisfies _PyUnicode_IsXidStart(), and every later character satisfies
   _PyUnicode_IsXidContinue().

   Two code paths exist: strings for which PyUnicode_IS_READY() is true are
   delegated to _PyUnicode_ScanIdentifier(); the others are scanned directly
   from their wchar_t buffer, without converting the string first. */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    if (PyUnicode_IS_READY(self)) {
        /* The scan returns how many leading characters are acceptable; the
           string is an identifier only if that covers all of it. */
        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        /* an empty string is not a valid identifier */
        return len && i == len;
    }
    else {
/* NOTE(review): the push/ignore/pop macros presumably silence compiler
   deprecation warnings for the wchar_t-based accessors used below --
   confirm against their definitions. */
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
        if (len == 0) {
            /* an empty string is not a valid identifier */
            return 0;
        }

        const wchar_t *wstr = _PyUnicode_WSTR(self);
        Py_UCS4 ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
        /* With 2-byte wchar_t a code point may be stored as a surrogate
           pair; join the pair into a single code point before testing. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
            i++;
        }
#endif
        /* First character: XID_Start, or the underscore. */
        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
            return 0;
        }

        /* Remaining characters: each must be XID_Continue. */
        while (i < len) {
            ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
            /* Same surrogate-pair handling as for the first character. */
            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
                && i < len
                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
                i++;
            }
#endif
            if (!_PyUnicode_IsXidContinue(ch)) {
                return 0;
            }
        }
        return 1;
_Py_COMP_DIAG_POP
    }
}
12418
12419 /*[clinic input]
12420 str.isidentifier as unicode_isidentifier
12421
12422 Return True if the string is a valid Python identifier, False otherwise.
12423
12424 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12425 such as "def" or "class".
12426 [clinic start generated code]*/
12427
12428 static PyObject *
unicode_isidentifier_impl(PyObject * self)12429 unicode_isidentifier_impl(PyObject *self)
12430 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12431 {
12432 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12433 }
12434
12435 /*[clinic input]
12436 str.isprintable as unicode_isprintable
12437
12438 Return True if the string is printable, False otherwise.
12439
12440 A string is printable if all of its characters are considered printable in
12441 repr() or if it is empty.
12442 [clinic start generated code]*/
12443
12444 static PyObject *
unicode_isprintable_impl(PyObject * self)12445 unicode_isprintable_impl(PyObject *self)
12446 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12447 {
12448 Py_ssize_t i, length;
12449 int kind;
12450 const void *data;
12451
12452 if (PyUnicode_READY(self) == -1)
12453 return NULL;
12454 length = PyUnicode_GET_LENGTH(self);
12455 kind = PyUnicode_KIND(self);
12456 data = PyUnicode_DATA(self);
12457
12458 /* Shortcut for single character strings */
12459 if (length == 1)
12460 return PyBool_FromLong(
12461 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12462
12463 for (i = 0; i < length; i++) {
12464 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12465 Py_RETURN_FALSE;
12466 }
12467 }
12468 Py_RETURN_TRUE;
12469 }
12470
12471 /*[clinic input]
12472 str.join as unicode_join
12473
12474 iterable: object
12475 /
12476
12477 Concatenate any number of strings.
12478
12479 The string whose method is called is inserted in between each given string.
12480 The result is returned as a new string.
12481
12482 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12483 [clinic start generated code]*/
12484
12485 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12486 unicode_join(PyObject *self, PyObject *iterable)
12487 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12488 {
12489 return PyUnicode_Join(self, iterable);
12490 }
12491
12492 static Py_ssize_t
unicode_length(PyObject * self)12493 unicode_length(PyObject *self)
12494 {
12495 if (PyUnicode_READY(self) == -1)
12496 return -1;
12497 return PyUnicode_GET_LENGTH(self);
12498 }
12499
12500 /*[clinic input]
12501 str.ljust as unicode_ljust
12502
12503 width: Py_ssize_t
12504 fillchar: Py_UCS4 = ' '
12505 /
12506
12507 Return a left-justified string of length width.
12508
12509 Padding is done using the specified fill character (default is a space).
12510 [clinic start generated code]*/
12511
12512 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12513 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12514 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12515 {
12516 if (PyUnicode_READY(self) == -1)
12517 return NULL;
12518
12519 if (PyUnicode_GET_LENGTH(self) >= width)
12520 return unicode_result_unchanged(self);
12521
12522 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12523 }
12524
12525 /*[clinic input]
12526 str.lower as unicode_lower
12527
12528 Return a copy of the string converted to lowercase.
12529 [clinic start generated code]*/
12530
12531 static PyObject *
unicode_lower_impl(PyObject * self)12532 unicode_lower_impl(PyObject *self)
12533 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12534 {
12535 if (PyUnicode_READY(self) == -1)
12536 return NULL;
12537 if (PyUnicode_IS_ASCII(self))
12538 return ascii_upper_or_lower(self, 1);
12539 return case_operation(self, do_lower);
12540 }
12541
/* Strip modes: which end(s) of the string lose characters.  The values
   double as indexes into stripfuncnames[] below, so keep both in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table itself are const: no entry may be rebound at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that implements strip mode i (used when building
   error messages). */
#define STRIPNAME(i) (stripfuncnames[i])
12550
12551 /* externally visible for str.strip(unicode) */
12552 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12553 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12554 {
12555 const void *data;
12556 int kind;
12557 Py_ssize_t i, j, len;
12558 BLOOM_MASK sepmask;
12559 Py_ssize_t seplen;
12560
12561 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12562 return NULL;
12563
12564 kind = PyUnicode_KIND(self);
12565 data = PyUnicode_DATA(self);
12566 len = PyUnicode_GET_LENGTH(self);
12567 seplen = PyUnicode_GET_LENGTH(sepobj);
12568 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12569 PyUnicode_DATA(sepobj),
12570 seplen);
12571
12572 i = 0;
12573 if (striptype != RIGHTSTRIP) {
12574 while (i < len) {
12575 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12576 if (!BLOOM(sepmask, ch))
12577 break;
12578 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12579 break;
12580 i++;
12581 }
12582 }
12583
12584 j = len;
12585 if (striptype != LEFTSTRIP) {
12586 j--;
12587 while (j >= i) {
12588 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12589 if (!BLOOM(sepmask, ch))
12590 break;
12591 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12592 break;
12593 j--;
12594 }
12595
12596 j++;
12597 }
12598
12599 return PyUnicode_Substring(self, i, j);
12600 }
12601
12602 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12603 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12604 {
12605 const unsigned char *data;
12606 int kind;
12607 Py_ssize_t length;
12608
12609 if (PyUnicode_READY(self) == -1)
12610 return NULL;
12611
12612 length = PyUnicode_GET_LENGTH(self);
12613 end = Py_MIN(end, length);
12614
12615 if (start == 0 && end == length)
12616 return unicode_result_unchanged(self);
12617
12618 if (start < 0 || end < 0) {
12619 PyErr_SetString(PyExc_IndexError, "string index out of range");
12620 return NULL;
12621 }
12622 if (start >= length || end < start)
12623 _Py_RETURN_UNICODE_EMPTY();
12624
12625 length = end - start;
12626 if (PyUnicode_IS_ASCII(self)) {
12627 data = PyUnicode_1BYTE_DATA(self);
12628 return _PyUnicode_FromASCII((const char*)(data + start), length);
12629 }
12630 else {
12631 kind = PyUnicode_KIND(self);
12632 data = PyUnicode_1BYTE_DATA(self);
12633 return PyUnicode_FromKindAndData(kind,
12634 data + kind * start,
12635 length);
12636 }
12637 }
12638
12639 static PyObject *
do_strip(PyObject * self,int striptype)12640 do_strip(PyObject *self, int striptype)
12641 {
12642 Py_ssize_t len, i, j;
12643
12644 if (PyUnicode_READY(self) == -1)
12645 return NULL;
12646
12647 len = PyUnicode_GET_LENGTH(self);
12648
12649 if (PyUnicode_IS_ASCII(self)) {
12650 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12651
12652 i = 0;
12653 if (striptype != RIGHTSTRIP) {
12654 while (i < len) {
12655 Py_UCS1 ch = data[i];
12656 if (!_Py_ascii_whitespace[ch])
12657 break;
12658 i++;
12659 }
12660 }
12661
12662 j = len;
12663 if (striptype != LEFTSTRIP) {
12664 j--;
12665 while (j >= i) {
12666 Py_UCS1 ch = data[j];
12667 if (!_Py_ascii_whitespace[ch])
12668 break;
12669 j--;
12670 }
12671 j++;
12672 }
12673 }
12674 else {
12675 int kind = PyUnicode_KIND(self);
12676 const void *data = PyUnicode_DATA(self);
12677
12678 i = 0;
12679 if (striptype != RIGHTSTRIP) {
12680 while (i < len) {
12681 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12682 if (!Py_UNICODE_ISSPACE(ch))
12683 break;
12684 i++;
12685 }
12686 }
12687
12688 j = len;
12689 if (striptype != LEFTSTRIP) {
12690 j--;
12691 while (j >= i) {
12692 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12693 if (!Py_UNICODE_ISSPACE(ch))
12694 break;
12695 j--;
12696 }
12697 j++;
12698 }
12699 }
12700
12701 return PyUnicode_Substring(self, i, j);
12702 }
12703
12704
12705 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12706 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12707 {
12708 if (sep != Py_None) {
12709 if (PyUnicode_Check(sep))
12710 return _PyUnicode_XStrip(self, striptype, sep);
12711 else {
12712 PyErr_Format(PyExc_TypeError,
12713 "%s arg must be None or str",
12714 STRIPNAME(striptype));
12715 return NULL;
12716 }
12717 }
12718
12719 return do_strip(self, striptype);
12720 }
12721
12722
12723 /*[clinic input]
12724 str.strip as unicode_strip
12725
12726 chars: object = None
12727 /
12728
12729 Return a copy of the string with leading and trailing whitespace removed.
12730
12731 If chars is given and not None, remove characters in chars instead.
12732 [clinic start generated code]*/
12733
12734 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12735 unicode_strip_impl(PyObject *self, PyObject *chars)
12736 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12737 {
12738 return do_argstrip(self, BOTHSTRIP, chars);
12739 }
12740
12741
12742 /*[clinic input]
12743 str.lstrip as unicode_lstrip
12744
12745 chars: object = None
12746 /
12747
12748 Return a copy of the string with leading whitespace removed.
12749
12750 If chars is given and not None, remove characters in chars instead.
12751 [clinic start generated code]*/
12752
12753 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12754 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12755 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12756 {
12757 return do_argstrip(self, LEFTSTRIP, chars);
12758 }
12759
12760
12761 /*[clinic input]
12762 str.rstrip as unicode_rstrip
12763
12764 chars: object = None
12765 /
12766
12767 Return a copy of the string with trailing whitespace removed.
12768
12769 If chars is given and not None, remove characters in chars instead.
12770 [clinic start generated code]*/
12771
12772 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12773 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12774 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12775 {
12776 return do_argstrip(self, RIGHTSTRIP, chars);
12777 }
12778
12779
12780 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12781 unicode_repeat(PyObject *str, Py_ssize_t len)
12782 {
12783 PyObject *u;
12784 Py_ssize_t nchars, n;
12785
12786 if (len < 1)
12787 _Py_RETURN_UNICODE_EMPTY();
12788
12789 /* no repeat, return original string */
12790 if (len == 1)
12791 return unicode_result_unchanged(str);
12792
12793 if (PyUnicode_READY(str) == -1)
12794 return NULL;
12795
12796 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12797 PyErr_SetString(PyExc_OverflowError,
12798 "repeated string is too long");
12799 return NULL;
12800 }
12801 nchars = len * PyUnicode_GET_LENGTH(str);
12802
12803 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12804 if (!u)
12805 return NULL;
12806 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12807
12808 if (PyUnicode_GET_LENGTH(str) == 1) {
12809 int kind = PyUnicode_KIND(str);
12810 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12811 if (kind == PyUnicode_1BYTE_KIND) {
12812 void *to = PyUnicode_DATA(u);
12813 memset(to, (unsigned char)fill_char, len);
12814 }
12815 else if (kind == PyUnicode_2BYTE_KIND) {
12816 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12817 for (n = 0; n < len; ++n)
12818 ucs2[n] = fill_char;
12819 } else {
12820 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12821 assert(kind == PyUnicode_4BYTE_KIND);
12822 for (n = 0; n < len; ++n)
12823 ucs4[n] = fill_char;
12824 }
12825 }
12826 else {
12827 /* number of characters copied this far */
12828 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12829 Py_ssize_t char_size = PyUnicode_KIND(str);
12830 char *to = (char *) PyUnicode_DATA(u);
12831 memcpy(to, PyUnicode_DATA(str),
12832 PyUnicode_GET_LENGTH(str) * char_size);
12833 while (done < nchars) {
12834 n = (done <= nchars-done) ? done : nchars-done;
12835 memcpy(to + (done * char_size), to, n * char_size);
12836 done += n;
12837 }
12838 }
12839
12840 assert(_PyUnicode_CheckConsistency(u, 1));
12841 return u;
12842 }
12843
12844 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12845 PyUnicode_Replace(PyObject *str,
12846 PyObject *substr,
12847 PyObject *replstr,
12848 Py_ssize_t maxcount)
12849 {
12850 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12851 ensure_unicode(replstr) < 0)
12852 return NULL;
12853 return replace(str, substr, replstr, maxcount);
12854 }
12855
12856 /*[clinic input]
12857 str.replace as unicode_replace
12858
12859 old: unicode
12860 new: unicode
12861 count: Py_ssize_t = -1
12862 Maximum number of occurrences to replace.
12863 -1 (the default value) means replace all occurrences.
12864 /
12865
12866 Return a copy with all occurrences of substring old replaced by new.
12867
12868 If the optional argument count is given, only the first count occurrences are
12869 replaced.
12870 [clinic start generated code]*/
12871
12872 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12873 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12874 Py_ssize_t count)
12875 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12876 {
12877 if (PyUnicode_READY(self) == -1)
12878 return NULL;
12879 return replace(self, old, new, count);
12880 }
12881
12882 /*[clinic input]
12883 str.removeprefix as unicode_removeprefix
12884
12885 prefix: unicode
12886 /
12887
12888 Return a str with the given prefix string removed if present.
12889
12890 If the string starts with the prefix string, return string[len(prefix):].
12891 Otherwise, return a copy of the original string.
12892 [clinic start generated code]*/
12893
12894 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12895 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12896 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12897 {
12898 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12899 if (match == -1) {
12900 return NULL;
12901 }
12902 if (match) {
12903 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12904 PyUnicode_GET_LENGTH(self));
12905 }
12906 return unicode_result_unchanged(self);
12907 }
12908
12909 /*[clinic input]
12910 str.removesuffix as unicode_removesuffix
12911
12912 suffix: unicode
12913 /
12914
12915 Return a str with the given suffix string removed if present.
12916
12917 If the string ends with the suffix string and that suffix is not empty,
12918 return string[:-len(suffix)]. Otherwise, return a copy of the original
12919 string.
12920 [clinic start generated code]*/
12921
12922 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12923 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12924 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12925 {
12926 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12927 if (match == -1) {
12928 return NULL;
12929 }
12930 if (match) {
12931 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12932 - PyUnicode_GET_LENGTH(suffix));
12933 }
12934 return unicode_result_unchanged(self);
12935 }
12936
/* Build the repr() of a str object: the text wrapped in quotes, with
   backslashes, the chosen quote character, tab/newline/carriage return and
   non-printable characters written as escape sequences.

   Works in two passes over the input: the first computes the exact output
   length, counts both quote characters and finds the largest character
   that will be copied unescaped; the second fills a result string
   allocated with exactly that size.  Both passes must classify every
   character the same way.

   Returns the new string, or NULL on failure. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    const void *idata;
    void *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    /* Start at 127: the quotes and every escape sequence are ASCII, so the
       result never needs less than the ASCII range. */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        /* Number of output characters this input character expands to. */
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        /* Checked before adding so that osize itself never overflows. */
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    /* Prefer single quotes; switch to double quotes when the text contains
       single quotes but no double quotes, so that nothing needs escaping. */
    quote = '\'';
    /* Same length as the input means no character needed an escape. */
    unchanged = (osize == isize);
    /* NOTE(review): unlike the accumulation in the loop above, the two
       additions to osize below are not checked against PY_SSIZE_T_MAX --
       confirm whether osize can get close enough to the limit to matter. */
    if (squote) {
        unchanged = 0;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Both quotes are written up front; the body goes in between. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        /* Nothing to escape: copy the input verbatim after the opening
           quote. */
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* i indexes the input, o the output (starting after the opening
           quote). */
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13101
13102 PyDoc_STRVAR(rfind__doc__,
13103 "S.rfind(sub[, start[, end]]) -> int\n\
13104 \n\
13105 Return the highest index in S where substring sub is found,\n\
13106 such that sub is contained within S[start:end]. Optional\n\
13107 arguments start and end are interpreted as in slice notation.\n\
13108 \n\
13109 Return -1 on failure.");
13110
13111 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13112 unicode_rfind(PyObject *self, PyObject *args)
13113 {
13114 /* initialize variables to prevent gcc warning */
13115 PyObject *substring = NULL;
13116 Py_ssize_t start = 0;
13117 Py_ssize_t end = 0;
13118 Py_ssize_t result;
13119
13120 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13121 return NULL;
13122
13123 if (PyUnicode_READY(self) == -1)
13124 return NULL;
13125
13126 result = any_find_slice(self, substring, start, end, -1);
13127
13128 if (result == -2)
13129 return NULL;
13130
13131 return PyLong_FromSsize_t(result);
13132 }
13133
13134 PyDoc_STRVAR(rindex__doc__,
13135 "S.rindex(sub[, start[, end]]) -> int\n\
13136 \n\
13137 Return the highest index in S where substring sub is found,\n\
13138 such that sub is contained within S[start:end]. Optional\n\
13139 arguments start and end are interpreted as in slice notation.\n\
13140 \n\
13141 Raises ValueError when the substring is not found.");
13142
13143 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13144 unicode_rindex(PyObject *self, PyObject *args)
13145 {
13146 /* initialize variables to prevent gcc warning */
13147 PyObject *substring = NULL;
13148 Py_ssize_t start = 0;
13149 Py_ssize_t end = 0;
13150 Py_ssize_t result;
13151
13152 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13153 return NULL;
13154
13155 if (PyUnicode_READY(self) == -1)
13156 return NULL;
13157
13158 result = any_find_slice(self, substring, start, end, -1);
13159
13160 if (result == -2)
13161 return NULL;
13162
13163 if (result < 0) {
13164 PyErr_SetString(PyExc_ValueError, "substring not found");
13165 return NULL;
13166 }
13167
13168 return PyLong_FromSsize_t(result);
13169 }
13170
13171 /*[clinic input]
13172 str.rjust as unicode_rjust
13173
13174 width: Py_ssize_t
13175 fillchar: Py_UCS4 = ' '
13176 /
13177
13178 Return a right-justified string of length width.
13179
13180 Padding is done using the specified fill character (default is a space).
13181 [clinic start generated code]*/
13182
13183 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13184 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13185 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13186 {
13187 if (PyUnicode_READY(self) == -1)
13188 return NULL;
13189
13190 if (PyUnicode_GET_LENGTH(self) >= width)
13191 return unicode_result_unchanged(self);
13192
13193 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13194 }
13195
13196 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13197 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13198 {
13199 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13200 return NULL;
13201
13202 return split(s, sep, maxsplit);
13203 }
13204
13205 /*[clinic input]
13206 str.split as unicode_split
13207
13208 sep: object = None
13209 The delimiter according which to split the string.
13210 None (the default value) means split according to any whitespace,
13211 and discard empty strings from the result.
13212 maxsplit: Py_ssize_t = -1
13213 Maximum number of splits to do.
13214 -1 (the default value) means no limit.
13215
13216 Return a list of the words in the string, using sep as the delimiter string.
13217 [clinic start generated code]*/
13218
13219 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13220 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13221 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
13222 {
13223 if (sep == Py_None)
13224 return split(self, NULL, maxsplit);
13225 if (PyUnicode_Check(sep))
13226 return split(self, sep, maxsplit);
13227
13228 PyErr_Format(PyExc_TypeError,
13229 "must be str or None, not %.100s",
13230 Py_TYPE(sep)->tp_name);
13231 return NULL;
13232 }
13233
13234 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13235 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13236 {
13237 PyObject* out;
13238 int kind1, kind2;
13239 const void *buf1, *buf2;
13240 Py_ssize_t len1, len2;
13241
13242 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13243 return NULL;
13244
13245 kind1 = PyUnicode_KIND(str_obj);
13246 kind2 = PyUnicode_KIND(sep_obj);
13247 len1 = PyUnicode_GET_LENGTH(str_obj);
13248 len2 = PyUnicode_GET_LENGTH(sep_obj);
13249 if (kind1 < kind2 || len1 < len2) {
13250 _Py_INCREF_UNICODE_EMPTY();
13251 if (!unicode_empty)
13252 out = NULL;
13253 else {
13254 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13255 Py_DECREF(unicode_empty);
13256 }
13257 return out;
13258 }
13259 buf1 = PyUnicode_DATA(str_obj);
13260 buf2 = PyUnicode_DATA(sep_obj);
13261 if (kind2 != kind1) {
13262 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13263 if (!buf2)
13264 return NULL;
13265 }
13266
13267 switch (kind1) {
13268 case PyUnicode_1BYTE_KIND:
13269 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13270 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13271 else
13272 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13273 break;
13274 case PyUnicode_2BYTE_KIND:
13275 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276 break;
13277 case PyUnicode_4BYTE_KIND:
13278 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279 break;
13280 default:
13281 Py_UNREACHABLE();
13282 }
13283
13284 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13285 if (kind2 != kind1)
13286 PyMem_Free((void *)buf2);
13287
13288 return out;
13289 }
13290
13291
13292 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13293 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13294 {
13295 PyObject* out;
13296 int kind1, kind2;
13297 const void *buf1, *buf2;
13298 Py_ssize_t len1, len2;
13299
13300 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13301 return NULL;
13302
13303 kind1 = PyUnicode_KIND(str_obj);
13304 kind2 = PyUnicode_KIND(sep_obj);
13305 len1 = PyUnicode_GET_LENGTH(str_obj);
13306 len2 = PyUnicode_GET_LENGTH(sep_obj);
13307 if (kind1 < kind2 || len1 < len2) {
13308 _Py_INCREF_UNICODE_EMPTY();
13309 if (!unicode_empty)
13310 out = NULL;
13311 else {
13312 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13313 Py_DECREF(unicode_empty);
13314 }
13315 return out;
13316 }
13317 buf1 = PyUnicode_DATA(str_obj);
13318 buf2 = PyUnicode_DATA(sep_obj);
13319 if (kind2 != kind1) {
13320 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13321 if (!buf2)
13322 return NULL;
13323 }
13324
13325 switch (kind1) {
13326 case PyUnicode_1BYTE_KIND:
13327 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13328 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13329 else
13330 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13331 break;
13332 case PyUnicode_2BYTE_KIND:
13333 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13334 break;
13335 case PyUnicode_4BYTE_KIND:
13336 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13337 break;
13338 default:
13339 Py_UNREACHABLE();
13340 }
13341
13342 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13343 if (kind2 != kind1)
13344 PyMem_Free((void *)buf2);
13345
13346 return out;
13347 }
13348
13349 /*[clinic input]
13350 str.partition as unicode_partition
13351
13352 sep: object
13353 /
13354
13355 Partition the string into three parts using the given separator.
13356
13357 This will search for the separator in the string. If the separator is found,
13358 returns a 3-tuple containing the part before the separator, the separator
13359 itself, and the part after it.
13360
13361 If the separator is not found, returns a 3-tuple containing the original string
13362 and two empty strings.
13363 [clinic start generated code]*/
13364
13365 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13366 unicode_partition(PyObject *self, PyObject *sep)
13367 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13368 {
13369 return PyUnicode_Partition(self, sep);
13370 }
13371
13372 /*[clinic input]
13373 str.rpartition as unicode_rpartition = str.partition
13374
13375 Partition the string into three parts using the given separator.
13376
13377 This will search for the separator in the string, starting at the end. If
13378 the separator is found, returns a 3-tuple containing the part before the
13379 separator, the separator itself, and the part after it.
13380
13381 If the separator is not found, returns a 3-tuple containing two empty strings
13382 and the original string.
13383 [clinic start generated code]*/
13384
13385 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13386 unicode_rpartition(PyObject *self, PyObject *sep)
13387 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13388 {
13389 return PyUnicode_RPartition(self, sep);
13390 }
13391
13392 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13393 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13394 {
13395 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13396 return NULL;
13397
13398 return rsplit(s, sep, maxsplit);
13399 }
13400
13401 /*[clinic input]
13402 str.rsplit as unicode_rsplit = str.split
13403
13404 Return a list of the words in the string, using sep as the delimiter string.
13405
13406 Splits are done starting at the end of the string and working to the front.
13407 [clinic start generated code]*/
13408
13409 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13410 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13411 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13412 {
13413 if (sep == Py_None)
13414 return rsplit(self, NULL, maxsplit);
13415 if (PyUnicode_Check(sep))
13416 return rsplit(self, sep, maxsplit);
13417
13418 PyErr_Format(PyExc_TypeError,
13419 "must be str or None, not %.100s",
13420 Py_TYPE(sep)->tp_name);
13421 return NULL;
13422 }
13423
13424 /*[clinic input]
13425 str.splitlines as unicode_splitlines
13426
13427 keepends: bool(accept={int}) = False
13428
13429 Return a list of the lines in the string, breaking at line boundaries.
13430
13431 Line breaks are not included in the resulting list unless keepends is given and
13432 true.
13433 [clinic start generated code]*/
13434
13435 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13436 unicode_splitlines_impl(PyObject *self, int keepends)
13437 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13438 {
13439 return PyUnicode_Splitlines(self, keepends);
13440 }
13441
13442 static
unicode_str(PyObject * self)13443 PyObject *unicode_str(PyObject *self)
13444 {
13445 return unicode_result_unchanged(self);
13446 }
13447
13448 /*[clinic input]
13449 str.swapcase as unicode_swapcase
13450
13451 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13452 [clinic start generated code]*/
13453
13454 static PyObject *
unicode_swapcase_impl(PyObject * self)13455 unicode_swapcase_impl(PyObject *self)
13456 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13457 {
13458 if (PyUnicode_READY(self) == -1)
13459 return NULL;
13460 return case_operation(self, do_swapcase);
13461 }
13462
13463 /*[clinic input]
13464
13465 @staticmethod
13466 str.maketrans as unicode_maketrans
13467
13468 x: object
13469
13470 y: unicode=NULL
13471
13472 z: unicode=NULL
13473
13474 /
13475
13476 Return a translation table usable for str.translate().
13477
13478 If there is only one argument, it must be a dictionary mapping Unicode
13479 ordinals (integers) or characters to Unicode ordinals, strings or None.
13480 Character keys will be then converted to ordinals.
13481 If there are two arguments, they must be strings of equal length, and
13482 in the resulting dictionary, each character in x will be mapped to the
13483 character at the same position in y. If there is a third argument, it
13484 must be a string, whose characters will be mapped to None in the result.
13485 [clinic start generated code]*/
13486
13487 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13488 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13489 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13490 {
13491 PyObject *new = NULL, *key, *value;
13492 Py_ssize_t i = 0;
13493 int res;
13494
13495 new = PyDict_New();
13496 if (!new)
13497 return NULL;
13498 if (y != NULL) {
13499 int x_kind, y_kind, z_kind;
13500 const void *x_data, *y_data, *z_data;
13501
13502 /* x must be a string too, of equal length */
13503 if (!PyUnicode_Check(x)) {
13504 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13505 "be a string if there is a second argument");
13506 goto err;
13507 }
13508 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13509 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13510 "arguments must have equal length");
13511 goto err;
13512 }
13513 /* create entries for translating chars in x to those in y */
13514 x_kind = PyUnicode_KIND(x);
13515 y_kind = PyUnicode_KIND(y);
13516 x_data = PyUnicode_DATA(x);
13517 y_data = PyUnicode_DATA(y);
13518 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13519 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13520 if (!key)
13521 goto err;
13522 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13523 if (!value) {
13524 Py_DECREF(key);
13525 goto err;
13526 }
13527 res = PyDict_SetItem(new, key, value);
13528 Py_DECREF(key);
13529 Py_DECREF(value);
13530 if (res < 0)
13531 goto err;
13532 }
13533 /* create entries for deleting chars in z */
13534 if (z != NULL) {
13535 z_kind = PyUnicode_KIND(z);
13536 z_data = PyUnicode_DATA(z);
13537 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13538 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13539 if (!key)
13540 goto err;
13541 res = PyDict_SetItem(new, key, Py_None);
13542 Py_DECREF(key);
13543 if (res < 0)
13544 goto err;
13545 }
13546 }
13547 } else {
13548 int kind;
13549 const void *data;
13550
13551 /* x must be a dict */
13552 if (!PyDict_CheckExact(x)) {
13553 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13554 "to maketrans it must be a dict");
13555 goto err;
13556 }
13557 /* copy entries into the new dict, converting string keys to int keys */
13558 while (PyDict_Next(x, &i, &key, &value)) {
13559 if (PyUnicode_Check(key)) {
13560 /* convert string keys to integer keys */
13561 PyObject *newkey;
13562 if (PyUnicode_GET_LENGTH(key) != 1) {
13563 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13564 "table must be of length 1");
13565 goto err;
13566 }
13567 kind = PyUnicode_KIND(key);
13568 data = PyUnicode_DATA(key);
13569 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13570 if (!newkey)
13571 goto err;
13572 res = PyDict_SetItem(new, newkey, value);
13573 Py_DECREF(newkey);
13574 if (res < 0)
13575 goto err;
13576 } else if (PyLong_Check(key)) {
13577 /* just keep integer keys */
13578 if (PyDict_SetItem(new, key, value) < 0)
13579 goto err;
13580 } else {
13581 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13582 "be strings or integers");
13583 goto err;
13584 }
13585 }
13586 }
13587 return new;
13588 err:
13589 Py_DECREF(new);
13590 return NULL;
13591 }
13592
13593 /*[clinic input]
13594 str.translate as unicode_translate
13595
13596 table: object
13597 Translation table, which must be a mapping of Unicode ordinals to
13598 Unicode ordinals, strings, or None.
13599 /
13600
13601 Replace each character in the string using the given translation table.
13602
13603 The table must implement lookup/indexing via __getitem__, for instance a
13604 dictionary or list. If this operation raises LookupError, the character is
13605 left untouched. Characters mapped to None are deleted.
13606 [clinic start generated code]*/
13607
13608 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13609 unicode_translate(PyObject *self, PyObject *table)
13610 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13611 {
13612 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13613 }
13614
13615 /*[clinic input]
13616 str.upper as unicode_upper
13617
13618 Return a copy of the string converted to uppercase.
13619 [clinic start generated code]*/
13620
13621 static PyObject *
unicode_upper_impl(PyObject * self)13622 unicode_upper_impl(PyObject *self)
13623 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13624 {
13625 if (PyUnicode_READY(self) == -1)
13626 return NULL;
13627 if (PyUnicode_IS_ASCII(self))
13628 return ascii_upper_or_lower(self, 0);
13629 return case_operation(self, do_upper);
13630 }
13631
13632 /*[clinic input]
13633 str.zfill as unicode_zfill
13634
13635 width: Py_ssize_t
13636 /
13637
13638 Pad a numeric string with zeros on the left, to fill a field of the given width.
13639
13640 The string is never truncated.
13641 [clinic start generated code]*/
13642
13643 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13644 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13645 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13646 {
13647 Py_ssize_t fill;
13648 PyObject *u;
13649 int kind;
13650 const void *data;
13651 Py_UCS4 chr;
13652
13653 if (PyUnicode_READY(self) == -1)
13654 return NULL;
13655
13656 if (PyUnicode_GET_LENGTH(self) >= width)
13657 return unicode_result_unchanged(self);
13658
13659 fill = width - PyUnicode_GET_LENGTH(self);
13660
13661 u = pad(self, fill, 0, '0');
13662
13663 if (u == NULL)
13664 return NULL;
13665
13666 kind = PyUnicode_KIND(u);
13667 data = PyUnicode_DATA(u);
13668 chr = PyUnicode_READ(kind, data, fill);
13669
13670 if (chr == '+' || chr == '-') {
13671 /* move sign to beginning of string */
13672 PyUnicode_WRITE(kind, data, 0, chr);
13673 PyUnicode_WRITE(kind, data, fill, '0');
13674 }
13675
13676 assert(_PyUnicode_CheckConsistency(u, 1));
13677 return u;
13678 }
13679
#if 0
/* Disabled: the enclosing "#if 0" keeps this wrapper out of the build.
   If enabled it would simply forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13687
13688 PyDoc_STRVAR(startswith__doc__,
13689 "S.startswith(prefix[, start[, end]]) -> bool\n\
13690 \n\
13691 Return True if S starts with the specified prefix, False otherwise.\n\
13692 With optional start, test S beginning at that position.\n\
13693 With optional end, stop comparing S at that position.\n\
13694 prefix can also be a tuple of strings to try.");
13695
13696 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13697 unicode_startswith(PyObject *self,
13698 PyObject *args)
13699 {
13700 PyObject *subobj;
13701 PyObject *substring;
13702 Py_ssize_t start = 0;
13703 Py_ssize_t end = PY_SSIZE_T_MAX;
13704 int result;
13705
13706 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13707 return NULL;
13708 if (PyTuple_Check(subobj)) {
13709 Py_ssize_t i;
13710 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13711 substring = PyTuple_GET_ITEM(subobj, i);
13712 if (!PyUnicode_Check(substring)) {
13713 PyErr_Format(PyExc_TypeError,
13714 "tuple for startswith must only contain str, "
13715 "not %.100s",
13716 Py_TYPE(substring)->tp_name);
13717 return NULL;
13718 }
13719 result = tailmatch(self, substring, start, end, -1);
13720 if (result == -1)
13721 return NULL;
13722 if (result) {
13723 Py_RETURN_TRUE;
13724 }
13725 }
13726 /* nothing matched */
13727 Py_RETURN_FALSE;
13728 }
13729 if (!PyUnicode_Check(subobj)) {
13730 PyErr_Format(PyExc_TypeError,
13731 "startswith first arg must be str or "
13732 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13733 return NULL;
13734 }
13735 result = tailmatch(self, subobj, start, end, -1);
13736 if (result == -1)
13737 return NULL;
13738 return PyBool_FromLong(result);
13739 }
13740
13741
13742 PyDoc_STRVAR(endswith__doc__,
13743 "S.endswith(suffix[, start[, end]]) -> bool\n\
13744 \n\
13745 Return True if S ends with the specified suffix, False otherwise.\n\
13746 With optional start, test S beginning at that position.\n\
13747 With optional end, stop comparing S at that position.\n\
13748 suffix can also be a tuple of strings to try.");
13749
13750 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13751 unicode_endswith(PyObject *self,
13752 PyObject *args)
13753 {
13754 PyObject *subobj;
13755 PyObject *substring;
13756 Py_ssize_t start = 0;
13757 Py_ssize_t end = PY_SSIZE_T_MAX;
13758 int result;
13759
13760 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13761 return NULL;
13762 if (PyTuple_Check(subobj)) {
13763 Py_ssize_t i;
13764 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13765 substring = PyTuple_GET_ITEM(subobj, i);
13766 if (!PyUnicode_Check(substring)) {
13767 PyErr_Format(PyExc_TypeError,
13768 "tuple for endswith must only contain str, "
13769 "not %.100s",
13770 Py_TYPE(substring)->tp_name);
13771 return NULL;
13772 }
13773 result = tailmatch(self, substring, start, end, +1);
13774 if (result == -1)
13775 return NULL;
13776 if (result) {
13777 Py_RETURN_TRUE;
13778 }
13779 }
13780 Py_RETURN_FALSE;
13781 }
13782 if (!PyUnicode_Check(subobj)) {
13783 PyErr_Format(PyExc_TypeError,
13784 "endswith first arg must be str or "
13785 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13786 return NULL;
13787 }
13788 result = tailmatch(self, subobj, start, end, +1);
13789 if (result == -1)
13790 return NULL;
13791 return PyBool_FromLong(result);
13792 }
13793
13794 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13795 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13796 {
13797 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13798 writer->data = PyUnicode_DATA(writer->buffer);
13799
13800 if (!writer->readonly) {
13801 writer->kind = PyUnicode_KIND(writer->buffer);
13802 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13803 }
13804 else {
13805 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13806 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13807 writer->kind = PyUnicode_WCHAR_KIND;
13808 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13809
13810 /* Copy-on-write mode: set buffer size to 0 so
13811 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13812 * next write. */
13813 writer->size = 0;
13814 }
13815 }
13816
13817 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13818 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13819 {
13820 memset(writer, 0, sizeof(*writer));
13821
13822 /* ASCII is the bare minimum */
13823 writer->min_char = 127;
13824
13825 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13826 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13827 writer->kind = PyUnicode_WCHAR_KIND;
13828 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13829 }
13830
13831 // Initialize _PyUnicodeWriter with initial buffer
13832 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13833 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13834 {
13835 memset(writer, 0, sizeof(*writer));
13836 writer->buffer = buffer;
13837 _PyUnicodeWriter_Update(writer);
13838 writer->min_length = writer->size;
13839 }
13840
/* Make sure the writer's buffer can take `length` more characters at
   writer->pos, each no larger than `maxchar`.

   Depending on the current state this allocates the first buffer, grows
   (and possibly widens) the existing one, or only widens it.  The cached
   fields are refreshed through _PyUnicodeWriter_Update() before returning.

   Returns 0 on success and -1 on failure (PyErr_NoMemory() is raised here
   when pos + length would overflow Py_ssize_t; allocation helpers are
   presumably responsible for their own exceptions). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Reject requests whose end position would overflow Py_ssize_t. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate for less than the writer's configured min_char. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet -- allocate the first one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the buffer is too short (a read-only buffer always lands
           here because _PyUnicodeWriter_Update() sets its size to 0). */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen: build a fresh buffer, copy what has been
               written so far, and release the old one */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize the existing buffer. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: long enough, but the characters are too narrow --
           reallocate at the same length with the larger maxchar. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    _PyUnicodeWriter_Update(writer);
    return 0;

/* NOTE(review): the matching #define is not in this part of the file;
   this #undef removes the macro for everything that follows. */
#undef OVERALLOCATE_FACTOR
}
13917
13918 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13919 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13920 enum PyUnicode_Kind kind)
13921 {
13922 Py_UCS4 maxchar;
13923
13924 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13925 assert(writer->kind < kind);
13926
13927 switch (kind)
13928 {
13929 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13930 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13931 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13932 default:
13933 Py_UNREACHABLE();
13934 }
13935
13936 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13937 }
13938
13939 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13940 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13941 {
13942 assert(ch <= MAX_UNICODE);
13943 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13944 return -1;
13945 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13946 writer->pos++;
13947 return 0;
13948 }
13949
13950 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13951 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13952 {
13953 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13954 }
13955
13956 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13957 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13958 {
13959 Py_UCS4 maxchar;
13960 Py_ssize_t len;
13961
13962 if (PyUnicode_READY(str) == -1)
13963 return -1;
13964 len = PyUnicode_GET_LENGTH(str);
13965 if (len == 0)
13966 return 0;
13967 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13968 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13969 if (writer->buffer == NULL && !writer->overallocate) {
13970 assert(_PyUnicode_CheckConsistency(str, 1));
13971 writer->readonly = 1;
13972 Py_INCREF(str);
13973 writer->buffer = str;
13974 _PyUnicodeWriter_Update(writer);
13975 writer->pos += len;
13976 return 0;
13977 }
13978 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13979 return -1;
13980 }
13981 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13982 str, 0, len);
13983 writer->pos += len;
13984 return 0;
13985 }
13986
13987 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13988 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13989 Py_ssize_t start, Py_ssize_t end)
13990 {
13991 Py_UCS4 maxchar;
13992 Py_ssize_t len;
13993
13994 if (PyUnicode_READY(str) == -1)
13995 return -1;
13996
13997 assert(0 <= start);
13998 assert(end <= PyUnicode_GET_LENGTH(str));
13999 assert(start <= end);
14000
14001 if (end == 0)
14002 return 0;
14003
14004 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14005 return _PyUnicodeWriter_WriteStr(writer, str);
14006
14007 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14008 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14009 else
14010 maxchar = writer->maxchar;
14011 len = end - start;
14012
14013 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14014 return -1;
14015
14016 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14017 str, start, len);
14018 writer->pos += len;
14019 return 0;
14020 }
14021
14022 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)14023 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14024 const char *ascii, Py_ssize_t len)
14025 {
14026 if (len == -1)
14027 len = strlen(ascii);
14028
14029 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14030
14031 if (writer->buffer == NULL && !writer->overallocate) {
14032 PyObject *str;
14033
14034 str = _PyUnicode_FromASCII(ascii, len);
14035 if (str == NULL)
14036 return -1;
14037
14038 writer->readonly = 1;
14039 writer->buffer = str;
14040 _PyUnicodeWriter_Update(writer);
14041 writer->pos += len;
14042 return 0;
14043 }
14044
14045 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14046 return -1;
14047
14048 switch (writer->kind)
14049 {
14050 case PyUnicode_1BYTE_KIND:
14051 {
14052 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14053 Py_UCS1 *data = writer->data;
14054
14055 memcpy(data + writer->pos, str, len);
14056 break;
14057 }
14058 case PyUnicode_2BYTE_KIND:
14059 {
14060 _PyUnicode_CONVERT_BYTES(
14061 Py_UCS1, Py_UCS2,
14062 ascii, ascii + len,
14063 (Py_UCS2 *)writer->data + writer->pos);
14064 break;
14065 }
14066 case PyUnicode_4BYTE_KIND:
14067 {
14068 _PyUnicode_CONVERT_BYTES(
14069 Py_UCS1, Py_UCS4,
14070 ascii, ascii + len,
14071 (Py_UCS4 *)writer->data + writer->pos);
14072 break;
14073 }
14074 default:
14075 Py_UNREACHABLE();
14076 }
14077
14078 writer->pos += len;
14079 return 0;
14080 }
14081
14082 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14083 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14084 const char *str, Py_ssize_t len)
14085 {
14086 Py_UCS4 maxchar;
14087
14088 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14089 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14090 return -1;
14091 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14092 writer->pos += len;
14093 return 0;
14094 }
14095
14096 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14097 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14098 {
14099 PyObject *str;
14100
14101 if (writer->pos == 0) {
14102 Py_CLEAR(writer->buffer);
14103 _Py_RETURN_UNICODE_EMPTY();
14104 }
14105
14106 str = writer->buffer;
14107 writer->buffer = NULL;
14108
14109 if (writer->readonly) {
14110 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14111 return str;
14112 }
14113
14114 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14115 PyObject *str2;
14116 str2 = resize_compact(str, writer->pos);
14117 if (str2 == NULL) {
14118 Py_DECREF(str);
14119 return NULL;
14120 }
14121 str = str2;
14122 }
14123
14124 assert(_PyUnicode_CheckConsistency(str, 1));
14125 return unicode_result_ready(str);
14126 }
14127
14128 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14129 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14130 {
14131 Py_CLEAR(writer->buffer);
14132 }
14133
14134 #include "stringlib/unicode_format.h"
14135
/* Docstring for str.format(); referenced by the "format" entry of
   unicode_methods[].  The text uses backslash-newline continuation inside
   the string literal, so continuation lines must stay at column 0: any
   leading whitespace would become part of the docstring. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for str.format_map(); referenced by the "format_map" entry of
   unicode_methods[].  Same column-0 continuation constraint as above. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
14147
14148 /*[clinic input]
14149 str.__format__ as unicode___format__
14150
14151 format_spec: unicode
14152 /
14153
14154 Return a formatted version of the string as described by format_spec.
14155 [clinic start generated code]*/
14156
14157 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14158 unicode___format___impl(PyObject *self, PyObject *format_spec)
14159 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14160 {
14161 _PyUnicodeWriter writer;
14162 int ret;
14163
14164 if (PyUnicode_READY(self) == -1)
14165 return NULL;
14166 _PyUnicodeWriter_Init(&writer);
14167 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14168 self, format_spec, 0,
14169 PyUnicode_GET_LENGTH(format_spec));
14170 if (ret == -1) {
14171 _PyUnicodeWriter_Dealloc(&writer);
14172 return NULL;
14173 }
14174 return _PyUnicodeWriter_Finish(&writer);
14175 }
14176
14177 /*[clinic input]
14178 str.__sizeof__ as unicode_sizeof
14179
14180 Return the size of the string in memory, in bytes.
14181 [clinic start generated code]*/
14182
14183 static PyObject *
unicode_sizeof_impl(PyObject * self)14184 unicode_sizeof_impl(PyObject *self)
14185 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14186 {
14187 Py_ssize_t size;
14188
14189 /* If it's a compact object, account for base structure +
14190 character data. */
14191 if (PyUnicode_IS_COMPACT_ASCII(self))
14192 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14193 else if (PyUnicode_IS_COMPACT(self))
14194 size = sizeof(PyCompactUnicodeObject) +
14195 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14196 else {
14197 /* If it is a two-block object, account for base object, and
14198 for character block if present. */
14199 size = sizeof(PyUnicodeObject);
14200 if (_PyUnicode_DATA_ANY(self))
14201 size += (PyUnicode_GET_LENGTH(self) + 1) *
14202 PyUnicode_KIND(self);
14203 }
14204 /* If the wstr pointer is present, account for it unless it is shared
14205 with the data pointer. Check if the data is not shared. */
14206 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14207 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14208 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14209 size += PyUnicode_UTF8_LENGTH(self) + 1;
14210
14211 return PyLong_FromSsize_t(size);
14212 }
14213
14214 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14215 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14216 {
14217 PyObject *copy = _PyUnicode_Copy(v);
14218 if (!copy)
14219 return NULL;
14220 return Py_BuildValue("(N)", copy);
14221 }
14222
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this part
   of the file (NOTE(review): presumably generated by Argument Clinic --
   confirm); the remaining entries are written out by hand with their
   calling convention and docstring.  The table ends with a {NULL, NULL}
   sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes keywords, hence the cast through void(*)(void). */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14281
14282 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14283 unicode_mod(PyObject *v, PyObject *w)
14284 {
14285 if (!PyUnicode_Check(v))
14286 Py_RETURN_NOTIMPLEMENTED;
14287 return PyUnicode_Format(v, w);
14288 }
14289
14290 static PyNumberMethods unicode_as_number = {
14291 0, /*nb_add*/
14292 0, /*nb_subtract*/
14293 0, /*nb_multiply*/
14294 unicode_mod, /*nb_remainder*/
14295 };
14296
14297 static PySequenceMethods unicode_as_sequence = {
14298 (lenfunc) unicode_length, /* sq_length */
14299 PyUnicode_Concat, /* sq_concat */
14300 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14301 (ssizeargfunc) unicode_getitem, /* sq_item */
14302 0, /* sq_slice */
14303 0, /* sq_ass_item */
14304 0, /* sq_ass_slice */
14305 PyUnicode_Contains, /* sq_contains */
14306 };
14307
14308 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14309 unicode_subscript(PyObject* self, PyObject* item)
14310 {
14311 if (PyUnicode_READY(self) == -1)
14312 return NULL;
14313
14314 if (_PyIndex_Check(item)) {
14315 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14316 if (i == -1 && PyErr_Occurred())
14317 return NULL;
14318 if (i < 0)
14319 i += PyUnicode_GET_LENGTH(self);
14320 return unicode_getitem(self, i);
14321 } else if (PySlice_Check(item)) {
14322 Py_ssize_t start, stop, step, slicelength, i;
14323 size_t cur;
14324 PyObject *result;
14325 const void *src_data;
14326 void *dest_data;
14327 int src_kind, dest_kind;
14328 Py_UCS4 ch, max_char, kind_limit;
14329
14330 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14331 return NULL;
14332 }
14333 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14334 &start, &stop, step);
14335
14336 if (slicelength <= 0) {
14337 _Py_RETURN_UNICODE_EMPTY();
14338 } else if (start == 0 && step == 1 &&
14339 slicelength == PyUnicode_GET_LENGTH(self)) {
14340 return unicode_result_unchanged(self);
14341 } else if (step == 1) {
14342 return PyUnicode_Substring(self,
14343 start, start + slicelength);
14344 }
14345 /* General case */
14346 src_kind = PyUnicode_KIND(self);
14347 src_data = PyUnicode_DATA(self);
14348 if (!PyUnicode_IS_ASCII(self)) {
14349 kind_limit = kind_maxchar_limit(src_kind);
14350 max_char = 0;
14351 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14352 ch = PyUnicode_READ(src_kind, src_data, cur);
14353 if (ch > max_char) {
14354 max_char = ch;
14355 if (max_char >= kind_limit)
14356 break;
14357 }
14358 }
14359 }
14360 else
14361 max_char = 127;
14362 result = PyUnicode_New(slicelength, max_char);
14363 if (result == NULL)
14364 return NULL;
14365 dest_kind = PyUnicode_KIND(result);
14366 dest_data = PyUnicode_DATA(result);
14367
14368 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14369 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14370 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14371 }
14372 assert(_PyUnicode_CheckConsistency(result, 1));
14373 return result;
14374 } else {
14375 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14376 return NULL;
14377 }
14378 }
14379
14380 static PyMappingMethods unicode_as_mapping = {
14381 (lenfunc)unicode_length, /* mp_length */
14382 (binaryfunc)unicode_subscript, /* mp_subscript */
14383 (objobjargproc)0, /* mp_ass_subscript */
14384 };
14385
14386
14387 /* Helpers for PyUnicode_Format() */
14388
/* State shared by the helpers that implement PyUnicode_Format(). */
struct unicode_formatter_t {
    /* Source of the arguments.  unicode_format_getnextarg() returns it
       directly when arglen < 0 and indexes it as a tuple otherwise. */
    PyObject *args;
    /* Nonzero when `args` holds a reference that must be released; set
       after `args` was fetched from `dict` for a "%(name)" key. */
    int args_owned;
    /* arglen: number of arguments (negative: `args` is a single value).
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)" lookups; NULL makes such keys an error. */
    PyObject *dict;

    /* Kind passed to PyUnicode_READ() when reading from fmtdata. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: countdown of format characters still to be read (it is
       decremented before each read and goes negative when the format is
       exhausted).  fmtpos: index of the next character to read. */
    Py_ssize_t fmtcnt, fmtpos;
    /* Character data read with PyUnicode_READ(fmtkind, fmtdata, fmtpos). */
    const void *fmtdata;
    /* The format string object; "%(name)" keys are sliced out of it. */
    PyObject *fmtstr;

    /* Accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14402
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    /* Most recently read format character; after parsing, the conversion
       type (e.g. 's', 'd', 'x'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Field width; -1 when none was given. */
    Py_ssize_t width;
    /* Precision; -1 when none was given. */
    int prec;
    /* Set to 1 for numeric conversions so that the output step applies
       sign handling and '0' filling. */
    int sign;
};
14410
14411 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14412 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14413 {
14414 Py_ssize_t argidx = ctx->argidx;
14415
14416 if (argidx < ctx->arglen) {
14417 ctx->argidx++;
14418 if (ctx->arglen < 0)
14419 return ctx->args;
14420 else
14421 return PyTuple_GetItem(ctx->args, argidx);
14422 }
14423 PyErr_SetString(PyExc_TypeError,
14424 "not enough arguments for format string");
14425 return NULL;
14426 }
14427
14428 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14429
14430 /* Format a float into the writer if the writer is not NULL, or into *p_output
14431 otherwise.
14432
14433 Return 0 on success, raise an exception and return -1 on error. */
14434 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14435 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14436 PyObject **p_output,
14437 _PyUnicodeWriter *writer)
14438 {
14439 char *p;
14440 double x;
14441 Py_ssize_t len;
14442 int prec;
14443 int dtoa_flags;
14444
14445 x = PyFloat_AsDouble(v);
14446 if (x == -1.0 && PyErr_Occurred())
14447 return -1;
14448
14449 prec = arg->prec;
14450 if (prec < 0)
14451 prec = 6;
14452
14453 if (arg->flags & F_ALT)
14454 dtoa_flags = Py_DTSF_ALT;
14455 else
14456 dtoa_flags = 0;
14457 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14458 if (p == NULL)
14459 return -1;
14460 len = strlen(p);
14461 if (writer) {
14462 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14463 PyMem_Free(p);
14464 return -1;
14465 }
14466 }
14467 else
14468 *p_output = _PyUnicode_FromASCII(p, len);
14469 PyMem_Free(p);
14470 return 0;
14471 }
14472
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X
 * for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 *     The base prefix is present only for o, x and X conversions, and only
 *     when `alt` is nonzero.  For X, both the prefix and the hex digits are
 *     upper-cased.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must satisfy PyLong_Check()
 * alt          nonzero to keep the base prefix (the F_ALT flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; d, i and u all format in base 10
 *
 * CAUTION:  o, x and X conversions produce a '-' sign for negative values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* PyNumber_ToBase() output starts with a 2-character base prefix */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the prefix by advancing the start of the text; the sign,
           if any, is rewritten just in front of the digits. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: sign/prefix,
           then the zeros, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* `result` is a bytes object after zero-padding, and `buf` no longer
       points at the start of the string data after prefix removal; in both
       cases a fresh str is built from the text at `buf`. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14618
14619 /* Format an integer or a float as an integer.
14620 * Return 1 if the number has been formatted into the writer,
14621 * 0 if the number has been formatted into *p_output
14622 * -1 and raise an exception on error */
14623 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14624 mainformatlong(PyObject *v,
14625 struct unicode_format_arg_t *arg,
14626 PyObject **p_output,
14627 _PyUnicodeWriter *writer)
14628 {
14629 PyObject *iobj, *res;
14630 char type = (char)arg->ch;
14631
14632 if (!PyNumber_Check(v))
14633 goto wrongtype;
14634
14635 /* make sure number is a type of integer for o, x, and X */
14636 if (!PyLong_Check(v)) {
14637 if (type == 'o' || type == 'x' || type == 'X') {
14638 iobj = PyNumber_Index(v);
14639 if (iobj == NULL) {
14640 if (PyErr_ExceptionMatches(PyExc_TypeError))
14641 goto wrongtype;
14642 return -1;
14643 }
14644 }
14645 else {
14646 iobj = PyNumber_Long(v);
14647 if (iobj == NULL ) {
14648 if (PyErr_ExceptionMatches(PyExc_TypeError))
14649 goto wrongtype;
14650 return -1;
14651 }
14652 }
14653 assert(PyLong_Check(iobj));
14654 }
14655 else {
14656 iobj = v;
14657 Py_INCREF(iobj);
14658 }
14659
14660 if (PyLong_CheckExact(v)
14661 && arg->width == -1 && arg->prec == -1
14662 && !(arg->flags & (F_SIGN | F_BLANK))
14663 && type != 'X')
14664 {
14665 /* Fast path */
14666 int alternate = arg->flags & F_ALT;
14667 int base;
14668
14669 switch(type)
14670 {
14671 default:
14672 Py_UNREACHABLE();
14673 case 'd':
14674 case 'i':
14675 case 'u':
14676 base = 10;
14677 break;
14678 case 'o':
14679 base = 8;
14680 break;
14681 case 'x':
14682 case 'X':
14683 base = 16;
14684 break;
14685 }
14686
14687 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14688 Py_DECREF(iobj);
14689 return -1;
14690 }
14691 Py_DECREF(iobj);
14692 return 1;
14693 }
14694
14695 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14696 Py_DECREF(iobj);
14697 if (res == NULL)
14698 return -1;
14699 *p_output = res;
14700 return 0;
14701
14702 wrongtype:
14703 switch(type)
14704 {
14705 case 'o':
14706 case 'x':
14707 case 'X':
14708 PyErr_Format(PyExc_TypeError,
14709 "%%%c format: an integer is required, "
14710 "not %.200s",
14711 type, Py_TYPE(v)->tp_name);
14712 break;
14713 default:
14714 PyErr_Format(PyExc_TypeError,
14715 "%%%c format: a number is required, "
14716 "not %.200s",
14717 type, Py_TYPE(v)->tp_name);
14718 break;
14719 }
14720 return -1;
14721 }
14722
14723 static Py_UCS4
formatchar(PyObject * v)14724 formatchar(PyObject *v)
14725 {
14726 /* presume that the buffer is at least 3 characters long */
14727 if (PyUnicode_Check(v)) {
14728 if (PyUnicode_GET_LENGTH(v) == 1) {
14729 return PyUnicode_READ_CHAR(v, 0);
14730 }
14731 goto onError;
14732 }
14733 else {
14734 PyObject *iobj;
14735 long x;
14736 /* make sure number is a type of integer */
14737 if (!PyLong_Check(v)) {
14738 iobj = PyNumber_Index(v);
14739 if (iobj == NULL) {
14740 goto onError;
14741 }
14742 x = PyLong_AsLong(iobj);
14743 Py_DECREF(iobj);
14744 }
14745 else {
14746 x = PyLong_AsLong(v);
14747 }
14748 if (x == -1 && PyErr_Occurred())
14749 goto onError;
14750
14751 if (x < 0 || x > MAX_UNICODE) {
14752 PyErr_SetString(PyExc_OverflowError,
14753 "%c arg not in range(0x110000)");
14754 return (Py_UCS4) -1;
14755 }
14756
14757 return (Py_UCS4) x;
14758 }
14759
14760 onError:
14761 PyErr_SetString(PyExc_TypeError,
14762 "%c requires int or char");
14763 return (Py_UCS4) -1;
14764 }
14765
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   On entry arg->ch holds the first character after '%'.  The parsed
   options are stored in arg->flags, arg->width and arg->prec, and arg->ch
   is left holding the conversion character.  For "%(name)", ctx->args is
   replaced by the value looked up in ctx->dict.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')'; exclude it from the key */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Release the value of a previous "%(name)" lookup, if any. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* arglen < 0 makes unicode_format_getnextarg() return ctx->args
           itself; argidx = -2 allows exactly one such fetch before the
           arguments count as exhausted. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* Width taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify within |width|. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." means precision 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* Precision taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is clamped to 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* The format string ended before a conversion character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14944
14945 /* Format one argument. Supported conversion specifiers:
14946
14947 - "s", "r", "a": any type
14948 - "i", "d", "u": int or float
14949 - "o", "x", "X": int
14950 - "e", "E", "f", "F", "g", "G": float
14951 - "c": int or str (1 character)
14952
14953 When possible, the output is written directly into the Unicode writer
14954 (ctx->writer). A string is created when padding is required.
14955
14956 Return 0 if the argument has been formatted into *p_str,
14957 1 if the argument has been written into ctx->writer,
14958 -1 on error. */
14959 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14960 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14961 struct unicode_format_arg_t *arg,
14962 PyObject **p_str)
14963 {
14964 PyObject *v;
14965 _PyUnicodeWriter *writer = &ctx->writer;
14966
14967 if (ctx->fmtcnt == 0)
14968 ctx->writer.overallocate = 0;
14969
14970 v = unicode_format_getnextarg(ctx);
14971 if (v == NULL)
14972 return -1;
14973
14974
14975 switch (arg->ch) {
14976 case 's':
14977 case 'r':
14978 case 'a':
14979 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14980 /* Fast path */
14981 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14982 return -1;
14983 return 1;
14984 }
14985
14986 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14987 *p_str = v;
14988 Py_INCREF(*p_str);
14989 }
14990 else {
14991 if (arg->ch == 's')
14992 *p_str = PyObject_Str(v);
14993 else if (arg->ch == 'r')
14994 *p_str = PyObject_Repr(v);
14995 else
14996 *p_str = PyObject_ASCII(v);
14997 }
14998 break;
14999
15000 case 'i':
15001 case 'd':
15002 case 'u':
15003 case 'o':
15004 case 'x':
15005 case 'X':
15006 {
15007 int ret = mainformatlong(v, arg, p_str, writer);
15008 if (ret != 0)
15009 return ret;
15010 arg->sign = 1;
15011 break;
15012 }
15013
15014 case 'e':
15015 case 'E':
15016 case 'f':
15017 case 'F':
15018 case 'g':
15019 case 'G':
15020 if (arg->width == -1 && arg->prec == -1
15021 && !(arg->flags & (F_SIGN | F_BLANK)))
15022 {
15023 /* Fast path */
15024 if (formatfloat(v, arg, NULL, writer) == -1)
15025 return -1;
15026 return 1;
15027 }
15028
15029 arg->sign = 1;
15030 if (formatfloat(v, arg, p_str, NULL) == -1)
15031 return -1;
15032 break;
15033
15034 case 'c':
15035 {
15036 Py_UCS4 ch = formatchar(v);
15037 if (ch == (Py_UCS4) -1)
15038 return -1;
15039 if (arg->width == -1 && arg->prec == -1) {
15040 /* Fast path */
15041 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15042 return -1;
15043 return 1;
15044 }
15045 *p_str = PyUnicode_FromOrdinal(ch);
15046 break;
15047 }
15048
15049 default:
15050 PyErr_Format(PyExc_ValueError,
15051 "unsupported format character '%c' (0x%x) "
15052 "at index %zd",
15053 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15054 (int)arg->ch,
15055 ctx->fmtpos - 1);
15056 return -1;
15057 }
15058 if (*p_str == NULL)
15059 return -1;
15060 assert (PyUnicode_Check(*p_str));
15061 return 0;
15062 }
15063
/* Write the already formatted string `str` into ctx->writer, applying the
   width, precision, sign and fill rules stored in `arg`.

   The fill character is '0' only for numeric conversions (arg->sign set)
   with the F_ZERO flag; otherwise it is a space.  With '0' filling, the
   sign and the "0x"/"0X"/"0o" prefix are written before the padding; with
   space filling they are written after it.  Right-hand padding always uses
   spaces.

   Note that arg->width and arg->sign are modified along the way.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        /* No padding, truncation or sign handling can apply: copy as is. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately; skip it in the source. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        /* Left padding will be written, so the fill character counts. */
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* Room for the sign when the text alone already fills the width. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is accounted for separately from the digits. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding done: no right padding will follow. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15218
15219 /* Helper of PyUnicode_Format(): format one arg.
15220 Return 0 on success, raise an exception and return -1 on error. */
15221 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15222 unicode_format_arg(struct unicode_formatter_t *ctx)
15223 {
15224 struct unicode_format_arg_t arg;
15225 PyObject *str;
15226 int ret;
15227
15228 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15229 if (arg.ch == '%') {
15230 ctx->fmtpos++;
15231 ctx->fmtcnt--;
15232 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15233 return -1;
15234 return 0;
15235 }
15236 arg.flags = 0;
15237 arg.width = -1;
15238 arg.prec = -1;
15239 arg.sign = 0;
15240 str = NULL;
15241
15242 ret = unicode_format_arg_parse(ctx, &arg);
15243 if (ret == -1)
15244 return -1;
15245
15246 ret = unicode_format_arg_format(ctx, &arg, &str);
15247 if (ret == -1)
15248 return -1;
15249
15250 if (ret != 1) {
15251 ret = unicode_format_arg_output(ctx, &arg, str);
15252 Py_DECREF(str);
15253 if (ret == -1)
15254 return -1;
15255 }
15256
15257 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15258 PyErr_SetString(PyExc_TypeError,
15259 "not all arguments converted during string formatting");
15260 return -1;
15261 }
15262 return 0;
15263 }
15264
15265 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15266 PyUnicode_Format(PyObject *format, PyObject *args)
15267 {
15268 struct unicode_formatter_t ctx;
15269
15270 if (format == NULL || args == NULL) {
15271 PyErr_BadInternalCall();
15272 return NULL;
15273 }
15274
15275 if (ensure_unicode(format) < 0)
15276 return NULL;
15277
15278 ctx.fmtstr = format;
15279 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15280 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15281 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15282 ctx.fmtpos = 0;
15283
15284 _PyUnicodeWriter_Init(&ctx.writer);
15285 ctx.writer.min_length = ctx.fmtcnt + 100;
15286 ctx.writer.overallocate = 1;
15287
15288 if (PyTuple_Check(args)) {
15289 ctx.arglen = PyTuple_Size(args);
15290 ctx.argidx = 0;
15291 }
15292 else {
15293 ctx.arglen = -1;
15294 ctx.argidx = -2;
15295 }
15296 ctx.args_owned = 0;
15297 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15298 ctx.dict = args;
15299 else
15300 ctx.dict = NULL;
15301 ctx.args = args;
15302
15303 while (--ctx.fmtcnt >= 0) {
15304 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15305 Py_ssize_t nonfmtpos;
15306
15307 nonfmtpos = ctx.fmtpos++;
15308 while (ctx.fmtcnt >= 0 &&
15309 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15310 ctx.fmtpos++;
15311 ctx.fmtcnt--;
15312 }
15313 if (ctx.fmtcnt < 0) {
15314 ctx.fmtpos--;
15315 ctx.writer.overallocate = 0;
15316 }
15317
15318 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15319 nonfmtpos, ctx.fmtpos) < 0)
15320 goto onError;
15321 }
15322 else {
15323 ctx.fmtpos++;
15324 if (unicode_format_arg(&ctx) == -1)
15325 goto onError;
15326 }
15327 }
15328
15329 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15330 PyErr_SetString(PyExc_TypeError,
15331 "not all arguments converted during string formatting");
15332 goto onError;
15333 }
15334
15335 if (ctx.args_owned) {
15336 Py_DECREF(ctx.args);
15337 }
15338 return _PyUnicodeWriter_Finish(&ctx.writer);
15339
15340 onError:
15341 _PyUnicodeWriter_Dealloc(&ctx.writer);
15342 if (ctx.args_owned) {
15343 Py_DECREF(ctx.args);
15344 }
15345 return NULL;
15346 }
15347
15348 static PyObject *
15349 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15350
15351 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15352 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15353 {
15354 PyObject *x = NULL;
15355 static char *kwlist[] = {"object", "encoding", "errors", 0};
15356 char *encoding = NULL;
15357 char *errors = NULL;
15358
15359 if (type != &PyUnicode_Type)
15360 return unicode_subtype_new(type, args, kwds);
15361 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15362 kwlist, &x, &encoding, &errors))
15363 return NULL;
15364 if (x == NULL)
15365 _Py_RETURN_UNICODE_EMPTY();
15366 if (encoding == NULL && errors == NULL)
15367 return PyObject_Str(x);
15368 else
15369 return PyUnicode_FromEncodedObject(x, encoding, errors);
15370 }
15371
/* Create an instance of a str subclass.

   A temporary exact str is built first with unicode_new(); a fresh object of
   `type` is then allocated and given its own copy of the character data.

   Return a new reference, or NULL with an exception set. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    /* Build the value as an exact str first. */
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    /* Initialise the header of the new object.  It is marked non-compact
       and ready; its data pointers start out NULL and are filled in once
       the buffer has been allocated below. */
    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* In debug builds the hash is only copied at the end, after the
       consistency check. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Pick the per-character size and decide whether the single data buffer
       can also be used as the UTF-8 or the wchar_t representation. */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        /* All characters below 128: the buffer doubles as UTF-8. */
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* One extra character of room: the copy below includes the element
       that follows the last character. */
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* `kind` is used here as the byte width of one character; it matches
       the char_size selected above (1, 2 or 4). */
    memcpy(data, PyUnicode_DATA(unicode),
           kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    /* Drop both the temporary str and the partially initialised object. */
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}
15471
/* Docstring of the str type (installed as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15483
/* Defined further down, after the iterator type. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in "str" type.  Slots set to 0 are left
   unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15529
15530 /* Initialize the Unicode implementation */
15531
15532 PyStatus
_PyUnicode_Init(void)15533 _PyUnicode_Init(void)
15534 {
15535 /* XXX - move this array to unicodectype.c ? */
15536 Py_UCS2 linebreak[] = {
15537 0x000A, /* LINE FEED */
15538 0x000D, /* CARRIAGE RETURN */
15539 0x001C, /* FILE SEPARATOR */
15540 0x001D, /* GROUP SEPARATOR */
15541 0x001E, /* RECORD SEPARATOR */
15542 0x0085, /* NEXT LINE */
15543 0x2028, /* LINE SEPARATOR */
15544 0x2029, /* PARAGRAPH SEPARATOR */
15545 };
15546
15547 /* Init the implementation */
15548 _Py_INCREF_UNICODE_EMPTY();
15549 if (!unicode_empty) {
15550 return _PyStatus_ERR("Can't create empty string");
15551 }
15552 Py_DECREF(unicode_empty);
15553
15554 if (PyType_Ready(&PyUnicode_Type) < 0) {
15555 return _PyStatus_ERR("Can't initialize unicode type");
15556 }
15557
15558 /* initialize the linebreak bloom filter */
15559 bloom_linebreak = make_bloom_mask(
15560 PyUnicode_2BYTE_KIND, linebreak,
15561 Py_ARRAY_LENGTH(linebreak));
15562
15563 if (PyType_Ready(&EncodingMapType) < 0) {
15564 return _PyStatus_ERR("Can't initialize encoding map type");
15565 }
15566 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15567 return _PyStatus_ERR("Can't initialize field name iterator type");
15568 }
15569 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15570 return _PyStatus_ERR("Can't initialize formatter iter type");
15571 }
15572 return _PyStatus_OK();
15573 }
15574
15575
15576 void
PyUnicode_InternInPlace(PyObject ** p)15577 PyUnicode_InternInPlace(PyObject **p)
15578 {
15579 PyObject *s = *p;
15580 #ifdef Py_DEBUG
15581 assert(s != NULL);
15582 assert(_PyUnicode_CHECK(s));
15583 #else
15584 if (s == NULL || !PyUnicode_Check(s)) {
15585 return;
15586 }
15587 #endif
15588
15589 /* If it's a subclass, we don't really know what putting
15590 it in the interned dict might do. */
15591 if (!PyUnicode_CheckExact(s)) {
15592 return;
15593 }
15594
15595 if (PyUnicode_CHECK_INTERNED(s)) {
15596 return;
15597 }
15598
15599 #ifdef INTERNED_STRINGS
15600 if (interned == NULL) {
15601 interned = PyDict_New();
15602 if (interned == NULL) {
15603 PyErr_Clear(); /* Don't leave an exception */
15604 return;
15605 }
15606 }
15607
15608 PyObject *t;
15609 t = PyDict_SetDefault(interned, s, s);
15610
15611 if (t == NULL) {
15612 PyErr_Clear();
15613 return;
15614 }
15615
15616 if (t != s) {
15617 Py_INCREF(t);
15618 Py_SETREF(*p, t);
15619 return;
15620 }
15621
15622 /* The two references in interned are not counted by refcnt.
15623 The deallocator will take care of this */
15624 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15625 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15626 #endif
15627 }
15628
15629 void
PyUnicode_InternImmortal(PyObject ** p)15630 PyUnicode_InternImmortal(PyObject **p)
15631 {
15632 PyUnicode_InternInPlace(p);
15633 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15634 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15635 Py_INCREF(*p);
15636 }
15637 }
15638
15639 PyObject *
PyUnicode_InternFromString(const char * cp)15640 PyUnicode_InternFromString(const char *cp)
15641 {
15642 PyObject *s = PyUnicode_FromString(cp);
15643 if (s == NULL)
15644 return NULL;
15645 PyUnicode_InternInPlace(&s);
15646 return s;
15647 }
15648
15649
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that memory checkers see balanced reference
   counts at exit.

   Every key of the `interned` dict gets back the references that interning
   left uncounted (one for an immortal string, two for a mortal one) and is
   flagged SSTATE_NOT_INTERNED; the dict is then cleared and released.
   Does nothing if the dict does not exist. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Do not leak the result if it unexpectedly is not a list. */
        Py_XDECREF(keys);
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* The count is adjusted through Py_SET_REFCNT(), as in
           PyUnicode_InternInPlace(), rather than by assigning to
           Py_REFCNT(), which only works while Py_REFCNT() is an lvalue. */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15710
15711
15712 /********************* Unicode Iterator **************************/
15713
/* State of an iterator over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;   /* index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15719
/* tp_dealloc of the str iterator: stop GC tracking, release the reference
   to the iterated string (if still held) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15727
/* tp_traverse of the str iterator: the iterated string is the only object
   reference it holds. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15734
15735 static PyObject *
unicodeiter_next(unicodeiterobject * it)15736 unicodeiter_next(unicodeiterobject *it)
15737 {
15738 PyObject *seq, *item;
15739
15740 assert(it != NULL);
15741 seq = it->it_seq;
15742 if (seq == NULL)
15743 return NULL;
15744 assert(_PyUnicode_CHECK(seq));
15745
15746 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15747 int kind = PyUnicode_KIND(seq);
15748 const void *data = PyUnicode_DATA(seq);
15749 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15750 item = PyUnicode_FromOrdinal(chr);
15751 if (item != NULL)
15752 ++it->it_index;
15753 return item;
15754 }
15755
15756 it->it_seq = NULL;
15757 Py_DECREF(seq);
15758 return NULL;
15759 }
15760
15761 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15762 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15763 {
15764 Py_ssize_t len = 0;
15765 if (it->it_seq)
15766 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15767 return PyLong_FromSsize_t(len);
15768 }
15769
15770 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15771
15772 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15773 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15774 {
15775 _Py_IDENTIFIER(iter);
15776 if (it->it_seq != NULL) {
15777 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15778 it->it_seq, it->it_index);
15779 } else {
15780 PyObject *u = (PyObject *)_PyUnicode_New(0);
15781 if (u == NULL)
15782 return NULL;
15783 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15784 }
15785 }
15786
15787 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15788
15789 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15790 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15791 {
15792 Py_ssize_t index = PyLong_AsSsize_t(state);
15793 if (index == -1 && PyErr_Occurred())
15794 return NULL;
15795 if (it->it_seq != NULL) {
15796 if (index < 0)
15797 index = 0;
15798 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15799 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15800 it->it_index = index;
15801 }
15802 Py_RETURN_NONE;
15803 }
15804
15805 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15806
/* Methods of the str iterator: length hint and pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15816
/* Type object of "str_iterator", the iterator created by unicode_iter().
   Instances take part in garbage collection (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_vectorcall_offset */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_as_async */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15849
15850 static PyObject *
unicode_iter(PyObject * seq)15851 unicode_iter(PyObject *seq)
15852 {
15853 unicodeiterobject *it;
15854
15855 if (!PyUnicode_Check(seq)) {
15856 PyErr_BadInternalCall();
15857 return NULL;
15858 }
15859 if (PyUnicode_READY(seq) == -1)
15860 return NULL;
15861 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15862 if (it == NULL)
15863 return NULL;
15864 it->it_index = 0;
15865 Py_INCREF(seq);
15866 it->it_seq = seq;
15867 _PyObject_GC_TRACK(it);
15868 return (PyObject *)it;
15869 }
15870
15871
/* Return the number of Py_UNICODE elements before the terminating null
   element of `u`; a thin wrapper around wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    return wcslen(u);
}
15877
/* Copy the null-terminated string `s2`, terminator included, into `s1` and
   return `s1`.  The destination must be large enough; no bound is
   checked. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15885
/* Copy at most `n` elements of the null-terminated string `s2` into `s1`
   and return `s1`.

   Copying stops after the terminating null element has been copied or after
   `n` elements, whichever comes first.  As with the C strncpy(), the result
   is not null-terminated when `s2` holds `n` or more characters; unlike
   strncpy(), the remainder of `s1` is not padded with null elements.

   The count is tested before every store, so no more than `n` elements are
   ever written (testing it after the store wrote up to n + 1 elements, and
   one element even for n == 0). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;
    while (n-- != 0) {
        if ((*dst++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15895
/* Append the null-terminated string `s2`, terminator included, to the end
   of `s1` and return `s1`.  The destination must be large enough; no bound
   is checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *tail = s1 + wcslen(s1);
    for (;;) {
        Py_UNICODE ch = *s2++;
        *tail++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15904
/* Compare two null-terminated Py_UNICODE strings.

   Return 0 if they are equal.  Otherwise, at the first position where they
   differ: -1 if `s1` has ended there, 1 if `s2` has ended there, and else
   -1 or 1 according to whether the element of `s1` is less than that of
   `s2`. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix.  (*s1 != 0 && *s1 == *s2 implies *s2 != 0.) */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    Py_UNICODE a = *s1;
    Py_UNICODE b = *s2;
    if (a == 0) {
        return (b == 0) ? 0 : -1;
    }
    if (b == 0) {
        return 1;
    }
    return (a < b) ? -1 : +1;
}
15918
/* Compare at most the first `n` elements of two null-terminated Py_UNICODE
   strings.

   Return -1 or 1 at the first differing element (according to whether the
   element of `s1` is less than that of `s2`), and 0 if both strings end
   before differing or no difference is found within `n` elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];
        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15935
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminating null element is
   not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15945
/* Return a pointer to the last occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminating null element is
   not searched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    /* Walk backwards from the element just before the terminator. */
    for (size_t i = wcslen(s); i-- > 0; ) {
        if (s[i] == c) {
            return (Py_UNICODE*)(s + i);
        }
    }
    return NULL;
}
15958
15959 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15960 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15961 {
15962 Py_UNICODE *u, *copy;
15963 Py_ssize_t len, size;
15964
15965 if (!PyUnicode_Check(unicode)) {
15966 PyErr_BadArgument();
15967 return NULL;
15968 }
15969 _Py_COMP_DIAG_PUSH
15970 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
15971 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15972 _Py_COMP_DIAG_POP
15973 if (u == NULL)
15974 return NULL;
15975 /* Ensure we won't overflow the size. */
15976 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15977 PyErr_NoMemory();
15978 return NULL;
15979 }
15980 size = len + 1; /* copy the null character */
15981 size *= sizeof(Py_UNICODE);
15982 copy = PyMem_Malloc(size);
15983 if (copy == NULL) {
15984 PyErr_NoMemory();
15985 return NULL;
15986 }
15987 memcpy(copy, u, size);
15988 return copy;
15989 }
15990
15991
15992 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15993 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15994 {
15995 int res;
15996 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15997 if (res == -2) {
15998 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15999 return -1;
16000 }
16001 if (res < 0) {
16002 PyErr_NoMemory();
16003 return -1;
16004 }
16005 return 0;
16006 }
16007
16008
16009 static int
config_get_codec_name(wchar_t ** config_encoding)16010 config_get_codec_name(wchar_t **config_encoding)
16011 {
16012 char *encoding;
16013 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16014 return -1;
16015 }
16016
16017 PyObject *name_obj = NULL;
16018 PyObject *codec = _PyCodec_Lookup(encoding);
16019 PyMem_RawFree(encoding);
16020
16021 if (!codec)
16022 goto error;
16023
16024 name_obj = PyObject_GetAttrString(codec, "name");
16025 Py_CLEAR(codec);
16026 if (!name_obj) {
16027 goto error;
16028 }
16029
16030 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16031 Py_DECREF(name_obj);
16032 if (wname == NULL) {
16033 goto error;
16034 }
16035
16036 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16037 if (raw_wname == NULL) {
16038 PyMem_Free(wname);
16039 PyErr_NoMemory();
16040 goto error;
16041 }
16042
16043 PyMem_RawFree(*config_encoding);
16044 *config_encoding = raw_wname;
16045
16046 PyMem_Free(wname);
16047 return 0;
16048
16049 error:
16050 Py_XDECREF(codec);
16051 Py_XDECREF(name_obj);
16052 return -1;
16053 }
16054
16055
16056 static PyStatus
init_stdio_encoding(PyThreadState * tstate)16057 init_stdio_encoding(PyThreadState *tstate)
16058 {
16059 /* Update the stdio encoding to the normalized Python codec name. */
16060 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
16061 if (config_get_codec_name(&config->stdio_encoding) < 0) {
16062 return _PyStatus_ERR("failed to get the Python codec name "
16063 "of the stdio encoding");
16064 }
16065 return _PyStatus_OK();
16066 }
16067
16068
16069 static int
init_fs_codec(PyInterpreterState * interp)16070 init_fs_codec(PyInterpreterState *interp)
16071 {
16072 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16073
16074 _Py_error_handler error_handler;
16075 error_handler = get_error_handler_wide(config->filesystem_errors);
16076 if (error_handler == _Py_ERROR_UNKNOWN) {
16077 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16078 return -1;
16079 }
16080
16081 char *encoding, *errors;
16082 if (encode_wstr_utf8(config->filesystem_encoding,
16083 &encoding,
16084 "filesystem_encoding") < 0) {
16085 return -1;
16086 }
16087
16088 if (encode_wstr_utf8(config->filesystem_errors,
16089 &errors,
16090 "filesystem_errors") < 0) {
16091 PyMem_RawFree(encoding);
16092 return -1;
16093 }
16094
16095 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16096 PyMem_RawFree(fs_codec->encoding);
16097 fs_codec->encoding = encoding;
16098 /* encoding has been normalized by init_fs_encoding() */
16099 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16100 PyMem_RawFree(fs_codec->errors);
16101 fs_codec->errors = errors;
16102 fs_codec->error_handler = error_handler;
16103
16104 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16105 assert(fs_codec->utf8 == 1);
16106 #endif
16107
16108 /* At this point, PyUnicode_EncodeFSDefault() and
16109 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16110 the C implementation of the filesystem encoding. */
16111
16112 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16113 global configuration variables. */
16114 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16115 fs_codec->errors) < 0) {
16116 PyErr_NoMemory();
16117 return -1;
16118 }
16119 return 0;
16120 }
16121
16122
16123 static PyStatus
init_fs_encoding(PyThreadState * tstate)16124 init_fs_encoding(PyThreadState *tstate)
16125 {
16126 PyInterpreterState *interp = tstate->interp;
16127
16128 /* Update the filesystem encoding to the normalized Python codec name.
16129 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16130 (Python codec name). */
16131 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16132 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16133 _Py_DumpPathConfig(tstate);
16134 return _PyStatus_ERR("failed to get the Python codec "
16135 "of the filesystem encoding");
16136 }
16137
16138 if (init_fs_codec(interp) < 0) {
16139 return _PyStatus_ERR("cannot initialize filesystem codec");
16140 }
16141 return _PyStatus_OK();
16142 }
16143
16144
16145 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16146 _PyUnicode_InitEncodings(PyThreadState *tstate)
16147 {
16148 PyStatus status = init_fs_encoding(tstate);
16149 if (_PyStatus_EXCEPTION(status)) {
16150 return status;
16151 }
16152
16153 return init_stdio_encoding(tstate);
16154 }
16155
16156
16157 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16158 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16159 {
16160 PyMem_RawFree(fs_codec->encoding);
16161 fs_codec->encoding = NULL;
16162 fs_codec->utf8 = 0;
16163 PyMem_RawFree(fs_codec->errors);
16164 fs_codec->errors = NULL;
16165 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16166 }
16167
16168
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   "mbcs" with the "replace" error handler (PEP 529) and re-initialize the
   filesystem codec.

   Return the result of init_fs_codec(), or -1 with MemoryError set if the
   new strings cannot be allocated (the configuration is then left
   unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16194
16195
16196 void
_PyUnicode_Fini(PyThreadState * tstate)16197 _PyUnicode_Fini(PyThreadState *tstate)
16198 {
16199 if (_Py_IsMainInterpreter(tstate)) {
16200 #if defined(WITH_VALGRIND) || defined(__INSURE__)
16201 /* Insure++ is a memory analysis tool that aids in discovering
16202 * memory leaks and other memory problems. On Python exit, the
16203 * interned string dictionaries are flagged as being in use at exit
16204 * (which it is). Under normal circumstances, this is fine because
16205 * the memory will be automatically reclaimed by the system. Under
16206 * memory debugging, it's a huge source of useless noise, so we
16207 * trade off slower shutdown for less distraction in the memory
16208 * reports. -baw
16209 */
16210 unicode_release_interned();
16211 #endif /* __INSURE__ */
16212
16213 Py_CLEAR(unicode_empty);
16214
16215 #ifdef LATIN1_SINGLETONS
16216 for (Py_ssize_t i = 0; i < 256; i++) {
16217 Py_CLEAR(unicode_latin1[i]);
16218 }
16219 #endif
16220 unicode_clear_static_strings();
16221 }
16222
16223 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
16224 }
16225
16226
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions of the _string module; each one takes a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
16237
/* Definition of the _string module. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
16249
/* Module initialization function of _string: create the module object from
   _string_module and return it (NULL on failure). */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
16255
16256
16257 #ifdef __cplusplus
16258 }
16259 #endif
16260