1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h" // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
46 #include "pycore_format.h" // F_LJUST
47 #include "pycore_initconfig.h" // _PyStatus_OK()
48 #include "pycore_interp.h" // PyInterpreterState.fs_codec
49 #include "pycore_object.h" // _PyObject_GC_TRACK()
50 #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52 #include "pycore_pystate.h" // _PyInterpreterState_GET()
53 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54 #include "stringlib/eq.h" // unicode_eq()
55
56 #ifdef MS_WINDOWS
57 #include <windows.h>
58 #endif
59
60 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61 #include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62 #endif
63
64 /* Uncomment to display statistics on interned strings at exit
65 in _PyUnicode_ClearInterned(). */
66 /* #define INTERNED_STATS 1 */
67
68
69 /*[clinic input]
70 class str "PyObject *" "&PyUnicode_Type"
71 [clinic start generated code]*/
72 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
74 /*[python input]
75 class Py_UCS4_converter(CConverter):
76 type = 'Py_UCS4'
77 converter = 'convert_uc'
78
79 def converter_init(self):
80 if self.default is not unspecified:
81 self.c_default = ascii(self.default)
82 if len(self.c_default) > 4 or self.c_default[0] != "'":
83 self.c_default = hex(ord(self.default))
84
85 [python start generated code]*/
86 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
87
88 /* --- Globals ------------------------------------------------------------
89
90 NOTE: In the interpreter's initialization phase, some globals are currently
91 initialized dynamically as needed. In the process Unicode objects may
92 be created before the Unicode type is ready.
93
94 */
95
96
97 #ifdef __cplusplus
98 extern "C" {
99 #endif
100
// Maximum code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal).
// fileutils.c uses the same value; the two must be kept identical.
#define MAX_UNICODE 0x10ffff

// _PyUnicode_CHECK(op): sanity check used by the assertions in this file.
// Debug builds call _PyUnicode_CheckConsistency() with check_content=0;
// other builds only call PyUnicode_Check().
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
110
/* Unchecked access to the utf8 field of the PyCompactUnicodeObject header.
   Expands to an lvalue. */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 pointer of a ready string: for a compact ASCII object it is the
   memory located right after the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                      \
         _PyUnicode_UTF8(op) :                              \
         ((char *)((PyASCIIObject *)(op) + 1)))

/* Unchecked access to the utf8_length field. Expands to an lvalue. */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the length field for a compact ASCII
   object, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                      \
         _PyUnicode_UTF8_LENGTH(op) :                       \
         ((PyASCIIObject *)(op))->length)

/* Unchecked access to the wstr field. Expands to an lvalue. */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject *)(op))->wstr)
129
/* Don't use the deprecated macro of unicodeobject.h: it is replaced here by
   a local definition.

   Evaluates to the length field for a compact ASCII object and to the
   wstr_length field otherwise.

   The argument is parenthesized at every use so that an expression argument
   (for example "base + 1") is converted as a whole by the casts instead of
   having the cast bind to its first operand only. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors: no assertion is made on op and each macro
   expands to an lvalue. */
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Same fields as state.kind and length, but _PyUnicode_CHECK(op) is asserted
   first. */
#define _PyUnicode_KIND(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject *)(op))->data.any)
152
/* Local replacement for PyUnicode_READY(): evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     (!PyUnicode_IS_READY(op) ?                     \
          _PyUnicode_Ready(op) :                    \
          0))
159
/* True if the utf8 pointer of op is the character data itself, i.e. there is
   no separate UTF-8 buffer. Must not be used on a compact ASCII object (this
   is asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character data itself.
   Both sides of the comparison use the macro argument: the macro must not
   depend on the caller happening to have a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* True if the Unicode object owns a UTF-8 memory block of its own: the
   object is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data (not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (!PyUnicode_IS_COMPACT_ASCII(op)                        \
     && _PyUnicode_UTF8(op) != NULL                         \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr memory block of its own: the wstr
   pointer is set and it is not the character data of a ready string (not
   shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (_PyUnicode_WSTR(op) != NULL                            \
     && !(PyUnicode_IS_READY(op)                            \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
181
/* Generic helper macro converting a run of characters from one character
   type to another.

   from_type and to_type must be valid type names. begin and end delimit the
   source characters and should be of type "from_type *". to should be of
   type "to_type *" and points to the buffer receiving the converted
   characters.

   Characters are first copied in groups of four, then the remaining ones
   (at most three) are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _total = _end - _iter;                               \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_total, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
205
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
213
214 /* bpo-40521: Interned strings are shared by all interpreters. */
215 #ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
216 # define INTERNED_STRINGS
217 #endif
218
219 /* This dictionary holds all interned unicode strings. Note that references
220 to strings in this dictionary are *not* counted in the string's ob_refcnt.
221 When the interned string reaches a refcnt of 0 the string deallocation
222 function will delete the reference from this dictionary.
223
224 Another way to look at this is that to say that the actual reference
225 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
226 */
227 #ifdef INTERNED_STRINGS
228 static PyObject *interned = NULL;
229 #endif
230
231 static struct _Py_unicode_state*
get_unicode_state(void)232 get_unicode_state(void)
233 {
234 PyInterpreterState *interp = _PyInterpreterState_GET();
235 return &interp->unicode;
236 }
237
238
239 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)240 static inline PyObject* unicode_get_empty(void)
241 {
242 struct _Py_unicode_state *state = get_unicode_state();
243 // unicode_get_empty() must not be called before _PyUnicode_Init()
244 // or after _PyUnicode_Fini()
245 assert(state->empty_string != NULL);
246 return state->empty_string;
247 }
248
249
250 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)251 static inline PyObject* unicode_new_empty(void)
252 {
253 PyObject *empty = unicode_get_empty();
254 Py_INCREF(empty);
255 return empty;
256 }
257
258 #define _Py_RETURN_UNICODE_EMPTY() \
259 do { \
260 return unicode_new_empty(); \
261 } while (0)
262
263 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)264 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
265 Py_ssize_t start, Py_ssize_t length)
266 {
267 assert(0 <= start);
268 assert(kind != PyUnicode_WCHAR_KIND);
269 switch (kind) {
270 case PyUnicode_1BYTE_KIND: {
271 assert(value <= 0xff);
272 Py_UCS1 ch = (unsigned char)value;
273 Py_UCS1 *to = (Py_UCS1 *)data + start;
274 memset(to, ch, length);
275 break;
276 }
277 case PyUnicode_2BYTE_KIND: {
278 assert(value <= 0xffff);
279 Py_UCS2 ch = (Py_UCS2)value;
280 Py_UCS2 *to = (Py_UCS2 *)data + start;
281 const Py_UCS2 *end = to + length;
282 for (; to < end; ++to) *to = ch;
283 break;
284 }
285 case PyUnicode_4BYTE_KIND: {
286 assert(value <= MAX_UNICODE);
287 Py_UCS4 ch = value;
288 Py_UCS4 * to = (Py_UCS4 *)data + start;
289 const Py_UCS4 *end = to + length;
290 for (; to < end; ++to) *to = ch;
291 break;
292 }
293 default: Py_UNREACHABLE();
294 }
295 }
296
297
298 /* Forward declaration */
299 static inline int
300 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
301 static inline void
302 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
303 static PyObject *
304 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
305 const char *errors);
306 static PyObject *
307 unicode_decode_utf8(const char *s, Py_ssize_t size,
308 _Py_error_handler error_handler, const char *errors,
309 Py_ssize_t *consumed);
310
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit character code (128 entries): an entry is
   1 for the characters listed below and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
341
342 /* forward */
343 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
344 static PyObject* get_latin1_char(unsigned char ch);
345 static int unicode_modifiable(PyObject *unicode);
346
347
348 static PyObject *
349 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
350 static PyObject *
351 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
352 static PyObject *
353 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
354
355 static PyObject *
356 unicode_encode_call_errorhandler(const char *errors,
357 PyObject **errorHandler,const char *encoding, const char *reason,
358 PyObject *unicode, PyObject **exceptionObject,
359 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
360
361 static void
362 raise_encode_exception(PyObject **exceptionObject,
363 const char *encoding,
364 PyObject *unicode,
365 Py_ssize_t startpos, Py_ssize_t endpos,
366 const char *reason);
367
/* Fast detection of line break characters.

   Lookup table indexed by a 7-bit character code (128 entries): an entry is
   1 for the characters listed below and 0 for every other code. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
395
396 static int convert_uc(PyObject *obj, void *addr);
397
398 #include "clinic/unicodeobject.c.h"
399
400 _Py_error_handler
_Py_GetErrorHandler(const char * errors)401 _Py_GetErrorHandler(const char *errors)
402 {
403 if (errors == NULL || strcmp(errors, "strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (strcmp(errors, "surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (strcmp(errors, "replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (strcmp(errors, "ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (strcmp(errors, "backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (strcmp(errors, "surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (strcmp(errors, "xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425 }
426
427
428 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)429 get_error_handler_wide(const wchar_t *errors)
430 {
431 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
432 return _Py_ERROR_STRICT;
433 }
434 if (wcscmp(errors, L"surrogateescape") == 0) {
435 return _Py_ERROR_SURROGATEESCAPE;
436 }
437 if (wcscmp(errors, L"replace") == 0) {
438 return _Py_ERROR_REPLACE;
439 }
440 if (wcscmp(errors, L"ignore") == 0) {
441 return _Py_ERROR_IGNORE;
442 }
443 if (wcscmp(errors, L"backslashreplace") == 0) {
444 return _Py_ERROR_BACKSLASHREPLACE;
445 }
446 if (wcscmp(errors, L"surrogatepass") == 0) {
447 return _Py_ERROR_SURROGATEPASS;
448 }
449 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
450 return _Py_ERROR_XMLCHARREFREPLACE;
451 }
452 return _Py_ERROR_OTHER;
453 }
454
455
456 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)457 unicode_check_encoding_errors(const char *encoding, const char *errors)
458 {
459 if (encoding == NULL && errors == NULL) {
460 return 0;
461 }
462
463 PyInterpreterState *interp = _PyInterpreterState_GET();
464 #ifndef Py_DEBUG
465 /* In release mode, only check in development mode (-X dev) */
466 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
467 return 0;
468 }
469 #else
470 /* Always check in debug mode */
471 #endif
472
473 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
474 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
475 if (!interp->unicode.fs_codec.encoding) {
476 return 0;
477 }
478
479 /* Disable checks during Python finalization. For example, it allows to
480 call _PyObject_Dump() during finalization for debugging purpose. */
481 if (interp->finalizing) {
482 return 0;
483 }
484
485 if (encoding != NULL) {
486 PyObject *handler = _PyCodec_Lookup(encoding);
487 if (handler == NULL) {
488 return -1;
489 }
490 Py_DECREF(handler);
491 }
492
493 if (errors != NULL) {
494 PyObject *handler = PyCodec_LookupError(errors);
495 if (handler == NULL) {
496 return -1;
497 }
498 Py_DECREF(handler);
499 }
500 return 0;
501 }
502
503
504 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)505 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
506 {
507 #define CHECK(expr) \
508 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
509
510 PyASCIIObject *ascii;
511 unsigned int kind;
512
513 assert(op != NULL);
514 CHECK(PyUnicode_Check(op));
515
516 ascii = (PyASCIIObject *)op;
517 kind = ascii->state.kind;
518
519 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
520 CHECK(kind == PyUnicode_1BYTE_KIND);
521 CHECK(ascii->state.ready == 1);
522 }
523 else {
524 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
525 void *data;
526
527 if (ascii->state.compact == 1) {
528 data = compact + 1;
529 CHECK(kind == PyUnicode_1BYTE_KIND
530 || kind == PyUnicode_2BYTE_KIND
531 || kind == PyUnicode_4BYTE_KIND);
532 CHECK(ascii->state.ascii == 0);
533 CHECK(ascii->state.ready == 1);
534 CHECK(compact->utf8 != data);
535 }
536 else {
537 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
538
539 data = unicode->data.any;
540 if (kind == PyUnicode_WCHAR_KIND) {
541 CHECK(ascii->length == 0);
542 CHECK(ascii->hash == -1);
543 CHECK(ascii->state.compact == 0);
544 CHECK(ascii->state.ascii == 0);
545 CHECK(ascii->state.ready == 0);
546 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
547 CHECK(ascii->wstr != NULL);
548 CHECK(data == NULL);
549 CHECK(compact->utf8 == NULL);
550 }
551 else {
552 CHECK(kind == PyUnicode_1BYTE_KIND
553 || kind == PyUnicode_2BYTE_KIND
554 || kind == PyUnicode_4BYTE_KIND);
555 CHECK(ascii->state.compact == 0);
556 CHECK(ascii->state.ready == 1);
557 CHECK(data != NULL);
558 if (ascii->state.ascii) {
559 CHECK(compact->utf8 == data);
560 CHECK(compact->utf8_length == ascii->length);
561 }
562 else
563 CHECK(compact->utf8 != data);
564 }
565 }
566 if (kind != PyUnicode_WCHAR_KIND) {
567 if (
568 #if SIZEOF_WCHAR_T == 2
569 kind == PyUnicode_2BYTE_KIND
570 #else
571 kind == PyUnicode_4BYTE_KIND
572 #endif
573 )
574 {
575 CHECK(ascii->wstr == data);
576 CHECK(compact->wstr_length == ascii->length);
577 } else
578 CHECK(ascii->wstr != data);
579 }
580
581 if (compact->utf8 == NULL)
582 CHECK(compact->utf8_length == 0);
583 if (ascii->wstr == NULL)
584 CHECK(compact->wstr_length == 0);
585 }
586
587 /* check that the best kind is used: O(n) operation */
588 if (check_content && kind != PyUnicode_WCHAR_KIND) {
589 Py_ssize_t i;
590 Py_UCS4 maxchar = 0;
591 const void *data;
592 Py_UCS4 ch;
593
594 data = PyUnicode_DATA(ascii);
595 for (i=0; i < ascii->length; i++)
596 {
597 ch = PyUnicode_READ(kind, data, i);
598 if (ch > maxchar)
599 maxchar = ch;
600 }
601 if (kind == PyUnicode_1BYTE_KIND) {
602 if (ascii->state.ascii == 0) {
603 CHECK(maxchar >= 128);
604 CHECK(maxchar <= 255);
605 }
606 else
607 CHECK(maxchar < 128);
608 }
609 else if (kind == PyUnicode_2BYTE_KIND) {
610 CHECK(maxchar >= 0x100);
611 CHECK(maxchar <= 0xFFFF);
612 }
613 else {
614 CHECK(maxchar >= 0x10000);
615 CHECK(maxchar <= MAX_UNICODE);
616 }
617 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
618 }
619 return 1;
620
621 #undef CHECK
622 }
623
624
625 static PyObject*
unicode_result_wchar(PyObject * unicode)626 unicode_result_wchar(PyObject *unicode)
627 {
628 #ifndef Py_DEBUG
629 Py_ssize_t len;
630
631 len = _PyUnicode_WSTR_LENGTH(unicode);
632 if (len == 0) {
633 Py_DECREF(unicode);
634 _Py_RETURN_UNICODE_EMPTY();
635 }
636
637 if (len == 1) {
638 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
639 if ((Py_UCS4)ch < 256) {
640 Py_DECREF(unicode);
641 return get_latin1_char((unsigned char)ch);
642 }
643 }
644
645 if (_PyUnicode_Ready(unicode) < 0) {
646 Py_DECREF(unicode);
647 return NULL;
648 }
649 #else
650 assert(Py_REFCNT(unicode) == 1);
651
652 /* don't make the result ready in debug mode to ensure that the caller
653 makes the string ready before using it */
654 assert(_PyUnicode_CheckConsistency(unicode, 1));
655 #endif
656 return unicode;
657 }
658
659 static PyObject*
unicode_result_ready(PyObject * unicode)660 unicode_result_ready(PyObject *unicode)
661 {
662 Py_ssize_t length;
663
664 length = PyUnicode_GET_LENGTH(unicode);
665 if (length == 0) {
666 PyObject *empty = unicode_get_empty();
667 if (unicode != empty) {
668 Py_DECREF(unicode);
669 Py_INCREF(empty);
670 }
671 return empty;
672 }
673
674 if (length == 1) {
675 int kind = PyUnicode_KIND(unicode);
676 if (kind == PyUnicode_1BYTE_KIND) {
677 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
678 Py_UCS1 ch = data[0];
679 struct _Py_unicode_state *state = get_unicode_state();
680 PyObject *latin1_char = state->latin1[ch];
681 if (latin1_char != NULL) {
682 if (unicode != latin1_char) {
683 Py_INCREF(latin1_char);
684 Py_DECREF(unicode);
685 }
686 return latin1_char;
687 }
688 else {
689 assert(_PyUnicode_CheckConsistency(unicode, 1));
690 Py_INCREF(unicode);
691 state->latin1[ch] = unicode;
692 return unicode;
693 }
694 }
695 else {
696 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
697 }
698 }
699
700 assert(_PyUnicode_CheckConsistency(unicode, 1));
701 return unicode;
702 }
703
704 static PyObject*
unicode_result(PyObject * unicode)705 unicode_result(PyObject *unicode)
706 {
707 assert(_PyUnicode_CHECK(unicode));
708 if (PyUnicode_IS_READY(unicode))
709 return unicode_result_ready(unicode);
710 else
711 return unicode_result_wchar(unicode);
712 }
713
714 static PyObject*
unicode_result_unchanged(PyObject * unicode)715 unicode_result_unchanged(PyObject *unicode)
716 {
717 if (PyUnicode_CheckExact(unicode)) {
718 if (PyUnicode_READY(unicode) == -1)
719 return NULL;
720 Py_INCREF(unicode);
721 return unicode;
722 }
723 else
724 /* Subtype -- return genuine unicode string with the same value. */
725 return _PyUnicode_Copy(unicode);
726 }
727
728 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
729 ASCII, Latin1, UTF-8, etc. */
730 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)731 backslashreplace(_PyBytesWriter *writer, char *str,
732 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
733 {
734 Py_ssize_t size, i;
735 Py_UCS4 ch;
736 enum PyUnicode_Kind kind;
737 const void *data;
738
739 assert(PyUnicode_IS_READY(unicode));
740 kind = PyUnicode_KIND(unicode);
741 data = PyUnicode_DATA(unicode);
742
743 size = 0;
744 /* determine replacement size */
745 for (i = collstart; i < collend; ++i) {
746 Py_ssize_t incr;
747
748 ch = PyUnicode_READ(kind, data, i);
749 if (ch < 0x100)
750 incr = 2+2;
751 else if (ch < 0x10000)
752 incr = 2+4;
753 else {
754 assert(ch <= MAX_UNICODE);
755 incr = 2+8;
756 }
757 if (size > PY_SSIZE_T_MAX - incr) {
758 PyErr_SetString(PyExc_OverflowError,
759 "encoded result is too long for a Python string");
760 return NULL;
761 }
762 size += incr;
763 }
764
765 str = _PyBytesWriter_Prepare(writer, str, size);
766 if (str == NULL)
767 return NULL;
768
769 /* generate replacement */
770 for (i = collstart; i < collend; ++i) {
771 ch = PyUnicode_READ(kind, data, i);
772 *str++ = '\\';
773 if (ch >= 0x00010000) {
774 *str++ = 'U';
775 *str++ = Py_hexdigits[(ch>>28)&0xf];
776 *str++ = Py_hexdigits[(ch>>24)&0xf];
777 *str++ = Py_hexdigits[(ch>>20)&0xf];
778 *str++ = Py_hexdigits[(ch>>16)&0xf];
779 *str++ = Py_hexdigits[(ch>>12)&0xf];
780 *str++ = Py_hexdigits[(ch>>8)&0xf];
781 }
782 else if (ch >= 0x100) {
783 *str++ = 'u';
784 *str++ = Py_hexdigits[(ch>>12)&0xf];
785 *str++ = Py_hexdigits[(ch>>8)&0xf];
786 }
787 else
788 *str++ = 'x';
789 *str++ = Py_hexdigits[(ch>>4)&0xf];
790 *str++ = Py_hexdigits[ch&0xf];
791 }
792 return str;
793 }
794
795 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
796 ASCII, Latin1, UTF-8, etc. */
797 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)798 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
799 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
800 {
801 Py_ssize_t size, i;
802 Py_UCS4 ch;
803 enum PyUnicode_Kind kind;
804 const void *data;
805
806 assert(PyUnicode_IS_READY(unicode));
807 kind = PyUnicode_KIND(unicode);
808 data = PyUnicode_DATA(unicode);
809
810 size = 0;
811 /* determine replacement size */
812 for (i = collstart; i < collend; ++i) {
813 Py_ssize_t incr;
814
815 ch = PyUnicode_READ(kind, data, i);
816 if (ch < 10)
817 incr = 2+1+1;
818 else if (ch < 100)
819 incr = 2+2+1;
820 else if (ch < 1000)
821 incr = 2+3+1;
822 else if (ch < 10000)
823 incr = 2+4+1;
824 else if (ch < 100000)
825 incr = 2+5+1;
826 else if (ch < 1000000)
827 incr = 2+6+1;
828 else {
829 assert(ch <= MAX_UNICODE);
830 incr = 2+7+1;
831 }
832 if (size > PY_SSIZE_T_MAX - incr) {
833 PyErr_SetString(PyExc_OverflowError,
834 "encoded result is too long for a Python string");
835 return NULL;
836 }
837 size += incr;
838 }
839
840 str = _PyBytesWriter_Prepare(writer, str, size);
841 if (str == NULL)
842 return NULL;
843
844 /* generate replacement */
845 for (i = collstart; i < collend; ++i) {
846 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
847 if (size < 0) {
848 return NULL;
849 }
850 str += size;
851 }
852 return str;
853 }
854
855 /* --- Bloom Filters ----------------------------------------------------- */
856
857 /* stuff to implement simple "bloom filters" for Unicode characters.
858 to keep things simple, we use a single bitmask, using the least 5
859 bits from each unicode characters as the bit index. */
860
861 /* the linebreak mask is set up by _PyUnicode_Init() below */
862
863 #if LONG_BIT >= 128
864 #define BLOOM_WIDTH 128
865 #elif LONG_BIT >= 64
866 #define BLOOM_WIDTH 64
867 #elif LONG_BIT >= 32
868 #define BLOOM_WIDTH 32
869 #else
870 #error "LONG_BIT is smaller than 32"
871 #endif
872
873 #define BLOOM_MASK unsigned long
874
875 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
876
877 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
878
879 #define BLOOM_LINEBREAK(ch) \
880 ((ch) < 128U ? ascii_linebreak[(ch)] : \
881 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
882
883 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)884 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
885 {
886 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
887 do { \
888 TYPE *data = (TYPE *)PTR; \
889 TYPE *end = data + LEN; \
890 Py_UCS4 ch; \
891 for (; data != end; data++) { \
892 ch = *data; \
893 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
894 } \
895 break; \
896 } while (0)
897
898 /* calculate simple bloom-style bitmask for a given unicode string */
899
900 BLOOM_MASK mask;
901
902 mask = 0;
903 switch (kind) {
904 case PyUnicode_1BYTE_KIND:
905 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
906 break;
907 case PyUnicode_2BYTE_KIND:
908 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
909 break;
910 case PyUnicode_4BYTE_KIND:
911 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
912 break;
913 default:
914 Py_UNREACHABLE();
915 }
916 return mask;
917
918 #undef BLOOM_UPDATE
919 }
920
921 static int
ensure_unicode(PyObject * obj)922 ensure_unicode(PyObject *obj)
923 {
924 if (!PyUnicode_Check(obj)) {
925 PyErr_Format(PyExc_TypeError,
926 "must be str, not %.100s",
927 Py_TYPE(obj)->tp_name);
928 return -1;
929 }
930 return PyUnicode_READY(obj);
931 }
932
933 /* Compilation of templated routines */
934
935 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
936
937 #include "stringlib/asciilib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/find_max_char.h"
944 #include "stringlib/undef.h"
945
946 #include "stringlib/ucs1lib.h"
947 #include "stringlib/fastsearch.h"
948 #include "stringlib/partition.h"
949 #include "stringlib/split.h"
950 #include "stringlib/count.h"
951 #include "stringlib/find.h"
952 #include "stringlib/replace.h"
953 #include "stringlib/find_max_char.h"
954 #include "stringlib/undef.h"
955
956 #include "stringlib/ucs2lib.h"
957 #include "stringlib/fastsearch.h"
958 #include "stringlib/partition.h"
959 #include "stringlib/split.h"
960 #include "stringlib/count.h"
961 #include "stringlib/find.h"
962 #include "stringlib/replace.h"
963 #include "stringlib/find_max_char.h"
964 #include "stringlib/undef.h"
965
966 #include "stringlib/ucs4lib.h"
967 #include "stringlib/fastsearch.h"
968 #include "stringlib/partition.h"
969 #include "stringlib/split.h"
970 #include "stringlib/count.h"
971 #include "stringlib/find.h"
972 #include "stringlib/replace.h"
973 #include "stringlib/find_max_char.h"
974 #include "stringlib/undef.h"
975
976 _Py_COMP_DIAG_PUSH
977 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
978 #include "stringlib/unicodedefs.h"
979 #include "stringlib/fastsearch.h"
980 #include "stringlib/count.h"
981 #include "stringlib/find.h"
982 #include "stringlib/undef.h"
983 _Py_COMP_DIAG_POP
984
985 #undef STRINGLIB_GET_EMPTY
986
987 /* --- Unicode Object ----------------------------------------------------- */
988
989 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)990 findchar(const void *s, int kind,
991 Py_ssize_t size, Py_UCS4 ch,
992 int direction)
993 {
994 switch (kind) {
995 case PyUnicode_1BYTE_KIND:
996 if ((Py_UCS1) ch != ch)
997 return -1;
998 if (direction > 0)
999 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1000 else
1001 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1002 case PyUnicode_2BYTE_KIND:
1003 if ((Py_UCS2) ch != ch)
1004 return -1;
1005 if (direction > 0)
1006 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1007 else
1008 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1009 case PyUnicode_4BYTE_KIND:
1010 if (direction > 0)
1011 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1012 else
1013 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1014 default:
1015 Py_UNREACHABLE();
1016 }
1017 }
1018
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length of
   `unicode` are overwritten, with 0xff bytes; nothing is done if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1037
1038 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1039 resize_compact(PyObject *unicode, Py_ssize_t length)
1040 {
1041 Py_ssize_t char_size;
1042 Py_ssize_t struct_size;
1043 Py_ssize_t new_size;
1044 int share_wstr;
1045 PyObject *new_unicode;
1046 #ifdef Py_DEBUG
1047 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1048 #endif
1049
1050 assert(unicode_modifiable(unicode));
1051 assert(PyUnicode_IS_READY(unicode));
1052 assert(PyUnicode_IS_COMPACT(unicode));
1053
1054 char_size = PyUnicode_KIND(unicode);
1055 if (PyUnicode_IS_ASCII(unicode))
1056 struct_size = sizeof(PyASCIIObject);
1057 else
1058 struct_size = sizeof(PyCompactUnicodeObject);
1059 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1060
1061 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1062 PyErr_NoMemory();
1063 return NULL;
1064 }
1065 new_size = (struct_size + (length + 1) * char_size);
1066
1067 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1068 PyObject_Free(_PyUnicode_UTF8(unicode));
1069 _PyUnicode_UTF8(unicode) = NULL;
1070 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1071 }
1072 #ifdef Py_REF_DEBUG
1073 _Py_RefTotal--;
1074 #endif
1075 #ifdef Py_TRACE_REFS
1076 _Py_ForgetReference(unicode);
1077 #endif
1078
1079 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1080 if (new_unicode == NULL) {
1081 _Py_NewReference(unicode);
1082 PyErr_NoMemory();
1083 return NULL;
1084 }
1085 unicode = new_unicode;
1086 _Py_NewReference(unicode);
1087
1088 _PyUnicode_LENGTH(unicode) = length;
1089 if (share_wstr) {
1090 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1091 if (!PyUnicode_IS_ASCII(unicode))
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 }
1094 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1095 PyObject_Free(_PyUnicode_WSTR(unicode));
1096 _PyUnicode_WSTR(unicode) = NULL;
1097 if (!PyUnicode_IS_ASCII(unicode))
1098 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1099 }
1100 #ifdef Py_DEBUG
1101 unicode_fill_invalid(unicode, old_length);
1102 #endif
1103 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1104 length, 0);
1105 assert(_PyUnicode_CheckConsistency(unicode, 0));
1106 return unicode;
1107 }
1108
1109 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1110 resize_inplace(PyObject *unicode, Py_ssize_t length)
1111 {
1112 wchar_t *wstr;
1113 Py_ssize_t new_size;
1114 assert(!PyUnicode_IS_COMPACT(unicode));
1115 assert(Py_REFCNT(unicode) == 1);
1116
1117 if (PyUnicode_IS_READY(unicode)) {
1118 Py_ssize_t char_size;
1119 int share_wstr, share_utf8;
1120 void *data;
1121 #ifdef Py_DEBUG
1122 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1123 #endif
1124
1125 data = _PyUnicode_DATA_ANY(unicode);
1126 char_size = PyUnicode_KIND(unicode);
1127 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1128 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1129
1130 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1131 PyErr_NoMemory();
1132 return -1;
1133 }
1134 new_size = (length + 1) * char_size;
1135
1136 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1137 {
1138 PyObject_Free(_PyUnicode_UTF8(unicode));
1139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1141 }
1142
1143 data = (PyObject *)PyObject_Realloc(data, new_size);
1144 if (data == NULL) {
1145 PyErr_NoMemory();
1146 return -1;
1147 }
1148 _PyUnicode_DATA_ANY(unicode) = data;
1149 if (share_wstr) {
1150 _PyUnicode_WSTR(unicode) = data;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
1152 }
1153 if (share_utf8) {
1154 _PyUnicode_UTF8(unicode) = data;
1155 _PyUnicode_UTF8_LENGTH(unicode) = length;
1156 }
1157 _PyUnicode_LENGTH(unicode) = length;
1158 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1159 #ifdef Py_DEBUG
1160 unicode_fill_invalid(unicode, old_length);
1161 #endif
1162 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1163 assert(_PyUnicode_CheckConsistency(unicode, 0));
1164 return 0;
1165 }
1166 }
1167 assert(_PyUnicode_WSTR(unicode) != NULL);
1168
1169 /* check for integer overflow */
1170 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1171 PyErr_NoMemory();
1172 return -1;
1173 }
1174 new_size = sizeof(wchar_t) * (length + 1);
1175 wstr = _PyUnicode_WSTR(unicode);
1176 wstr = PyObject_Realloc(wstr, new_size);
1177 if (!wstr) {
1178 PyErr_NoMemory();
1179 return -1;
1180 }
1181 _PyUnicode_WSTR(unicode) = wstr;
1182 _PyUnicode_WSTR(unicode)[length] = 0;
1183 _PyUnicode_WSTR_LENGTH(unicode) = length;
1184 assert(_PyUnicode_CheckConsistency(unicode, 0));
1185 return 0;
1186 }
1187
1188 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1189 resize_copy(PyObject *unicode, Py_ssize_t length)
1190 {
1191 Py_ssize_t copy_length;
1192 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1193 PyObject *copy;
1194
1195 assert(PyUnicode_IS_READY(unicode));
1196
1197 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1198 if (copy == NULL)
1199 return NULL;
1200
1201 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1202 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1203 return copy;
1204 }
1205 else {
1206 PyObject *w;
1207
1208 w = (PyObject*)_PyUnicode_New(length);
1209 if (w == NULL)
1210 return NULL;
1211 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1212 copy_length = Py_MIN(copy_length, length);
1213 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1214 copy_length * sizeof(wchar_t));
1215 return w;
1216 }
1217 }
1218
1219 /* We allocate one more byte to make sure the string is
1220 Ux0000 terminated; some code (e.g. new_identifier)
1221 relies on that.
1222
1223 XXX This allocator could further be enhanced by assuring that the
1224 free list never reduces its size below 1.
1225
1226 */
1227
1228 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1229 _PyUnicode_New(Py_ssize_t length)
1230 {
1231 PyUnicodeObject *unicode;
1232 size_t new_size;
1233
1234 /* Optimization for empty strings */
1235 if (length == 0) {
1236 return (PyUnicodeObject *)unicode_new_empty();
1237 }
1238
1239 /* Ensure we won't overflow the size. */
1240 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1241 return (PyUnicodeObject *)PyErr_NoMemory();
1242 }
1243 if (length < 0) {
1244 PyErr_SetString(PyExc_SystemError,
1245 "Negative size passed to _PyUnicode_New");
1246 return NULL;
1247 }
1248
1249 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1250 if (unicode == NULL)
1251 return NULL;
1252 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1253
1254 _PyUnicode_WSTR_LENGTH(unicode) = length;
1255 _PyUnicode_HASH(unicode) = -1;
1256 _PyUnicode_STATE(unicode).interned = 0;
1257 _PyUnicode_STATE(unicode).kind = 0;
1258 _PyUnicode_STATE(unicode).compact = 0;
1259 _PyUnicode_STATE(unicode).ready = 0;
1260 _PyUnicode_STATE(unicode).ascii = 0;
1261 _PyUnicode_DATA_ANY(unicode) = NULL;
1262 _PyUnicode_LENGTH(unicode) = 0;
1263 _PyUnicode_UTF8(unicode) = NULL;
1264 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1265
1266 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1267 if (!_PyUnicode_WSTR(unicode)) {
1268 Py_DECREF(unicode);
1269 PyErr_NoMemory();
1270 return NULL;
1271 }
1272
1273 /* Initialize the first element to guard against cases where
1274 * the caller fails before initializing str -- unicode_resize()
1275 * reads str[0], and the Keep-Alive optimization can keep memory
1276 * allocated for str alive across a call to unicode_dealloc(unicode).
1277 * We don't want unicode_resize to read uninitialized memory in
1278 * that case.
1279 */
1280 _PyUnicode_WSTR(unicode)[0] = 0;
1281 _PyUnicode_WSTR(unicode)[length] = 0;
1282
1283 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1284 return unicode;
1285 }
1286
1287 static const char*
unicode_kind_name(PyObject * unicode)1288 unicode_kind_name(PyObject *unicode)
1289 {
1290 /* don't check consistency: unicode_kind_name() is called from
1291 _PyUnicode_Dump() */
1292 if (!PyUnicode_IS_COMPACT(unicode))
1293 {
1294 if (!PyUnicode_IS_READY(unicode))
1295 return "wstr";
1296 switch (PyUnicode_KIND(unicode))
1297 {
1298 case PyUnicode_1BYTE_KIND:
1299 if (PyUnicode_IS_ASCII(unicode))
1300 return "legacy ascii";
1301 else
1302 return "legacy latin1";
1303 case PyUnicode_2BYTE_KIND:
1304 return "legacy UCS2";
1305 case PyUnicode_4BYTE_KIND:
1306 return "legacy UCS4";
1307 default:
1308 return "<legacy invalid kind>";
1309 }
1310 }
1311 assert(PyUnicode_IS_READY(unicode));
1312 switch (PyUnicode_KIND(unicode)) {
1313 case PyUnicode_1BYTE_KIND:
1314 if (PyUnicode_IS_ASCII(unicode))
1315 return "ascii";
1316 else
1317 return "latin1";
1318 case PyUnicode_2BYTE_KIND:
1319 return "UCS2";
1320 case PyUnicode_4BYTE_KIND:
1321 return "UCS4";
1322 default:
1323 return "<invalid compact kind>";
1324 }
1325 }
1326
1327 #ifdef Py_DEBUG
1328 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1329 const char *_PyUnicode_utf8(void *unicode_raw){
1330 PyObject *unicode = _PyObject_CAST(unicode_raw);
1331 return PyUnicode_UTF8(unicode);
1332 }
1333
_PyUnicode_compact_data(void * unicode_raw)1334 const void *_PyUnicode_compact_data(void *unicode_raw) {
1335 PyObject *unicode = _PyObject_CAST(unicode_raw);
1336 return _PyUnicode_COMPACT_DATA(unicode);
1337 }
_PyUnicode_data(void * unicode_raw)1338 const void *_PyUnicode_data(void *unicode_raw) {
1339 PyObject *unicode = _PyObject_CAST(unicode_raw);
1340 printf("obj %p\n", (void*)unicode);
1341 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1342 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1343 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1344 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1345 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1346 return PyUnicode_DATA(unicode);
1347 }
1348
1349 void
_PyUnicode_Dump(PyObject * op)1350 _PyUnicode_Dump(PyObject *op)
1351 {
1352 PyASCIIObject *ascii = (PyASCIIObject *)op;
1353 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1354 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1355 const void *data;
1356
1357 if (ascii->state.compact)
1358 {
1359 if (ascii->state.ascii)
1360 data = (ascii + 1);
1361 else
1362 data = (compact + 1);
1363 }
1364 else
1365 data = unicode->data.any;
1366 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1367
1368 if (ascii->wstr == data)
1369 printf("shared ");
1370 printf("wstr=%p", (void *)ascii->wstr);
1371
1372 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1373 printf(" (%zu), ", compact->wstr_length);
1374 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1375 printf("shared ");
1376 }
1377 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1378 }
1379 printf(", data=%p\n", data);
1380 }
1381 #endif
1382
1383 static int
unicode_create_empty_string_singleton(struct _Py_unicode_state * state)1384 unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1385 {
1386 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1387 // optimized to always use state->empty_string without having to check if
1388 // it is NULL or not.
1389 PyObject *empty = PyUnicode_New(1, 0);
1390 if (empty == NULL) {
1391 return -1;
1392 }
1393 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1394 _PyUnicode_LENGTH(empty) = 0;
1395 assert(_PyUnicode_CheckConsistency(empty, 1));
1396
1397 assert(state->empty_string == NULL);
1398 state->empty_string = empty;
1399 return 0;
1400 }
1401
1402
1403 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1404 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1405 {
1406 /* Optimization for empty strings */
1407 if (size == 0) {
1408 return unicode_new_empty();
1409 }
1410
1411 PyObject *obj;
1412 PyCompactUnicodeObject *unicode;
1413 void *data;
1414 enum PyUnicode_Kind kind;
1415 int is_sharing, is_ascii;
1416 Py_ssize_t char_size;
1417 Py_ssize_t struct_size;
1418
1419 is_ascii = 0;
1420 is_sharing = 0;
1421 struct_size = sizeof(PyCompactUnicodeObject);
1422 if (maxchar < 128) {
1423 kind = PyUnicode_1BYTE_KIND;
1424 char_size = 1;
1425 is_ascii = 1;
1426 struct_size = sizeof(PyASCIIObject);
1427 }
1428 else if (maxchar < 256) {
1429 kind = PyUnicode_1BYTE_KIND;
1430 char_size = 1;
1431 }
1432 else if (maxchar < 65536) {
1433 kind = PyUnicode_2BYTE_KIND;
1434 char_size = 2;
1435 if (sizeof(wchar_t) == 2)
1436 is_sharing = 1;
1437 }
1438 else {
1439 if (maxchar > MAX_UNICODE) {
1440 PyErr_SetString(PyExc_SystemError,
1441 "invalid maximum character passed to PyUnicode_New");
1442 return NULL;
1443 }
1444 kind = PyUnicode_4BYTE_KIND;
1445 char_size = 4;
1446 if (sizeof(wchar_t) == 4)
1447 is_sharing = 1;
1448 }
1449
1450 /* Ensure we won't overflow the size. */
1451 if (size < 0) {
1452 PyErr_SetString(PyExc_SystemError,
1453 "Negative size passed to PyUnicode_New");
1454 return NULL;
1455 }
1456 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1457 return PyErr_NoMemory();
1458
1459 /* Duplicated allocation code from _PyObject_New() instead of a call to
1460 * PyObject_New() so we are able to allocate space for the object and
1461 * it's data buffer.
1462 */
1463 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1464 if (obj == NULL) {
1465 return PyErr_NoMemory();
1466 }
1467 _PyObject_Init(obj, &PyUnicode_Type);
1468
1469 unicode = (PyCompactUnicodeObject *)obj;
1470 if (is_ascii)
1471 data = ((PyASCIIObject*)obj) + 1;
1472 else
1473 data = unicode + 1;
1474 _PyUnicode_LENGTH(unicode) = size;
1475 _PyUnicode_HASH(unicode) = -1;
1476 _PyUnicode_STATE(unicode).interned = 0;
1477 _PyUnicode_STATE(unicode).kind = kind;
1478 _PyUnicode_STATE(unicode).compact = 1;
1479 _PyUnicode_STATE(unicode).ready = 1;
1480 _PyUnicode_STATE(unicode).ascii = is_ascii;
1481 if (is_ascii) {
1482 ((char*)data)[size] = 0;
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 }
1485 else if (kind == PyUnicode_1BYTE_KIND) {
1486 ((char*)data)[size] = 0;
1487 _PyUnicode_WSTR(unicode) = NULL;
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 unicode->utf8 = NULL;
1490 unicode->utf8_length = 0;
1491 }
1492 else {
1493 unicode->utf8 = NULL;
1494 unicode->utf8_length = 0;
1495 if (kind == PyUnicode_2BYTE_KIND)
1496 ((Py_UCS2*)data)[size] = 0;
1497 else /* kind == PyUnicode_4BYTE_KIND */
1498 ((Py_UCS4*)data)[size] = 0;
1499 if (is_sharing) {
1500 _PyUnicode_WSTR_LENGTH(unicode) = size;
1501 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1502 }
1503 else {
1504 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1505 _PyUnicode_WSTR(unicode) = NULL;
1506 }
1507 }
1508 #ifdef Py_DEBUG
1509 unicode_fill_invalid((PyObject*)unicode, 0);
1510 #endif
1511 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1512 return obj;
1513 }
1514
1515 #if SIZEOF_WCHAR_T == 2
1516 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1517 will decode surrogate pairs, the other conversions are implemented as macros
1518 for efficiency.
1519
1520 This function assumes that unicode can hold one more code point than wstr
1521 characters for a terminating null character. */
1522 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1523 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1524 PyObject *unicode)
1525 {
1526 const wchar_t *iter;
1527 Py_UCS4 *ucs4_out;
1528
1529 assert(unicode != NULL);
1530 assert(_PyUnicode_CHECK(unicode));
1531 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1532 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1533
1534 for (iter = begin; iter < end; ) {
1535 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1536 _PyUnicode_GET_LENGTH(unicode)));
1537 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1538 && (iter+1) < end
1539 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1540 {
1541 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1542 iter += 2;
1543 }
1544 else {
1545 *ucs4_out++ = *iter;
1546 iter++;
1547 }
1548 }
1549 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1550 _PyUnicode_GET_LENGTH(unicode)));
1551
1552 }
1553 #endif
1554
1555 static int
unicode_check_modifiable(PyObject * unicode)1556 unicode_check_modifiable(PyObject *unicode)
1557 {
1558 if (!unicode_modifiable(unicode)) {
1559 PyErr_SetString(PyExc_SystemError,
1560 "Cannot modify a string currently used");
1561 return -1;
1562 }
1563 return 0;
1564 }
1565
1566 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1567 _copy_characters(PyObject *to, Py_ssize_t to_start,
1568 PyObject *from, Py_ssize_t from_start,
1569 Py_ssize_t how_many, int check_maxchar)
1570 {
1571 unsigned int from_kind, to_kind;
1572 const void *from_data;
1573 void *to_data;
1574
1575 assert(0 <= how_many);
1576 assert(0 <= from_start);
1577 assert(0 <= to_start);
1578 assert(PyUnicode_Check(from));
1579 assert(PyUnicode_IS_READY(from));
1580 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1581
1582 assert(PyUnicode_Check(to));
1583 assert(PyUnicode_IS_READY(to));
1584 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1585
1586 if (how_many == 0)
1587 return 0;
1588
1589 from_kind = PyUnicode_KIND(from);
1590 from_data = PyUnicode_DATA(from);
1591 to_kind = PyUnicode_KIND(to);
1592 to_data = PyUnicode_DATA(to);
1593
1594 #ifdef Py_DEBUG
1595 if (!check_maxchar
1596 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1597 {
1598 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1599 Py_UCS4 ch;
1600 Py_ssize_t i;
1601 for (i=0; i < how_many; i++) {
1602 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1603 assert(ch <= to_maxchar);
1604 }
1605 }
1606 #endif
1607
1608 if (from_kind == to_kind) {
1609 if (check_maxchar
1610 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1611 {
1612 /* Writing Latin-1 characters into an ASCII string requires to
1613 check that all written characters are pure ASCII */
1614 Py_UCS4 max_char;
1615 max_char = ucs1lib_find_max_char(from_data,
1616 (const Py_UCS1*)from_data + how_many);
1617 if (max_char >= 128)
1618 return -1;
1619 }
1620 memcpy((char*)to_data + to_kind * to_start,
1621 (const char*)from_data + from_kind * from_start,
1622 to_kind * how_many);
1623 }
1624 else if (from_kind == PyUnicode_1BYTE_KIND
1625 && to_kind == PyUnicode_2BYTE_KIND)
1626 {
1627 _PyUnicode_CONVERT_BYTES(
1628 Py_UCS1, Py_UCS2,
1629 PyUnicode_1BYTE_DATA(from) + from_start,
1630 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1631 PyUnicode_2BYTE_DATA(to) + to_start
1632 );
1633 }
1634 else if (from_kind == PyUnicode_1BYTE_KIND
1635 && to_kind == PyUnicode_4BYTE_KIND)
1636 {
1637 _PyUnicode_CONVERT_BYTES(
1638 Py_UCS1, Py_UCS4,
1639 PyUnicode_1BYTE_DATA(from) + from_start,
1640 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1641 PyUnicode_4BYTE_DATA(to) + to_start
1642 );
1643 }
1644 else if (from_kind == PyUnicode_2BYTE_KIND
1645 && to_kind == PyUnicode_4BYTE_KIND)
1646 {
1647 _PyUnicode_CONVERT_BYTES(
1648 Py_UCS2, Py_UCS4,
1649 PyUnicode_2BYTE_DATA(from) + from_start,
1650 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1651 PyUnicode_4BYTE_DATA(to) + to_start
1652 );
1653 }
1654 else {
1655 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1656
1657 if (!check_maxchar) {
1658 if (from_kind == PyUnicode_2BYTE_KIND
1659 && to_kind == PyUnicode_1BYTE_KIND)
1660 {
1661 _PyUnicode_CONVERT_BYTES(
1662 Py_UCS2, Py_UCS1,
1663 PyUnicode_2BYTE_DATA(from) + from_start,
1664 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1665 PyUnicode_1BYTE_DATA(to) + to_start
1666 );
1667 }
1668 else if (from_kind == PyUnicode_4BYTE_KIND
1669 && to_kind == PyUnicode_1BYTE_KIND)
1670 {
1671 _PyUnicode_CONVERT_BYTES(
1672 Py_UCS4, Py_UCS1,
1673 PyUnicode_4BYTE_DATA(from) + from_start,
1674 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1675 PyUnicode_1BYTE_DATA(to) + to_start
1676 );
1677 }
1678 else if (from_kind == PyUnicode_4BYTE_KIND
1679 && to_kind == PyUnicode_2BYTE_KIND)
1680 {
1681 _PyUnicode_CONVERT_BYTES(
1682 Py_UCS4, Py_UCS2,
1683 PyUnicode_4BYTE_DATA(from) + from_start,
1684 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1685 PyUnicode_2BYTE_DATA(to) + to_start
1686 );
1687 }
1688 else {
1689 Py_UNREACHABLE();
1690 }
1691 }
1692 else {
1693 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1694 Py_UCS4 ch;
1695 Py_ssize_t i;
1696
1697 for (i=0; i < how_many; i++) {
1698 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1699 if (ch > to_maxchar)
1700 return -1;
1701 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1702 }
1703 }
1704 }
1705 return 0;
1706 }
1707
1708 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1709 _PyUnicode_FastCopyCharacters(
1710 PyObject *to, Py_ssize_t to_start,
1711 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1712 {
1713 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1714 }
1715
1716 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1717 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1718 PyObject *from, Py_ssize_t from_start,
1719 Py_ssize_t how_many)
1720 {
1721 int err;
1722
1723 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1724 PyErr_BadInternalCall();
1725 return -1;
1726 }
1727
1728 if (PyUnicode_READY(from) == -1)
1729 return -1;
1730 if (PyUnicode_READY(to) == -1)
1731 return -1;
1732
1733 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1734 PyErr_SetString(PyExc_IndexError, "string index out of range");
1735 return -1;
1736 }
1737 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1738 PyErr_SetString(PyExc_IndexError, "string index out of range");
1739 return -1;
1740 }
1741 if (how_many < 0) {
1742 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1743 return -1;
1744 }
1745 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1746 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1747 PyErr_Format(PyExc_SystemError,
1748 "Cannot write %zi characters at %zi "
1749 "in a string of %zi characters",
1750 how_many, to_start, PyUnicode_GET_LENGTH(to));
1751 return -1;
1752 }
1753
1754 if (how_many == 0)
1755 return 0;
1756
1757 if (unicode_check_modifiable(to))
1758 return -1;
1759
1760 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1761 if (err) {
1762 PyErr_Format(PyExc_SystemError,
1763 "Cannot copy %s characters "
1764 "into a string of %s characters",
1765 unicode_kind_name(from),
1766 unicode_kind_name(to));
1767 return -1;
1768 }
1769 return how_many;
1770 }
1771
1772 /* Find the maximum code point and count the number of surrogate pairs so a
1773 correct string length can be computed before converting a string to UCS4.
1774 This function counts single surrogates as a character and not as a pair.
1775
1776 Return 0 on success, or -1 on error. */
1777 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1778 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1779 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1780 {
1781 const wchar_t *iter;
1782 Py_UCS4 ch;
1783
1784 assert(num_surrogates != NULL && maxchar != NULL);
1785 *num_surrogates = 0;
1786 *maxchar = 0;
1787
1788 for (iter = begin; iter < end; ) {
1789 #if SIZEOF_WCHAR_T == 2
1790 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1791 && (iter+1) < end
1792 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1793 {
1794 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1795 ++(*num_surrogates);
1796 iter += 2;
1797 }
1798 else
1799 #endif
1800 {
1801 ch = *iter;
1802 iter++;
1803 }
1804 if (ch > *maxchar) {
1805 *maxchar = ch;
1806 if (*maxchar > MAX_UNICODE) {
1807 PyErr_Format(PyExc_ValueError,
1808 "character U+%x is not in range [U+0000; U+%x]",
1809 ch, MAX_UNICODE);
1810 return -1;
1811 }
1812 }
1813 }
1814 return 0;
1815 }
1816
1817 int
_PyUnicode_Ready(PyObject * unicode)1818 _PyUnicode_Ready(PyObject *unicode)
1819 {
1820 wchar_t *end;
1821 Py_UCS4 maxchar = 0;
1822 Py_ssize_t num_surrogates;
1823 #if SIZEOF_WCHAR_T == 2
1824 Py_ssize_t length_wo_surrogates;
1825 #endif
1826
1827 /* _PyUnicode_Ready() is only intended for old-style API usage where
1828 strings were created using _PyObject_New() and where no canonical
1829 representation (the str field) has been set yet aka strings
1830 which are not yet ready. */
1831 assert(_PyUnicode_CHECK(unicode));
1832 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1833 assert(_PyUnicode_WSTR(unicode) != NULL);
1834 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1835 assert(_PyUnicode_UTF8(unicode) == NULL);
1836 /* Actually, it should neither be interned nor be anything else: */
1837 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1838
1839 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1840 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1841 &maxchar, &num_surrogates) == -1)
1842 return -1;
1843
1844 if (maxchar < 256) {
1845 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1846 if (!_PyUnicode_DATA_ANY(unicode)) {
1847 PyErr_NoMemory();
1848 return -1;
1849 }
1850 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1851 _PyUnicode_WSTR(unicode), end,
1852 PyUnicode_1BYTE_DATA(unicode));
1853 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1854 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1856 if (maxchar < 128) {
1857 _PyUnicode_STATE(unicode).ascii = 1;
1858 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1859 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860 }
1861 else {
1862 _PyUnicode_STATE(unicode).ascii = 0;
1863 _PyUnicode_UTF8(unicode) = NULL;
1864 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1865 }
1866 PyObject_Free(_PyUnicode_WSTR(unicode));
1867 _PyUnicode_WSTR(unicode) = NULL;
1868 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1869 }
1870 /* In this case we might have to convert down from 4-byte native
1871 wchar_t to 2-byte unicode. */
1872 else if (maxchar < 65536) {
1873 assert(num_surrogates == 0 &&
1874 "FindMaxCharAndNumSurrogatePairs() messed up");
1875
1876 #if SIZEOF_WCHAR_T == 2
1877 /* We can share representations and are done. */
1878 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1884 #else
1885 /* sizeof(wchar_t) == 4 */
1886 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1887 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1888 if (!_PyUnicode_DATA_ANY(unicode)) {
1889 PyErr_NoMemory();
1890 return -1;
1891 }
1892 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1893 _PyUnicode_WSTR(unicode), end,
1894 PyUnicode_2BYTE_DATA(unicode));
1895 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1896 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1898 _PyUnicode_UTF8(unicode) = NULL;
1899 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1900 PyObject_Free(_PyUnicode_WSTR(unicode));
1901 _PyUnicode_WSTR(unicode) = NULL;
1902 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1903 #endif
1904 }
1905 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1906 else {
1907 #if SIZEOF_WCHAR_T == 2
1908 /* in case the native representation is 2-bytes, we need to allocate a
1909 new normalized 4-byte version. */
1910 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1911 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1912 PyErr_NoMemory();
1913 return -1;
1914 }
1915 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1916 if (!_PyUnicode_DATA_ANY(unicode)) {
1917 PyErr_NoMemory();
1918 return -1;
1919 }
1920 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922 _PyUnicode_UTF8(unicode) = NULL;
1923 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1924 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1925 _PyUnicode_STATE(unicode).ready = 1;
1926 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1927 PyObject_Free(_PyUnicode_WSTR(unicode));
1928 _PyUnicode_WSTR(unicode) = NULL;
1929 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1930 #else
1931 assert(num_surrogates == 0);
1932
1933 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1934 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1935 _PyUnicode_UTF8(unicode) = NULL;
1936 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1937 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1938 #endif
1939 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1940 }
1941 _PyUnicode_STATE(unicode).ready = 1;
1942 assert(_PyUnicode_CheckConsistency(unicode, 1));
1943 return 0;
1944 }
1945
1946 static void
unicode_dealloc(PyObject * unicode)1947 unicode_dealloc(PyObject *unicode)
1948 {
1949 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1950 case SSTATE_NOT_INTERNED:
1951 break;
1952
1953 case SSTATE_INTERNED_MORTAL:
1954 {
1955 #ifdef INTERNED_STRINGS
1956 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1957 references (key and value) which were ignored by
1958 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1959 to prevent calling unicode_dealloc() again. Adjust refcnt after
1960 PyDict_DelItem(). */
1961 assert(Py_REFCNT(unicode) == 0);
1962 Py_SET_REFCNT(unicode, 3);
1963 if (PyDict_DelItem(interned, unicode) != 0) {
1964 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1965 NULL);
1966 }
1967 assert(Py_REFCNT(unicode) == 1);
1968 Py_SET_REFCNT(unicode, 0);
1969 #endif
1970 break;
1971 }
1972
1973 case SSTATE_INTERNED_IMMORTAL:
1974 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1975 break;
1976
1977 default:
1978 Py_UNREACHABLE();
1979 }
1980
1981 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1982 PyObject_Free(_PyUnicode_WSTR(unicode));
1983 }
1984 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1985 PyObject_Free(_PyUnicode_UTF8(unicode));
1986 }
1987 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1988 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1989 }
1990
1991 Py_TYPE(unicode)->tp_free(unicode);
1992 }
1993
1994 #ifdef Py_DEBUG
1995 static int
unicode_is_singleton(PyObject * unicode)1996 unicode_is_singleton(PyObject *unicode)
1997 {
1998 struct _Py_unicode_state *state = get_unicode_state();
1999 if (unicode == state->empty_string) {
2000 return 1;
2001 }
2002 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
2003 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
2004 {
2005 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
2006 if (ch < 256 && state->latin1[ch] == unicode) {
2007 return 1;
2008 }
2009 }
2010 return 0;
2011 }
2012 #endif
2013
2014 static int
unicode_modifiable(PyObject * unicode)2015 unicode_modifiable(PyObject *unicode)
2016 {
2017 assert(_PyUnicode_CHECK(unicode));
2018 if (Py_REFCNT(unicode) != 1)
2019 return 0;
2020 if (_PyUnicode_HASH(unicode) != -1)
2021 return 0;
2022 if (PyUnicode_CHECK_INTERNED(unicode))
2023 return 0;
2024 if (!PyUnicode_CheckExact(unicode))
2025 return 0;
2026 #ifdef Py_DEBUG
2027 /* singleton refcount is greater than 1 */
2028 assert(!unicode_is_singleton(unicode));
2029 #endif
2030 return 1;
2031 }
2032
2033 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)2034 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2035 {
2036 PyObject *unicode;
2037 Py_ssize_t old_length;
2038
2039 assert(p_unicode != NULL);
2040 unicode = *p_unicode;
2041
2042 assert(unicode != NULL);
2043 assert(PyUnicode_Check(unicode));
2044 assert(0 <= length);
2045
2046 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2047 old_length = PyUnicode_WSTR_LENGTH(unicode);
2048 else
2049 old_length = PyUnicode_GET_LENGTH(unicode);
2050 if (old_length == length)
2051 return 0;
2052
2053 if (length == 0) {
2054 PyObject *empty = unicode_new_empty();
2055 Py_SETREF(*p_unicode, empty);
2056 return 0;
2057 }
2058
2059 if (!unicode_modifiable(unicode)) {
2060 PyObject *copy = resize_copy(unicode, length);
2061 if (copy == NULL)
2062 return -1;
2063 Py_SETREF(*p_unicode, copy);
2064 return 0;
2065 }
2066
2067 if (PyUnicode_IS_COMPACT(unicode)) {
2068 PyObject *new_unicode = resize_compact(unicode, length);
2069 if (new_unicode == NULL)
2070 return -1;
2071 *p_unicode = new_unicode;
2072 return 0;
2073 }
2074 return resize_inplace(unicode, length);
2075 }
2076
2077 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2078 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2079 {
2080 PyObject *unicode;
2081 if (p_unicode == NULL) {
2082 PyErr_BadInternalCall();
2083 return -1;
2084 }
2085 unicode = *p_unicode;
2086 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2087 {
2088 PyErr_BadInternalCall();
2089 return -1;
2090 }
2091 return unicode_resize(p_unicode, length);
2092 }
2093
2094 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2095
2096 WARNING: The function doesn't copy the terminating null character and
2097 doesn't check the maximum character (may write a latin1 character in an
2098 ASCII string). */
2099 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2100 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2101 const char *str, Py_ssize_t len)
2102 {
2103 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2104 const void *data = PyUnicode_DATA(unicode);
2105 const char *end = str + len;
2106
2107 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2108 switch (kind) {
2109 case PyUnicode_1BYTE_KIND: {
2110 #ifdef Py_DEBUG
2111 if (PyUnicode_IS_ASCII(unicode)) {
2112 Py_UCS4 maxchar = ucs1lib_find_max_char(
2113 (const Py_UCS1*)str,
2114 (const Py_UCS1*)str + len);
2115 assert(maxchar < 128);
2116 }
2117 #endif
2118 memcpy((char *) data + index, str, len);
2119 break;
2120 }
2121 case PyUnicode_2BYTE_KIND: {
2122 Py_UCS2 *start = (Py_UCS2 *)data + index;
2123 Py_UCS2 *ucs2 = start;
2124
2125 for (; str < end; ++ucs2, ++str)
2126 *ucs2 = (Py_UCS2)*str;
2127
2128 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2129 break;
2130 }
2131 case PyUnicode_4BYTE_KIND: {
2132 Py_UCS4 *start = (Py_UCS4 *)data + index;
2133 Py_UCS4 *ucs4 = start;
2134
2135 for (; str < end; ++ucs4, ++str)
2136 *ucs4 = (Py_UCS4)*str;
2137
2138 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2139 break;
2140 }
2141 default:
2142 Py_UNREACHABLE();
2143 }
2144 }
2145
2146 static PyObject*
get_latin1_char(Py_UCS1 ch)2147 get_latin1_char(Py_UCS1 ch)
2148 {
2149 struct _Py_unicode_state *state = get_unicode_state();
2150
2151 PyObject *unicode = state->latin1[ch];
2152 if (unicode) {
2153 Py_INCREF(unicode);
2154 return unicode;
2155 }
2156
2157 unicode = PyUnicode_New(1, ch);
2158 if (!unicode) {
2159 return NULL;
2160 }
2161
2162 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2163 assert(_PyUnicode_CheckConsistency(unicode, 1));
2164
2165 Py_INCREF(unicode);
2166 state->latin1[ch] = unicode;
2167 return unicode;
2168 }
2169
2170 static PyObject*
unicode_char(Py_UCS4 ch)2171 unicode_char(Py_UCS4 ch)
2172 {
2173 PyObject *unicode;
2174
2175 assert(ch <= MAX_UNICODE);
2176
2177 if (ch < 256) {
2178 return get_latin1_char(ch);
2179 }
2180
2181 unicode = PyUnicode_New(1, ch);
2182 if (unicode == NULL)
2183 return NULL;
2184
2185 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2186 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2187 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2188 } else {
2189 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2190 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2191 }
2192 assert(_PyUnicode_CheckConsistency(unicode, 1));
2193 return unicode;
2194 }
2195
2196 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2197 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2198 {
2199 if (u == NULL) {
2200 if (size > 0) {
2201 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2202 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2203 "use PyUnicode_New() instead", 1) < 0) {
2204 return NULL;
2205 }
2206 }
2207 return (PyObject*)_PyUnicode_New(size);
2208 }
2209
2210 if (size < 0) {
2211 PyErr_BadInternalCall();
2212 return NULL;
2213 }
2214
2215 return PyUnicode_FromWideChar(u, size);
2216 }
2217
/* Create a string from the wchar_t buffer `u` holding `size` code units.

   A size of -1 means "u is null-terminated": the length is then computed
   with wcslen().  A NULL buffer is only accepted together with size == 0;
   otherwise PyErr_BadInternalCall() is raised.

   Returns the resulting object, or NULL if one of the helper calls fails. */
PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
    PyObject *unicode;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    if (u == NULL && size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* -1 is the "compute the length for me" sentinel */
    if (size == -1) {
        size = wcslen(u);
    }

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */

    /* Optimization for empty strings */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UCS-4 first. */
    if (_Py_LocaleUsesNonUnicodeWchar()) {
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
        if (!converted) {
            return NULL;
        }
        /* NOTE: this declaration shadows the function-level `unicode` */
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
        PyMem_Free(converted);
        return unicode;
    }
#endif

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && (Py_UCS4)*u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    /* num_surrogates is subtracted from the input size to obtain the
       length of the new string */
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

    /* Copy the input into the storage kind chosen by PyUnicode_New() */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        /* input and storage have the same width: raw copy */
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        /* input and storage have the same width: raw copy */
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        Py_UNREACHABLE();
    }

    return unicode_result(unicode);
}
2299
2300 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2301 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2302 {
2303 if (size < 0) {
2304 PyErr_SetString(PyExc_SystemError,
2305 "Negative size passed to PyUnicode_FromStringAndSize");
2306 return NULL;
2307 }
2308 if (u != NULL) {
2309 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2310 }
2311 else {
2312 if (size > 0) {
2313 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2314 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2315 "use PyUnicode_New() instead", 1) < 0) {
2316 return NULL;
2317 }
2318 }
2319 return (PyObject *)_PyUnicode_New(size);
2320 }
2321 }
2322
2323 PyObject *
PyUnicode_FromString(const char * u)2324 PyUnicode_FromString(const char *u)
2325 {
2326 size_t size = strlen(u);
2327 if (size > PY_SSIZE_T_MAX) {
2328 PyErr_SetString(PyExc_OverflowError, "input too long");
2329 return NULL;
2330 }
2331 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2332 }
2333
2334
2335 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2336 _PyUnicode_FromId(_Py_Identifier *id)
2337 {
2338 PyInterpreterState *interp = _PyInterpreterState_GET();
2339 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2340
2341 Py_ssize_t index = _Py_atomic_size_get(&id->index);
2342 if (index < 0) {
2343 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2344
2345 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2346 // Check again to detect concurrent access. Another thread can have
2347 // initialized the index while this thread waited for the lock.
2348 index = _Py_atomic_size_get(&id->index);
2349 if (index < 0) {
2350 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2351 index = rt_ids->next_index;
2352 rt_ids->next_index++;
2353 _Py_atomic_size_set(&id->index, index);
2354 }
2355 PyThread_release_lock(rt_ids->lock);
2356 }
2357 assert(index >= 0);
2358
2359 PyObject *obj;
2360 if (index < ids->size) {
2361 obj = ids->array[index];
2362 if (obj) {
2363 // Return a borrowed reference
2364 return obj;
2365 }
2366 }
2367
2368 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2369 NULL, NULL);
2370 if (!obj) {
2371 return NULL;
2372 }
2373 PyUnicode_InternInPlace(&obj);
2374
2375 if (index >= ids->size) {
2376 // Overallocate to reduce the number of realloc
2377 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2378 Py_ssize_t item_size = sizeof(ids->array[0]);
2379 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2380 if (new_array == NULL) {
2381 PyErr_NoMemory();
2382 return NULL;
2383 }
2384 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2385 ids->array = new_array;
2386 ids->size = new_size;
2387 }
2388
2389 // The array stores a strong reference
2390 ids->array[index] = obj;
2391
2392 // Return a borrowed reference
2393 return obj;
2394 }
2395
2396
2397 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2398 unicode_clear_identifiers(struct _Py_unicode_state *state)
2399 {
2400 struct _Py_unicode_ids *ids = &state->ids;
2401 for (Py_ssize_t i=0; i < ids->size; i++) {
2402 Py_XDECREF(ids->array[i]);
2403 }
2404 ids->size = 0;
2405 PyMem_Free(ids->array);
2406 ids->array = NULL;
2407 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2408 // after Py_Finalize().
2409 }
2410
2411
2412 /* Internal function, doesn't check maximum character */
2413
2414 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2415 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2416 {
2417 const unsigned char *s = (const unsigned char *)buffer;
2418 PyObject *unicode;
2419 if (size == 1) {
2420 #ifdef Py_DEBUG
2421 assert((unsigned char)s[0] < 128);
2422 #endif
2423 return get_latin1_char(s[0]);
2424 }
2425 unicode = PyUnicode_New(size, 127);
2426 if (!unicode)
2427 return NULL;
2428 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2429 assert(_PyUnicode_CheckConsistency(unicode, 1));
2430 return unicode;
2431 }
2432
2433 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2434 kind_maxchar_limit(unsigned int kind)
2435 {
2436 switch (kind) {
2437 case PyUnicode_1BYTE_KIND:
2438 return 0x80;
2439 case PyUnicode_2BYTE_KIND:
2440 return 0x100;
2441 case PyUnicode_4BYTE_KIND:
2442 return 0x10000;
2443 default:
2444 Py_UNREACHABLE();
2445 }
2446 }
2447
2448 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2449 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2450 {
2451 PyObject *res;
2452 unsigned char max_char;
2453
2454 if (size == 0) {
2455 _Py_RETURN_UNICODE_EMPTY();
2456 }
2457 assert(size > 0);
2458 if (size == 1) {
2459 return get_latin1_char(u[0]);
2460 }
2461
2462 max_char = ucs1lib_find_max_char(u, u + size);
2463 res = PyUnicode_New(size, max_char);
2464 if (!res)
2465 return NULL;
2466 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2467 assert(_PyUnicode_CheckConsistency(res, 1));
2468 return res;
2469 }
2470
2471 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2472 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2473 {
2474 PyObject *res;
2475 Py_UCS2 max_char;
2476
2477 if (size == 0)
2478 _Py_RETURN_UNICODE_EMPTY();
2479 assert(size > 0);
2480 if (size == 1)
2481 return unicode_char(u[0]);
2482
2483 max_char = ucs2lib_find_max_char(u, u + size);
2484 res = PyUnicode_New(size, max_char);
2485 if (!res)
2486 return NULL;
2487 if (max_char >= 256)
2488 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2489 else {
2490 _PyUnicode_CONVERT_BYTES(
2491 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2492 }
2493 assert(_PyUnicode_CheckConsistency(res, 1));
2494 return res;
2495 }
2496
2497 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2498 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2499 {
2500 PyObject *res;
2501 Py_UCS4 max_char;
2502
2503 if (size == 0)
2504 _Py_RETURN_UNICODE_EMPTY();
2505 assert(size > 0);
2506 if (size == 1)
2507 return unicode_char(u[0]);
2508
2509 max_char = ucs4lib_find_max_char(u, u + size);
2510 res = PyUnicode_New(size, max_char);
2511 if (!res)
2512 return NULL;
2513 if (max_char < 256)
2514 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2515 PyUnicode_1BYTE_DATA(res));
2516 else if (max_char < 0x10000)
2517 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2518 PyUnicode_2BYTE_DATA(res));
2519 else
2520 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2521 assert(_PyUnicode_CheckConsistency(res, 1));
2522 return res;
2523 }
2524
2525 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2526 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2527 {
2528 if (size < 0) {
2529 PyErr_SetString(PyExc_ValueError, "size must be positive");
2530 return NULL;
2531 }
2532 switch (kind) {
2533 case PyUnicode_1BYTE_KIND:
2534 return _PyUnicode_FromUCS1(buffer, size);
2535 case PyUnicode_2BYTE_KIND:
2536 return _PyUnicode_FromUCS2(buffer, size);
2537 case PyUnicode_4BYTE_KIND:
2538 return _PyUnicode_FromUCS4(buffer, size);
2539 default:
2540 PyErr_SetString(PyExc_SystemError, "invalid kind");
2541 return NULL;
2542 }
2543 }
2544
2545 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2546 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2547 {
2548 enum PyUnicode_Kind kind;
2549 const void *startptr, *endptr;
2550
2551 assert(PyUnicode_IS_READY(unicode));
2552 assert(0 <= start);
2553 assert(end <= PyUnicode_GET_LENGTH(unicode));
2554 assert(start <= end);
2555
2556 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2557 return PyUnicode_MAX_CHAR_VALUE(unicode);
2558
2559 if (start == end)
2560 return 127;
2561
2562 if (PyUnicode_IS_ASCII(unicode))
2563 return 127;
2564
2565 kind = PyUnicode_KIND(unicode);
2566 startptr = PyUnicode_DATA(unicode);
2567 endptr = (char *)startptr + end * kind;
2568 startptr = (char *)startptr + start * kind;
2569 switch(kind) {
2570 case PyUnicode_1BYTE_KIND:
2571 return ucs1lib_find_max_char(startptr, endptr);
2572 case PyUnicode_2BYTE_KIND:
2573 return ucs2lib_find_max_char(startptr, endptr);
2574 case PyUnicode_4BYTE_KIND:
2575 return ucs4lib_find_max_char(startptr, endptr);
2576 default:
2577 Py_UNREACHABLE();
2578 }
2579 }
2580
2581 /* Ensure that a string uses the most efficient storage, if it is not the
2582 case: create a new string with of the right kind. Write NULL into *p_unicode
2583 on error. */
2584 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2585 unicode_adjust_maxchar(PyObject **p_unicode)
2586 {
2587 PyObject *unicode, *copy;
2588 Py_UCS4 max_char;
2589 Py_ssize_t len;
2590 unsigned int kind;
2591
2592 assert(p_unicode != NULL);
2593 unicode = *p_unicode;
2594 assert(PyUnicode_IS_READY(unicode));
2595 if (PyUnicode_IS_ASCII(unicode))
2596 return;
2597
2598 len = PyUnicode_GET_LENGTH(unicode);
2599 kind = PyUnicode_KIND(unicode);
2600 if (kind == PyUnicode_1BYTE_KIND) {
2601 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2602 max_char = ucs1lib_find_max_char(u, u + len);
2603 if (max_char >= 128)
2604 return;
2605 }
2606 else if (kind == PyUnicode_2BYTE_KIND) {
2607 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2608 max_char = ucs2lib_find_max_char(u, u + len);
2609 if (max_char >= 256)
2610 return;
2611 }
2612 else if (kind == PyUnicode_4BYTE_KIND) {
2613 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2614 max_char = ucs4lib_find_max_char(u, u + len);
2615 if (max_char >= 0x10000)
2616 return;
2617 }
2618 else
2619 Py_UNREACHABLE();
2620
2621 copy = PyUnicode_New(len, max_char);
2622 if (copy != NULL)
2623 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2624 Py_DECREF(unicode);
2625 *p_unicode = copy;
2626 }
2627
2628 PyObject*
_PyUnicode_Copy(PyObject * unicode)2629 _PyUnicode_Copy(PyObject *unicode)
2630 {
2631 Py_ssize_t length;
2632 PyObject *copy;
2633
2634 if (!PyUnicode_Check(unicode)) {
2635 PyErr_BadInternalCall();
2636 return NULL;
2637 }
2638 if (PyUnicode_READY(unicode) == -1)
2639 return NULL;
2640
2641 length = PyUnicode_GET_LENGTH(unicode);
2642 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2643 if (!copy)
2644 return NULL;
2645 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2646
2647 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2648 length * PyUnicode_KIND(unicode));
2649 assert(_PyUnicode_CheckConsistency(copy, 1));
2650 return copy;
2651 }
2652
2653
2654 /* Widen Unicode objects to larger buffers. Don't write terminating null
2655 character. Return NULL on error. */
2656
2657 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2658 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2659 {
2660 void *result;
2661
2662 assert(skind < kind);
2663 switch (kind) {
2664 case PyUnicode_2BYTE_KIND:
2665 result = PyMem_New(Py_UCS2, len);
2666 if (!result)
2667 return PyErr_NoMemory();
2668 assert(skind == PyUnicode_1BYTE_KIND);
2669 _PyUnicode_CONVERT_BYTES(
2670 Py_UCS1, Py_UCS2,
2671 (const Py_UCS1 *)data,
2672 ((const Py_UCS1 *)data) + len,
2673 result);
2674 return result;
2675 case PyUnicode_4BYTE_KIND:
2676 result = PyMem_New(Py_UCS4, len);
2677 if (!result)
2678 return PyErr_NoMemory();
2679 if (skind == PyUnicode_2BYTE_KIND) {
2680 _PyUnicode_CONVERT_BYTES(
2681 Py_UCS2, Py_UCS4,
2682 (const Py_UCS2 *)data,
2683 ((const Py_UCS2 *)data) + len,
2684 result);
2685 }
2686 else {
2687 assert(skind == PyUnicode_1BYTE_KIND);
2688 _PyUnicode_CONVERT_BYTES(
2689 Py_UCS1, Py_UCS4,
2690 (const Py_UCS1 *)data,
2691 ((const Py_UCS1 *)data) + len,
2692 result);
2693 }
2694 return result;
2695 default:
2696 Py_UNREACHABLE();
2697 return NULL;
2698 }
2699 }
2700
2701 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2702 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2703 int copy_null)
2704 {
2705 int kind;
2706 const void *data;
2707 Py_ssize_t len, targetlen;
2708 if (PyUnicode_READY(string) == -1)
2709 return NULL;
2710 kind = PyUnicode_KIND(string);
2711 data = PyUnicode_DATA(string);
2712 len = PyUnicode_GET_LENGTH(string);
2713 targetlen = len;
2714 if (copy_null)
2715 targetlen++;
2716 if (!target) {
2717 target = PyMem_New(Py_UCS4, targetlen);
2718 if (!target) {
2719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722 }
2723 else {
2724 if (targetsize < targetlen) {
2725 PyErr_Format(PyExc_SystemError,
2726 "string is longer than the buffer");
2727 if (copy_null && 0 < targetsize)
2728 target[0] = 0;
2729 return NULL;
2730 }
2731 }
2732 if (kind == PyUnicode_1BYTE_KIND) {
2733 const Py_UCS1 *start = (const Py_UCS1 *) data;
2734 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2735 }
2736 else if (kind == PyUnicode_2BYTE_KIND) {
2737 const Py_UCS2 *start = (const Py_UCS2 *) data;
2738 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2739 }
2740 else if (kind == PyUnicode_4BYTE_KIND) {
2741 memcpy(target, data, len * sizeof(Py_UCS4));
2742 }
2743 else {
2744 Py_UNREACHABLE();
2745 }
2746 if (copy_null)
2747 target[len] = 0;
2748 return target;
2749 }
2750
2751 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2752 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2753 int copy_null)
2754 {
2755 if (target == NULL || targetsize < 0) {
2756 PyErr_BadInternalCall();
2757 return NULL;
2758 }
2759 return as_ucs4(string, target, targetsize, copy_null);
2760 }
2761
2762 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2763 PyUnicode_AsUCS4Copy(PyObject *string)
2764 {
2765 return as_ucs4(string, NULL, 0, 1);
2766 }
2767
2768 /* maximum number of characters required for output of %lld or %p.
2769 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2770 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2771 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2772
2773 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2774 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2775 Py_ssize_t width, Py_ssize_t precision)
2776 {
2777 Py_ssize_t length, fill, arglen;
2778 Py_UCS4 maxchar;
2779
2780 if (PyUnicode_READY(str) == -1)
2781 return -1;
2782
2783 length = PyUnicode_GET_LENGTH(str);
2784 if ((precision == -1 || precision >= length)
2785 && width <= length)
2786 return _PyUnicodeWriter_WriteStr(writer, str);
2787
2788 if (precision != -1)
2789 length = Py_MIN(precision, length);
2790
2791 arglen = Py_MAX(length, width);
2792 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2793 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2794 else
2795 maxchar = writer->maxchar;
2796
2797 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2798 return -1;
2799
2800 if (width > length) {
2801 fill = width - length;
2802 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2803 return -1;
2804 writer->pos += fill;
2805 }
2806
2807 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2808 str, 0, length);
2809 writer->pos += length;
2810 return 0;
2811 }
2812
2813 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2814 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2815 Py_ssize_t width, Py_ssize_t precision)
2816 {
2817 /* UTF-8 */
2818 Py_ssize_t length;
2819 PyObject *unicode;
2820 int res;
2821
2822 if (precision == -1) {
2823 length = strlen(str);
2824 }
2825 else {
2826 length = 0;
2827 while (length < precision && str[length]) {
2828 length++;
2829 }
2830 }
2831 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2832 if (unicode == NULL)
2833 return -1;
2834
2835 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2836 Py_DECREF(unicode);
2837 return res;
2838 }
2839
/* Format one '%' directive of a PyUnicode_FromFormatV() format string.

   `f` points at the '%' character.  The directive is parsed (optional '0'
   flag, width, '.precision', 'l'/'ll'/'z' size modifiers, conversion
   character), the matching argument(s) are fetched from *vargs and the
   formatted text is appended to `writer`.

   Return a pointer to the first character after the directive, or NULL with
   an exception set on error.  For an unknown conversion character the rest
   of the format string is copied verbatim and a pointer to its end is
   returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts (used by the default case) */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* refuse the next digit if it would overflow Py_ssize_t */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* refuse the next digit if it would overflow Py_ssize_t */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* this directive ends the format string: turn overallocation off */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* render the number with sprintf, using the argument type selected
           by the size modifier */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* the precision is a minimum here: never shorter than the number */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* pad up to the width with '0' or ' ' depending on the '0' flag */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* pad up to the precision with zeros */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the digits (and the null byte) right
               by two and insert "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* an existing unicode object */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments: a unicode object, and a C string used when the
           object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
3137
3138 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3139 PyUnicode_FromFormatV(const char *format, va_list vargs)
3140 {
3141 va_list vargs2;
3142 const char *f;
3143 _PyUnicodeWriter writer;
3144
3145 _PyUnicodeWriter_Init(&writer);
3146 writer.min_length = strlen(format) + 100;
3147 writer.overallocate = 1;
3148
3149 // Copy varags to be able to pass a reference to a subfunction.
3150 va_copy(vargs2, vargs);
3151
3152 for (f = format; *f; ) {
3153 if (*f == '%') {
3154 f = unicode_fromformat_arg(&writer, f, &vargs2);
3155 if (f == NULL)
3156 goto fail;
3157 }
3158 else {
3159 const char *p;
3160 Py_ssize_t len;
3161
3162 p = f;
3163 do
3164 {
3165 if ((unsigned char)*p > 127) {
3166 PyErr_Format(PyExc_ValueError,
3167 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3168 "string, got a non-ASCII byte: 0x%02x",
3169 (unsigned char)*p);
3170 goto fail;
3171 }
3172 p++;
3173 }
3174 while (*p != '\0' && *p != '%');
3175 len = p - f;
3176
3177 if (*p == '\0')
3178 writer.overallocate = 0;
3179
3180 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3181 goto fail;
3182
3183 f = p;
3184 }
3185 }
3186 va_end(vargs2);
3187 return _PyUnicodeWriter_Finish(&writer);
3188
3189 fail:
3190 va_end(vargs2);
3191 _PyUnicodeWriter_Dealloc(&writer);
3192 return NULL;
3193 }
3194
3195 PyObject *
PyUnicode_FromFormat(const char * format,...)3196 PyUnicode_FromFormat(const char *format, ...)
3197 {
3198 PyObject* ret;
3199 va_list vargs;
3200
3201 #ifdef HAVE_STDARG_PROTOTYPES
3202 va_start(vargs, format);
3203 #else
3204 va_start(vargs);
3205 #endif
3206 ret = PyUnicode_FromFormatV(format, vargs);
3207 va_end(vargs);
3208 return ret;
3209 }
3210
3211 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3212 unicode_get_widechar_size(PyObject *unicode)
3213 {
3214 Py_ssize_t res;
3215
3216 assert(unicode != NULL);
3217 assert(_PyUnicode_CHECK(unicode));
3218
3219 #if USE_UNICODE_WCHAR_CACHE
3220 if (_PyUnicode_WSTR(unicode) != NULL) {
3221 return PyUnicode_WSTR_LENGTH(unicode);
3222 }
3223 #endif /* USE_UNICODE_WCHAR_CACHE */
3224 assert(PyUnicode_IS_READY(unicode));
3225
3226 res = _PyUnicode_LENGTH(unicode);
3227 #if SIZEOF_WCHAR_T == 2
3228 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3229 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3230 const Py_UCS4 *end = s + res;
3231 for (; s < end; ++s) {
3232 if (*s > 0xFFFF) {
3233 ++res;
3234 }
3235 }
3236 }
3237 #endif
3238 return res;
3239 }
3240
/* Copy the contents of `unicode` into the wchar_t array `w`, writing `size`
   wide characters.

   Callers in this file pass either the value computed by
   unicode_get_widechar_size() plus one, or a smaller caller-supplied limit.
   NOTE(review): the "+ 1" presumably picks up the null character stored after
   the string data -- confirm against the object layout. */
static void
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));

#if USE_UNICODE_WCHAR_CACHE
    /* A cached wchar_t representation exists: copy it verbatim. */
    const wchar_t *wstr = _PyUnicode_WSTR(unicode);
    if (wstr != NULL) {
        memcpy(w, wstr, size * sizeof(wchar_t));
        return;
    }
#else /* USE_UNICODE_WCHAR_CACHE */
    /* The storage kind equals sizeof(wchar_t): copy the data verbatim. */
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
        return;
    }
#endif /* USE_UNICODE_WCHAR_CACHE */
    assert(PyUnicode_IS_READY(unicode));

    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        /* Widen each 1-byte character to a wchar_t. */
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
        for (; size--; ++s, ++w) {
            *w = *s;
        }
    }
    else {
#if SIZEOF_WCHAR_T == 4
        /* With a 4-byte wchar_t the only kind expected here is 2-byte. */
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
        for (; size--; ++s, ++w) {
            *w = *s;
        }
#else
        /* Otherwise the only kind expected here is 4-byte; characters above
           0xFFFF are split into two wchar_t units. */
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
        for (; size--; ++s, ++w) {
            Py_UCS4 ch = *s;
            if (ch > 0xFFFF) {
                assert(ch <= MAX_UNICODE);
                /* encode surrogate pair in this case */
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
                /* The low surrogate consumes a second unit of `size`; stop
                   when the budget is exhausted after the high surrogate. */
                if (!size--)
                    break;
                *w = Py_UNICODE_LOW_SURROGATE(ch);
            }
            else {
                *w = ch;
            }
        }
#endif
    }
}
3294
3295 #ifdef HAVE_WCHAR_H
3296
3297 /* Convert a Unicode object to a wide character string.
3298
3299 - If w is NULL: return the number of wide characters (including the null
3300 character) required to convert the unicode object. Ignore size argument.
3301
3302 - Otherwise: return the number of wide characters (excluding the null
3303 character) written into w. Write at most size wide characters (including
3304 the null character). */
3305 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3306 PyUnicode_AsWideChar(PyObject *unicode,
3307 wchar_t *w,
3308 Py_ssize_t size)
3309 {
3310 Py_ssize_t res;
3311
3312 if (unicode == NULL) {
3313 PyErr_BadInternalCall();
3314 return -1;
3315 }
3316 if (!PyUnicode_Check(unicode)) {
3317 PyErr_BadArgument();
3318 return -1;
3319 }
3320
3321 res = unicode_get_widechar_size(unicode);
3322 if (w == NULL) {
3323 return res + 1;
3324 }
3325
3326 if (size > res) {
3327 size = res + 1;
3328 }
3329 else {
3330 res = size;
3331 }
3332 unicode_copy_as_widechar(unicode, w, size);
3333
3334 #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3335 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3336 non-Unicode locales and hence needs conversion first. */
3337 if (_Py_LocaleUsesNonUnicodeWchar()) {
3338 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3339 return -1;
3340 }
3341 }
3342 #endif
3343
3344 return res;
3345 }
3346
3347 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3348 PyUnicode_AsWideCharString(PyObject *unicode,
3349 Py_ssize_t *size)
3350 {
3351 wchar_t *buffer;
3352 Py_ssize_t buflen;
3353
3354 if (unicode == NULL) {
3355 PyErr_BadInternalCall();
3356 return NULL;
3357 }
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362
3363 buflen = unicode_get_widechar_size(unicode);
3364 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3365 if (buffer == NULL) {
3366 PyErr_NoMemory();
3367 return NULL;
3368 }
3369 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3370
3371 #if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3372 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3373 non-Unicode locales and hence needs conversion first. */
3374 if (_Py_LocaleUsesNonUnicodeWchar()) {
3375 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3376 return NULL;
3377 }
3378 }
3379 #endif
3380
3381 if (size != NULL) {
3382 *size = buflen;
3383 }
3384 else if (wcslen(buffer) != (size_t)buflen) {
3385 PyMem_Free(buffer);
3386 PyErr_SetString(PyExc_ValueError,
3387 "embedded null character");
3388 return NULL;
3389 }
3390 return buffer;
3391 }
3392
3393 #endif /* HAVE_WCHAR_H */
3394
3395 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3396 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3397 {
3398 wchar_t **p = (wchar_t **)ptr;
3399 if (obj == NULL) {
3400 #if !USE_UNICODE_WCHAR_CACHE
3401 PyMem_Free(*p);
3402 #endif /* USE_UNICODE_WCHAR_CACHE */
3403 *p = NULL;
3404 return 1;
3405 }
3406 if (PyUnicode_Check(obj)) {
3407 #if USE_UNICODE_WCHAR_CACHE
3408 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3409 if (*p == NULL) {
3410 return 0;
3411 }
3412 return 1;
3413 #else /* USE_UNICODE_WCHAR_CACHE */
3414 *p = PyUnicode_AsWideCharString(obj, NULL);
3415 if (*p == NULL) {
3416 return 0;
3417 }
3418 return Py_CLEANUP_SUPPORTED;
3419 #endif /* USE_UNICODE_WCHAR_CACHE */
3420 }
3421 PyErr_Format(PyExc_TypeError,
3422 "argument must be str, not %.50s",
3423 Py_TYPE(obj)->tp_name);
3424 return 0;
3425 }
3426
3427 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3428 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3429 {
3430 wchar_t **p = (wchar_t **)ptr;
3431 if (obj == NULL) {
3432 #if !USE_UNICODE_WCHAR_CACHE
3433 PyMem_Free(*p);
3434 #endif /* USE_UNICODE_WCHAR_CACHE */
3435 *p = NULL;
3436 return 1;
3437 }
3438 if (obj == Py_None) {
3439 *p = NULL;
3440 return 1;
3441 }
3442 if (PyUnicode_Check(obj)) {
3443 #if USE_UNICODE_WCHAR_CACHE
3444 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3445 if (*p == NULL) {
3446 return 0;
3447 }
3448 return 1;
3449 #else /* USE_UNICODE_WCHAR_CACHE */
3450 *p = PyUnicode_AsWideCharString(obj, NULL);
3451 if (*p == NULL) {
3452 return 0;
3453 }
3454 return Py_CLEANUP_SUPPORTED;
3455 #endif /* USE_UNICODE_WCHAR_CACHE */
3456 }
3457 PyErr_Format(PyExc_TypeError,
3458 "argument must be str or None, not %.50s",
3459 Py_TYPE(obj)->tp_name);
3460 return 0;
3461 }
3462
3463 PyObject *
PyUnicode_FromOrdinal(int ordinal)3464 PyUnicode_FromOrdinal(int ordinal)
3465 {
3466 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3467 PyErr_SetString(PyExc_ValueError,
3468 "chr() arg not in range(0x110000)");
3469 return NULL;
3470 }
3471
3472 return unicode_char((Py_UCS4)ordinal);
3473 }
3474
3475 PyObject *
PyUnicode_FromObject(PyObject * obj)3476 PyUnicode_FromObject(PyObject *obj)
3477 {
3478 /* XXX Perhaps we should make this API an alias of
3479 PyObject_Str() instead ?! */
3480 if (PyUnicode_CheckExact(obj)) {
3481 if (PyUnicode_READY(obj) == -1)
3482 return NULL;
3483 Py_INCREF(obj);
3484 return obj;
3485 }
3486 if (PyUnicode_Check(obj)) {
3487 /* For a Unicode subtype that's not a Unicode object,
3488 return a true Unicode object with the same data. */
3489 return _PyUnicode_Copy(obj);
3490 }
3491 PyErr_Format(PyExc_TypeError,
3492 "Can't convert '%.100s' object to str implicitly",
3493 Py_TYPE(obj)->tp_name);
3494 return NULL;
3495 }
3496
3497 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3498 PyUnicode_FromEncodedObject(PyObject *obj,
3499 const char *encoding,
3500 const char *errors)
3501 {
3502 Py_buffer buffer;
3503 PyObject *v;
3504
3505 if (obj == NULL) {
3506 PyErr_BadInternalCall();
3507 return NULL;
3508 }
3509
3510 /* Decoding bytes objects is the most common case and should be fast */
3511 if (PyBytes_Check(obj)) {
3512 if (PyBytes_GET_SIZE(obj) == 0) {
3513 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514 return NULL;
3515 }
3516 _Py_RETURN_UNICODE_EMPTY();
3517 }
3518 return PyUnicode_Decode(
3519 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3520 encoding, errors);
3521 }
3522
3523 if (PyUnicode_Check(obj)) {
3524 PyErr_SetString(PyExc_TypeError,
3525 "decoding str is not supported");
3526 return NULL;
3527 }
3528
3529 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3530 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3531 PyErr_Format(PyExc_TypeError,
3532 "decoding to str: need a bytes-like object, %.80s found",
3533 Py_TYPE(obj)->tp_name);
3534 return NULL;
3535 }
3536
3537 if (buffer.len == 0) {
3538 PyBuffer_Release(&buffer);
3539 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3540 return NULL;
3541 }
3542 _Py_RETURN_UNICODE_EMPTY();
3543 }
3544
3545 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3546 PyBuffer_Release(&buffer);
3547 return v;
3548 }
3549
/* Normalize an encoding name into `lower` (similar to
   encodings.normalize_encoding(), but the result is also lowercased).

   Alphanumeric characters and '.' are copied in lowercase.  Any run of
   other characters is collapsed into a single '_', which is only emitted
   between copied characters (never at the start; a trailing run is dropped).

   Returns 1 on success, with `lower` null-terminated, or 0 if the normalized
   name does not fit in lower_len - 1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *dst = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }
    *dst = '\0';
    return 1;
}
3598
3599 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3600 PyUnicode_Decode(const char *s,
3601 Py_ssize_t size,
3602 const char *encoding,
3603 const char *errors)
3604 {
3605 PyObject *buffer = NULL, *unicode;
3606 Py_buffer info;
3607 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3608
3609 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3610 return NULL;
3611 }
3612
3613 if (size == 0) {
3614 _Py_RETURN_UNICODE_EMPTY();
3615 }
3616
3617 if (encoding == NULL) {
3618 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3619 }
3620
3621 /* Shortcuts for common default encodings */
3622 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3623 char *lower = buflower;
3624
3625 /* Fast paths */
3626 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3627 lower += 3;
3628 if (*lower == '_') {
3629 /* Match "utf8" and "utf_8" */
3630 lower++;
3631 }
3632
3633 if (lower[0] == '8' && lower[1] == 0) {
3634 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3635 }
3636 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3637 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3638 }
3639 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3640 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3641 }
3642 }
3643 else {
3644 if (strcmp(lower, "ascii") == 0
3645 || strcmp(lower, "us_ascii") == 0) {
3646 return PyUnicode_DecodeASCII(s, size, errors);
3647 }
3648 #ifdef MS_WINDOWS
3649 else if (strcmp(lower, "mbcs") == 0) {
3650 return PyUnicode_DecodeMBCS(s, size, errors);
3651 }
3652 #endif
3653 else if (strcmp(lower, "latin1") == 0
3654 || strcmp(lower, "latin_1") == 0
3655 || strcmp(lower, "iso_8859_1") == 0
3656 || strcmp(lower, "iso8859_1") == 0) {
3657 return PyUnicode_DecodeLatin1(s, size, errors);
3658 }
3659 }
3660 }
3661
3662 /* Decode via the codec registry */
3663 buffer = NULL;
3664 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3665 goto onError;
3666 buffer = PyMemoryView_FromBuffer(&info);
3667 if (buffer == NULL)
3668 goto onError;
3669 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3670 if (unicode == NULL)
3671 goto onError;
3672 if (!PyUnicode_Check(unicode)) {
3673 PyErr_Format(PyExc_TypeError,
3674 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3675 "use codecs.decode() to decode to arbitrary types",
3676 encoding,
3677 Py_TYPE(unicode)->tp_name);
3678 Py_DECREF(unicode);
3679 goto onError;
3680 }
3681 Py_DECREF(buffer);
3682 return unicode_result(unicode);
3683
3684 onError:
3685 Py_XDECREF(buffer);
3686 return NULL;
3687 }
3688
3689 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3690 PyUnicode_AsDecodedObject(PyObject *unicode,
3691 const char *encoding,
3692 const char *errors)
3693 {
3694 if (!PyUnicode_Check(unicode)) {
3695 PyErr_BadArgument();
3696 return NULL;
3697 }
3698
3699 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3700 "PyUnicode_AsDecodedObject() is deprecated; "
3701 "use PyCodec_Decode() to decode from str", 1) < 0)
3702 return NULL;
3703
3704 if (encoding == NULL)
3705 encoding = PyUnicode_GetDefaultEncoding();
3706
3707 /* Decode via the codec registry */
3708 return PyCodec_Decode(unicode, encoding, errors);
3709 }
3710
3711 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3712 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3713 const char *encoding,
3714 const char *errors)
3715 {
3716 PyObject *v;
3717
3718 if (!PyUnicode_Check(unicode)) {
3719 PyErr_BadArgument();
3720 goto onError;
3721 }
3722
3723 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3724 "PyUnicode_AsDecodedUnicode() is deprecated; "
3725 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3726 return NULL;
3727
3728 if (encoding == NULL)
3729 encoding = PyUnicode_GetDefaultEncoding();
3730
3731 /* Decode via the codec registry */
3732 v = PyCodec_Decode(unicode, encoding, errors);
3733 if (v == NULL)
3734 goto onError;
3735 if (!PyUnicode_Check(v)) {
3736 PyErr_Format(PyExc_TypeError,
3737 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3738 "use codecs.decode() to decode to arbitrary types",
3739 encoding,
3740 Py_TYPE(unicode)->tp_name);
3741 Py_DECREF(v);
3742 goto onError;
3743 }
3744 return unicode_result(v);
3745
3746 onError:
3747 return NULL;
3748 }
3749
3750 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3751 PyUnicode_Encode(const Py_UNICODE *s,
3752 Py_ssize_t size,
3753 const char *encoding,
3754 const char *errors)
3755 {
3756 PyObject *v, *unicode;
3757
3758 unicode = PyUnicode_FromWideChar(s, size);
3759 if (unicode == NULL)
3760 return NULL;
3761 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3762 Py_DECREF(unicode);
3763 return v;
3764 }
3765
3766 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3767 PyUnicode_AsEncodedObject(PyObject *unicode,
3768 const char *encoding,
3769 const char *errors)
3770 {
3771 PyObject *v;
3772
3773 if (!PyUnicode_Check(unicode)) {
3774 PyErr_BadArgument();
3775 goto onError;
3776 }
3777
3778 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3779 "PyUnicode_AsEncodedObject() is deprecated; "
3780 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3781 "or PyCodec_Encode() for generic encoding", 1) < 0)
3782 return NULL;
3783
3784 if (encoding == NULL)
3785 encoding = PyUnicode_GetDefaultEncoding();
3786
3787 /* Encode via the codec registry */
3788 v = PyCodec_Encode(unicode, encoding, errors);
3789 if (v == NULL)
3790 goto onError;
3791 return v;
3792
3793 onError:
3794 return NULL;
3795 }
3796
3797
3798 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3799 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3800 int current_locale)
3801 {
3802 Py_ssize_t wlen;
3803 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3804 if (wstr == NULL) {
3805 return NULL;
3806 }
3807
3808 if ((size_t)wlen != wcslen(wstr)) {
3809 PyErr_SetString(PyExc_ValueError, "embedded null character");
3810 PyMem_Free(wstr);
3811 return NULL;
3812 }
3813
3814 char *str;
3815 size_t error_pos;
3816 const char *reason;
3817 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3818 current_locale, error_handler);
3819 PyMem_Free(wstr);
3820
3821 if (res != 0) {
3822 if (res == -2) {
3823 PyObject *exc;
3824 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3825 "locale", unicode,
3826 (Py_ssize_t)error_pos,
3827 (Py_ssize_t)(error_pos+1),
3828 reason);
3829 if (exc != NULL) {
3830 PyCodec_StrictErrors(exc);
3831 Py_DECREF(exc);
3832 }
3833 }
3834 else if (res == -3) {
3835 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3836 }
3837 else {
3838 PyErr_NoMemory();
3839 }
3840 return NULL;
3841 }
3842
3843 PyObject *bytes = PyBytes_FromString(str);
3844 PyMem_RawFree(str);
3845 return bytes;
3846 }
3847
3848 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3849 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3850 {
3851 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3852 return unicode_encode_locale(unicode, error_handler, 1);
3853 }
3854
3855 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3856 PyUnicode_EncodeFSDefault(PyObject *unicode)
3857 {
3858 PyInterpreterState *interp = _PyInterpreterState_GET();
3859 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3860 if (fs_codec->utf8) {
3861 return unicode_encode_utf8(unicode,
3862 fs_codec->error_handler,
3863 fs_codec->errors);
3864 }
3865 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3866 else if (fs_codec->encoding) {
3867 return PyUnicode_AsEncodedString(unicode,
3868 fs_codec->encoding,
3869 fs_codec->errors);
3870 }
3871 #endif
3872 else {
3873 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3874 machinery is not ready and so cannot be used:
3875 use wcstombs() in this case. */
3876 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3877 const wchar_t *filesystem_errors = config->filesystem_errors;
3878 assert(filesystem_errors != NULL);
3879 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3880 assert(errors != _Py_ERROR_UNKNOWN);
3881 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3882 return unicode_encode_utf8(unicode, errors, NULL);
3883 #else
3884 return unicode_encode_locale(unicode, errors, 0);
3885 #endif
3886 }
3887 }
3888
3889 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3890 PyUnicode_AsEncodedString(PyObject *unicode,
3891 const char *encoding,
3892 const char *errors)
3893 {
3894 PyObject *v;
3895 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3896
3897 if (!PyUnicode_Check(unicode)) {
3898 PyErr_BadArgument();
3899 return NULL;
3900 }
3901
3902 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3903 return NULL;
3904 }
3905
3906 if (encoding == NULL) {
3907 return _PyUnicode_AsUTF8String(unicode, errors);
3908 }
3909
3910 /* Shortcuts for common default encodings */
3911 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3912 char *lower = buflower;
3913
3914 /* Fast paths */
3915 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3916 lower += 3;
3917 if (*lower == '_') {
3918 /* Match "utf8" and "utf_8" */
3919 lower++;
3920 }
3921
3922 if (lower[0] == '8' && lower[1] == 0) {
3923 return _PyUnicode_AsUTF8String(unicode, errors);
3924 }
3925 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3926 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3927 }
3928 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3929 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3930 }
3931 }
3932 else {
3933 if (strcmp(lower, "ascii") == 0
3934 || strcmp(lower, "us_ascii") == 0) {
3935 return _PyUnicode_AsASCIIString(unicode, errors);
3936 }
3937 #ifdef MS_WINDOWS
3938 else if (strcmp(lower, "mbcs") == 0) {
3939 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3940 }
3941 #endif
3942 else if (strcmp(lower, "latin1") == 0 ||
3943 strcmp(lower, "latin_1") == 0 ||
3944 strcmp(lower, "iso_8859_1") == 0 ||
3945 strcmp(lower, "iso8859_1") == 0) {
3946 return _PyUnicode_AsLatin1String(unicode, errors);
3947 }
3948 }
3949 }
3950
3951 /* Encode via the codec registry */
3952 v = _PyCodec_EncodeText(unicode, encoding, errors);
3953 if (v == NULL)
3954 return NULL;
3955
3956 /* The normal path */
3957 if (PyBytes_Check(v))
3958 return v;
3959
3960 /* If the codec returns a buffer, raise a warning and convert to bytes */
3961 if (PyByteArray_Check(v)) {
3962 int error;
3963 PyObject *b;
3964
3965 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3966 "encoder %s returned bytearray instead of bytes; "
3967 "use codecs.encode() to encode to arbitrary types",
3968 encoding);
3969 if (error) {
3970 Py_DECREF(v);
3971 return NULL;
3972 }
3973
3974 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3975 PyByteArray_GET_SIZE(v));
3976 Py_DECREF(v);
3977 return b;
3978 }
3979
3980 PyErr_Format(PyExc_TypeError,
3981 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3982 "use codecs.encode() to encode to arbitrary types",
3983 encoding,
3984 Py_TYPE(v)->tp_name);
3985 Py_DECREF(v);
3986 return NULL;
3987 }
3988
3989 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3990 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3991 const char *encoding,
3992 const char *errors)
3993 {
3994 PyObject *v;
3995
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 goto onError;
3999 }
4000
4001 if (PyErr_WarnEx(PyExc_DeprecationWarning,
4002 "PyUnicode_AsEncodedUnicode() is deprecated; "
4003 "use PyCodec_Encode() to encode from str to str", 1) < 0)
4004 return NULL;
4005
4006 if (encoding == NULL)
4007 encoding = PyUnicode_GetDefaultEncoding();
4008
4009 /* Encode via the codec registry */
4010 v = PyCodec_Encode(unicode, encoding, errors);
4011 if (v == NULL)
4012 goto onError;
4013 if (!PyUnicode_Check(v)) {
4014 PyErr_Format(PyExc_TypeError,
4015 "'%.400s' encoder returned '%.400s' instead of 'str'; "
4016 "use codecs.encode() to encode to arbitrary types",
4017 encoding,
4018 Py_TYPE(v)->tp_name);
4019 Py_DECREF(v);
4020 goto onError;
4021 }
4022 return v;
4023
4024 onError:
4025 return NULL;
4026 }
4027
4028 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)4029 unicode_decode_locale(const char *str, Py_ssize_t len,
4030 _Py_error_handler errors, int current_locale)
4031 {
4032 if (str[len] != '\0' || (size_t)len != strlen(str)) {
4033 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4034 return NULL;
4035 }
4036
4037 wchar_t *wstr;
4038 size_t wlen;
4039 const char *reason;
4040 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4041 current_locale, errors);
4042 if (res != 0) {
4043 if (res == -2) {
4044 PyObject *exc;
4045 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4046 "locale", str, len,
4047 (Py_ssize_t)wlen,
4048 (Py_ssize_t)(wlen + 1),
4049 reason);
4050 if (exc != NULL) {
4051 PyCodec_StrictErrors(exc);
4052 Py_DECREF(exc);
4053 }
4054 }
4055 else if (res == -3) {
4056 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4057 }
4058 else {
4059 PyErr_NoMemory();
4060 }
4061 return NULL;
4062 }
4063
4064 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4065 PyMem_RawFree(wstr);
4066 return unicode;
4067 }
4068
4069 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4070 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4071 const char *errors)
4072 {
4073 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4074 return unicode_decode_locale(str, len, error_handler, 1);
4075 }
4076
4077 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4078 PyUnicode_DecodeLocale(const char *str, const char *errors)
4079 {
4080 Py_ssize_t size = (Py_ssize_t)strlen(str);
4081 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4082 return unicode_decode_locale(str, size, error_handler, 1);
4083 }
4084
4085
4086 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4087 PyUnicode_DecodeFSDefault(const char *s) {
4088 Py_ssize_t size = (Py_ssize_t)strlen(s);
4089 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4090 }
4091
4092 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4093 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4094 {
4095 PyInterpreterState *interp = _PyInterpreterState_GET();
4096 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4097 if (fs_codec->utf8) {
4098 return unicode_decode_utf8(s, size,
4099 fs_codec->error_handler,
4100 fs_codec->errors,
4101 NULL);
4102 }
4103 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4104 else if (fs_codec->encoding) {
4105 return PyUnicode_Decode(s, size,
4106 fs_codec->encoding,
4107 fs_codec->errors);
4108 }
4109 #endif
4110 else {
4111 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4112 machinery is not ready and so cannot be used:
4113 use mbstowcs() in this case. */
4114 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4115 const wchar_t *filesystem_errors = config->filesystem_errors;
4116 assert(filesystem_errors != NULL);
4117 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4118 assert(errors != _Py_ERROR_UNKNOWN);
4119 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4120 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4121 #else
4122 return unicode_decode_locale(s, size, errors, 0);
4123 #endif
4124 }
4125 }
4126
4127
4128 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4129 PyUnicode_FSConverter(PyObject* arg, void* addr)
4130 {
4131 PyObject *path = NULL;
4132 PyObject *output = NULL;
4133 Py_ssize_t size;
4134 const char *data;
4135 if (arg == NULL) {
4136 Py_DECREF(*(PyObject**)addr);
4137 *(PyObject**)addr = NULL;
4138 return 1;
4139 }
4140 path = PyOS_FSPath(arg);
4141 if (path == NULL) {
4142 return 0;
4143 }
4144 if (PyBytes_Check(path)) {
4145 output = path;
4146 }
4147 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4148 output = PyUnicode_EncodeFSDefault(path);
4149 Py_DECREF(path);
4150 if (!output) {
4151 return 0;
4152 }
4153 assert(PyBytes_Check(output));
4154 }
4155
4156 size = PyBytes_GET_SIZE(output);
4157 data = PyBytes_AS_STRING(output);
4158 if ((size_t)size != strlen(data)) {
4159 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4160 Py_DECREF(output);
4161 return 0;
4162 }
4163 *(PyObject**)addr = output;
4164 return Py_CLEANUP_SUPPORTED;
4165 }
4166
4167
4168 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4169 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4170 {
4171 int is_buffer = 0;
4172 PyObject *path = NULL;
4173 PyObject *output = NULL;
4174 if (arg == NULL) {
4175 Py_DECREF(*(PyObject**)addr);
4176 *(PyObject**)addr = NULL;
4177 return 1;
4178 }
4179
4180 is_buffer = PyObject_CheckBuffer(arg);
4181 if (!is_buffer) {
4182 path = PyOS_FSPath(arg);
4183 if (path == NULL) {
4184 return 0;
4185 }
4186 }
4187 else {
4188 path = arg;
4189 Py_INCREF(arg);
4190 }
4191
4192 if (PyUnicode_Check(path)) {
4193 output = path;
4194 }
4195 else if (PyBytes_Check(path) || is_buffer) {
4196 PyObject *path_bytes = NULL;
4197
4198 if (!PyBytes_Check(path) &&
4199 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4200 "path should be string, bytes, or os.PathLike, not %.200s",
4201 Py_TYPE(arg)->tp_name)) {
4202 Py_DECREF(path);
4203 return 0;
4204 }
4205 path_bytes = PyBytes_FromObject(path);
4206 Py_DECREF(path);
4207 if (!path_bytes) {
4208 return 0;
4209 }
4210 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4211 PyBytes_GET_SIZE(path_bytes));
4212 Py_DECREF(path_bytes);
4213 if (!output) {
4214 return 0;
4215 }
4216 }
4217 else {
4218 PyErr_Format(PyExc_TypeError,
4219 "path should be string, bytes, or os.PathLike, not %.200s",
4220 Py_TYPE(arg)->tp_name);
4221 Py_DECREF(path);
4222 return 0;
4223 }
4224 if (PyUnicode_READY(output) == -1) {
4225 Py_DECREF(output);
4226 return 0;
4227 }
4228 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4229 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4230 PyErr_SetString(PyExc_ValueError, "embedded null character");
4231 Py_DECREF(output);
4232 return 0;
4233 }
4234 *(PyObject**)addr = output;
4235 return Py_CLEANUP_SUPPORTED;
4236 }
4237
4238
4239 static int unicode_fill_utf8(PyObject *unicode);
4240
4241 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4242 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4243 {
4244 if (!PyUnicode_Check(unicode)) {
4245 PyErr_BadArgument();
4246 return NULL;
4247 }
4248 if (PyUnicode_READY(unicode) == -1)
4249 return NULL;
4250
4251 if (PyUnicode_UTF8(unicode) == NULL) {
4252 if (unicode_fill_utf8(unicode) == -1) {
4253 return NULL;
4254 }
4255 }
4256
4257 if (psize)
4258 *psize = PyUnicode_UTF8_LENGTH(unicode);
4259 return PyUnicode_UTF8(unicode);
4260 }
4261
4262 const char *
PyUnicode_AsUTF8(PyObject * unicode)4263 PyUnicode_AsUTF8(PyObject *unicode)
4264 {
4265 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4266 }
4267
4268 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4269 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4270 {
4271 if (!PyUnicode_Check(unicode)) {
4272 PyErr_BadArgument();
4273 return NULL;
4274 }
4275 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4276 if (w == NULL) {
4277 /* Non-ASCII compact unicode object */
4278 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4279 assert(PyUnicode_IS_READY(unicode));
4280
4281 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4282 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4283 PyErr_NoMemory();
4284 return NULL;
4285 }
4286 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4287 if (w == NULL) {
4288 PyErr_NoMemory();
4289 return NULL;
4290 }
4291 unicode_copy_as_widechar(unicode, w, wlen + 1);
4292 _PyUnicode_WSTR(unicode) = w;
4293 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4294 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4295 }
4296 }
4297 if (size != NULL)
4298 *size = PyUnicode_WSTR_LENGTH(unicode);
4299 return w;
4300 }
4301
4302 /* Deprecated APIs */
4303
4304 _Py_COMP_DIAG_PUSH
4305 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4306
4307 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4308 PyUnicode_AsUnicode(PyObject *unicode)
4309 {
4310 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4311 }
4312
4313 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4314 _PyUnicode_AsUnicode(PyObject *unicode)
4315 {
4316 Py_ssize_t size;
4317 const Py_UNICODE *wstr;
4318
4319 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4320 if (wstr && wcslen(wstr) != (size_t)size) {
4321 PyErr_SetString(PyExc_ValueError, "embedded null character");
4322 return NULL;
4323 }
4324 return wstr;
4325 }
4326
4327
4328 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4329 PyUnicode_GetSize(PyObject *unicode)
4330 {
4331 if (!PyUnicode_Check(unicode)) {
4332 PyErr_BadArgument();
4333 goto onError;
4334 }
4335 if (_PyUnicode_WSTR(unicode) == NULL) {
4336 if (PyUnicode_AsUnicode(unicode) == NULL)
4337 goto onError;
4338 }
4339 return PyUnicode_WSTR_LENGTH(unicode);
4340
4341 onError:
4342 return -1;
4343 }
4344
4345 _Py_COMP_DIAG_POP
4346
4347 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4348 PyUnicode_GetLength(PyObject *unicode)
4349 {
4350 if (!PyUnicode_Check(unicode)) {
4351 PyErr_BadArgument();
4352 return -1;
4353 }
4354 if (PyUnicode_READY(unicode) == -1)
4355 return -1;
4356 return PyUnicode_GET_LENGTH(unicode);
4357 }
4358
4359 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4360 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4361 {
4362 const void *data;
4363 int kind;
4364
4365 if (!PyUnicode_Check(unicode)) {
4366 PyErr_BadArgument();
4367 return (Py_UCS4)-1;
4368 }
4369 if (PyUnicode_READY(unicode) == -1) {
4370 return (Py_UCS4)-1;
4371 }
4372 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4373 PyErr_SetString(PyExc_IndexError, "string index out of range");
4374 return (Py_UCS4)-1;
4375 }
4376 data = PyUnicode_DATA(unicode);
4377 kind = PyUnicode_KIND(unicode);
4378 return PyUnicode_READ(kind, data, index);
4379 }
4380
4381 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4382 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4383 {
4384 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4385 PyErr_BadArgument();
4386 return -1;
4387 }
4388 assert(PyUnicode_IS_READY(unicode));
4389 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4390 PyErr_SetString(PyExc_IndexError, "string index out of range");
4391 return -1;
4392 }
4393 if (unicode_check_modifiable(unicode))
4394 return -1;
4395 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4396 PyErr_SetString(PyExc_ValueError, "character out of range");
4397 return -1;
4398 }
4399 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4400 index, ch);
4401 return 0;
4402 }
4403
/* Return the name of the default encoding, which is always the fixed
   string "utf-8".  The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4409
4410 /* create or adjust a UnicodeDecodeError */
4411 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4412 make_decode_exception(PyObject **exceptionObject,
4413 const char *encoding,
4414 const char *input, Py_ssize_t length,
4415 Py_ssize_t startpos, Py_ssize_t endpos,
4416 const char *reason)
4417 {
4418 if (*exceptionObject == NULL) {
4419 *exceptionObject = PyUnicodeDecodeError_Create(
4420 encoding, input, length, startpos, endpos, reason);
4421 }
4422 else {
4423 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4424 goto onError;
4425 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4426 goto onError;
4427 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4428 goto onError;
4429 }
4430 return;
4431
4432 onError:
4433 Py_CLEAR(*exceptionObject);
4434 }
4435
4436 #ifdef MS_WINDOWS
4437 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4438 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4439 {
4440 if (newsize > *size) {
4441 wchar_t *newbuf = *buf;
4442 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4443 PyErr_NoMemory();
4444 return -1;
4445 }
4446 *buf = newbuf;
4447 }
4448 *size = newsize;
4449 return 0;
4450 }
4451
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   Parameters:
     errors, errorHandler  - name of the error handler and a cache slot for
                             the looked-up callable (filled in on first use).
     encoding, reason      - passed to the UnicodeDecodeError.
     input, inend          - in/out: start and end of the input bytes; they
                             are re-read from the exception after the
                             callback ran.
     startinpos, endinpos  - offsets of the faulty range; *endinpos is set to
                             the position returned by the callback.
     exceptionObject       - in/out: the exception, created or updated by
                             make_decode_exception().
     inptr                 - out: where decoding should resume.
     buf, bufsize, outpos  - in/out: wchar_t output buffer, its capacity and
                             the current write position.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple format; the text after "Un;" doubles as the
       TypeError message used below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up only once; the caller keeps the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Determine the length of the replacement in wchar_t units. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    /* NOTE(review): presumably the size reported for a NULL buffer counts
       the trailing null character, hence the decrement -- confirm. */
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked against PY_SSIZE_T_MAX before it is done. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that is larger than
           what is strictly required and doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Copy the replacement into the output; the return value is ignored. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4570 #endif /* MS_WINDOWS */
4571
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter.

   Looks up the error handler if needed, creates or updates the
   UnicodeDecodeError, calls the handler, validates its (str, int) result,
   writes the replacement string to `writer` and updates the input state:
   *input and *inend are re-read from the exception object, *endinpos is set
   to the position returned by the handler and *inptr to the matching
   pointer.

   Returns 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple format; the text after "Un;" doubles as the
       TypeError message used below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up only once; the caller keeps the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that were still pending after the faulty range,
       measured against the old input before it is replaced below. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* A replacement longer than one character enlarges the expected
       output length. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Only publish the new input position once the write succeeded. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4671
4672 /* --- UTF-7 Codec -------------------------------------------------------- */
4673
4674 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4675
/* Base-64 helper macros for the UTF-7 codec.  Every macro may evaluate its
   argument more than once, so only pass side-effect-free expressions. */

/* Non-zero when c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* The 6-bit value of base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* The base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true when byte c, met outside a shift sequence in UTF-7
 * input, stands for itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4706
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, hence
 * declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is only used as a table index after the range check 0 < c < 128, so
 * the null character and non-ASCII code points are never encoded directly.
 * The macro evaluates c several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4752
4753 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4754 PyUnicode_DecodeUTF7(const char *s,
4755 Py_ssize_t size,
4756 const char *errors)
4757 {
4758 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4759 }
4760
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - the UTF-7 encoded input bytes.
 * errors   - error handler name, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed - if not NULL, receives the number of input bytes decoded;
 *            input ending inside a shift sequence is then not an error,
 *            the unfinished sequence is simply left undecoded.
 *
 * Returns the decoded string, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside "+...-" */
    Py_ssize_t shiftOutStart;        /* writer.pos when the shift began */
    unsigned int base64bits = 0;     /* number of pending bits in buffer */
    unsigned long base64buffer = 0;  /* pending base-64 bits */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by goto from the end-of-string handling below when
           the error handler moved the input position back into the data. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only written out when the
                   terminating byte is directly decodable; in all cases it
                   is dropped from the state afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the shift sequence starts for error reports
               and for backing off in incremental mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The faulty range is [startinpos, current position). */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have pointed us back into the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report everything before its '+'
               as consumed and drop the output produced since then. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* NOTE(review): presumably a fresh string is built here so
                   that its character width reflects only the characters
                   that are kept -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4964
4965
/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL on error.

   base64SetO       - if non-zero, "Set O" characters are base-64 encoded
                      instead of being written directly.
   base64WhiteSpace - if non-zero, whitespace characters are base-64 encoded
                      instead of being written directly.
   errors           - not used by this function body. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                 /* non-zero while inside "+...-" */
    Py_ssize_t i;
    unsigned int base64bits = 0;     /* number of pending bits in buffer */
    unsigned long base64buffer = 0;  /* pending bits not yet written */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    /* The output buffer is sized at 8 bytes per input character; guard the
       multiplication against overflow. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a shift sequence and encode this character in it */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        /* Append 16 bits and flush every complete 6-bit group. */
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush the final partial group, padded with zero bits on the right. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* Terminate a shift sequence that is still open at the end. */
    if (inShift)
        *out++ = '-';
    /* Shrink the bytes object to the number of bytes actually written. */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
5067 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)5068 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5069 Py_ssize_t size,
5070 int base64SetO,
5071 int base64WhiteSpace,
5072 const char *errors)
5073 {
5074 PyObject *result;
5075 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5076 if (tmp == NULL)
5077 return NULL;
5078 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
5079 base64WhiteSpace, errors);
5080 Py_DECREF(tmp);
5081 return result;
5082 }
5083
5084 #undef IS_BASE64
5085 #undef FROM_BASE64
5086 #undef TO_BASE64
5087 #undef DECODE_DIRECT
5088 #undef ENCODE_DIRECT
5089
5090 /* --- UTF-8 Codec -------------------------------------------------------- */
5091
5092 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5093 PyUnicode_DecodeUTF8(const char *s,
5094 Py_ssize_t size,
5095 const char *errors)
5096 {
5097 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5098 }
5099
5100 #include "stringlib/asciilib.h"
5101 #include "stringlib/codecs.h"
5102 #include "stringlib/undef.h"
5103
5104 #include "stringlib/ucs1lib.h"
5105 #include "stringlib/codecs.h"
5106 #include "stringlib/undef.h"
5107
5108 #include "stringlib/ucs2lib.h"
5109 #include "stringlib/codecs.h"
5110 #include "stringlib/undef.h"
5111
5112 #include "stringlib/ucs4lib.h"
5113 #include "stringlib/codecs.h"
5114 #include "stringlib/undef.h"
5115
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (bytes without the 0x80 bit) from
   [start, end) into dest and return the length of that run.

   Copying stops at the first byte with the high bit set or at `end`.
   dest is asserted to be size_t-aligned in the word-at-a-time path. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* Source and destination are both aligned: test and copy one
           size_t word per iteration while no byte has its high bit set. */
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* Finish byte by byte: the tail, or the word containing the first
           non-ASCII byte. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* Source not aligned at the start (or fast path compiled out): only
       scan for the end of the ASCII run here, then copy it in one go. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
5179
/* Decode `size` bytes of UTF-8 data at `s` into a new str object.

   error_handler - pre-resolved handler, or _Py_ERROR_UNKNOWN to resolve it
                   from `errors` on the first decoding error.
   errors        - error handler name; also passed to the generic callback
                   path for handlers that are not special-cased here.
   consumed      - if not NULL, receives the number of bytes decoded;
                   decoding then stops quietly at a truncated sequence at
                   the end of the input instead of reporting an error.

   Returns the decoded string, or NULL on error. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }

    const char *starts = s;
    const char *end = s + size;

    // fast path: try ASCII string.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (s == end) {
        return u;
    }

    // Use _PyUnicodeWriter after fast path is failed.
    // The writer takes over `u` as its buffer and continues after the
    // ASCII prefix that was already copied.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = s - starts;

    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the decoder specialised for the writer's current character
           width; it advances s and writer.pos itself. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Values 0..4 are status codes from the decoder; anything else is
           a character that has to be written through the writer. */
        switch (ch) {
        case 0:
            /* finished, or (incremental mode) stopped at truncated data */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            /* the faulty range covers ch - 1 bytes */
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Resolve the handler name lazily, on the first error only. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the faulty bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole faulty range */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each faulty byte b becomes the code point 0xdc00 + b, which
               needs at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* any other handler goes through the generic callback */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5325
5326
5327 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5328 PyUnicode_DecodeUTF8Stateful(const char *s,
5329 Py_ssize_t size,
5330 const char *errors,
5331 Py_ssize_t *consumed)
5332 {
5333 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5334 }
5335
5336
5337 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5338 non-zero, use strict error handler otherwise.
5339
5340 On success, write a pointer to a newly allocated wide character string into
5341 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5342 (in number of wchar_t units) into *wlen (if wlen is set).
5343
5344 On memory allocation failure, return -1.
5345
5346 On decoding error (if surrogateescape is zero), return -2. If wlen is
5347 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5348 is not NULL, write the decoding error message into *reason. */
5349 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5350 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5351 const char **reason, _Py_error_handler errors)
5352 {
5353 const char *orig_s = s;
5354 const char *e;
5355 wchar_t *unicode;
5356 Py_ssize_t outpos;
5357
5358 int surrogateescape = 0;
5359 int surrogatepass = 0;
5360 switch (errors)
5361 {
5362 case _Py_ERROR_STRICT:
5363 break;
5364 case _Py_ERROR_SURROGATEESCAPE:
5365 surrogateescape = 1;
5366 break;
5367 case _Py_ERROR_SURROGATEPASS:
5368 surrogatepass = 1;
5369 break;
5370 default:
5371 return -3;
5372 }
5373
5374 /* Note: size will always be longer than the resulting Unicode
5375 character count */
5376 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5377 return -1;
5378 }
5379
5380 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5381 if (!unicode) {
5382 return -1;
5383 }
5384
5385 /* Unpack UTF-8 encoded data */
5386 e = s + size;
5387 outpos = 0;
5388 while (s < e) {
5389 Py_UCS4 ch;
5390 #if SIZEOF_WCHAR_T == 4
5391 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5392 #else
5393 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5394 #endif
5395 if (ch > 0xFF) {
5396 #if SIZEOF_WCHAR_T == 4
5397 Py_UNREACHABLE();
5398 #else
5399 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5400 /* write a surrogate pair */
5401 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5402 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5403 #endif
5404 }
5405 else {
5406 if (!ch && s == e) {
5407 break;
5408 }
5409
5410 if (surrogateescape) {
5411 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5412 }
5413 else {
5414 /* Is it a valid three-byte code? */
5415 if (surrogatepass
5416 && (e - s) >= 3
5417 && (s[0] & 0xf0) == 0xe0
5418 && (s[1] & 0xc0) == 0x80
5419 && (s[2] & 0xc0) == 0x80)
5420 {
5421 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5422 s += 3;
5423 unicode[outpos++] = ch;
5424 }
5425 else {
5426 PyMem_RawFree(unicode );
5427 if (reason != NULL) {
5428 switch (ch) {
5429 case 0:
5430 *reason = "unexpected end of data";
5431 break;
5432 case 1:
5433 *reason = "invalid start byte";
5434 break;
5435 /* 2, 3, 4 */
5436 default:
5437 *reason = "invalid continuation byte";
5438 break;
5439 }
5440 }
5441 if (wlen != NULL) {
5442 *wlen = s - orig_s;
5443 }
5444 return -2;
5445 }
5446 }
5447 }
5448 }
5449 unicode[outpos] = L'\0';
5450 if (wlen) {
5451 *wlen = outpos;
5452 }
5453 *wstr = unicode;
5454 return 0;
5455 }
5456
5457
5458 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5459 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5460 size_t *wlen)
5461 {
5462 wchar_t *wstr;
5463 int res = _Py_DecodeUTF8Ex(arg, arglen,
5464 &wstr, wlen,
5465 NULL, _Py_ERROR_SURROGATEESCAPE);
5466 if (res != 0) {
5467 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5468 assert(res != -3);
5469 if (wlen) {
5470 *wlen = (size_t)res;
5471 }
5472 return NULL;
5473 }
5474 return wstr;
5475 }
5476
5477
/* UTF-8 encoder supporting the strict, surrogateescape and surrogatepass
   error handlers.

   On success, return 0 and write the newly allocated character string into
   *str.  Free it with PyMem_RawFree() if raw_malloc is non-zero, otherwise
   with PyMem_Free().

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If the error handler is not supported, return -3. */
5488 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5489 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5490 const char **reason, int raw_malloc, _Py_error_handler errors)
5491 {
5492 const Py_ssize_t max_char_size = 4;
5493 Py_ssize_t len = wcslen(text);
5494
5495 assert(len >= 0);
5496
5497 int surrogateescape = 0;
5498 int surrogatepass = 0;
5499 switch (errors)
5500 {
5501 case _Py_ERROR_STRICT:
5502 break;
5503 case _Py_ERROR_SURROGATEESCAPE:
5504 surrogateescape = 1;
5505 break;
5506 case _Py_ERROR_SURROGATEPASS:
5507 surrogatepass = 1;
5508 break;
5509 default:
5510 return -3;
5511 }
5512
5513 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5514 return -1;
5515 }
5516 char *bytes;
5517 if (raw_malloc) {
5518 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5519 }
5520 else {
5521 bytes = PyMem_Malloc((len + 1) * max_char_size);
5522 }
5523 if (bytes == NULL) {
5524 return -1;
5525 }
5526
5527 char *p = bytes;
5528 Py_ssize_t i;
5529 for (i = 0; i < len; ) {
5530 Py_ssize_t ch_pos = i;
5531 Py_UCS4 ch = text[i];
5532 i++;
5533 #if Py_UNICODE_SIZE == 2
5534 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5535 && i < len
5536 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5537 {
5538 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5539 i++;
5540 }
5541 #endif
5542
5543 if (ch < 0x80) {
5544 /* Encode ASCII */
5545 *p++ = (char) ch;
5546
5547 }
5548 else if (ch < 0x0800) {
5549 /* Encode Latin-1 */
5550 *p++ = (char)(0xc0 | (ch >> 6));
5551 *p++ = (char)(0x80 | (ch & 0x3f));
5552 }
5553 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5554 /* surrogateescape error handler */
5555 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5556 if (error_pos != NULL) {
5557 *error_pos = (size_t)ch_pos;
5558 }
5559 if (reason != NULL) {
5560 *reason = "encoding error";
5561 }
5562 if (raw_malloc) {
5563 PyMem_RawFree(bytes);
5564 }
5565 else {
5566 PyMem_Free(bytes);
5567 }
5568 return -2;
5569 }
5570 *p++ = (char)(ch & 0xff);
5571 }
5572 else if (ch < 0x10000) {
5573 *p++ = (char)(0xe0 | (ch >> 12));
5574 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5575 *p++ = (char)(0x80 | (ch & 0x3f));
5576 }
5577 else { /* ch >= 0x10000 */
5578 assert(ch <= MAX_UNICODE);
5579 /* Encode UCS4 Unicode ordinals */
5580 *p++ = (char)(0xf0 | (ch >> 18));
5581 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5582 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5583 *p++ = (char)(0x80 | (ch & 0x3f));
5584 }
5585 }
5586 *p++ = '\0';
5587
5588 size_t final_size = (p - bytes);
5589 char *bytes2;
5590 if (raw_malloc) {
5591 bytes2 = PyMem_RawRealloc(bytes, final_size);
5592 }
5593 else {
5594 bytes2 = PyMem_Realloc(bytes, final_size);
5595 }
5596 if (bytes2 == NULL) {
5597 if (error_pos != NULL) {
5598 *error_pos = (size_t)-1;
5599 }
5600 if (raw_malloc) {
5601 PyMem_RawFree(bytes);
5602 }
5603 else {
5604 PyMem_Free(bytes);
5605 }
5606 return -1;
5607 }
5608 *str = bytes2;
5609 return 0;
5610 }
5611
5612
5613 /* Primary internal function which creates utf8 encoded bytes objects.
5614
5615 Allocation strategy: if the string is short, convert into a stack buffer
5616 and allocate exactly as much space needed at the end. Else allocate the
5617 maximum possible needed (4 result bytes per Unicode character), and return
5618 the excess memory at the end.
5619 */
5620 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5621 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5622 const char *errors)
5623 {
5624 if (!PyUnicode_Check(unicode)) {
5625 PyErr_BadArgument();
5626 return NULL;
5627 }
5628
5629 if (PyUnicode_READY(unicode) == -1)
5630 return NULL;
5631
5632 if (PyUnicode_UTF8(unicode))
5633 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5634 PyUnicode_UTF8_LENGTH(unicode));
5635
5636 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5637 const void *data = PyUnicode_DATA(unicode);
5638 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5639
5640 _PyBytesWriter writer;
5641 char *end;
5642
5643 switch (kind) {
5644 default:
5645 Py_UNREACHABLE();
5646 case PyUnicode_1BYTE_KIND:
5647 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5648 assert(!PyUnicode_IS_ASCII(unicode));
5649 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5650 break;
5651 case PyUnicode_2BYTE_KIND:
5652 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5653 break;
5654 case PyUnicode_4BYTE_KIND:
5655 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5656 break;
5657 }
5658
5659 if (end == NULL) {
5660 _PyBytesWriter_Dealloc(&writer);
5661 return NULL;
5662 }
5663 return _PyBytesWriter_Finish(&writer, end);
5664 }
5665
5666 static int
unicode_fill_utf8(PyObject * unicode)5667 unicode_fill_utf8(PyObject *unicode)
5668 {
5669 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5670 assert(!PyUnicode_IS_ASCII(unicode));
5671
5672 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5673 const void *data = PyUnicode_DATA(unicode);
5674 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5675
5676 _PyBytesWriter writer;
5677 char *end;
5678
5679 switch (kind) {
5680 default:
5681 Py_UNREACHABLE();
5682 case PyUnicode_1BYTE_KIND:
5683 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5684 _Py_ERROR_STRICT, NULL);
5685 break;
5686 case PyUnicode_2BYTE_KIND:
5687 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5688 _Py_ERROR_STRICT, NULL);
5689 break;
5690 case PyUnicode_4BYTE_KIND:
5691 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5692 _Py_ERROR_STRICT, NULL);
5693 break;
5694 }
5695 if (end == NULL) {
5696 _PyBytesWriter_Dealloc(&writer);
5697 return -1;
5698 }
5699
5700 const char *start = writer.use_small_buffer ? writer.small_buffer :
5701 PyBytes_AS_STRING(writer.buffer);
5702 Py_ssize_t len = end - start;
5703
5704 char *cache = PyObject_Malloc(len + 1);
5705 if (cache == NULL) {
5706 _PyBytesWriter_Dealloc(&writer);
5707 PyErr_NoMemory();
5708 return -1;
5709 }
5710 _PyUnicode_UTF8(unicode) = cache;
5711 _PyUnicode_UTF8_LENGTH(unicode) = len;
5712 memcpy(cache, start, len);
5713 cache[len] = '\0';
5714 _PyBytesWriter_Dealloc(&writer);
5715 return 0;
5716 }
5717
5718 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5719 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5720 {
5721 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5722 }
5723
5724
5725 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5726 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5727 Py_ssize_t size,
5728 const char *errors)
5729 {
5730 PyObject *v, *unicode;
5731
5732 unicode = PyUnicode_FromWideChar(s, size);
5733 if (unicode == NULL)
5734 return NULL;
5735 v = _PyUnicode_AsUTF8String(unicode, errors);
5736 Py_DECREF(unicode);
5737 return v;
5738 }
5739
5740 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5741 PyUnicode_AsUTF8String(PyObject *unicode)
5742 {
5743 return _PyUnicode_AsUTF8String(unicode, NULL);
5744 }
5745
5746 /* --- UTF-32 Codec ------------------------------------------------------- */
5747
5748 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5749 PyUnicode_DecodeUTF32(const char *s,
5750 Py_ssize_t size,
5751 const char *errors,
5752 int *byteorder)
5753 {
5754 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5755 }
5756
/* Decode size bytes of UTF-32 data starting at s into a str object.

   errors     name of the error handler, passed through to
              unicode_decode_call_errorhandler_writer().
   byteorder  if not NULL, read on entry (<0 selects little endian, >0 big
              endian, 0 the default with BOM detection) and, when it was 0
              and at least 4 input bytes exist, written back with the
              detected order.
   consumed   if not NULL, an incomplete trailing unit is not an error;
              decoding stops there and the number of bytes consumed is
              stored into *consumed.

   Return a new reference, or NULL with an exception set on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first unit is always assembled as little endian, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo still 0 (no BOM found), fall back to the platform's order. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per (possibly partial) 4-byte input unit. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        /* Fast path: write directly into the writer's buffer for as long as
           each code point fits the buffer's current maximum character and is
           not a surrogate.  On break, q still points at the unit that was
           not written and ch holds its value. */
        if (e - q >= 4) {
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* NOTE(review): the surrogate test is skipped for 1-byte
                       buffers, presumably because ch > maxch already rejects
                       surrogates there -- confirm. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Classify why the fast path stopped. */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* Every complete unit has been written (or there was none).
               Finish if the input is exhausted, or if the caller asked for
               the consumed count and will handle the leftover bytes. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch is wider than the buffer's current maximum character: if it
               is a valid code point, write it through the writer and resume
               the fast path. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5901
5902 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5903 _PyUnicode_EncodeUTF32(PyObject *str,
5904 const char *errors,
5905 int byteorder)
5906 {
5907 enum PyUnicode_Kind kind;
5908 const void *data;
5909 Py_ssize_t len;
5910 PyObject *v;
5911 uint32_t *out;
5912 #if PY_LITTLE_ENDIAN
5913 int native_ordering = byteorder <= 0;
5914 #else
5915 int native_ordering = byteorder >= 0;
5916 #endif
5917 const char *encoding;
5918 Py_ssize_t nsize, pos;
5919 PyObject *errorHandler = NULL;
5920 PyObject *exc = NULL;
5921 PyObject *rep = NULL;
5922
5923 if (!PyUnicode_Check(str)) {
5924 PyErr_BadArgument();
5925 return NULL;
5926 }
5927 if (PyUnicode_READY(str) == -1)
5928 return NULL;
5929 kind = PyUnicode_KIND(str);
5930 data = PyUnicode_DATA(str);
5931 len = PyUnicode_GET_LENGTH(str);
5932
5933 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5934 return PyErr_NoMemory();
5935 nsize = len + (byteorder == 0);
5936 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5937 if (v == NULL)
5938 return NULL;
5939
5940 /* output buffer is 4-bytes aligned */
5941 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5942 out = (uint32_t *)PyBytes_AS_STRING(v);
5943 if (byteorder == 0)
5944 *out++ = 0xFEFF;
5945 if (len == 0)
5946 goto done;
5947
5948 if (byteorder == -1)
5949 encoding = "utf-32-le";
5950 else if (byteorder == 1)
5951 encoding = "utf-32-be";
5952 else
5953 encoding = "utf-32";
5954
5955 if (kind == PyUnicode_1BYTE_KIND) {
5956 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5957 goto done;
5958 }
5959
5960 pos = 0;
5961 while (pos < len) {
5962 Py_ssize_t repsize, moreunits;
5963
5964 if (kind == PyUnicode_2BYTE_KIND) {
5965 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5966 &out, native_ordering);
5967 }
5968 else {
5969 assert(kind == PyUnicode_4BYTE_KIND);
5970 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5971 &out, native_ordering);
5972 }
5973 if (pos == len)
5974 break;
5975
5976 rep = unicode_encode_call_errorhandler(
5977 errors, &errorHandler,
5978 encoding, "surrogates not allowed",
5979 str, &exc, pos, pos + 1, &pos);
5980 if (!rep)
5981 goto error;
5982
5983 if (PyBytes_Check(rep)) {
5984 repsize = PyBytes_GET_SIZE(rep);
5985 if (repsize & 3) {
5986 raise_encode_exception(&exc, encoding,
5987 str, pos - 1, pos,
5988 "surrogates not allowed");
5989 goto error;
5990 }
5991 moreunits = repsize / 4;
5992 }
5993 else {
5994 assert(PyUnicode_Check(rep));
5995 if (PyUnicode_READY(rep) < 0)
5996 goto error;
5997 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5998 if (!PyUnicode_IS_ASCII(rep)) {
5999 raise_encode_exception(&exc, encoding,
6000 str, pos - 1, pos,
6001 "surrogates not allowed");
6002 goto error;
6003 }
6004 }
6005
6006 /* four bytes are reserved for each surrogate */
6007 if (moreunits > 1) {
6008 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6009 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
6010 /* integer overflow */
6011 PyErr_NoMemory();
6012 goto error;
6013 }
6014 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
6015 goto error;
6016 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
6017 }
6018
6019 if (PyBytes_Check(rep)) {
6020 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6021 out += moreunits;
6022 } else /* rep is unicode */ {
6023 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6024 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6025 &out, native_ordering);
6026 }
6027
6028 Py_CLEAR(rep);
6029 }
6030
6031 /* Cut back to size actually needed. This is necessary for, for example,
6032 encoding of a string containing isolated surrogates and the 'ignore'
6033 handler is used. */
6034 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6035 if (nsize != PyBytes_GET_SIZE(v))
6036 _PyBytes_Resize(&v, nsize);
6037 Py_XDECREF(errorHandler);
6038 Py_XDECREF(exc);
6039 done:
6040 return v;
6041 error:
6042 Py_XDECREF(rep);
6043 Py_XDECREF(errorHandler);
6044 Py_XDECREF(exc);
6045 Py_XDECREF(v);
6046 return NULL;
6047 }
6048
6049 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6050 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6051 Py_ssize_t size,
6052 const char *errors,
6053 int byteorder)
6054 {
6055 PyObject *result;
6056 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6057 if (tmp == NULL)
6058 return NULL;
6059 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6060 Py_DECREF(tmp);
6061 return result;
6062 }
6063
6064 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)6065 PyUnicode_AsUTF32String(PyObject *unicode)
6066 {
6067 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6068 }
6069
6070 /* --- UTF-16 Codec ------------------------------------------------------- */
6071
6072 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)6073 PyUnicode_DecodeUTF16(const char *s,
6074 Py_ssize_t size,
6075 const char *errors,
6076 int *byteorder)
6077 {
6078 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6079 }
6080
/* Decode size bytes of UTF-16 data starting at s into a str object.

   errors     name of the error handler, passed through to
              unicode_decode_call_errorhandler_writer().
   byteorder  if not NULL, read on entry (<0 selects little endian, >0 big
              endian, 0 the default with BOM detection) and, when it was 0
              and at least 2 input bytes exist, written back with the
              detected order.
   consumed   if not NULL, incomplete trailing data is not an error;
              decoding stops there and the number of bytes consumed is
              stored into *consumed.

   Return a new reference, or NULL with an exception set on error. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first unit is always assembled as little endian, so a
           big-endian BOM shows up byte-swapped as 0xFFFE. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo still 0 (no BOM found), the platform's own order is used. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Run the decoder specialised for the writer's current storage
           kind.  It advances q and writer.pos and returns a value that the
           switch below interprets: 0..3 are status codes, anything else is
           a character to write through the writer. */
        if (e - q >= 2) {
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Input ended in the middle of a sequence: step q back by one
               code unit so the incomplete data is either left unconsumed
               (stateful mode) or reported as the error range. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The error range is the 2 bytes just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The error range is the first 2 of the 4 bytes before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* A character the specialised decoder did not store itself. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6235
6236 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6237 _PyUnicode_EncodeUTF16(PyObject *str,
6238 const char *errors,
6239 int byteorder)
6240 {
6241 enum PyUnicode_Kind kind;
6242 const void *data;
6243 Py_ssize_t len;
6244 PyObject *v;
6245 unsigned short *out;
6246 Py_ssize_t pairs;
6247 #if PY_BIG_ENDIAN
6248 int native_ordering = byteorder >= 0;
6249 #else
6250 int native_ordering = byteorder <= 0;
6251 #endif
6252 const char *encoding;
6253 Py_ssize_t nsize, pos;
6254 PyObject *errorHandler = NULL;
6255 PyObject *exc = NULL;
6256 PyObject *rep = NULL;
6257
6258 if (!PyUnicode_Check(str)) {
6259 PyErr_BadArgument();
6260 return NULL;
6261 }
6262 if (PyUnicode_READY(str) == -1)
6263 return NULL;
6264 kind = PyUnicode_KIND(str);
6265 data = PyUnicode_DATA(str);
6266 len = PyUnicode_GET_LENGTH(str);
6267
6268 pairs = 0;
6269 if (kind == PyUnicode_4BYTE_KIND) {
6270 const Py_UCS4 *in = (const Py_UCS4 *)data;
6271 const Py_UCS4 *end = in + len;
6272 while (in < end) {
6273 if (*in++ >= 0x10000) {
6274 pairs++;
6275 }
6276 }
6277 }
6278 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6279 return PyErr_NoMemory();
6280 }
6281 nsize = len + pairs + (byteorder == 0);
6282 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6283 if (v == NULL) {
6284 return NULL;
6285 }
6286
6287 /* output buffer is 2-bytes aligned */
6288 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6289 out = (unsigned short *)PyBytes_AS_STRING(v);
6290 if (byteorder == 0) {
6291 *out++ = 0xFEFF;
6292 }
6293 if (len == 0) {
6294 goto done;
6295 }
6296
6297 if (kind == PyUnicode_1BYTE_KIND) {
6298 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6299 goto done;
6300 }
6301
6302 if (byteorder < 0) {
6303 encoding = "utf-16-le";
6304 }
6305 else if (byteorder > 0) {
6306 encoding = "utf-16-be";
6307 }
6308 else {
6309 encoding = "utf-16";
6310 }
6311
6312 pos = 0;
6313 while (pos < len) {
6314 Py_ssize_t repsize, moreunits;
6315
6316 if (kind == PyUnicode_2BYTE_KIND) {
6317 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6318 &out, native_ordering);
6319 }
6320 else {
6321 assert(kind == PyUnicode_4BYTE_KIND);
6322 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6323 &out, native_ordering);
6324 }
6325 if (pos == len)
6326 break;
6327
6328 rep = unicode_encode_call_errorhandler(
6329 errors, &errorHandler,
6330 encoding, "surrogates not allowed",
6331 str, &exc, pos, pos + 1, &pos);
6332 if (!rep)
6333 goto error;
6334
6335 if (PyBytes_Check(rep)) {
6336 repsize = PyBytes_GET_SIZE(rep);
6337 if (repsize & 1) {
6338 raise_encode_exception(&exc, encoding,
6339 str, pos - 1, pos,
6340 "surrogates not allowed");
6341 goto error;
6342 }
6343 moreunits = repsize / 2;
6344 }
6345 else {
6346 assert(PyUnicode_Check(rep));
6347 if (PyUnicode_READY(rep) < 0)
6348 goto error;
6349 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6350 if (!PyUnicode_IS_ASCII(rep)) {
6351 raise_encode_exception(&exc, encoding,
6352 str, pos - 1, pos,
6353 "surrogates not allowed");
6354 goto error;
6355 }
6356 }
6357
6358 /* two bytes are reserved for each surrogate */
6359 if (moreunits > 1) {
6360 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6361 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6362 /* integer overflow */
6363 PyErr_NoMemory();
6364 goto error;
6365 }
6366 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6367 goto error;
6368 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6369 }
6370
6371 if (PyBytes_Check(rep)) {
6372 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6373 out += moreunits;
6374 } else /* rep is unicode */ {
6375 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6376 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6377 &out, native_ordering);
6378 }
6379
6380 Py_CLEAR(rep);
6381 }
6382
6383 /* Cut back to size actually needed. This is necessary for, for example,
6384 encoding of a string containing isolated surrogates and the 'ignore' handler
6385 is used. */
6386 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6387 if (nsize != PyBytes_GET_SIZE(v))
6388 _PyBytes_Resize(&v, nsize);
6389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
6391 done:
6392 return v;
6393 error:
6394 Py_XDECREF(rep);
6395 Py_XDECREF(errorHandler);
6396 Py_XDECREF(exc);
6397 Py_XDECREF(v);
6398 return NULL;
6399 #undef STORECHAR
6400 }
6401
6402 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6403 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6404 Py_ssize_t size,
6405 const char *errors,
6406 int byteorder)
6407 {
6408 PyObject *result;
6409 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6410 if (tmp == NULL)
6411 return NULL;
6412 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6413 Py_DECREF(tmp);
6414 return result;
6415 }
6416
6417 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6418 PyUnicode_AsUTF16String(PyObject *unicode)
6419 {
6420 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6421 }
6422
6423 /* --- Unicode Escape Codec ----------------------------------------------- */
6424
/* Character-name lookup C API, loaded lazily: it stays NULL until the
   unicode-escape decoder first meets a \N{name} escape and imports it with
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6426
6427 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6428 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6429 Py_ssize_t size,
6430 const char *errors,
6431 Py_ssize_t *consumed,
6432 const char **first_invalid_escape)
6433 {
6434 const char *starts = s;
6435 _PyUnicodeWriter writer;
6436 const char *end;
6437 PyObject *errorHandler = NULL;
6438 PyObject *exc = NULL;
6439
6440 // so we can remember if we've seen an invalid escape char or not
6441 *first_invalid_escape = NULL;
6442
6443 if (size == 0) {
6444 if (consumed) {
6445 *consumed = 0;
6446 }
6447 _Py_RETURN_UNICODE_EMPTY();
6448 }
6449 /* Escaped strings will always be longer than the resulting
6450 Unicode string, so we start with size here and then reduce the
6451 length after conversion to the true value.
6452 (but if the error callback returns a long replacement string
6453 we'll have to allocate more space) */
6454 _PyUnicodeWriter_Init(&writer);
6455 writer.min_length = size;
6456 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6457 goto onError;
6458 }
6459
6460 end = s + size;
6461 while (s < end) {
6462 unsigned char c = (unsigned char) *s++;
6463 Py_UCS4 ch;
6464 int count;
6465 const char *message;
6466
6467 #define WRITE_ASCII_CHAR(ch) \
6468 do { \
6469 assert(ch <= 127); \
6470 assert(writer.pos < writer.size); \
6471 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6472 } while(0)
6473
6474 #define WRITE_CHAR(ch) \
6475 do { \
6476 if (ch <= writer.maxchar) { \
6477 assert(writer.pos < writer.size); \
6478 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6479 } \
6480 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6481 goto onError; \
6482 } \
6483 } while(0)
6484
6485 /* Non-escape characters are interpreted as Unicode ordinals */
6486 if (c != '\\') {
6487 WRITE_CHAR(c);
6488 continue;
6489 }
6490
6491 Py_ssize_t startinpos = s - starts - 1;
6492 /* \ - Escapes */
6493 if (s >= end) {
6494 message = "\\ at end of string";
6495 goto incomplete;
6496 }
6497 c = (unsigned char) *s++;
6498
6499 assert(writer.pos < writer.size);
6500 switch (c) {
6501
6502 /* \x escapes */
6503 case '\n': continue;
6504 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6505 case '\'': WRITE_ASCII_CHAR('\''); continue;
6506 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6507 case 'b': WRITE_ASCII_CHAR('\b'); continue;
6508 /* FF */
6509 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6510 case 't': WRITE_ASCII_CHAR('\t'); continue;
6511 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6512 case 'r': WRITE_ASCII_CHAR('\r'); continue;
6513 /* VT */
6514 case 'v': WRITE_ASCII_CHAR('\013'); continue;
6515 /* BEL, not classic C */
6516 case 'a': WRITE_ASCII_CHAR('\007'); continue;
6517
6518 /* \OOO (octal) escapes */
6519 case '0': case '1': case '2': case '3':
6520 case '4': case '5': case '6': case '7':
6521 ch = c - '0';
6522 if (s < end && '0' <= *s && *s <= '7') {
6523 ch = (ch<<3) + *s++ - '0';
6524 if (s < end && '0' <= *s && *s <= '7') {
6525 ch = (ch<<3) + *s++ - '0';
6526 }
6527 }
6528 WRITE_CHAR(ch);
6529 continue;
6530
6531 /* hex escapes */
6532 /* \xXX */
6533 case 'x':
6534 count = 2;
6535 message = "truncated \\xXX escape";
6536 goto hexescape;
6537
6538 /* \uXXXX */
6539 case 'u':
6540 count = 4;
6541 message = "truncated \\uXXXX escape";
6542 goto hexescape;
6543
6544 /* \UXXXXXXXX */
6545 case 'U':
6546 count = 8;
6547 message = "truncated \\UXXXXXXXX escape";
6548 hexescape:
6549 for (ch = 0; count; ++s, --count) {
6550 if (s >= end) {
6551 goto incomplete;
6552 }
6553 c = (unsigned char)*s;
6554 ch <<= 4;
6555 if (c >= '0' && c <= '9') {
6556 ch += c - '0';
6557 }
6558 else if (c >= 'a' && c <= 'f') {
6559 ch += c - ('a' - 10);
6560 }
6561 else if (c >= 'A' && c <= 'F') {
6562 ch += c - ('A' - 10);
6563 }
6564 else {
6565 goto error;
6566 }
6567 }
6568
6569 /* when we get here, ch is a 32-bit unicode character */
6570 if (ch > MAX_UNICODE) {
6571 message = "illegal Unicode character";
6572 goto error;
6573 }
6574
6575 WRITE_CHAR(ch);
6576 continue;
6577
6578 /* \N{name} */
6579 case 'N':
6580 if (ucnhash_capi == NULL) {
6581 /* load the unicode data module */
6582 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6583 PyUnicodeData_CAPSULE_NAME, 1);
6584 if (ucnhash_capi == NULL) {
6585 PyErr_SetString(
6586 PyExc_UnicodeError,
6587 "\\N escapes not supported (can't load unicodedata module)"
6588 );
6589 goto onError;
6590 }
6591 }
6592
6593 message = "malformed \\N character escape";
6594 if (s >= end) {
6595 goto incomplete;
6596 }
6597 if (*s == '{') {
6598 const char *start = ++s;
6599 size_t namelen;
6600 /* look for the closing brace */
6601 while (s < end && *s != '}')
6602 s++;
6603 if (s >= end) {
6604 goto incomplete;
6605 }
6606 namelen = s - start;
6607 if (namelen) {
6608 /* found a name. look it up in the unicode database */
6609 s++;
6610 ch = 0xffffffff; /* in case 'getcode' messes up */
6611 if (namelen <= INT_MAX &&
6612 ucnhash_capi->getcode(start, (int)namelen,
6613 &ch, 0)) {
6614 assert(ch <= MAX_UNICODE);
6615 WRITE_CHAR(ch);
6616 continue;
6617 }
6618 message = "unknown Unicode character name";
6619 }
6620 }
6621 goto error;
6622
6623 default:
6624 if (*first_invalid_escape == NULL) {
6625 *first_invalid_escape = s-1; /* Back up one char, since we've
6626 already incremented s. */
6627 }
6628 WRITE_ASCII_CHAR('\\');
6629 WRITE_CHAR(c);
6630 continue;
6631 }
6632
6633 incomplete:
6634 if (consumed) {
6635 *consumed = startinpos;
6636 break;
6637 }
6638 error:;
6639 Py_ssize_t endinpos = s-starts;
6640 writer.min_length = end - s + writer.pos;
6641 if (unicode_decode_call_errorhandler_writer(
6642 errors, &errorHandler,
6643 "unicodeescape", message,
6644 &starts, &end, &startinpos, &endinpos, &exc, &s,
6645 &writer)) {
6646 goto onError;
6647 }
6648 assert(end - s <= writer.size - writer.pos);
6649
6650 #undef WRITE_ASCII_CHAR
6651 #undef WRITE_CHAR
6652 }
6653
6654 Py_XDECREF(errorHandler);
6655 Py_XDECREF(exc);
6656 return _PyUnicodeWriter_Finish(&writer);
6657
6658 onError:
6659 _PyUnicodeWriter_Dealloc(&writer);
6660 Py_XDECREF(errorHandler);
6661 Py_XDECREF(exc);
6662 return NULL;
6663 }
6664
6665 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6666 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6667 Py_ssize_t size,
6668 const char *errors,
6669 Py_ssize_t *consumed)
6670 {
6671 const char *first_invalid_escape;
6672 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6673 consumed,
6674 &first_invalid_escape);
6675 if (result == NULL)
6676 return NULL;
6677 if (first_invalid_escape != NULL) {
6678 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6679 "invalid escape sequence '\\%c'",
6680 (unsigned char)*first_invalid_escape) < 0) {
6681 Py_DECREF(result);
6682 return NULL;
6683 }
6684 }
6685 return result;
6686 }
6687
6688 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6689 PyUnicode_DecodeUnicodeEscape(const char *s,
6690 Py_ssize_t size,
6691 const char *errors)
6692 {
6693 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6694 }
6695
6696 /* Return a Unicode-Escape string version of the Unicode object. */
6697
6698 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6699 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6700 {
6701 Py_ssize_t i, len;
6702 PyObject *repr;
6703 char *p;
6704 enum PyUnicode_Kind kind;
6705 const void *data;
6706 Py_ssize_t expandsize;
6707
6708 /* Initial allocation is based on the longest-possible character
6709 escape.
6710
6711 For UCS1 strings it's '\xxx', 4 bytes per source character.
6712 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6713 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6714 */
6715
6716 if (!PyUnicode_Check(unicode)) {
6717 PyErr_BadArgument();
6718 return NULL;
6719 }
6720 if (PyUnicode_READY(unicode) == -1) {
6721 return NULL;
6722 }
6723
6724 len = PyUnicode_GET_LENGTH(unicode);
6725 if (len == 0) {
6726 return PyBytes_FromStringAndSize(NULL, 0);
6727 }
6728
6729 kind = PyUnicode_KIND(unicode);
6730 data = PyUnicode_DATA(unicode);
6731 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6732 bytes, and 1 byte characters 4. */
6733 expandsize = kind * 2 + 2;
6734 if (len > PY_SSIZE_T_MAX / expandsize) {
6735 return PyErr_NoMemory();
6736 }
6737 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6738 if (repr == NULL) {
6739 return NULL;
6740 }
6741
6742 p = PyBytes_AS_STRING(repr);
6743 for (i = 0; i < len; i++) {
6744 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6745
6746 /* U+0000-U+00ff range */
6747 if (ch < 0x100) {
6748 if (ch >= ' ' && ch < 127) {
6749 if (ch != '\\') {
6750 /* Copy printable US ASCII as-is */
6751 *p++ = (char) ch;
6752 }
6753 /* Escape backslashes */
6754 else {
6755 *p++ = '\\';
6756 *p++ = '\\';
6757 }
6758 }
6759
6760 /* Map special whitespace to '\t', \n', '\r' */
6761 else if (ch == '\t') {
6762 *p++ = '\\';
6763 *p++ = 't';
6764 }
6765 else if (ch == '\n') {
6766 *p++ = '\\';
6767 *p++ = 'n';
6768 }
6769 else if (ch == '\r') {
6770 *p++ = '\\';
6771 *p++ = 'r';
6772 }
6773
6774 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6775 else {
6776 *p++ = '\\';
6777 *p++ = 'x';
6778 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6779 *p++ = Py_hexdigits[ch & 0x000F];
6780 }
6781 }
6782 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6783 else if (ch < 0x10000) {
6784 *p++ = '\\';
6785 *p++ = 'u';
6786 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6787 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6788 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6789 *p++ = Py_hexdigits[ch & 0x000F];
6790 }
6791 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6792 else {
6793
6794 /* Make sure that the first two digits are zero */
6795 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6796 *p++ = '\\';
6797 *p++ = 'U';
6798 *p++ = '0';
6799 *p++ = '0';
6800 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6801 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6802 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6803 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6804 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6805 *p++ = Py_hexdigits[ch & 0x0000000F];
6806 }
6807 }
6808
6809 assert(p - PyBytes_AS_STRING(repr) > 0);
6810 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6811 return NULL;
6812 }
6813 return repr;
6814 }
6815
6816 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6817 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6818 Py_ssize_t size)
6819 {
6820 PyObject *result;
6821 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6822 if (tmp == NULL) {
6823 return NULL;
6824 }
6825
6826 result = PyUnicode_AsUnicodeEscapeString(tmp);
6827 Py_DECREF(tmp);
6828 return result;
6829 }
6830
6831 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6832
/* Decode a "raw-unicode-escape" byte string into a str object.

   Only the escapes \uXXXX (4 hex digits) and \UXXXXXXXX (8 hex digits)
   are interpreted; every other byte, including a backslash followed by
   any other character, is copied through as the code point with the same
   ordinal.

   s, size   input bytes and their count
   errors    error handler name passed to
             unicode_decode_call_errorhandler_writer()
   consumed  if NULL, a truncated escape at the end of the input is
             reported through the error handler (a lone trailing backslash
             is copied literally).  If non-NULL, decoding stops at a
             truncated escape and *consumed receives the offset of its
             backslash.  In this function *consumed is written only for
             empty input (0) and for that truncated-escape case; it is
             left untouched when all input is decoded, so callers
             presumably pre-initialise it -- confirm at the call sites.

   Returns a new reference, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

/* Write one code point.  The fast path stores directly into the buffer
   when the character fits the writer's current maxchar; otherwise the
   generic writer call is used, which may fail. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also copied literally
           when the caller did not ask for the consumed count. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash that starts this escape. */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Set message to silence a compiler warning.
            // It is never actually used: "incomplete" always breaks out
            // of the loop here because consumed is non-NULL.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not a recognised escape: emit the backslash and the
               following character unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                /* Input ended in the middle of the hex digits. */
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* Non-hex byte: reported with the "truncated" message. */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* Truncated escape: in stateful mode report where it started and
           stop; otherwise fall through and treat it as an error. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Lower bound for the final length: what is already written plus
           one character per remaining input byte. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6968
6969 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6970 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6971 Py_ssize_t size,
6972 const char *errors)
6973 {
6974 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6975 }
6976
6977
6978 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6979 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6980 {
6981 PyObject *repr;
6982 char *p;
6983 Py_ssize_t expandsize, pos;
6984 int kind;
6985 const void *data;
6986 Py_ssize_t len;
6987
6988 if (!PyUnicode_Check(unicode)) {
6989 PyErr_BadArgument();
6990 return NULL;
6991 }
6992 if (PyUnicode_READY(unicode) == -1) {
6993 return NULL;
6994 }
6995 kind = PyUnicode_KIND(unicode);
6996 data = PyUnicode_DATA(unicode);
6997 len = PyUnicode_GET_LENGTH(unicode);
6998 if (kind == PyUnicode_1BYTE_KIND) {
6999 return PyBytes_FromStringAndSize(data, len);
7000 }
7001
7002 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7003 bytes, and 1 byte characters 4. */
7004 expandsize = kind * 2 + 2;
7005
7006 if (len > PY_SSIZE_T_MAX / expandsize) {
7007 return PyErr_NoMemory();
7008 }
7009 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
7010 if (repr == NULL) {
7011 return NULL;
7012 }
7013 if (len == 0) {
7014 return repr;
7015 }
7016
7017 p = PyBytes_AS_STRING(repr);
7018 for (pos = 0; pos < len; pos++) {
7019 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7020
7021 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7022 if (ch < 0x100) {
7023 *p++ = (char) ch;
7024 }
7025 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7026 else if (ch < 0x10000) {
7027 *p++ = '\\';
7028 *p++ = 'u';
7029 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7030 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7031 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7032 *p++ = Py_hexdigits[ch & 15];
7033 }
7034 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7035 else {
7036 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7037 *p++ = '\\';
7038 *p++ = 'U';
7039 *p++ = '0';
7040 *p++ = '0';
7041 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7042 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7043 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7044 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7045 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7046 *p++ = Py_hexdigits[ch & 15];
7047 }
7048 }
7049
7050 assert(p > PyBytes_AS_STRING(repr));
7051 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7052 return NULL;
7053 }
7054 return repr;
7055 }
7056
7057 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)7058 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7059 Py_ssize_t size)
7060 {
7061 PyObject *result;
7062 PyObject *tmp = PyUnicode_FromWideChar(s, size);
7063 if (tmp == NULL)
7064 return NULL;
7065 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7066 Py_DECREF(tmp);
7067 return result;
7068 }
7069
7070 /* --- Latin-1 Codec ------------------------------------------------------ */
7071
7072 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)7073 PyUnicode_DecodeLatin1(const char *s,
7074 Py_ssize_t size,
7075 const char *errors)
7076 {
7077 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7078 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7079 }
7080
7081 /* create or adjust a UnicodeEncodeError */
7082 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7083 make_encode_exception(PyObject **exceptionObject,
7084 const char *encoding,
7085 PyObject *unicode,
7086 Py_ssize_t startpos, Py_ssize_t endpos,
7087 const char *reason)
7088 {
7089 if (*exceptionObject == NULL) {
7090 *exceptionObject = PyObject_CallFunction(
7091 PyExc_UnicodeEncodeError, "sOnns",
7092 encoding, unicode, startpos, endpos, reason);
7093 }
7094 else {
7095 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7096 goto onError;
7097 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7098 goto onError;
7099 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7100 goto onError;
7101 return;
7102 onError:
7103 Py_CLEAR(*exceptionObject);
7104 }
7105 }
7106
7107 /* raises a UnicodeEncodeError */
7108 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)7109 raise_encode_exception(PyObject **exceptionObject,
7110 const char *encoding,
7111 PyObject *unicode,
7112 Py_ssize_t startpos, Py_ssize_t endpos,
7113 const char *reason)
7114 {
7115 make_encode_exception(exceptionObject,
7116 encoding, unicode, startpos, endpos, reason);
7117 if (*exceptionObject != NULL)
7118 PyCodec_StrictErrors(*exceptionObject);
7119 }
7120
7121 /* error handling callback helper:
7122 build arguments, call the callback and check the arguments,
7123 put the result into newpos and return the replacement string, which
7124 has to be freed by the caller */
7125 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)7126 unicode_encode_call_errorhandler(const char *errors,
7127 PyObject **errorHandler,
7128 const char *encoding, const char *reason,
7129 PyObject *unicode, PyObject **exceptionObject,
7130 Py_ssize_t startpos, Py_ssize_t endpos,
7131 Py_ssize_t *newpos)
7132 {
7133 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7134 Py_ssize_t len;
7135 PyObject *restuple;
7136 PyObject *resunicode;
7137
7138 if (*errorHandler == NULL) {
7139 *errorHandler = PyCodec_LookupError(errors);
7140 if (*errorHandler == NULL)
7141 return NULL;
7142 }
7143
7144 if (PyUnicode_READY(unicode) == -1)
7145 return NULL;
7146 len = PyUnicode_GET_LENGTH(unicode);
7147
7148 make_encode_exception(exceptionObject,
7149 encoding, unicode, startpos, endpos, reason);
7150 if (*exceptionObject == NULL)
7151 return NULL;
7152
7153 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154 if (restuple == NULL)
7155 return NULL;
7156 if (!PyTuple_Check(restuple)) {
7157 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158 Py_DECREF(restuple);
7159 return NULL;
7160 }
7161 if (!PyArg_ParseTuple(restuple, argparse,
7162 &resunicode, newpos)) {
7163 Py_DECREF(restuple);
7164 return NULL;
7165 }
7166 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168 Py_DECREF(restuple);
7169 return NULL;
7170 }
7171 if (*newpos<0)
7172 *newpos = len + *newpos;
7173 if (*newpos<0 || *newpos>len) {
7174 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175 Py_DECREF(restuple);
7176 return NULL;
7177 }
7178 Py_INCREF(resunicode);
7179 Py_DECREF(restuple);
7180 return resunicode;
7181 }
7182
7183 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)7184 unicode_encode_ucs1(PyObject *unicode,
7185 const char *errors,
7186 const Py_UCS4 limit)
7187 {
7188 /* input state */
7189 Py_ssize_t pos=0, size;
7190 int kind;
7191 const void *data;
7192 /* pointer into the output */
7193 char *str;
7194 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7195 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7196 PyObject *error_handler_obj = NULL;
7197 PyObject *exc = NULL;
7198 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7199 PyObject *rep = NULL;
7200 /* output object */
7201 _PyBytesWriter writer;
7202
7203 if (PyUnicode_READY(unicode) == -1)
7204 return NULL;
7205 size = PyUnicode_GET_LENGTH(unicode);
7206 kind = PyUnicode_KIND(unicode);
7207 data = PyUnicode_DATA(unicode);
7208 /* allocate enough for a simple encoding without
7209 replacements, if we need more, we'll resize */
7210 if (size == 0)
7211 return PyBytes_FromStringAndSize(NULL, 0);
7212
7213 _PyBytesWriter_Init(&writer);
7214 str = _PyBytesWriter_Alloc(&writer, size);
7215 if (str == NULL)
7216 return NULL;
7217
7218 while (pos < size) {
7219 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7220
7221 /* can we encode this? */
7222 if (ch < limit) {
7223 /* no overflow check, because we know that the space is enough */
7224 *str++ = (char)ch;
7225 ++pos;
7226 }
7227 else {
7228 Py_ssize_t newpos, i;
7229 /* startpos for collecting unencodable chars */
7230 Py_ssize_t collstart = pos;
7231 Py_ssize_t collend = collstart + 1;
7232 /* find all unecodable characters */
7233
7234 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7235 ++collend;
7236
7237 /* Only overallocate the buffer if it's not the last write */
7238 writer.overallocate = (collend < size);
7239
7240 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7241 if (error_handler == _Py_ERROR_UNKNOWN)
7242 error_handler = _Py_GetErrorHandler(errors);
7243
7244 switch (error_handler) {
7245 case _Py_ERROR_STRICT:
7246 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7247 goto onError;
7248
7249 case _Py_ERROR_REPLACE:
7250 memset(str, '?', collend - collstart);
7251 str += (collend - collstart);
7252 /* fall through */
7253 case _Py_ERROR_IGNORE:
7254 pos = collend;
7255 break;
7256
7257 case _Py_ERROR_BACKSLASHREPLACE:
7258 /* subtract preallocated bytes */
7259 writer.min_size -= (collend - collstart);
7260 str = backslashreplace(&writer, str,
7261 unicode, collstart, collend);
7262 if (str == NULL)
7263 goto onError;
7264 pos = collend;
7265 break;
7266
7267 case _Py_ERROR_XMLCHARREFREPLACE:
7268 /* subtract preallocated bytes */
7269 writer.min_size -= (collend - collstart);
7270 str = xmlcharrefreplace(&writer, str,
7271 unicode, collstart, collend);
7272 if (str == NULL)
7273 goto onError;
7274 pos = collend;
7275 break;
7276
7277 case _Py_ERROR_SURROGATEESCAPE:
7278 for (i = collstart; i < collend; ++i) {
7279 ch = PyUnicode_READ(kind, data, i);
7280 if (ch < 0xdc80 || 0xdcff < ch) {
7281 /* Not a UTF-8b surrogate */
7282 break;
7283 }
7284 *str++ = (char)(ch - 0xdc00);
7285 ++pos;
7286 }
7287 if (i >= collend)
7288 break;
7289 collstart = pos;
7290 assert(collstart != collend);
7291 /* fall through */
7292
7293 default:
7294 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7295 encoding, reason, unicode, &exc,
7296 collstart, collend, &newpos);
7297 if (rep == NULL)
7298 goto onError;
7299
7300 /* subtract preallocated bytes */
7301 writer.min_size -= newpos - collstart;
7302
7303 if (PyBytes_Check(rep)) {
7304 /* Directly copy bytes result to output. */
7305 str = _PyBytesWriter_WriteBytes(&writer, str,
7306 PyBytes_AS_STRING(rep),
7307 PyBytes_GET_SIZE(rep));
7308 }
7309 else {
7310 assert(PyUnicode_Check(rep));
7311
7312 if (PyUnicode_READY(rep) < 0)
7313 goto onError;
7314
7315 if (limit == 256 ?
7316 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317 !PyUnicode_IS_ASCII(rep))
7318 {
7319 /* Not all characters are smaller than limit */
7320 raise_encode_exception(&exc, encoding, unicode,
7321 collstart, collend, reason);
7322 goto onError;
7323 }
7324 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325 str = _PyBytesWriter_WriteBytes(&writer, str,
7326 PyUnicode_DATA(rep),
7327 PyUnicode_GET_LENGTH(rep));
7328 }
7329 if (str == NULL)
7330 goto onError;
7331
7332 pos = newpos;
7333 Py_CLEAR(rep);
7334 }
7335
7336 /* If overallocation was disabled, ensure that it was the last
7337 write. Otherwise, we missed an optimization */
7338 assert(writer.overallocate || pos == size);
7339 }
7340 }
7341
7342 Py_XDECREF(error_handler_obj);
7343 Py_XDECREF(exc);
7344 return _PyBytesWriter_Finish(&writer, str);
7345
7346 onError:
7347 Py_XDECREF(rep);
7348 _PyBytesWriter_Dealloc(&writer);
7349 Py_XDECREF(error_handler_obj);
7350 Py_XDECREF(exc);
7351 return NULL;
7352 }
7353
7354 /* Deprecated */
7355 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7356 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
7357 Py_ssize_t size,
7358 const char *errors)
7359 {
7360 PyObject *result;
7361 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7362 if (unicode == NULL)
7363 return NULL;
7364 result = unicode_encode_ucs1(unicode, errors, 256);
7365 Py_DECREF(unicode);
7366 return result;
7367 }
7368
7369 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7370 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7371 {
7372 if (!PyUnicode_Check(unicode)) {
7373 PyErr_BadArgument();
7374 return NULL;
7375 }
7376 if (PyUnicode_READY(unicode) == -1)
7377 return NULL;
7378 /* Fast path: if it is a one-byte string, construct
7379 bytes object directly. */
7380 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7381 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7382 PyUnicode_GET_LENGTH(unicode));
7383 /* Non-Latin-1 characters present. Defer to above function to
7384 raise the exception. */
7385 return unicode_encode_ucs1(unicode, errors, 256);
7386 }
7387
7388 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7389 PyUnicode_AsLatin1String(PyObject *unicode)
7390 {
7391 return _PyUnicode_AsLatin1String(unicode, NULL);
7392 }
7393
7394 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7395
7396 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7397 PyUnicode_DecodeASCII(const char *s,
7398 Py_ssize_t size,
7399 const char *errors)
7400 {
7401 const char *starts = s;
7402 const char *e = s + size;
7403 PyObject *error_handler_obj = NULL;
7404 PyObject *exc = NULL;
7405 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7406
7407 if (size == 0)
7408 _Py_RETURN_UNICODE_EMPTY();
7409
7410 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7411 if (size == 1 && (unsigned char)s[0] < 128) {
7412 return get_latin1_char((unsigned char)s[0]);
7413 }
7414
7415 // Shortcut for simple case
7416 PyObject *u = PyUnicode_New(size, 127);
7417 if (u == NULL) {
7418 return NULL;
7419 }
7420 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7421 if (outpos == size) {
7422 return u;
7423 }
7424
7425 _PyUnicodeWriter writer;
7426 _PyUnicodeWriter_InitWithBuffer(&writer, u);
7427 writer.pos = outpos;
7428
7429 s += outpos;
7430 int kind = writer.kind;
7431 void *data = writer.data;
7432 Py_ssize_t startinpos, endinpos;
7433
7434 while (s < e) {
7435 unsigned char c = (unsigned char)*s;
7436 if (c < 128) {
7437 PyUnicode_WRITE(kind, data, writer.pos, c);
7438 writer.pos++;
7439 ++s;
7440 continue;
7441 }
7442
7443 /* byte outsize range 0x00..0x7f: call the error handler */
7444
7445 if (error_handler == _Py_ERROR_UNKNOWN)
7446 error_handler = _Py_GetErrorHandler(errors);
7447
7448 switch (error_handler)
7449 {
7450 case _Py_ERROR_REPLACE:
7451 case _Py_ERROR_SURROGATEESCAPE:
7452 /* Fast-path: the error handler only writes one character,
7453 but we may switch to UCS2 at the first write */
7454 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7455 goto onError;
7456 kind = writer.kind;
7457 data = writer.data;
7458
7459 if (error_handler == _Py_ERROR_REPLACE)
7460 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7461 else
7462 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7463 writer.pos++;
7464 ++s;
7465 break;
7466
7467 case _Py_ERROR_IGNORE:
7468 ++s;
7469 break;
7470
7471 default:
7472 startinpos = s-starts;
7473 endinpos = startinpos + 1;
7474 if (unicode_decode_call_errorhandler_writer(
7475 errors, &error_handler_obj,
7476 "ascii", "ordinal not in range(128)",
7477 &starts, &e, &startinpos, &endinpos, &exc, &s,
7478 &writer))
7479 goto onError;
7480 kind = writer.kind;
7481 data = writer.data;
7482 }
7483 }
7484 Py_XDECREF(error_handler_obj);
7485 Py_XDECREF(exc);
7486 return _PyUnicodeWriter_Finish(&writer);
7487
7488 onError:
7489 _PyUnicodeWriter_Dealloc(&writer);
7490 Py_XDECREF(error_handler_obj);
7491 Py_XDECREF(exc);
7492 return NULL;
7493 }
7494
7495 /* Deprecated */
7496 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7497 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7498 Py_ssize_t size,
7499 const char *errors)
7500 {
7501 PyObject *result;
7502 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7503 if (unicode == NULL)
7504 return NULL;
7505 result = unicode_encode_ucs1(unicode, errors, 128);
7506 Py_DECREF(unicode);
7507 return result;
7508 }
7509
7510 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7511 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7512 {
7513 if (!PyUnicode_Check(unicode)) {
7514 PyErr_BadArgument();
7515 return NULL;
7516 }
7517 if (PyUnicode_READY(unicode) == -1)
7518 return NULL;
7519 /* Fast path: if it is an ASCII-only string, construct bytes object
7520 directly. Else defer to above function to raise the exception. */
7521 if (PyUnicode_IS_ASCII(unicode))
7522 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7523 PyUnicode_GET_LENGTH(unicode));
7524 return unicode_encode_ucs1(unicode, errors, 128);
7525 }
7526
7527 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7528 PyUnicode_AsASCIIString(PyObject *unicode)
7529 {
7530 return _PyUnicode_AsASCIIString(unicode, NULL);
7531 }
7532
7533 #ifdef MS_WINDOWS
7534
7535 /* --- MBCS codecs for Windows -------------------------------------------- */
7536
/* NEED_RETRY is defined when int is narrower than size_t.
   NOTE(review): presumably the code page codecs test it to process
   inputs longer than an int can describe in several passes -- confirm at
   the use sites. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases also and avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for builds whose Windows headers do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7550
7551 static const char*
code_page_name(UINT code_page,PyObject ** obj)7552 code_page_name(UINT code_page, PyObject **obj)
7553 {
7554 *obj = NULL;
7555 if (code_page == CP_ACP)
7556 return "mbcs";
7557 if (code_page == CP_UTF7)
7558 return "CP_UTF7";
7559 if (code_page == CP_UTF8)
7560 return "CP_UTF8";
7561
7562 *obj = PyBytes_FromFormat("cp%u", code_page);
7563 if (*obj == NULL)
7564 return NULL;
7565 return PyBytes_AS_STRING(*obj);
7566 }
7567
7568 static DWORD
decode_code_page_flags(UINT code_page)7569 decode_code_page_flags(UINT code_page)
7570 {
7571 if (code_page == CP_UTF7) {
7572 /* The CP_UTF7 decoder only supports flags=0 */
7573 return 0;
7574 }
7575 else
7576 return MB_ERR_INVALID_CHARS;
7577 }
7578
7579 /*
7580 * Decode a byte string from a Windows code page into unicode object in strict
7581 * mode.
7582 *
7583 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7584 * OSError and returns -1 on other error.
7585 */
7586 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7587 decode_code_page_strict(UINT code_page,
7588 wchar_t **buf,
7589 Py_ssize_t *bufsize,
7590 const char *in,
7591 int insize)
7592 {
7593 DWORD flags = MB_ERR_INVALID_CHARS;
7594 wchar_t *out;
7595 DWORD outsize;
7596
7597 /* First get the size of the result */
7598 assert(insize > 0);
7599 while ((outsize = MultiByteToWideChar(code_page, flags,
7600 in, insize, NULL, 0)) <= 0)
7601 {
7602 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7603 goto error;
7604 }
7605 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7606 flags = 0;
7607 }
7608
7609 /* Extend a wchar_t* buffer */
7610 Py_ssize_t n = *bufsize; /* Get the current length */
7611 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7612 return -1;
7613 }
7614 out = *buf + n;
7615
7616 /* Do the conversion */
7617 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7618 if (outsize <= 0)
7619 goto error;
7620 return insize;
7621
7622 error:
7623 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7624 return -2;
7625 PyErr_SetFromWindowsErr(0);
7626 return -1;
7627 }
7628
7629 /*
7630 * Decode a byte string from a code page into unicode object with an error
7631 * handler.
7632 *
7633 * Returns consumed size if succeed, or raise an OSError or
7634 * UnicodeDecodeError exception and returns -1 on error.
7635 */
7636 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7637 decode_code_page_errors(UINT code_page,
7638 wchar_t **buf,
7639 Py_ssize_t *bufsize,
7640 const char *in, const int size,
7641 const char *errors, int final)
7642 {
7643 const char *startin = in;
7644 const char *endin = in + size;
7645 DWORD flags = MB_ERR_INVALID_CHARS;
7646 /* Ideally, we should get reason from FormatMessage. This is the Windows
7647 2000 English version of the message. */
7648 const char *reason = "No mapping for the Unicode character exists "
7649 "in the target code page.";
7650 /* each step cannot decode more than 1 character, but a character can be
7651 represented as a surrogate pair */
7652 wchar_t buffer[2], *out;
7653 int insize;
7654 Py_ssize_t outsize;
7655 PyObject *errorHandler = NULL;
7656 PyObject *exc = NULL;
7657 PyObject *encoding_obj = NULL;
7658 const char *encoding;
7659 DWORD err;
7660 int ret = -1;
7661
7662 assert(size > 0);
7663
7664 encoding = code_page_name(code_page, &encoding_obj);
7665 if (encoding == NULL)
7666 return -1;
7667
7668 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7669 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7670 UnicodeDecodeError. */
7671 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7672 if (exc != NULL) {
7673 PyCodec_StrictErrors(exc);
7674 Py_CLEAR(exc);
7675 }
7676 goto error;
7677 }
7678
7679 /* Extend a wchar_t* buffer */
7680 Py_ssize_t n = *bufsize; /* Get the current length */
7681 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7682 PyErr_NoMemory();
7683 goto error;
7684 }
7685 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7686 goto error;
7687 }
7688 out = *buf + n;
7689
7690 /* Decode the byte string character per character */
7691 while (in < endin)
7692 {
7693 /* Decode a character */
7694 insize = 1;
7695 do
7696 {
7697 outsize = MultiByteToWideChar(code_page, flags,
7698 in, insize,
7699 buffer, Py_ARRAY_LENGTH(buffer));
7700 if (outsize > 0)
7701 break;
7702 err = GetLastError();
7703 if (err == ERROR_INVALID_FLAGS && flags) {
7704 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7705 flags = 0;
7706 continue;
7707 }
7708 if (err != ERROR_NO_UNICODE_TRANSLATION
7709 && err != ERROR_INSUFFICIENT_BUFFER)
7710 {
7711 PyErr_SetFromWindowsErr(0);
7712 goto error;
7713 }
7714 insize++;
7715 }
7716 /* 4=maximum length of a UTF-8 sequence */
7717 while (insize <= 4 && (in + insize) <= endin);
7718
7719 if (outsize <= 0) {
7720 Py_ssize_t startinpos, endinpos, outpos;
7721
7722 /* last character in partial decode? */
7723 if (in + insize >= endin && !final)
7724 break;
7725
7726 startinpos = in - startin;
7727 endinpos = startinpos + 1;
7728 outpos = out - *buf;
7729 if (unicode_decode_call_errorhandler_wchar(
7730 errors, &errorHandler,
7731 encoding, reason,
7732 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7733 buf, bufsize, &outpos))
7734 {
7735 goto error;
7736 }
7737 out = *buf + outpos;
7738 }
7739 else {
7740 in += insize;
7741 memcpy(out, buffer, outsize * sizeof(wchar_t));
7742 out += outsize;
7743 }
7744 }
7745
7746 /* Shrink the buffer */
7747 assert(out - *buf <= *bufsize);
7748 *bufsize = out - *buf;
7749 /* (in - startin) <= size and size is an int */
7750 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7751
7752 error:
7753 Py_XDECREF(encoding_obj);
7754 Py_XDECREF(errorHandler);
7755 Py_XDECREF(exc);
7756 return ret;
7757 }
7758
7759 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7760 decode_code_page_stateful(int code_page,
7761 const char *s, Py_ssize_t size,
7762 const char *errors, Py_ssize_t *consumed)
7763 {
7764 wchar_t *buf = NULL;
7765 Py_ssize_t bufsize = 0;
7766 int chunk_size, final, converted, done;
7767
7768 if (code_page < 0) {
7769 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7770 return NULL;
7771 }
7772 if (size < 0) {
7773 PyErr_BadInternalCall();
7774 return NULL;
7775 }
7776
7777 if (consumed)
7778 *consumed = 0;
7779
7780 do
7781 {
7782 #ifdef NEED_RETRY
7783 if (size > DECODING_CHUNK_SIZE) {
7784 chunk_size = DECODING_CHUNK_SIZE;
7785 final = 0;
7786 done = 0;
7787 }
7788 else
7789 #endif
7790 {
7791 chunk_size = (int)size;
7792 final = (consumed == NULL);
7793 done = 1;
7794 }
7795
7796 if (chunk_size == 0 && done) {
7797 if (buf != NULL)
7798 break;
7799 _Py_RETURN_UNICODE_EMPTY();
7800 }
7801
7802 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7803 s, chunk_size);
7804 if (converted == -2)
7805 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7806 s, chunk_size,
7807 errors, final);
7808 assert(converted != 0 || done);
7809
7810 if (converted < 0) {
7811 PyMem_Free(buf);
7812 return NULL;
7813 }
7814
7815 if (consumed)
7816 *consumed += converted;
7817
7818 s += converted;
7819 size -= converted;
7820 } while (!done);
7821
7822 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7823 PyMem_Free(buf);
7824 return v;
7825 }
7826
7827 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7828 PyUnicode_DecodeCodePageStateful(int code_page,
7829 const char *s,
7830 Py_ssize_t size,
7831 const char *errors,
7832 Py_ssize_t *consumed)
7833 {
7834 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7835 }
7836
7837 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7838 PyUnicode_DecodeMBCSStateful(const char *s,
7839 Py_ssize_t size,
7840 const char *errors,
7841 Py_ssize_t *consumed)
7842 {
7843 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7844 }
7845
7846 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7847 PyUnicode_DecodeMBCS(const char *s,
7848 Py_ssize_t size,
7849 const char *errors)
7850 {
7851 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7852 }
7853
7854 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7855 encode_code_page_flags(UINT code_page, const char *errors)
7856 {
7857 if (code_page == CP_UTF8) {
7858 return WC_ERR_INVALID_CHARS;
7859 }
7860 else if (code_page == CP_UTF7) {
7861 /* CP_UTF7 only supports flags=0 */
7862 return 0;
7863 }
7864 else {
7865 if (errors != NULL && strcmp(errors, "replace") == 0)
7866 return 0;
7867 else
7868 return WC_NO_BEST_FIT_CHARS;
7869 }
7870 }
7871
7872 /*
7873 * Encode a Unicode string to a Windows code page into a byte string in strict
7874 * mode.
7875 *
7876 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7877 * an OSError and returns -1 on other error.
7878 */
7879 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7880 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7881 PyObject *unicode, Py_ssize_t offset, int len,
7882 const char* errors)
7883 {
7884 BOOL usedDefaultChar = FALSE;
7885 BOOL *pusedDefaultChar = &usedDefaultChar;
7886 int outsize;
7887 wchar_t *p;
7888 Py_ssize_t size;
7889 const DWORD flags = encode_code_page_flags(code_page, NULL);
7890 char *out;
7891 /* Create a substring so that we can get the UTF-16 representation
7892 of just the slice under consideration. */
7893 PyObject *substring;
7894 int ret = -1;
7895
7896 assert(len > 0);
7897
7898 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7899 pusedDefaultChar = &usedDefaultChar;
7900 else
7901 pusedDefaultChar = NULL;
7902
7903 substring = PyUnicode_Substring(unicode, offset, offset+len);
7904 if (substring == NULL)
7905 return -1;
7906 #if USE_UNICODE_WCHAR_CACHE
7907 _Py_COMP_DIAG_PUSH
7908 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7909 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7910 if (p == NULL) {
7911 Py_DECREF(substring);
7912 return -1;
7913 }
7914 _Py_COMP_DIAG_POP
7915 #else /* USE_UNICODE_WCHAR_CACHE */
7916 p = PyUnicode_AsWideCharString(substring, &size);
7917 Py_CLEAR(substring);
7918 if (p == NULL) {
7919 return -1;
7920 }
7921 #endif /* USE_UNICODE_WCHAR_CACHE */
7922 assert(size <= INT_MAX);
7923
7924 /* First get the size of the result */
7925 outsize = WideCharToMultiByte(code_page, flags,
7926 p, (int)size,
7927 NULL, 0,
7928 NULL, pusedDefaultChar);
7929 if (outsize <= 0)
7930 goto error;
7931 /* If we used a default char, then we failed! */
7932 if (pusedDefaultChar && *pusedDefaultChar) {
7933 ret = -2;
7934 goto done;
7935 }
7936
7937 if (*outbytes == NULL) {
7938 /* Create string object */
7939 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7940 if (*outbytes == NULL) {
7941 goto done;
7942 }
7943 out = PyBytes_AS_STRING(*outbytes);
7944 }
7945 else {
7946 /* Extend string object */
7947 const Py_ssize_t n = PyBytes_Size(*outbytes);
7948 if (outsize > PY_SSIZE_T_MAX - n) {
7949 PyErr_NoMemory();
7950 goto done;
7951 }
7952 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7953 goto done;
7954 }
7955 out = PyBytes_AS_STRING(*outbytes) + n;
7956 }
7957
7958 /* Do the conversion */
7959 outsize = WideCharToMultiByte(code_page, flags,
7960 p, (int)size,
7961 out, outsize,
7962 NULL, pusedDefaultChar);
7963 if (outsize <= 0)
7964 goto error;
7965 if (pusedDefaultChar && *pusedDefaultChar) {
7966 ret = -2;
7967 goto done;
7968 }
7969 ret = 0;
7970
7971 done:
7972 #if USE_UNICODE_WCHAR_CACHE
7973 Py_DECREF(substring);
7974 #else /* USE_UNICODE_WCHAR_CACHE */
7975 PyMem_Free(p);
7976 #endif /* USE_UNICODE_WCHAR_CACHE */
7977 return ret;
7978
7979 error:
7980 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7981 ret = -2;
7982 goto done;
7983 }
7984 PyErr_SetFromWindowsErr(0);
7985 goto done;
7986 }
7987
7988 /*
7989 * Encode a Unicode string to a Windows code page into a byte string using an
7990 * error handler.
7991 *
7992 * Returns consumed characters if succeed, or raise an OSError and returns
7993 * -1 on other error.
7994 */
7995 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7996 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7997 PyObject *unicode, Py_ssize_t unicode_offset,
7998 Py_ssize_t insize, const char* errors)
7999 {
8000 const DWORD flags = encode_code_page_flags(code_page, errors);
8001 Py_ssize_t pos = unicode_offset;
8002 Py_ssize_t endin = unicode_offset + insize;
8003 /* Ideally, we should get reason from FormatMessage. This is the Windows
8004 2000 English version of the message. */
8005 const char *reason = "invalid character";
8006 /* 4=maximum length of a UTF-8 sequence */
8007 char buffer[4];
8008 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8009 Py_ssize_t outsize;
8010 char *out;
8011 PyObject *errorHandler = NULL;
8012 PyObject *exc = NULL;
8013 PyObject *encoding_obj = NULL;
8014 const char *encoding;
8015 Py_ssize_t newpos, newoutsize;
8016 PyObject *rep;
8017 int ret = -1;
8018
8019 assert(insize > 0);
8020
8021 encoding = code_page_name(code_page, &encoding_obj);
8022 if (encoding == NULL)
8023 return -1;
8024
8025 if (errors == NULL || strcmp(errors, "strict") == 0) {
8026 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8027 then we raise a UnicodeEncodeError. */
8028 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8029 if (exc != NULL) {
8030 PyCodec_StrictErrors(exc);
8031 Py_DECREF(exc);
8032 }
8033 Py_XDECREF(encoding_obj);
8034 return -1;
8035 }
8036
8037 if (code_page != CP_UTF8 && code_page != CP_UTF7)
8038 pusedDefaultChar = &usedDefaultChar;
8039 else
8040 pusedDefaultChar = NULL;
8041
8042 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8043 PyErr_NoMemory();
8044 goto error;
8045 }
8046 outsize = insize * Py_ARRAY_LENGTH(buffer);
8047
8048 if (*outbytes == NULL) {
8049 /* Create string object */
8050 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8051 if (*outbytes == NULL)
8052 goto error;
8053 out = PyBytes_AS_STRING(*outbytes);
8054 }
8055 else {
8056 /* Extend string object */
8057 Py_ssize_t n = PyBytes_Size(*outbytes);
8058 if (n > PY_SSIZE_T_MAX - outsize) {
8059 PyErr_NoMemory();
8060 goto error;
8061 }
8062 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8063 goto error;
8064 out = PyBytes_AS_STRING(*outbytes) + n;
8065 }
8066
8067 /* Encode the string character per character */
8068 while (pos < endin)
8069 {
8070 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8071 wchar_t chars[2];
8072 int charsize;
8073 if (ch < 0x10000) {
8074 chars[0] = (wchar_t)ch;
8075 charsize = 1;
8076 }
8077 else {
8078 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8079 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8080 charsize = 2;
8081 }
8082
8083 outsize = WideCharToMultiByte(code_page, flags,
8084 chars, charsize,
8085 buffer, Py_ARRAY_LENGTH(buffer),
8086 NULL, pusedDefaultChar);
8087 if (outsize > 0) {
8088 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8089 {
8090 pos++;
8091 memcpy(out, buffer, outsize);
8092 out += outsize;
8093 continue;
8094 }
8095 }
8096 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8097 PyErr_SetFromWindowsErr(0);
8098 goto error;
8099 }
8100
8101 rep = unicode_encode_call_errorhandler(
8102 errors, &errorHandler, encoding, reason,
8103 unicode, &exc,
8104 pos, pos + 1, &newpos);
8105 if (rep == NULL)
8106 goto error;
8107 pos = newpos;
8108
8109 if (PyBytes_Check(rep)) {
8110 outsize = PyBytes_GET_SIZE(rep);
8111 if (outsize != 1) {
8112 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8113 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8114 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8115 Py_DECREF(rep);
8116 goto error;
8117 }
8118 out = PyBytes_AS_STRING(*outbytes) + offset;
8119 }
8120 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8121 out += outsize;
8122 }
8123 else {
8124 Py_ssize_t i;
8125 enum PyUnicode_Kind kind;
8126 const void *data;
8127
8128 if (PyUnicode_READY(rep) == -1) {
8129 Py_DECREF(rep);
8130 goto error;
8131 }
8132
8133 outsize = PyUnicode_GET_LENGTH(rep);
8134 if (outsize != 1) {
8135 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8136 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8137 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8138 Py_DECREF(rep);
8139 goto error;
8140 }
8141 out = PyBytes_AS_STRING(*outbytes) + offset;
8142 }
8143 kind = PyUnicode_KIND(rep);
8144 data = PyUnicode_DATA(rep);
8145 for (i=0; i < outsize; i++) {
8146 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8147 if (ch > 127) {
8148 raise_encode_exception(&exc,
8149 encoding, unicode,
8150 pos, pos + 1,
8151 "unable to encode error handler result to ASCII");
8152 Py_DECREF(rep);
8153 goto error;
8154 }
8155 *out = (unsigned char)ch;
8156 out++;
8157 }
8158 }
8159 Py_DECREF(rep);
8160 }
8161 /* write a NUL byte */
8162 *out = 0;
8163 outsize = out - PyBytes_AS_STRING(*outbytes);
8164 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8165 if (_PyBytes_Resize(outbytes, outsize) < 0)
8166 goto error;
8167 ret = 0;
8168
8169 error:
8170 Py_XDECREF(encoding_obj);
8171 Py_XDECREF(errorHandler);
8172 Py_XDECREF(exc);
8173 return ret;
8174 }
8175
8176 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8177 encode_code_page(int code_page,
8178 PyObject *unicode,
8179 const char *errors)
8180 {
8181 Py_ssize_t len;
8182 PyObject *outbytes = NULL;
8183 Py_ssize_t offset;
8184 int chunk_len, ret, done;
8185
8186 if (!PyUnicode_Check(unicode)) {
8187 PyErr_BadArgument();
8188 return NULL;
8189 }
8190
8191 if (PyUnicode_READY(unicode) == -1)
8192 return NULL;
8193 len = PyUnicode_GET_LENGTH(unicode);
8194
8195 if (code_page < 0) {
8196 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8197 return NULL;
8198 }
8199
8200 if (len == 0)
8201 return PyBytes_FromStringAndSize(NULL, 0);
8202
8203 offset = 0;
8204 do
8205 {
8206 #ifdef NEED_RETRY
8207 if (len > DECODING_CHUNK_SIZE) {
8208 chunk_len = DECODING_CHUNK_SIZE;
8209 done = 0;
8210 }
8211 else
8212 #endif
8213 {
8214 chunk_len = (int)len;
8215 done = 1;
8216 }
8217
8218 ret = encode_code_page_strict(code_page, &outbytes,
8219 unicode, offset, chunk_len,
8220 errors);
8221 if (ret == -2)
8222 ret = encode_code_page_errors(code_page, &outbytes,
8223 unicode, offset,
8224 chunk_len, errors);
8225 if (ret < 0) {
8226 Py_XDECREF(outbytes);
8227 return NULL;
8228 }
8229
8230 offset += chunk_len;
8231 len -= chunk_len;
8232 } while (!done);
8233
8234 return outbytes;
8235 }
8236
8237 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)8238 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8239 Py_ssize_t size,
8240 const char *errors)
8241 {
8242 PyObject *unicode, *res;
8243 unicode = PyUnicode_FromWideChar(p, size);
8244 if (unicode == NULL)
8245 return NULL;
8246 res = encode_code_page(CP_ACP, unicode, errors);
8247 Py_DECREF(unicode);
8248 return res;
8249 }
8250
8251 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8252 PyUnicode_EncodeCodePage(int code_page,
8253 PyObject *unicode,
8254 const char *errors)
8255 {
8256 return encode_code_page(code_page, unicode, errors);
8257 }
8258
8259 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8260 PyUnicode_AsMBCSString(PyObject *unicode)
8261 {
8262 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8263 }
8264
8265 #undef NEED_RETRY
8266
8267 #endif /* MS_WINDOWS */
8268
8269 /* --- Character Mapping Codec -------------------------------------------- */
8270
8271 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8272 charmap_decode_string(const char *s,
8273 Py_ssize_t size,
8274 PyObject *mapping,
8275 const char *errors,
8276 _PyUnicodeWriter *writer)
8277 {
8278 const char *starts = s;
8279 const char *e;
8280 Py_ssize_t startinpos, endinpos;
8281 PyObject *errorHandler = NULL, *exc = NULL;
8282 Py_ssize_t maplen;
8283 enum PyUnicode_Kind mapkind;
8284 const void *mapdata;
8285 Py_UCS4 x;
8286 unsigned char ch;
8287
8288 if (PyUnicode_READY(mapping) == -1)
8289 return -1;
8290
8291 maplen = PyUnicode_GET_LENGTH(mapping);
8292 mapdata = PyUnicode_DATA(mapping);
8293 mapkind = PyUnicode_KIND(mapping);
8294
8295 e = s + size;
8296
8297 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8298 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8299 * is disabled in encoding aliases, latin1 is preferred because
8300 * its implementation is faster. */
8301 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8302 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8303 Py_UCS4 maxchar = writer->maxchar;
8304
8305 assert (writer->kind == PyUnicode_1BYTE_KIND);
8306 while (s < e) {
8307 ch = *s;
8308 x = mapdata_ucs1[ch];
8309 if (x > maxchar) {
8310 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8311 goto onError;
8312 maxchar = writer->maxchar;
8313 outdata = (Py_UCS1 *)writer->data;
8314 }
8315 outdata[writer->pos] = x;
8316 writer->pos++;
8317 ++s;
8318 }
8319 return 0;
8320 }
8321
8322 while (s < e) {
8323 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8324 enum PyUnicode_Kind outkind = writer->kind;
8325 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8326 if (outkind == PyUnicode_1BYTE_KIND) {
8327 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8328 Py_UCS4 maxchar = writer->maxchar;
8329 while (s < e) {
8330 ch = *s;
8331 x = mapdata_ucs2[ch];
8332 if (x > maxchar)
8333 goto Error;
8334 outdata[writer->pos] = x;
8335 writer->pos++;
8336 ++s;
8337 }
8338 break;
8339 }
8340 else if (outkind == PyUnicode_2BYTE_KIND) {
8341 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8342 while (s < e) {
8343 ch = *s;
8344 x = mapdata_ucs2[ch];
8345 if (x == 0xFFFE)
8346 goto Error;
8347 outdata[writer->pos] = x;
8348 writer->pos++;
8349 ++s;
8350 }
8351 break;
8352 }
8353 }
8354 ch = *s;
8355
8356 if (ch < maplen)
8357 x = PyUnicode_READ(mapkind, mapdata, ch);
8358 else
8359 x = 0xfffe; /* invalid value */
8360 Error:
8361 if (x == 0xfffe)
8362 {
8363 /* undefined mapping */
8364 startinpos = s-starts;
8365 endinpos = startinpos+1;
8366 if (unicode_decode_call_errorhandler_writer(
8367 errors, &errorHandler,
8368 "charmap", "character maps to <undefined>",
8369 &starts, &e, &startinpos, &endinpos, &exc, &s,
8370 writer)) {
8371 goto onError;
8372 }
8373 continue;
8374 }
8375
8376 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8377 goto onError;
8378 ++s;
8379 }
8380 Py_XDECREF(errorHandler);
8381 Py_XDECREF(exc);
8382 return 0;
8383
8384 onError:
8385 Py_XDECREF(errorHandler);
8386 Py_XDECREF(exc);
8387 return -1;
8388 }
8389
8390 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8391 charmap_decode_mapping(const char *s,
8392 Py_ssize_t size,
8393 PyObject *mapping,
8394 const char *errors,
8395 _PyUnicodeWriter *writer)
8396 {
8397 const char *starts = s;
8398 const char *e;
8399 Py_ssize_t startinpos, endinpos;
8400 PyObject *errorHandler = NULL, *exc = NULL;
8401 unsigned char ch;
8402 PyObject *key, *item = NULL;
8403
8404 e = s + size;
8405
8406 while (s < e) {
8407 ch = *s;
8408
8409 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8410 key = PyLong_FromLong((long)ch);
8411 if (key == NULL)
8412 goto onError;
8413
8414 item = PyObject_GetItem(mapping, key);
8415 Py_DECREF(key);
8416 if (item == NULL) {
8417 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8418 /* No mapping found means: mapping is undefined. */
8419 PyErr_Clear();
8420 goto Undefined;
8421 } else
8422 goto onError;
8423 }
8424
8425 /* Apply mapping */
8426 if (item == Py_None)
8427 goto Undefined;
8428 if (PyLong_Check(item)) {
8429 long value = PyLong_AS_LONG(item);
8430 if (value == 0xFFFE)
8431 goto Undefined;
8432 if (value < 0 || value > MAX_UNICODE) {
8433 PyErr_Format(PyExc_TypeError,
8434 "character mapping must be in range(0x%x)",
8435 (unsigned long)MAX_UNICODE + 1);
8436 goto onError;
8437 }
8438
8439 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8440 goto onError;
8441 }
8442 else if (PyUnicode_Check(item)) {
8443 if (PyUnicode_READY(item) == -1)
8444 goto onError;
8445 if (PyUnicode_GET_LENGTH(item) == 1) {
8446 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8447 if (value == 0xFFFE)
8448 goto Undefined;
8449 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8450 goto onError;
8451 }
8452 else {
8453 writer->overallocate = 1;
8454 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8455 goto onError;
8456 }
8457 }
8458 else {
8459 /* wrong return value */
8460 PyErr_SetString(PyExc_TypeError,
8461 "character mapping must return integer, None or str");
8462 goto onError;
8463 }
8464 Py_CLEAR(item);
8465 ++s;
8466 continue;
8467
8468 Undefined:
8469 /* undefined mapping */
8470 Py_CLEAR(item);
8471 startinpos = s-starts;
8472 endinpos = startinpos+1;
8473 if (unicode_decode_call_errorhandler_writer(
8474 errors, &errorHandler,
8475 "charmap", "character maps to <undefined>",
8476 &starts, &e, &startinpos, &endinpos, &exc, &s,
8477 writer)) {
8478 goto onError;
8479 }
8480 }
8481 Py_XDECREF(errorHandler);
8482 Py_XDECREF(exc);
8483 return 0;
8484
8485 onError:
8486 Py_XDECREF(item);
8487 Py_XDECREF(errorHandler);
8488 Py_XDECREF(exc);
8489 return -1;
8490 }
8491
8492 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8493 PyUnicode_DecodeCharmap(const char *s,
8494 Py_ssize_t size,
8495 PyObject *mapping,
8496 const char *errors)
8497 {
8498 _PyUnicodeWriter writer;
8499
8500 /* Default to Latin-1 */
8501 if (mapping == NULL)
8502 return PyUnicode_DecodeLatin1(s, size, errors);
8503
8504 if (size == 0)
8505 _Py_RETURN_UNICODE_EMPTY();
8506 _PyUnicodeWriter_Init(&writer);
8507 writer.min_length = size;
8508 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8509 goto onError;
8510
8511 if (PyUnicode_CheckExact(mapping)) {
8512 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8513 goto onError;
8514 }
8515 else {
8516 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8517 goto onError;
8518 }
8519 return _PyUnicodeWriter_Finish(&writer);
8520
8521 onError:
8522 _PyUnicodeWriter_Dealloc(&writer);
8523 return NULL;
8524 }
8525
8526 /* Charmap encoding: the lookup table */
8527
8528 struct encoding_map {
8529 PyObject_HEAD
8530 unsigned char level1[32];
8531 int count2, count3;
8532 unsigned char level23[1];
8533 };
8534
8535 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8536 encoding_map_size(PyObject *obj, PyObject* args)
8537 {
8538 struct encoding_map *map = (struct encoding_map*)obj;
8539 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8540 128*map->count3);
8541 }
8542
8543 static PyMethodDef encoding_map_methods[] = {
8544 {"size", encoding_map_size, METH_NOARGS,
8545 PyDoc_STR("Return the size (in bytes) of this object") },
8546 { 0 }
8547 };
8548
8549 static PyTypeObject EncodingMapType = {
8550 PyVarObject_HEAD_INIT(NULL, 0)
8551 "EncodingMap", /*tp_name*/
8552 sizeof(struct encoding_map), /*tp_basicsize*/
8553 0, /*tp_itemsize*/
8554 /* methods */
8555 0, /*tp_dealloc*/
8556 0, /*tp_vectorcall_offset*/
8557 0, /*tp_getattr*/
8558 0, /*tp_setattr*/
8559 0, /*tp_as_async*/
8560 0, /*tp_repr*/
8561 0, /*tp_as_number*/
8562 0, /*tp_as_sequence*/
8563 0, /*tp_as_mapping*/
8564 0, /*tp_hash*/
8565 0, /*tp_call*/
8566 0, /*tp_str*/
8567 0, /*tp_getattro*/
8568 0, /*tp_setattro*/
8569 0, /*tp_as_buffer*/
8570 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8571 0, /*tp_doc*/
8572 0, /*tp_traverse*/
8573 0, /*tp_clear*/
8574 0, /*tp_richcompare*/
8575 0, /*tp_weaklistoffset*/
8576 0, /*tp_iter*/
8577 0, /*tp_iternext*/
8578 encoding_map_methods, /*tp_methods*/
8579 0, /*tp_members*/
8580 0, /*tp_getset*/
8581 0, /*tp_base*/
8582 0, /*tp_dict*/
8583 0, /*tp_descr_get*/
8584 0, /*tp_descr_set*/
8585 0, /*tp_dictoffset*/
8586 0, /*tp_init*/
8587 0, /*tp_alloc*/
8588 0, /*tp_new*/
8589 0, /*tp_free*/
8590 0, /*tp_is_gc*/
8591 };
8592
8593 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8594 PyUnicode_BuildEncodingMap(PyObject* string)
8595 {
8596 PyObject *result;
8597 struct encoding_map *mresult;
8598 int i;
8599 int need_dict = 0;
8600 unsigned char level1[32];
8601 unsigned char level2[512];
8602 unsigned char *mlevel1, *mlevel2, *mlevel3;
8603 int count2 = 0, count3 = 0;
8604 int kind;
8605 const void *data;
8606 Py_ssize_t length;
8607 Py_UCS4 ch;
8608
8609 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8610 PyErr_BadArgument();
8611 return NULL;
8612 }
8613 kind = PyUnicode_KIND(string);
8614 data = PyUnicode_DATA(string);
8615 length = PyUnicode_GET_LENGTH(string);
8616 length = Py_MIN(length, 256);
8617 memset(level1, 0xFF, sizeof level1);
8618 memset(level2, 0xFF, sizeof level2);
8619
8620 /* If there isn't a one-to-one mapping of NULL to \0,
8621 or if there are non-BMP characters, we need to use
8622 a mapping dictionary. */
8623 if (PyUnicode_READ(kind, data, 0) != 0)
8624 need_dict = 1;
8625 for (i = 1; i < length; i++) {
8626 int l1, l2;
8627 ch = PyUnicode_READ(kind, data, i);
8628 if (ch == 0 || ch > 0xFFFF) {
8629 need_dict = 1;
8630 break;
8631 }
8632 if (ch == 0xFFFE)
8633 /* unmapped character */
8634 continue;
8635 l1 = ch >> 11;
8636 l2 = ch >> 7;
8637 if (level1[l1] == 0xFF)
8638 level1[l1] = count2++;
8639 if (level2[l2] == 0xFF)
8640 level2[l2] = count3++;
8641 }
8642
8643 if (count2 >= 0xFF || count3 >= 0xFF)
8644 need_dict = 1;
8645
8646 if (need_dict) {
8647 PyObject *result = PyDict_New();
8648 PyObject *key, *value;
8649 if (!result)
8650 return NULL;
8651 for (i = 0; i < length; i++) {
8652 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8653 value = PyLong_FromLong(i);
8654 if (!key || !value)
8655 goto failed1;
8656 if (PyDict_SetItem(result, key, value) == -1)
8657 goto failed1;
8658 Py_DECREF(key);
8659 Py_DECREF(value);
8660 }
8661 return result;
8662 failed1:
8663 Py_XDECREF(key);
8664 Py_XDECREF(value);
8665 Py_DECREF(result);
8666 return NULL;
8667 }
8668
8669 /* Create a three-level trie */
8670 result = PyObject_Malloc(sizeof(struct encoding_map) +
8671 16*count2 + 128*count3 - 1);
8672 if (!result) {
8673 return PyErr_NoMemory();
8674 }
8675
8676 _PyObject_Init(result, &EncodingMapType);
8677 mresult = (struct encoding_map*)result;
8678 mresult->count2 = count2;
8679 mresult->count3 = count3;
8680 mlevel1 = mresult->level1;
8681 mlevel2 = mresult->level23;
8682 mlevel3 = mresult->level23 + 16*count2;
8683 memcpy(mlevel1, level1, 32);
8684 memset(mlevel2, 0xFF, 16*count2);
8685 memset(mlevel3, 0, 128*count3);
8686 count3 = 0;
8687 for (i = 1; i < length; i++) {
8688 int o1, o2, o3, i2, i3;
8689 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8690 if (ch == 0xFFFE)
8691 /* unmapped character */
8692 continue;
8693 o1 = ch>>11;
8694 o2 = (ch>>7) & 0xF;
8695 i2 = 16*mlevel1[o1] + o2;
8696 if (mlevel2[i2] == 0xFF)
8697 mlevel2[i2] = count3++;
8698 o3 = ch & 0x7F;
8699 i3 = 128*mlevel2[i2] + o3;
8700 mlevel3[i3] = i;
8701 }
8702 return result;
8703 }
8704
8705 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8706 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8707 {
8708 struct encoding_map *map = (struct encoding_map*)mapping;
8709 int l1 = c>>11;
8710 int l2 = (c>>7) & 0xF;
8711 int l3 = c & 0x7F;
8712 int i;
8713
8714 if (c > 0xFFFF)
8715 return -1;
8716 if (c == 0)
8717 return 0;
8718 /* level 1*/
8719 i = map->level1[l1];
8720 if (i == 0xFF) {
8721 return -1;
8722 }
8723 /* level 2*/
8724 i = map->level23[16*i+l2];
8725 if (i == 0xFF) {
8726 return -1;
8727 }
8728 /* level 3 */
8729 i = map->level23[16*map->count2 + 128*i + l3];
8730 if (i == 0) {
8731 return -1;
8732 }
8733 return i;
8734 }
8735
8736 /* Lookup the character ch in the mapping. If the character
8737 can't be found, Py_None is returned (or NULL, if another
8738 error occurred). */
8739 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8740 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8741 {
8742 PyObject *w = PyLong_FromLong((long)c);
8743 PyObject *x;
8744
8745 if (w == NULL)
8746 return NULL;
8747 x = PyObject_GetItem(mapping, w);
8748 Py_DECREF(w);
8749 if (x == NULL) {
8750 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8751 /* No mapping found means: mapping is undefined. */
8752 PyErr_Clear();
8753 Py_RETURN_NONE;
8754 } else
8755 return NULL;
8756 }
8757 else if (x == Py_None)
8758 return x;
8759 else if (PyLong_Check(x)) {
8760 long value = PyLong_AS_LONG(x);
8761 if (value < 0 || value > 255) {
8762 PyErr_SetString(PyExc_TypeError,
8763 "character mapping must be in range(256)");
8764 Py_DECREF(x);
8765 return NULL;
8766 }
8767 return x;
8768 }
8769 else if (PyBytes_Check(x))
8770 return x;
8771 else {
8772 /* wrong return value */
8773 PyErr_Format(PyExc_TypeError,
8774 "character mapping must return integer, bytes or None, not %.400s",
8775 Py_TYPE(x)->tp_name);
8776 Py_DECREF(x);
8777 return NULL;
8778 }
8779 }
8780
8781 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8782 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8783 {
8784 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8785 /* exponentially overallocate to minimize reallocations */
8786 if (requiredsize < 2*outsize)
8787 requiredsize = 2*outsize;
8788 if (_PyBytes_Resize(outobj, requiredsize))
8789 return -1;
8790 return 0;
8791 }
8792
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8796 /* lookup the character, put the result in the output string and adjust
8797 various state variables. Resize the output bytes object if not enough
8798 space is available. Return a new reference to the object that
8799 was put in the output buffer, or Py_None, if the mapping was undefined
8800 (in which case no character was written) or NULL, if a
8801 reallocation error occurred. The caller must decref the result */
8802 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8803 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8804 PyObject **outobj, Py_ssize_t *outpos)
8805 {
8806 PyObject *rep;
8807 char *outstart;
8808 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8809
8810 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8811 int res = encoding_map_lookup(c, mapping);
8812 Py_ssize_t requiredsize = *outpos+1;
8813 if (res == -1)
8814 return enc_FAILED;
8815 if (outsize<requiredsize)
8816 if (charmapencode_resize(outobj, outpos, requiredsize))
8817 return enc_EXCEPTION;
8818 outstart = PyBytes_AS_STRING(*outobj);
8819 outstart[(*outpos)++] = (char)res;
8820 return enc_SUCCESS;
8821 }
8822
8823 rep = charmapencode_lookup(c, mapping);
8824 if (rep==NULL)
8825 return enc_EXCEPTION;
8826 else if (rep==Py_None) {
8827 Py_DECREF(rep);
8828 return enc_FAILED;
8829 } else {
8830 if (PyLong_Check(rep)) {
8831 Py_ssize_t requiredsize = *outpos+1;
8832 if (outsize<requiredsize)
8833 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8834 Py_DECREF(rep);
8835 return enc_EXCEPTION;
8836 }
8837 outstart = PyBytes_AS_STRING(*outobj);
8838 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8839 }
8840 else {
8841 const char *repchars = PyBytes_AS_STRING(rep);
8842 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8843 Py_ssize_t requiredsize = *outpos+repsize;
8844 if (outsize<requiredsize)
8845 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8846 Py_DECREF(rep);
8847 return enc_EXCEPTION;
8848 }
8849 outstart = PyBytes_AS_STRING(*outobj);
8850 memcpy(outstart + *outpos, repchars, repsize);
8851 *outpos += repsize;
8852 }
8853 }
8854 Py_DECREF(rep);
8855 return enc_SUCCESS;
8856 }
8857
/* handle an error in PyUnicode_EncodeCharmap

   On entry *inpos is the index of a character of unicode for which the
   mapping is undefined.  The function first extends that position to the
   whole run of consecutive unencodable characters, then applies the error
   policy named by errors:

     - strict: raise the encode exception and fail;
     - replace: encode one '?' per unencodable character;
     - ignore: skip the run;
     - xmlcharrefreplace: encode "&#N;" for every unencodable character;
     - anything else: call the registered error handler and encode the
       bytes or str it returns.

   *error_handler caches the decoded policy (it is resolved from errors
   while it still is _Py_ERROR_UNKNOWN); *error_handler_obj and
   *exceptionObject are handed to the helper routines that look up the
   callback and build the exception.  Replacement output is appended to
   *res at *respos.  On success *inpos is moved to the position where
   encoding should continue.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: advance collendpos until a
       character with a defined mapping (or the end of the string) is hit */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* dedicated encoding map: -1 means "undefined" */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: None means "undefined", NULL is an error */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* one '?' per unencodable character; if '?' itself cannot be
           encoded with this mapping the original error is raised */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: format "&#N;" into a local buffer and
           encode it character by character through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other policy: ask the error callback for a replacement and
           for the position (newpos) at which to resume */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the callback returned a str, so every
           character of it has to go through the mapping as well */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
9005
9006 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)9007 _PyUnicode_EncodeCharmap(PyObject *unicode,
9008 PyObject *mapping,
9009 const char *errors)
9010 {
9011 /* output object */
9012 PyObject *res = NULL;
9013 /* current input position */
9014 Py_ssize_t inpos = 0;
9015 Py_ssize_t size;
9016 /* current output position */
9017 Py_ssize_t respos = 0;
9018 PyObject *error_handler_obj = NULL;
9019 PyObject *exc = NULL;
9020 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
9021 const void *data;
9022 int kind;
9023
9024 if (PyUnicode_READY(unicode) == -1)
9025 return NULL;
9026 size = PyUnicode_GET_LENGTH(unicode);
9027 data = PyUnicode_DATA(unicode);
9028 kind = PyUnicode_KIND(unicode);
9029
9030 /* Default to Latin-1 */
9031 if (mapping == NULL)
9032 return unicode_encode_ucs1(unicode, errors, 256);
9033
9034 /* allocate enough for a simple encoding without
9035 replacements, if we need more, we'll resize */
9036 res = PyBytes_FromStringAndSize(NULL, size);
9037 if (res == NULL)
9038 goto onError;
9039 if (size == 0)
9040 return res;
9041
9042 while (inpos<size) {
9043 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9044 /* try to encode it */
9045 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
9046 if (x==enc_EXCEPTION) /* error */
9047 goto onError;
9048 if (x==enc_FAILED) { /* unencodable character */
9049 if (charmap_encoding_error(unicode, &inpos, mapping,
9050 &exc,
9051 &error_handler, &error_handler_obj, errors,
9052 &res, &respos)) {
9053 goto onError;
9054 }
9055 }
9056 else
9057 /* done with this character => adjust input position */
9058 ++inpos;
9059 }
9060
9061 /* Resize if we allocated to much */
9062 if (respos<PyBytes_GET_SIZE(res))
9063 if (_PyBytes_Resize(&res, respos) < 0)
9064 goto onError;
9065
9066 Py_XDECREF(exc);
9067 Py_XDECREF(error_handler_obj);
9068 return res;
9069
9070 onError:
9071 Py_XDECREF(res);
9072 Py_XDECREF(exc);
9073 Py_XDECREF(error_handler_obj);
9074 return NULL;
9075 }
9076
9077 /* Deprecated */
9078 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9079 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9080 Py_ssize_t size,
9081 PyObject *mapping,
9082 const char *errors)
9083 {
9084 PyObject *result;
9085 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9086 if (unicode == NULL)
9087 return NULL;
9088 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9089 Py_DECREF(unicode);
9090 return result;
9091 }
9092
9093 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)9094 PyUnicode_AsCharmapString(PyObject *unicode,
9095 PyObject *mapping)
9096 {
9097 if (!PyUnicode_Check(unicode) || mapping == NULL) {
9098 PyErr_BadArgument();
9099 return NULL;
9100 }
9101 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9102 }
9103
9104 /* create or adjust a UnicodeTranslateError */
9105 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)9106 make_translate_exception(PyObject **exceptionObject,
9107 PyObject *unicode,
9108 Py_ssize_t startpos, Py_ssize_t endpos,
9109 const char *reason)
9110 {
9111 if (*exceptionObject == NULL) {
9112 *exceptionObject = _PyUnicodeTranslateError_Create(
9113 unicode, startpos, endpos, reason);
9114 }
9115 else {
9116 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9117 goto onError;
9118 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9119 goto onError;
9120 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9121 goto onError;
9122 return;
9123 onError:
9124 Py_CLEAR(*exceptionObject);
9125 }
9126 }
9127
9128 /* error handling callback helper:
9129 build arguments, call the callback and check the arguments,
9130 put the result into newpos and return the replacement string, which
9131 has to be freed by the caller */
9132 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)9133 unicode_translate_call_errorhandler(const char *errors,
9134 PyObject **errorHandler,
9135 const char *reason,
9136 PyObject *unicode, PyObject **exceptionObject,
9137 Py_ssize_t startpos, Py_ssize_t endpos,
9138 Py_ssize_t *newpos)
9139 {
9140 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9141
9142 Py_ssize_t i_newpos;
9143 PyObject *restuple;
9144 PyObject *resunicode;
9145
9146 if (*errorHandler == NULL) {
9147 *errorHandler = PyCodec_LookupError(errors);
9148 if (*errorHandler == NULL)
9149 return NULL;
9150 }
9151
9152 make_translate_exception(exceptionObject,
9153 unicode, startpos, endpos, reason);
9154 if (*exceptionObject == NULL)
9155 return NULL;
9156
9157 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9158 if (restuple == NULL)
9159 return NULL;
9160 if (!PyTuple_Check(restuple)) {
9161 PyErr_SetString(PyExc_TypeError, &argparse[3]);
9162 Py_DECREF(restuple);
9163 return NULL;
9164 }
9165 if (!PyArg_ParseTuple(restuple, argparse,
9166 &resunicode, &i_newpos)) {
9167 Py_DECREF(restuple);
9168 return NULL;
9169 }
9170 if (i_newpos<0)
9171 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9172 else
9173 *newpos = i_newpos;
9174 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9175 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9176 Py_DECREF(restuple);
9177 return NULL;
9178 }
9179 Py_INCREF(resunicode);
9180 Py_DECREF(restuple);
9181 return resunicode;
9182 }
9183
9184 /* Lookup the character ch in the mapping and put the result in result,
9185 which must be decrefed by the caller.
9186 Return 0 on success, -1 on error */
9187 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)9188 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
9189 {
9190 PyObject *w = PyLong_FromLong((long)c);
9191 PyObject *x;
9192
9193 if (w == NULL)
9194 return -1;
9195 x = PyObject_GetItem(mapping, w);
9196 Py_DECREF(w);
9197 if (x == NULL) {
9198 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9199 /* No mapping found means: use 1:1 mapping. */
9200 PyErr_Clear();
9201 *result = NULL;
9202 return 0;
9203 } else
9204 return -1;
9205 }
9206 else if (x == Py_None) {
9207 *result = x;
9208 return 0;
9209 }
9210 else if (PyLong_Check(x)) {
9211 long value = PyLong_AS_LONG(x);
9212 if (value < 0 || value > MAX_UNICODE) {
9213 PyErr_Format(PyExc_ValueError,
9214 "character mapping must be in range(0x%x)",
9215 MAX_UNICODE+1);
9216 Py_DECREF(x);
9217 return -1;
9218 }
9219 *result = x;
9220 return 0;
9221 }
9222 else if (PyUnicode_Check(x)) {
9223 *result = x;
9224 return 0;
9225 }
9226 else {
9227 /* wrong return value */
9228 PyErr_SetString(PyExc_TypeError,
9229 "character mapping must return integer, None or str");
9230 Py_DECREF(x);
9231 return -1;
9232 }
9233 }
9234
9235 /* lookup the character, write the result into the writer.
9236 Return 1 if the result was written into the writer, return 0 if the mapping
9237 was undefined, raise an exception return -1 on error. */
9238 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9239 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9240 _PyUnicodeWriter *writer)
9241 {
9242 PyObject *item;
9243
9244 if (charmaptranslate_lookup(ch, mapping, &item))
9245 return -1;
9246
9247 if (item == NULL) {
9248 /* not found => default to 1:1 mapping */
9249 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9250 return -1;
9251 }
9252 return 1;
9253 }
9254
9255 if (item == Py_None) {
9256 Py_DECREF(item);
9257 return 0;
9258 }
9259
9260 if (PyLong_Check(item)) {
9261 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9262 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9263 used it */
9264 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9265 Py_DECREF(item);
9266 return -1;
9267 }
9268 Py_DECREF(item);
9269 return 1;
9270 }
9271
9272 if (!PyUnicode_Check(item)) {
9273 Py_DECREF(item);
9274 return -1;
9275 }
9276
9277 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9278 Py_DECREF(item);
9279 return -1;
9280 }
9281
9282 Py_DECREF(item);
9283 return 1;
9284 }
9285
9286 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9287 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9288 Py_UCS1 *translate)
9289 {
9290 PyObject *item = NULL;
9291 int ret = 0;
9292
9293 if (charmaptranslate_lookup(ch, mapping, &item)) {
9294 return -1;
9295 }
9296
9297 if (item == Py_None) {
9298 /* deletion */
9299 translate[ch] = 0xfe;
9300 }
9301 else if (item == NULL) {
9302 /* not found => default to 1:1 mapping */
9303 translate[ch] = ch;
9304 return 1;
9305 }
9306 else if (PyLong_Check(item)) {
9307 long replace = PyLong_AS_LONG(item);
9308 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9309 used it */
9310 if (127 < replace) {
9311 /* invalid character or character outside ASCII:
9312 skip the fast translate */
9313 goto exit;
9314 }
9315 translate[ch] = (Py_UCS1)replace;
9316 }
9317 else if (PyUnicode_Check(item)) {
9318 Py_UCS4 replace;
9319
9320 if (PyUnicode_READY(item) == -1) {
9321 Py_DECREF(item);
9322 return -1;
9323 }
9324 if (PyUnicode_GET_LENGTH(item) != 1)
9325 goto exit;
9326
9327 replace = PyUnicode_READ_CHAR(item, 0);
9328 if (replace > 127)
9329 goto exit;
9330 translate[ch] = (Py_UCS1)replace;
9331 }
9332 else {
9333 /* not None, NULL, long or unicode */
9334 goto exit;
9335 }
9336 ret = 1;
9337
9338 exit:
9339 Py_DECREF(item);
9340 return ret;
9341 }
9342
9343 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9344 was translated into writer, return 0 if the input string was partially
9345 translated into writer, raise an exception and return -1 on error. */
9346 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9347 unicode_fast_translate(PyObject *input, PyObject *mapping,
9348 _PyUnicodeWriter *writer, int ignore,
9349 Py_ssize_t *input_pos)
9350 {
9351 Py_UCS1 ascii_table[128], ch, ch2;
9352 Py_ssize_t len;
9353 const Py_UCS1 *in, *end;
9354 Py_UCS1 *out;
9355 int res = 0;
9356
9357 len = PyUnicode_GET_LENGTH(input);
9358
9359 memset(ascii_table, 0xff, 128);
9360
9361 in = PyUnicode_1BYTE_DATA(input);
9362 end = in + len;
9363
9364 assert(PyUnicode_IS_ASCII(writer->buffer));
9365 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9366 out = PyUnicode_1BYTE_DATA(writer->buffer);
9367
9368 for (; in < end; in++) {
9369 ch = *in;
9370 ch2 = ascii_table[ch];
9371 if (ch2 == 0xff) {
9372 int translate = unicode_fast_translate_lookup(mapping, ch,
9373 ascii_table);
9374 if (translate < 0)
9375 return -1;
9376 if (translate == 0)
9377 goto exit;
9378 ch2 = ascii_table[ch];
9379 }
9380 if (ch2 == 0xfe) {
9381 if (ignore)
9382 continue;
9383 goto exit;
9384 }
9385 assert(ch2 < 128);
9386 *out = ch2;
9387 out++;
9388 }
9389 res = 1;
9390
9391 exit:
9392 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9393 *input_pos = in - PyUnicode_1BYTE_DATA(input);
9394 return res;
9395 }
9396
9397 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9398 _PyUnicode_TranslateCharmap(PyObject *input,
9399 PyObject *mapping,
9400 const char *errors)
9401 {
9402 /* input object */
9403 const void *data;
9404 Py_ssize_t size, i;
9405 int kind;
9406 /* output buffer */
9407 _PyUnicodeWriter writer;
9408 /* error handler */
9409 const char *reason = "character maps to <undefined>";
9410 PyObject *errorHandler = NULL;
9411 PyObject *exc = NULL;
9412 int ignore;
9413 int res;
9414
9415 if (mapping == NULL) {
9416 PyErr_BadArgument();
9417 return NULL;
9418 }
9419
9420 if (PyUnicode_READY(input) == -1)
9421 return NULL;
9422 data = PyUnicode_DATA(input);
9423 kind = PyUnicode_KIND(input);
9424 size = PyUnicode_GET_LENGTH(input);
9425
9426 if (size == 0)
9427 return PyUnicode_FromObject(input);
9428
9429 /* allocate enough for a simple 1:1 translation without
9430 replacements, if we need more, we'll resize */
9431 _PyUnicodeWriter_Init(&writer);
9432 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9433 goto onError;
9434
9435 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9436
9437 if (PyUnicode_READY(input) == -1)
9438 return NULL;
9439 if (PyUnicode_IS_ASCII(input)) {
9440 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9441 if (res < 0) {
9442 _PyUnicodeWriter_Dealloc(&writer);
9443 return NULL;
9444 }
9445 if (res == 1)
9446 return _PyUnicodeWriter_Finish(&writer);
9447 }
9448 else {
9449 i = 0;
9450 }
9451
9452 while (i<size) {
9453 /* try to encode it */
9454 int translate;
9455 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9456 Py_ssize_t newpos;
9457 /* startpos for collecting untranslatable chars */
9458 Py_ssize_t collstart;
9459 Py_ssize_t collend;
9460 Py_UCS4 ch;
9461
9462 ch = PyUnicode_READ(kind, data, i);
9463 translate = charmaptranslate_output(ch, mapping, &writer);
9464 if (translate < 0)
9465 goto onError;
9466
9467 if (translate != 0) {
9468 /* it worked => adjust input pointer */
9469 ++i;
9470 continue;
9471 }
9472
9473 /* untranslatable character */
9474 collstart = i;
9475 collend = i+1;
9476
9477 /* find all untranslatable characters */
9478 while (collend < size) {
9479 PyObject *x;
9480 ch = PyUnicode_READ(kind, data, collend);
9481 if (charmaptranslate_lookup(ch, mapping, &x))
9482 goto onError;
9483 Py_XDECREF(x);
9484 if (x != Py_None)
9485 break;
9486 ++collend;
9487 }
9488
9489 if (ignore) {
9490 i = collend;
9491 }
9492 else {
9493 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9494 reason, input, &exc,
9495 collstart, collend, &newpos);
9496 if (repunicode == NULL)
9497 goto onError;
9498 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9499 Py_DECREF(repunicode);
9500 goto onError;
9501 }
9502 Py_DECREF(repunicode);
9503 i = newpos;
9504 }
9505 }
9506 Py_XDECREF(exc);
9507 Py_XDECREF(errorHandler);
9508 return _PyUnicodeWriter_Finish(&writer);
9509
9510 onError:
9511 _PyUnicodeWriter_Dealloc(&writer);
9512 Py_XDECREF(exc);
9513 Py_XDECREF(errorHandler);
9514 return NULL;
9515 }
9516
9517 /* Deprecated. Use PyUnicode_Translate instead. */
9518 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9519 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9520 Py_ssize_t size,
9521 PyObject *mapping,
9522 const char *errors)
9523 {
9524 PyObject *result;
9525 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9526 if (!unicode)
9527 return NULL;
9528 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9529 Py_DECREF(unicode);
9530 return result;
9531 }
9532
9533 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9534 PyUnicode_Translate(PyObject *str,
9535 PyObject *mapping,
9536 const char *errors)
9537 {
9538 if (ensure_unicode(str) < 0)
9539 return NULL;
9540 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9541 }
9542
9543 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9544 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9545 {
9546 if (!PyUnicode_Check(unicode)) {
9547 PyErr_BadInternalCall();
9548 return NULL;
9549 }
9550 if (PyUnicode_READY(unicode) == -1)
9551 return NULL;
9552 if (PyUnicode_IS_ASCII(unicode)) {
9553 /* If the string is already ASCII, just return the same string */
9554 Py_INCREF(unicode);
9555 return unicode;
9556 }
9557
9558 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9559 PyObject *result = PyUnicode_New(len, 127);
9560 if (result == NULL) {
9561 return NULL;
9562 }
9563
9564 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9565 int kind = PyUnicode_KIND(unicode);
9566 const void *data = PyUnicode_DATA(unicode);
9567 Py_ssize_t i;
9568 for (i = 0; i < len; ++i) {
9569 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9570 if (ch < 127) {
9571 out[i] = ch;
9572 }
9573 else if (Py_UNICODE_ISSPACE(ch)) {
9574 out[i] = ' ';
9575 }
9576 else {
9577 int decimal = Py_UNICODE_TODECIMAL(ch);
9578 if (decimal < 0) {
9579 out[i] = '?';
9580 out[i+1] = '\0';
9581 _PyUnicode_LENGTH(result) = i + 1;
9582 break;
9583 }
9584 out[i] = '0' + decimal;
9585 }
9586 }
9587
9588 assert(_PyUnicode_CheckConsistency(result, 1));
9589 return result;
9590 }
9591
9592 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9593 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9594 Py_ssize_t length)
9595 {
9596 PyObject *decimal;
9597 Py_ssize_t i;
9598 Py_UCS4 maxchar;
9599 enum PyUnicode_Kind kind;
9600 const void *data;
9601
9602 maxchar = 127;
9603 for (i = 0; i < length; i++) {
9604 Py_UCS4 ch = s[i];
9605 if (ch > 127) {
9606 int decimal = Py_UNICODE_TODECIMAL(ch);
9607 if (decimal >= 0)
9608 ch = '0' + decimal;
9609 maxchar = Py_MAX(maxchar, ch);
9610 }
9611 }
9612
9613 /* Copy to a new string */
9614 decimal = PyUnicode_New(length, maxchar);
9615 if (decimal == NULL)
9616 return decimal;
9617 kind = PyUnicode_KIND(decimal);
9618 data = PyUnicode_DATA(decimal);
9619 /* Iterate over code points */
9620 for (i = 0; i < length; i++) {
9621 Py_UCS4 ch = s[i];
9622 if (ch > 127) {
9623 int decimal = Py_UNICODE_TODECIMAL(ch);
9624 if (decimal >= 0)
9625 ch = '0' + decimal;
9626 }
9627 PyUnicode_WRITE(kind, data, i, ch);
9628 }
9629 return unicode_result(decimal);
9630 }
9631 /* --- Decimal Encoder ---------------------------------------------------- */
9632
9633 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9634 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9635 Py_ssize_t length,
9636 char *output,
9637 const char *errors)
9638 {
9639 PyObject *unicode;
9640 Py_ssize_t i;
9641 enum PyUnicode_Kind kind;
9642 const void *data;
9643
9644 if (output == NULL) {
9645 PyErr_BadArgument();
9646 return -1;
9647 }
9648
9649 unicode = PyUnicode_FromWideChar(s, length);
9650 if (unicode == NULL)
9651 return -1;
9652
9653 kind = PyUnicode_KIND(unicode);
9654 data = PyUnicode_DATA(unicode);
9655
9656 for (i=0; i < length; ) {
9657 PyObject *exc;
9658 Py_UCS4 ch;
9659 int decimal;
9660 Py_ssize_t startpos;
9661
9662 ch = PyUnicode_READ(kind, data, i);
9663
9664 if (Py_UNICODE_ISSPACE(ch)) {
9665 *output++ = ' ';
9666 i++;
9667 continue;
9668 }
9669 decimal = Py_UNICODE_TODECIMAL(ch);
9670 if (decimal >= 0) {
9671 *output++ = '0' + decimal;
9672 i++;
9673 continue;
9674 }
9675 if (0 < ch && ch < 256) {
9676 *output++ = (char)ch;
9677 i++;
9678 continue;
9679 }
9680
9681 startpos = i;
9682 exc = NULL;
9683 raise_encode_exception(&exc, "decimal", unicode,
9684 startpos, startpos+1,
9685 "invalid decimal Unicode string");
9686 Py_XDECREF(exc);
9687 Py_DECREF(unicode);
9688 return -1;
9689 }
9690 /* 0-terminate the output string */
9691 *output++ = '\0';
9692 Py_DECREF(unicode);
9693 return 0;
9694 }
9695
9696 /* --- Helpers ------------------------------------------------------------ */
9697
/* helper macro to fixup start/end slice values:
   clamp `end` to `len`, resolve negative `start`/`end` relative to `len`
   and clamp what is still negative to 0.

   `start` and `end` must be modifiable lvalues.  All arguments are
   evaluated more than once, so they must not have side effects.  The body
   is wrapped in do { } while (0) so that the macro behaves as a single
   statement (e.g. as the body of an if/else); call it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9712
9713 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9714 any_find_slice(PyObject* s1, PyObject* s2,
9715 Py_ssize_t start,
9716 Py_ssize_t end,
9717 int direction)
9718 {
9719 int kind1, kind2;
9720 const void *buf1, *buf2;
9721 Py_ssize_t len1, len2, result;
9722
9723 kind1 = PyUnicode_KIND(s1);
9724 kind2 = PyUnicode_KIND(s2);
9725 if (kind1 < kind2)
9726 return -1;
9727
9728 len1 = PyUnicode_GET_LENGTH(s1);
9729 len2 = PyUnicode_GET_LENGTH(s2);
9730 ADJUST_INDICES(start, end, len1);
9731 if (end - start < len2)
9732 return -1;
9733
9734 buf1 = PyUnicode_DATA(s1);
9735 buf2 = PyUnicode_DATA(s2);
9736 if (len2 == 1) {
9737 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9738 result = findchar((const char *)buf1 + kind1*start,
9739 kind1, end - start, ch, direction);
9740 if (result == -1)
9741 return -1;
9742 else
9743 return start + result;
9744 }
9745
9746 if (kind2 != kind1) {
9747 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9748 if (!buf2)
9749 return -2;
9750 }
9751
9752 if (direction > 0) {
9753 switch (kind1) {
9754 case PyUnicode_1BYTE_KIND:
9755 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9756 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9757 else
9758 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9759 break;
9760 case PyUnicode_2BYTE_KIND:
9761 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9762 break;
9763 case PyUnicode_4BYTE_KIND:
9764 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9765 break;
9766 default:
9767 Py_UNREACHABLE();
9768 }
9769 }
9770 else {
9771 switch (kind1) {
9772 case PyUnicode_1BYTE_KIND:
9773 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9774 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9775 else
9776 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9777 break;
9778 case PyUnicode_2BYTE_KIND:
9779 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9780 break;
9781 case PyUnicode_4BYTE_KIND:
9782 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9783 break;
9784 default:
9785 Py_UNREACHABLE();
9786 }
9787 }
9788
9789 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9790 if (kind2 != kind1)
9791 PyMem_Free((void *)buf2);
9792
9793 return result;
9794 }
9795
9796 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9797 #include "stringlib/localeutil.h"
9798
/**
 * _PyUnicode_InsertThousandsGrouping:
 * @writer: Unicode writer to fill, or NULL to only count.
 * @n_buffer: In filling mode, the offset added to @writer->pos to obtain
 *         the starting buffer position (asserted not to exceed the length
 *         of the writer's buffer); in counting mode it is only used as the
 *         initial buffer position.
 * @digits: Digits we're reading from.  Must be non-NULL in filling mode
 *         and NULL in counting mode (both asserted).
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL in filling mode and non-NULL in counting mode
 *           (both asserted).  In counting mode *@maxchar is initialised to
 *           127 and then passed to InsertThousandsGrouping_fill() --
 *           presumably so it can be raised to the largest character that
 *           would be written; confirm in stringlib/localeutil.h.
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    /* filling mode needs digits and no maxchar; counting mode the reverse */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start one past the end of their respective ranges. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    /* One iteration per group width supplied by the generator. */
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Account for this group (and its separator) in the total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the group to the fill helper; it is called in both modes
           and receives writer (NULL when counting) and maxchar. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* the separator before the next group also counts towards the
           minimum width */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           ran out of group widths, so everything that is left goes into
           one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Hand the final group to the fill helper. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9940
9941
9942 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9943 PyUnicode_Count(PyObject *str,
9944 PyObject *substr,
9945 Py_ssize_t start,
9946 Py_ssize_t end)
9947 {
9948 Py_ssize_t result;
9949 int kind1, kind2;
9950 const void *buf1 = NULL, *buf2 = NULL;
9951 Py_ssize_t len1, len2;
9952
9953 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9954 return -1;
9955
9956 kind1 = PyUnicode_KIND(str);
9957 kind2 = PyUnicode_KIND(substr);
9958 if (kind1 < kind2)
9959 return 0;
9960
9961 len1 = PyUnicode_GET_LENGTH(str);
9962 len2 = PyUnicode_GET_LENGTH(substr);
9963 ADJUST_INDICES(start, end, len1);
9964 if (end - start < len2)
9965 return 0;
9966
9967 buf1 = PyUnicode_DATA(str);
9968 buf2 = PyUnicode_DATA(substr);
9969 if (kind2 != kind1) {
9970 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9971 if (!buf2)
9972 goto onError;
9973 }
9974
9975 switch (kind1) {
9976 case PyUnicode_1BYTE_KIND:
9977 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9978 result = asciilib_count(
9979 ((const Py_UCS1*)buf1) + start, end - start,
9980 buf2, len2, PY_SSIZE_T_MAX
9981 );
9982 else
9983 result = ucs1lib_count(
9984 ((const Py_UCS1*)buf1) + start, end - start,
9985 buf2, len2, PY_SSIZE_T_MAX
9986 );
9987 break;
9988 case PyUnicode_2BYTE_KIND:
9989 result = ucs2lib_count(
9990 ((const Py_UCS2*)buf1) + start, end - start,
9991 buf2, len2, PY_SSIZE_T_MAX
9992 );
9993 break;
9994 case PyUnicode_4BYTE_KIND:
9995 result = ucs4lib_count(
9996 ((const Py_UCS4*)buf1) + start, end - start,
9997 buf2, len2, PY_SSIZE_T_MAX
9998 );
9999 break;
10000 default:
10001 Py_UNREACHABLE();
10002 }
10003
10004 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10005 if (kind2 != kind1)
10006 PyMem_Free((void *)buf2);
10007
10008 return result;
10009 onError:
10010 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
10011 if (kind2 != kind1)
10012 PyMem_Free((void *)buf2);
10013 return -1;
10014 }
10015
10016 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10017 PyUnicode_Find(PyObject *str,
10018 PyObject *substr,
10019 Py_ssize_t start,
10020 Py_ssize_t end,
10021 int direction)
10022 {
10023 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10024 return -2;
10025
10026 return any_find_slice(str, substr, start, end, direction);
10027 }
10028
10029 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)10030 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10031 Py_ssize_t start, Py_ssize_t end,
10032 int direction)
10033 {
10034 int kind;
10035 Py_ssize_t len, result;
10036 if (PyUnicode_READY(str) == -1)
10037 return -2;
10038 len = PyUnicode_GET_LENGTH(str);
10039 ADJUST_INDICES(start, end, len);
10040 if (end - start < 1)
10041 return -1;
10042 kind = PyUnicode_KIND(str);
10043 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10044 kind, end-start, ch, direction);
10045 if (result == -1)
10046 return -1;
10047 else
10048 return start + result;
10049 }
10050
10051 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)10052 tailmatch(PyObject *self,
10053 PyObject *substring,
10054 Py_ssize_t start,
10055 Py_ssize_t end,
10056 int direction)
10057 {
10058 int kind_self;
10059 int kind_sub;
10060 const void *data_self;
10061 const void *data_sub;
10062 Py_ssize_t offset;
10063 Py_ssize_t i;
10064 Py_ssize_t end_sub;
10065
10066 if (PyUnicode_READY(self) == -1 ||
10067 PyUnicode_READY(substring) == -1)
10068 return -1;
10069
10070 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10071 end -= PyUnicode_GET_LENGTH(substring);
10072 if (end < start)
10073 return 0;
10074
10075 if (PyUnicode_GET_LENGTH(substring) == 0)
10076 return 1;
10077
10078 kind_self = PyUnicode_KIND(self);
10079 data_self = PyUnicode_DATA(self);
10080 kind_sub = PyUnicode_KIND(substring);
10081 data_sub = PyUnicode_DATA(substring);
10082 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10083
10084 if (direction > 0)
10085 offset = end;
10086 else
10087 offset = start;
10088
10089 if (PyUnicode_READ(kind_self, data_self, offset) ==
10090 PyUnicode_READ(kind_sub, data_sub, 0) &&
10091 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10092 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10093 /* If both are of the same kind, memcmp is sufficient */
10094 if (kind_self == kind_sub) {
10095 return ! memcmp((char *)data_self +
10096 (offset * PyUnicode_KIND(substring)),
10097 data_sub,
10098 PyUnicode_GET_LENGTH(substring) *
10099 PyUnicode_KIND(substring));
10100 }
10101 /* otherwise we have to compare each character by first accessing it */
10102 else {
10103 /* We do not need to compare 0 and len(substring)-1 because
10104 the if statement above ensured already that they are equal
10105 when we end up here. */
10106 for (i = 1; i < end_sub; ++i) {
10107 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10108 PyUnicode_READ(kind_sub, data_sub, i))
10109 return 0;
10110 }
10111 return 1;
10112 }
10113 }
10114
10115 return 0;
10116 }
10117
10118 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)10119 PyUnicode_Tailmatch(PyObject *str,
10120 PyObject *substr,
10121 Py_ssize_t start,
10122 Py_ssize_t end,
10123 int direction)
10124 {
10125 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
10126 return -1;
10127
10128 return tailmatch(str, substr, start, end, direction);
10129 }
10130
10131 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)10132 ascii_upper_or_lower(PyObject *self, int lower)
10133 {
10134 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
10135 const char *data = PyUnicode_DATA(self);
10136 char *resdata;
10137 PyObject *res;
10138
10139 res = PyUnicode_New(len, 127);
10140 if (res == NULL)
10141 return NULL;
10142 resdata = PyUnicode_DATA(res);
10143 if (lower)
10144 _Py_bytes_lower(resdata, data, len);
10145 else
10146 _Py_bytes_upper(resdata, data, len);
10147 return res;
10148 }
10149
10150 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)10151 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10152 {
10153 Py_ssize_t j;
10154 int final_sigma;
10155 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
10156 /* U+03A3 is in the Final_Sigma context when, it is found like this:
10157
10158 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10159
10160 where ! is a negation and \p{xxx} is a character with property xxx.
10161 */
10162 for (j = i - 1; j >= 0; j--) {
10163 c = PyUnicode_READ(kind, data, j);
10164 if (!_PyUnicode_IsCaseIgnorable(c))
10165 break;
10166 }
10167 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10168 if (final_sigma) {
10169 for (j = i + 1; j < length; j++) {
10170 c = PyUnicode_READ(kind, data, j);
10171 if (!_PyUnicode_IsCaseIgnorable(c))
10172 break;
10173 }
10174 final_sigma = j == length || !_PyUnicode_IsCased(c);
10175 }
10176 return (final_sigma) ? 0x3C2 : 0x3C3;
10177 }
10178
10179 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)10180 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10181 Py_UCS4 c, Py_UCS4 *mapped)
10182 {
10183 /* Obscure special case. */
10184 if (c == 0x3A3) {
10185 mapped[0] = handle_capital_sigma(kind, data, length, i);
10186 return 1;
10187 }
10188 return _PyUnicode_ToLowerFull(c, mapped);
10189 }
10190
10191 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10192 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10193 {
10194 Py_ssize_t i, k = 0;
10195 int n_res, j;
10196 Py_UCS4 c, mapped[3];
10197
10198 c = PyUnicode_READ(kind, data, 0);
10199 n_res = _PyUnicode_ToTitleFull(c, mapped);
10200 for (j = 0; j < n_res; j++) {
10201 *maxchar = Py_MAX(*maxchar, mapped[j]);
10202 res[k++] = mapped[j];
10203 }
10204 for (i = 1; i < length; i++) {
10205 c = PyUnicode_READ(kind, data, i);
10206 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10207 for (j = 0; j < n_res; j++) {
10208 *maxchar = Py_MAX(*maxchar, mapped[j]);
10209 res[k++] = mapped[j];
10210 }
10211 }
10212 return k;
10213 }
10214
10215 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10216 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10217 Py_ssize_t i, k = 0;
10218
10219 for (i = 0; i < length; i++) {
10220 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10221 int n_res, j;
10222 if (Py_UNICODE_ISUPPER(c)) {
10223 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10224 }
10225 else if (Py_UNICODE_ISLOWER(c)) {
10226 n_res = _PyUnicode_ToUpperFull(c, mapped);
10227 }
10228 else {
10229 n_res = 1;
10230 mapped[0] = c;
10231 }
10232 for (j = 0; j < n_res; j++) {
10233 *maxchar = Py_MAX(*maxchar, mapped[j]);
10234 res[k++] = mapped[j];
10235 }
10236 }
10237 return k;
10238 }
10239
10240 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)10241 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10242 Py_UCS4 *maxchar, int lower)
10243 {
10244 Py_ssize_t i, k = 0;
10245
10246 for (i = 0; i < length; i++) {
10247 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10248 int n_res, j;
10249 if (lower)
10250 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10251 else
10252 n_res = _PyUnicode_ToUpperFull(c, mapped);
10253 for (j = 0; j < n_res; j++) {
10254 *maxchar = Py_MAX(*maxchar, mapped[j]);
10255 res[k++] = mapped[j];
10256 }
10257 }
10258 return k;
10259 }
10260
10261 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10262 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10263 {
10264 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10265 }
10266
10267 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10268 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10269 {
10270 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10271 }
10272
10273 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10274 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10275 {
10276 Py_ssize_t i, k = 0;
10277
10278 for (i = 0; i < length; i++) {
10279 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10280 Py_UCS4 mapped[3];
10281 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10282 for (j = 0; j < n_res; j++) {
10283 *maxchar = Py_MAX(*maxchar, mapped[j]);
10284 res[k++] = mapped[j];
10285 }
10286 }
10287 return k;
10288 }
10289
10290 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)10291 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10292 {
10293 Py_ssize_t i, k = 0;
10294 int previous_is_cased;
10295
10296 previous_is_cased = 0;
10297 for (i = 0; i < length; i++) {
10298 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10299 Py_UCS4 mapped[3];
10300 int n_res, j;
10301
10302 if (previous_is_cased)
10303 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10304 else
10305 n_res = _PyUnicode_ToTitleFull(c, mapped);
10306
10307 for (j = 0; j < n_res; j++) {
10308 *maxchar = Py_MAX(*maxchar, mapped[j]);
10309 res[k++] = mapped[j];
10310 }
10311
10312 previous_is_cased = _PyUnicode_IsCased(c);
10313 }
10314 return k;
10315 }
10316
10317 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))10318 case_operation(PyObject *self,
10319 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10320 {
10321 PyObject *res = NULL;
10322 Py_ssize_t length, newlength = 0;
10323 int kind, outkind;
10324 const void *data;
10325 void *outdata;
10326 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10327
10328 assert(PyUnicode_IS_READY(self));
10329
10330 kind = PyUnicode_KIND(self);
10331 data = PyUnicode_DATA(self);
10332 length = PyUnicode_GET_LENGTH(self);
10333 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10334 PyErr_SetString(PyExc_OverflowError, "string is too long");
10335 return NULL;
10336 }
10337 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10338 if (tmp == NULL)
10339 return PyErr_NoMemory();
10340 newlength = perform(kind, data, length, tmp, &maxchar);
10341 res = PyUnicode_New(newlength, maxchar);
10342 if (res == NULL)
10343 goto leave;
10344 tmpend = tmp + newlength;
10345 outdata = PyUnicode_DATA(res);
10346 outkind = PyUnicode_KIND(res);
10347 switch (outkind) {
10348 case PyUnicode_1BYTE_KIND:
10349 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10350 break;
10351 case PyUnicode_2BYTE_KIND:
10352 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10353 break;
10354 case PyUnicode_4BYTE_KIND:
10355 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10356 break;
10357 default:
10358 Py_UNREACHABLE();
10359 }
10360 leave:
10361 PyMem_Free(tmp);
10362 return res;
10363 }
10364
10365 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10366 PyUnicode_Join(PyObject *separator, PyObject *seq)
10367 {
10368 PyObject *res;
10369 PyObject *fseq;
10370 Py_ssize_t seqlen;
10371 PyObject **items;
10372
10373 fseq = PySequence_Fast(seq, "can only join an iterable");
10374 if (fseq == NULL) {
10375 return NULL;
10376 }
10377
10378 /* NOTE: the following code can't call back into Python code,
10379 * so we are sure that fseq won't be mutated.
10380 */
10381
10382 items = PySequence_Fast_ITEMS(fseq);
10383 seqlen = PySequence_Fast_GET_SIZE(fseq);
10384 res = _PyUnicode_JoinArray(separator, items, seqlen);
10385 Py_DECREF(fseq);
10386 return res;
10387 }
10388
10389 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10390 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10391 {
10392 PyObject *res = NULL; /* the result */
10393 PyObject *sep = NULL;
10394 Py_ssize_t seplen;
10395 PyObject *item;
10396 Py_ssize_t sz, i, res_offset;
10397 Py_UCS4 maxchar;
10398 Py_UCS4 item_maxchar;
10399 int use_memcpy;
10400 unsigned char *res_data = NULL, *sep_data = NULL;
10401 PyObject *last_obj;
10402 unsigned int kind = 0;
10403
10404 /* If empty sequence, return u"". */
10405 if (seqlen == 0) {
10406 _Py_RETURN_UNICODE_EMPTY();
10407 }
10408
10409 /* If singleton sequence with an exact Unicode, return that. */
10410 last_obj = NULL;
10411 if (seqlen == 1) {
10412 if (PyUnicode_CheckExact(items[0])) {
10413 res = items[0];
10414 Py_INCREF(res);
10415 return res;
10416 }
10417 seplen = 0;
10418 maxchar = 0;
10419 }
10420 else {
10421 /* Set up sep and seplen */
10422 if (separator == NULL) {
10423 /* fall back to a blank space separator */
10424 sep = PyUnicode_FromOrdinal(' ');
10425 if (!sep)
10426 goto onError;
10427 seplen = 1;
10428 maxchar = 32;
10429 }
10430 else {
10431 if (!PyUnicode_Check(separator)) {
10432 PyErr_Format(PyExc_TypeError,
10433 "separator: expected str instance,"
10434 " %.80s found",
10435 Py_TYPE(separator)->tp_name);
10436 goto onError;
10437 }
10438 if (PyUnicode_READY(separator))
10439 goto onError;
10440 sep = separator;
10441 seplen = PyUnicode_GET_LENGTH(separator);
10442 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10443 /* inc refcount to keep this code path symmetric with the
10444 above case of a blank separator */
10445 Py_INCREF(sep);
10446 }
10447 last_obj = sep;
10448 }
10449
10450 /* There are at least two things to join, or else we have a subclass
10451 * of str in the sequence.
10452 * Do a pre-pass to figure out the total amount of space we'll
10453 * need (sz), and see whether all argument are strings.
10454 */
10455 sz = 0;
10456 #ifdef Py_DEBUG
10457 use_memcpy = 0;
10458 #else
10459 use_memcpy = 1;
10460 #endif
10461 for (i = 0; i < seqlen; i++) {
10462 size_t add_sz;
10463 item = items[i];
10464 if (!PyUnicode_Check(item)) {
10465 PyErr_Format(PyExc_TypeError,
10466 "sequence item %zd: expected str instance,"
10467 " %.80s found",
10468 i, Py_TYPE(item)->tp_name);
10469 goto onError;
10470 }
10471 if (PyUnicode_READY(item) == -1)
10472 goto onError;
10473 add_sz = PyUnicode_GET_LENGTH(item);
10474 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10475 maxchar = Py_MAX(maxchar, item_maxchar);
10476 if (i != 0) {
10477 add_sz += seplen;
10478 }
10479 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10480 PyErr_SetString(PyExc_OverflowError,
10481 "join() result is too long for a Python string");
10482 goto onError;
10483 }
10484 sz += add_sz;
10485 if (use_memcpy && last_obj != NULL) {
10486 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10487 use_memcpy = 0;
10488 }
10489 last_obj = item;
10490 }
10491
10492 res = PyUnicode_New(sz, maxchar);
10493 if (res == NULL)
10494 goto onError;
10495
10496 /* Catenate everything. */
10497 #ifdef Py_DEBUG
10498 use_memcpy = 0;
10499 #else
10500 if (use_memcpy) {
10501 res_data = PyUnicode_1BYTE_DATA(res);
10502 kind = PyUnicode_KIND(res);
10503 if (seplen != 0)
10504 sep_data = PyUnicode_1BYTE_DATA(sep);
10505 }
10506 #endif
10507 if (use_memcpy) {
10508 for (i = 0; i < seqlen; ++i) {
10509 Py_ssize_t itemlen;
10510 item = items[i];
10511
10512 /* Copy item, and maybe the separator. */
10513 if (i && seplen != 0) {
10514 memcpy(res_data,
10515 sep_data,
10516 kind * seplen);
10517 res_data += kind * seplen;
10518 }
10519
10520 itemlen = PyUnicode_GET_LENGTH(item);
10521 if (itemlen != 0) {
10522 memcpy(res_data,
10523 PyUnicode_DATA(item),
10524 kind * itemlen);
10525 res_data += kind * itemlen;
10526 }
10527 }
10528 assert(res_data == PyUnicode_1BYTE_DATA(res)
10529 + kind * PyUnicode_GET_LENGTH(res));
10530 }
10531 else {
10532 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10533 Py_ssize_t itemlen;
10534 item = items[i];
10535
10536 /* Copy item, and maybe the separator. */
10537 if (i && seplen != 0) {
10538 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10539 res_offset += seplen;
10540 }
10541
10542 itemlen = PyUnicode_GET_LENGTH(item);
10543 if (itemlen != 0) {
10544 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10545 res_offset += itemlen;
10546 }
10547 }
10548 assert(res_offset == PyUnicode_GET_LENGTH(res));
10549 }
10550
10551 Py_XDECREF(sep);
10552 assert(_PyUnicode_CheckConsistency(res, 1));
10553 return res;
10554
10555 onError:
10556 Py_XDECREF(sep);
10557 Py_XDECREF(res);
10558 return NULL;
10559 }
10560
10561 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10562 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10563 Py_UCS4 fill_char)
10564 {
10565 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10566 void *data = PyUnicode_DATA(unicode);
10567 assert(PyUnicode_IS_READY(unicode));
10568 assert(unicode_modifiable(unicode));
10569 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10570 assert(start >= 0);
10571 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10572 unicode_fill(kind, data, fill_char, start, length);
10573 }
10574
10575 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10576 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10577 Py_UCS4 fill_char)
10578 {
10579 Py_ssize_t maxlen;
10580
10581 if (!PyUnicode_Check(unicode)) {
10582 PyErr_BadInternalCall();
10583 return -1;
10584 }
10585 if (PyUnicode_READY(unicode) == -1)
10586 return -1;
10587 if (unicode_check_modifiable(unicode))
10588 return -1;
10589
10590 if (start < 0) {
10591 PyErr_SetString(PyExc_IndexError, "string index out of range");
10592 return -1;
10593 }
10594 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10595 PyErr_SetString(PyExc_ValueError,
10596 "fill character is bigger than "
10597 "the string maximum character");
10598 return -1;
10599 }
10600
10601 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10602 length = Py_MIN(maxlen, length);
10603 if (length <= 0)
10604 return 0;
10605
10606 _PyUnicode_FastFill(unicode, start, length, fill_char);
10607 return length;
10608 }
10609
10610 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10611 pad(PyObject *self,
10612 Py_ssize_t left,
10613 Py_ssize_t right,
10614 Py_UCS4 fill)
10615 {
10616 PyObject *u;
10617 Py_UCS4 maxchar;
10618 int kind;
10619 void *data;
10620
10621 if (left < 0)
10622 left = 0;
10623 if (right < 0)
10624 right = 0;
10625
10626 if (left == 0 && right == 0)
10627 return unicode_result_unchanged(self);
10628
10629 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10630 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10631 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10632 return NULL;
10633 }
10634 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10635 maxchar = Py_MAX(maxchar, fill);
10636 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10637 if (!u)
10638 return NULL;
10639
10640 kind = PyUnicode_KIND(u);
10641 data = PyUnicode_DATA(u);
10642 if (left)
10643 unicode_fill(kind, data, fill, 0, left);
10644 if (right)
10645 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10646 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10647 assert(_PyUnicode_CheckConsistency(u, 1));
10648 return u;
10649 }
10650
10651 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10652 PyUnicode_Splitlines(PyObject *string, int keepends)
10653 {
10654 PyObject *list;
10655
10656 if (ensure_unicode(string) < 0)
10657 return NULL;
10658
10659 switch (PyUnicode_KIND(string)) {
10660 case PyUnicode_1BYTE_KIND:
10661 if (PyUnicode_IS_ASCII(string))
10662 list = asciilib_splitlines(
10663 string, PyUnicode_1BYTE_DATA(string),
10664 PyUnicode_GET_LENGTH(string), keepends);
10665 else
10666 list = ucs1lib_splitlines(
10667 string, PyUnicode_1BYTE_DATA(string),
10668 PyUnicode_GET_LENGTH(string), keepends);
10669 break;
10670 case PyUnicode_2BYTE_KIND:
10671 list = ucs2lib_splitlines(
10672 string, PyUnicode_2BYTE_DATA(string),
10673 PyUnicode_GET_LENGTH(string), keepends);
10674 break;
10675 case PyUnicode_4BYTE_KIND:
10676 list = ucs4lib_splitlines(
10677 string, PyUnicode_4BYTE_DATA(string),
10678 PyUnicode_GET_LENGTH(string), keepends);
10679 break;
10680 default:
10681 Py_UNREACHABLE();
10682 }
10683 return list;
10684 }
10685
10686 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10687 split(PyObject *self,
10688 PyObject *substring,
10689 Py_ssize_t maxcount)
10690 {
10691 int kind1, kind2;
10692 const void *buf1, *buf2;
10693 Py_ssize_t len1, len2;
10694 PyObject* out;
10695
10696 if (maxcount < 0)
10697 maxcount = PY_SSIZE_T_MAX;
10698
10699 if (PyUnicode_READY(self) == -1)
10700 return NULL;
10701
10702 if (substring == NULL)
10703 switch (PyUnicode_KIND(self)) {
10704 case PyUnicode_1BYTE_KIND:
10705 if (PyUnicode_IS_ASCII(self))
10706 return asciilib_split_whitespace(
10707 self, PyUnicode_1BYTE_DATA(self),
10708 PyUnicode_GET_LENGTH(self), maxcount
10709 );
10710 else
10711 return ucs1lib_split_whitespace(
10712 self, PyUnicode_1BYTE_DATA(self),
10713 PyUnicode_GET_LENGTH(self), maxcount
10714 );
10715 case PyUnicode_2BYTE_KIND:
10716 return ucs2lib_split_whitespace(
10717 self, PyUnicode_2BYTE_DATA(self),
10718 PyUnicode_GET_LENGTH(self), maxcount
10719 );
10720 case PyUnicode_4BYTE_KIND:
10721 return ucs4lib_split_whitespace(
10722 self, PyUnicode_4BYTE_DATA(self),
10723 PyUnicode_GET_LENGTH(self), maxcount
10724 );
10725 default:
10726 Py_UNREACHABLE();
10727 }
10728
10729 if (PyUnicode_READY(substring) == -1)
10730 return NULL;
10731
10732 kind1 = PyUnicode_KIND(self);
10733 kind2 = PyUnicode_KIND(substring);
10734 len1 = PyUnicode_GET_LENGTH(self);
10735 len2 = PyUnicode_GET_LENGTH(substring);
10736 if (kind1 < kind2 || len1 < len2) {
10737 out = PyList_New(1);
10738 if (out == NULL)
10739 return NULL;
10740 Py_INCREF(self);
10741 PyList_SET_ITEM(out, 0, self);
10742 return out;
10743 }
10744 buf1 = PyUnicode_DATA(self);
10745 buf2 = PyUnicode_DATA(substring);
10746 if (kind2 != kind1) {
10747 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10748 if (!buf2)
10749 return NULL;
10750 }
10751
10752 switch (kind1) {
10753 case PyUnicode_1BYTE_KIND:
10754 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10755 out = asciilib_split(
10756 self, buf1, len1, buf2, len2, maxcount);
10757 else
10758 out = ucs1lib_split(
10759 self, buf1, len1, buf2, len2, maxcount);
10760 break;
10761 case PyUnicode_2BYTE_KIND:
10762 out = ucs2lib_split(
10763 self, buf1, len1, buf2, len2, maxcount);
10764 break;
10765 case PyUnicode_4BYTE_KIND:
10766 out = ucs4lib_split(
10767 self, buf1, len1, buf2, len2, maxcount);
10768 break;
10769 default:
10770 out = NULL;
10771 }
10772 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10773 if (kind2 != kind1)
10774 PyMem_Free((void *)buf2);
10775 return out;
10776 }
10777
10778 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10779 rsplit(PyObject *self,
10780 PyObject *substring,
10781 Py_ssize_t maxcount)
10782 {
10783 int kind1, kind2;
10784 const void *buf1, *buf2;
10785 Py_ssize_t len1, len2;
10786 PyObject* out;
10787
10788 if (maxcount < 0)
10789 maxcount = PY_SSIZE_T_MAX;
10790
10791 if (PyUnicode_READY(self) == -1)
10792 return NULL;
10793
10794 if (substring == NULL)
10795 switch (PyUnicode_KIND(self)) {
10796 case PyUnicode_1BYTE_KIND:
10797 if (PyUnicode_IS_ASCII(self))
10798 return asciilib_rsplit_whitespace(
10799 self, PyUnicode_1BYTE_DATA(self),
10800 PyUnicode_GET_LENGTH(self), maxcount
10801 );
10802 else
10803 return ucs1lib_rsplit_whitespace(
10804 self, PyUnicode_1BYTE_DATA(self),
10805 PyUnicode_GET_LENGTH(self), maxcount
10806 );
10807 case PyUnicode_2BYTE_KIND:
10808 return ucs2lib_rsplit_whitespace(
10809 self, PyUnicode_2BYTE_DATA(self),
10810 PyUnicode_GET_LENGTH(self), maxcount
10811 );
10812 case PyUnicode_4BYTE_KIND:
10813 return ucs4lib_rsplit_whitespace(
10814 self, PyUnicode_4BYTE_DATA(self),
10815 PyUnicode_GET_LENGTH(self), maxcount
10816 );
10817 default:
10818 Py_UNREACHABLE();
10819 }
10820
10821 if (PyUnicode_READY(substring) == -1)
10822 return NULL;
10823
10824 kind1 = PyUnicode_KIND(self);
10825 kind2 = PyUnicode_KIND(substring);
10826 len1 = PyUnicode_GET_LENGTH(self);
10827 len2 = PyUnicode_GET_LENGTH(substring);
10828 if (kind1 < kind2 || len1 < len2) {
10829 out = PyList_New(1);
10830 if (out == NULL)
10831 return NULL;
10832 Py_INCREF(self);
10833 PyList_SET_ITEM(out, 0, self);
10834 return out;
10835 }
10836 buf1 = PyUnicode_DATA(self);
10837 buf2 = PyUnicode_DATA(substring);
10838 if (kind2 != kind1) {
10839 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10840 if (!buf2)
10841 return NULL;
10842 }
10843
10844 switch (kind1) {
10845 case PyUnicode_1BYTE_KIND:
10846 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10847 out = asciilib_rsplit(
10848 self, buf1, len1, buf2, len2, maxcount);
10849 else
10850 out = ucs1lib_rsplit(
10851 self, buf1, len1, buf2, len2, maxcount);
10852 break;
10853 case PyUnicode_2BYTE_KIND:
10854 out = ucs2lib_rsplit(
10855 self, buf1, len1, buf2, len2, maxcount);
10856 break;
10857 case PyUnicode_4BYTE_KIND:
10858 out = ucs4lib_rsplit(
10859 self, buf1, len1, buf2, len2, maxcount);
10860 break;
10861 default:
10862 out = NULL;
10863 }
10864 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10865 if (kind2 != kind1)
10866 PyMem_Free((void *)buf2);
10867 return out;
10868 }
10869
10870 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10871 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10872 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10873 {
10874 switch (kind) {
10875 case PyUnicode_1BYTE_KIND:
10876 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10877 return asciilib_find(buf1, len1, buf2, len2, offset);
10878 else
10879 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10880 case PyUnicode_2BYTE_KIND:
10881 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10882 case PyUnicode_4BYTE_KIND:
10883 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10884 }
10885 Py_UNREACHABLE();
10886 }
10887
10888 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10889 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10890 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10891 {
10892 switch (kind) {
10893 case PyUnicode_1BYTE_KIND:
10894 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10895 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10896 else
10897 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10898 case PyUnicode_2BYTE_KIND:
10899 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10900 case PyUnicode_4BYTE_KIND:
10901 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10902 }
10903 Py_UNREACHABLE();
10904 }
10905
10906 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10907 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10908 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10909 {
10910 int kind = PyUnicode_KIND(u);
10911 void *data = PyUnicode_DATA(u);
10912 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10913 if (kind == PyUnicode_1BYTE_KIND) {
10914 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10915 (Py_UCS1 *)data + len,
10916 u1, u2, maxcount);
10917 }
10918 else if (kind == PyUnicode_2BYTE_KIND) {
10919 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10920 (Py_UCS2 *)data + len,
10921 u1, u2, maxcount);
10922 }
10923 else {
10924 assert(kind == PyUnicode_4BYTE_KIND);
10925 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10926 (Py_UCS4 *)data + len,
10927 u1, u2, maxcount);
10928 }
10929 }
10930
10931 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10932 replace(PyObject *self, PyObject *str1,
10933 PyObject *str2, Py_ssize_t maxcount)
10934 {
10935 PyObject *u;
10936 const char *sbuf = PyUnicode_DATA(self);
10937 const void *buf1 = PyUnicode_DATA(str1);
10938 const void *buf2 = PyUnicode_DATA(str2);
10939 int srelease = 0, release1 = 0, release2 = 0;
10940 int skind = PyUnicode_KIND(self);
10941 int kind1 = PyUnicode_KIND(str1);
10942 int kind2 = PyUnicode_KIND(str2);
10943 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10944 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10945 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10946 int mayshrink;
10947 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10948
10949 if (slen < len1)
10950 goto nothing;
10951
10952 if (maxcount < 0)
10953 maxcount = PY_SSIZE_T_MAX;
10954 else if (maxcount == 0)
10955 goto nothing;
10956
10957 if (str1 == str2)
10958 goto nothing;
10959
10960 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10961 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10962 if (maxchar < maxchar_str1)
10963 /* substring too wide to be present */
10964 goto nothing;
10965 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10966 /* Replacing str1 with str2 may cause a maxchar reduction in the
10967 result string. */
10968 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10969 maxchar = Py_MAX(maxchar, maxchar_str2);
10970
10971 if (len1 == len2) {
10972 /* same length */
10973 if (len1 == 0)
10974 goto nothing;
10975 if (len1 == 1) {
10976 /* replace characters */
10977 Py_UCS4 u1, u2;
10978 Py_ssize_t pos;
10979
10980 u1 = PyUnicode_READ(kind1, buf1, 0);
10981 pos = findchar(sbuf, skind, slen, u1, 1);
10982 if (pos < 0)
10983 goto nothing;
10984 u2 = PyUnicode_READ(kind2, buf2, 0);
10985 u = PyUnicode_New(slen, maxchar);
10986 if (!u)
10987 goto error;
10988
10989 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10990 replace_1char_inplace(u, pos, u1, u2, maxcount);
10991 }
10992 else {
10993 int rkind = skind;
10994 char *res;
10995 Py_ssize_t i;
10996
10997 if (kind1 < rkind) {
10998 /* widen substring */
10999 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11000 if (!buf1) goto error;
11001 release1 = 1;
11002 }
11003 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
11004 if (i < 0)
11005 goto nothing;
11006 if (rkind > kind2) {
11007 /* widen replacement */
11008 buf2 = unicode_askind(kind2, buf2, len2, rkind);
11009 if (!buf2) goto error;
11010 release2 = 1;
11011 }
11012 else if (rkind < kind2) {
11013 /* widen self and buf1 */
11014 rkind = kind2;
11015 if (release1) {
11016 assert(buf1 != PyUnicode_DATA(str1));
11017 PyMem_Free((void *)buf1);
11018 buf1 = PyUnicode_DATA(str1);
11019 release1 = 0;
11020 }
11021 sbuf = unicode_askind(skind, sbuf, slen, rkind);
11022 if (!sbuf) goto error;
11023 srelease = 1;
11024 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11025 if (!buf1) goto error;
11026 release1 = 1;
11027 }
11028 u = PyUnicode_New(slen, maxchar);
11029 if (!u)
11030 goto error;
11031 assert(PyUnicode_KIND(u) == rkind);
11032 res = PyUnicode_DATA(u);
11033
11034 memcpy(res, sbuf, rkind * slen);
11035 /* change everything in-place, starting with this one */
11036 memcpy(res + rkind * i,
11037 buf2,
11038 rkind * len2);
11039 i += len1;
11040
11041 while ( --maxcount > 0) {
11042 i = anylib_find(rkind, self,
11043 sbuf+rkind*i, slen-i,
11044 str1, buf1, len1, i);
11045 if (i == -1)
11046 break;
11047 memcpy(res + rkind * i,
11048 buf2,
11049 rkind * len2);
11050 i += len1;
11051 }
11052 }
11053 }
11054 else {
11055 Py_ssize_t n, i, j, ires;
11056 Py_ssize_t new_size;
11057 int rkind = skind;
11058 char *res;
11059
11060 if (kind1 < rkind) {
11061 /* widen substring */
11062 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11063 if (!buf1) goto error;
11064 release1 = 1;
11065 }
11066 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
11067 if (n == 0)
11068 goto nothing;
11069 if (kind2 < rkind) {
11070 /* widen replacement */
11071 buf2 = unicode_askind(kind2, buf2, len2, rkind);
11072 if (!buf2) goto error;
11073 release2 = 1;
11074 }
11075 else if (kind2 > rkind) {
11076 /* widen self and buf1 */
11077 rkind = kind2;
11078 sbuf = unicode_askind(skind, sbuf, slen, rkind);
11079 if (!sbuf) goto error;
11080 srelease = 1;
11081 if (release1) {
11082 assert(buf1 != PyUnicode_DATA(str1));
11083 PyMem_Free((void *)buf1);
11084 buf1 = PyUnicode_DATA(str1);
11085 release1 = 0;
11086 }
11087 buf1 = unicode_askind(kind1, buf1, len1, rkind);
11088 if (!buf1) goto error;
11089 release1 = 1;
11090 }
11091 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
11092 PyUnicode_GET_LENGTH(str1)); */
11093 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
11094 PyErr_SetString(PyExc_OverflowError,
11095 "replace string is too long");
11096 goto error;
11097 }
11098 new_size = slen + n * (len2 - len1);
11099 if (new_size == 0) {
11100 u = unicode_new_empty();
11101 goto done;
11102 }
11103 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
11104 PyErr_SetString(PyExc_OverflowError,
11105 "replace string is too long");
11106 goto error;
11107 }
11108 u = PyUnicode_New(new_size, maxchar);
11109 if (!u)
11110 goto error;
11111 assert(PyUnicode_KIND(u) == rkind);
11112 res = PyUnicode_DATA(u);
11113 ires = i = 0;
11114 if (len1 > 0) {
11115 while (n-- > 0) {
11116 /* look for next match */
11117 j = anylib_find(rkind, self,
11118 sbuf + rkind * i, slen-i,
11119 str1, buf1, len1, i);
11120 if (j == -1)
11121 break;
11122 else if (j > i) {
11123 /* copy unchanged part [i:j] */
11124 memcpy(res + rkind * ires,
11125 sbuf + rkind * i,
11126 rkind * (j-i));
11127 ires += j - i;
11128 }
11129 /* copy substitution string */
11130 if (len2 > 0) {
11131 memcpy(res + rkind * ires,
11132 buf2,
11133 rkind * len2);
11134 ires += len2;
11135 }
11136 i = j + len1;
11137 }
11138 if (i < slen)
11139 /* copy tail [i:] */
11140 memcpy(res + rkind * ires,
11141 sbuf + rkind * i,
11142 rkind * (slen-i));
11143 }
11144 else {
11145 /* interleave */
11146 while (n > 0) {
11147 memcpy(res + rkind * ires,
11148 buf2,
11149 rkind * len2);
11150 ires += len2;
11151 if (--n <= 0)
11152 break;
11153 memcpy(res + rkind * ires,
11154 sbuf + rkind * i,
11155 rkind);
11156 ires++;
11157 i++;
11158 }
11159 memcpy(res + rkind * ires,
11160 sbuf + rkind * i,
11161 rkind * (slen-i));
11162 }
11163 }
11164
11165 if (mayshrink) {
11166 unicode_adjust_maxchar(&u);
11167 if (u == NULL)
11168 goto error;
11169 }
11170
11171 done:
11172 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11173 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11174 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11175 if (srelease)
11176 PyMem_Free((void *)sbuf);
11177 if (release1)
11178 PyMem_Free((void *)buf1);
11179 if (release2)
11180 PyMem_Free((void *)buf2);
11181 assert(_PyUnicode_CheckConsistency(u, 1));
11182 return u;
11183
11184 nothing:
11185 /* nothing to replace; return original string (when possible) */
11186 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11187 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11188 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11189 if (srelease)
11190 PyMem_Free((void *)sbuf);
11191 if (release1)
11192 PyMem_Free((void *)buf1);
11193 if (release2)
11194 PyMem_Free((void *)buf2);
11195 return unicode_result_unchanged(self);
11196
11197 error:
11198 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11199 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11200 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11201 if (srelease)
11202 PyMem_Free((void *)sbuf);
11203 if (release1)
11204 PyMem_Free((void *)buf1);
11205 if (release2)
11206 PyMem_Free((void *)buf2);
11207 return NULL;
11208 }
11209
11210 /* --- Unicode Object Methods --------------------------------------------- */
11211
11212 /*[clinic input]
11213 str.title as unicode_title
11214
11215 Return a version of the string where each word is titlecased.
11216
11217 More specifically, words start with uppercased characters and all remaining
11218 cased characters have lower case.
11219 [clinic start generated code]*/
11220
11221 static PyObject *
unicode_title_impl(PyObject * self)11222 unicode_title_impl(PyObject *self)
11223 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
11224 {
11225 if (PyUnicode_READY(self) == -1)
11226 return NULL;
11227 return case_operation(self, do_title);
11228 }
11229
11230 /*[clinic input]
11231 str.capitalize as unicode_capitalize
11232
11233 Return a capitalized version of the string.
11234
11235 More specifically, make the first character have upper case and the rest lower
11236 case.
11237 [clinic start generated code]*/
11238
11239 static PyObject *
unicode_capitalize_impl(PyObject * self)11240 unicode_capitalize_impl(PyObject *self)
11241 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
11242 {
11243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 if (PyUnicode_GET_LENGTH(self) == 0)
11246 return unicode_result_unchanged(self);
11247 return case_operation(self, do_capitalize);
11248 }
11249
11250 /*[clinic input]
11251 str.casefold as unicode_casefold
11252
11253 Return a version of the string suitable for caseless comparisons.
11254 [clinic start generated code]*/
11255
11256 static PyObject *
unicode_casefold_impl(PyObject * self)11257 unicode_casefold_impl(PyObject *self)
11258 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
11259 {
11260 if (PyUnicode_READY(self) == -1)
11261 return NULL;
11262 if (PyUnicode_IS_ASCII(self))
11263 return ascii_upper_or_lower(self, 1);
11264 return case_operation(self, do_casefold);
11265 }
11266
11267
11268 /* Argument converter. Accepts a single Unicode character. */
11269
11270 static int
convert_uc(PyObject * obj,void * addr)11271 convert_uc(PyObject *obj, void *addr)
11272 {
11273 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11274
11275 if (!PyUnicode_Check(obj)) {
11276 PyErr_Format(PyExc_TypeError,
11277 "The fill character must be a unicode character, "
11278 "not %.100s", Py_TYPE(obj)->tp_name);
11279 return 0;
11280 }
11281 if (PyUnicode_READY(obj) < 0)
11282 return 0;
11283 if (PyUnicode_GET_LENGTH(obj) != 1) {
11284 PyErr_SetString(PyExc_TypeError,
11285 "The fill character must be exactly one character long");
11286 return 0;
11287 }
11288 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11289 return 1;
11290 }
11291
11292 /*[clinic input]
11293 str.center as unicode_center
11294
11295 width: Py_ssize_t
11296 fillchar: Py_UCS4 = ' '
11297 /
11298
11299 Return a centered string of length width.
11300
11301 Padding is done using the specified fill character (default is a space).
11302 [clinic start generated code]*/
11303
11304 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)11305 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11306 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11307 {
11308 Py_ssize_t marg, left;
11309
11310 if (PyUnicode_READY(self) == -1)
11311 return NULL;
11312
11313 if (PyUnicode_GET_LENGTH(self) >= width)
11314 return unicode_result_unchanged(self);
11315
11316 marg = width - PyUnicode_GET_LENGTH(self);
11317 left = marg / 2 + (marg & width & 1);
11318
11319 return pad(self, left, marg - left, fillchar);
11320 }
11321
11322 /* This function assumes that str1 and str2 are readied by the caller. */
11323
11324 static int
unicode_compare(PyObject * str1,PyObject * str2)11325 unicode_compare(PyObject *str1, PyObject *str2)
11326 {
11327 #define COMPARE(TYPE1, TYPE2) \
11328 do { \
11329 TYPE1* p1 = (TYPE1 *)data1; \
11330 TYPE2* p2 = (TYPE2 *)data2; \
11331 TYPE1* end = p1 + len; \
11332 Py_UCS4 c1, c2; \
11333 for (; p1 != end; p1++, p2++) { \
11334 c1 = *p1; \
11335 c2 = *p2; \
11336 if (c1 != c2) \
11337 return (c1 < c2) ? -1 : 1; \
11338 } \
11339 } \
11340 while (0)
11341
11342 int kind1, kind2;
11343 const void *data1, *data2;
11344 Py_ssize_t len1, len2, len;
11345
11346 kind1 = PyUnicode_KIND(str1);
11347 kind2 = PyUnicode_KIND(str2);
11348 data1 = PyUnicode_DATA(str1);
11349 data2 = PyUnicode_DATA(str2);
11350 len1 = PyUnicode_GET_LENGTH(str1);
11351 len2 = PyUnicode_GET_LENGTH(str2);
11352 len = Py_MIN(len1, len2);
11353
11354 switch(kind1) {
11355 case PyUnicode_1BYTE_KIND:
11356 {
11357 switch(kind2) {
11358 case PyUnicode_1BYTE_KIND:
11359 {
11360 int cmp = memcmp(data1, data2, len);
11361 /* normalize result of memcmp() into the range [-1; 1] */
11362 if (cmp < 0)
11363 return -1;
11364 if (cmp > 0)
11365 return 1;
11366 break;
11367 }
11368 case PyUnicode_2BYTE_KIND:
11369 COMPARE(Py_UCS1, Py_UCS2);
11370 break;
11371 case PyUnicode_4BYTE_KIND:
11372 COMPARE(Py_UCS1, Py_UCS4);
11373 break;
11374 default:
11375 Py_UNREACHABLE();
11376 }
11377 break;
11378 }
11379 case PyUnicode_2BYTE_KIND:
11380 {
11381 switch(kind2) {
11382 case PyUnicode_1BYTE_KIND:
11383 COMPARE(Py_UCS2, Py_UCS1);
11384 break;
11385 case PyUnicode_2BYTE_KIND:
11386 {
11387 COMPARE(Py_UCS2, Py_UCS2);
11388 break;
11389 }
11390 case PyUnicode_4BYTE_KIND:
11391 COMPARE(Py_UCS2, Py_UCS4);
11392 break;
11393 default:
11394 Py_UNREACHABLE();
11395 }
11396 break;
11397 }
11398 case PyUnicode_4BYTE_KIND:
11399 {
11400 switch(kind2) {
11401 case PyUnicode_1BYTE_KIND:
11402 COMPARE(Py_UCS4, Py_UCS1);
11403 break;
11404 case PyUnicode_2BYTE_KIND:
11405 COMPARE(Py_UCS4, Py_UCS2);
11406 break;
11407 case PyUnicode_4BYTE_KIND:
11408 {
11409 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11410 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11411 /* normalize result of wmemcmp() into the range [-1; 1] */
11412 if (cmp < 0)
11413 return -1;
11414 if (cmp > 0)
11415 return 1;
11416 #else
11417 COMPARE(Py_UCS4, Py_UCS4);
11418 #endif
11419 break;
11420 }
11421 default:
11422 Py_UNREACHABLE();
11423 }
11424 break;
11425 }
11426 default:
11427 Py_UNREACHABLE();
11428 }
11429
11430 if (len1 == len2)
11431 return 0;
11432 if (len1 < len2)
11433 return -1;
11434 else
11435 return 1;
11436
11437 #undef COMPARE
11438 }
11439
11440 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11441 unicode_compare_eq(PyObject *str1, PyObject *str2)
11442 {
11443 int kind;
11444 const void *data1, *data2;
11445 Py_ssize_t len;
11446 int cmp;
11447
11448 len = PyUnicode_GET_LENGTH(str1);
11449 if (PyUnicode_GET_LENGTH(str2) != len)
11450 return 0;
11451 kind = PyUnicode_KIND(str1);
11452 if (PyUnicode_KIND(str2) != kind)
11453 return 0;
11454 data1 = PyUnicode_DATA(str1);
11455 data2 = PyUnicode_DATA(str2);
11456
11457 cmp = memcmp(data1, data2, len * kind);
11458 return (cmp == 0);
11459 }
11460
11461
11462 int
PyUnicode_Compare(PyObject * left,PyObject * right)11463 PyUnicode_Compare(PyObject *left, PyObject *right)
11464 {
11465 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11466 if (PyUnicode_READY(left) == -1 ||
11467 PyUnicode_READY(right) == -1)
11468 return -1;
11469
11470 /* a string is equal to itself */
11471 if (left == right)
11472 return 0;
11473
11474 return unicode_compare(left, right);
11475 }
11476 PyErr_Format(PyExc_TypeError,
11477 "Can't compare %.100s and %.100s",
11478 Py_TYPE(left)->tp_name,
11479 Py_TYPE(right)->tp_name);
11480 return -1;
11481 }
11482
11483 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11484 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11485 {
11486 Py_ssize_t i;
11487 int kind;
11488 Py_UCS4 chr;
11489 const unsigned char *ustr = (const unsigned char *)str;
11490
11491 assert(_PyUnicode_CHECK(uni));
11492 if (!PyUnicode_IS_READY(uni)) {
11493 const wchar_t *ws = _PyUnicode_WSTR(uni);
11494 /* Compare Unicode string and source character set string */
11495 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11496 if (chr != ustr[i])
11497 return (chr < ustr[i]) ? -1 : 1;
11498 }
11499 /* This check keeps Python strings that end in '\0' from comparing equal
11500 to C strings identical up to that point. */
11501 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11502 return 1; /* uni is longer */
11503 if (ustr[i])
11504 return -1; /* str is longer */
11505 return 0;
11506 }
11507 kind = PyUnicode_KIND(uni);
11508 if (kind == PyUnicode_1BYTE_KIND) {
11509 const void *data = PyUnicode_1BYTE_DATA(uni);
11510 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11511 size_t len, len2 = strlen(str);
11512 int cmp;
11513
11514 len = Py_MIN(len1, len2);
11515 cmp = memcmp(data, str, len);
11516 if (cmp != 0) {
11517 if (cmp < 0)
11518 return -1;
11519 else
11520 return 1;
11521 }
11522 if (len1 > len2)
11523 return 1; /* uni is longer */
11524 if (len1 < len2)
11525 return -1; /* str is longer */
11526 return 0;
11527 }
11528 else {
11529 const void *data = PyUnicode_DATA(uni);
11530 /* Compare Unicode string and source character set string */
11531 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11532 if (chr != (unsigned char)str[i])
11533 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11534 /* This check keeps Python strings that end in '\0' from comparing equal
11535 to C strings identical up to that point. */
11536 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11537 return 1; /* uni is longer */
11538 if (str[i])
11539 return -1; /* str is longer */
11540 return 0;
11541 }
11542 }
11543
11544 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11545 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11546 {
11547 size_t i, len;
11548 const wchar_t *p;
11549 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11550 if (strlen(str) != len)
11551 return 0;
11552 p = _PyUnicode_WSTR(unicode);
11553 assert(p);
11554 for (i = 0; i < len; i++) {
11555 unsigned char c = (unsigned char)str[i];
11556 if (c >= 128 || p[i] != (wchar_t)c)
11557 return 0;
11558 }
11559 return 1;
11560 }
11561
11562 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11563 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11564 {
11565 size_t len;
11566 assert(_PyUnicode_CHECK(unicode));
11567 assert(str);
11568 #ifndef NDEBUG
11569 for (const char *p = str; *p; p++) {
11570 assert((unsigned char)*p < 128);
11571 }
11572 #endif
11573 if (PyUnicode_READY(unicode) == -1) {
11574 /* Memory error or bad data */
11575 PyErr_Clear();
11576 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11577 }
11578 if (!PyUnicode_IS_ASCII(unicode))
11579 return 0;
11580 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11581 return strlen(str) == len &&
11582 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11583 }
11584
11585 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11586 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11587 {
11588 PyObject *right_uni;
11589
11590 assert(_PyUnicode_CHECK(left));
11591 assert(right->string);
11592 #ifndef NDEBUG
11593 for (const char *p = right->string; *p; p++) {
11594 assert((unsigned char)*p < 128);
11595 }
11596 #endif
11597
11598 if (PyUnicode_READY(left) == -1) {
11599 /* memory error or bad data */
11600 PyErr_Clear();
11601 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11602 }
11603
11604 if (!PyUnicode_IS_ASCII(left))
11605 return 0;
11606
11607 right_uni = _PyUnicode_FromId(right); /* borrowed */
11608 if (right_uni == NULL) {
11609 /* memory error or bad data */
11610 PyErr_Clear();
11611 return _PyUnicode_EqualToASCIIString(left, right->string);
11612 }
11613
11614 if (left == right_uni)
11615 return 1;
11616
11617 if (PyUnicode_CHECK_INTERNED(left))
11618 return 0;
11619
11620 #ifdef INTERNED_STRINGS
11621 assert(_PyUnicode_HASH(right_uni) != -1);
11622 Py_hash_t hash = _PyUnicode_HASH(left);
11623 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11624 return 0;
11625 }
11626 #endif
11627
11628 return unicode_compare_eq(left, right_uni);
11629 }
11630
11631 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11632 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11633 {
11634 int result;
11635
11636 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11637 Py_RETURN_NOTIMPLEMENTED;
11638
11639 if (PyUnicode_READY(left) == -1 ||
11640 PyUnicode_READY(right) == -1)
11641 return NULL;
11642
11643 if (left == right) {
11644 switch (op) {
11645 case Py_EQ:
11646 case Py_LE:
11647 case Py_GE:
11648 /* a string is equal to itself */
11649 Py_RETURN_TRUE;
11650 case Py_NE:
11651 case Py_LT:
11652 case Py_GT:
11653 Py_RETURN_FALSE;
11654 default:
11655 PyErr_BadArgument();
11656 return NULL;
11657 }
11658 }
11659 else if (op == Py_EQ || op == Py_NE) {
11660 result = unicode_compare_eq(left, right);
11661 result ^= (op == Py_NE);
11662 return PyBool_FromLong(result);
11663 }
11664 else {
11665 result = unicode_compare(left, right);
11666 Py_RETURN_RICHCOMPARE(result, 0, op);
11667 }
11668 }
11669
11670 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11671 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11672 {
11673 return unicode_eq(aa, bb);
11674 }
11675
11676 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11677 PyUnicode_Contains(PyObject *str, PyObject *substr)
11678 {
11679 int kind1, kind2;
11680 const void *buf1, *buf2;
11681 Py_ssize_t len1, len2;
11682 int result;
11683
11684 if (!PyUnicode_Check(substr)) {
11685 PyErr_Format(PyExc_TypeError,
11686 "'in <string>' requires string as left operand, not %.100s",
11687 Py_TYPE(substr)->tp_name);
11688 return -1;
11689 }
11690 if (PyUnicode_READY(substr) == -1)
11691 return -1;
11692 if (ensure_unicode(str) < 0)
11693 return -1;
11694
11695 kind1 = PyUnicode_KIND(str);
11696 kind2 = PyUnicode_KIND(substr);
11697 if (kind1 < kind2)
11698 return 0;
11699 len1 = PyUnicode_GET_LENGTH(str);
11700 len2 = PyUnicode_GET_LENGTH(substr);
11701 if (len1 < len2)
11702 return 0;
11703 buf1 = PyUnicode_DATA(str);
11704 buf2 = PyUnicode_DATA(substr);
11705 if (len2 == 1) {
11706 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11707 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11708 return result;
11709 }
11710 if (kind2 != kind1) {
11711 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11712 if (!buf2)
11713 return -1;
11714 }
11715
11716 switch (kind1) {
11717 case PyUnicode_1BYTE_KIND:
11718 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11719 break;
11720 case PyUnicode_2BYTE_KIND:
11721 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11722 break;
11723 case PyUnicode_4BYTE_KIND:
11724 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11725 break;
11726 default:
11727 Py_UNREACHABLE();
11728 }
11729
11730 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11731 if (kind2 != kind1)
11732 PyMem_Free((void *)buf2);
11733
11734 return result;
11735 }
11736
11737 /* Concat to string or Unicode object giving a new Unicode object. */
11738
11739 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11740 PyUnicode_Concat(PyObject *left, PyObject *right)
11741 {
11742 PyObject *result;
11743 Py_UCS4 maxchar, maxchar2;
11744 Py_ssize_t left_len, right_len, new_len;
11745
11746 if (ensure_unicode(left) < 0)
11747 return NULL;
11748
11749 if (!PyUnicode_Check(right)) {
11750 PyErr_Format(PyExc_TypeError,
11751 "can only concatenate str (not \"%.200s\") to str",
11752 Py_TYPE(right)->tp_name);
11753 return NULL;
11754 }
11755 if (PyUnicode_READY(right) < 0)
11756 return NULL;
11757
11758 /* Shortcuts */
11759 PyObject *empty = unicode_get_empty(); // Borrowed reference
11760 if (left == empty) {
11761 return PyUnicode_FromObject(right);
11762 }
11763 if (right == empty) {
11764 return PyUnicode_FromObject(left);
11765 }
11766
11767 left_len = PyUnicode_GET_LENGTH(left);
11768 right_len = PyUnicode_GET_LENGTH(right);
11769 if (left_len > PY_SSIZE_T_MAX - right_len) {
11770 PyErr_SetString(PyExc_OverflowError,
11771 "strings are too large to concat");
11772 return NULL;
11773 }
11774 new_len = left_len + right_len;
11775
11776 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11777 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11778 maxchar = Py_MAX(maxchar, maxchar2);
11779
11780 /* Concat the two Unicode strings */
11781 result = PyUnicode_New(new_len, maxchar);
11782 if (result == NULL)
11783 return NULL;
11784 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11785 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11786 assert(_PyUnicode_CheckConsistency(result, 1));
11787 return result;
11788 }
11789
11790 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11791 PyUnicode_Append(PyObject **p_left, PyObject *right)
11792 {
11793 PyObject *left, *res;
11794 Py_UCS4 maxchar, maxchar2;
11795 Py_ssize_t left_len, right_len, new_len;
11796
11797 if (p_left == NULL) {
11798 if (!PyErr_Occurred())
11799 PyErr_BadInternalCall();
11800 return;
11801 }
11802 left = *p_left;
11803 if (right == NULL || left == NULL
11804 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11805 if (!PyErr_Occurred())
11806 PyErr_BadInternalCall();
11807 goto error;
11808 }
11809
11810 if (PyUnicode_READY(left) == -1)
11811 goto error;
11812 if (PyUnicode_READY(right) == -1)
11813 goto error;
11814
11815 /* Shortcuts */
11816 PyObject *empty = unicode_get_empty(); // Borrowed reference
11817 if (left == empty) {
11818 Py_DECREF(left);
11819 Py_INCREF(right);
11820 *p_left = right;
11821 return;
11822 }
11823 if (right == empty) {
11824 return;
11825 }
11826
11827 left_len = PyUnicode_GET_LENGTH(left);
11828 right_len = PyUnicode_GET_LENGTH(right);
11829 if (left_len > PY_SSIZE_T_MAX - right_len) {
11830 PyErr_SetString(PyExc_OverflowError,
11831 "strings are too large to concat");
11832 goto error;
11833 }
11834 new_len = left_len + right_len;
11835
11836 if (unicode_modifiable(left)
11837 && PyUnicode_CheckExact(right)
11838 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11839 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11840 to change the structure size, but characters are stored just after
11841 the structure, and so it requires to move all characters which is
11842 not so different than duplicating the string. */
11843 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11844 {
11845 /* append inplace */
11846 if (unicode_resize(p_left, new_len) != 0)
11847 goto error;
11848
11849 /* copy 'right' into the newly allocated area of 'left' */
11850 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11851 }
11852 else {
11853 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11854 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11855 maxchar = Py_MAX(maxchar, maxchar2);
11856
11857 /* Concat the two Unicode strings */
11858 res = PyUnicode_New(new_len, maxchar);
11859 if (res == NULL)
11860 goto error;
11861 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11862 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11863 Py_DECREF(left);
11864 *p_left = res;
11865 }
11866 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11867 return;
11868
11869 error:
11870 Py_CLEAR(*p_left);
11871 }
11872
11873 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11874 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11875 {
11876 PyUnicode_Append(pleft, right);
11877 Py_XDECREF(right);
11878 }
11879
11880 /*
11881 Wraps stringlib_parse_args_finds() and additionally ensures that the
11882 first argument is a unicode object.
11883 */
11884
11885 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11886 parse_args_finds_unicode(const char * function_name, PyObject *args,
11887 PyObject **substring,
11888 Py_ssize_t *start, Py_ssize_t *end)
11889 {
11890 if(stringlib_parse_args_finds(function_name, args, substring,
11891 start, end)) {
11892 if (ensure_unicode(*substring) < 0)
11893 return 0;
11894 return 1;
11895 }
11896 return 0;
11897 }
11898
11899 PyDoc_STRVAR(count__doc__,
11900 "S.count(sub[, start[, end]]) -> int\n\
11901 \n\
11902 Return the number of non-overlapping occurrences of substring sub in\n\
11903 string S[start:end]. Optional arguments start and end are\n\
11904 interpreted as in slice notation.");
11905
11906 static PyObject *
unicode_count(PyObject * self,PyObject * args)11907 unicode_count(PyObject *self, PyObject *args)
11908 {
11909 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11910 Py_ssize_t start = 0;
11911 Py_ssize_t end = PY_SSIZE_T_MAX;
11912 PyObject *result;
11913 int kind1, kind2;
11914 const void *buf1, *buf2;
11915 Py_ssize_t len1, len2, iresult;
11916
11917 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11918 return NULL;
11919
11920 kind1 = PyUnicode_KIND(self);
11921 kind2 = PyUnicode_KIND(substring);
11922 if (kind1 < kind2)
11923 return PyLong_FromLong(0);
11924
11925 len1 = PyUnicode_GET_LENGTH(self);
11926 len2 = PyUnicode_GET_LENGTH(substring);
11927 ADJUST_INDICES(start, end, len1);
11928 if (end - start < len2)
11929 return PyLong_FromLong(0);
11930
11931 buf1 = PyUnicode_DATA(self);
11932 buf2 = PyUnicode_DATA(substring);
11933 if (kind2 != kind1) {
11934 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11935 if (!buf2)
11936 return NULL;
11937 }
11938 switch (kind1) {
11939 case PyUnicode_1BYTE_KIND:
11940 iresult = ucs1lib_count(
11941 ((const Py_UCS1*)buf1) + start, end - start,
11942 buf2, len2, PY_SSIZE_T_MAX
11943 );
11944 break;
11945 case PyUnicode_2BYTE_KIND:
11946 iresult = ucs2lib_count(
11947 ((const Py_UCS2*)buf1) + start, end - start,
11948 buf2, len2, PY_SSIZE_T_MAX
11949 );
11950 break;
11951 case PyUnicode_4BYTE_KIND:
11952 iresult = ucs4lib_count(
11953 ((const Py_UCS4*)buf1) + start, end - start,
11954 buf2, len2, PY_SSIZE_T_MAX
11955 );
11956 break;
11957 default:
11958 Py_UNREACHABLE();
11959 }
11960
11961 result = PyLong_FromSsize_t(iresult);
11962
11963 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11964 if (kind2 != kind1)
11965 PyMem_Free((void *)buf2);
11966
11967 return result;
11968 }
11969
11970 /*[clinic input]
11971 str.encode as unicode_encode
11972
11973 encoding: str(c_default="NULL") = 'utf-8'
11974 The encoding in which to encode the string.
11975 errors: str(c_default="NULL") = 'strict'
11976 The error handling scheme to use for encoding errors.
11977 The default is 'strict' meaning that encoding errors raise a
11978 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11979 'xmlcharrefreplace' as well as any other name registered with
11980 codecs.register_error that can handle UnicodeEncodeErrors.
11981
11982 Encode the string using the codec registered for encoding.
11983 [clinic start generated code]*/
11984
11985 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11986 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11987 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11988 {
11989 return PyUnicode_AsEncodedString(self, encoding, errors);
11990 }
11991
11992 /*[clinic input]
11993 str.expandtabs as unicode_expandtabs
11994
11995 tabsize: int = 8
11996
11997 Return a copy where all tab characters are expanded using spaces.
11998
11999 If tabsize is not given, a tab size of 8 characters is assumed.
12000 [clinic start generated code]*/
12001
12002 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)12003 unicode_expandtabs_impl(PyObject *self, int tabsize)
12004 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
12005 {
12006 Py_ssize_t i, j, line_pos, src_len, incr;
12007 Py_UCS4 ch;
12008 PyObject *u;
12009 const void *src_data;
12010 void *dest_data;
12011 int kind;
12012 int found;
12013
12014 if (PyUnicode_READY(self) == -1)
12015 return NULL;
12016
12017 /* First pass: determine size of output string */
12018 src_len = PyUnicode_GET_LENGTH(self);
12019 i = j = line_pos = 0;
12020 kind = PyUnicode_KIND(self);
12021 src_data = PyUnicode_DATA(self);
12022 found = 0;
12023 for (; i < src_len; i++) {
12024 ch = PyUnicode_READ(kind, src_data, i);
12025 if (ch == '\t') {
12026 found = 1;
12027 if (tabsize > 0) {
12028 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
12029 if (j > PY_SSIZE_T_MAX - incr)
12030 goto overflow;
12031 line_pos += incr;
12032 j += incr;
12033 }
12034 }
12035 else {
12036 if (j > PY_SSIZE_T_MAX - 1)
12037 goto overflow;
12038 line_pos++;
12039 j++;
12040 if (ch == '\n' || ch == '\r')
12041 line_pos = 0;
12042 }
12043 }
12044 if (!found)
12045 return unicode_result_unchanged(self);
12046
12047 /* Second pass: create output string and fill it */
12048 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
12049 if (!u)
12050 return NULL;
12051 dest_data = PyUnicode_DATA(u);
12052
12053 i = j = line_pos = 0;
12054
12055 for (; i < src_len; i++) {
12056 ch = PyUnicode_READ(kind, src_data, i);
12057 if (ch == '\t') {
12058 if (tabsize > 0) {
12059 incr = tabsize - (line_pos % tabsize);
12060 line_pos += incr;
12061 unicode_fill(kind, dest_data, ' ', j, incr);
12062 j += incr;
12063 }
12064 }
12065 else {
12066 line_pos++;
12067 PyUnicode_WRITE(kind, dest_data, j, ch);
12068 j++;
12069 if (ch == '\n' || ch == '\r')
12070 line_pos = 0;
12071 }
12072 }
12073 assert (j == PyUnicode_GET_LENGTH(u));
12074 return unicode_result(u);
12075
12076 overflow:
12077 PyErr_SetString(PyExc_OverflowError, "new string is too long");
12078 return NULL;
12079 }
12080
12081 PyDoc_STRVAR(find__doc__,
12082 "S.find(sub[, start[, end]]) -> int\n\
12083 \n\
12084 Return the lowest index in S where substring sub is found,\n\
12085 such that sub is contained within S[start:end]. Optional\n\
12086 arguments start and end are interpreted as in slice notation.\n\
12087 \n\
12088 Return -1 on failure.");
12089
12090 static PyObject *
unicode_find(PyObject * self,PyObject * args)12091 unicode_find(PyObject *self, PyObject *args)
12092 {
12093 /* initialize variables to prevent gcc warning */
12094 PyObject *substring = NULL;
12095 Py_ssize_t start = 0;
12096 Py_ssize_t end = 0;
12097 Py_ssize_t result;
12098
12099 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
12100 return NULL;
12101
12102 if (PyUnicode_READY(self) == -1)
12103 return NULL;
12104
12105 result = any_find_slice(self, substring, start, end, 1);
12106
12107 if (result == -2)
12108 return NULL;
12109
12110 return PyLong_FromSsize_t(result);
12111 }
12112
12113 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)12114 unicode_getitem(PyObject *self, Py_ssize_t index)
12115 {
12116 const void *data;
12117 enum PyUnicode_Kind kind;
12118 Py_UCS4 ch;
12119
12120 if (!PyUnicode_Check(self)) {
12121 PyErr_BadArgument();
12122 return NULL;
12123 }
12124 if (PyUnicode_READY(self) == -1) {
12125 return NULL;
12126 }
12127 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12128 PyErr_SetString(PyExc_IndexError, "string index out of range");
12129 return NULL;
12130 }
12131 kind = PyUnicode_KIND(self);
12132 data = PyUnicode_DATA(self);
12133 ch = PyUnicode_READ(kind, data, index);
12134 return unicode_char(ch);
12135 }
12136
12137 /* Believe it or not, this produces the same value for ASCII strings
12138 as bytes_hash(). */
12139 static Py_hash_t
unicode_hash(PyObject * self)12140 unicode_hash(PyObject *self)
12141 {
12142 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
12143
12144 #ifdef Py_DEBUG
12145 assert(_Py_HashSecret_Initialized);
12146 #endif
12147 if (_PyUnicode_HASH(self) != -1)
12148 return _PyUnicode_HASH(self);
12149 if (PyUnicode_READY(self) == -1)
12150 return -1;
12151
12152 x = _Py_HashBytes(PyUnicode_DATA(self),
12153 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
12154 _PyUnicode_HASH(self) = x;
12155 return x;
12156 }
12157
12158 PyDoc_STRVAR(index__doc__,
12159 "S.index(sub[, start[, end]]) -> int\n\
12160 \n\
12161 Return the lowest index in S where substring sub is found,\n\
12162 such that sub is contained within S[start:end]. Optional\n\
12163 arguments start and end are interpreted as in slice notation.\n\
12164 \n\
12165 Raises ValueError when the substring is not found.");
12166
12167 static PyObject *
unicode_index(PyObject * self,PyObject * args)12168 unicode_index(PyObject *self, PyObject *args)
12169 {
12170 /* initialize variables to prevent gcc warning */
12171 Py_ssize_t result;
12172 PyObject *substring = NULL;
12173 Py_ssize_t start = 0;
12174 Py_ssize_t end = 0;
12175
12176 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
12177 return NULL;
12178
12179 if (PyUnicode_READY(self) == -1)
12180 return NULL;
12181
12182 result = any_find_slice(self, substring, start, end, 1);
12183
12184 if (result == -2)
12185 return NULL;
12186
12187 if (result < 0) {
12188 PyErr_SetString(PyExc_ValueError, "substring not found");
12189 return NULL;
12190 }
12191
12192 return PyLong_FromSsize_t(result);
12193 }
12194
12195 /*[clinic input]
12196 str.isascii as unicode_isascii
12197
12198 Return True if all characters in the string are ASCII, False otherwise.
12199
12200 ASCII characters have code points in the range U+0000-U+007F.
12201 Empty string is ASCII too.
12202 [clinic start generated code]*/
12203
12204 static PyObject *
unicode_isascii_impl(PyObject * self)12205 unicode_isascii_impl(PyObject *self)
12206 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12207 {
12208 if (PyUnicode_READY(self) == -1) {
12209 return NULL;
12210 }
12211 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12212 }
12213
12214 /*[clinic input]
12215 str.islower as unicode_islower
12216
12217 Return True if the string is a lowercase string, False otherwise.
12218
12219 A string is lowercase if all cased characters in the string are lowercase and
12220 there is at least one cased character in the string.
12221 [clinic start generated code]*/
12222
12223 static PyObject *
unicode_islower_impl(PyObject * self)12224 unicode_islower_impl(PyObject *self)
12225 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
12226 {
12227 Py_ssize_t i, length;
12228 int kind;
12229 const void *data;
12230 int cased;
12231
12232 if (PyUnicode_READY(self) == -1)
12233 return NULL;
12234 length = PyUnicode_GET_LENGTH(self);
12235 kind = PyUnicode_KIND(self);
12236 data = PyUnicode_DATA(self);
12237
12238 /* Shortcut for single character strings */
12239 if (length == 1)
12240 return PyBool_FromLong(
12241 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
12242
12243 /* Special case for empty strings */
12244 if (length == 0)
12245 Py_RETURN_FALSE;
12246
12247 cased = 0;
12248 for (i = 0; i < length; i++) {
12249 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12250
12251 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
12252 Py_RETURN_FALSE;
12253 else if (!cased && Py_UNICODE_ISLOWER(ch))
12254 cased = 1;
12255 }
12256 return PyBool_FromLong(cased);
12257 }
12258
12259 /*[clinic input]
12260 str.isupper as unicode_isupper
12261
12262 Return True if the string is an uppercase string, False otherwise.
12263
12264 A string is uppercase if all cased characters in the string are uppercase and
12265 there is at least one cased character in the string.
12266 [clinic start generated code]*/
12267
12268 static PyObject *
unicode_isupper_impl(PyObject * self)12269 unicode_isupper_impl(PyObject *self)
12270 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
12271 {
12272 Py_ssize_t i, length;
12273 int kind;
12274 const void *data;
12275 int cased;
12276
12277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 length = PyUnicode_GET_LENGTH(self);
12280 kind = PyUnicode_KIND(self);
12281 data = PyUnicode_DATA(self);
12282
12283 /* Shortcut for single character strings */
12284 if (length == 1)
12285 return PyBool_FromLong(
12286 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
12287
12288 /* Special case for empty strings */
12289 if (length == 0)
12290 Py_RETURN_FALSE;
12291
12292 cased = 0;
12293 for (i = 0; i < length; i++) {
12294 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12295
12296 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
12297 Py_RETURN_FALSE;
12298 else if (!cased && Py_UNICODE_ISUPPER(ch))
12299 cased = 1;
12300 }
12301 return PyBool_FromLong(cased);
12302 }
12303
12304 /*[clinic input]
12305 str.istitle as unicode_istitle
12306
12307 Return True if the string is a title-cased string, False otherwise.
12308
12309 In a title-cased string, upper- and title-case characters may only
12310 follow uncased characters and lowercase characters only cased ones.
12311 [clinic start generated code]*/
12312
12313 static PyObject *
unicode_istitle_impl(PyObject * self)12314 unicode_istitle_impl(PyObject *self)
12315 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12316 {
12317 Py_ssize_t i, length;
12318 int kind;
12319 const void *data;
12320 int cased, previous_is_cased;
12321
12322 if (PyUnicode_READY(self) == -1)
12323 return NULL;
12324 length = PyUnicode_GET_LENGTH(self);
12325 kind = PyUnicode_KIND(self);
12326 data = PyUnicode_DATA(self);
12327
12328 /* Shortcut for single character strings */
12329 if (length == 1) {
12330 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12331 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12332 (Py_UNICODE_ISUPPER(ch) != 0));
12333 }
12334
12335 /* Special case for empty strings */
12336 if (length == 0)
12337 Py_RETURN_FALSE;
12338
12339 cased = 0;
12340 previous_is_cased = 0;
12341 for (i = 0; i < length; i++) {
12342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12343
12344 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12345 if (previous_is_cased)
12346 Py_RETURN_FALSE;
12347 previous_is_cased = 1;
12348 cased = 1;
12349 }
12350 else if (Py_UNICODE_ISLOWER(ch)) {
12351 if (!previous_is_cased)
12352 Py_RETURN_FALSE;
12353 previous_is_cased = 1;
12354 cased = 1;
12355 }
12356 else
12357 previous_is_cased = 0;
12358 }
12359 return PyBool_FromLong(cased);
12360 }
12361
12362 /*[clinic input]
12363 str.isspace as unicode_isspace
12364
12365 Return True if the string is a whitespace string, False otherwise.
12366
12367 A string is whitespace if all characters in the string are whitespace and there
12368 is at least one character in the string.
12369 [clinic start generated code]*/
12370
12371 static PyObject *
unicode_isspace_impl(PyObject * self)12372 unicode_isspace_impl(PyObject *self)
12373 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12374 {
12375 Py_ssize_t i, length;
12376 int kind;
12377 const void *data;
12378
12379 if (PyUnicode_READY(self) == -1)
12380 return NULL;
12381 length = PyUnicode_GET_LENGTH(self);
12382 kind = PyUnicode_KIND(self);
12383 data = PyUnicode_DATA(self);
12384
12385 /* Shortcut for single character strings */
12386 if (length == 1)
12387 return PyBool_FromLong(
12388 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12389
12390 /* Special case for empty strings */
12391 if (length == 0)
12392 Py_RETURN_FALSE;
12393
12394 for (i = 0; i < length; i++) {
12395 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12396 if (!Py_UNICODE_ISSPACE(ch))
12397 Py_RETURN_FALSE;
12398 }
12399 Py_RETURN_TRUE;
12400 }
12401
12402 /*[clinic input]
12403 str.isalpha as unicode_isalpha
12404
12405 Return True if the string is an alphabetic string, False otherwise.
12406
12407 A string is alphabetic if all characters in the string are alphabetic and there
12408 is at least one character in the string.
12409 [clinic start generated code]*/
12410
12411 static PyObject *
unicode_isalpha_impl(PyObject * self)12412 unicode_isalpha_impl(PyObject *self)
12413 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12414 {
12415 Py_ssize_t i, length;
12416 int kind;
12417 const void *data;
12418
12419 if (PyUnicode_READY(self) == -1)
12420 return NULL;
12421 length = PyUnicode_GET_LENGTH(self);
12422 kind = PyUnicode_KIND(self);
12423 data = PyUnicode_DATA(self);
12424
12425 /* Shortcut for single character strings */
12426 if (length == 1)
12427 return PyBool_FromLong(
12428 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12429
12430 /* Special case for empty strings */
12431 if (length == 0)
12432 Py_RETURN_FALSE;
12433
12434 for (i = 0; i < length; i++) {
12435 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12436 Py_RETURN_FALSE;
12437 }
12438 Py_RETURN_TRUE;
12439 }
12440
12441 /*[clinic input]
12442 str.isalnum as unicode_isalnum
12443
12444 Return True if the string is an alpha-numeric string, False otherwise.
12445
12446 A string is alpha-numeric if all characters in the string are alpha-numeric and
12447 there is at least one character in the string.
12448 [clinic start generated code]*/
12449
12450 static PyObject *
unicode_isalnum_impl(PyObject * self)12451 unicode_isalnum_impl(PyObject *self)
12452 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12453 {
12454 int kind;
12455 const void *data;
12456 Py_ssize_t len, i;
12457
12458 if (PyUnicode_READY(self) == -1)
12459 return NULL;
12460
12461 kind = PyUnicode_KIND(self);
12462 data = PyUnicode_DATA(self);
12463 len = PyUnicode_GET_LENGTH(self);
12464
12465 /* Shortcut for single character strings */
12466 if (len == 1) {
12467 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12468 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12469 }
12470
12471 /* Special case for empty strings */
12472 if (len == 0)
12473 Py_RETURN_FALSE;
12474
12475 for (i = 0; i < len; i++) {
12476 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12477 if (!Py_UNICODE_ISALNUM(ch))
12478 Py_RETURN_FALSE;
12479 }
12480 Py_RETURN_TRUE;
12481 }
12482
12483 /*[clinic input]
12484 str.isdecimal as unicode_isdecimal
12485
12486 Return True if the string is a decimal string, False otherwise.
12487
12488 A string is a decimal string if all characters in the string are decimal and
12489 there is at least one character in the string.
12490 [clinic start generated code]*/
12491
12492 static PyObject *
unicode_isdecimal_impl(PyObject * self)12493 unicode_isdecimal_impl(PyObject *self)
12494 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12495 {
12496 Py_ssize_t i, length;
12497 int kind;
12498 const void *data;
12499
12500 if (PyUnicode_READY(self) == -1)
12501 return NULL;
12502 length = PyUnicode_GET_LENGTH(self);
12503 kind = PyUnicode_KIND(self);
12504 data = PyUnicode_DATA(self);
12505
12506 /* Shortcut for single character strings */
12507 if (length == 1)
12508 return PyBool_FromLong(
12509 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12510
12511 /* Special case for empty strings */
12512 if (length == 0)
12513 Py_RETURN_FALSE;
12514
12515 for (i = 0; i < length; i++) {
12516 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12517 Py_RETURN_FALSE;
12518 }
12519 Py_RETURN_TRUE;
12520 }
12521
12522 /*[clinic input]
12523 str.isdigit as unicode_isdigit
12524
12525 Return True if the string is a digit string, False otherwise.
12526
12527 A string is a digit string if all characters in the string are digits and there
12528 is at least one character in the string.
12529 [clinic start generated code]*/
12530
12531 static PyObject *
unicode_isdigit_impl(PyObject * self)12532 unicode_isdigit_impl(PyObject *self)
12533 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12534 {
12535 Py_ssize_t i, length;
12536 int kind;
12537 const void *data;
12538
12539 if (PyUnicode_READY(self) == -1)
12540 return NULL;
12541 length = PyUnicode_GET_LENGTH(self);
12542 kind = PyUnicode_KIND(self);
12543 data = PyUnicode_DATA(self);
12544
12545 /* Shortcut for single character strings */
12546 if (length == 1) {
12547 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12548 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12549 }
12550
12551 /* Special case for empty strings */
12552 if (length == 0)
12553 Py_RETURN_FALSE;
12554
12555 for (i = 0; i < length; i++) {
12556 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12557 Py_RETURN_FALSE;
12558 }
12559 Py_RETURN_TRUE;
12560 }
12561
12562 /*[clinic input]
12563 str.isnumeric as unicode_isnumeric
12564
12565 Return True if the string is a numeric string, False otherwise.
12566
12567 A string is numeric if all characters in the string are numeric and there is at
12568 least one character in the string.
12569 [clinic start generated code]*/
12570
12571 static PyObject *
unicode_isnumeric_impl(PyObject * self)12572 unicode_isnumeric_impl(PyObject *self)
12573 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12574 {
12575 Py_ssize_t i, length;
12576 int kind;
12577 const void *data;
12578
12579 if (PyUnicode_READY(self) == -1)
12580 return NULL;
12581 length = PyUnicode_GET_LENGTH(self);
12582 kind = PyUnicode_KIND(self);
12583 data = PyUnicode_DATA(self);
12584
12585 /* Shortcut for single character strings */
12586 if (length == 1)
12587 return PyBool_FromLong(
12588 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12589
12590 /* Special case for empty strings */
12591 if (length == 0)
12592 Py_RETURN_FALSE;
12593
12594 for (i = 0; i < length; i++) {
12595 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12596 Py_RETURN_FALSE;
12597 }
12598 Py_RETURN_TRUE;
12599 }
12600
12601 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12602 _PyUnicode_ScanIdentifier(PyObject *self)
12603 {
12604 Py_ssize_t i;
12605 if (PyUnicode_READY(self) == -1)
12606 return -1;
12607
12608 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12609 if (len == 0) {
12610 /* an empty string is not a valid identifier */
12611 return 0;
12612 }
12613
12614 int kind = PyUnicode_KIND(self);
12615 const void *data = PyUnicode_DATA(self);
12616 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12617 /* PEP 3131 says that the first character must be in
12618 XID_Start and subsequent characters in XID_Continue,
12619 and for the ASCII range, the 2.x rules apply (i.e
12620 start with letters and underscore, continue with
12621 letters, digits, underscore). However, given the current
12622 definition of XID_Start and XID_Continue, it is sufficient
12623 to check just for these, except that _ must be allowed
12624 as starting an identifier. */
12625 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12626 return 0;
12627 }
12628
12629 for (i = 1; i < len; i++) {
12630 ch = PyUnicode_READ(kind, data, i);
12631 if (!_PyUnicode_IsXidContinue(ch)) {
12632 return i;
12633 }
12634 }
12635 return i;
12636 }
12637
12638 int
PyUnicode_IsIdentifier(PyObject * self)12639 PyUnicode_IsIdentifier(PyObject *self)
12640 {
12641 if (PyUnicode_IS_READY(self)) {
12642 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12643 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12644 /* an empty string is not a valid identifier */
12645 return len && i == len;
12646 }
12647 else {
12648 _Py_COMP_DIAG_PUSH
12649 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
12650 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
12651 if (len == 0) {
12652 /* an empty string is not a valid identifier */
12653 return 0;
12654 }
12655
12656 const wchar_t *wstr = _PyUnicode_WSTR(self);
12657 Py_UCS4 ch = wstr[i++];
12658 #if SIZEOF_WCHAR_T == 2
12659 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12660 && i < len
12661 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12662 {
12663 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12664 i++;
12665 }
12666 #endif
12667 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12668 return 0;
12669 }
12670
12671 while (i < len) {
12672 ch = wstr[i++];
12673 #if SIZEOF_WCHAR_T == 2
12674 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12675 && i < len
12676 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12677 {
12678 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12679 i++;
12680 }
12681 #endif
12682 if (!_PyUnicode_IsXidContinue(ch)) {
12683 return 0;
12684 }
12685 }
12686 return 1;
12687 _Py_COMP_DIAG_POP
12688 }
12689 }
12690
12691 /*[clinic input]
12692 str.isidentifier as unicode_isidentifier
12693
12694 Return True if the string is a valid Python identifier, False otherwise.
12695
12696 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12697 such as "def" or "class".
12698 [clinic start generated code]*/
12699
12700 static PyObject *
unicode_isidentifier_impl(PyObject * self)12701 unicode_isidentifier_impl(PyObject *self)
12702 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12703 {
12704 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12705 }
12706
12707 /*[clinic input]
12708 str.isprintable as unicode_isprintable
12709
12710 Return True if the string is printable, False otherwise.
12711
12712 A string is printable if all of its characters are considered printable in
12713 repr() or if it is empty.
12714 [clinic start generated code]*/
12715
12716 static PyObject *
unicode_isprintable_impl(PyObject * self)12717 unicode_isprintable_impl(PyObject *self)
12718 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12719 {
12720 Py_ssize_t i, length;
12721 int kind;
12722 const void *data;
12723
12724 if (PyUnicode_READY(self) == -1)
12725 return NULL;
12726 length = PyUnicode_GET_LENGTH(self);
12727 kind = PyUnicode_KIND(self);
12728 data = PyUnicode_DATA(self);
12729
12730 /* Shortcut for single character strings */
12731 if (length == 1)
12732 return PyBool_FromLong(
12733 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12734
12735 for (i = 0; i < length; i++) {
12736 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12737 Py_RETURN_FALSE;
12738 }
12739 }
12740 Py_RETURN_TRUE;
12741 }
12742
12743 /*[clinic input]
12744 str.join as unicode_join
12745
12746 iterable: object
12747 /
12748
12749 Concatenate any number of strings.
12750
12751 The string whose method is called is inserted in between each given string.
12752 The result is returned as a new string.
12753
12754 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12755 [clinic start generated code]*/
12756
12757 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12758 unicode_join(PyObject *self, PyObject *iterable)
12759 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12760 {
12761 return PyUnicode_Join(self, iterable);
12762 }
12763
12764 static Py_ssize_t
unicode_length(PyObject * self)12765 unicode_length(PyObject *self)
12766 {
12767 if (PyUnicode_READY(self) == -1)
12768 return -1;
12769 return PyUnicode_GET_LENGTH(self);
12770 }
12771
12772 /*[clinic input]
12773 str.ljust as unicode_ljust
12774
12775 width: Py_ssize_t
12776 fillchar: Py_UCS4 = ' '
12777 /
12778
12779 Return a left-justified string of length width.
12780
12781 Padding is done using the specified fill character (default is a space).
12782 [clinic start generated code]*/
12783
12784 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12785 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12786 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12787 {
12788 if (PyUnicode_READY(self) == -1)
12789 return NULL;
12790
12791 if (PyUnicode_GET_LENGTH(self) >= width)
12792 return unicode_result_unchanged(self);
12793
12794 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12795 }
12796
12797 /*[clinic input]
12798 str.lower as unicode_lower
12799
12800 Return a copy of the string converted to lowercase.
12801 [clinic start generated code]*/
12802
12803 static PyObject *
unicode_lower_impl(PyObject * self)12804 unicode_lower_impl(PyObject *self)
12805 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12806 {
12807 if (PyUnicode_READY(self) == -1)
12808 return NULL;
12809 if (PyUnicode_IS_ASCII(self))
12810 return ascii_upper_or_lower(self, 1);
12811 return case_operation(self, do_lower);
12812 }
12813
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip type, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP]  = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP]  = "strip",
};

/* Look up the method name for strip type `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
12822
12823 /* externally visible for str.strip(unicode) */
12824 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12825 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12826 {
12827 const void *data;
12828 int kind;
12829 Py_ssize_t i, j, len;
12830 BLOOM_MASK sepmask;
12831 Py_ssize_t seplen;
12832
12833 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12834 return NULL;
12835
12836 kind = PyUnicode_KIND(self);
12837 data = PyUnicode_DATA(self);
12838 len = PyUnicode_GET_LENGTH(self);
12839 seplen = PyUnicode_GET_LENGTH(sepobj);
12840 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12841 PyUnicode_DATA(sepobj),
12842 seplen);
12843
12844 i = 0;
12845 if (striptype != RIGHTSTRIP) {
12846 while (i < len) {
12847 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12848 if (!BLOOM(sepmask, ch))
12849 break;
12850 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12851 break;
12852 i++;
12853 }
12854 }
12855
12856 j = len;
12857 if (striptype != LEFTSTRIP) {
12858 j--;
12859 while (j >= i) {
12860 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12861 if (!BLOOM(sepmask, ch))
12862 break;
12863 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12864 break;
12865 j--;
12866 }
12867
12868 j++;
12869 }
12870
12871 return PyUnicode_Substring(self, i, j);
12872 }
12873
12874 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12875 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12876 {
12877 const unsigned char *data;
12878 int kind;
12879 Py_ssize_t length;
12880
12881 if (PyUnicode_READY(self) == -1)
12882 return NULL;
12883
12884 length = PyUnicode_GET_LENGTH(self);
12885 end = Py_MIN(end, length);
12886
12887 if (start == 0 && end == length)
12888 return unicode_result_unchanged(self);
12889
12890 if (start < 0 || end < 0) {
12891 PyErr_SetString(PyExc_IndexError, "string index out of range");
12892 return NULL;
12893 }
12894 if (start >= length || end < start)
12895 _Py_RETURN_UNICODE_EMPTY();
12896
12897 length = end - start;
12898 if (PyUnicode_IS_ASCII(self)) {
12899 data = PyUnicode_1BYTE_DATA(self);
12900 return _PyUnicode_FromASCII((const char*)(data + start), length);
12901 }
12902 else {
12903 kind = PyUnicode_KIND(self);
12904 data = PyUnicode_1BYTE_DATA(self);
12905 return PyUnicode_FromKindAndData(kind,
12906 data + kind * start,
12907 length);
12908 }
12909 }
12910
12911 static PyObject *
do_strip(PyObject * self,int striptype)12912 do_strip(PyObject *self, int striptype)
12913 {
12914 Py_ssize_t len, i, j;
12915
12916 if (PyUnicode_READY(self) == -1)
12917 return NULL;
12918
12919 len = PyUnicode_GET_LENGTH(self);
12920
12921 if (PyUnicode_IS_ASCII(self)) {
12922 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12923
12924 i = 0;
12925 if (striptype != RIGHTSTRIP) {
12926 while (i < len) {
12927 Py_UCS1 ch = data[i];
12928 if (!_Py_ascii_whitespace[ch])
12929 break;
12930 i++;
12931 }
12932 }
12933
12934 j = len;
12935 if (striptype != LEFTSTRIP) {
12936 j--;
12937 while (j >= i) {
12938 Py_UCS1 ch = data[j];
12939 if (!_Py_ascii_whitespace[ch])
12940 break;
12941 j--;
12942 }
12943 j++;
12944 }
12945 }
12946 else {
12947 int kind = PyUnicode_KIND(self);
12948 const void *data = PyUnicode_DATA(self);
12949
12950 i = 0;
12951 if (striptype != RIGHTSTRIP) {
12952 while (i < len) {
12953 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12954 if (!Py_UNICODE_ISSPACE(ch))
12955 break;
12956 i++;
12957 }
12958 }
12959
12960 j = len;
12961 if (striptype != LEFTSTRIP) {
12962 j--;
12963 while (j >= i) {
12964 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12965 if (!Py_UNICODE_ISSPACE(ch))
12966 break;
12967 j--;
12968 }
12969 j++;
12970 }
12971 }
12972
12973 return PyUnicode_Substring(self, i, j);
12974 }
12975
12976
12977 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12978 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12979 {
12980 if (sep != Py_None) {
12981 if (PyUnicode_Check(sep))
12982 return _PyUnicode_XStrip(self, striptype, sep);
12983 else {
12984 PyErr_Format(PyExc_TypeError,
12985 "%s arg must be None or str",
12986 STRIPNAME(striptype));
12987 return NULL;
12988 }
12989 }
12990
12991 return do_strip(self, striptype);
12992 }
12993
12994
12995 /*[clinic input]
12996 str.strip as unicode_strip
12997
12998 chars: object = None
12999 /
13000
13001 Return a copy of the string with leading and trailing whitespace removed.
13002
13003 If chars is given and not None, remove characters in chars instead.
13004 [clinic start generated code]*/
13005
13006 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)13007 unicode_strip_impl(PyObject *self, PyObject *chars)
13008 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
13009 {
13010 return do_argstrip(self, BOTHSTRIP, chars);
13011 }
13012
13013
13014 /*[clinic input]
13015 str.lstrip as unicode_lstrip
13016
13017 chars: object = None
13018 /
13019
13020 Return a copy of the string with leading whitespace removed.
13021
13022 If chars is given and not None, remove characters in chars instead.
13023 [clinic start generated code]*/
13024
13025 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)13026 unicode_lstrip_impl(PyObject *self, PyObject *chars)
13027 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
13028 {
13029 return do_argstrip(self, LEFTSTRIP, chars);
13030 }
13031
13032
13033 /*[clinic input]
13034 str.rstrip as unicode_rstrip
13035
13036 chars: object = None
13037 /
13038
13039 Return a copy of the string with trailing whitespace removed.
13040
13041 If chars is given and not None, remove characters in chars instead.
13042 [clinic start generated code]*/
13043
13044 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)13045 unicode_rstrip_impl(PyObject *self, PyObject *chars)
13046 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
13047 {
13048 return do_argstrip(self, RIGHTSTRIP, chars);
13049 }
13050
13051
13052 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)13053 unicode_repeat(PyObject *str, Py_ssize_t len)
13054 {
13055 PyObject *u;
13056 Py_ssize_t nchars, n;
13057
13058 if (len < 1)
13059 _Py_RETURN_UNICODE_EMPTY();
13060
13061 /* no repeat, return original string */
13062 if (len == 1)
13063 return unicode_result_unchanged(str);
13064
13065 if (PyUnicode_READY(str) == -1)
13066 return NULL;
13067
13068 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
13069 PyErr_SetString(PyExc_OverflowError,
13070 "repeated string is too long");
13071 return NULL;
13072 }
13073 nchars = len * PyUnicode_GET_LENGTH(str);
13074
13075 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
13076 if (!u)
13077 return NULL;
13078 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
13079
13080 if (PyUnicode_GET_LENGTH(str) == 1) {
13081 int kind = PyUnicode_KIND(str);
13082 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
13083 if (kind == PyUnicode_1BYTE_KIND) {
13084 void *to = PyUnicode_DATA(u);
13085 memset(to, (unsigned char)fill_char, len);
13086 }
13087 else if (kind == PyUnicode_2BYTE_KIND) {
13088 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
13089 for (n = 0; n < len; ++n)
13090 ucs2[n] = fill_char;
13091 } else {
13092 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13093 assert(kind == PyUnicode_4BYTE_KIND);
13094 for (n = 0; n < len; ++n)
13095 ucs4[n] = fill_char;
13096 }
13097 }
13098 else {
13099 /* number of characters copied this far */
13100 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
13101 Py_ssize_t char_size = PyUnicode_KIND(str);
13102 char *to = (char *) PyUnicode_DATA(u);
13103 memcpy(to, PyUnicode_DATA(str),
13104 PyUnicode_GET_LENGTH(str) * char_size);
13105 while (done < nchars) {
13106 n = (done <= nchars-done) ? done : nchars-done;
13107 memcpy(to + (done * char_size), to, n * char_size);
13108 done += n;
13109 }
13110 }
13111
13112 assert(_PyUnicode_CheckConsistency(u, 1));
13113 return u;
13114 }
13115
13116 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)13117 PyUnicode_Replace(PyObject *str,
13118 PyObject *substr,
13119 PyObject *replstr,
13120 Py_ssize_t maxcount)
13121 {
13122 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13123 ensure_unicode(replstr) < 0)
13124 return NULL;
13125 return replace(str, substr, replstr, maxcount);
13126 }
13127
13128 /*[clinic input]
13129 str.replace as unicode_replace
13130
13131 old: unicode
13132 new: unicode
13133 count: Py_ssize_t = -1
13134 Maximum number of occurrences to replace.
13135 -1 (the default value) means replace all occurrences.
13136 /
13137
13138 Return a copy with all occurrences of substring old replaced by new.
13139
13140 If the optional argument count is given, only the first count occurrences are
13141 replaced.
13142 [clinic start generated code]*/
13143
13144 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)13145 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13146 Py_ssize_t count)
13147 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
13148 {
13149 if (PyUnicode_READY(self) == -1)
13150 return NULL;
13151 return replace(self, old, new, count);
13152 }
13153
13154 /*[clinic input]
13155 str.removeprefix as unicode_removeprefix
13156
13157 prefix: unicode
13158 /
13159
13160 Return a str with the given prefix string removed if present.
13161
13162 If the string starts with the prefix string, return string[len(prefix):].
13163 Otherwise, return a copy of the original string.
13164 [clinic start generated code]*/
13165
13166 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)13167 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13168 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13169 {
13170 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13171 if (match == -1) {
13172 return NULL;
13173 }
13174 if (match) {
13175 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13176 PyUnicode_GET_LENGTH(self));
13177 }
13178 return unicode_result_unchanged(self);
13179 }
13180
13181 /*[clinic input]
13182 str.removesuffix as unicode_removesuffix
13183
13184 suffix: unicode
13185 /
13186
13187 Return a str with the given suffix string removed if present.
13188
13189 If the string ends with the suffix string and that suffix is not empty,
13190 return string[:-len(suffix)]. Otherwise, return a copy of the original
13191 string.
13192 [clinic start generated code]*/
13193
13194 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)13195 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13196 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13197 {
13198 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13199 if (match == -1) {
13200 return NULL;
13201 }
13202 if (match) {
13203 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13204 - PyUnicode_GET_LENGTH(suffix));
13205 }
13206 return unicode_result_unchanged(self);
13207 }
13208
13209 static PyObject *
unicode_repr(PyObject * unicode)13210 unicode_repr(PyObject *unicode)
13211 {
13212 PyObject *repr;
13213 Py_ssize_t isize;
13214 Py_ssize_t osize, squote, dquote, i, o;
13215 Py_UCS4 max, quote;
13216 int ikind, okind, unchanged;
13217 const void *idata;
13218 void *odata;
13219
13220 if (PyUnicode_READY(unicode) == -1)
13221 return NULL;
13222
13223 isize = PyUnicode_GET_LENGTH(unicode);
13224 idata = PyUnicode_DATA(unicode);
13225
13226 /* Compute length of output, quote characters, and
13227 maximum character */
13228 osize = 0;
13229 max = 127;
13230 squote = dquote = 0;
13231 ikind = PyUnicode_KIND(unicode);
13232 for (i = 0; i < isize; i++) {
13233 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13234 Py_ssize_t incr = 1;
13235 switch (ch) {
13236 case '\'': squote++; break;
13237 case '"': dquote++; break;
13238 case '\\': case '\t': case '\r': case '\n':
13239 incr = 2;
13240 break;
13241 default:
13242 /* Fast-path ASCII */
13243 if (ch < ' ' || ch == 0x7f)
13244 incr = 4; /* \xHH */
13245 else if (ch < 0x7f)
13246 ;
13247 else if (Py_UNICODE_ISPRINTABLE(ch))
13248 max = ch > max ? ch : max;
13249 else if (ch < 0x100)
13250 incr = 4; /* \xHH */
13251 else if (ch < 0x10000)
13252 incr = 6; /* \uHHHH */
13253 else
13254 incr = 10; /* \uHHHHHHHH */
13255 }
13256 if (osize > PY_SSIZE_T_MAX - incr) {
13257 PyErr_SetString(PyExc_OverflowError,
13258 "string is too long to generate repr");
13259 return NULL;
13260 }
13261 osize += incr;
13262 }
13263
13264 quote = '\'';
13265 unchanged = (osize == isize);
13266 if (squote) {
13267 unchanged = 0;
13268 if (dquote)
13269 /* Both squote and dquote present. Use squote,
13270 and escape them */
13271 osize += squote;
13272 else
13273 quote = '"';
13274 }
13275 osize += 2; /* quotes */
13276
13277 repr = PyUnicode_New(osize, max);
13278 if (repr == NULL)
13279 return NULL;
13280 okind = PyUnicode_KIND(repr);
13281 odata = PyUnicode_DATA(repr);
13282
13283 PyUnicode_WRITE(okind, odata, 0, quote);
13284 PyUnicode_WRITE(okind, odata, osize-1, quote);
13285 if (unchanged) {
13286 _PyUnicode_FastCopyCharacters(repr, 1,
13287 unicode, 0,
13288 isize);
13289 }
13290 else {
13291 for (i = 0, o = 1; i < isize; i++) {
13292 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
13293
13294 /* Escape quotes and backslashes */
13295 if ((ch == quote) || (ch == '\\')) {
13296 PyUnicode_WRITE(okind, odata, o++, '\\');
13297 PyUnicode_WRITE(okind, odata, o++, ch);
13298 continue;
13299 }
13300
13301 /* Map special whitespace to '\t', \n', '\r' */
13302 if (ch == '\t') {
13303 PyUnicode_WRITE(okind, odata, o++, '\\');
13304 PyUnicode_WRITE(okind, odata, o++, 't');
13305 }
13306 else if (ch == '\n') {
13307 PyUnicode_WRITE(okind, odata, o++, '\\');
13308 PyUnicode_WRITE(okind, odata, o++, 'n');
13309 }
13310 else if (ch == '\r') {
13311 PyUnicode_WRITE(okind, odata, o++, '\\');
13312 PyUnicode_WRITE(okind, odata, o++, 'r');
13313 }
13314
13315 /* Map non-printable US ASCII to '\xhh' */
13316 else if (ch < ' ' || ch == 0x7F) {
13317 PyUnicode_WRITE(okind, odata, o++, '\\');
13318 PyUnicode_WRITE(okind, odata, o++, 'x');
13319 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13320 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13321 }
13322
13323 /* Copy ASCII characters as-is */
13324 else if (ch < 0x7F) {
13325 PyUnicode_WRITE(okind, odata, o++, ch);
13326 }
13327
13328 /* Non-ASCII characters */
13329 else {
13330 /* Map Unicode whitespace and control characters
13331 (categories Z* and C* except ASCII space)
13332 */
13333 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13334 PyUnicode_WRITE(okind, odata, o++, '\\');
13335 /* Map 8-bit characters to '\xhh' */
13336 if (ch <= 0xff) {
13337 PyUnicode_WRITE(okind, odata, o++, 'x');
13338 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13339 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13340 }
13341 /* Map 16-bit characters to '\uxxxx' */
13342 else if (ch <= 0xffff) {
13343 PyUnicode_WRITE(okind, odata, o++, 'u');
13344 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13345 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13346 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13347 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13348 }
13349 /* Map 21-bit characters to '\U00xxxxxx' */
13350 else {
13351 PyUnicode_WRITE(okind, odata, o++, 'U');
13352 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13353 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13354 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13355 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13356 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13357 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13358 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13359 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13360 }
13361 }
13362 /* Copy characters as-is */
13363 else {
13364 PyUnicode_WRITE(okind, odata, o++, ch);
13365 }
13366 }
13367 }
13368 }
13369 /* Closing quote already added at the beginning */
13370 assert(_PyUnicode_CheckConsistency(repr, 1));
13371 return repr;
13372 }
13373
13374 PyDoc_STRVAR(rfind__doc__,
13375 "S.rfind(sub[, start[, end]]) -> int\n\
13376 \n\
13377 Return the highest index in S where substring sub is found,\n\
13378 such that sub is contained within S[start:end]. Optional\n\
13379 arguments start and end are interpreted as in slice notation.\n\
13380 \n\
13381 Return -1 on failure.");
13382
13383 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13384 unicode_rfind(PyObject *self, PyObject *args)
13385 {
13386 /* initialize variables to prevent gcc warning */
13387 PyObject *substring = NULL;
13388 Py_ssize_t start = 0;
13389 Py_ssize_t end = 0;
13390 Py_ssize_t result;
13391
13392 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13393 return NULL;
13394
13395 if (PyUnicode_READY(self) == -1)
13396 return NULL;
13397
13398 result = any_find_slice(self, substring, start, end, -1);
13399
13400 if (result == -2)
13401 return NULL;
13402
13403 return PyLong_FromSsize_t(result);
13404 }
13405
13406 PyDoc_STRVAR(rindex__doc__,
13407 "S.rindex(sub[, start[, end]]) -> int\n\
13408 \n\
13409 Return the highest index in S where substring sub is found,\n\
13410 such that sub is contained within S[start:end]. Optional\n\
13411 arguments start and end are interpreted as in slice notation.\n\
13412 \n\
13413 Raises ValueError when the substring is not found.");
13414
13415 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13416 unicode_rindex(PyObject *self, PyObject *args)
13417 {
13418 /* initialize variables to prevent gcc warning */
13419 PyObject *substring = NULL;
13420 Py_ssize_t start = 0;
13421 Py_ssize_t end = 0;
13422 Py_ssize_t result;
13423
13424 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13425 return NULL;
13426
13427 if (PyUnicode_READY(self) == -1)
13428 return NULL;
13429
13430 result = any_find_slice(self, substring, start, end, -1);
13431
13432 if (result == -2)
13433 return NULL;
13434
13435 if (result < 0) {
13436 PyErr_SetString(PyExc_ValueError, "substring not found");
13437 return NULL;
13438 }
13439
13440 return PyLong_FromSsize_t(result);
13441 }
13442
13443 /*[clinic input]
13444 str.rjust as unicode_rjust
13445
13446 width: Py_ssize_t
13447 fillchar: Py_UCS4 = ' '
13448 /
13449
13450 Return a right-justified string of length width.
13451
13452 Padding is done using the specified fill character (default is a space).
13453 [clinic start generated code]*/
13454
13455 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13456 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13457 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13458 {
13459 if (PyUnicode_READY(self) == -1)
13460 return NULL;
13461
13462 if (PyUnicode_GET_LENGTH(self) >= width)
13463 return unicode_result_unchanged(self);
13464
13465 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13466 }
13467
13468 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13469 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13470 {
13471 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13472 return NULL;
13473
13474 return split(s, sep, maxsplit);
13475 }
13476
13477 /*[clinic input]
13478 str.split as unicode_split
13479
13480 sep: object = None
13481 The delimiter according which to split the string.
13482 None (the default value) means split according to any whitespace,
13483 and discard empty strings from the result.
13484 maxsplit: Py_ssize_t = -1
13485 Maximum number of splits to do.
13486 -1 (the default value) means no limit.
13487
13488 Return a list of the words in the string, using sep as the delimiter string.
13489 [clinic start generated code]*/
13490
13491 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13492 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13493 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
13494 {
13495 if (sep == Py_None)
13496 return split(self, NULL, maxsplit);
13497 if (PyUnicode_Check(sep))
13498 return split(self, sep, maxsplit);
13499
13500 PyErr_Format(PyExc_TypeError,
13501 "must be str or None, not %.100s",
13502 Py_TYPE(sep)->tp_name);
13503 return NULL;
13504 }
13505
13506 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13507 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13508 {
13509 PyObject* out;
13510 int kind1, kind2;
13511 const void *buf1, *buf2;
13512 Py_ssize_t len1, len2;
13513
13514 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13515 return NULL;
13516
13517 kind1 = PyUnicode_KIND(str_obj);
13518 kind2 = PyUnicode_KIND(sep_obj);
13519 len1 = PyUnicode_GET_LENGTH(str_obj);
13520 len2 = PyUnicode_GET_LENGTH(sep_obj);
13521 if (kind1 < kind2 || len1 < len2) {
13522 PyObject *empty = unicode_get_empty(); // Borrowed reference
13523 return PyTuple_Pack(3, str_obj, empty, empty);
13524 }
13525 buf1 = PyUnicode_DATA(str_obj);
13526 buf2 = PyUnicode_DATA(sep_obj);
13527 if (kind2 != kind1) {
13528 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13529 if (!buf2)
13530 return NULL;
13531 }
13532
13533 switch (kind1) {
13534 case PyUnicode_1BYTE_KIND:
13535 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13536 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13537 else
13538 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13539 break;
13540 case PyUnicode_2BYTE_KIND:
13541 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13542 break;
13543 case PyUnicode_4BYTE_KIND:
13544 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13545 break;
13546 default:
13547 Py_UNREACHABLE();
13548 }
13549
13550 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13551 if (kind2 != kind1)
13552 PyMem_Free((void *)buf2);
13553
13554 return out;
13555 }
13556
13557
13558 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13559 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13560 {
13561 PyObject* out;
13562 int kind1, kind2;
13563 const void *buf1, *buf2;
13564 Py_ssize_t len1, len2;
13565
13566 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13567 return NULL;
13568
13569 kind1 = PyUnicode_KIND(str_obj);
13570 kind2 = PyUnicode_KIND(sep_obj);
13571 len1 = PyUnicode_GET_LENGTH(str_obj);
13572 len2 = PyUnicode_GET_LENGTH(sep_obj);
13573 if (kind1 < kind2 || len1 < len2) {
13574 PyObject *empty = unicode_get_empty(); // Borrowed reference
13575 return PyTuple_Pack(3, empty, empty, str_obj);
13576 }
13577 buf1 = PyUnicode_DATA(str_obj);
13578 buf2 = PyUnicode_DATA(sep_obj);
13579 if (kind2 != kind1) {
13580 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13581 if (!buf2)
13582 return NULL;
13583 }
13584
13585 switch (kind1) {
13586 case PyUnicode_1BYTE_KIND:
13587 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13588 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13589 else
13590 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13591 break;
13592 case PyUnicode_2BYTE_KIND:
13593 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13594 break;
13595 case PyUnicode_4BYTE_KIND:
13596 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13597 break;
13598 default:
13599 Py_UNREACHABLE();
13600 }
13601
13602 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13603 if (kind2 != kind1)
13604 PyMem_Free((void *)buf2);
13605
13606 return out;
13607 }
13608
13609 /*[clinic input]
13610 str.partition as unicode_partition
13611
13612 sep: object
13613 /
13614
13615 Partition the string into three parts using the given separator.
13616
13617 This will search for the separator in the string. If the separator is found,
13618 returns a 3-tuple containing the part before the separator, the separator
13619 itself, and the part after it.
13620
13621 If the separator is not found, returns a 3-tuple containing the original string
13622 and two empty strings.
13623 [clinic start generated code]*/
13624
13625 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13626 unicode_partition(PyObject *self, PyObject *sep)
13627 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13628 {
13629 return PyUnicode_Partition(self, sep);
13630 }
13631
13632 /*[clinic input]
13633 str.rpartition as unicode_rpartition = str.partition
13634
13635 Partition the string into three parts using the given separator.
13636
13637 This will search for the separator in the string, starting at the end. If
13638 the separator is found, returns a 3-tuple containing the part before the
13639 separator, the separator itself, and the part after it.
13640
13641 If the separator is not found, returns a 3-tuple containing two empty strings
13642 and the original string.
13643 [clinic start generated code]*/
13644
13645 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13646 unicode_rpartition(PyObject *self, PyObject *sep)
13647 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13648 {
13649 return PyUnicode_RPartition(self, sep);
13650 }
13651
13652 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13653 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13654 {
13655 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13656 return NULL;
13657
13658 return rsplit(s, sep, maxsplit);
13659 }
13660
13661 /*[clinic input]
13662 str.rsplit as unicode_rsplit = str.split
13663
13664 Return a list of the words in the string, using sep as the delimiter string.
13665
13666 Splits are done starting at the end of the string and working to the front.
13667 [clinic start generated code]*/
13668
13669 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13670 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13671 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13672 {
13673 if (sep == Py_None)
13674 return rsplit(self, NULL, maxsplit);
13675 if (PyUnicode_Check(sep))
13676 return rsplit(self, sep, maxsplit);
13677
13678 PyErr_Format(PyExc_TypeError,
13679 "must be str or None, not %.100s",
13680 Py_TYPE(sep)->tp_name);
13681 return NULL;
13682 }
13683
13684 /*[clinic input]
13685 str.splitlines as unicode_splitlines
13686
13687 keepends: bool(accept={int}) = False
13688
13689 Return a list of the lines in the string, breaking at line boundaries.
13690
13691 Line breaks are not included in the resulting list unless keepends is given and
13692 true.
13693 [clinic start generated code]*/
13694
13695 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13696 unicode_splitlines_impl(PyObject *self, int keepends)
13697 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13698 {
13699 return PyUnicode_Splitlines(self, keepends);
13700 }
13701
13702 static
unicode_str(PyObject * self)13703 PyObject *unicode_str(PyObject *self)
13704 {
13705 return unicode_result_unchanged(self);
13706 }
13707
13708 /*[clinic input]
13709 str.swapcase as unicode_swapcase
13710
13711 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13712 [clinic start generated code]*/
13713
13714 static PyObject *
unicode_swapcase_impl(PyObject * self)13715 unicode_swapcase_impl(PyObject *self)
13716 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13717 {
13718 if (PyUnicode_READY(self) == -1)
13719 return NULL;
13720 return case_operation(self, do_swapcase);
13721 }
13722
13723 /*[clinic input]
13724
13725 @staticmethod
13726 str.maketrans as unicode_maketrans
13727
13728 x: object
13729
13730 y: unicode=NULL
13731
13732 z: unicode=NULL
13733
13734 /
13735
13736 Return a translation table usable for str.translate().
13737
13738 If there is only one argument, it must be a dictionary mapping Unicode
13739 ordinals (integers) or characters to Unicode ordinals, strings or None.
13740 Character keys will be then converted to ordinals.
13741 If there are two arguments, they must be strings of equal length, and
13742 in the resulting dictionary, each character in x will be mapped to the
13743 character at the same position in y. If there is a third argument, it
13744 must be a string, whose characters will be mapped to None in the result.
13745 [clinic start generated code]*/
13746
13747 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13748 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13749 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13750 {
13751 PyObject *new = NULL, *key, *value;
13752 Py_ssize_t i = 0;
13753 int res;
13754
13755 new = PyDict_New();
13756 if (!new)
13757 return NULL;
13758 if (y != NULL) {
13759 int x_kind, y_kind, z_kind;
13760 const void *x_data, *y_data, *z_data;
13761
13762 /* x must be a string too, of equal length */
13763 if (!PyUnicode_Check(x)) {
13764 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13765 "be a string if there is a second argument");
13766 goto err;
13767 }
13768 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13769 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13770 "arguments must have equal length");
13771 goto err;
13772 }
13773 /* create entries for translating chars in x to those in y */
13774 x_kind = PyUnicode_KIND(x);
13775 y_kind = PyUnicode_KIND(y);
13776 x_data = PyUnicode_DATA(x);
13777 y_data = PyUnicode_DATA(y);
13778 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13779 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13780 if (!key)
13781 goto err;
13782 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13783 if (!value) {
13784 Py_DECREF(key);
13785 goto err;
13786 }
13787 res = PyDict_SetItem(new, key, value);
13788 Py_DECREF(key);
13789 Py_DECREF(value);
13790 if (res < 0)
13791 goto err;
13792 }
13793 /* create entries for deleting chars in z */
13794 if (z != NULL) {
13795 z_kind = PyUnicode_KIND(z);
13796 z_data = PyUnicode_DATA(z);
13797 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13798 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13799 if (!key)
13800 goto err;
13801 res = PyDict_SetItem(new, key, Py_None);
13802 Py_DECREF(key);
13803 if (res < 0)
13804 goto err;
13805 }
13806 }
13807 } else {
13808 int kind;
13809 const void *data;
13810
13811 /* x must be a dict */
13812 if (!PyDict_CheckExact(x)) {
13813 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13814 "to maketrans it must be a dict");
13815 goto err;
13816 }
13817 /* copy entries into the new dict, converting string keys to int keys */
13818 while (PyDict_Next(x, &i, &key, &value)) {
13819 if (PyUnicode_Check(key)) {
13820 /* convert string keys to integer keys */
13821 PyObject *newkey;
13822 if (PyUnicode_GET_LENGTH(key) != 1) {
13823 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13824 "table must be of length 1");
13825 goto err;
13826 }
13827 kind = PyUnicode_KIND(key);
13828 data = PyUnicode_DATA(key);
13829 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13830 if (!newkey)
13831 goto err;
13832 res = PyDict_SetItem(new, newkey, value);
13833 Py_DECREF(newkey);
13834 if (res < 0)
13835 goto err;
13836 } else if (PyLong_Check(key)) {
13837 /* just keep integer keys */
13838 if (PyDict_SetItem(new, key, value) < 0)
13839 goto err;
13840 } else {
13841 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13842 "be strings or integers");
13843 goto err;
13844 }
13845 }
13846 }
13847 return new;
13848 err:
13849 Py_DECREF(new);
13850 return NULL;
13851 }
13852
13853 /*[clinic input]
13854 str.translate as unicode_translate
13855
13856 table: object
13857 Translation table, which must be a mapping of Unicode ordinals to
13858 Unicode ordinals, strings, or None.
13859 /
13860
13861 Replace each character in the string using the given translation table.
13862
13863 The table must implement lookup/indexing via __getitem__, for instance a
13864 dictionary or list. If this operation raises LookupError, the character is
13865 left untouched. Characters mapped to None are deleted.
13866 [clinic start generated code]*/
13867
13868 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13869 unicode_translate(PyObject *self, PyObject *table)
13870 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13871 {
13872 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13873 }
13874
13875 /*[clinic input]
13876 str.upper as unicode_upper
13877
13878 Return a copy of the string converted to uppercase.
13879 [clinic start generated code]*/
13880
13881 static PyObject *
unicode_upper_impl(PyObject * self)13882 unicode_upper_impl(PyObject *self)
13883 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13884 {
13885 if (PyUnicode_READY(self) == -1)
13886 return NULL;
13887 if (PyUnicode_IS_ASCII(self))
13888 return ascii_upper_or_lower(self, 0);
13889 return case_operation(self, do_upper);
13890 }
13891
13892 /*[clinic input]
13893 str.zfill as unicode_zfill
13894
13895 width: Py_ssize_t
13896 /
13897
13898 Pad a numeric string with zeros on the left, to fill a field of the given width.
13899
13900 The string is never truncated.
13901 [clinic start generated code]*/
13902
13903 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13904 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13905 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13906 {
13907 Py_ssize_t fill;
13908 PyObject *u;
13909 int kind;
13910 const void *data;
13911 Py_UCS4 chr;
13912
13913 if (PyUnicode_READY(self) == -1)
13914 return NULL;
13915
13916 if (PyUnicode_GET_LENGTH(self) >= width)
13917 return unicode_result_unchanged(self);
13918
13919 fill = width - PyUnicode_GET_LENGTH(self);
13920
13921 u = pad(self, fill, 0, '0');
13922
13923 if (u == NULL)
13924 return NULL;
13925
13926 kind = PyUnicode_KIND(u);
13927 data = PyUnicode_DATA(u);
13928 chr = PyUnicode_READ(kind, data, fill);
13929
13930 if (chr == '+' || chr == '-') {
13931 /* move sign to beginning of string */
13932 PyUnicode_WRITE(kind, data, 0, chr);
13933 PyUnicode_WRITE(kind, data, fill, '0');
13934 }
13935
13936 assert(_PyUnicode_CheckConsistency(u, 1));
13937 return u;
13938 }
13939
/* Compiled out by the surrounding "#if 0" / "#endif" pair: this wrapper,
   which only forwards self to PyUnicode_TransformDecimalAndSpaceToASCII(),
   is not part of the build. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13947
13948 PyDoc_STRVAR(startswith__doc__,
13949 "S.startswith(prefix[, start[, end]]) -> bool\n\
13950 \n\
13951 Return True if S starts with the specified prefix, False otherwise.\n\
13952 With optional start, test S beginning at that position.\n\
13953 With optional end, stop comparing S at that position.\n\
13954 prefix can also be a tuple of strings to try.");
13955
13956 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13957 unicode_startswith(PyObject *self,
13958 PyObject *args)
13959 {
13960 PyObject *subobj;
13961 PyObject *substring;
13962 Py_ssize_t start = 0;
13963 Py_ssize_t end = PY_SSIZE_T_MAX;
13964 int result;
13965
13966 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13967 return NULL;
13968 if (PyTuple_Check(subobj)) {
13969 Py_ssize_t i;
13970 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13971 substring = PyTuple_GET_ITEM(subobj, i);
13972 if (!PyUnicode_Check(substring)) {
13973 PyErr_Format(PyExc_TypeError,
13974 "tuple for startswith must only contain str, "
13975 "not %.100s",
13976 Py_TYPE(substring)->tp_name);
13977 return NULL;
13978 }
13979 result = tailmatch(self, substring, start, end, -1);
13980 if (result == -1)
13981 return NULL;
13982 if (result) {
13983 Py_RETURN_TRUE;
13984 }
13985 }
13986 /* nothing matched */
13987 Py_RETURN_FALSE;
13988 }
13989 if (!PyUnicode_Check(subobj)) {
13990 PyErr_Format(PyExc_TypeError,
13991 "startswith first arg must be str or "
13992 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13993 return NULL;
13994 }
13995 result = tailmatch(self, subobj, start, end, -1);
13996 if (result == -1)
13997 return NULL;
13998 return PyBool_FromLong(result);
13999 }
14000
14001
14002 PyDoc_STRVAR(endswith__doc__,
14003 "S.endswith(suffix[, start[, end]]) -> bool\n\
14004 \n\
14005 Return True if S ends with the specified suffix, False otherwise.\n\
14006 With optional start, test S beginning at that position.\n\
14007 With optional end, stop comparing S at that position.\n\
14008 suffix can also be a tuple of strings to try.");
14009
14010 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)14011 unicode_endswith(PyObject *self,
14012 PyObject *args)
14013 {
14014 PyObject *subobj;
14015 PyObject *substring;
14016 Py_ssize_t start = 0;
14017 Py_ssize_t end = PY_SSIZE_T_MAX;
14018 int result;
14019
14020 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
14021 return NULL;
14022 if (PyTuple_Check(subobj)) {
14023 Py_ssize_t i;
14024 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
14025 substring = PyTuple_GET_ITEM(subobj, i);
14026 if (!PyUnicode_Check(substring)) {
14027 PyErr_Format(PyExc_TypeError,
14028 "tuple for endswith must only contain str, "
14029 "not %.100s",
14030 Py_TYPE(substring)->tp_name);
14031 return NULL;
14032 }
14033 result = tailmatch(self, substring, start, end, +1);
14034 if (result == -1)
14035 return NULL;
14036 if (result) {
14037 Py_RETURN_TRUE;
14038 }
14039 }
14040 Py_RETURN_FALSE;
14041 }
14042 if (!PyUnicode_Check(subobj)) {
14043 PyErr_Format(PyExc_TypeError,
14044 "endswith first arg must be str or "
14045 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
14046 return NULL;
14047 }
14048 result = tailmatch(self, subobj, start, end, +1);
14049 if (result == -1)
14050 return NULL;
14051 return PyBool_FromLong(result);
14052 }
14053
14054 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)14055 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
14056 {
14057 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14058 writer->data = PyUnicode_DATA(writer->buffer);
14059
14060 if (!writer->readonly) {
14061 writer->kind = PyUnicode_KIND(writer->buffer);
14062 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
14063 }
14064 else {
14065 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14066 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14067 writer->kind = PyUnicode_WCHAR_KIND;
14068 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14069
14070 /* Copy-on-write mode: set buffer size to 0 so
14071 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14072 * next write. */
14073 writer->size = 0;
14074 }
14075 }
14076
14077 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)14078 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
14079 {
14080 memset(writer, 0, sizeof(*writer));
14081
14082 /* ASCII is the bare minimum */
14083 writer->min_char = 127;
14084
14085 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14086 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14087 writer->kind = PyUnicode_WCHAR_KIND;
14088 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14089 }
14090
14091 // Initialize _PyUnicodeWriter with initial buffer
14092 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)14093 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14094 {
14095 memset(writer, 0, sizeof(*writer));
14096 writer->buffer = buffer;
14097 _PyUnicodeWriter_Update(writer);
14098 writer->min_length = writer->size;
14099 }
14100
14101 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)14102 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14103 Py_ssize_t length, Py_UCS4 maxchar)
14104 {
14105 Py_ssize_t newlen;
14106 PyObject *newbuffer;
14107
14108 assert(maxchar <= MAX_UNICODE);
14109
14110 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
14111 assert((maxchar > writer->maxchar && length >= 0)
14112 || length > 0);
14113
14114 if (length > PY_SSIZE_T_MAX - writer->pos) {
14115 PyErr_NoMemory();
14116 return -1;
14117 }
14118 newlen = writer->pos + length;
14119
14120 maxchar = Py_MAX(maxchar, writer->min_char);
14121
14122 if (writer->buffer == NULL) {
14123 assert(!writer->readonly);
14124 if (writer->overallocate
14125 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14126 /* overallocate to limit the number of realloc() */
14127 newlen += newlen / OVERALLOCATE_FACTOR;
14128 }
14129 if (newlen < writer->min_length)
14130 newlen = writer->min_length;
14131
14132 writer->buffer = PyUnicode_New(newlen, maxchar);
14133 if (writer->buffer == NULL)
14134 return -1;
14135 }
14136 else if (newlen > writer->size) {
14137 if (writer->overallocate
14138 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14139 /* overallocate to limit the number of realloc() */
14140 newlen += newlen / OVERALLOCATE_FACTOR;
14141 }
14142 if (newlen < writer->min_length)
14143 newlen = writer->min_length;
14144
14145 if (maxchar > writer->maxchar || writer->readonly) {
14146 /* resize + widen */
14147 maxchar = Py_MAX(maxchar, writer->maxchar);
14148 newbuffer = PyUnicode_New(newlen, maxchar);
14149 if (newbuffer == NULL)
14150 return -1;
14151 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14152 writer->buffer, 0, writer->pos);
14153 Py_DECREF(writer->buffer);
14154 writer->readonly = 0;
14155 }
14156 else {
14157 newbuffer = resize_compact(writer->buffer, newlen);
14158 if (newbuffer == NULL)
14159 return -1;
14160 }
14161 writer->buffer = newbuffer;
14162 }
14163 else if (maxchar > writer->maxchar) {
14164 assert(!writer->readonly);
14165 newbuffer = PyUnicode_New(writer->size, maxchar);
14166 if (newbuffer == NULL)
14167 return -1;
14168 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14169 writer->buffer, 0, writer->pos);
14170 Py_SETREF(writer->buffer, newbuffer);
14171 }
14172 _PyUnicodeWriter_Update(writer);
14173 return 0;
14174
14175 #undef OVERALLOCATE_FACTOR
14176 }
14177
14178 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)14179 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14180 enum PyUnicode_Kind kind)
14181 {
14182 Py_UCS4 maxchar;
14183
14184 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14185 assert(writer->kind < kind);
14186
14187 switch (kind)
14188 {
14189 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14190 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14191 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
14192 default:
14193 Py_UNREACHABLE();
14194 }
14195
14196 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14197 }
14198
14199 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)14200 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
14201 {
14202 assert(ch <= MAX_UNICODE);
14203 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14204 return -1;
14205 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14206 writer->pos++;
14207 return 0;
14208 }
14209
14210 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)14211 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14212 {
14213 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14214 }
14215
14216 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)14217 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14218 {
14219 Py_UCS4 maxchar;
14220 Py_ssize_t len;
14221
14222 if (PyUnicode_READY(str) == -1)
14223 return -1;
14224 len = PyUnicode_GET_LENGTH(str);
14225 if (len == 0)
14226 return 0;
14227 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14228 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
14229 if (writer->buffer == NULL && !writer->overallocate) {
14230 assert(_PyUnicode_CheckConsistency(str, 1));
14231 writer->readonly = 1;
14232 Py_INCREF(str);
14233 writer->buffer = str;
14234 _PyUnicodeWriter_Update(writer);
14235 writer->pos += len;
14236 return 0;
14237 }
14238 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14239 return -1;
14240 }
14241 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14242 str, 0, len);
14243 writer->pos += len;
14244 return 0;
14245 }
14246
14247 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)14248 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14249 Py_ssize_t start, Py_ssize_t end)
14250 {
14251 Py_UCS4 maxchar;
14252 Py_ssize_t len;
14253
14254 if (PyUnicode_READY(str) == -1)
14255 return -1;
14256
14257 assert(0 <= start);
14258 assert(end <= PyUnicode_GET_LENGTH(str));
14259 assert(start <= end);
14260
14261 if (end == 0)
14262 return 0;
14263
14264 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14265 return _PyUnicodeWriter_WriteStr(writer, str);
14266
14267 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14268 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14269 else
14270 maxchar = writer->maxchar;
14271 len = end - start;
14272
14273 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14274 return -1;
14275
14276 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14277 str, start, len);
14278 writer->pos += len;
14279 return 0;
14280 }
14281
14282 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)14283 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14284 const char *ascii, Py_ssize_t len)
14285 {
14286 if (len == -1)
14287 len = strlen(ascii);
14288
14289 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14290
14291 if (writer->buffer == NULL && !writer->overallocate) {
14292 PyObject *str;
14293
14294 str = _PyUnicode_FromASCII(ascii, len);
14295 if (str == NULL)
14296 return -1;
14297
14298 writer->readonly = 1;
14299 writer->buffer = str;
14300 _PyUnicodeWriter_Update(writer);
14301 writer->pos += len;
14302 return 0;
14303 }
14304
14305 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14306 return -1;
14307
14308 switch (writer->kind)
14309 {
14310 case PyUnicode_1BYTE_KIND:
14311 {
14312 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14313 Py_UCS1 *data = writer->data;
14314
14315 memcpy(data + writer->pos, str, len);
14316 break;
14317 }
14318 case PyUnicode_2BYTE_KIND:
14319 {
14320 _PyUnicode_CONVERT_BYTES(
14321 Py_UCS1, Py_UCS2,
14322 ascii, ascii + len,
14323 (Py_UCS2 *)writer->data + writer->pos);
14324 break;
14325 }
14326 case PyUnicode_4BYTE_KIND:
14327 {
14328 _PyUnicode_CONVERT_BYTES(
14329 Py_UCS1, Py_UCS4,
14330 ascii, ascii + len,
14331 (Py_UCS4 *)writer->data + writer->pos);
14332 break;
14333 }
14334 default:
14335 Py_UNREACHABLE();
14336 }
14337
14338 writer->pos += len;
14339 return 0;
14340 }
14341
14342 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14343 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14344 const char *str, Py_ssize_t len)
14345 {
14346 Py_UCS4 maxchar;
14347
14348 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14349 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14350 return -1;
14351 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14352 writer->pos += len;
14353 return 0;
14354 }
14355
14356 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14357 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14358 {
14359 PyObject *str;
14360
14361 if (writer->pos == 0) {
14362 Py_CLEAR(writer->buffer);
14363 _Py_RETURN_UNICODE_EMPTY();
14364 }
14365
14366 str = writer->buffer;
14367 writer->buffer = NULL;
14368
14369 if (writer->readonly) {
14370 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14371 return str;
14372 }
14373
14374 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14375 PyObject *str2;
14376 str2 = resize_compact(str, writer->pos);
14377 if (str2 == NULL) {
14378 Py_DECREF(str);
14379 return NULL;
14380 }
14381 str = str2;
14382 }
14383
14384 assert(_PyUnicode_CheckConsistency(str, 1));
14385 return unicode_result_ready(str);
14386 }
14387
14388 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14389 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14390 {
14391 Py_CLEAR(writer->buffer);
14392 }
14393
14394 #include "stringlib/unicode_format.h"
14395
14396 PyDoc_STRVAR(format__doc__,
14397 "S.format(*args, **kwargs) -> str\n\
14398 \n\
14399 Return a formatted version of S, using substitutions from args and kwargs.\n\
14400 The substitutions are identified by braces ('{' and '}').");
14401
14402 PyDoc_STRVAR(format_map__doc__,
14403 "S.format_map(mapping) -> str\n\
14404 \n\
14405 Return a formatted version of S, using substitutions from mapping.\n\
14406 The substitutions are identified by braces ('{' and '}').");
14407
14408 /*[clinic input]
14409 str.__format__ as unicode___format__
14410
14411 format_spec: unicode
14412 /
14413
14414 Return a formatted version of the string as described by format_spec.
14415 [clinic start generated code]*/
14416
14417 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14418 unicode___format___impl(PyObject *self, PyObject *format_spec)
14419 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14420 {
14421 _PyUnicodeWriter writer;
14422 int ret;
14423
14424 if (PyUnicode_READY(self) == -1)
14425 return NULL;
14426 _PyUnicodeWriter_Init(&writer);
14427 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14428 self, format_spec, 0,
14429 PyUnicode_GET_LENGTH(format_spec));
14430 if (ret == -1) {
14431 _PyUnicodeWriter_Dealloc(&writer);
14432 return NULL;
14433 }
14434 return _PyUnicodeWriter_Finish(&writer);
14435 }
14436
14437 /*[clinic input]
14438 str.__sizeof__ as unicode_sizeof
14439
14440 Return the size of the string in memory, in bytes.
14441 [clinic start generated code]*/
14442
14443 static PyObject *
unicode_sizeof_impl(PyObject * self)14444 unicode_sizeof_impl(PyObject *self)
14445 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14446 {
14447 Py_ssize_t size;
14448
14449 /* If it's a compact object, account for base structure +
14450 character data. */
14451 if (PyUnicode_IS_COMPACT_ASCII(self))
14452 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14453 else if (PyUnicode_IS_COMPACT(self))
14454 size = sizeof(PyCompactUnicodeObject) +
14455 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14456 else {
14457 /* If it is a two-block object, account for base object, and
14458 for character block if present. */
14459 size = sizeof(PyUnicodeObject);
14460 if (_PyUnicode_DATA_ANY(self))
14461 size += (PyUnicode_GET_LENGTH(self) + 1) *
14462 PyUnicode_KIND(self);
14463 }
14464 /* If the wstr pointer is present, account for it unless it is shared
14465 with the data pointer. Check if the data is not shared. */
14466 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14467 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14468 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14469 size += PyUnicode_UTF8_LENGTH(self) + 1;
14470
14471 return PyLong_FromSsize_t(size);
14472 }
14473
14474 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14475 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14476 {
14477 PyObject *copy = _PyUnicode_Copy(v);
14478 if (!copy)
14479 return NULL;
14480 return Py_BuildValue("(N)", copy);
14481 }
14482
14483 static PyMethodDef unicode_methods[] = {
14484 UNICODE_ENCODE_METHODDEF
14485 UNICODE_REPLACE_METHODDEF
14486 UNICODE_SPLIT_METHODDEF
14487 UNICODE_RSPLIT_METHODDEF
14488 UNICODE_JOIN_METHODDEF
14489 UNICODE_CAPITALIZE_METHODDEF
14490 UNICODE_CASEFOLD_METHODDEF
14491 UNICODE_TITLE_METHODDEF
14492 UNICODE_CENTER_METHODDEF
14493 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
14494 UNICODE_EXPANDTABS_METHODDEF
14495 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
14496 UNICODE_PARTITION_METHODDEF
14497 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
14498 UNICODE_LJUST_METHODDEF
14499 UNICODE_LOWER_METHODDEF
14500 UNICODE_LSTRIP_METHODDEF
14501 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14502 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
14503 UNICODE_RJUST_METHODDEF
14504 UNICODE_RSTRIP_METHODDEF
14505 UNICODE_RPARTITION_METHODDEF
14506 UNICODE_SPLITLINES_METHODDEF
14507 UNICODE_STRIP_METHODDEF
14508 UNICODE_SWAPCASE_METHODDEF
14509 UNICODE_TRANSLATE_METHODDEF
14510 UNICODE_UPPER_METHODDEF
14511 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14512 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
14513 UNICODE_REMOVEPREFIX_METHODDEF
14514 UNICODE_REMOVESUFFIX_METHODDEF
14515 UNICODE_ISASCII_METHODDEF
14516 UNICODE_ISLOWER_METHODDEF
14517 UNICODE_ISUPPER_METHODDEF
14518 UNICODE_ISTITLE_METHODDEF
14519 UNICODE_ISSPACE_METHODDEF
14520 UNICODE_ISDECIMAL_METHODDEF
14521 UNICODE_ISDIGIT_METHODDEF
14522 UNICODE_ISNUMERIC_METHODDEF
14523 UNICODE_ISALPHA_METHODDEF
14524 UNICODE_ISALNUM_METHODDEF
14525 UNICODE_ISIDENTIFIER_METHODDEF
14526 UNICODE_ISPRINTABLE_METHODDEF
14527 UNICODE_ZFILL_METHODDEF
14528 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
14529 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
14530 UNICODE___FORMAT___METHODDEF
14531 UNICODE_MAKETRANS_METHODDEF
14532 UNICODE_SIZEOF_METHODDEF
14533 #if 0
14534 /* These methods are just used for debugging the implementation. */
14535 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
14536 #endif
14537
14538 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
14539 {NULL, NULL}
14540 };
14541
14542 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14543 unicode_mod(PyObject *v, PyObject *w)
14544 {
14545 if (!PyUnicode_Check(v))
14546 Py_RETURN_NOTIMPLEMENTED;
14547 return PyUnicode_Format(v, w);
14548 }
14549
14550 static PyNumberMethods unicode_as_number = {
14551 0, /*nb_add*/
14552 0, /*nb_subtract*/
14553 0, /*nb_multiply*/
14554 unicode_mod, /*nb_remainder*/
14555 };
14556
14557 static PySequenceMethods unicode_as_sequence = {
14558 (lenfunc) unicode_length, /* sq_length */
14559 PyUnicode_Concat, /* sq_concat */
14560 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14561 (ssizeargfunc) unicode_getitem, /* sq_item */
14562 0, /* sq_slice */
14563 0, /* sq_ass_item */
14564 0, /* sq_ass_slice */
14565 PyUnicode_Contains, /* sq_contains */
14566 };
14567
14568 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14569 unicode_subscript(PyObject* self, PyObject* item)
14570 {
14571 if (PyUnicode_READY(self) == -1)
14572 return NULL;
14573
14574 if (_PyIndex_Check(item)) {
14575 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14576 if (i == -1 && PyErr_Occurred())
14577 return NULL;
14578 if (i < 0)
14579 i += PyUnicode_GET_LENGTH(self);
14580 return unicode_getitem(self, i);
14581 } else if (PySlice_Check(item)) {
14582 Py_ssize_t start, stop, step, slicelength, i;
14583 size_t cur;
14584 PyObject *result;
14585 const void *src_data;
14586 void *dest_data;
14587 int src_kind, dest_kind;
14588 Py_UCS4 ch, max_char, kind_limit;
14589
14590 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14591 return NULL;
14592 }
14593 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14594 &start, &stop, step);
14595
14596 if (slicelength <= 0) {
14597 _Py_RETURN_UNICODE_EMPTY();
14598 } else if (start == 0 && step == 1 &&
14599 slicelength == PyUnicode_GET_LENGTH(self)) {
14600 return unicode_result_unchanged(self);
14601 } else if (step == 1) {
14602 return PyUnicode_Substring(self,
14603 start, start + slicelength);
14604 }
14605 /* General case */
14606 src_kind = PyUnicode_KIND(self);
14607 src_data = PyUnicode_DATA(self);
14608 if (!PyUnicode_IS_ASCII(self)) {
14609 kind_limit = kind_maxchar_limit(src_kind);
14610 max_char = 0;
14611 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14612 ch = PyUnicode_READ(src_kind, src_data, cur);
14613 if (ch > max_char) {
14614 max_char = ch;
14615 if (max_char >= kind_limit)
14616 break;
14617 }
14618 }
14619 }
14620 else
14621 max_char = 127;
14622 result = PyUnicode_New(slicelength, max_char);
14623 if (result == NULL)
14624 return NULL;
14625 dest_kind = PyUnicode_KIND(result);
14626 dest_data = PyUnicode_DATA(result);
14627
14628 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14629 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14630 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14631 }
14632 assert(_PyUnicode_CheckConsistency(result, 1));
14633 return result;
14634 } else {
14635 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14636 return NULL;
14637 }
14638 }
14639
14640 static PyMappingMethods unicode_as_mapping = {
14641 (lenfunc)unicode_length, /* mp_length */
14642 (binaryfunc)unicode_subscript, /* mp_subscript */
14643 (objobjargproc)0, /* mp_ass_subscript */
14644 };
14645
14646
14647 /* Helpers for PyUnicode_Format() */
14648
14649 struct unicode_formatter_t {
14650 PyObject *args;
14651 int args_owned;
14652 Py_ssize_t arglen, argidx;
14653 PyObject *dict;
14654
14655 enum PyUnicode_Kind fmtkind;
14656 Py_ssize_t fmtcnt, fmtpos;
14657 const void *fmtdata;
14658 PyObject *fmtstr;
14659
14660 _PyUnicodeWriter writer;
14661 };
14662
14663 struct unicode_format_arg_t {
14664 Py_UCS4 ch;
14665 int flags;
14666 Py_ssize_t width;
14667 int prec;
14668 int sign;
14669 };
14670
14671 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14672 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14673 {
14674 Py_ssize_t argidx = ctx->argidx;
14675
14676 if (argidx < ctx->arglen) {
14677 ctx->argidx++;
14678 if (ctx->arglen < 0)
14679 return ctx->args;
14680 else
14681 return PyTuple_GetItem(ctx->args, argidx);
14682 }
14683 PyErr_SetString(PyExc_TypeError,
14684 "not enough arguments for format string");
14685 return NULL;
14686 }
14687
14688 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14689
14690 /* Format a float into the writer if the writer is not NULL, or into *p_output
14691 otherwise.
14692
14693 Return 0 on success, raise an exception and return -1 on error. */
14694 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14695 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14696 PyObject **p_output,
14697 _PyUnicodeWriter *writer)
14698 {
14699 char *p;
14700 double x;
14701 Py_ssize_t len;
14702 int prec;
14703 int dtoa_flags;
14704
14705 x = PyFloat_AsDouble(v);
14706 if (x == -1.0 && PyErr_Occurred())
14707 return -1;
14708
14709 prec = arg->prec;
14710 if (prec < 0)
14711 prec = 6;
14712
14713 if (arg->flags & F_ALT)
14714 dtoa_flags = Py_DTSF_ALT;
14715 else
14716 dtoa_flags = 0;
14717 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14718 if (p == NULL)
14719 return -1;
14720 len = strlen(p);
14721 if (writer) {
14722 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14723 PyMem_Free(p);
14724 return -1;
14725 }
14726 }
14727 else
14728 *p_output = _PyUnicode_FromASCII(p, len);
14729 PyMem_Free(p);
14730 return 0;
14731 }
14732
14733 /* formatlong() emulates the format codes d, u, o, x and X, and
14734 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14735 * Python's regular ints.
14736 * Return value: a new PyUnicodeObject*, or NULL if error.
14737 * The output string is of the form
14738 * "-"? ("0x" | "0X")? digit+
14739 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14740 * set in flags. The case of hex digits will be correct,
14741 * There will be at least prec digits, zero-filled on the left if
14742 * necessary to get that many.
14743 * val object to be converted
14744 * flags bitmask of format flags; only F_ALT is looked at
14745 * prec minimum number of digits; 0-fill on left if needed
14746 * type a character in [duoxX]; u acts the same as d
14747 *
14748 * CAUTION: o, x and X conversions on regular ints can never
14749 * produce a '-' sign, but can for Python's unbounded ints.
14750 */
14751 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14752 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14753 {
14754 PyObject *result = NULL;
14755 char *buf;
14756 Py_ssize_t i;
14757 int sign; /* 1 if '-', else 0 */
14758 int len; /* number of characters */
14759 Py_ssize_t llen;
14760 int numdigits; /* len == numnondigits + numdigits */
14761 int numnondigits = 0;
14762
14763 /* Avoid exceeding SSIZE_T_MAX */
14764 if (prec > INT_MAX-3) {
14765 PyErr_SetString(PyExc_OverflowError,
14766 "precision too large");
14767 return NULL;
14768 }
14769
14770 assert(PyLong_Check(val));
14771
14772 switch (type) {
14773 default:
14774 Py_UNREACHABLE();
14775 case 'd':
14776 case 'i':
14777 case 'u':
14778 /* int and int subclasses should print numerically when a numeric */
14779 /* format code is used (see issue18780) */
14780 result = PyNumber_ToBase(val, 10);
14781 break;
14782 case 'o':
14783 numnondigits = 2;
14784 result = PyNumber_ToBase(val, 8);
14785 break;
14786 case 'x':
14787 case 'X':
14788 numnondigits = 2;
14789 result = PyNumber_ToBase(val, 16);
14790 break;
14791 }
14792 if (!result)
14793 return NULL;
14794
14795 assert(unicode_modifiable(result));
14796 assert(PyUnicode_IS_READY(result));
14797 assert(PyUnicode_IS_ASCII(result));
14798
14799 /* To modify the string in-place, there can only be one reference. */
14800 if (Py_REFCNT(result) != 1) {
14801 Py_DECREF(result);
14802 PyErr_BadInternalCall();
14803 return NULL;
14804 }
14805 buf = PyUnicode_DATA(result);
14806 llen = PyUnicode_GET_LENGTH(result);
14807 if (llen > INT_MAX) {
14808 Py_DECREF(result);
14809 PyErr_SetString(PyExc_ValueError,
14810 "string too large in _PyUnicode_FormatLong");
14811 return NULL;
14812 }
14813 len = (int)llen;
14814 sign = buf[0] == '-';
14815 numnondigits += sign;
14816 numdigits = len - numnondigits;
14817 assert(numdigits > 0);
14818
14819 /* Get rid of base marker unless F_ALT */
14820 if (((alt) == 0 &&
14821 (type == 'o' || type == 'x' || type == 'X'))) {
14822 assert(buf[sign] == '0');
14823 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14824 buf[sign+1] == 'o');
14825 numnondigits -= 2;
14826 buf += 2;
14827 len -= 2;
14828 if (sign)
14829 buf[0] = '-';
14830 assert(len == numnondigits + numdigits);
14831 assert(numdigits > 0);
14832 }
14833
14834 /* Fill with leading zeroes to meet minimum width. */
14835 if (prec > numdigits) {
14836 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14837 numnondigits + prec);
14838 char *b1;
14839 if (!r1) {
14840 Py_DECREF(result);
14841 return NULL;
14842 }
14843 b1 = PyBytes_AS_STRING(r1);
14844 for (i = 0; i < numnondigits; ++i)
14845 *b1++ = *buf++;
14846 for (i = 0; i < prec - numdigits; i++)
14847 *b1++ = '0';
14848 for (i = 0; i < numdigits; i++)
14849 *b1++ = *buf++;
14850 *b1 = '\0';
14851 Py_DECREF(result);
14852 result = r1;
14853 buf = PyBytes_AS_STRING(result);
14854 len = numnondigits + prec;
14855 }
14856
14857 /* Fix up case for hex conversions. */
14858 if (type == 'X') {
14859 /* Need to convert all lower case letters to upper case.
14860 and need to convert 0x to 0X (and -0x to -0X). */
14861 for (i = 0; i < len; i++)
14862 if (buf[i] >= 'a' && buf[i] <= 'x')
14863 buf[i] -= 'a'-'A';
14864 }
14865 if (!PyUnicode_Check(result)
14866 || buf != PyUnicode_DATA(result)) {
14867 PyObject *unicode;
14868 unicode = _PyUnicode_FromASCII(buf, len);
14869 Py_DECREF(result);
14870 result = unicode;
14871 }
14872 else if (len != PyUnicode_GET_LENGTH(result)) {
14873 if (PyUnicode_Resize(&result, len) < 0)
14874 Py_CLEAR(result);
14875 }
14876 return result;
14877 }
14878
14879 /* Format an integer or a float as an integer.
14880 * Return 1 if the number has been formatted into the writer,
14881 * 0 if the number has been formatted into *p_output
14882 * -1 and raise an exception on error */
14883 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14884 mainformatlong(PyObject *v,
14885 struct unicode_format_arg_t *arg,
14886 PyObject **p_output,
14887 _PyUnicodeWriter *writer)
14888 {
14889 PyObject *iobj, *res;
14890 char type = (char)arg->ch;
14891
14892 if (!PyNumber_Check(v))
14893 goto wrongtype;
14894
14895 /* make sure number is a type of integer for o, x, and X */
14896 if (!PyLong_Check(v)) {
14897 if (type == 'o' || type == 'x' || type == 'X') {
14898 iobj = _PyNumber_Index(v);
14899 }
14900 else {
14901 iobj = PyNumber_Long(v);
14902 }
14903 if (iobj == NULL ) {
14904 if (PyErr_ExceptionMatches(PyExc_TypeError))
14905 goto wrongtype;
14906 return -1;
14907 }
14908 assert(PyLong_Check(iobj));
14909 }
14910 else {
14911 iobj = v;
14912 Py_INCREF(iobj);
14913 }
14914
14915 if (PyLong_CheckExact(v)
14916 && arg->width == -1 && arg->prec == -1
14917 && !(arg->flags & (F_SIGN | F_BLANK))
14918 && type != 'X')
14919 {
14920 /* Fast path */
14921 int alternate = arg->flags & F_ALT;
14922 int base;
14923
14924 switch(type)
14925 {
14926 default:
14927 Py_UNREACHABLE();
14928 case 'd':
14929 case 'i':
14930 case 'u':
14931 base = 10;
14932 break;
14933 case 'o':
14934 base = 8;
14935 break;
14936 case 'x':
14937 case 'X':
14938 base = 16;
14939 break;
14940 }
14941
14942 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14943 Py_DECREF(iobj);
14944 return -1;
14945 }
14946 Py_DECREF(iobj);
14947 return 1;
14948 }
14949
14950 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14951 Py_DECREF(iobj);
14952 if (res == NULL)
14953 return -1;
14954 *p_output = res;
14955 return 0;
14956
14957 wrongtype:
14958 switch(type)
14959 {
14960 case 'o':
14961 case 'x':
14962 case 'X':
14963 PyErr_Format(PyExc_TypeError,
14964 "%%%c format: an integer is required, "
14965 "not %.200s",
14966 type, Py_TYPE(v)->tp_name);
14967 break;
14968 default:
14969 PyErr_Format(PyExc_TypeError,
14970 "%%%c format: a real number is required, "
14971 "not %.200s",
14972 type, Py_TYPE(v)->tp_name);
14973 break;
14974 }
14975 return -1;
14976 }
14977
14978 static Py_UCS4
formatchar(PyObject * v)14979 formatchar(PyObject *v)
14980 {
14981 /* presume that the buffer is at least 3 characters long */
14982 if (PyUnicode_Check(v)) {
14983 if (PyUnicode_GET_LENGTH(v) == 1) {
14984 return PyUnicode_READ_CHAR(v, 0);
14985 }
14986 goto onError;
14987 }
14988 else {
14989 int overflow;
14990 long x = PyLong_AsLongAndOverflow(v, &overflow);
14991 if (x == -1 && PyErr_Occurred()) {
14992 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14993 goto onError;
14994 }
14995 return (Py_UCS4) -1;
14996 }
14997
14998 if (x < 0 || x > MAX_UNICODE) {
14999 /* this includes an overflow in converting to C long */
15000 PyErr_SetString(PyExc_OverflowError,
15001 "%c arg not in range(0x110000)");
15002 return (Py_UCS4) -1;
15003 }
15004
15005 return (Py_UCS4) x;
15006 }
15007
15008 onError:
15009 PyErr_SetString(PyExc_TypeError,
15010 "%c requires int or char");
15011 return (Py_UCS4) -1;
15012 }
15013
15014 /* Parse options of an argument: flags, width, precision.
15015 Handle also "%(name)" syntax.
15016
15017 Return 0 if the argument has been formatted into arg->str.
15018 Return 1 if the argument has been written into ctx->writer,
15019 Raise an exception and return -1 on error. */
15020 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)15021 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
15022 struct unicode_format_arg_t *arg)
15023 {
15024 #define FORMAT_READ(ctx) \
15025 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
15026
15027 PyObject *v;
15028
15029 if (arg->ch == '(') {
15030 /* Get argument value from a dictionary. Example: "%(name)s". */
15031 Py_ssize_t keystart;
15032 Py_ssize_t keylen;
15033 PyObject *key;
15034 int pcount = 1;
15035
15036 if (ctx->dict == NULL) {
15037 PyErr_SetString(PyExc_TypeError,
15038 "format requires a mapping");
15039 return -1;
15040 }
15041 ++ctx->fmtpos;
15042 --ctx->fmtcnt;
15043 keystart = ctx->fmtpos;
15044 /* Skip over balanced parentheses */
15045 while (pcount > 0 && --ctx->fmtcnt >= 0) {
15046 arg->ch = FORMAT_READ(ctx);
15047 if (arg->ch == ')')
15048 --pcount;
15049 else if (arg->ch == '(')
15050 ++pcount;
15051 ctx->fmtpos++;
15052 }
15053 keylen = ctx->fmtpos - keystart - 1;
15054 if (ctx->fmtcnt < 0 || pcount > 0) {
15055 PyErr_SetString(PyExc_ValueError,
15056 "incomplete format key");
15057 return -1;
15058 }
15059 key = PyUnicode_Substring(ctx->fmtstr,
15060 keystart, keystart + keylen);
15061 if (key == NULL)
15062 return -1;
15063 if (ctx->args_owned) {
15064 ctx->args_owned = 0;
15065 Py_DECREF(ctx->args);
15066 }
15067 ctx->args = PyObject_GetItem(ctx->dict, key);
15068 Py_DECREF(key);
15069 if (ctx->args == NULL)
15070 return -1;
15071 ctx->args_owned = 1;
15072 ctx->arglen = -1;
15073 ctx->argidx = -2;
15074 }
15075
15076 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15077 while (--ctx->fmtcnt >= 0) {
15078 arg->ch = FORMAT_READ(ctx);
15079 ctx->fmtpos++;
15080 switch (arg->ch) {
15081 case '-': arg->flags |= F_LJUST; continue;
15082 case '+': arg->flags |= F_SIGN; continue;
15083 case ' ': arg->flags |= F_BLANK; continue;
15084 case '#': arg->flags |= F_ALT; continue;
15085 case '0': arg->flags |= F_ZERO; continue;
15086 }
15087 break;
15088 }
15089
15090 /* Parse width. Example: "%10s" => width=10 */
15091 if (arg->ch == '*') {
15092 v = unicode_format_getnextarg(ctx);
15093 if (v == NULL)
15094 return -1;
15095 if (!PyLong_Check(v)) {
15096 PyErr_SetString(PyExc_TypeError,
15097 "* wants int");
15098 return -1;
15099 }
15100 arg->width = PyLong_AsSsize_t(v);
15101 if (arg->width == -1 && PyErr_Occurred())
15102 return -1;
15103 if (arg->width < 0) {
15104 arg->flags |= F_LJUST;
15105 arg->width = -arg->width;
15106 }
15107 if (--ctx->fmtcnt >= 0) {
15108 arg->ch = FORMAT_READ(ctx);
15109 ctx->fmtpos++;
15110 }
15111 }
15112 else if (arg->ch >= '0' && arg->ch <= '9') {
15113 arg->width = arg->ch - '0';
15114 while (--ctx->fmtcnt >= 0) {
15115 arg->ch = FORMAT_READ(ctx);
15116 ctx->fmtpos++;
15117 if (arg->ch < '0' || arg->ch > '9')
15118 break;
15119 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15120 mixing signed and unsigned comparison. Since arg->ch is between
15121 '0' and '9', casting to int is safe. */
15122 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15123 PyErr_SetString(PyExc_ValueError,
15124 "width too big");
15125 return -1;
15126 }
15127 arg->width = arg->width*10 + (arg->ch - '0');
15128 }
15129 }
15130
15131 /* Parse precision. Example: "%.3f" => prec=3 */
15132 if (arg->ch == '.') {
15133 arg->prec = 0;
15134 if (--ctx->fmtcnt >= 0) {
15135 arg->ch = FORMAT_READ(ctx);
15136 ctx->fmtpos++;
15137 }
15138 if (arg->ch == '*') {
15139 v = unicode_format_getnextarg(ctx);
15140 if (v == NULL)
15141 return -1;
15142 if (!PyLong_Check(v)) {
15143 PyErr_SetString(PyExc_TypeError,
15144 "* wants int");
15145 return -1;
15146 }
15147 arg->prec = _PyLong_AsInt(v);
15148 if (arg->prec == -1 && PyErr_Occurred())
15149 return -1;
15150 if (arg->prec < 0)
15151 arg->prec = 0;
15152 if (--ctx->fmtcnt >= 0) {
15153 arg->ch = FORMAT_READ(ctx);
15154 ctx->fmtpos++;
15155 }
15156 }
15157 else if (arg->ch >= '0' && arg->ch <= '9') {
15158 arg->prec = arg->ch - '0';
15159 while (--ctx->fmtcnt >= 0) {
15160 arg->ch = FORMAT_READ(ctx);
15161 ctx->fmtpos++;
15162 if (arg->ch < '0' || arg->ch > '9')
15163 break;
15164 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15165 PyErr_SetString(PyExc_ValueError,
15166 "precision too big");
15167 return -1;
15168 }
15169 arg->prec = arg->prec*10 + (arg->ch - '0');
15170 }
15171 }
15172 }
15173
15174 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15175 if (ctx->fmtcnt >= 0) {
15176 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15177 if (--ctx->fmtcnt >= 0) {
15178 arg->ch = FORMAT_READ(ctx);
15179 ctx->fmtpos++;
15180 }
15181 }
15182 }
15183 if (ctx->fmtcnt < 0) {
15184 PyErr_SetString(PyExc_ValueError,
15185 "incomplete format");
15186 return -1;
15187 }
15188 return 0;
15189
15190 #undef FORMAT_READ
15191 }
15192
15193 /* Format one argument. Supported conversion specifiers:
15194
15195 - "s", "r", "a": any type
15196 - "i", "d", "u": int or float
15197 - "o", "x", "X": int
15198 - "e", "E", "f", "F", "g", "G": float
15199 - "c": int or str (1 character)
15200
15201 When possible, the output is written directly into the Unicode writer
15202 (ctx->writer). A string is created when padding is required.
15203
15204 Return 0 if the argument has been formatted into *p_str,
15205 1 if the argument has been written into ctx->writer,
15206 -1 on error. */
15207 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)15208 unicode_format_arg_format(struct unicode_formatter_t *ctx,
15209 struct unicode_format_arg_t *arg,
15210 PyObject **p_str)
15211 {
15212 PyObject *v;
15213 _PyUnicodeWriter *writer = &ctx->writer;
15214
15215 if (ctx->fmtcnt == 0)
15216 ctx->writer.overallocate = 0;
15217
15218 v = unicode_format_getnextarg(ctx);
15219 if (v == NULL)
15220 return -1;
15221
15222
15223 switch (arg->ch) {
15224 case 's':
15225 case 'r':
15226 case 'a':
15227 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15228 /* Fast path */
15229 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15230 return -1;
15231 return 1;
15232 }
15233
15234 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15235 *p_str = v;
15236 Py_INCREF(*p_str);
15237 }
15238 else {
15239 if (arg->ch == 's')
15240 *p_str = PyObject_Str(v);
15241 else if (arg->ch == 'r')
15242 *p_str = PyObject_Repr(v);
15243 else
15244 *p_str = PyObject_ASCII(v);
15245 }
15246 break;
15247
15248 case 'i':
15249 case 'd':
15250 case 'u':
15251 case 'o':
15252 case 'x':
15253 case 'X':
15254 {
15255 int ret = mainformatlong(v, arg, p_str, writer);
15256 if (ret != 0)
15257 return ret;
15258 arg->sign = 1;
15259 break;
15260 }
15261
15262 case 'e':
15263 case 'E':
15264 case 'f':
15265 case 'F':
15266 case 'g':
15267 case 'G':
15268 if (arg->width == -1 && arg->prec == -1
15269 && !(arg->flags & (F_SIGN | F_BLANK)))
15270 {
15271 /* Fast path */
15272 if (formatfloat(v, arg, NULL, writer) == -1)
15273 return -1;
15274 return 1;
15275 }
15276
15277 arg->sign = 1;
15278 if (formatfloat(v, arg, p_str, NULL) == -1)
15279 return -1;
15280 break;
15281
15282 case 'c':
15283 {
15284 Py_UCS4 ch = formatchar(v);
15285 if (ch == (Py_UCS4) -1)
15286 return -1;
15287 if (arg->width == -1 && arg->prec == -1) {
15288 /* Fast path */
15289 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15290 return -1;
15291 return 1;
15292 }
15293 *p_str = PyUnicode_FromOrdinal(ch);
15294 break;
15295 }
15296
15297 default:
15298 PyErr_Format(PyExc_ValueError,
15299 "unsupported format character '%c' (0x%x) "
15300 "at index %zd",
15301 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15302 (int)arg->ch,
15303 ctx->fmtpos - 1);
15304 return -1;
15305 }
15306 if (*p_str == NULL)
15307 return -1;
15308 assert (PyUnicode_Check(*p_str));
15309 return 0;
15310 }
15311
/* Write the already formatted string `str` into ctx->writer, applying the
   width, precision ("s", "r", "a" only), sign, zero-fill and alternate-form
   prefix handling described by `arg`.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set and the "0" flag
       was given; otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag, so the
           string is written unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width: a leading '-' or '+' in str is split off into
       signchar (len and pindex then describe the remaining characters);
       otherwise the sign comes from the "+" or " " flag, or sign handling
       is switched off. */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the largest character that will be
       written (fill character when left padding occurs, plus the
       characters of str) and the number of characters to reserve. */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra character is reserved for the sign when the remaining
       text already fills the whole width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed.  With zero fill the sign goes before the
       padding; with space fill it is written later, after the padding. */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           digits in both the width and the remaining length. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed.  Right padding always uses spaces, never the
       '0' fill character. */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15466
15467 /* Helper of PyUnicode_Format(): format one arg.
15468 Return 0 on success, raise an exception and return -1 on error. */
15469 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15470 unicode_format_arg(struct unicode_formatter_t *ctx)
15471 {
15472 struct unicode_format_arg_t arg;
15473 PyObject *str;
15474 int ret;
15475
15476 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15477 if (arg.ch == '%') {
15478 ctx->fmtpos++;
15479 ctx->fmtcnt--;
15480 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15481 return -1;
15482 return 0;
15483 }
15484 arg.flags = 0;
15485 arg.width = -1;
15486 arg.prec = -1;
15487 arg.sign = 0;
15488 str = NULL;
15489
15490 ret = unicode_format_arg_parse(ctx, &arg);
15491 if (ret == -1)
15492 return -1;
15493
15494 ret = unicode_format_arg_format(ctx, &arg, &str);
15495 if (ret == -1)
15496 return -1;
15497
15498 if (ret != 1) {
15499 ret = unicode_format_arg_output(ctx, &arg, str);
15500 Py_DECREF(str);
15501 if (ret == -1)
15502 return -1;
15503 }
15504
15505 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15506 PyErr_SetString(PyExc_TypeError,
15507 "not all arguments converted during string formatting");
15508 return -1;
15509 }
15510 return 0;
15511 }
15512
/* Implement printf-style formatting: apply `args` to the format string
   `format`.

   Literal runs of the format string are copied to a _PyUnicodeWriter and
   every '%' specifier is handled by unicode_format_arg().

   Return the result of _PyUnicodeWriter_Finish() on success.  Return NULL
   with an exception set on error; a NULL `format` or `args` raises via
   PyErr_BadInternalCall(). */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (ensure_unicode(format) < 0)
        return NULL;

    /* fmtcnt counts the characters left to process, fmtpos is the index of
       the next character to read. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    /* A tuple supplies several positional arguments; any other object is
       marked with arglen == -1 and argidx == -2. */
    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    /* Only a mapping that is neither a tuple nor a str is kept as
       ctx.dict. */
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            Py_ssize_t nonfmtpos;

            /* Scan a run of literal characters up to the next '%' or the
               end of the format string, then copy it in one call. */
            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The scan ran past the end of the format string: step
                   back and stop overallocating, this is the last write. */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and let unicode_format_arg() handle the rest of
               the specifier. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    /* Unused positional arguments are an error (not checked when
       formatting from a mapping). */
    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15595
/* Forward declaration: unicode_new_impl() below calls it before its
   definition. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15598
15599 /*[clinic input]
15600 @classmethod
15601 str.__new__ as unicode_new
15602
15603 object as x: object = NULL
15604 encoding: str = NULL
15605 errors: str = NULL
15606
15607 [clinic start generated code]*/
15608
15609 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15610 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15611 const char *errors)
15612 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15613 {
15614 PyObject *unicode;
15615 if (x == NULL) {
15616 unicode = unicode_new_empty();
15617 }
15618 else if (encoding == NULL && errors == NULL) {
15619 unicode = PyObject_Str(x);
15620 }
15621 else {
15622 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15623 }
15624
15625 if (unicode != NULL && type != &PyUnicode_Type) {
15626 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15627 }
15628 return unicode;
15629 }
15630
15631 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15632 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15633 {
15634 PyObject *self;
15635 Py_ssize_t length, char_size;
15636 int share_wstr, share_utf8;
15637 unsigned int kind;
15638 void *data;
15639
15640 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15641 assert(_PyUnicode_CHECK(unicode));
15642 if (PyUnicode_READY(unicode) == -1) {
15643 return NULL;
15644 }
15645
15646 self = type->tp_alloc(type, 0);
15647 if (self == NULL) {
15648 return NULL;
15649 }
15650 kind = PyUnicode_KIND(unicode);
15651 length = PyUnicode_GET_LENGTH(unicode);
15652
15653 _PyUnicode_LENGTH(self) = length;
15654 #ifdef Py_DEBUG
15655 _PyUnicode_HASH(self) = -1;
15656 #else
15657 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15658 #endif
15659 _PyUnicode_STATE(self).interned = 0;
15660 _PyUnicode_STATE(self).kind = kind;
15661 _PyUnicode_STATE(self).compact = 0;
15662 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15663 _PyUnicode_STATE(self).ready = 1;
15664 _PyUnicode_WSTR(self) = NULL;
15665 _PyUnicode_UTF8_LENGTH(self) = 0;
15666 _PyUnicode_UTF8(self) = NULL;
15667 _PyUnicode_WSTR_LENGTH(self) = 0;
15668 _PyUnicode_DATA_ANY(self) = NULL;
15669
15670 share_utf8 = 0;
15671 share_wstr = 0;
15672 if (kind == PyUnicode_1BYTE_KIND) {
15673 char_size = 1;
15674 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15675 share_utf8 = 1;
15676 }
15677 else if (kind == PyUnicode_2BYTE_KIND) {
15678 char_size = 2;
15679 if (sizeof(wchar_t) == 2)
15680 share_wstr = 1;
15681 }
15682 else {
15683 assert(kind == PyUnicode_4BYTE_KIND);
15684 char_size = 4;
15685 if (sizeof(wchar_t) == 4)
15686 share_wstr = 1;
15687 }
15688
15689 /* Ensure we won't overflow the length. */
15690 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15691 PyErr_NoMemory();
15692 goto onError;
15693 }
15694 data = PyObject_Malloc((length + 1) * char_size);
15695 if (data == NULL) {
15696 PyErr_NoMemory();
15697 goto onError;
15698 }
15699
15700 _PyUnicode_DATA_ANY(self) = data;
15701 if (share_utf8) {
15702 _PyUnicode_UTF8_LENGTH(self) = length;
15703 _PyUnicode_UTF8(self) = data;
15704 }
15705 if (share_wstr) {
15706 _PyUnicode_WSTR_LENGTH(self) = length;
15707 _PyUnicode_WSTR(self) = (wchar_t *)data;
15708 }
15709
15710 memcpy(data, PyUnicode_DATA(unicode),
15711 kind * (length + 1));
15712 assert(_PyUnicode_CheckConsistency(self, 1));
15713 #ifdef Py_DEBUG
15714 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15715 #endif
15716 return self;
15717
15718 onError:
15719 Py_DECREF(self);
15720 return NULL;
15721 }
15722
/* Docstring of the str type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");

/* Forward declaration: referenced by the tp_iter slot of PyUnicode_Type
   before its definition. */
static PyObject *unicode_iter(PyObject *seq);
15736
/* Type object of str.  It can be subclassed (Py_TPFLAGS_BASETYPE) and
   derives from object (&PyBaseObject_Type); instances are created through
   unicode_new and iterated through unicode_iter. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15781
15782 /* Initialize the Unicode implementation */
15783
15784 PyStatus
_PyUnicode_Init(PyInterpreterState * interp)15785 _PyUnicode_Init(PyInterpreterState *interp)
15786 {
15787 struct _Py_unicode_state *state = &interp->unicode;
15788 if (unicode_create_empty_string_singleton(state) < 0) {
15789 return _PyStatus_NO_MEMORY();
15790 }
15791
15792 if (_Py_IsMainInterpreter(interp)) {
15793 /* initialize the linebreak bloom filter */
15794 const Py_UCS2 linebreak[] = {
15795 0x000A, /* LINE FEED */
15796 0x000D, /* CARRIAGE RETURN */
15797 0x001C, /* FILE SEPARATOR */
15798 0x001D, /* GROUP SEPARATOR */
15799 0x001E, /* RECORD SEPARATOR */
15800 0x0085, /* NEXT LINE */
15801 0x2028, /* LINE SEPARATOR */
15802 0x2029, /* PARAGRAPH SEPARATOR */
15803 };
15804 bloom_linebreak = make_bloom_mask(
15805 PyUnicode_2BYTE_KIND, linebreak,
15806 Py_ARRAY_LENGTH(linebreak));
15807 }
15808
15809 return _PyStatus_OK();
15810 }
15811
15812
15813 PyStatus
_PyUnicode_InitTypes(void)15814 _PyUnicode_InitTypes(void)
15815 {
15816 if (PyType_Ready(&PyUnicode_Type) < 0) {
15817 return _PyStatus_ERR("Can't initialize unicode type");
15818 }
15819 if (PyType_Ready(&EncodingMapType) < 0) {
15820 return _PyStatus_ERR("Can't initialize encoding map type");
15821 }
15822 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15823 return _PyStatus_ERR("Can't initialize field name iterator type");
15824 }
15825 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15826 return _PyStatus_ERR("Can't initialize formatter iter type");
15827 }
15828 return _PyStatus_OK();
15829 }
15830
15831
/* Intern the string *p in place.

   If an equal string is already in the interned dict, *p is replaced by a
   new reference to that string; otherwise *p itself is stored and marked
   SSTATE_INTERNED_MORTAL.  Nothing is done for NULL, non-str objects
   (release builds; debug builds assert instead), str subclasses and
   strings that are already interned.  Errors are cleared, never reported:
   on failure *p is simply left as it was. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s)) {
        return;
    }
#endif

    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s)) {
        return;
    }

    if (PyUnicode_CHECK_INTERNED(s)) {
        return;
    }

#ifdef INTERNED_STRINGS
    if (PyUnicode_READY(s) == -1) {
        PyErr_Clear();
        return;
    }

    /* The interned dict is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }

    /* t is either s itself (just inserted) or the equal string that was
       already in the dict. */
    PyObject *t = PyDict_SetDefault(interned, s, s);
    if (t == NULL) {
        PyErr_Clear();
        return;
    }

    if (t != s) {
        /* An equal string was interned before: hand it out instead. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }

    /* The two references in interned dict (key and value) are not counted by
       refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
       this. */
    Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
#else
    // PyDict expects that interned strings have their hash
    // (PyASCIIObject.hash) already computed.
    (void)unicode_hash(s);
#endif
}
15892
15893 void
PyUnicode_InternImmortal(PyObject ** p)15894 PyUnicode_InternImmortal(PyObject **p)
15895 {
15896 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15897 "PyUnicode_InternImmortal() is deprecated; "
15898 "use PyUnicode_InternInPlace() instead", 1) < 0)
15899 {
15900 // The function has no return value, the exception cannot
15901 // be reported to the caller, so just log it.
15902 PyErr_WriteUnraisable(NULL);
15903 }
15904
15905 PyUnicode_InternInPlace(p);
15906 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15907 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15908 Py_INCREF(*p);
15909 }
15910 }
15911
15912 PyObject *
PyUnicode_InternFromString(const char * cp)15913 PyUnicode_InternFromString(const char *cp)
15914 {
15915 PyObject *s = PyUnicode_FromString(cp);
15916 if (s == NULL)
15917 return NULL;
15918 PyUnicode_InternInPlace(&s);
15919 return s;
15920 }
15921
15922
/* Release the interned strings dict.

   Does nothing unless `interp` is the main interpreter and the dict
   exists.  Every string in the dict gets back the references that
   interning left uncounted and is marked SSTATE_NOT_INTERNED; then the
   dict is cleared and released. */
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
    if (!_Py_IsMainInterpreter(interp)) {
        // interned dict is shared by all interpreters
        return;
    }

    if (interned == NULL) {
        return;
    }
    assert(PyDict_CheckExact(interned));

    /* Interned unicode strings are not forcibly deallocated; rather, we give
       them their stolen references back, and then clear and DECREF the
       interned dict. */

#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n",
            PyDict_GET_SIZE(interned));

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    Py_ssize_t pos = 0;
    PyObject *s, *ignored_value;
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
        assert(PyUnicode_IS_READY(s));

        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* PyUnicode_InternImmortal() added one reference after the
               two were dropped, so only one has to be restored here. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            // Restore the two references (key and value) ignored
            // by PyUnicode_InternInPlace().
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the interned dict must be marked interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif

    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
15982
15983
15984 /********************* Unicode Iterator **************************/
15985
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* Index of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15991
/* tp_dealloc of the str iterator: untrack the object from the GC,
   release the reference to the iterated string (if any) and free the
   iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator has been exhausted. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15999
/* tp_traverse of the str iterator: the iterated string is the only
   object reference it holds. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
16006
16007 static PyObject *
unicodeiter_next(unicodeiterobject * it)16008 unicodeiter_next(unicodeiterobject *it)
16009 {
16010 PyObject *seq, *item;
16011
16012 assert(it != NULL);
16013 seq = it->it_seq;
16014 if (seq == NULL)
16015 return NULL;
16016 assert(_PyUnicode_CHECK(seq));
16017
16018 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
16019 int kind = PyUnicode_KIND(seq);
16020 const void *data = PyUnicode_DATA(seq);
16021 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
16022 item = PyUnicode_FromOrdinal(chr);
16023 if (item != NULL)
16024 ++it->it_index;
16025 return item;
16026 }
16027
16028 it->it_seq = NULL;
16029 Py_DECREF(seq);
16030 return NULL;
16031 }
16032
16033 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16034 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16035 {
16036 Py_ssize_t len = 0;
16037 if (it->it_seq)
16038 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16039 return PyLong_FromSsize_t(len);
16040 }
16041
16042 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16043
16044 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))16045 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
16046 {
16047 _Py_IDENTIFIER(iter);
16048 if (it->it_seq != NULL) {
16049 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
16050 it->it_seq, it->it_index);
16051 } else {
16052 PyObject *u = (PyObject *)_PyUnicode_New(0);
16053 if (u == NULL)
16054 return NULL;
16055 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
16056 }
16057 }
16058
16059 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16060
16061 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)16062 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16063 {
16064 Py_ssize_t index = PyLong_AsSsize_t(state);
16065 if (index == -1 && PyErr_Occurred())
16066 return NULL;
16067 if (it->it_seq != NULL) {
16068 if (index < 0)
16069 index = 0;
16070 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16071 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16072 it->it_index = index;
16073 }
16074 Py_RETURN_NONE;
16075 }
16076
16077 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16078
/* Methods of the str iterator: length hint plus pickling support
   (__reduce__ / __setstate__). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
16088
/* Type object of the str iterator ("str_iterator").  Instances are GC
   tracked (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_vectorcall_offset */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_as_async */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,
};
16121
16122 static PyObject *
unicode_iter(PyObject * seq)16123 unicode_iter(PyObject *seq)
16124 {
16125 unicodeiterobject *it;
16126
16127 if (!PyUnicode_Check(seq)) {
16128 PyErr_BadInternalCall();
16129 return NULL;
16130 }
16131 if (PyUnicode_READY(seq) == -1)
16132 return NULL;
16133 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16134 if (it == NULL)
16135 return NULL;
16136 it->it_index = 0;
16137 Py_INCREF(seq);
16138 it->it_seq = seq;
16139 _PyObject_GC_TRACK(it);
16140 return (PyObject *)it;
16141 }
16142
16143 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)16144 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16145 {
16146 int res;
16147 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16148 if (res == -2) {
16149 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16150 return -1;
16151 }
16152 if (res < 0) {
16153 PyErr_NoMemory();
16154 return -1;
16155 }
16156 return 0;
16157 }
16158
16159
16160 static int
config_get_codec_name(wchar_t ** config_encoding)16161 config_get_codec_name(wchar_t **config_encoding)
16162 {
16163 char *encoding;
16164 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16165 return -1;
16166 }
16167
16168 PyObject *name_obj = NULL;
16169 PyObject *codec = _PyCodec_Lookup(encoding);
16170 PyMem_RawFree(encoding);
16171
16172 if (!codec)
16173 goto error;
16174
16175 name_obj = PyObject_GetAttrString(codec, "name");
16176 Py_CLEAR(codec);
16177 if (!name_obj) {
16178 goto error;
16179 }
16180
16181 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16182 Py_DECREF(name_obj);
16183 if (wname == NULL) {
16184 goto error;
16185 }
16186
16187 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16188 if (raw_wname == NULL) {
16189 PyMem_Free(wname);
16190 PyErr_NoMemory();
16191 goto error;
16192 }
16193
16194 PyMem_RawFree(*config_encoding);
16195 *config_encoding = raw_wname;
16196
16197 PyMem_Free(wname);
16198 return 0;
16199
16200 error:
16201 Py_XDECREF(codec);
16202 Py_XDECREF(name_obj);
16203 return -1;
16204 }
16205
16206
16207 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)16208 init_stdio_encoding(PyInterpreterState *interp)
16209 {
16210 /* Update the stdio encoding to the normalized Python codec name. */
16211 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16212 if (config_get_codec_name(&config->stdio_encoding) < 0) {
16213 return _PyStatus_ERR("failed to get the Python codec name "
16214 "of the stdio encoding");
16215 }
16216 return _PyStatus_OK();
16217 }
16218
16219
16220 static int
init_fs_codec(PyInterpreterState * interp)16221 init_fs_codec(PyInterpreterState *interp)
16222 {
16223 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16224
16225 _Py_error_handler error_handler;
16226 error_handler = get_error_handler_wide(config->filesystem_errors);
16227 if (error_handler == _Py_ERROR_UNKNOWN) {
16228 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16229 return -1;
16230 }
16231
16232 char *encoding, *errors;
16233 if (encode_wstr_utf8(config->filesystem_encoding,
16234 &encoding,
16235 "filesystem_encoding") < 0) {
16236 return -1;
16237 }
16238
16239 if (encode_wstr_utf8(config->filesystem_errors,
16240 &errors,
16241 "filesystem_errors") < 0) {
16242 PyMem_RawFree(encoding);
16243 return -1;
16244 }
16245
16246 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16247 PyMem_RawFree(fs_codec->encoding);
16248 fs_codec->encoding = encoding;
16249 /* encoding has been normalized by init_fs_encoding() */
16250 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16251 PyMem_RawFree(fs_codec->errors);
16252 fs_codec->errors = errors;
16253 fs_codec->error_handler = error_handler;
16254
16255 #ifdef _Py_FORCE_UTF8_FS_ENCODING
16256 assert(fs_codec->utf8 == 1);
16257 #endif
16258
16259 /* At this point, PyUnicode_EncodeFSDefault() and
16260 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16261 the C implementation of the filesystem encoding. */
16262
16263 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16264 global configuration variables. */
16265 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16266 fs_codec->errors) < 0) {
16267 PyErr_NoMemory();
16268 return -1;
16269 }
16270 return 0;
16271 }
16272
16273
16274 static PyStatus
init_fs_encoding(PyThreadState * tstate)16275 init_fs_encoding(PyThreadState *tstate)
16276 {
16277 PyInterpreterState *interp = tstate->interp;
16278
16279 /* Update the filesystem encoding to the normalized Python codec name.
16280 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16281 (Python codec name). */
16282 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16283 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16284 _Py_DumpPathConfig(tstate);
16285 return _PyStatus_ERR("failed to get the Python codec "
16286 "of the filesystem encoding");
16287 }
16288
16289 if (init_fs_codec(interp) < 0) {
16290 return _PyStatus_ERR("cannot initialize filesystem codec");
16291 }
16292 return _PyStatus_OK();
16293 }
16294
16295
16296 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16297 _PyUnicode_InitEncodings(PyThreadState *tstate)
16298 {
16299 PyStatus status = init_fs_encoding(tstate);
16300 if (_PyStatus_EXCEPTION(status)) {
16301 return status;
16302 }
16303
16304 return init_stdio_encoding(tstate->interp);
16305 }
16306
16307
16308 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16309 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16310 {
16311 PyMem_RawFree(fs_codec->encoding);
16312 fs_codec->encoding = NULL;
16313 fs_codec->utf8 = 0;
16314 PyMem_RawFree(fs_codec->errors);
16315 fs_codec->errors = NULL;
16316 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16317 }
16318
16319
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-initialize its filesystem codec.

   Return the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is left
   untouched in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Replace the old strings with the new ones. */
    PyMem_RawFree(cfg->filesystem_encoding);
    PyMem_RawFree(cfg->filesystem_errors);
    cfg->filesystem_encoding = new_encoding;
    cfg->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16345
16346
16347 void
_PyUnicode_Fini(PyInterpreterState * interp)16348 _PyUnicode_Fini(PyInterpreterState *interp)
16349 {
16350 struct _Py_unicode_state *state = &interp->unicode;
16351
16352 if (_Py_IsMainInterpreter(interp)) {
16353 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16354 assert(interned == NULL);
16355 }
16356
16357 _PyUnicode_FiniEncodings(&state->fs_codec);
16358
16359 unicode_clear_identifiers(state);
16360
16361 for (Py_ssize_t i = 0; i < 256; i++) {
16362 Py_CLEAR(state->latin1[i]);
16363 }
16364 Py_CLEAR(state->empty_string);
16365 }
16366
16367
16368 /* A _string module, to export formatter_parser and formatter_field_name_split
16369 to the string.Formatter class implemented in Python. */
16370
16371 static PyMethodDef _string_methods[] = {
16372 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16373 METH_O, PyDoc_STR("split the argument as a field name")},
16374 {"formatter_parser", (PyCFunction) formatter_parser,
16375 METH_O, PyDoc_STR("parse the argument as a format string")},
16376 {NULL, NULL}
16377 };
16378
16379 static struct PyModuleDef _string_module = {
16380 PyModuleDef_HEAD_INIT,
16381 .m_name = "_string",
16382 .m_doc = PyDoc_STR("string helper module"),
16383 .m_size = 0,
16384 .m_methods = _string_methods,
16385 };
16386
16387 PyMODINIT_FUNC
PyInit__string(void)16388 PyInit__string(void)
16389 {
16390 return PyModuleDef_Init(&_string_module);
16391 }
16392
16393
16394 #ifdef __cplusplus
16395 }
16396 #endif
16397